Files
simplex-chat/src/Simplex/Chat/Markdown.hs
Evgeny ef60ceea12 core, ui: markdown for hyperlinks, warn on unsanitized links, option to sanitize sent links (#6160)
* core: markdown for "hidden" links

* update, test

* api docs

* chatParseUri FFI function

* ios: hyperlinks, offer to open sanitized links, an option to send sanitized links (enabled by default)

* update markdown

* android, desktop: ditto

* ios: export localizations

* core: rename constructor, change Maybe semantics for web links

* rename
2025-08-09 10:52:35 +01:00

417 lines
15 KiB
Haskell

{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TupleSections #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use newtype instead of data" #-}
module Simplex.Chat.Markdown where
import Control.Applicative (optional, (<|>))
import Control.Monad
import Data.Aeson (FromJSON, ToJSON)
import qualified Data.Aeson as J
import qualified Data.Aeson.TH as JQ
import Data.Attoparsec.Text (Parser)
import qualified Data.Attoparsec.Text as A
import qualified Data.ByteString.Char8 as B
import Data.Char (isAlpha, isAscii, isDigit, isPunctuation, isSpace)
import Data.Either (fromRight)
import Data.Functor (($>))
import Data.List (foldl', intercalate)
import Data.List.NonEmpty (NonEmpty)
import qualified Data.List.NonEmpty as L
import Data.Maybe (fromMaybe, isNothing, mapMaybe)
import Data.Semigroup (sconcat)
import Data.String
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Encoding (encodeUtf8)
import Simplex.Chat.Types
import Simplex.Messaging.Agent.Protocol (AConnectionLink (..), ConnReqUriData (..), ConnShortLink (..), ConnectionLink (..), ConnectionRequestUri (..), ContactConnType (..), SMPQueue (..), simplexConnReqUri, simplexShortLink)
import Simplex.Messaging.Encoding.String
import Simplex.Messaging.Parsers (defaultJSON, dropPrefix, enumJSON, fstToLower, sumTypeJSON)
import Simplex.Messaging.Protocol (ProtocolServer (..))
import Simplex.Messaging.Util (decodeJSON, safeDecodeUtf8, tshow)
import System.Console.ANSI.Types
import qualified Text.Email.Validate as Email
import qualified URI.ByteString as U
data Markdown = Markdown (Maybe Format) Text | Markdown :|: Markdown
deriving (Eq, Show)
data Format
= Bold
| Italic
| StrikeThrough
| Snippet
| Secret
| Colored {color :: FormatColor}
| Uri
-- showText is Nothing for the usual Uri without text
| HyperLink {showText :: Maybe Text, linkUri :: Text}
| SimplexLink {showText :: Maybe Text, linkType :: SimplexLinkType, simplexUri :: AConnectionLink, smpHosts :: NonEmpty Text}
| Command {commandStr :: Text}
| Mention {memberName :: Text}
| Email
| Phone
| Unknown {json :: J.Value}
deriving (Eq, Show)
mentionedNames :: MarkdownList -> [Text]
mentionedNames = mapMaybe (\(FormattedText f _) -> mentionedName =<< f)
where
mentionedName = \case
Mention name -> Just name
_ -> Nothing
data SimplexLinkType = XLContact | XLInvitation | XLGroup | XLChannel
deriving (Eq, Show)
colored :: Color -> Format
colored = Colored . FormatColor
{-# INLINE colored #-}
markdown :: Format -> Text -> Markdown
markdown = Markdown . Just
{-# INLINE markdown #-}
instance Semigroup Markdown where
m <> (Markdown _ "") = m
(Markdown _ "") <> m = m
m1@(Markdown f1 s1) <> m2@(Markdown f2 s2)
| f1 == f2 = Markdown f1 $ s1 <> s2
| otherwise = m1 :|: m2
m1@(Markdown f1 s1) <> ms@(Markdown f2 s2 :|: m3)
| f1 == f2 = Markdown f1 (s1 <> s2) :|: m3
| otherwise = m1 :|: ms
ms@(m1 :|: Markdown f2 s2) <> m3@(Markdown f3 s3)
| f2 == f3 = m1 :|: Markdown f2 (s2 <> s3)
| otherwise = ms :|: m3
m1 <> m2 = m1 :|: m2
instance Monoid Markdown where mempty = unmarked ""
instance IsString Markdown where fromString = unmarked . T.pack
newtype FormatColor = FormatColor Color
deriving (Eq, Show)
instance FromJSON FormatColor where
parseJSON =
J.withText "FormatColor" $
fmap FormatColor . \case
"red" -> pure Red
"green" -> pure Green
"blue" -> pure Blue
"yellow" -> pure Yellow
"cyan" -> pure Cyan
"magenta" -> pure Magenta
"black" -> pure Black
"white" -> pure White
unexpected -> fail $ "unexpected FormatColor: " <> show unexpected
instance ToJSON FormatColor where
toJSON (FormatColor c) = case c of
Red -> "red"
Green -> "green"
Blue -> "blue"
Yellow -> "yellow"
Cyan -> "cyan"
Magenta -> "magenta"
Black -> "black"
White -> "white"
data FormattedText = FormattedText {format :: Maybe Format, text :: Text}
deriving (Eq, Show)
instance IsString FormattedText where
fromString = FormattedText Nothing . T.pack
type MarkdownList = [FormattedText]
data ParsedMarkdown = ParsedMarkdown {formattedText :: Maybe MarkdownList}
unmarked :: Text -> Markdown
unmarked = Markdown Nothing
parseMaybeMarkdownList :: Text -> Maybe MarkdownList
parseMaybeMarkdownList s = case ls of
[] -> Nothing
[l]
| T.null cmd -> Nothing
| isCmd -> Just [FormattedText (Just $ Command cmd) l]
where
(isCmd, cmd) = case T.uncons $ T.dropWhile (== ' ') l of
Just (c, rest) -> (c == '/', rest)
Nothing -> (False, "")
_
| all (isNothing . format) ml -> Nothing
| otherwise -> Just . reverse $ foldl' acc [] ml
where
ls = T.lines s
ml = intercalate ["\n"] $ map (markdownToList . parseMarkdown) ls
acc [] m = [m]
acc ms@(FormattedText f t : ms') ft@(FormattedText f' t')
| f == f' = FormattedText f (t <> t') : ms'
| otherwise = ft : ms
parseMarkdownList :: Text -> MarkdownList
parseMarkdownList = markdownToList . parseMarkdown
markdownToList :: Markdown -> MarkdownList
markdownToList (Markdown f s) = [FormattedText f s]
markdownToList (m1 :|: m2) = markdownToList m1 <> markdownToList m2
parseMarkdown :: Text -> Markdown
parseMarkdown s = fromRight (unmarked s) $ A.parseOnly (markdownP <* A.endOfInput) s
isSimplexLink :: Format -> Bool
isSimplexLink = \case
SimplexLink {} -> True
_ -> False
markdownP :: Parser Markdown
markdownP = mconcat <$> A.many' fragmentP
where
fragmentP :: Parser Markdown
fragmentP =
A.peekChar >>= \case
Just c -> case c of
' ' -> unmarked <$> A.takeWhile (== ' ')
'+' -> phoneP <|> wordP
'*' -> formattedP '*' Bold
'_' -> formattedP '_' Italic
'~' -> formattedP '~' StrikeThrough
'`' -> formattedP '`' Snippet
'#' -> A.char '#' *> secretP
'!' -> coloredP <|> wordP
'@' -> mentionP <|> wordP
'/' -> commandP <|> wordP
'[' -> sowLinkP <|> wordP
_
| isDigit c -> phoneP <|> wordP
| otherwise -> wordP
Nothing -> fail ""
formattedP :: Char -> Format -> Parser Markdown
formattedP c f = do
s <- A.char c *> A.takeTill (== c)
(A.char c $> md c f s) <|> noFormat (c `T.cons` s)
md :: Char -> Format -> Text -> Markdown
md c f s
| T.null s || T.head s == ' ' || T.last s == ' ' =
unmarked $ c `T.cons` s `T.snoc` c
| otherwise = markdown f s
secretP :: Parser Markdown
secretP = secret <$> A.takeWhile (== '#') <*> A.takeTill (== '#') <*> A.takeWhile (== '#')
secret :: Text -> Text -> Text -> Markdown
secret b s a
| T.null a || T.null s || T.head s == ' ' || T.last s == ' ' =
unmarked $ '#' `T.cons` ss
| otherwise = markdown Secret $ T.init ss
where
ss = b <> s <> a
coloredP :: Parser Markdown
coloredP = do
clr <- A.char '!' *> colorP <* A.space
s <- ((<>) <$> A.takeWhile1 (\c -> c /= ' ' && c /= '!') <*> A.takeTill (== '!')) <* A.char '!'
if T.null s || T.last s == ' '
then fail "not colored"
else pure $ markdown (colored clr) s
mentionP = prefixedStringP '@' displayNameTextP_ Mention
commandP = prefixedStringP '/' commandTextP Command
prefixedStringP pfx parser format = do
c <- A.char pfx *> A.peekChar'
(str, punct) <- parser
let origStr = if c == '\'' then '\'' `T.cons` str `T.snoc` '\'' else str
res = markdown (format str) (pfx `T.cons` origStr)
pure $ if T.null punct then res else res :|: unmarked punct
sowLinkP = do
t <- '[' `inParens` ']'
l <- '(' `inParens` ')'
let hasPunct = T.any (\c -> isPunctuation c && c /= '-' && c /= '_') t
when (hasPunct && t /= l && ("https://" <> t) /= l) $ fail "punctuation in hyperlink text"
f <- case strDecode $ encodeUtf8 l of
Right lnk@(ACL _ cLink) -> case cLink of
CLShort _ -> pure $ simplexUriFormat (Just t) lnk
CLFull _ -> fail "full SimpleX link in hyperlink"
Left _ -> case parseUri $ encodeUtf8 l of
Right _ -> pure $ HyperLink (Just t) l
Left e -> fail $ "not uri: " <> T.unpack e
pure $ markdown f $ T.concat ["[", t, "](", l, ")"]
inParens open close = A.char open *> A.takeWhile1 (/= close) <* A.char close
colorP =
A.anyChar >>= \case
'r' -> optional "ed" $> Red
'g' -> optional "reen" $> Green
'b' -> optional "lue" $> Blue
'y' -> optional "ellow" $> Yellow
'c' -> optional "yan" $> Cyan
'm' -> optional "agenta" $> Magenta
'1' -> pure Red
'2' -> pure Green
'3' -> pure Blue
'4' -> pure Yellow
'5' -> pure Cyan
'6' -> pure Magenta
_ -> fail "not color"
phoneP = do
country <- optional $ T.cons <$> A.char '+' <*> A.takeWhile1 isDigit
code <- optional $ conc4 <$> phoneSep <*> "(" <*> A.takeWhile1 isDigit <*> ")"
segments <- mconcat <$> A.many' ((<>) <$> phoneSep <*> A.takeWhile1 isDigit)
let s = fromMaybe "" country <> fromMaybe "" code <> segments
len = T.length s
if 7 <= len && len <= 22 then pure $ markdown Phone s else fail "not phone"
conc4 s1 s2 s3 s4 = s1 <> s2 <> s3 <> s4
phoneSep = " " <|> "-" <|> "." <|> ""
wordP :: Parser Markdown
wordP = wordMD <$> A.takeTill (== ' ')
wordMD :: Text -> Markdown
wordMD s
| T.null s = unmarked s
| isUri s' = case strDecode $ encodeUtf8 s of
Right cLink -> res $ markdown (simplexUriFormat Nothing cLink) s'
Left _ -> case parseUri $ encodeUtf8 s' of
Right _ -> res $ markdown Uri s'
Left _ -> unmarked s
| isDomain s' = res $ markdown Uri s'
| isEmail s' = res $ markdown Email s'
| otherwise = unmarked s
where
punct = T.takeWhileEnd isPunctuation' s
s' = T.dropWhileEnd isPunctuation' s
res md' = if T.null punct then md' else md' :|: unmarked punct
isPunctuation' = \case
'/' -> False
')' -> False
c -> isPunctuation c
isUri s = T.length s >= 10 && any (`T.isPrefixOf` s) ["http://", "https://", "simplex:/"]
-- matches what is likely to be a domain, not all valid domain names
isDomain s = case T.splitOn "." s of
[name, tld] -> isDomain_ name tld
[sub, name, tld] -> T.length sub >= 3 && T.length sub <= 8 && isDomain_ name tld
_ -> False
where
isDomain_ name tld =
(let n = T.length name in n >= 1 && n <= 24)
&& (let n = T.length tld in n >= 2 && n <= 8)
&& (let p c = isAscii c && isAlpha c in T.all p name && T.all p tld)
isEmail s = T.any (== '@') s && Email.isValid (encodeUtf8 s)
noFormat = pure . unmarked
simplexUriFormat :: Maybe Text -> AConnectionLink -> Format
simplexUriFormat showText = \case
ACL m (CLFull cReq) -> case cReq of
CRContactUri crData -> SimplexLink showText (linkType' crData) cLink $ uriHosts crData
CRInvitationUri crData _ -> SimplexLink showText XLInvitation cLink $ uriHosts crData
where
cLink = ACL m $ CLFull $ simplexConnReqUri cReq
uriHosts ConnReqUriData {crSmpQueues} = L.map strEncodeText $ sconcat $ L.map (host . qServer) crSmpQueues
linkType' ConnReqUriData {crClientData} = case crClientData >>= decodeJSON of
Just (CRDataGroup _) -> XLGroup
Nothing -> XLContact
ACL m (CLShort sLnk) -> case sLnk of
CSLContact _ ct srv _ -> SimplexLink showText (linkType' ct) cLink $ uriHosts srv
CSLInvitation _ srv _ _ -> SimplexLink showText XLInvitation cLink $ uriHosts srv
where
cLink = ACL m $ CLShort $ simplexShortLink sLnk
uriHosts srv = L.map strEncodeText $ host srv
linkType' = \case
CCTGroup -> XLGroup
CCTChannel -> XLChannel
CCTContact -> XLContact
strEncodeText :: StrEncoding a => a -> Text
strEncodeText = safeDecodeUtf8 . strEncode
parseUri :: B.ByteString -> Either Text U.URI
parseUri s = case U.parseURI U.laxURIParserOptions s of
Left e -> Left $ "Invalid URI: " <> tshow e
Right uri@U.URI {uriScheme = U.Scheme sch, uriAuthority}
| sch /= "http" && sch /= "https" -> Left $ "Unsupported URI scheme: " <> safeDecodeUtf8 sch
| otherwise -> case uriAuthority of
Nothing -> Left "No URI host"
Just U.Authority {authorityHost = U.Host h}
| '.' `B.notElem` h -> Left $ "Invalid URI host: " <> safeDecodeUtf8 h
| otherwise -> Right uri
sanitizeUri :: U.URI -> Maybe U.URI
sanitizeUri uri@U.URI {uriQuery = U.Query originalQS} =
let sanitizedQS = filter (\(p, _) -> p == "q" || p == "search") originalQS
in if length sanitizedQS == length originalQS
then Nothing
else Just $ uri {U.uriQuery = U.Query sanitizedQS}
markdownText :: FormattedText -> Text
markdownText (FormattedText f_ t) = case f_ of
Nothing -> t
Just f -> case f of
Bold -> around '*'
Italic -> around '_'
StrikeThrough -> around '~'
Snippet -> around '`'
Secret -> around '#'
Colored (FormatColor c) -> color c
Uri -> t
HyperLink {} -> t
SimplexLink {} -> t
Mention _ -> t
Command _ -> t
Email -> t
Phone -> t
Unknown _ -> t
where
around c = c `T.cons` t `T.snoc` c
color c = case colorStr c of
Just cStr -> cStr <> t `T.snoc` '!'
Nothing -> t
colorStr = \case
Red -> Just "!1 "
Green -> Just "!2 "
Blue -> Just "!3 "
Yellow -> Just "!4 "
Cyan -> Just "!5 "
Magenta -> Just "!6 "
Black -> Nothing
White -> Nothing
displayNameTextP :: Parser Text
displayNameTextP = displayNameTextP_ >>= \(t, sfx) -> if T.null sfx then pure t else fail "Name ends with punctuation"
{-# INLINE displayNameTextP #-}
displayNameTextP_ :: Parser (Text, Text)
displayNameTextP_ = (,"") <$> quoted '\'' <|> splitPunctuation <$> takeNameTill isSpace
where
takeNameTill p =
A.peekChar' >>= \c ->
if refChar c then A.takeTill p else fail "invalid first character in display name"
splitPunctuation s = (T.dropWhileEnd isPunctuation s, T.takeWhileEnd isPunctuation s)
quoted c = A.char c *> takeNameTill (== c) <* A.char c
refChar c = c > ' ' && c /= '#' && c /= '@' && c /= '\''
commandTextP :: Parser (Text, Text)
commandTextP = do
(cmd, punct) <- displayNameTextP_
case T.words cmd of
(keyword : _) | T.all (\c -> isAlpha c || isDigit c || c == '_') keyword -> pure (cmd, punct)
_ -> fail "invalid command keyword"
-- quotes names that contain spaces or end on punctuation
viewName :: Text -> Text
viewName s = if T.any isSpace s || maybe False (isPunctuation . snd) (T.unsnoc s) then "'" <> s <> "'" else s
$(JQ.deriveJSON (enumJSON $ dropPrefix "XL") ''SimplexLinkType)
$(JQ.deriveToJSON (sumTypeJSON fstToLower) ''Format)
instance FromJSON Format where
parseJSON v = $(JQ.mkParseJSON (sumTypeJSON fstToLower) ''Format) v <|> pure (Unknown v)
$(JQ.deriveJSON defaultJSON ''FormattedText)
$(JQ.deriveToJSON defaultJSON ''ParsedMarkdown)