Files
simplex-chat/src/Simplex/Chat/Markdown.hs
Evgeny 2a43a02af3 core, ui: support trailing punctuation for mentions, URIs (also support domains), and email addresses (#5888)
* core: improve markdown parser for mentions, URIs, and email addresses

* ui
2025-05-12 11:22:35 +01:00

348 lines
12 KiB
Haskell

{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TupleSections #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use newtype instead of data" #-}
module Simplex.Chat.Markdown where
import Control.Applicative (optional, (<|>))
import Data.Aeson (FromJSON, ToJSON)
import qualified Data.Aeson as J
import qualified Data.Aeson.TH as JQ
import Data.Attoparsec.Text (Parser)
import qualified Data.Attoparsec.Text as A
import Data.Char (isAlpha, isAscii, isDigit, isPunctuation, isSpace)
import Data.Either (fromRight)
import Data.Functor (($>))
import Data.List (foldl', intercalate)
import Data.List.NonEmpty (NonEmpty)
import qualified Data.List.NonEmpty as L
import Data.Maybe (fromMaybe, isNothing, mapMaybe)
import Data.Semigroup (sconcat)
import Data.String
import Data.Text (Text)
import qualified Data.Text as T
import Data.Text.Encoding (encodeUtf8)
import Simplex.Chat.Types
import Simplex.Messaging.Agent.Protocol (AConnectionLink (..), ConnReqUriData (..), ConnShortLink (..), ConnectionLink (..), ConnectionRequestUri (..), ContactConnType (..), SMPQueue (..), simplexConnReqUri, simplexShortLink)
import Simplex.Messaging.Encoding.String
import Simplex.Messaging.Parsers (defaultJSON, dropPrefix, enumJSON, fstToLower, sumTypeJSON)
import Simplex.Messaging.Protocol (ProtocolServer (..))
import Simplex.Messaging.Util (decodeJSON, safeDecodeUtf8)
import System.Console.ANSI.Types
import qualified Text.Email.Validate as Email
data Markdown = Markdown (Maybe Format) Text | Markdown :|: Markdown
deriving (Eq, Show)
data Format
= Bold
| Italic
| StrikeThrough
| Snippet
| Secret
| Colored {color :: FormatColor}
| Uri
| SimplexLink {linkType :: SimplexLinkType, simplexUri :: AConnectionLink, smpHosts :: NonEmpty Text}
| Mention {memberName :: Text}
| Email
| Phone
deriving (Eq, Show)
mentionedNames :: MarkdownList -> [Text]
mentionedNames = mapMaybe (\(FormattedText f _) -> mentionedName =<< f)
where
mentionedName = \case
Mention name -> Just name
_ -> Nothing
data SimplexLinkType = XLContact | XLInvitation | XLGroup | XLChannel
deriving (Eq, Show)
colored :: Color -> Format
colored = Colored . FormatColor
{-# INLINE colored #-}
markdown :: Format -> Text -> Markdown
markdown = Markdown . Just
{-# INLINE markdown #-}
instance Semigroup Markdown where
m <> (Markdown _ "") = m
(Markdown _ "") <> m = m
m1@(Markdown f1 s1) <> m2@(Markdown f2 s2)
| f1 == f2 = Markdown f1 $ s1 <> s2
| otherwise = m1 :|: m2
m1@(Markdown f1 s1) <> ms@(Markdown f2 s2 :|: m3)
| f1 == f2 = Markdown f1 (s1 <> s2) :|: m3
| otherwise = m1 :|: ms
ms@(m1 :|: Markdown f2 s2) <> m3@(Markdown f3 s3)
| f2 == f3 = m1 :|: Markdown f2 (s2 <> s3)
| otherwise = ms :|: m3
m1 <> m2 = m1 :|: m2
instance Monoid Markdown where mempty = unmarked ""
instance IsString Markdown where fromString = unmarked . T.pack
newtype FormatColor = FormatColor Color
deriving (Eq, Show)
instance FromJSON FormatColor where
parseJSON =
J.withText "FormatColor" $
fmap FormatColor . \case
"red" -> pure Red
"green" -> pure Green
"blue" -> pure Blue
"yellow" -> pure Yellow
"cyan" -> pure Cyan
"magenta" -> pure Magenta
"black" -> pure Black
"white" -> pure White
unexpected -> fail $ "unexpected FormatColor: " <> show unexpected
instance ToJSON FormatColor where
toJSON (FormatColor c) = case c of
Red -> "red"
Green -> "green"
Blue -> "blue"
Yellow -> "yellow"
Cyan -> "cyan"
Magenta -> "magenta"
Black -> "black"
White -> "white"
data FormattedText = FormattedText {format :: Maybe Format, text :: Text}
deriving (Eq, Show)
instance IsString FormattedText where
fromString = FormattedText Nothing . T.pack
type MarkdownList = [FormattedText]
data ParsedMarkdown = ParsedMarkdown {formattedText :: Maybe MarkdownList}
unmarked :: Text -> Markdown
unmarked = Markdown Nothing
parseMaybeMarkdownList :: Text -> Maybe MarkdownList
parseMaybeMarkdownList s
| all (isNothing . format) ml = Nothing
| otherwise = Just . reverse $ foldl' acc [] ml
where
ml = intercalate ["\n"] . map (markdownToList . parseMarkdown) $ T.lines s
acc [] m = [m]
acc ms@(FormattedText f t : ms') ft@(FormattedText f' t')
| f == f' = FormattedText f (t <> t') : ms'
| otherwise = ft : ms
parseMarkdownList :: Text -> MarkdownList
parseMarkdownList = markdownToList . parseMarkdown
markdownToList :: Markdown -> MarkdownList
markdownToList (Markdown f s) = [FormattedText f s]
markdownToList (m1 :|: m2) = markdownToList m1 <> markdownToList m2
parseMarkdown :: Text -> Markdown
parseMarkdown s = fromRight (unmarked s) $ A.parseOnly (markdownP <* A.endOfInput) s
isSimplexLink :: Format -> Bool
isSimplexLink = \case
SimplexLink {} -> True
_ -> False
markdownP :: Parser Markdown
markdownP = mconcat <$> A.many' fragmentP
where
fragmentP :: Parser Markdown
fragmentP =
A.peekChar >>= \case
Just c -> case c of
' ' -> unmarked <$> A.takeWhile (== ' ')
'+' -> phoneP <|> wordP
'*' -> formattedP '*' Bold
'_' -> formattedP '_' Italic
'~' -> formattedP '~' StrikeThrough
'`' -> formattedP '`' Snippet
'#' -> A.char '#' *> secretP
'!' -> coloredP <|> wordP
'@' -> mentionP <|> wordP
_
| isDigit c -> phoneP <|> wordP
| otherwise -> wordP
Nothing -> fail ""
formattedP :: Char -> Format -> Parser Markdown
formattedP c f = do
s <- A.char c *> A.takeTill (== c)
(A.char c $> md c f s) <|> noFormat (c `T.cons` s)
md :: Char -> Format -> Text -> Markdown
md c f s
| T.null s || T.head s == ' ' || T.last s == ' ' =
unmarked $ c `T.cons` s `T.snoc` c
| otherwise = markdown f s
secretP :: Parser Markdown
secretP = secret <$> A.takeWhile (== '#') <*> A.takeTill (== '#') <*> A.takeWhile (== '#')
secret :: Text -> Text -> Text -> Markdown
secret b s a
| T.null a || T.null s || T.head s == ' ' || T.last s == ' ' =
unmarked $ '#' `T.cons` ss
| otherwise = markdown Secret $ T.init ss
where
ss = b <> s <> a
coloredP :: Parser Markdown
coloredP = do
clr <- A.char '!' *> colorP <* A.space
s <- ((<>) <$> A.takeWhile1 (\c -> c /= ' ' && c /= '!') <*> A.takeTill (== '!')) <* A.char '!'
if T.null s || T.last s == ' '
then fail "not colored"
else pure $ markdown (colored clr) s
mentionP = do
c <- A.char '@' *> A.peekChar'
(name, punct) <- displayNameTextP_
let sName = if c == '\'' then '\'' `T.cons` name `T.snoc` '\'' else name
mention = markdown (Mention name) ('@' `T.cons` sName)
pure $ if T.null punct then mention else mention :|: unmarked punct
colorP =
A.anyChar >>= \case
'r' -> optional "ed" $> Red
'g' -> optional "reen" $> Green
'b' -> optional "lue" $> Blue
'y' -> optional "ellow" $> Yellow
'c' -> optional "yan" $> Cyan
'm' -> optional "agenta" $> Magenta
'1' -> pure Red
'2' -> pure Green
'3' -> pure Blue
'4' -> pure Yellow
'5' -> pure Cyan
'6' -> pure Magenta
_ -> fail "not color"
phoneP = do
country <- optional $ T.cons <$> A.char '+' <*> A.takeWhile1 isDigit
code <- optional $ conc4 <$> phoneSep <*> "(" <*> A.takeWhile1 isDigit <*> ")"
segments <- mconcat <$> A.many' ((<>) <$> phoneSep <*> A.takeWhile1 isDigit)
let s = fromMaybe "" country <> fromMaybe "" code <> segments
len = T.length s
if 7 <= len && len <= 22 then pure $ markdown Phone s else fail "not phone"
conc4 s1 s2 s3 s4 = s1 <> s2 <> s3 <> s4
phoneSep = " " <|> "-" <|> "." <|> ""
wordP :: Parser Markdown
wordP = wordMD <$> A.takeTill (== ' ')
wordMD :: Text -> Markdown
wordMD s
| T.null s = unmarked s
| isUri s' = res $ uriMarkdown s'
| isDomain s' = res $ markdown Uri s'
| isEmail s' = res $ markdown Email s'
| otherwise = unmarked s
where
punct = T.takeWhileEnd isPunctuation' s
s' = T.dropWhileEnd isPunctuation' s
res md' = if T.null punct then md' else md' :|: unmarked punct
isPunctuation' = \case
'/' -> False
')' -> False
c -> isPunctuation c
uriMarkdown s = case strDecode $ encodeUtf8 s of
Right cLink -> markdown (simplexUriFormat cLink) s
_ -> markdown Uri s
isUri s = T.length s >= 10 && any (`T.isPrefixOf` s) ["http://", "https://", "simplex:/"]
-- matches what is likely to be a domain, not all valid domain names
isDomain s = case T.splitOn "." s of
[name, tld] -> isDomain_ name tld
[sub, name, tld] -> T.length sub >= 3 && T.length sub <= 8 && isDomain_ name tld
_ -> False
where
isDomain_ name tld =
(let n = T.length name in n >= 1 && n <= 24)
&& (let n = T.length tld in n >= 2 && n <= 8)
&& (let p c = isAscii c && isAlpha c in T.all p name && T.all p tld)
isEmail s = T.any (== '@') s && Email.isValid (encodeUtf8 s)
noFormat = pure . unmarked
simplexUriFormat :: AConnectionLink -> Format
simplexUriFormat = \case
ACL m (CLFull cReq) -> case cReq of
CRContactUri crData -> SimplexLink (linkType' crData) cLink $ uriHosts crData
CRInvitationUri crData _ -> SimplexLink XLInvitation cLink $ uriHosts crData
where
cLink = ACL m $ CLFull $ simplexConnReqUri cReq
uriHosts ConnReqUriData {crSmpQueues} = L.map strEncodeText $ sconcat $ L.map (host . qServer) crSmpQueues
linkType' ConnReqUriData {crClientData} = case crClientData >>= decodeJSON of
Just (CRDataGroup _) -> XLGroup
Nothing -> XLContact
ACL m (CLShort sLnk) -> case sLnk of
CSLContact _ ct srv _ -> SimplexLink (linkType' ct) cLink $ uriHosts srv
CSLInvitation _ srv _ _ -> SimplexLink XLInvitation cLink $ uriHosts srv
where
cLink = ACL m $ CLShort $ simplexShortLink sLnk
uriHosts srv = L.map strEncodeText $ host srv
linkType' = \case
CCTGroup -> XLGroup
CCTChannel -> XLChannel
CCTContact -> XLContact
strEncodeText :: StrEncoding a => a -> Text
strEncodeText = safeDecodeUtf8 . strEncode
markdownText :: FormattedText -> Text
markdownText (FormattedText f_ t) = case f_ of
Nothing -> t
Just f -> case f of
Bold -> around '*'
Italic -> around '_'
StrikeThrough -> around '~'
Snippet -> around '`'
Secret -> around '#'
Colored (FormatColor c) -> color c
Uri -> t
SimplexLink {} -> t
Mention _ -> t
Email -> t
Phone -> t
where
around c = c `T.cons` t `T.snoc` c
color c = case colorStr c of
Just cStr -> cStr <> t `T.snoc` '!'
Nothing -> t
colorStr = \case
Red -> Just "!1 "
Green -> Just "!2 "
Blue -> Just "!3 "
Yellow -> Just "!4 "
Cyan -> Just "!5 "
Magenta -> Just "!6 "
Black -> Nothing
White -> Nothing
displayNameTextP :: Parser Text
displayNameTextP = displayNameTextP_ >>= \(t, sfx) -> if T.null sfx then pure t else fail "Name ends with punctuation"
{-# INLINE displayNameTextP #-}
displayNameTextP_ :: Parser (Text, Text)
displayNameTextP_ = (,"") <$> quoted '\'' <|> splitPunctuation <$> takeNameTill isSpace
where
takeNameTill p =
A.peekChar' >>= \c ->
if refChar c then A.takeTill p else fail "invalid first character in display name"
splitPunctuation s = (T.dropWhileEnd isPunctuation s, T.takeWhileEnd isPunctuation s)
quoted c = A.char c *> takeNameTill (== c) <* A.char c
refChar c = c > ' ' && c /= '#' && c /= '@' && c /= '\''
-- quotes names that contain spaces or end on punctuation
viewName :: Text -> Text
viewName s = if T.any isSpace s || maybe False (isPunctuation . snd) (T.unsnoc s) then "'" <> s <> "'" else s
$(JQ.deriveJSON (enumJSON $ dropPrefix "XL") ''SimplexLinkType)
$(JQ.deriveJSON (sumTypeJSON fstToLower) ''Format)
$(JQ.deriveJSON defaultJSON ''FormattedText)
$(JQ.deriveToJSON defaultJSON ''ParsedMarkdown)