From 4c782d319182a974e7390381cb45e1d7f080d49b Mon Sep 17 00:00:00 2001 From: Ritiek Malhotra Date: Wed, 7 Jan 2026 10:49:21 +0000 Subject: [PATCH 01/14] docker: build multi-arch images for x86-64 and arm64 (#1540) --- .github/workflows/docker-image.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 6e3e25e82..c3e312050 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -39,9 +39,17 @@ jobs: type=semver,pattern=v{{major}}.{{minor}} type=semver,pattern=v{{major}} + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Build and push Docker image uses: simplex-chat/docker-build-push-action@v6 with: + context: . + platforms: linux/amd64,linux/arm64 push: true build-args: | APP=${{ matrix.app }} From 0f3b8a4a165925a97e46fbb57b730640a6f7cbff Mon Sep 17 00:00:00 2001 From: Evgeny Poberezkin Date: Sat, 24 Jan 2026 18:05:01 +0000 Subject: [PATCH 02/14] docs: update contributing --- contributing/CODE.md | 5 +++++ contributing/README.md | 1 + 2 files changed, 6 insertions(+) diff --git a/contributing/CODE.md b/contributing/CODE.md index 415d40475..85b4ce96c 100644 --- a/contributing/CODE.md +++ b/contributing/CODE.md @@ -36,6 +36,11 @@ Some files that use CPP language extension cannot be formatted as a whole, so in - Do not add comments like "wire format encoding" (Encoding class is always wire format) or "check if X" when the function name already says that - Assume a competent Haskell reader +**Diff and refactoring:** +- Avoid unnecessary changes and code movements +- Never do refactoring unless it substantially reduces cost of solving the current problem, including the cost of refactoring +- Aim to minimize the code changes - do what is minimally required to solve users' problems + ### Haskell Extensions - `StrictData` enabled by default - Use STM for safe 
concurrency diff --git a/contributing/README.md b/contributing/README.md index 8333829ce..d0771545d 100644 --- a/contributing/README.md +++ b/contributing/README.md @@ -17,6 +17,7 @@ Please discuss the problem you want to solve and your detailed implementation pl This files can be used with LLM prompts, e.g. if you use Claude Code you can create CLAUDE.md file in project root importing content from these files: ```markdown +@README.md @contributing/PROJECT.md @contributing/CODE.md ``` From 66cc06738ea83633e0eead63ffc25fb246d23410 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 26 Jan 2026 00:11:50 +0000 Subject: [PATCH 03/14] agent: fix tests for short links (#1703) --- src/Simplex/Messaging/Agent/Protocol.hs | 2 +- tests/AgentTests/FunctionalAPITests.hs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Simplex/Messaging/Agent/Protocol.hs b/src/Simplex/Messaging/Agent/Protocol.hs index 46d5ebaa9..d9aafe1ef 100644 --- a/src/Simplex/Messaging/Agent/Protocol.hs +++ b/src/Simplex/Messaging/Agent/Protocol.hs @@ -1814,7 +1814,7 @@ instance ConnectionModeI c => Encoding (FixedLinkData c) where smpEncode (agentVRange, rootKey, linkConnReq) <> maybe "" smpEncode linkEntityId smpP = do (agentVRange, rootKey, linkConnReq) <- smpP - linkEntityId <- (smpP <|> pure Nothing) <* A.takeByteString -- ignoring tail for forward compatibility with the future link data encoding + linkEntityId <- optional smpP <* A.takeByteString -- ignoring tail for forward compatibility with the future link data encoding pure FixedLinkData {agentVRange, rootKey, linkConnReq, linkEntityId} instance ConnectionModeI c => Encoding (ConnLinkData c) where diff --git a/tests/AgentTests/FunctionalAPITests.hs b/tests/AgentTests/FunctionalAPITests.hs index be873befb..77cd9b6bd 100644 --- a/tests/AgentTests/FunctionalAPITests.hs +++ b/tests/AgentTests/FunctionalAPITests.hs @@ -1553,16 +1553,16 @@ testContactShortLinkRestart ps = withAgentClients2 $ \a b -> do updatedCtData = 
UserContactData {direct = False, owners = [], relays = [relayLink1, relayLink2], userData = updatedData} updatedLinkData = UserContactLinkData updatedCtData withSmpServer ps $ do - (connReq', ContactLinkData _ userCtData') <- runRight $ getConnShortLink b 1 shortLink + (fd', ContactLinkData _ userCtData') <- runRight $ getConnShortLink b 1 shortLink strDecode (strEncode shortLink) `shouldBe` Right shortLink - connReq' `shouldBe` connReq + linkConnReq fd' `shouldBe` connReq userCtData' `shouldBe` userCtData -- update user data shortLink' <- runRight $ setConnShortLink a contactId SCMContact updatedLinkData Nothing shortLink' `shouldBe` shortLink withSmpServer ps $ do - (connReq4, ContactLinkData _ updatedCtData') <- runRight $ getConnShortLink b 1 shortLink - connReq4 `shouldBe` connReq + (fd4, ContactLinkData _ updatedCtData') <- runRight $ getConnShortLink b 1 shortLink + linkConnReq fd4 `shouldBe` connReq updatedCtData' `shouldBe` updatedCtData testAddContactShortLinkRestart :: HasCallStack => (ASrvTransport, AStoreType) -> IO () @@ -1578,16 +1578,16 @@ testAddContactShortLinkRestart ps = withAgentClients2 $ \a b -> do updatedCtData = UserContactData {direct = False, owners = [], relays = [relayLink1, relayLink2], userData = updatedData} updatedLinkData = UserContactLinkData updatedCtData withSmpServer ps $ do - (connReq', ContactLinkData _ userCtData') <- runRight $ getConnShortLink b 1 shortLink + (fd', ContactLinkData _ userCtData') <- runRight $ getConnShortLink b 1 shortLink strDecode (strEncode shortLink) `shouldBe` Right shortLink - connReq' `shouldBe` connReq + linkConnReq fd' `shouldBe` connReq userCtData' `shouldBe` userCtData -- update user data shortLink' <- runRight $ setConnShortLink a contactId SCMContact updatedLinkData Nothing shortLink' `shouldBe` shortLink withSmpServer ps $ do - (connReq4, ContactLinkData _ updatedCtData') <- runRight $ getConnShortLink b 1 shortLink - connReq4 `shouldBe` connReq + (fd4, ContactLinkData _ updatedCtData') <- 
runRight $ getConnShortLink b 1 shortLink + linkConnReq fd4 `shouldBe` connReq updatedCtData' `shouldBe` updatedCtData testOldContactQueueShortLink :: HasCallStack => (ASrvTransport, AStoreType) -> IO () From d10e05b7968e7f20313bf6f5b07f2290d9420e0d Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 27 Jan 2026 10:54:13 +0000 Subject: [PATCH 04/14] agent: split creating connection to two steps to prepare connection link in advance (#1704) * agent: split creating connection to two steps to prepare connection link in advance * linkEntityId, newOwnerAuth * simplify --- contributing/CODE.md | 5 ++ src/Simplex/Messaging/Agent.hs | 98 ++++++++++++++++++----- src/Simplex/Messaging/Agent/Protocol.hs | 18 +++++ src/Simplex/Messaging/Crypto/ShortLink.hs | 25 ++++-- tests/AgentTests/FunctionalAPITests.hs | 31 +++++++ tests/AgentTests/ShortLinkTests.hs | 14 ++-- 6 files changed, 161 insertions(+), 30 deletions(-) diff --git a/contributing/CODE.md b/contributing/CODE.md index 85b4ce96c..c644a1606 100644 --- a/contributing/CODE.md +++ b/contributing/CODE.md @@ -41,6 +41,11 @@ Some files that use CPP language extension cannot be formatted as a whole, so in - Never do refactoring unless it substantially reduces cost of solving the current problem, including the cost of refactoring - Aim to minimize the code changes - do what is minimally required to solve users' problems +**Code analysis and review:** +- Trace data flows end-to-end: from origin, through storage/parameters, to consumption. Flag values that are discarded and reconstructed from partial data (e.g. extracted from a URI missing original fields) — this is usually a bug. +- Read implementations of called functions, not just signatures — if duplication involves a called function, check whether decomposing it resolves the duplication. +- Do not save time on analysis. Read every function in the data flow even when the interface seems clear — wrong assumptions about internals are the main source of missed bugs. 
+ ### Haskell Extensions - `StrictData` enabled by default - Use STM for safe concurrency diff --git a/src/Simplex/Messaging/Agent.hs b/src/Simplex/Messaging/Agent.hs index 116c32b70..5623496ca 100644 --- a/src/Simplex/Messaging/Agent.hs +++ b/src/Simplex/Messaging/Agent.hs @@ -59,6 +59,8 @@ module Simplex.Messaging.Agent deleteConnectionAsync, deleteConnectionsAsync, createConnection, + prepareConnectionLink, + createConnectionForLink, setConnShortLink, deleteConnShortLink, getConnShortLink, @@ -398,6 +400,19 @@ createConnection :: ConnectionModeI c => AgentClient -> NetworkRequestMode -> Us createConnection c nm userId enableNtfs checkNotices = withAgentEnv c .::. newConn c nm userId enableNtfs checkNotices {-# INLINE createConnection #-} +-- | Prepare connection link for contact mode (no network call). +-- Returns root key pair (for signing OwnerAuth), the created link, and internal params. +-- The link address is fully determined at this point. +prepareConnectionLink :: AgentClient -> UserId -> Maybe ByteString -> Bool -> Maybe CRClientData -> AE (C.KeyPairEd25519, CreatedConnLink 'CMContact, PreparedLinkParams) +prepareConnectionLink c userId linkEntityId checkNotices = withAgentEnv c . prepareConnectionLink' c userId linkEntityId checkNotices +{-# INLINE prepareConnectionLink #-} + +-- | Create connection for prepared link (single network call). +-- Validates that server response matches the prepared link. +createConnectionForLink :: AgentClient -> NetworkRequestMode -> UserId -> Bool -> CreatedConnLink 'CMContact -> PreparedLinkParams -> UserConnLinkData 'CMContact -> CR.InitialKeys -> SubscriptionMode -> AE ConnId +createConnectionForLink c nm userId enableNtfs = withAgentEnv c .::. 
createConnectionForLink' c nm userId enableNtfs +{-# INLINE createConnectionForLink #-} + -- | Create or update user's contact connection short link setConnShortLink :: AgentClient -> NetworkRequestMode -> ConnId -> SConnectionMode c -> UserConnLinkData c -> Maybe CRClientData -> AE (ConnShortLink c) setConnShortLink c = withAgentEnv c .::. setConnShortLink' c @@ -902,6 +917,66 @@ newConn c nm userId enableNtfs checkNotices cMode linkData_ clientData pqInitKey <$> newRcvConnSrv c nm userId connId enableNtfs cMode linkData_ clientData pqInitKeys subMode srv `catchE` \e -> withStore' c (`deleteConnRecord` connId) >> throwE e +-- | Prepare connection link for contact mode (no network, no database). +-- Generates all cryptographic material and returns the link that will be created. +prepareConnectionLink' :: AgentClient -> UserId -> Maybe ByteString -> Bool -> Maybe CRClientData -> AM (C.KeyPairEd25519, CreatedConnLink 'CMContact, PreparedLinkParams) +prepareConnectionLink' c userId linkEntityId checkNotices clientData = do + g <- asks random + plpSrvWithAuth@(ProtoServerWithAuth srv _) <- getSMPServer c userId + when checkNotices $ checkClientNotices c plpSrvWithAuth + AgentConfig {smpClientVRange, smpAgentVRange} <- asks config + plpNonce@(C.CbNonce corrId) <- atomically $ C.randomCbNonce g + sigKeys@(_, plpRootPrivKey) <- atomically $ C.generateKeyPair g + plpQueueE2EKeys@(e2ePubKey, _) <- atomically $ C.generateKeyPair g + let sndId = SMP.EntityId $ B.take 24 $ C.sha3_384 corrId + qUri = SMPQueueUri smpClientVRange $ SMPQueueAddress srv sndId e2ePubKey (Just QMContact) + connReq = CRContactUri $ ConnReqUriData SSSimplex smpAgentVRange [qUri] clientData + (plpLinkKey, plpSignedFixedData) = SL.encodeSignFixedData sigKeys smpAgentVRange connReq linkEntityId + ccLink = CCLink connReq $ Just $ CSLContact SLSServer CCTContact srv plpLinkKey + params = PreparedLinkParams {plpNonce, plpQueueE2EKeys, plpLinkKey, plpRootPrivKey, plpSignedFixedData, plpSrvWithAuth} + pure 
(sigKeys, ccLink, params) + +-- | Create connection for prepared link (single network call). +createConnectionForLink' :: AgentClient -> NetworkRequestMode -> UserId -> Bool -> CreatedConnLink 'CMContact -> PreparedLinkParams -> UserConnLinkData 'CMContact -> CR.InitialKeys -> SubscriptionMode -> AM ConnId +createConnectionForLink' c nm userId enableNtfs (CCLink connReq _) PreparedLinkParams {plpNonce, plpQueueE2EKeys, plpLinkKey, plpRootPrivKey, plpSignedFixedData, plpSrvWithAuth} userLinkData pqInitKeys subMode = do + g <- asks random + AgentConfig {smpAgentVRange} <- asks config + case pqInitKeys of + CR.IKUsePQ -> throwE $ CMD PROHIBITED "createConnectionForLink" + _ -> pure () + connId <- newConnNoQueues c userId enableNtfs SCMContact (CR.connPQEncryption pqInitKeys) + let CRContactUri ConnReqUriData {crSmpQueues = SMPQueueUri _ SMPQueueAddress {senderId = sndId} :| _} = connReq + md = SL.encodeSignUserData SCMContact plpRootPrivKey smpAgentVRange userLinkData + linkData = (plpSignedFixedData, md) + qd <- encryptContactLinkData g plpRootPrivKey plpLinkKey sndId linkData + (_, qUri) <- + createRcvQueue c nm userId connId plpSrvWithAuth enableNtfs subMode (Just plpNonce) qd plpQueueE2EKeys + `catchE` \e -> withStore' c (`deleteConnRecord` connId) >> throwE e + let SMPQueueUri _ SMPQueueAddress {senderId = actualSndId} = qUri + unless (actualSndId == sndId) $ throwE $ INTERNAL "createConnectionForLink: sender ID mismatch" + pure connId + +-- | Encrypt signed link data for contact mode. +encryptContactLinkData :: TVar ChaChaDRG -> C.PrivateKeyEd25519 -> LinkKey -> SMP.SenderId -> (ByteString, ByteString) -> AM ClntQueueReqData +encryptContactLinkData g privSigKey linkKey sndId linkData = do + let (linkId, k) = SL.contactShortLinkKdf linkKey + srvData <- liftError id $ SL.encryptLinkData g k linkData + pure $ CQRContact $ Just CQRData {linkKey, privSigKey, srvReq = (linkId, (sndId, srvData))} + +-- | Shared helper: create receive queue and set up subscriptions. 
+createRcvQueue :: AgentClient -> NetworkRequestMode -> UserId -> ConnId -> SMPServerWithAuth -> Bool -> SubscriptionMode -> Maybe C.CbNonce -> ClntQueueReqData -> C.KeyPairX25519 -> AM (RcvQueue, SMPQueueUri) +createRcvQueue c nm userId connId srvWithAuth@(ProtoServerWithAuth srv _) enableNtfs subMode nonce_ qd e2eKeys = do + AgentConfig {smpClientVRange = vr} <- asks config + ntfServer_ <- if enableNtfs then newQueueNtfServer else pure Nothing + (rq, qUri, tSess, sessId) <- + newRcvQueue_ c nm userId connId srvWithAuth vr qd (isJust ntfServer_) subMode nonce_ e2eKeys + `catchAllErrors` \e -> liftIO (print e) >> throwE e + atomically $ incSMPServerStat c userId srv connCreated + rq' <- withStore c $ \db -> updateNewConnRcv db connId rq subMode + lift . when (subMode == SMSubscribe) $ addNewQueueSubscription c rq' tSess sessId + mapM_ (newQueueNtfSubscription c rq') ntfServer_ + pure (rq', qUri) + checkClientNotices :: AgentClient -> SMPServerWithAuth -> AM () checkClientNotices AgentClient {clientNotices, presetServers} (ProtoServerWithAuth srv@(ProtocolServer {host}) _) = do notices <- readTVarIO clientNotices @@ -978,7 +1053,7 @@ setConnShortLink' c nm connId cMode userLinkData clientData = sigKeys@(_, privSigKey) <- atomically $ C.generateKeyPair @'C.Ed25519 g let qUri = SMPQueueUri vr $ (rcvSMPQueueAddress rq) {queueMode = Just QMContact} connReq = CRContactUri $ ConnReqUriData SSSimplex smpAgentVRange [qUri] clientData - (linkKey, linkData) = SL.encodeSignLinkData sigKeys smpAgentVRange connReq ud + (linkKey, linkData) = SL.encodeSignLinkData sigKeys smpAgentVRange connReq Nothing ud (linkId, k) = SL.contactShortLinkKdf linkKey srvData <- liftError id $ SL.encryptLinkData g k linkData let slCreds = ShortLinkCreds linkId linkKey privSigKey Nothing (fst srvData) @@ -1065,25 +1140,15 @@ newRcvConnSrv c nm userId connId enableNtfs cMode userLinkData_ clientData pqIni case userLinkData_ of Just d -> do (nonce, qUri, cReq, qd) <- prepareLinkData d $ fst e2eKeys - 
(rq, qUri') <- createRcvQueue (Just nonce) qd e2eKeys + (rq, qUri') <- createRcvQueue c nm userId connId srvWithAuth enableNtfs subMode (Just nonce) qd e2eKeys ccLink <- connReqWithShortLink qUri cReq qUri' (shortLink rq) pure (ccLink, clientServiceId rq) Nothing -> do let qd = case cMode of SCMContact -> CQRContact Nothing; SCMInvitation -> CQRMessaging Nothing - (rq, qUri) <- createRcvQueue Nothing qd e2eKeys + (rq, qUri) <- createRcvQueue c nm userId connId srvWithAuth enableNtfs subMode Nothing qd e2eKeys cReq <- createConnReq qUri pure (CCLink cReq Nothing, clientServiceId rq) where - createRcvQueue :: Maybe C.CbNonce -> ClntQueueReqData -> C.KeyPairX25519 -> AM (RcvQueue, SMPQueueUri) - createRcvQueue nonce_ qd e2eKeys = do - AgentConfig {smpClientVRange = vr} <- asks config - ntfServer_ <- if enableNtfs then newQueueNtfServer else pure Nothing - (rq, qUri, tSess, sessId) <- newRcvQueue_ c nm userId connId srvWithAuth vr qd (isJust ntfServer_) subMode nonce_ e2eKeys `catchAllErrors` \e -> liftIO (print e) >> throwE e - atomically $ incSMPServerStat c userId srv connCreated - rq' <- withStore c $ \db -> updateNewConnRcv db connId rq subMode - lift . 
when (subMode == SMSubscribe) $ addNewQueueSubscription c rq' tSess sessId - mapM_ (newQueueNtfSubscription c rq') ntfServer_ - pure (rq', qUri) createConnReq :: SMPQueueUri -> AM (ConnectionRequestUri c) createConnReq qUri = do AgentConfig {smpAgentVRange, e2eEncryptVRange} <- asks config @@ -1107,12 +1172,9 @@ newRcvConnSrv c nm userId connId enableNtfs cMode userLinkData_ clientData pqIni qm = case cMode of SCMContact -> QMContact; SCMInvitation -> QMMessaging qUri = SMPQueueUri vr $ SMPQueueAddress srv sndId e2eDhKey (Just qm) connReq <- createConnReq qUri - let (linkKey, linkData) = SL.encodeSignLinkData sigKeys smpAgentVRange connReq userLinkData + let (linkKey, linkData) = SL.encodeSignLinkData sigKeys smpAgentVRange connReq Nothing userLinkData qd <- case cMode of - SCMContact -> do - let (linkId, k) = SL.contactShortLinkKdf linkKey - srvData <- liftError id $ SL.encryptLinkData g k linkData - pure $ CQRContact $ Just CQRData {linkKey, privSigKey, srvReq = (linkId, (sndId, srvData))} + SCMContact -> encryptContactLinkData g privSigKey linkKey sndId linkData SCMInvitation -> do let k = SL.invShortLinkKdf linkKey srvData <- liftError id $ SL.encryptLinkData g k linkData diff --git a/src/Simplex/Messaging/Agent/Protocol.hs b/src/Simplex/Messaging/Agent/Protocol.hs index d9aafe1ef..e988347ae 100644 --- a/src/Simplex/Messaging/Agent/Protocol.hs +++ b/src/Simplex/Messaging/Agent/Protocol.hs @@ -129,6 +129,7 @@ module Simplex.Messaging.Agent.Protocol ContactConnType (..), ShortLinkScheme (..), LinkKey (..), + PreparedLinkParams (..), StoredClientService (..), ClientService, ClientServiceId, @@ -1479,6 +1480,23 @@ newtype LinkKey = LinkKey ByteString -- sha3-256(fixed_data) instance ToField LinkKey where toField (LinkKey s) = toField $ Binary s +-- | Parameters for creating a connection with a prepared link. 
+data PreparedLinkParams = PreparedLinkParams + { -- | Correlation ID / determines sender ID + plpNonce :: C.CbNonce, + -- | Queue E2EE DH key pair + plpQueueE2EKeys :: C.KeyPairX25519, + -- | For encrypting link data + plpLinkKey :: LinkKey, + -- | Root signing key (for signing link data) + plpRootPrivKey :: C.PrivateKeyEd25519, + -- | smpEncode of FixedLinkData (includes linkEntityId) + plpSignedFixedData :: ByteString, + -- | Server with basic auth (not stored in link) + plpSrvWithAuth :: SMPServerWithAuth + } + deriving (Show) + instance ConnectionModeI c => ToField (ConnectionLink c) where toField = toField . Binary . strEncode instance (Typeable c, ConnectionModeI c) => FromField (ConnectionLink c) where fromField = blobFieldDecoder strDecode diff --git a/src/Simplex/Messaging/Crypto/ShortLink.hs b/src/Simplex/Messaging/Crypto/ShortLink.hs index ae9049889..013559fd1 100644 --- a/src/Simplex/Messaging/Crypto/ShortLink.hs +++ b/src/Simplex/Messaging/Crypto/ShortLink.hs @@ -13,7 +13,9 @@ module Simplex.Messaging.Crypto.ShortLink ( contactShortLinkKdf, invShortLinkKdf, encodeSignLinkData, + encodeSignFixedData, encodeSignUserData, + newOwnerAuth, encryptLinkData, encryptUserData, decryptLinkData, @@ -50,11 +52,16 @@ contactShortLinkKdf (LinkKey k) = invShortLinkKdf :: LinkKey -> C.SbKey invShortLinkKdf (LinkKey k) = C.unsafeSbKey $ C.hkdf "" k "SimpleXInvLink" 32 -encodeSignLinkData :: ConnectionModeI c => C.KeyPairEd25519 -> VersionRangeSMPA -> ConnectionRequestUri c -> UserConnLinkData c -> (LinkKey, (ByteString, ByteString)) -encodeSignLinkData (rootKey, pk) agentVRange linkConnReq userData = - let fd = smpEncode FixedLinkData {agentVRange, rootKey, linkConnReq, linkEntityId = Nothing} - md = smpEncode $ connLinkData agentVRange userData - in (LinkKey (C.sha3_256 fd), (encodeSign pk fd, encodeSign pk md)) +encodeSignLinkData :: forall c. 
ConnectionModeI c => C.KeyPairEd25519 -> VersionRangeSMPA -> ConnectionRequestUri c -> Maybe ByteString -> UserConnLinkData c -> (LinkKey, (ByteString, ByteString)) +encodeSignLinkData keys@(_, pk) agentVRange linkConnReq linkEntityId userData = + let (linkKey, fd) = encodeSignFixedData keys agentVRange linkConnReq linkEntityId + md = encodeSignUserData (sConnectionMode @c) pk agentVRange userData + in (linkKey, (fd, md)) + +encodeSignFixedData :: ConnectionModeI c => C.KeyPairEd25519 -> VersionRangeSMPA -> ConnectionRequestUri c -> Maybe ByteString -> (LinkKey, ByteString) +encodeSignFixedData (rootKey, pk) agentVRange linkConnReq linkEntityId = + let fd = smpEncode FixedLinkData {agentVRange, rootKey, linkConnReq, linkEntityId} + in (LinkKey (C.sha3_256 fd), encodeSign pk fd) encodeSignUserData :: ConnectionModeI c => SConnectionMode c -> C.PrivateKeyEd25519 -> VersionRangeSMPA -> UserConnLinkData c -> ByteString encodeSignUserData _ pk agentVRange userLinkData = @@ -68,6 +75,14 @@ connLinkData vr = \case encodeSign :: C.PrivateKeyEd25519 -> ByteString -> ByteString encodeSign pk s = smpEncode (C.sign' pk s) <> s +-- | Generate a new owner key pair and create OwnerAuth signed by the authorizing key. +-- ownerId is application-specific (e.g., MemberId in chat). 
+newOwnerAuth :: TVar ChaChaDRG -> OwnerId -> C.PrivateKeyEd25519 -> IO (C.PrivateKeyEd25519, OwnerAuth) +newOwnerAuth g ownerId signingKey = do + (ownerKey, ownerPrivKey) <- atomically $ C.generateKeyPair @'C.Ed25519 g + let authOwnerSig = C.sign' signingKey $ ownerId <> C.encodePubKey ownerKey + pure (ownerPrivKey, OwnerAuth {ownerId, ownerKey, authOwnerSig}) + encryptLinkData :: TVar ChaChaDRG -> C.SbKey -> (ByteString, ByteString) -> ExceptT AgentErrorType IO QueueLinkData encryptLinkData g k = bimapM (encrypt fixedDataPaddedLength) (encrypt userDataPaddedLength) where diff --git a/tests/AgentTests/FunctionalAPITests.hs b/tests/AgentTests/FunctionalAPITests.hs index 77cd9b6bd..b87ebcfad 100644 --- a/tests/AgentTests/FunctionalAPITests.hs +++ b/tests/AgentTests/FunctionalAPITests.hs @@ -383,6 +383,7 @@ functionalAPITests ps = do it "should connect via contact short link after restart" $ testContactShortLinkRestart ps it "should connect via added contact short link after restart" $ testAddContactShortLinkRestart ps it "should create and get short links with the old contact queues" $ testOldContactQueueShortLink ps + it "should connect via prepared connection link" $ testPrepareCreateConnectionLink ps describe "Message delivery" $ do describe "update connection agent version on received messages" $ do it "should increase if compatible, shouldn'ps decrease" $ @@ -1646,6 +1647,36 @@ replaceSubstringInFile filePath oldText newText = do let newContent = T.replace oldText newText content T.writeFile filePath newContent +testPrepareCreateConnectionLink :: HasCallStack => (ASrvTransport, AStoreType) -> IO () +testPrepareCreateConnectionLink ps = withSmpServer ps $ withAgentClients2 $ \a b -> do + let userData = UserLinkData "test user data" + userCtData = UserContactData {direct = True, owners = [], relays = [], userData} + userLinkData = UserContactLinkData userCtData + g <- C.newRandom + linkEntId <- atomically $ C.randomBytes 32 g + runRight $ do + ((_rootPubKey, 
_rootPrivKey), ccLink@(CCLink connReq (Just shortLink)), preparedParams) <- + A.prepareConnectionLink a 1 (Just linkEntId) True Nothing + liftIO $ strDecode (strEncode shortLink) `shouldBe` Right shortLink + _ <- A.createConnectionForLink a NRMInteractive 1 True ccLink preparedParams userLinkData CR.IKPQOn SMSubscribe + (FixedLinkData {linkConnReq = connReq', linkEntityId}, ContactLinkData _ userCtData') <- getConnShortLink b 1 shortLink + liftIO $ Just linkEntId `shouldBe` linkEntityId + Right connReqDecoded <- pure $ smpDecode (smpEncode connReq) + liftIO $ connReq' `shouldBe` connReqDecoded + liftIO $ userCtData' `shouldBe` userCtData + (bId, sndSecure) <- joinConnection b 1 True connReq' "bob's connInfo" SMSubscribe + liftIO $ sndSecure `shouldBe` False + ("", _, REQ invId _ "bob's connInfo") <- get a + aId <- A.prepareConnectionToAccept a 1 True invId PQSupportOn + (sndSecure', Nothing) <- acceptContact a 1 aId True invId "alice's connInfo" PQSupportOn SMSubscribe + liftIO $ sndSecure' `shouldBe` True + ("", _, CONF confId _ "alice's connInfo") <- get b + allowConnection b bId confId "bob's connInfo" + get a ##> ("", aId, INFO "bob's connInfo") + get a ##> ("", aId, CON) + get b ##> ("", bId, CON) + exchangeGreetings a aId b bId + testIncreaseConnAgentVersion :: HasCallStack => (ASrvTransport, AStoreType) -> IO () testIncreaseConnAgentVersion ps = do alice <- getSMPAgentClient' 1 agentCfg {smpAgentVRange = mkVersionRange 1 2} initAgentServers testDB diff --git a/tests/AgentTests/ShortLinkTests.hs b/tests/AgentTests/ShortLinkTests.hs index 97472bec1..edffaa3d9 100644 --- a/tests/AgentTests/ShortLinkTests.hs +++ b/tests/AgentTests/ShortLinkTests.hs @@ -44,7 +44,7 @@ testInvShortLink = do sigKeys <- atomically $ C.generateKeyPair @'C.Ed25519 g let userData = UserLinkData "some user data" userLinkData = UserInvLinkData userData - (linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange invConnRequest userLinkData + (linkKey, linkData) = 
SL.encodeSignLinkData sigKeys supportedSMPAgentVRange invConnRequest Nothing userLinkData k = SL.invShortLinkKdf linkKey Right srvData <- runExceptT $ SL.encryptLinkData g k linkData -- decrypt @@ -59,7 +59,7 @@ testInvShortLinkBadDataHash = do sigKeys <- atomically $ C.generateKeyPair @'C.Ed25519 g let userData = UserLinkData "some user data" userLinkData = UserInvLinkData userData - (_linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange invConnRequest userLinkData + (_linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange invConnRequest Nothing userLinkData -- different key linkKey <- LinkKey <$> atomically (C.randomBytes 32 g) let k = SL.invShortLinkKdf linkKey @@ -82,7 +82,7 @@ testContactShortLink = do let userData = UserLinkData "some user data" userCtData = UserContactData {direct = True, owners = [], relays = [], userData} userLinkData = UserContactLinkData userCtData - (linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest userLinkData + (linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest Nothing userLinkData (_linkId, k) = SL.contactShortLinkKdf linkKey Right srvData <- runExceptT $ SL.encryptLinkData g k linkData -- decrypt @@ -98,7 +98,7 @@ testUpdateContactShortLink = do let userData = UserLinkData "some user data" userCtData = UserContactData {direct = True, owners = [], relays = [], userData} userLinkData = UserContactLinkData userCtData - (linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest userLinkData + (linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest Nothing userLinkData (_linkId, k) = SL.contactShortLinkKdf linkKey Right (fd, _ud) <- runExceptT $ SL.encryptLinkData g k linkData -- encrypt updated user data @@ -119,7 +119,7 @@ testContactShortLinkBadDataHash = do sigKeys <- atomically $ C.generateKeyPair @'C.Ed25519 g let userData = 
UserLinkData "some user data" userLinkData = UserContactLinkData UserContactData {direct = True, owners = [], relays = [], userData} - (_linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest userLinkData + (_linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest Nothing userLinkData -- different key linkKey <- LinkKey <$> atomically (C.randomBytes 32 g) let (_linkId, k) = SL.contactShortLinkKdf linkKey @@ -135,7 +135,7 @@ testContactShortLinkBadSignature = do sigKeys <- atomically $ C.generateKeyPair @'C.Ed25519 g let userData = UserLinkData "some user data" userLinkData = UserContactLinkData UserContactData {direct = True, owners = [], relays = [], userData} - (linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest userLinkData + (linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest Nothing userLinkData (_linkId, k) = SL.contactShortLinkKdf linkKey Right (fd, _ud) <- runExceptT $ SL.encryptLinkData g k linkData -- encrypt updated user data @@ -167,7 +167,7 @@ encryptLink g = do sigKeys@(_, pk) <- atomically $ C.generateKeyPair @'C.Ed25519 g let userData = UserLinkData "some user data" userLinkData = UserContactLinkData UserContactData {direct = True, owners = [], relays = [], userData} - (linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest userLinkData + (linkKey, linkData) = SL.encodeSignLinkData sigKeys supportedSMPAgentVRange contactConnRequest Nothing userLinkData (_linkId, k) = SL.contactShortLinkKdf linkKey Right (fd, _ud) <- runExceptT $ SL.encryptLinkData g k linkData pure (pk, (fd, linkKey, k)) From 3c5ec8d9a185e3decd8a52912022c9ec5f2a7730 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 28 Jan 2026 21:54:41 +0000 Subject: [PATCH 05/14] agent: improve error handling (#1707) * agent: improve error handling * simplify * report critical error when subscriber 
crashes * fix test --- src/Simplex/Messaging/Agent.hs | 24 +++---- src/Simplex/Messaging/Agent/Protocol.hs | 6 +- src/Simplex/Messaging/Compression.hs | 14 ++-- src/Simplex/Messaging/Util.hs | 87 ++++++++++++++++++++----- tests/AgentTests/MigrationTests.hs | 2 +- tests/CoreTests/UtilTests.hs | 34 +++++++++- 6 files changed, 128 insertions(+), 39 deletions(-) diff --git a/src/Simplex/Messaging/Agent.hs b/src/Simplex/Messaging/Agent.hs index 982c3099a..3174e05ec 100644 --- a/src/Simplex/Messaging/Agent.hs +++ b/src/Simplex/Messaging/Agent.hs @@ -180,14 +180,15 @@ import Simplex.Messaging.Agent.Store import Simplex.Messaging.Agent.Store.AgentStore import Simplex.Messaging.Agent.Store.Common (DBStore) import qualified Simplex.Messaging.Agent.Store.DB as DB +import Simplex.Messaging.Agent.Store.Entity import Simplex.Messaging.Agent.Store.Interface (closeDBStore, execSQL, getCurrentMigrations) import Simplex.Messaging.Agent.Store.Shared (UpMigration (..), upMigration) import Simplex.Messaging.Client (NetworkRequestMode (..), SMPClientError, ServerTransmission (..), ServerTransmissionBatch, nonBlockingWriteTBQueue, temporaryClientError, unexpectedResponse) import qualified Simplex.Messaging.Crypto as C import Simplex.Messaging.Crypto.File (CryptoFile, CryptoFileArgs) import Simplex.Messaging.Crypto.Ratchet (PQEncryption, PQSupport (..), pattern PQEncOff, pattern PQEncOn, pattern PQSupportOff, pattern PQSupportOn) -import qualified Simplex.Messaging.Crypto.ShortLink as SL import qualified Simplex.Messaging.Crypto.Ratchet as CR +import qualified Simplex.Messaging.Crypto.ShortLink as SL import Simplex.Messaging.Encoding import Simplex.Messaging.Encoding.String import Simplex.Messaging.Notifications.Protocol (DeviceToken, NtfRegCode (NtfRegCode), NtfTknStatus (..), NtfTokenId, PNMessageData (..), pnMessagesP) @@ -217,7 +218,6 @@ import Simplex.Messaging.Protocol ) import qualified Simplex.Messaging.Protocol as SMP import Simplex.Messaging.ServiceScheme (ServiceScheme (..)) 
-import Simplex.Messaging.Agent.Store.Entity import qualified Simplex.Messaging.TMap as TM import Simplex.Messaging.Transport (SMPVersion) import Simplex.Messaging.Util @@ -833,8 +833,9 @@ newConn :: ConnectionModeI c => AgentClient -> NetworkRequestMode -> UserId -> B newConn c nm userId enableNtfs cMode userData_ clientData pqInitKeys subMode = do srv <- getSMPServer c userId connId <- newConnNoQueues c userId enableNtfs cMode (CR.connPQEncryption pqInitKeys) - (connId,) <$> newRcvConnSrv c nm userId connId enableNtfs cMode userData_ clientData pqInitKeys subMode srv - `catchE` \e -> withStore' c (`deleteConnRecord` connId) >> throwE e + (connId,) + <$> newRcvConnSrv c nm userId connId enableNtfs cMode userData_ clientData pqInitKeys subMode srv + `catchE` \e -> withStore' c (`deleteConnRecord` connId) >> throwE e setConnShortLink' :: AgentClient -> NetworkRequestMode -> ConnId -> SConnectionMode c -> UserLinkData -> Maybe CRClientData -> AM (ConnShortLink c) setConnShortLink' c nm connId cMode userData clientData = @@ -914,8 +915,7 @@ getConnShortLink' c nm userId = \case decryptData srv linkKey k (sndId, d) = do r@(cReq, clData) <- liftEither $ SL.decryptLinkData @c linkKey k d let (srv', sndId') = qAddress (connReqQueue cReq) - unless (srv `sameSrvHost` srv' && sndId == sndId') $ - throwE $ AGENT $ A_LINK "different address" + unless (srv `sameSrvHost` srv' && sndId == sndId') $ throwE $ AGENT $ A_LINK "different address" pure $ if srv' == srv then r else (updateConnReqServer srv cReq, clData) sameSrvHost ProtocolServer {host = h :| _} ProtocolServer {host = hs} = h `elem` hs updateConnReqServer :: SMPServer -> ConnectionRequestUri c -> ConnectionRequestUri c @@ -1004,7 +1004,7 @@ newRcvConnSrv c nm userId connId enableNtfs cMode userData_ clientData pqInitKey connReqWithShortLink :: SMPQueueUri -> ConnectionRequestUri c -> SMPQueueUri -> Maybe ShortLinkCreds -> AM (CreatedConnLink c) connReqWithShortLink qUri cReq qUri' shortLink = case shortLink of Just 
ShortLinkCreds {shortLinkId, shortLinkKey} - | qUri == qUri' -> pure $ case cReq of + | qUri == qUri' -> pure $ case cReq of CRContactUri _ -> CCLink cReq $ Just $ CSLContact SLSServer CCTContact srv shortLinkKey CRInvitationUri crData (CR.E2ERatchetParamsUri vr k1 k2 _) -> let cReq' = case pqInitKeys of @@ -1682,7 +1682,7 @@ enqueueMessageB c reqs = do storeSentMsg db cfg aMessageIds = \case Left e -> pure (aMessageIds, Left e) Right req@(csqs_, pqEnc_, msgFlags, mbr) -> case mbr of - VRValue i_ aMessage -> case i_ >>= (`IM.lookup` aMessageIds) of + VRValue i_ aMessage -> case i_ >>= (`IM.lookup` aMessageIds) of Just _ -> pure (aMessageIds, Left $ INTERNAL "enqueueMessageB: storeSentMsg duplicate saved message body") Nothing -> do (mbId_, r) <- case csqs_ of @@ -1724,7 +1724,6 @@ enqueueMessageB c reqs = do handleInternal :: E.SomeException -> IO (Either AgentErrorType b) handleInternal = pure . Left . INTERNAL . show - encodeAgentMsgStr :: AMessage -> InternalSndId -> PrevSndMsgHash -> ByteString encodeAgentMsgStr aMessage internalSndId prevMsgHash = do let privHeader = APrivHeader (unSndId internalSndId) prevMsgHash @@ -2536,10 +2535,13 @@ getNextSMPServer c userId = getNextServer c userId storageSrvs {-# INLINE getNextSMPServer #-} subscriber :: AgentClient -> AM' () -subscriber c@AgentClient {msgQ} = forever $ do +subscriber c@AgentClient {msgQ, subQ} = run $ forever $ do t <- atomically $ readTBQueue msgQ agentOperationBracket c AORcvNetwork waitUntilActive $ processSMPTransmissions c t + where + run a = a `catchOwn` \e -> notify $ CRITICAL True $ "Agent subscriber stopped: " <> show e + notify err = atomically $ writeTBQueue subQ ("", "", AEvt SAEConn $ ERR err) cleanupManager :: AgentClient -> AM' () cleanupManager c@AgentClient {subQ} = do @@ -2848,7 +2850,7 @@ processSMPTransmissions c@AgentClient {subQ} (tSess@(userId, srv, _), _v, sessId ackDel :: InternalId -> AM ACKd ackDel aId = enqueueCmd (ICAckDel rId srvMsgId aId) $> ACKd handleNotifyAck :: AM 
ACKd -> AM ACKd - handleNotifyAck m = m `catchAllErrors` \e -> notify (ERR e) >> ack + handleNotifyAck m = m `catchAllOwnErrors` \e -> notify (ERR e) >> ack SMP.END -> atomically (ifM (activeClientSession c tSess sessId) (removeSubscription c connId $> True) (pure False)) >>= notifyEnd diff --git a/src/Simplex/Messaging/Agent/Protocol.hs b/src/Simplex/Messaging/Agent/Protocol.hs index d4d302df7..df04d7c12 100644 --- a/src/Simplex/Messaging/Agent/Protocol.hs +++ b/src/Simplex/Messaging/Agent/Protocol.hs @@ -173,7 +173,7 @@ module Simplex.Messaging.Agent.Protocol where import Control.Applicative (optional, (<|>)) -import Control.Exception (BlockedIndefinitelyOnSTM (..), fromException) +import Control.Exception (BlockedIndefinitelyOnMVar (..), BlockedIndefinitelyOnSTM (..), fromException) import Data.Aeson (FromJSON (..), ToJSON (..), Value (..), (.:), (.:?)) import qualified Data.Aeson as J' import qualified Data.Aeson.Encoding as JE @@ -1870,7 +1870,9 @@ data AgentErrorType instance AnyError AgentErrorType where fromSomeException e = case fromException e of Just BlockedIndefinitelyOnSTM -> CRITICAL True "Thread blocked indefinitely in STM transaction" - _ -> INTERNAL $ show e + _ -> case fromException e of + Just BlockedIndefinitelyOnMVar -> CRITICAL True "Thread blocked indefinitely on MVar" + _ -> INTERNAL $ show e {-# INLINE fromSomeException #-} -- | SMP agent protocol command or response error. diff --git a/src/Simplex/Messaging/Compression.hs b/src/Simplex/Messaging/Compression.hs index 19d91a300..18efe1a0d 100644 --- a/src/Simplex/Messaging/Compression.hs +++ b/src/Simplex/Messaging/Compression.hs @@ -36,10 +36,12 @@ compress1 bs | B.length bs <= maxLengthPassthrough = Passthrough bs | otherwise = Compressed . 
Large $ Z1.compress compressionLevel bs -decompress1 :: Compressed -> Either String ByteString -decompress1 = \case +decompress1 :: Int -> Compressed -> Either String ByteString +decompress1 limit = \case Passthrough bs -> Right bs - Compressed (Large bs) -> case Z1.decompress bs of - Z1.Error e -> Left e - Z1.Skip -> Right mempty - Z1.Decompress bs' -> Right bs' + Compressed (Large bs) -> case Z1.decompressedSize bs of + Just sz | sz <= limit -> case Z1.decompress bs of + Z1.Error e -> Left e + Z1.Skip -> Right mempty + Z1.Decompress bs' -> Right bs' + _ -> Left $ "compressed size not specified or exceeds " <> show limit diff --git a/src/Simplex/Messaging/Util.hs b/src/Simplex/Messaging/Util.hs index f93119b3c..780fc0b1c 100644 --- a/src/Simplex/Messaging/Util.hs +++ b/src/Simplex/Messaging/Util.hs @@ -4,6 +4,7 @@ module Simplex.Messaging.Util where +import Control.Exception (AllocationLimitExceeded (..), AsyncException (..)) import qualified Control.Exception as E import Control.Monad import Control.Monad.Except @@ -21,9 +22,9 @@ import Data.Int (Int64) import Data.List (groupBy, sortOn) import Data.List.NonEmpty (NonEmpty (..)) import qualified Data.List.NonEmpty as L -import Data.Maybe (listToMaybe) import Data.Map.Strict (Map) import qualified Data.Map.Strict as M +import Data.Maybe (listToMaybe) import Data.Text (Text) import qualified Data.Text as T import Data.Text.Encoding (decodeUtf8With, encodeUtf8) @@ -93,7 +94,7 @@ anyM :: Monad m => [m Bool] -> m Bool anyM = foldM (\r a -> if r then pure r else (r ||) <$!> a) False {-# INLINE anyM #-} -infixl 1 $>>, $>>= +infixl 1 $>>, $>>= ($>>=) :: (Monad m, Monad f, Traversable f) => m (f a) -> (a -> m (f b)) -> m (f b) f $>>= g = f >>= fmap join . 
mapM g @@ -115,15 +116,19 @@ forME :: (Monad m, Traversable t) => t (Either e a) -> (a -> m (Either e b)) -> forME = flip mapME {-# INLINE forME #-} - -- | Monadic version of mapAccumL -- Copied from ghc-9.6.3 package: https://hackage.haskell.org/package/ghc-9.12.1/docs/GHC-Utils-Monad.html#v:mapAccumLM -- for backward compatibility with 8.10.7. -mapAccumLM :: (Monad m, Traversable t) - => (acc -> x -> m (acc, y)) -- ^ combining function - -> acc -- ^ initial state - -> t x -- ^ inputs - -> m (acc, t y) -- ^ final state, outputs +mapAccumLM :: + (Monad m, Traversable t) => + -- | combining function + (acc -> x -> m (acc, y)) -> + -- | initial state + acc -> + -- | inputs + t x -> + -- | final state, outputs + m (acc, t y) {-# INLINE [1] mapAccumLM #-} -- INLINE pragma. mapAccumLM is called in inner loops. Like 'map', -- we inline it so that we can take advantage of knowing 'f'. @@ -132,26 +137,31 @@ mapAccumLM :: (Monad m, Traversable t) mapAccumLM f s = fmap swap . flip runStateT s . traverse f' where f' = StateT . (fmap . fmap) swap . 
flip f + {-# RULES "mapAccumLM/List" mapAccumLM = mapAccumLM_List #-} {-# RULES "mapAccumLM/NonEmpty" mapAccumLM = mapAccumLM_NonEmpty #-} -mapAccumLM_List - :: Monad m - => (acc -> x -> m (acc, y)) - -> acc -> [x] -> m (acc, [y]) +mapAccumLM_List :: + Monad m => + (acc -> x -> m (acc, y)) -> + acc -> + [x] -> + m (acc, [y]) {-# INLINE mapAccumLM_List #-} mapAccumLM_List f = go where go s (x : xs) = do - (s1, x') <- f s x + (s1, x') <- f s x (s2, xs') <- go s1 xs - return (s2, x' : xs') + return (s2, x' : xs') go s [] = return (s, []) -mapAccumLM_NonEmpty - :: Monad m - => (acc -> x -> m (acc, y)) - -> acc -> NonEmpty x -> m (acc, NonEmpty y) +mapAccumLM_NonEmpty :: + Monad m => + (acc -> x -> m (acc, y)) -> + acc -> + NonEmpty x -> + m (acc, NonEmpty y) {-# INLINE mapAccumLM_NonEmpty #-} mapAccumLM_NonEmpty f s (x :| xs) = [(s2, x' :| xs') | (s1, x') <- f s x, (s2, xs') <- mapAccumLM_List f s1 xs] @@ -197,6 +207,47 @@ allFinally :: (AnyError e, MonadUnliftIO m) => ExceptT e m a -> ExceptT e m b -> allFinally action final = tryAllErrors action >>= \r -> final >> except r {-# INLINE allFinally #-} +isOwnException :: E.SomeException -> Bool +isOwnException e = case E.fromException e of + Just StackOverflow -> True + Just HeapOverflow -> True + _ -> case E.fromException e of + Just AllocationLimitExceeded -> True + _ -> False +{-# INLINE isOwnException #-} + +isAsyncCancellation :: E.SomeException -> Bool +isAsyncCancellation e = case E.fromException e of + Just (_ :: SomeAsyncException) -> not $ isOwnException e + Nothing -> False +{-# INLINE isAsyncCancellation #-} + +catchOwn' :: IO a -> (E.SomeException -> IO a) -> IO a +catchOwn' action handleInternal = action `E.catch` \e -> if isAsyncCancellation e then E.throwIO e else handleInternal e +{-# INLINE catchOwn' #-} + +catchOwn :: MonadUnliftIO m => m a -> (E.SomeException -> m a) -> m a +catchOwn action handleInternal = + withRunInIO $ \run -> + run action `E.catch` \e -> if isAsyncCancellation e then E.throwIO e 
else run (handleInternal e) +{-# INLINE catchOwn #-} + +tryAllOwnErrors :: (AnyError e, MonadUnliftIO m) => ExceptT e m a -> ExceptT e m (Either e a) +tryAllOwnErrors action = ExceptT $ Right <$> runExceptT action `catchOwn` (pure . Left . fromSomeException) +{-# INLINE tryAllOwnErrors #-} + +tryAllOwnErrors' :: (AnyError e, MonadUnliftIO m) => ExceptT e m a -> m (Either e a) +tryAllOwnErrors' action = runExceptT action `catchOwn` (pure . Left . fromSomeException) +{-# INLINE tryAllOwnErrors' #-} + +catchAllOwnErrors :: (AnyError e, MonadUnliftIO m) => ExceptT e m a -> (e -> ExceptT e m a) -> ExceptT e m a +catchAllOwnErrors action handler = tryAllOwnErrors action >>= either handler pure +{-# INLINE catchAllOwnErrors #-} + +catchAllOwnErrors' :: (AnyError e, MonadUnliftIO m) => ExceptT e m a -> (e -> m a) -> m a +catchAllOwnErrors' action handler = tryAllOwnErrors' action >>= either handler pure +{-# INLINE catchAllOwnErrors' #-} + eitherToMaybe :: Either a b -> Maybe b eitherToMaybe = either (const Nothing) Just {-# INLINE eitherToMaybe #-} diff --git a/tests/AgentTests/MigrationTests.hs b/tests/AgentTests/MigrationTests.hs index e4de45c7a..8245cfd51 100644 --- a/tests/AgentTests/MigrationTests.hs +++ b/tests/AgentTests/MigrationTests.hs @@ -212,7 +212,7 @@ createStore randSuffix migrations confirmMigrations = do poolSize = 1, createSchema = True } - createDBStore dbOpts migrations confirmMigrations + createDBStore dbOpts migrations (MigrationConfig confirmMigrations Nothing) cleanup :: Word32 -> IO () cleanup randSuffix = dropSchema testDBConnectInfo (testSchema randSuffix) diff --git a/tests/CoreTests/UtilTests.hs b/tests/CoreTests/UtilTests.hs index 946902358..580f4e9b0 100644 --- a/tests/CoreTests/UtilTests.hs +++ b/tests/CoreTests/UtilTests.hs @@ -3,7 +3,7 @@ module CoreTests.UtilTests where import AgentTests.FunctionalAPITests () -import Control.Exception (Exception, SomeException, throwIO) +import Control.Exception (AllocationLimitExceeded (..), 
AsyncException (..), Exception, SomeException, throwIO) import Control.Monad.Except import Control.Monad.IO.Class import Data.IORef @@ -71,11 +71,43 @@ utilTests = do runExceptT (throwTestException `allFinally` final) `shouldReturn` Left (TestException "user error (error)") it "and should not throw if there are no exceptions" $ withFinal $ \final -> runExceptT (noErrors `allFinally` final) `shouldReturn` Right "no errors" + describe "tryAllOwnErrors" $ do + it "should return ExceptT error as Left" $ + runExceptT (tryAllOwnErrors throwTestError) `shouldReturn` Right (Left (TestError "error")) + it "should return SomeException as Left" $ + runExceptT (tryAllOwnErrors throwTestException) `shouldReturn` Right (Left (TestException "user error (error)")) + it "should catch StackOverflow" $ + runExceptT (tryAllOwnErrors $ throwAsync StackOverflow) `shouldReturn` Right (Left (TestException "stack overflow")) + it "should catch HeapOverflow" $ + runExceptT (tryAllOwnErrors $ throwAsync HeapOverflow) `shouldReturn` Right (Left (TestException "heap overflow")) + it "should catch AllocationLimitExceeded" $ + runExceptT (tryAllOwnErrors $ throwAsync AllocationLimitExceeded) `shouldReturn` Right (Left (TestException "allocation limit exceeded")) + it "should rethrow ThreadKilled" $ + runExceptT (tryAllOwnErrors $ throwAsync ThreadKilled) `shouldThrow` (\e -> e == ThreadKilled) + it "should return no errors as Right" $ + runExceptT (tryAllOwnErrors noErrors) `shouldReturn` Right (Right "no errors") + describe "catchAllOwnErrors" $ do + it "should catch ExceptT error" $ + runExceptT (throwTestError `catchAllOwnErrors` handleCatch) `shouldReturn` Right "caught TestError \"error\"" + it "should catch SomeException" $ + runExceptT (throwTestException `catchAllOwnErrors` handleCatch) `shouldReturn` Right "caught TestException \"user error (error)\"" + it "should catch StackOverflow" $ + runExceptT (throwAsync StackOverflow `catchAllOwnErrors` handleCatch) `shouldReturn` Right "caught 
TestException \"stack overflow\"" + it "should catch HeapOverflow" $ + runExceptT (throwAsync HeapOverflow `catchAllOwnErrors` handleCatch) `shouldReturn` Right "caught TestException \"heap overflow\"" + it "should catch AllocationLimitExceeded" $ + runExceptT (throwAsync AllocationLimitExceeded `catchAllOwnErrors` handleCatch) `shouldReturn` Right "caught TestException \"allocation limit exceeded\"" + it "should rethrow ThreadKilled" $ + runExceptT (throwAsync ThreadKilled `catchAllOwnErrors` handleCatch) `shouldThrow` (\e -> e == ThreadKilled) + it "should not throw if there are no errors" $ + runExceptT (noErrors `catchAllOwnErrors` throwError) `shouldReturn` Right "no errors" where throwTestError :: ExceptT TestError IO String throwTestError = throwError $ TestError "error" throwTestException :: ExceptT TestError IO String throwTestException = liftIO $ throwIO $ userError "error" + throwAsync :: Exception e => e -> ExceptT TestError IO String + throwAsync = liftIO . throwIO noErrors :: ExceptT TestError IO String noErrors = pure "no errors" handleCatch :: TestError -> ExceptT TestError IO String From 9346b85c3f34f8b12fefef4631ba21087cf5f0e3 Mon Sep 17 00:00:00 2001 From: Evgeny Poberezkin Date: Wed, 28 Jan 2026 21:58:32 +0000 Subject: [PATCH 06/14] 6.4.8.0 --- simplexmq.cabal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplexmq.cabal b/simplexmq.cabal index ebbf156db..73339aee1 100644 --- a/simplexmq.cabal +++ b/simplexmq.cabal @@ -1,7 +1,7 @@ cabal-version: 1.12 name: simplexmq -version: 6.4.7.0 +version: 6.4.8.0 synopsis: SimpleXMQ message broker description: This package includes <./docs/Simplex-Messaging-Server.html server>, <./docs/Simplex-Messaging-Client.html client> and From 8fdc0703bc9b89dae8b2fe6820b705580a669281 Mon Sep 17 00:00:00 2001 From: Evgeny Poberezkin Date: Wed, 28 Jan 2026 23:12:19 +0000 Subject: [PATCH 07/14] 6.5.0.8 --- simplexmq.cabal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplexmq.cabal 
b/simplexmq.cabal index 1a469ca89..2ff5de800 100644 --- a/simplexmq.cabal +++ b/simplexmq.cabal @@ -1,7 +1,7 @@ cabal-version: 1.12 name: simplexmq -version: 6.5.0.7 +version: 6.5.0.8 synopsis: SimpleXMQ message broker description: This package includes <./docs/Simplex-Messaging-Server.html server>, <./docs/Simplex-Messaging-Client.html client> and From 3dba9c2b1e9b0860ea465d6c36068f4c569f74be Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sun, 22 Feb 2026 18:58:09 +0000 Subject: [PATCH 08/14] rfc: SimpleX Network Consortium governance (#1718) * rfc: SimpleX Network Consortium governance * update * add draft label --------- Co-authored-by: Evgeny @ SimpleX Chat <259188159+evgeny-simplex@users.noreply.github.com> --- rfcs/README.md | 150 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 rfcs/README.md diff --git a/rfcs/README.md b/rfcs/README.md new file mode 100644 index 000000000..07f3c2320 --- /dev/null +++ b/rfcs/README.md @@ -0,0 +1,150 @@ +# SimpleX Network Protocol Specifications — Governance and Evolution (draft) + +## Why this document exists + +SimpleX Network protocol specifications must evolve as the network grows. This document defines how specifications change, who governs those changes, and how the history of changes is preserved. + +### Lessons from the web: why ratcheted governance matters + +The web's governance history demonstrates both the necessity of consortium governance and the dangers of getting the transition wrong. + +[Tim Berners-Lee invented the web in 1991](https://home.cern/science/computing/birth-web/short-history-web). [Netscape took over in 1994](https://en.wikipedia.org/wiki/Netscape_Navigator), driving rapid innovation as a single company — SSL, cookies, JavaScript, and the features that made the web commercially viable. 
In 1994, [W3C was founded](https://www.w3.org/about/history/) as a consortium hosted across multiple independent institutions (MIT in the US, INRIA/ERCIM in Europe, Keio University in Japan, later Beihang University in China) to govern web standards. + +The transition from company-led innovation to consortium governance was abrupt rather than gradual. Netscape's decline (accelerated by the [browser wars](https://en.wikipedia.org/wiki/Browser_wars) and [AOL acquisition](https://cybercultural.com/p/1999-the-fall-of-netscape-and-the-rise-of-mozilla/)) transferred control to a standards body that prioritized process over progress. The result was [a lost decade of web stagnation](https://eev.ee/blog/2020/02/01/old-css-new-css/): CSS 2.0 shipped in 1998; CSS 2.1 didn't reach Candidate Recommendation until 2004 and wasn't finalized until 2011. W3C pursued XHTML and rejected proposed enhancements to HTML, until frustrated engineers from Apple, Mozilla, and Opera formed [WHATWG in 2004](https://en.wikipedia.org/wiki/WHATWG) to build HTML5 outside W3C's process. The abrupt governance transition, without a mechanism to balance community guarantees against the imperative to continue evolving the product at pace, dramatically slowed web evolution at the time it was needed most. + +Then in 2023, [W3C restructured from a multi-host consortium into a single 501(c)(3) nonprofit entity](https://www.w3.org/press-releases/2023/w3c-le-launched/) — W3C Inc, incorporated in the US. The previous structure distributed governance across four independent university hosts in different countries, making capture by any single entity structurally difficult. The new structure concentrates governance in a single legal entity with a board of directors. While presented as modernization, this effectively ended the decentralized consortium model that had protected web standards for nearly three decades. 
+ +### The governance double ratchet + +SimpleX follows the same Netscape-to-consortium evolution path, but with two ratchets designed to prevent both failure modes — stagnation from premature governance transfer, and capture from governance centralization: + +- **Licensing ratchet**: all contributed IP is licensed under AGPLv3 (software) and Creative Commons (documentation), perpetually and irrevocably. What is licensed cannot be unlicensed. If a Party transfers Licensed IP, the licensing obligations transfer with it. + +- **Governance ratchet**: power can be given to the SimpleX Network Consortium, but never taken back. The Consortium Agreement requires unanimous approval of all Governing Parties for changes to the agreement itself, IP policy, and admission or removal of parties. + +The ratcheted transition is not just a clever device — it is a historically proven imperative. It allows the company to continue driving rapid product innovation (as Netscape did for the web) while incrementally and irreversibly transferring governance to the consortium, without the abrupt handover that stalled web evolution or the centralization that later undermined it. + +### Specification governance via the Consortium Agreement + +The SimpleX Network Consortium Agreement (being deployed in 2026) establishes two levels of intellectual property governance: **Licensed IP** (all contributed protocol specifications, software, and documentation, licensed perpetually and irrevocably) and **Core IP** (the subset essential to the network, requiring consortium governance to change). The distinction between these levels and how they map to the RFC process is described in [Standard vs Core specifications](#standard-vs-core-specifications) below. + +## Specification change process: protocol specifications and RFCs + +Protocol knowledge lives in two places: + +### `protocol/` — Consolidated specifications + +Each file is a complete, self-contained description of a protocol as it exists today. 
Like consolidated legislation in the UK legal system: the full current law in one document, not a patchwork of amendments. + +Consolidated specifications are maintained on every code change that affects protocol behavior. With LLMs, the cost of maintaining consolidated documents collapses — reworking prose to incorporate a new RFC is now inexpensive relative to the value of a single authoritative document per protocol. + +Implementers read `protocol/`. They should never need to reconstruct current behavior from a base spec plus a chain of RFCs. + +### `rfcs/` — Protocol evolution commits + +Each RFC describes a single change to a protocol specification. RFCs are the atomic unit of protocol evolution — analogous to commits in version control, or amending acts in legislation. + +An RFC is not part of the protocol specification. It becomes part of the specification only when embedded into the consolidated `protocol/` document. The RFC itself remains as a permanent historical record of what changed, when, and why. + +## RFC lifecycle + +``` + ┌——> done/ ——> standard/ +draft (root) ——>──┤ + └——> rejected/ +``` + +### Draft — `rfcs/*.md` + +A proposal for a protocol change. Not yet implemented. Active proposals live in the `rfcs/` root directory. + +Named by proposal date: `YYYY-MM-DD-topic.md`. + +A draft may be rejected if the proposal is considered but not accepted for implementation. + +### Done — `rfcs/done/` + +Implemented in code. The protocol change described by this RFC exists in the codebase, but the RFC has not yet been verified against the actual implementation (code may have diverged from the proposal during implementation). + +### Standard — `rfcs/standard/` + +Verified against the actual implementation and synchronized with code. The RFC accurately describes what was implemented. This is a permanent historical record — standard RFCs are never modified or removed. + +On promotion to standard, the RFC is: +1. 
Renamed from proposal date to standardization date: `YYYY-MM-DD-topic.md` (new date, same topic slug) +2. Updated with a document history header capturing the full lifecycle +3. Embedded into the corresponding `protocol/` consolidated specification + +The `protocol/` document references embedded RFCs by name (e.g., "Private message routing added by RFC 2023-09-12-second-relays, standardized 2026-XX-XX"), similar to UK legislation citing the amending act for each clause. + +Protocol version numbers make it clear which RFCs are included in which protocol revision — no separate tracking is needed. + +### Rejected — `rfcs/rejected/` + +Draft proposals that were considered but not accepted for implementation. Only drafts move to rejected — once an RFC is implemented (done/), it proceeds to standard/ after verification. Preserved for historical record of design decisions. + +### Document history header + +Every RFC in `standard/` carries a history header: + +``` +--- +Proposed: YYYY-MM-DD +Implemented: YYYY-MM-DD +Standardized: YYYY-MM-DD +Protocol: simplex-messaging v9 (or whichever protocol this amends) +--- +``` + +## Governance + +SimpleX Network follows the Netscape-to-W3C evolution path, with ratcheted rather than abrupt transitions: + +| Phase | Period | Governance | Development process | +|-------|--------|-----------|-------------------| +| Protocol invented | 2020 | Two people | Prototype developed | +| SimpleX Chat Ltd | 2022 | One company | Product-first: code leads, specs follow | +| SimpleX Network Consortium | 2026 | Agreement of SimpleX Chat Ltd and non-profit entities | Product-first for standard; standards-first for core | +| Decentralized governance | Future | TBD (DAO research ongoing) | Standards-first | + +### Current: product-first development + +SimpleX protocols currently follow a product-first development process: requirements drive code, code drives specification. 
RFCs are written as design proposals before implementation, but implementation details are figured out in code. Consolidated protocol specifications in `protocol/` are then amended to match the implementation. + +This process is governed by SimpleX Chat Ltd as the IP Holding Party under the Consortium Agreement. + +Any Specification Author (as defined in the Consortium Agreement) may propose RFCs. Acceptance and standardization decisions are made by SimpleX Chat Ltd during the current product-first phase. + +### Standard vs Core specifications + +The distinction between standard and core maps directly to the two levels of IP governance in the Consortium Agreement, and reflects the difference between product-first and standards-first development: + +**Standard** — Licensed IP, not yet under consortium governance. Governed by the company. + +All contributed protocol specifications are Licensed IP under the Consortium Agreement. Standard specifications follow product-first development: the company can evolve them with product needs, and they must be maintained on every code change that affects protocol behavior. + +Standard specifications live in `rfcs/standard/` and `protocol/`. + +**Core** — Governed IP, governed by the consortium. + +A subset of standard specifications will be designated as Core IP under the Consortium Agreement. Core specifications will follow standards-first development: specification changes must be agreed via Governing Decision before code changes. + +This is a legally binding commitment. Once Licensed IP is included in Core IP, the company that owns the code cannot unilaterally change it — even though they own the code, the Consortium Agreement requires a Governing Decision for any change to Core IP. This protects the fundamental properties of the network (privacy, security, decentralization) from unilateral modification by any single party. 
+ +The designation of specific specifications as Core IP is itself a Governing Decision requiring unanimous approval. The transition will happen incrementally as protocols stabilize — the governance ratchet ensures that each designation is irreversible. + +The exact mechanism for distinguishing core from standard within the RFC and protocol folder structure is TBD — it will be decided as the first protocols are designated as Core IP. + +### Future: standards-first development + +As more protocols are designated as Core IP, development naturally transitions to a standards-first process for a growing portion of the protocol suite. The governance ratchet ensures this transition is gradual and irreversible — each protocol that becomes core gains the protection of consortium governance permanently, while remaining standard protocols continue to evolve at product pace. + +## Current state + +| Location | Contents | Count | +|----------|----------|-------| +| `protocol/` | Consolidated specs (SMP v9, Agent v5, XFTP v2, XRCP v1, Push v2, PQDR v1) | 6 specs + overview | +| `rfcs/` root | Active draft proposals | 19 | +| `rfcs/done/` | Implemented, not yet verified | 25 | +| `rfcs/standard/` | Verified against implementation | (to be populated) | +| `rfcs/rejected/` | Draft proposals not accepted | 7 | From 483ac674fb0625defa4fb78e8164de812e65185b Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 24 Feb 2026 16:47:00 +0000 Subject: [PATCH 09/14] agent: fix possible deadlocks of queue overloading when processing messages (#1713) * agent: fix possibly deadlocks of queue overloading when processing messages * simplify * refactor * refactor * refactor 2 --------- Co-authored-by: Evgeny @ SimpleX Chat <259188159+evgeny-simplex@users.noreply.github.com> --- rfcs/2026-02-17-fix-subq-deadlock.md | 415 +++++++++++++++++++++++++++ src/Simplex/Messaging/Agent.hs | 50 +++- 2 files changed, 451 insertions(+), 14 deletions(-) create mode 100644 rfcs/2026-02-17-fix-subq-deadlock.md diff 
--git a/rfcs/2026-02-17-fix-subq-deadlock.md b/rfcs/2026-02-17-fix-subq-deadlock.md new file mode 100644 index 000000000..9c38e6721 --- /dev/null +++ b/rfcs/2026-02-17-fix-subq-deadlock.md @@ -0,0 +1,415 @@ +# Fix subQ deadlock: blocking writeTBQueue inside connLock + +## Problem + +Users report that message reception silently and permanently stops across all connections, with no error alerts. The app appears functional but no messages arrive. Recovery requires restart. + +Root cause: a deadlock between worker threads holding `connLock` and the `agentSubscriber` (sole `subQ` reader). + +### The deadlock mechanism + +`subQ` (`TBQueue ATransmission`, capacity 4096 on mobile / 1024 on desktop) is the single pipeline between the agent layer and the chat layer. The `agentSubscriber` thread (`Commands.hs:4373`) is its **sole reader**. + +Three code sites hold `connLock` and call blocking `writeTBQueue subQ` without a fullness check. When `subQ` is full, these block while holding the lock. If `agentSubscriber` simultaneously needs the same `connLock` (via `sendMessagesB_` → `withConnLocks`), it blocks too — creating a circular wait: + +- **Worker**: holds `connLock(X)`, waits for `subQ` space (needs `agentSubscriber` to read) +- **agentSubscriber**: sole `subQ` reader, waits for `connLock(X)` (needs worker to release) +- **Result**: permanent silent deadlock — no exception, no alert, all connections blocked + +### Confirmed deadlock scenarios + +**Scenario 1**: Delivery worker during queue rotation test + +``` +Delivery worker: agentSubscriber (sole subQ reader): + withConnLock(X) [2187] readTBQueue subQ → processAgentMessageConn + ...DB operations... 
→ sendPendingGroupMessages (on CON/SENT/QCONT) + notify → writeTBQueue subQ [2238] → batchSendConnMessages → deliverMessagesB + [BLOCKED — subQ full] → withAgent sendMessagesB [synchronous] + → sendMessagesB_ → withConnLocks({..X..}) [1708] + [BLOCKED — connLock(X) held] +``` + +**Scenario 2**: Async command worker during message ACK with notification + +``` +Async cmd worker: agentSubscriber (sole subQ reader): + tryWithLock "ICAck" [1930→1824] readTBQueue subQ → processAgentMessageConn + → withConnLock(X) → sendPendingGroupMessages + → ack → ackQueueMessage [1899] → sendMessagesB_ → withConnLocks({..X..}) + → sendMsgNtf [2381] [BLOCKED — connLock(X) held] + → writeTBQueue subQ [2386] + [BLOCKED — subQ full] +``` + +**Scenario 3**: Synchronous `ackMessage'` API (same mechanism as Scenario 2 but from external API caller) + +``` +ackMessage' caller: agentSubscriber (sole subQ reader): + withConnLock(X) [2254] → sendMessagesB_ → withConnLocks({..X..}) + → ack → ackQueueMessage [2267] [BLOCKED — connLock(X) held] + → sendMsgNtf [2381] + → writeTBQueue subQ [2386] + [BLOCKED — subQ full] +``` + +### ConnId overlap verified + +No guard prevents a connection undergoing queue rotation (AM_QTEST_) or ACK processing from being included in `sendMessagesB_`'s batch. During these operations, the connection has `connStatus == ConnReady`, passing all filters in `memberSendAction`. + +### Cascade amplification + +Once any single deadlock triggers, `subQ` never drains. ALL other threads that attempt `writeTBQueue subQ` block progressively — their locks are held forever too. The entire threading system freezes within seconds. 
+ +### Affected code sites (blocking `writeTBQueue subQ` inside `connLock`) + +| Site | File | Lock line | Write line | Events written | +|------|------|-----------|------------|----------------| +| `runSmpQueueMsgDelivery::notify` | Agent.hs | 2187 | 2238 | SWITCH SPCompleted, ERR INTERNAL | +| `runSmpQueueMsgDelivery::internalErr/notifyDel` | Agent.hs | 2187 | 2238 (via notifyDel→notify) | ERR INTERNAL + delMsg | +| `ackQueueMessage::sendMsgNtf` | Agent.hs | 2254 or 1930 | 2386 | MSGNTF | + +### Safe patterns that already exist in the codebase + +1. **`isFullTBQueue` + pending TVar** (used at `runCommandProcessing` lines 1782-1784/1937, and `runProcessSMP` lines 3027-3029/3216): + ```haskell + -- Before processing (e.g. line 1782): + pending <- newTVarIO [] + -- During processing — safe notify (e.g. line 1937): + notify cmd = + let t = (corrId, connId, AEvt (sAEntity @e) cmd) + in atomically $ ifM (isFullTBQueue subQ) (modifyTVar' pendingCmds (t :)) (writeTBQueue subQ t) + -- After processing — flush (e.g. line 1784): + mapM_ (atomically . writeTBQueue subQ) . reverse =<< readTVarIO pending + ``` + +2. **`nonBlockingWriteTBQueue`** (used at Client.hs:789, NtfSubSupervisor.hs:507): + ```haskell + nonBlockingWriteTBQueue q x = do + sent <- atomically $ tryWriteTBQueue q x + unless sent $ void $ forkIO $ atomically $ writeTBQueue q x + ``` + Note: `nonBlockingWriteTBQueue` does NOT preserve ordering — the spawned background thread may complete out of order relative to subsequent direct writes from the same calling thread. + +### Exhaustive proof: no other deadlock scenarios exist + +All 15 `withConnLock` sites in Agent.hs were analyzed. Only 3 write to `subQ`: + +| withConnLock site | Writes subQ? | Safe? 
| +|-------------------|-------------|-------| +| switchConnectionAsync' (899) | No | ✓ | +| setConnShortLinkAsync' (995) | No | ✓ | +| setConnShortLink' (1031) | No | ✓ | +| deleteConnShortLink' (1075) | No | ✓ | +| allowConnection' (1407) | No | ✓ | +| acceptContact' (1417) | No | ✓ | +| sendMessagesB_ (1708, `withConnLocks`) | No | ✓ | +| tryWithLock/runSmpCommand (1930) | Yes (1937) | ✓ — `isFullTBQueue` check | +| tryMoveableWithLock/runSmpCommand (1931) | Yes (1937) | ✓ — `isFullTBQueue` check | +| **runSmpQueueMsgDelivery AM_QTEST_ (2187)** | **Yes (2238)** | **✗ — DEADLOCK** | +| **ackMessage' (2254)** | **Yes (2386)** | **✗ — DEADLOCK** | +| switchConnection' (2298) | No | ✓ | +| abortConnectionSwitch' (2328) | No | ✓ | +| synchronizeRatchet' (2351) | No | ✓ | +| suspendConnection' (2390) | No | ✓ | +| **processSMP (3037)** | Yes (3216) | ✓ — `isFullTBQueue` check | + +Note: `processSMP` (line 3037) holds `connLock` and its local `notify` (line 3216) writes to `subQ`, but it uses the safe `isFullTBQueue` pattern. Its `ack` (line 3196) uses `enqueueCmd` (DB-only), NOT `ackQueueMessage`. The actual `ackQueueMessage` runs later from the async command worker via ICAck/ICAckDel. + +Other lock pairs checked — no circular dependencies: +- `connLock × DB MVar`: DB never acquires connLock +- `entityLock × connLock`: consistent ordering (entity first in chat, conn in agent) +- `connLock(X) × connLock(Y)`: single agentSubscriber thread, one `withConnLocks` at a time + +--- + +## Deadlock call graph: agentSubscriber → connLock + +All deadlock paths require `agentSubscriber` to synchronously acquire `connLock`. Exhaustive analysis shows that **every such path converges on a single agent function**: `sendMessagesB_` → `withConnLocks` (Agent.hs:1708). No other agent API function called synchronously from the agentSubscriber acquires connLock. + +Verified (FACT): `ackMessageAsync` → `enqueueCommand` only (no connLock). `toggleConnectionNtfs` → no lock. 
`deleteConnectionAsync` → `deleteLock` not `connLock`. `joinConnectionAsync` → `withInvLock` not `connLock`. + +Also verified (FACT): `Lock = TMVar Text` (Lock.hs:24) is **non-reentrant** — double acquisition on the same thread deadlocks. + +### All 22 trigger paths + +Every path goes through `deliverMessage`/`deliverMessages`/`deliverMessagesB` → `withAgent sendMessagesB` → `sendMessagesB_` → `withConnLocks`: + +| # | Trigger | Chat function | ConnIds locked | Risk | +|---|---------|--------------|----------------|------| +| 1 | Group CON (Invitee) | `introduceToAll` → broadcast XGrpMemNew | **ALL member connIds** | **HIGHEST** | +| 2 | Group MSG XGrpLinkAcpt | `introduceToRemaining` → broadcast | **ALL member connIds** | **HIGHEST** | +| 3 | Group CON (Invitee) | `sendIntroductions` → batch intros to new member | new member connId | Medium | +| 4 | Group CON (Invitee) | `sendHistory` → batch to new member | new member connId | Medium | +| 5 | Group CON | `sendPendingGroupMessages` | member connId | Medium | +| 6 | Group SENT | `sendPendingGroupMessages` | member connId | Medium | +| 7 | Group QCONT | `sendPendingGroupMessages` | member connId | Medium | +| 8 | Group CON (PendingReview) | `introduceToModerators` → to moderators | moderator connIds | Medium | +| 9 | Group CON (PreMember) | `sendXGrpMemCon` → to host | host connId | Low | +| 10 | Group CON (PreMember) | `probeMatchingMemberContact` → probes + hashes | member + N matching connIds | Medium | +| 11 | Direct CON | `probeMatchingMembers` → probes + hashes | contact + N matching connIds | Medium | +| 12 | Direct JOINED | `sendAutoReply` | contact connId | Low | +| 13 | Group JOINED | `sendGroupAutoReply` | member connId | Low | +| 14 | Group INV | `sendXGrpMemInv` → to host | host connId | Low | +| 15 | Group INV (legacy) | `sendGrpInvitation` → to contact | contact connId | Low | +| 16 | Group MSG XGrpMemInv | `xGrpMemInv` → `sendGroupMemberMessage` | re-member connId | Low | +| 17 | Group MSG XGrpMemDel 
| `forwardToMember` | deleted member connId | Low | +| 18 | Group MSG XGrpLinkMem | `probeMatchingMemberContact` | member + N matching connIds | Medium | +| 19 | Group MSG (dup relay) | `saveGroupRcvMsg` error → `sendDirectMemberMessage` | forwarder connId | Low | +| 20 | SFDONE | `sendFileDescriptions` → to recipients | recipient connIds | Medium | +| 21 | Group MSG XGrpLinkAcpt | `sendHistory` → to accepted member | accepted member connId | Medium | +| 22 | Direct MSG (autoAccept) | `autoAcceptFile` → inline accept reply | contact connId | Low (test-only config) | + +### Key observations + +1. **Single bottleneck**: All 22 paths converge on `sendMessagesB_` → `withConnLocks` (Agent.hs:1708). The deadlock is between this lock acquisition and any worker thread holding `connLock` + blocking on `writeTBQueue subQ`. + +2. **Highest-risk paths** (#1, #2): Broadcasting to ALL group members in `introduceToAll` / `introduceToRemaining` acquires `withConnLocks` on ALL member connIds in a single batch. For large groups, this holds the agentSubscriber thread for a long time, during which subQ fills, which causes worker threads holding connLock on any of those connIds to deadlock. + +3. **Medium-risk paths** (#5-7): `sendPendingGroupMessages` fires on every CON/SENT/QCONT. These are frequent and lock the member's connId, which is the SAME connId that a delivery worker or ACK worker may hold while writing to subQ. + +--- + +## Analysis: `withConnLocks` in `sendMessagesB_` + +### FACT: the lock protects ratchet encryption state + +`sendMessagesB_` (Agent.hs:1708-1713) acquires `withConnLocks` and executes: + +1. **`getConn_`** — reads connection metadata, send queues from DB +2. **`setConnPQSupport`** — updates PQ encryption flag per connection +3. 
**`enqueueMessagesB`** → `enqueueMessageB` → `storeSentMsg_` which calls: + - **`updateSndIds`** (AgentStore.hs:899) — increments `internalSndId` (sequential send counter) + - **`agentRatchetEncryptHeader`** (Agent.hs:3698) — reads current ratchet via `getRatchetForUpdate`, encrypts message header via `rcEncryptHeader`, writes advanced ratchet state via `updateRatchet` + - **`createSndMsg`** + **`createSndMsgDelivery`** — inserts message and delivery records + +All operations run within `unsafeWithStore` → `withTransaction` (single DB transaction per batch). + +### FACT: the lock CANNOT be removed + +Without `withConnLocks`, concurrent `sendMessagesB_` calls targeting the same connection would: +- Read the same ratchet state, both encrypt, one overwrite the other → **ratchet desync** (unrecoverable) +- Get duplicate `internalSndId` values → **message ID collision** +- Race on `setConnPQSupport` → **PQ state inconsistency** + +The lock serializes ALL operations on the connection's encryption state. Removing it would introduce data corruption. + +Note: `sendMessage` (singular, line 530) uses the same `sendMessagesB_` function — there is no lock-free send path. + +### Eliminated strategies + +- **Strategy C (remove lock)**: The lock protects ratchet encryption. Removing it causes unrecoverable ratchet desync. Eliminated. +- **Strategy A (async dispatch)**: All 22 chat-layer callers use `deliverMessagesB` return values (delivery IDs, PQ state) synchronously. `forkIO` loses results. Eliminated. +- **Strategy W (isFullTBQueue + pending TVar)**: The existing pattern (lines 1937, 3216) buffers events in a local TVar and flushes after lock release. Between lock release and flush, another thread can acquire the same connLock and write events to subQ — reordering events within the same connection. This trades a visible deadlock for invisible ordering bugs. Eliminated. 
+- **Strategy O (per-connection overflow queues)**: Bounded overflow queues with "drop when full" were analyzed. Drop consequences are unacceptable at 5 of 6 write sites — CONF, INFO, CON cause permanent connection failure after ACK; INV loses connection invitations; SENT/MERR leave messages stuck forever. Unbounded overflow defeats backpressure. Eliminated. + +--- + +## Solution: move subQ writes outside connLock + +### Root cause + +The `writeTBQueue subQ` calls at the 3 deadlock sites are inside `connLock` by accident of code structure, not necessity. `connLock` protects ratchet encryption state and DB consistency. The `notify` calls write informational events to `subQ` — they do not modify any state that `connLock` protects. + +Moving the writes outside the lock scope eliminates the deadlock: blocking `writeTBQueue subQ` without holding `connLock` is safe — agentSubscriber is free to acquire the lock, process events, and drain `subQ`. + +### Why reordering doesn't matter at these sites + +The chat layer handlers for the 3 deadlock site events do NOT advance the ratchet: + +| Event | Chat handler | Calls sendMessagesB_? | +|-------|-------------|----------------------| +| SWITCH SPCompleted | Creates internal chat item, updates UI | **No** | +| ERR INTERNAL | Logs error to view | **No** | +| MSGNTF | `toView CEvtNtfMessage` → empty output | **No** | + +Events that DO trigger ratchet advances (CON, SENT, QCONT → `sendPendingGroupMessages` → `sendMessagesB_`) are all already written OUTSIDE `connLock` in the current code. + +Ratchet state lives in the DB, not in subQ events. agentSubscriber processes events sequentially regardless of arrival order. The SENT-before-SWITCH race already exists in the current code (new queue worker writes SENT outside connLock while old queue worker writes SWITCH inside connLock). + +### Fix: Site 1 — `runSmpQueueMsgDelivery` AM_QTEST_ (line 2187) + +Restructure `withConnLock` to return the event, write outside. 
+ +**Current code** (Agent.hs:2187-2214): +```haskell +AM_QTEST_ -> withConnLock c connId "runSmpQueueMsgDelivery AM_QTEST_" $ do + withStore' c $ \db -> setSndQueueStatus db sq Active + SomeConn _ conn <- withStore c (`getConn` connId) + case conn of + DuplexConnection cData' rqs sqs -> do + let addr = qAddress sq + case findQ addr sqs of + Just SndQueue {dbReplaceQueueId = Just replacedId, primary} -> + case removeQP (\sq' -> dbQId sq' == replacedId && not (sameQueue addr sq')) sqs of + Nothing -> internalErr msgId "sent QTEST: queue not found in connection" + Just (sq', sq'' : sqs') -> do + checkSQSwchStatus sq' SSSendingQTEST + atomically $ TM.delete (qAddress sq') $ smpDeliveryWorkers c + withStore' c $ \db -> do + when primary $ setSndQueuePrimary db connId sq + deletePendingMsgs db connId sq' + deleteConnSndQueue db connId sq' + let sqs'' = sq'' :| sqs' + conn' = DuplexConnection cData' rqs sqs'' + cStats <- connectionStats c conn' + notify $ SWITCH QDSnd SPCompleted cStats -- DEADLOCK + _ -> internalErr msgId "sent QTEST: ..." -- DEADLOCK (via notifyDel → notify) + _ -> internalErr msgId "sent QTEST: ..." -- DEADLOCK + _ -> internalErr msgId "QTEST sent not in duplex ..." 
-- DEADLOCK +``` + +**New code:** +```haskell +AM_QTEST_ -> do + evt_ <- withConnLock c connId "runSmpQueueMsgDelivery AM_QTEST_" $ do + withStore' c $ \db -> setSndQueueStatus db sq Active + SomeConn _ conn <- withStore c (`getConn` connId) + case conn of + DuplexConnection cData' rqs sqs -> do + let addr = qAddress sq + case findQ addr sqs of + Just SndQueue {dbReplaceQueueId = Just replacedId, primary} -> + case removeQP (\sq' -> dbQId sq' == replacedId && not (sameQueue addr sq')) sqs of + Nothing -> pure $ Left "sent QTEST: queue not found in connection" + Just (sq', sq'' : sqs') -> do + checkSQSwchStatus sq' SSSendingQTEST + atomically $ TM.delete (qAddress sq') $ smpDeliveryWorkers c + withStore' c $ \db -> do + when primary $ setSndQueuePrimary db connId sq + deletePendingMsgs db connId sq' + deleteConnSndQueue db connId sq' + let sqs'' = sq'' :| sqs' + conn' = DuplexConnection cData' rqs sqs'' + cStats <- connectionStats c conn' + pure $ Right $ SWITCH QDSnd SPCompleted cStats + _ -> pure $ Left "sent QTEST: there is only one queue in connection" + _ -> pure $ Left "sent QTEST: queue not in connection or not replacing another queue" + _ -> pure $ Left "QTEST sent not in duplex connection" + -- subQ write is now OUTSIDE connLock — blocking writeTBQueue is safe + case evt_ of + Right evt -> notify evt + Left err -> internalErr msgId err +``` + +All DB operations remain inside the lock. Only `notify`/`internalErr` (which write to subQ) move outside. `internalErr` calls `notifyDel` = `notify >> delMsg` — both `notify` (subQ write) and `delMsg` (`deleteSndMsgDelivery`, keyed on unique msgId) are safe outside the lock. The existing double-delete pattern (`delMsg` inside `internalErr` + `delMsgKeep` at line 2216) is preserved. + +### Fix: Sites 2 & 3 — `ackQueueMessage::sendMsgNtf` (line 2386) + +Change `ackQueueMessage` to return the MSGNTF event instead of writing it to subQ. Callers write to subQ after releasing connLock. 
+ +**Current code** (Agent.hs:2371-2386): +```haskell +ackQueueMessage :: AgentClient -> RcvQueue -> SMP.MsgId -> AM () +ackQueueMessage c rq@RcvQueue {userId, connId, server} srvMsgId = do + atomically $ incSMPServerStat c userId server ackAttempts + tryAllErrors (sendAck c rq srvMsgId) >>= \case + Right _ -> sendMsgNtf ackMsgs + Left (SMP _ SMP.NO_MSG) -> sendMsgNtf ackNoMsgErrs + Left e -> ... + where + sendMsgNtf stat = do + atomically $ incSMPServerStat c userId server stat + whenM (liftIO $ hasGetLock c rq) $ do + atomically $ releaseGetLock c rq + brokerTs_ <- eitherToMaybe <$> tryAllErrors (withStore c $ \db -> getRcvMsgBrokerTs db connId srvMsgId) + atomically $ writeTBQueue (subQ c) ("", connId, AEvt SAEConn $ MSGNTF srvMsgId brokerTs_) +``` + +**New code** — return `Maybe ATransmission` instead of writing: +```haskell +ackQueueMessage :: AgentClient -> RcvQueue -> SMP.MsgId -> AM (Maybe ATransmission) +ackQueueMessage c rq@RcvQueue {userId, connId, server} srvMsgId = do + atomically $ incSMPServerStat c userId server ackAttempts + tryAllErrors (sendAck c rq srvMsgId) >>= \case + Right _ -> sendMsgNtf ackMsgs + Left (SMP _ SMP.NO_MSG) -> sendMsgNtf ackNoMsgErrs + Left e -> ... 
>> pure Nothing + where + sendMsgNtf stat = do + atomically $ incSMPServerStat c userId server stat + ifM (liftIO $ hasGetLock c rq) + (do atomically $ releaseGetLock c rq + brokerTs_ <- eitherToMaybe <$> tryAllErrors (withStore c $ \db -> getRcvMsgBrokerTs db connId srvMsgId) + pure $ Just ("", connId, AEvt SAEConn $ MSGNTF srvMsgId brokerTs_)) + (pure Nothing) +``` + +**Caller 1: `ackMessage'`** (Agent.hs:2253-2267) — return event from `withConnLock`, write after: +```haskell +ackMessage' c connId msgId rcptInfo_ = do + t_ <- withConnLock c connId "ackMessage" $ do + SomeConn _ conn <- withStore c (`getConn` connId) + case conn of + DuplexConnection {} -> do + t_ <- ack + sendRcpt conn + del + pure t_ + RcvConnection {} -> do + t_ <- ack + del + pure t_ + SndConnection {} -> throwE $ CONN SIMPLEX "ackMessage" + ContactConnection {} -> throwE $ CMD PROHIBITED "ackMessage: ContactConnection" + NewConnection _ -> throwE $ CMD PROHIBITED "ackMessage: NewConnection" + -- subQ write is OUTSIDE connLock + case t_ of + Just t -> atomically $ writeTBQueue (subQ c) t + Nothing -> pure () +``` + +**Caller 2: `ICAck` / `ICAckDel`** (Agent.hs:1823-1824) — inline `tryWithLock` as `tryCommand` + `withConnLock`, write subQ between the two scopes: + +`tryWithLock name = tryCommand . withConnLock c connId name` — by inlining, the subQ write can be placed outside `withConnLock` but inside `tryCommand` (retaining retry/error handling). + +```haskell +ICAck rId srvMsgId -> withServer $ \srv -> + tryCommand $ do + t_ <- withConnLock c connId "ICAck" $ ack srv rId srvMsgId + -- subQ write is OUTSIDE connLock — cannot deadlock with agentSubscriber + forM_ t_ $ atomically . 
writeTBQueue subQ + +ICAckDel rId srvMsgId msgId -> withServer $ \srv -> + tryCommand $ do + t_ <- withConnLock c connId "ICAckDel" $ do + t_ <- ack srv rId srvMsgId + withStore' c (\db -> deleteMsg db connId msgId) + pure t_ + -- subQ write is OUTSIDE connLock — cannot deadlock with agentSubscriber + forM_ t_ $ atomically . writeTBQueue subQ +``` + +Where `ack` now returns `AM (Maybe ATransmission)`: +```haskell +ack srv rId srvMsgId = do + rq <- withStore c $ \db -> getRcvQueue db connId srv rId + ackQueueMessage c rq srvMsgId +``` + +All subQ writes for MSGNTF are now outside connLock. FIFO ordering is preserved — no `nonBlockingWriteTBQueue`, no forked threads. The same thread that held the lock writes to subQ sequentially after releasing it. + +### Race analysis + +Window between connLock release and subQ write at Site 1: + +| Thread | Can acquire connLock(X)? | Writes subQ? | Consequence | +|--------|-------------------------|-------------|-------------| +| agentSubscriber via sendMessagesB_ | Yes | **No** (encrypts only) | No race | +| processSMP for connId X | Yes | Yes (pending flush) | MSG before SWITCH — cosmetic | +| runCommandProcessing for connId X | Yes | Yes (pending flush) | Command response before SWITCH — cosmetic | +| New queue delivery worker | No (SENT outside lock) | Yes | SENT before SWITCH — cosmetic, **already exists in current code** | + +All races are cosmetic UI ordering. None affect ratchet state, protocol correctness, or message delivery. 
+ +### Summary of changes + +| File | Change | Lines affected | +|------|--------|---------------| +| Agent.hs | Restructure AM_QTEST_ to return event from `withConnLock`, write outside | ~2187-2214 | +| Agent.hs | Change `ackQueueMessage` return type to `AM (Maybe ATransmission)`, return event instead of writing | ~2371-2386 | +| Agent.hs | `ackMessage'`: return event from `withConnLock`, write outside | ~2253-2267 | +| Agent.hs | `ICAck`/`ICAckDel`: inline `tryCommand` + `withConnLock`, write subQ between scopes | ~1823-1824 | +| Agent.hs | `ack` helper: propagate new return type | ~1899-1901 | + +No new data structures. No new modules. No changes to other write sites (1937, 3216 — already safe). ~25 lines changed total. diff --git a/src/Simplex/Messaging/Agent.hs b/src/Simplex/Messaging/Agent.hs index 296bba26e..e8eb22fcf 100644 --- a/src/Simplex/Messaging/Agent.hs +++ b/src/Simplex/Messaging/Agent.hs @@ -1820,8 +1820,13 @@ runCommandProcessing c@AgentClient {subQ} connId server_ Worker {doWork} = do _ -> throwE $ CMD PROHIBITED "SWCH: not duplex" DEL -> withServer' . tryCommand $ deleteConnection' c NRMBackground connId >> notify OK AInternalCommand cmd -> case cmd of - ICAckDel rId srvMsgId msgId -> withServer $ \srv -> tryWithLock "ICAckDel" $ ack srv rId srvMsgId >> withStore' c (\db -> deleteMsg db connId msgId) - ICAck rId srvMsgId -> withServer $ \srv -> tryWithLock "ICAck" $ ack srv rId srvMsgId + ICAckDel rId srvMsgId msgId -> withServer $ \srv -> + tryCommand $ withConnLockNotify c connId "ICAckDel" $ do + t_ <- ack srv rId srvMsgId + withStore' c (\db -> deleteMsg db connId msgId) + pure t_ + ICAck rId srvMsgId -> withServer $ \srv -> + tryCommand $ withConnLockNotify c connId "ICAck" $ ack srv rId srvMsgId ICAllowSecure _rId senderKey -> withServer' . 
tryMoveableWithLock "ICAllowSecure" $ do (SomeConn _ conn, AcceptedConfirmation {senderConf, ownConnInfo}) <- withStore c $ \db -> runExceptT $ (,) <$> ExceptT (getConn db connId) <*> ExceptT (getAcceptedConfirmation db connId) @@ -2184,7 +2189,7 @@ runSmpQueueMsgDelivery c@AgentClient {subQ} sq@SndQueue {userId, connId, server, cStats <- connectionStats c conn notify $ SWITCH QDSnd SPConfirmed cStats AM_QUSE_ -> pure () - AM_QTEST_ -> withConnLock c connId "runSmpQueueMsgDelivery AM_QTEST_" $ do + AM_QTEST_ -> withConnLockNotify c connId "runSmpQueueMsgDelivery AM_QTEST_" $ do withStore' c $ \db -> setSndQueueStatus db sq Active SomeConn _ conn <- withStore c (`getConn` connId) case conn of @@ -2208,7 +2213,7 @@ runSmpQueueMsgDelivery c@AgentClient {subQ} sq@SndQueue {userId, connId, server, let sqs'' = sq'' :| sqs' conn' = DuplexConnection cData' rqs sqs'' cStats <- connectionStats c conn' - notify $ SWITCH QDSnd SPCompleted cStats + pure $ Just ("", connId, AEvt SAEConn $ SWITCH QDSnd SPCompleted cStats) _ -> internalErr msgId "sent QTEST: there is only one queue in connection" _ -> internalErr msgId "sent QTEST: queue not in connection or not replacing another queue" _ -> internalErr msgId "QTEST sent not in duplex connection" @@ -2240,7 +2245,9 @@ runSmpQueueMsgDelivery c@AgentClient {subQ} sq@SndQueue {userId, connId, server, notifyDel msgId cmd = notify cmd >> delMsg msgId connError msgId = notifyDel msgId . ERR . (`CONN` "") qError msgId = notifyDel msgId . ERR . AGENT . A_QUEUE - internalErr msgId = notifyDel msgId . ERR . INTERNAL + internalErr msgId s = do + delMsg msgId + pure $ Just ("", connId, AEvt SAEConn $ ERR $ INTERNAL s) retrySndOp :: AgentClient -> AM () -> AM () retrySndOp c loop = do @@ -2250,17 +2257,31 @@ retrySndOp c loop = do atomically $ beginAgentOperation c AOSndNetwork loop +-- | Like 'withConnLock', but writes the returned 'ATransmission' to 'subQ' +-- after releasing the lock, preventing deadlock with agentSubscriber. 
+withConnLockNotify :: AgentClient -> ConnId -> Text -> AM (Maybe ATransmission) -> AM () +withConnLockNotify c connId name action = do + t_ <- withConnLock c connId name action + forM_ t_ $ atomically . writeTBQueue (subQ c) + ackMessage' :: AgentClient -> ConnId -> AgentMsgId -> Maybe MsgReceiptInfo -> AM () -ackMessage' c connId msgId rcptInfo_ = withConnLock c connId "ackMessage" $ do +ackMessage' c connId msgId rcptInfo_ = withConnLockNotify c connId "ackMessage" $ do SomeConn _ conn <- withStore c (`getConn` connId) case conn of - DuplexConnection {} -> ack >> sendRcpt conn >> del - RcvConnection {} -> ack >> del + DuplexConnection {} -> do + t_ <- ack + sendRcpt conn + del + pure t_ + RcvConnection {} -> do + t_ <- ack + del + pure t_ SndConnection {} -> throwE $ CONN SIMPLEX "ackMessage" ContactConnection {} -> throwE $ CMD PROHIBITED "ackMessage: ContactConnection" NewConnection _ -> throwE $ CMD PROHIBITED "ackMessage: NewConnection" where - ack :: AM () + ack :: AM (Maybe ATransmission) ack = do -- the stored message was delivered via a specific queue, the rest failed to decrypt and were already acknowledged (rq, srvMsgId) <- withStore c $ \db -> setMsgUserAck db connId $ InternalId msgId @@ -2368,7 +2389,7 @@ synchronizeRatchet' c connId pqSupport' force = withConnLock c connId "synchroni | otherwise -> throwE $ CMD PROHIBITED "synchronizeRatchet: not allowed" _ -> throwE $ CMD PROHIBITED "synchronizeRatchet: not duplex" -ackQueueMessage :: AgentClient -> RcvQueue -> SMP.MsgId -> AM () +ackQueueMessage :: AgentClient -> RcvQueue -> SMP.MsgId -> AM (Maybe ATransmission) ackQueueMessage c rq@RcvQueue {userId, connId, server} srvMsgId = do atomically $ incSMPServerStat c userId server ackAttempts tryAllErrors (sendAck c rq srvMsgId) >>= \case @@ -2380,10 +2401,11 @@ ackQueueMessage c rq@RcvQueue {userId, connId, server} srvMsgId = do where sendMsgNtf stat = do atomically $ incSMPServerStat c userId server stat - whenM (liftIO $ hasGetLock c rq) $ do - 
atomically $ releaseGetLock c rq - brokerTs_ <- eitherToMaybe <$> tryAllErrors (withStore c $ \db -> getRcvMsgBrokerTs db connId srvMsgId) - atomically $ writeTBQueue (subQ c) ("", connId, AEvt SAEConn $ MSGNTF srvMsgId brokerTs_) + ifM (liftIO $ hasGetLock c rq) + (do atomically $ releaseGetLock c rq + brokerTs_ <- eitherToMaybe <$> tryAllErrors (withStore c $ \db -> getRcvMsgBrokerTs db connId srvMsgId) + pure $ Just ("", connId, AEvt SAEConn $ MSGNTF srvMsgId brokerTs_)) + (pure Nothing) -- | Suspend SMP agent connection (OFF command) in Reader monad suspendConnection' :: AgentClient -> NetworkRequestMode -> ConnId -> AM () From 50ae1e1c3e9400e841cc6db455e20cf05bd465b8 Mon Sep 17 00:00:00 2001 From: Evgeny Poberezkin Date: Tue, 24 Feb 2026 18:05:12 +0000 Subject: [PATCH 10/14] docs: update governance process --- rfcs/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfcs/README.md b/rfcs/README.md index 07f3c2320..b7a87d7d0 100644 --- a/rfcs/README.md +++ b/rfcs/README.md @@ -20,9 +20,9 @@ SimpleX follows the same Netscape-to-consortium evolution path, but with two rat - **Licensing ratchet**: all contributed IP is licensed under AGPLv3 (software) and Creative Commons (documentation), perpetually and irrevocably. What is licensed cannot be unlicensed. If a Party transfers Licensed IP, the licensing obligations transfer with it. -- **Governance ratchet**: power can be given to the SimpleX Network Consortium, but never taken back. The Consortium Agreement requires unanimous approval of all Governing Parties for changes to the agreement itself, IP policy, and admission or removal of parties. +- **Governance ratchet**: power can be given to the SimpleX Network Consortium, but never taken back. The Consortium Agreement requires majority decision of all Governing Parties for changes to the agreement itself, IP policy, and admission or removal of parties. 
-The ratcheted transition is not just a clever device — it is a historically proven imperative. It allows the company to continue driving rapid product innovation (as Netscape did for the web) while incrementally and irreversibly transferring governance to the consortium, without the abrupt handover that stalled web evolution or the centralization that later undermined it.
+The ratcheted transition is historically proven to be necessary. It allows the company to continue driving rapid product innovation (as Netscape did for the web) while incrementally and irreversibly transferring governance to the consortium, without the abrupt handover that stalled web evolution or the centralization that later undermined it.
 
 ### Specification governance via the Consortium Agreement
 

From f6aca4760474b6e3d73cc3e5b74ad8d07a160ed0 Mon Sep 17 00:00:00 2001
From: Evgeny
Date: Mon, 2 Mar 2026 09:57:46 +0000
Subject: [PATCH 11/14] xftp: implementation of XFTP client as web page (#1708)

* xftp: implementation of XFTP client as web page (rfc, low level functions)

* protocol, file descriptions, more cryptography, handshake encoding, etc.
* xftp server changes to support web clients: SNI-based certificate choice, CORS headers, OPTIONS request

* web handshake

* test for xftp web handshake

* xftp-web client functions, fix transmission encoding

* support description "redirect" in agent.ts and cross-platform compatibility tests (Haskell <> TypeScript)

* rfc: web transport

* client transport abstraction

* browser environment

* persistent client sessions

* move rfcs

* web page plan

* improve plan

* webpage implementation (not tested)

* fix test

* fix test 2

* fix test 3

* fixes and page test plan

* allow sending xftp client hello after handshake - for web clients that don't know if established connection exists

* page tests pass

* concurrent and padded hellos in the server

* update TS client to pad hellos

* fix tests

* preview:local

* local preview over https

* fixed https in the test page

* web test cert fixtures

* debug logging in web page and server

* remove debug logging in server/browser, run preview xftp server via cabal run to ensure the latest code is used

* debug logging for page sessions

* add plan

* improve error handling, handle browser reconnections/re-handshake

* fix

* debugging

* opfs fallback

* delete test screenshot

* xftp CLI to support link

* fix encoding for XFTPServerHandshake

* support redirect file descriptions in xftp CLI receive

* refactor CLI redirect

* xftp-web: fixes and multi-server upload (#1714)

* fix: await sodium.ready in crypto/keys.ts (+ digest.ts StateAddress cast)

* multi-server parallel upload, remove pickRandomServer

* fix worker message race: wait for ready signal before posting messages

* suppress vite build warnings: emptyOutDir, externals, chunkSizeWarningLimit

* fix Haskell web tests: use agent+server API, wrap server in array, suppress debug logs

* remove dead APIs: un-export connectXFTP, delete closeXFTP

* fix TypeScript errors in check:web (#1716)

- client.ts: cast globalThis.process to any for browser tsconfig, suppress node:http2 import, use any for
Buffer/chunks, cast fetch body - crypto.worker.ts: cast sha512_init() return to StateAddress * fix: serialize worker message processing to prevent OPFS handle race async onmessage allows interleaved execution at await points. When downloadFileRaw fetches chunks from multiple servers in parallel, concurrent handleDecryptAndStore calls both see downloadWriteHandle as null and race on createSyncAccessHandle for the same file, causing intermittent NoModificationAllowedError. Chain message handlers on a promise queue so each runs to completion before the next starts. * xftp-web: prepare for npm publishing (#1715) * prepare package.json for npm publishing Remove private flag, add description/license/repository/publishConfig, rename postinstall to pretest, add prepublishOnly, set files and main. * stable output filenames in production build * fix repository url format, expand files array * embeddable component: scoped CSS, dark mode, i18n, events, share - worker output to assets/ for single-directory deployment - scoped all CSS under #app, removed global resets - dark mode via .dark ancestor class - progress ring reads colors from CSS custom properties - i18n via window.__XFTP_I18N__ with t() helper - configurable mount element via data-xftp-app attribute - optional hashchange listener (data-no-hashchange) - completion events: xftp:upload-complete, xftp:download-complete - enhanced file-too-large error mentioning SimpleX app - native share button via navigator.share * deferred init and runtime server configuration - data-defer-init attribute skips auto-initialization - window.__XFTP_SERVERS__ overrides baked-in server list * use relative base path for relocatable build output * xftp-web: retry resets to default state, use innerHTML for errors * xftp-web: only enter download mode for valid XFTP URIs in hash * xftp-web: render UI before WASM is ready Move sodium.ready await after UI initialization so the upload/download interface appears instantly. 
WASM is only needed when user triggers an actual upload or download. Dispatch xftp:ready event once WASM loads. * xftp-web: CLS placeholder HTML and embedder CSS selectors Add placeholder HTML to index.html so the page renders a styled card before JS executes, preventing layout shift. Use a