diff --git a/plans/20260522_01_smp_public_namespaces.md b/plans/20260522_01_smp_public_namespaces.md index a19763562..5f90e67fd 100644 --- a/plans/20260522_01_smp_public_namespaces.md +++ b/plans/20260522_01_smp_public_namespaces.md @@ -177,7 +177,7 @@ In-flight `resolveName` calls during shutdown receive `ConnectionClosed` → `Et **`incStat` relocation.** Defined at `Server.hs:2220`, currently unexported. Move to `Server/Stats.hs` (one-line transplant + export) so `Resolver.hs` can use it. -**Co-located proxy refused.** `newEnv` aborts startup if both `allowSMPProxy = True` and `namesConfig = Just _`, unless `allow_dangerous_colocation = on`. RSLV is the first slow forwarded command; on a proxy host it can serialise other forwarded commands on the same proxy-relay session up to `rpcTimeoutMs` per cache miss. `forkForwardedCmd` async dispatch is the longer-term fix, tracked as a follow-up. +**Co-located proxy warning.** `newEnv` logs a startup warning whenever `allowSMPProxy = True` and `namesConfig = Just _`. RSLV is the first slow forwarded command; on a proxy host it can serialise other forwarded commands on the same proxy-relay session up to `rpcTimeoutMs` per cache miss. The warning is not a hard refusal because `[PROXY]` has no `enable: on/off` toggle — proxy is always on for every smp-server. `forkForwardedCmd` async dispatch is the longer-term fix, tracked as a follow-up; once the proxy role is gateable per-server, the warning can be tightened back to a refusal. ## Resolver subtree @@ -256,7 +256,6 @@ data NamesConfig = NamesConfig , rpcTimeoutMs :: Int -- 3000 , rpcMaxResponseBytes :: Int -- 262144 (256 KB) , rpcMaxConcurrency :: Int -- 8 - , dangerousColocation :: Bool -- override the §"Server changes" startup guard } data RpcAuth = AuthBearer Text | AuthBasic Text Text diff --git a/src/Simplex/Messaging/Protocol.hs b/src/Simplex/Messaging/Protocol.hs index e1f8f54d1..3a5f88903 100644 --- a/src/Simplex/Messaging/Protocol.hs +++ b/src/Simplex/Messaging/Protocol.hs @@ -164,7 +164,6 @@ module Simplex.Messaging.Protocol FwdResponse (..), FwdTransmission (..), LookupKey (..), - unLookupKey, NameRecord (..), NameOwner, mkNameOwner, @@ -573,10 +572,6 @@ type QueueId = EntityId newtype LookupKey = LookupKey ByteString deriving (Eq, Show) -unLookupKey :: LookupKey -> ByteString -unLookupKey (LookupKey s) = s -{-# INLINE unLookupKey #-} - instance Encoding LookupKey where smpEncode (LookupKey s) = smpEncode s smpP = do diff --git a/src/Simplex/Messaging/Server/Env/STM.hs b/src/Simplex/Messaging/Server/Env/STM.hs index 382c820c8..6f6ca1aaf 100644 --- a/src/Simplex/Messaging/Server/Env/STM.hs +++ b/src/Simplex/Messaging/Server/Env/STM.hs @@ -115,7 +115,9 @@ import Simplex.Messaging.Server.Information import Simplex.Messaging.Server.MsgStore.Journal import Simplex.Messaging.Server.MsgStore.STM import Simplex.Messaging.Server.MsgStore.Types -import Simplex.Messaging.Server.Names (NamesConfig (..), NamesEnv, closeNamesEnv, newNamesEnv) +import Simplex.Messaging.Server.Names (NamesConfig (..), NamesEnv, newNamesEnv, pingEndpoint) +import Simplex.Messaging.Server.Names.Eth.RPC (scrubUrl) +import Simplex.Messaging.Util (tshow) import Simplex.Messaging.Server.NtfStore import Simplex.Messaging.Server.QueueStore import Simplex.Messaging.Server.QueueStore.Postgres.Config @@ -609,13 +611,19 @@ newEnv config@ServerConfig {allowSMPProxy, smpCredentials, httpCredentials, serv proxyAgent <- newSMPProxyAgent smpAgentCfg random namesEnv <- case namesConfig of Nothing -> pure Nothing - Just nc - | allowSMPProxy && not (dangerousColocation nc) -> do - logError "[NAMES] enable: on with [PROXY] is refused — RSLV cache misses can serialise other forwarded commands. Set allow_dangerous_colocation = on to override." - exitFailure - | otherwise -> do - let rs = rslvStats serverStats - Just <$> newNamesEnv nc (rslvCacheHits rs) (rslvCacheMiss rs) + Just nc -> do + logInfo $ "[NAMES] resolver enabled, endpoint=" <> scrubUrl (ethereumEndpoint nc) + when allowSMPProxy $ + logWarn "[NAMES] enable: on on a proxy-role host: slow RSLV cache misses can serialise other forwarded commands on the same proxy-relay session. For high-volume deployments, run [NAMES] on a separate host." + let rs = rslvStats serverStats + env <- newNamesEnv nc (rslvCacheHits rs) (rslvCacheMiss rs) + -- Probe the endpoint at startup. Don't exitFailure: a flapping + -- network or an Ethereum host coming up minutes after smp-server + -- should not block the server. Log so operators can spot it. + pingEndpoint env >>= \case + Right _ -> logInfo "[NAMES] endpoint probe ok" + Left e -> logWarn $ "[NAMES] endpoint probe failed (server will still start, RSLV will return ERR AUTH until reachable): " <> tshow e + pure (Just env) pure Env { serverActive, diff --git a/src/Simplex/Messaging/Server/Main.hs b/src/Simplex/Messaging/Server/Main.hs index edde8a78b..9dfec8aba 100644 --- a/src/Simplex/Messaging/Server/Main.hs +++ b/src/Simplex/Messaging/Server/Main.hs @@ -39,7 +39,6 @@ module Simplex.Messaging.Server.Main strParse, ) where -import Control.Applicative ((<|>)) import Control.Concurrent.STM import Control.Exception (finally) import Control.Logger.Simple @@ -77,8 +76,10 @@ import Simplex.Messaging.Server.Main.Init import Simplex.Messaging.Server.Web (EmbeddedWebParams (..), WebHttpsParams (..)) import Simplex.Messaging.Server.MsgStore.Journal (JournalMsgStore (..), QStoreCfg (..), stmQueueStore) import Simplex.Messaging.Server.MsgStore.Types (MsgStoreClass (..), SQSType (..), SMSType (..), newMsgStore) +import Network.URI (URI (..), URIAuth (..), parseAbsoluteURI) import Simplex.Messaging.Protocol (mkNameOwner, NameOwner) import Simplex.Messaging.Server.Names (NamesConfig (..), RpcAuth (..)) +import Simplex.Messaging.Server.Names.Eth.RPC (fromHex) import Simplex.Messaging.Server.QueueStore.Postgres.Config import Simplex.Messaging.Server.StoreLog.ReadWrite (readQueueStore) import Simplex.Messaging.Transport (supportedProxyClientSMPRelayVRange, alpnSupportedSMPHandshakes, supportedServerSMPRelayVRange) @@ -804,55 +805,73 @@ readNamesConfig :: Ini -> Maybe NamesConfig readNamesConfig ini | not enabled = Nothing | otherwise = - Just - NamesConfig - { ethereumEndpoint = requiredText "ethereum_endpoint", - snrcAddress = either (error . ("[NAMES] snrc_address: " <>)) id $ parseEthAddr (requiredText "snrc_address"), - rpcAuth = either (error . ("[NAMES] rpc_auth: " <>)) Just . parseRpcAuth =<< eitherToMaybe (lookupValue "NAMES" "rpc_auth" ini), - cacheSeconds = readIniDefault 300 "NAMES" "cache_seconds" ini, - cacheMaxEntries = readIniDefault 100000 "NAMES" "cache_max_entries" ini, - cacheMaxBytes = readIniDefault 67108864 "NAMES" "cache_max_bytes" ini, - rpcTimeoutMs = readIniDefault 3000 "NAMES" "rpc_timeout_ms" ini, - rpcMaxResponseBytes = readIniDefault 262144 "NAMES" "rpc_max_response_bytes" ini, - rpcMaxConcurrency = readIniDefault 8 "NAMES" "rpc_max_concurrency" ini, - dangerousColocation = fromMaybe False (iniOnOff "NAMES" "allow_dangerous_colocation" ini) - } + let rpcAuth_ = either (error . ("[NAMES] rpc_auth: " <>)) Just . parseRpcAuth =<< eitherToMaybe (lookupValue "NAMES" "rpc_auth" ini) + endpoint = requiredText "ethereum_endpoint" + in Just + NamesConfig + { ethereumEndpoint = either (error . ("[NAMES] ethereum_endpoint: " <>)) id (validateUrl endpoint rpcAuth_), + snrcAddress = either (error . ("[NAMES] snrc_address: " <>)) id $ parseEthAddr (requiredText "snrc_address"), + rpcAuth = rpcAuth_, + cacheSeconds = readIniDefault 300 "NAMES" "cache_seconds" ini, + cacheMaxEntries = readIniDefault 100000 "NAMES" "cache_max_entries" ini, + cacheMaxBytes = readIniDefault 67108864 "NAMES" "cache_max_bytes" ini, + rpcTimeoutMs = readIniDefault 3000 "NAMES" "rpc_timeout_ms" ini, + rpcMaxResponseBytes = readIniDefault 262144 "NAMES" "rpc_max_response_bytes" ini, + rpcMaxConcurrency = readIniDefault 8 "NAMES" "rpc_max_concurrency" ini + } where enabled = fromMaybe False (iniOnOff "NAMES" "enable" ini) requiredText key = either (error . (("[NAMES] " <> T.unpack key <> " is required: ") <>)) id $ lookupValue "NAMES" key ini +-- | Validate the ethereum_endpoint URL: +-- * scheme must be http: or https: +-- * authority (host) must be present and non-empty +-- * port MUST be explicit (rejects http://host without :8545 to avoid +-- accidentally hitting :80 when Reth listens on :8545) +-- * userinfo (user:pass@) MUST NOT be present (credentials belong in +-- rpc_auth so they don't leak via Host header or logs) +-- * query and fragment MUST NOT be present +-- * https requires rpc_auth on non-loopback hosts (operator misconfig +-- guard — a public HTTPS endpoint without auth is almost always wrong) +validateUrl :: Text -> Maybe RpcAuth -> Either String Text +validateUrl url auth_ = do + uri <- maybe (Left "not an absolute URI") Right $ parseAbsoluteURI (T.unpack url) + let scheme = uriScheme uri + unless (scheme == "http:" || scheme == "https:") $ + Left ("scheme " <> show scheme <> " not supported (use http or https)") + ua <- maybe (Left "missing authority (host)") Right (uriAuthority uri) + when (null (uriRegName ua)) $ Left "empty host" + unless (null (uriUserInfo ua)) $ Left "userinfo (user:pass@) not allowed; use rpc_auth instead" + when (null (uriPort ua)) $ Left "explicit port required (e.g. http://host:8545)" + unless (null (uriQuery uri)) $ Left "query string not allowed" + unless (null (uriFragment uri)) $ Left "fragment not allowed" + let path = uriPath uri + unless (path == "" || path == "/") $ + Left "URL path not allowed; API keys embedded in the path leak to logs — use rpc_auth instead" + when (scheme == "https:" && not (isLoopback (uriRegName ua)) && isNothing auth_) $ + Left "https endpoint on a non-loopback host requires rpc_auth" + Right url + where + isLoopback h = h == "127.0.0.1" || h == "localhost" || h == "[::1]" + -- | Parse a 20-byte Ethereum address as text "0x[hex40]" or "[hex40]". --- Step 4 minimal validation; EIP-55 checksum check lands in step 5. +-- EIP-55 mixed-case checksum verification is a follow-up. parseEthAddr :: Text -> Either String NameOwner -parseEthAddr t = - let s = case T.stripPrefix "0x" t <|> T.stripPrefix "0X" t of - Just rest -> rest - Nothing -> t - in if T.length s == 40 && T.all isHex s - then mkNameOwner (hexDecode (encodeUtf8 s)) - else Left "expected 0x-prefixed 40 hex characters" - where - isHex c = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') - --- | Decode a hex string of even length. Precondition: input is already --- validated as even-length and all-hex (validated by caller). -hexDecode :: ByteString -> ByteString -hexDecode = B.pack . go - where - go s - | B.null s = [] - | otherwise = toEnum (16 * digit (B.head s) + digit (B.index s 1)) : go (B.drop 2 s) - digit c - | c >= '0' && c <= '9' = fromEnum c - fromEnum '0' - | c >= 'a' && c <= 'f' = 10 + fromEnum c - fromEnum 'a' - | otherwise = 10 + fromEnum c - fromEnum 'A' +parseEthAddr t = do + bs <- fromHex (encodeUtf8 t) + if B.length bs == 20 + then mkNameOwner bs + else Left "expected a 20-byte address (40 hex characters, optionally 0x-prefixed)" +-- | Parse an rpc_auth INI value. Scheme keyword is case-insensitive so +-- "Bearer " / "BEARER " (Caddy / RFC 7235 convention) work +-- as well as the lowercase form. parseRpcAuth :: Text -> Either String RpcAuth parseRpcAuth t = case T.words t of - ["bearer", tok] -> Right $ AuthBearer tok - ["basic", up] -> case T.breakOn ":" up of + [scheme, tok] | T.toLower scheme == "bearer" -> Right $ AuthBearer tok + [scheme, up] | T.toLower scheme == "basic" -> case T.breakOn ":" up of (u, rest) | not (T.null u) && ":" `T.isPrefixOf` rest -> Right $ AuthBasic u (T.drop 1 rest) _ -> Left "basic auth expects user:password" diff --git a/src/Simplex/Messaging/Server/Main/Init.hs b/src/Simplex/Messaging/Server/Main/Init.hs index 1091bb261..bc6689d85 100644 --- a/src/Simplex/Messaging/Server/Main/Init.hs +++ b/src/Simplex/Messaging/Server/Main/Init.hs @@ -158,7 +158,8 @@ iniFileContent cfgPath logPath opts host basicAuth controlPortPwds = \[NAMES]\n\ \# Public-namespace resolution (SNRC on Ethereum).\n\ \# Requires an Ethereum JSON-RPC endpoint (Reth+Nimbus). See deployment guide.\n\ - \# Cannot be combined with [PROXY] enable: on by default - see allow_dangerous_colocation.\n\ + \# Co-locating with the proxy role logs a warning at startup - slow RSLV cache misses\n\ + \# can serialise other forwarded commands. For high-volume deployments, run on a separate host.\n\ \# Restart required to change settings.\n\ \enable: off\n\ \# Same-host:\n\ @@ -166,14 +167,14 @@ iniFileContent cfgPath logPath opts host basicAuth controlPortPwds = \# Central Reth via Caddy:\n\ \# ethereum_endpoint: https://eth.simplex.chat:443\n\ \# rpc_auth: basic :\n\ - \# snrc_address: 0x0000000000000000000000000000000000000000\n\ + \# snrc_address: 0x\n\ + \# (cache_max_entries and cache_max_bytes both cap the cache; whichever fills first triggers FIFO eviction)\n\ \# cache_seconds: 300\n\ \# cache_max_entries: 100000\n\ \# cache_max_bytes: 67108864\n\ \# rpc_timeout_ms: 3000\n\ \# rpc_max_response_bytes: 262144\n\ - \# rpc_max_concurrency: 8\n\ - \# allow_dangerous_colocation: off\n\n\ + \# rpc_max_concurrency: 8\n\n\ \[INACTIVE_CLIENTS]\n\ \# TTL and interval to check inactive clients\n\ \disconnect = on\n" diff --git a/src/Simplex/Messaging/Server/Names.hs b/src/Simplex/Messaging/Server/Names.hs index eea09b013..a3088a79d 100644 --- a/src/Simplex/Messaging/Server/Names.hs +++ b/src/Simplex/Messaging/Server/Names.hs @@ -10,9 +10,10 @@ module Simplex.Messaging.Server.Names ResolveError (..), newNamesEnv, closeNamesEnv, + pingEndpoint, resolveName, ) where import Simplex.Messaging.Server.Names.Eth.RPC (RpcAuth (..)) -import Simplex.Messaging.Server.Names.Resolver (NamesConfig (..), NamesEnv, ResolveError (..), closeNamesEnv, newNamesEnv, resolveName) +import Simplex.Messaging.Server.Names.Resolver (NamesConfig (..), NamesEnv, ResolveError (..), closeNamesEnv, newNamesEnv, pingEndpoint, resolveName) diff --git a/src/Simplex/Messaging/Server/Names/Eth/RPC.hs b/src/Simplex/Messaging/Server/Names/Eth/RPC.hs index f89127343..f7d1a0649 100644 --- a/src/Simplex/Messaging/Server/Names/Eth/RPC.hs +++ b/src/Simplex/Messaging/Server/Names/Eth/RPC.hs @@ -20,6 +20,7 @@ module Simplex.Messaging.Server.Names.Eth.RPC newEthRpcEnv, closeEthRpcEnv, ethCallReal, + fromHex, scrubUrl, ) where diff --git a/src/Simplex/Messaging/Server/Names/Eth/SNRC.hs b/src/Simplex/Messaging/Server/Names/Eth/SNRC.hs index c645b8ebe..80b11a255 100644 --- a/src/Simplex/Messaging/Server/Names/Eth/SNRC.hs +++ b/src/Simplex/Messaging/Server/Names/Eth/SNRC.hs @@ -33,6 +33,7 @@ module Simplex.Messaging.Server.Names.Eth.SNRC decodeWord256Int64, decodeAddress, decodeString, + decodeUtf8Text, decodeStringArray, ) where @@ -42,6 +43,8 @@ import qualified Data.ByteArray as BA import Data.ByteString.Char8 (ByteString) import qualified Data.ByteString.Char8 as B import Data.Int (Int64) +import Data.Text (Text) +import Data.Text.Encoding (decodeUtf8') import Simplex.Messaging.Protocol (NameOwner, NameRecord, mkNameOwner, unNameOwner) -- | ABI-decode failure modes (caller collapses to ResolveError EthDecodeErr). @@ -88,11 +91,16 @@ padLeft32 bs where n = B.length bs --- | Read a uint256 at byte offset, fail if it doesn't fit in Int64. +-- | Read a uint256 at byte offset, fail if it doesn't fit in *signed* Int64. +-- Rejects both (a) any non-zero byte in the high 24 bytes and (b) the high +-- bit of the low 8 bytes being set — the latter is essential because Int64 +-- would otherwise sign-flip a uint64 value into a negative integer, silently +-- corrupting downstream length math. decodeWord256Int64 :: Int -> ByteString -> Either AbiError Int64 decodeWord256Int64 off buf | off + 32 > B.length buf = Left AbiTruncated - | B.any (/= toEnum 0) (B.take 24 (B.drop off buf)) = Left AbiNonZeroHighBytes + | B.any (/= '\NUL') (B.take 24 (B.drop off buf)) = Left AbiNonZeroHighBytes + | B.index buf (off + 24) >= '\x80' = Left AbiNonZeroHighBytes | otherwise = Right $ B.foldl shiftIn 0 (B.take 8 (B.drop (off + 24) buf)) where shiftIn :: Int64 -> Char -> Int64 @@ -109,6 +117,8 @@ decodeAddress off buf Left e -> Left (AbiInvariantViolated e) -- | Decode a Solidity `string` whose data starts at byte offset `off`. +-- Returns raw bytes; UTF-8 validity is the caller's choice (use +-- `decodeUtf8Text` if a Text is required). decodeString :: Int -> Int -> Int -> ByteString -> Either AbiError ByteString decodeString headEnd off cap buf | off < headEnd = Left AbiBackwardOffset @@ -123,6 +133,13 @@ decodeString headEnd off cap buf then Left AbiTruncated else Right $ B.take len (B.drop (off + 32) buf) +-- | Decode a Solidity `string` as Text, failing with AbiBadUtf8 on +-- invalid UTF-8. This is what NameRecord decoder composition will use. +decodeUtf8Text :: Int -> Int -> Int -> ByteString -> Either AbiError Text +decodeUtf8Text headEnd off cap buf = do + raw <- decodeString headEnd off cap buf + either (const (Left AbiBadUtf8)) Right (decodeUtf8' raw) + -- | Decode a Solidity `string[]` at byte offset `off`. Each element capped -- at `byteCap` bytes, total element count capped at `cntCap`. Depth must be -- < 2 (recurses one level into decodeString). diff --git a/src/Simplex/Messaging/Server/Names/Resolver.hs b/src/Simplex/Messaging/Server/Names/Resolver.hs index 52be961f1..8dbd8d60f 100644 --- a/src/Simplex/Messaging/Server/Names/Resolver.hs +++ b/src/Simplex/Messaging/Server/Names/Resolver.hs @@ -16,19 +16,25 @@ module Simplex.Messaging.Server.Names.Resolver newNamesEnv, newNamesEnvWith, closeNamesEnv, + pingEndpoint, resolveName, ) where import Control.Concurrent.STM import qualified Control.Exception as E +import Control.Logger.Simple (logError) import Data.ByteString.Char8 (ByteString) import qualified Data.HashPSQ as PSQ import Data.IORef (IORef) import Data.Text (Text) +import qualified Data.Text as T import Data.Word (Word64) +import Data.Time.Clock.POSIX (getPOSIXTime) import GHC.Clock (getMonotonicTimeNSec) -import Simplex.Messaging.Protocol (NameOwner, NameRecord, unNameOwner) +import qualified Data.ByteString.Char8 as B +import qualified Data.Text.Encoding as T +import Simplex.Messaging.Protocol (NameLink, NameOwner, NameRecord (..), unNameLink, unNameOwner) import Simplex.Messaging.Server.Names.Eth.RPC (EthRpcEnv, EthRpcError (..), RpcAuth (..), closeEthRpcEnv, ethCallReal, newEthRpcEnv) import Simplex.Messaging.Server.Names.Eth.SNRC (decodeGetRecord, encodeGetRecord, namehash) import Simplex.Messaging.Util (atomicModifyIORef'_) @@ -44,8 +50,7 @@ data NamesConfig = NamesConfig cacheMaxBytes :: Int, rpcTimeoutMs :: Int, rpcMaxResponseBytes :: Int, - rpcMaxConcurrency :: Int, - dangerousColocation :: Bool + rpcMaxConcurrency :: Int } deriving (Show) @@ -61,11 +66,14 @@ data ResolveError -- Production wires this to ethCallReal; tests substitute a stub. type EthCall = ByteString -> ByteString -> IO (Either EthRpcError ByteString) --- | Cache value bundles a NameRecord with its insertion-time byte cost --- so eviction can keep total cache bytes under cacheMaxBytes. +-- | Cache value bundles a result (NameRecord or NotFound sentinel) with +-- its insertion-time byte cost and per-entry TTL (NotFound expires faster +-- than positive results so newly-registered names become visible quickly +-- while still preventing DoS via unique-name spam). data CacheEntry = CacheEntry - { ceRecord :: NameRecord, - ceBytes :: Int + { ceResult :: Maybe NameRecord, -- Nothing = NotFound; Just = Found + ceBytes :: Int, + ceTtlNs :: Word64 } -- | Cache state: (PSQ keyed by LookupKey, priority = insert time in ns, total bytes). @@ -101,33 +109,61 @@ newNamesEnvWith config ethCall rpcEnv cacheHitsRef cacheMissRef = do closeNamesEnv :: NamesEnv -> IO () closeNamesEnv NamesEnv {rpcEnv} = maybe (pure ()) closeEthRpcEnv rpcEnv +-- | Reach the configured endpoint with a harmless probe call to confirm +-- network reachability and basic config sanity. Returns Left only on +-- transport-level failures (DNS, TLS, refused) — a JSON-RPC error (e.g. +-- a misconfigured snrc_address) is treated as "endpoint reachable", +-- because the operator-friendly signal we want is "is the eth host alive, +-- not is your contract address right." That distinction surfaces later +-- via the rslvEthErrs counter. +pingEndpoint :: NamesEnv -> IO (Either EthRpcError ()) +pingEndpoint NamesEnv {ethCall, config} = do + let to = unNameOwner (snrcAddress config) + -- Use the ENS-style root node (32 zero bytes) — always a valid + -- bytes32 input that costs the contract nothing to look up. + callData = encodeGetRecord (namehash "") + ethCall to callData >>= \case + Left e@(HttpFailure _) -> pure (Left e) + Left e@(HttpStatusErr _) -> pure (Left e) + _ -> pure (Right ()) + -- | Resolve a lookup key. Coalesces concurrent identical requests, caches -- results for cacheSeconds, and bounds RPCs by rpcTimeoutMs. resolveName :: NamesEnv -> ByteString -> IO (Either ResolveError NameRecord) resolveName env key = do now <- getMonotonicTimeNSec cacheLookup env key now >>= \case - Just rec -> do + Just result -> do atomicModifyIORef'_ (cacheHitsRef env) (+ 1) - pure (Right rec) + pure $ maybe (Left NotFound) Right result Nothing -> do atomicModifyIORef'_ (cacheMissRef env) (+ 1) coalesce env key now -cacheLookup :: NamesEnv -> ByteString -> Word64 -> IO (Maybe NameRecord) -cacheLookup NamesEnv {config, cache} key now = atomically $ do +-- | Look up the key in cache. Returns: +-- Nothing — cache miss (or expired entry, which is evicted) +-- Just Nothing — cache hit for NotFound +-- Just (Just rec) — cache hit for a NameRecord +cacheLookup :: NamesEnv -> ByteString -> Word64 -> IO (Maybe (Maybe NameRecord)) +cacheLookup NamesEnv {cache} key now = atomically $ do (psq, totalBytes) <- readTVar cache case PSQ.lookup key psq of Just (insertedAt, ce) - | now < insertedAt + ttlNs config -> pure (Just (ceRecord ce)) + | now < insertedAt + ceTtlNs ce -> pure (Just (ceResult ce)) | otherwise -> do -- Expired: evict and signal miss. writeTVar cache (PSQ.delete key psq, totalBytes - ceBytes ce) pure Nothing Nothing -> pure Nothing -ttlNs :: NamesConfig -> Word64 -ttlNs cfg = fromIntegral (cacheSeconds cfg) * 1000000000 +ttlFoundNs :: NamesConfig -> Word64 +ttlFoundNs cfg = fromIntegral (cacheSeconds cfg) * 1000000000 + +-- | NotFound cache TTL — short enough that a newly-registered name becomes +-- visible within seconds, long enough to absorb a unique-name DoS burst. +-- Bounded by cacheSeconds in case the operator deliberately ran a tiny TTL. +ttlNotFoundNs :: NamesConfig -> Word64 +ttlNotFoundNs cfg = min (ttlFoundNs cfg) (30 * 1000000000) -- | Leader/waiter coalescing. Leader runs the RPC under E.mask; waiters -- block on the leader's TMVar. Cleanup runs even on async exception. @@ -144,19 +180,33 @@ coalesce env@NamesEnv {inflight} key now = do case ticket of Right mv -> atomically (readTMVar mv) -- waiter Left mv -> E.mask $ \restore -> do + -- Run the fetch with sync-only catching: async exceptions (cancel, + -- killThread) must propagate after we've completed the STM cleanup + -- so waiters never block on an orphan TMVar. r <- - restore (fetchOnceTimed env key) - `E.catch` \(e :: E.SomeException) -> pure (Left (mapEthExn e)) + E.try (restore (fetchOnceTimed env key)) >>= \case + Right ok -> pure ok + Left e + | Just (_ :: E.SomeAsyncException) <- E.fromException e -> do + -- Tell waiters the lookup failed, then rethrow. + atomically $ do + putTMVar mv (Left EthHttpErr) + modifyTVar' inflight (PSQ.delete key) + E.throwIO e + | otherwise -> do + logError $ "[NAMES] resolver fetch raised " <> T.pack (E.displayException e) + pure (Left (mapSyncEthExn e)) atomically $ do putTMVar mv r modifyTVar' inflight (PSQ.delete key) case r of - Right rec -> cacheInsert env key now rec - Left _ -> pure () + Right rec -> cacheInsert env key now (Just rec) (ttlFoundNs (config env)) + Left NotFound -> cacheInsert env key now Nothing (ttlNotFoundNs (config env)) + Left _ -> pure () -- transient errors (HTTP, decode, timeout) are not cached pure r -mapEthExn :: E.SomeException -> ResolveError -mapEthExn _ = EthHttpErr +mapSyncEthExn :: E.SomeException -> ResolveError +mapSyncEthExn _ = EthHttpErr fetchOnceTimed :: NamesEnv -> ByteString -> IO (Either ResolveError NameRecord) fetchOnceTimed env key = @@ -165,7 +215,7 @@ fetchOnceTimed env key = Nothing -> pure (Left TimedOut) fetchOnce :: NamesEnv -> ByteString -> IO (Either ResolveError NameRecord) -fetchOnce env@NamesEnv {ethCall, config} key = do +fetchOnce NamesEnv {ethCall, config} key = do let node = namehash key callData = encodeGetRecord node to = unNameOwner (snrcAddress config) @@ -177,13 +227,21 @@ fetchOnce env@NamesEnv {ethCall, config} key = do Left (JsonRpcErr c m) -> pure (Left EthRpcErr {rpcCode = c, rpcMessage = m}) Right ret -> case decodeGetRecord ret of Right Nothing -> pure (Left NotFound) - Right (Just rec) -> pure (Right rec) + Right (Just rec) -> do + nowSec <- floor <$> getPOSIXTime + -- Defense in depth: the SNRC contract should already return the + -- zero-owner sentinel for expired records, but a buggy / pre-upgrade + -- contract might not. nrExpiry == 0 means "never expires" (reserved + -- names); any positive expiry in the past is treated as NotFound. + if nrExpiry rec /= 0 && nrExpiry rec < nowSec + then pure (Left NotFound) + else pure (Right rec) Left _ -> pure (Left EthDecodeErr) -cacheInsert :: NamesEnv -> ByteString -> Word64 -> NameRecord -> IO () -cacheInsert NamesEnv {config, cache} key now rec = atomically $ do +cacheInsert :: NamesEnv -> ByteString -> Word64 -> Maybe NameRecord -> Word64 -> IO () +cacheInsert NamesEnv {config, cache} key now result ttl = atomically $ do (psq, totalBytes) <- readTVar cache - let entryBytes = estimateBytes rec + let entryBytes = maybe notFoundOverhead estimateBytes result (psq', totalBytes') = evictWhile psq totalBytes evictWhile p tb | PSQ.size p > cacheMaxEntries config || tb + entryBytes > cacheMaxBytes config = @@ -191,10 +249,26 @@ cacheInsert NamesEnv {config, cache} key now rec = atomically $ do Just (_, _, ce, rest) -> evictWhile rest (tb - ceBytes ce) Nothing -> (p, tb) | otherwise = (p, tb) - ce = CacheEntry {ceRecord = rec, ceBytes = entryBytes} + ce = CacheEntry {ceResult = result, ceBytes = entryBytes, ceTtlNs = ttl} writeTVar cache (PSQ.insert key now ce psq', totalBytes' + entryBytes) + where + notFoundOverhead = 128 -- PSQ node + key copy + small constant for the Nothing sentinel --- | Approximate byte cost of a cached NameRecord (overhead + content). --- Tight enough that cacheMaxBytes bounds real memory; not byte-exact. +-- | Approximate byte cost of a cached NameRecord. Counts the user-controlled +-- variable-length content plus a fixed per-entry overhead for the wrapper +-- (TVar/PSQ node + ByteString headers + IORef). Tighter than a constant upper +-- bound so cacheMaxBytes is a meaningful cap. estimateBytes :: NameRecord -> Int -estimateBytes _ = 4096 -- conservative upper bound per NameRecord +estimateBytes NameRecord {nrDisplayName, nrChannelLinks, nrContactLinks, nrAdminAddress, nrAdminEmail} = + perEntryOverhead + + utf8Len nrDisplayName + + 20 -- nrOwner + + sum (map nameLinkBytes nrChannelLinks) + + sum (map nameLinkBytes nrContactLinks) + + maybe 0 utf8Len nrAdminAddress + + maybe 0 utf8Len nrAdminEmail + where + perEntryOverhead = 256 -- PSQ node + key copy + ByteString headers + utf8Len = B.length . T.encodeUtf8 + nameLinkBytes :: NameLink -> Int + nameLinkBytes = utf8Len . unNameLink diff --git a/tests/SMPNamesTests.hs b/tests/SMPNamesTests.hs index 452474e23..f597c4ae0 100644 --- a/tests/SMPNamesTests.hs +++ b/tests/SMPNamesTests.hs @@ -4,7 +4,9 @@ module SMPNamesTests (smpNamesTests) where -import Control.Concurrent.Async (replicateConcurrently) +import Control.Concurrent (threadDelay) +import Control.Concurrent.Async (async, replicateConcurrently, wait) +import Control.Concurrent.STM (atomically, newEmptyTMVarIO, putTMVar, readTMVar) import qualified Crypto.Hash as Crypton import Data.ByteString.Char8 (ByteString) import qualified Data.ByteString.Char8 as B @@ -189,6 +191,17 @@ abiBoundsSpec = do let buf = B.replicate 23 '\NUL' <> B.singleton '\x01' <> B.replicate 8 '\NUL' decodeWord256Int64 0 buf `shouldBe` Left AbiNonZeroHighBytes + it "decodeWord256Int64 rejects sign bit set in low 8 bytes (silent negative)" $ do + -- 0x8000000000000000 would decode to Int64.minBound without the check; + -- downstream length math would then see a negative len and silently + -- return empty bytes from B.take instead of failing. + let buf = B.replicate 24 '\NUL' <> "\x80\x00\x00\x00\x00\x00\x00\x00" + decodeWord256Int64 0 buf `shouldBe` Left AbiNonZeroHighBytes + + it "decodeWord256Int64 succeeds for the max representable positive value" $ do + let buf = B.replicate 24 '\NUL' <> "\x7F\xFF\xFF\xFF\xFF\xFF\xFF\xFF" + decodeWord256Int64 0 buf `shouldBe` Right maxBound + it "decodeWord256Int64 succeeds for low 8 bytes set" $ do let buf = B.replicate 24 '\NUL' <> "\x00\x00\x00\x00\x00\x00\x12\x34" decodeWord256Int64 0 buf `shouldBe` Right 0x1234 @@ -239,8 +252,7 @@ resolverCacheSpec = do cacheMaxBytes = 1024 * 1024, rpcTimeoutMs = 1000, rpcMaxResponseBytes = 65536, - rpcMaxConcurrency = 4, - dangerousColocation = False + rpcMaxConcurrency = 4 } env <- newNamesEnvWith cfg ethCall Nothing hitsRef missRef pure (env, hitsRef, missRef) @@ -252,15 +264,40 @@ resolverCacheSpec = do misses <- readIORef missRef misses `shouldBe` 1 - it "concurrent identical lookups don't crash and all return NotFound" $ do + it "subsequent NotFound lookups hit the cache (no second RPC)" $ do + callCount <- newIORef (0 :: Int) + (env, hitsRef, missRef) <- mkEnv $ \_ _ -> do + atomicModifyIORef' callCount (\v -> (v + 1, ())) + pure (Right (B.replicate (32 * 8) '\NUL')) + -- First lookup: miss, eth_call fires, NotFound cached. + _ <- resolveName env "alice" + -- Second lookup: should hit cache, not call ethCall. + r2 <- resolveName env "alice" + r2 `shouldBe` Left NotFound + callCount' <- readIORef callCount + callCount' `shouldBe` 1 + missCount <- readIORef missRef + hitCount <- readIORef hitsRef + missCount `shouldBe` 1 + hitCount `shouldBe` 1 + + it "concurrent identical lookups coalesce — only the leader makes the RPC" $ do + -- Block the stub on a TMVar so the leader's eth_call doesn't return + -- before the 7 waiters race to attach to the inflight TMap. Without + -- coalescing, every caller would invoke ethCall and callCount would + -- be 8; with coalescing, only the leader fires. + gate <- newEmptyTMVarIO callCount <- newIORef (0 :: Int) (env, _, _) <- mkEnv $ \_ _ -> do atomicModifyIORef' callCount (\v -> (v + 1, ())) + atomically (readTMVar gate) pure (Right (B.replicate (32 * 8) '\NUL')) - rs <- replicateConcurrently 8 (resolveName env "alice") + -- Run the 8 callers in a background task so we can release the gate + -- only after they've all had a chance to register on the inflight map. + callers <- async $ replicateConcurrently 8 (resolveName env "alice") + threadDelay 50000 -- 50 ms — ample time for the 7 waiters to attach + atomically (putTMVar gate ()) + rs <- wait callers all (== Left NotFound) rs `shouldBe` True - -- NotFound is currently not cached, so each leader makes an RPC. - -- Once decodeGetRecord returns Just rec (post-SNRC), coalescing - -- means concurrent callers share one RPC and call count == 1. n <- readIORef callCount - n `shouldSatisfy` (>= 1) + n `shouldBe` 1