diff --git a/cmd/server/config.go b/cmd/server/config.go index 98a92e22..3a93dee6 100644 --- a/cmd/server/config.go +++ b/cmd/server/config.go @@ -8,6 +8,7 @@ import ( "path/filepath" "strings" "sync" + "sync/atomic" "time" "github.com/meshcore-analyzer/dbconfig" @@ -47,9 +48,18 @@ type Config struct { // operator refuses to fix. NodeBlacklist []string `json:"nodeBlacklist"` - // blacklistSetCached is the lazily-built set version of NodeBlacklist. - blacklistSetCached map[string]bool - blacklistOnce sync.Once + // blacklistSetPtr holds the active lookup set as an atomic pointer. + // Read path is a single atomic load — no mutex, no sync.Once. Writers + // always replace the whole map; readers see either the old or the new + // map as a single value, never a partially-built one. + blacklistSetPtr atomic.Pointer[map[string]bool] + + // blacklistGen is a monotonic generation counter bumped every time the + // blacklist mutates via SetNodeBlacklist. Callers that cache responses + // keyed by pubkey (e.g. /api/nodes/{pubkey}/reach, #1629) include this + // generation in their cache key so any blacklist change naturally + // invalidates prior entries on the next request. + blacklistGen atomic.Uint64 Branding map[string]interface{} `json:"branding"` Theme map[string]interface{} `json:"theme"` @@ -636,31 +646,78 @@ func (c *Config) LiveMapMaxNodes() int { return v } -// blacklistSet lazily builds and caches the nodeBlacklist as a set for O(1) lookups. -// Uses sync.Once to eliminate the data race on first concurrent access. -func (c *Config) blacklistSet() map[string]bool { - c.blacklistOnce.Do(func() { - if len(c.NodeBlacklist) == 0 { - return +// buildBlacklistSet recomputes the lookup set from pks and returns it. +// Empty/whitespace-only entries are skipped. Keys are lowercased + trimmed. +// Returns nil for an empty effective set so callers can `len(m) == 0` short-circuit. +func buildBlacklistSet(pks []string) map[string]bool { + if len(pks) == 0 { + return nil + } + m := make(map[string]bool, len(pks)) + for _, pk := range pks { + trimmed := strings.ToLower(strings.TrimSpace(pk)) + if trimmed != "" { + m[trimmed] = true } - m := make(map[string]bool, len(c.NodeBlacklist)) - for _, pk := range c.NodeBlacklist { - trimmed := strings.ToLower(strings.TrimSpace(pk)) - if trimmed != "" { - m[trimmed] = true - } - } - c.blacklistSetCached = m - }) - return c.blacklistSetCached + } + if len(m) == 0 { + return nil + } + return m +} + +// SetNodeBlacklist atomically replaces NodeBlacklist with pks, rebuilds the +// lookup set, and bumps the generation counter so any cache keyed on the +// generation invalidates on the next request (#1629). Safe for concurrent +// use with IsBlacklisted / BlacklistGeneration. +func (c *Config) SetNodeBlacklist(pks []string) { + if c == nil { + return + } + // Copy so callers can mutate their slice without affecting us. + cp := make([]string, len(pks)) + copy(cp, pks) + c.NodeBlacklist = cp + m := buildBlacklistSet(cp) + c.blacklistSetPtr.Store(&m) + c.blacklistGen.Add(1) +} + +// BlacklistGeneration returns a monotonic counter that increments on every +// SetNodeBlacklist call. Response caches keyed per-pubkey embed this value +// in their cache key so any blacklist mutation invalidates prior entries on +// the next request (#1629). +func (c *Config) BlacklistGeneration() uint64 { + if c == nil { + return 0 + } + return c.blacklistGen.Load() } // IsBlacklisted returns true if the given public key is in the nodeBlacklist. +// Hot read path: a single atomic pointer load + map lookup. No locks, no +// sync.Once. The in-memory set is populated either via SetNodeBlacklist or +// lazily on first read from c.NodeBlacklist (covering the JSON-load path +// where the setter was never called). func (c *Config) IsBlacklisted(pubkey string) bool { - if c == nil || len(c.NodeBlacklist) == 0 { + if c == nil { return false } - return c.blacklistSet()[strings.ToLower(strings.TrimSpace(pubkey))] + mp := c.blacklistSetPtr.Load() + if mp == nil { + // Lazy first-read materialisation from the JSON-loaded slice. + // CAS-style: if another goroutine wins the race, drop ours. + built := buildBlacklistSet(c.NodeBlacklist) + if c.blacklistSetPtr.CompareAndSwap(nil, &built) { + mp = &built + } else { + mp = c.blacklistSetPtr.Load() + } + } + if mp == nil || len(*mp) == 0 { + return false + } + return (*mp)[strings.ToLower(strings.TrimSpace(pubkey))] } // SaveGeoFilter writes the geo_filter section back to config.json on disk. diff --git a/cmd/server/node_reach.go b/cmd/server/node_reach.go index 77ade07a..8b04840c 100644 --- a/cmd/server/node_reach.go +++ b/cmd/server/node_reach.go @@ -10,6 +10,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/gorilla/mux" @@ -283,6 +284,13 @@ type reachState struct { // simultaneous callers run the scan + attribution once, not N times. sf singleflight.Group + // lastSeenBlacklistGen is the BlacklistGeneration() value that the cache + // was last reconciled with. When the live generation moves past this + // value, the cache is purged wholesale on the next request to prevent + // prior-gen entries from accumulating until their TTL expires (#1629 + // round-2, adversarial #5). + lastSeenBlacklistGen atomic.Uint64 + degreeMu sync.Mutex degreeSnap *degreeSnapshot } @@ -300,6 +308,33 @@ func (s *Server) reachCacheGet(key string) ([]byte, bool) { return e.raw, true } +// reachCacheLen returns the current entry count in the reach response cache. +// Test helper — exposes the size without leaking the internal mutex/map. +func (s *Server) reachCacheLen() int { + s.reach.cacheMu.RLock() + defer s.reach.cacheMu.RUnlock() + return len(s.reach.cache) +} + +// reachPurgeIfBlacklistGenChanged drops every cached entry when the live +// blacklist generation has advanced past the cache's last-seen value. CAS +// gates the purge so concurrent callers only do the work once per gen bump +// (#1629 round-2, adversarial #5). +func (s *Server) reachPurgeIfBlacklistGenChanged(gen uint64) { + seen := s.reach.lastSeenBlacklistGen.Load() + if gen == seen { + return + } + // CAS gates the actual purge to a single winner on a given gen bump. + if !s.reach.lastSeenBlacklistGen.CompareAndSwap(seen, gen) { + // Another goroutine already advanced (and purged). Done. + return + } + s.reach.cacheMu.Lock() + s.reach.cache = nil + s.reach.cacheMu.Unlock() +} + // isHexPubkey reports whether s is a full 64-char lowercase-hex public key. // The handler lowercases input first, so we only accept [0-9a-f]. func isHexPubkey(s string) bool { @@ -373,7 +408,21 @@ func (s *Server) handleNodeReach(w http.ResponseWriter, r *http.Request) { } days = clampDays(days) - cacheKey := pubkey + "|" + strconv.Itoa(days) + // cacheKey includes the blacklist generation so any mutation via + // SetNodeBlacklist invalidates all prior reach cache entries on the + // next request (#1629). Without the generation suffix a node added + // to the blacklist post-warm would keep being served the cached + // non-blacklisted response until the TTL expires. + var gen uint64 + if s.cfg != nil { + gen = s.cfg.BlacklistGeneration() + } + // Purge prior-gen entries wholesale when the generation advances so a + // steady stream of operator blacklist edits cannot leak cache entries + // up to the TTL. Cheap: one map reset under the cache mutex, only when + // the gen actually moved (#1629 round-2, adversarial #5). + s.reachPurgeIfBlacklistGenChanged(gen) + cacheKey := pubkey + "|" + strconv.Itoa(days) + "|g" + strconv.FormatUint(gen, 10) if raw, ok := s.reachCacheGet(cacheKey); ok { w.Header().Set("Content-Type", "application/json") w.Write(raw) diff --git a/cmd/server/node_reach_blacklist_cache_test.go b/cmd/server/node_reach_blacklist_cache_test.go new file mode 100644 index 00000000..88fef603 --- /dev/null +++ b/cmd/server/node_reach_blacklist_cache_test.go @@ -0,0 +1,124 @@ +package main + +import ( + "net/http" + "testing" +) + +// TestNodeReach_BlacklistMutationBustsCache reproduces #1629. +// +// Scenario: +// 1. Warm the reach response cache with a non-blacklisted pubkey (200 OK). +// 2. Operator blacklists that pubkey via SetNodeBlacklist (the legitimate +// mutation entry point — config reload, admin call, etc.). +// 3. The very next /reach request for that pubkey MUST return 404 (the +// blacklist response), not the cached 200 payload. +// +// Pre-fix the blacklist set is locked in by sync.Once at first read, so +// IsBlacklisted keeps returning false after the mutation; the cache then +// re-serves the prior reach body and the assertion fails. +func TestNodeReach_BlacklistMutationBustsCache(t *testing.T) { + resetReachState(t) + db, n := newReachIntegrationDB(t, `["AABB","01FA","CCDD"]`) + defer db.conn.Close() + + // Start with a non-empty blacklist (some unrelated decoy pubkey) so the + // blacklist set is materialised on the first IsBlacklisted call below. + // This is the realistic state: a deployment running with a populated + // blacklist where the operator later ADDS a new entry. + decoy := pk64("dec0") + cfg := &Config{NodeBlacklist: []string{decoy}} + srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()} + + // 1. Warm cache (must 200 and populate cache). + rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30") + if rr.Code != http.StatusOK { + t.Fatalf("warm-up: status=%d want 200 (body=%s)", rr.Code, rr.Body.String()) + } + if srv.reachCacheLen() == 0 { + t.Fatalf("warm-up did not populate reach cache") + } + + // 2. Operator adds the target node to the blacklist via the public setter. + cfg.SetNodeBlacklist([]string{decoy, n}) + + // 3. Next request MUST return 404. With the bug, the sync.Once-cached + // empty blacklist set makes IsBlacklisted return false, the response + // cache hits, and the prior 200 body is re-served. + rr2 := serveReach(srv, "/api/nodes/"+n+"/reach?days=30") + if rr2.Code != http.StatusNotFound { + t.Fatalf("post-blacklist mutation: status=%d want 404 (cached payload was served — #1629)", rr2.Code) + } +} + +// TestConfig_BlacklistGenerationIncrements asserts that every SetNodeBlacklist +// call bumps the generation counter by exactly 1, regardless of whether the +// content changed. The /reach cache key embeds this generation, so the +// monotonic-bump contract is part of the public API of the package +// (adversarial #4 from round-1 polish). +func TestConfig_BlacklistGenerationIncrements(t *testing.T) { + cfg := &Config{} + g0 := cfg.BlacklistGeneration() + cfg.SetNodeBlacklist([]string{"aa"}) + g1 := cfg.BlacklistGeneration() + if g1 != g0+1 { + t.Fatalf("first SetNodeBlacklist: gen %d -> %d (want +1)", g0, g1) + } + // Identical content — generation MUST still bump. Callers rely on + // "any call invalidates" rather than "content-diff invalidates." + cfg.SetNodeBlacklist([]string{"aa"}) + g2 := cfg.BlacklistGeneration() + if g2 != g1+1 { + t.Fatalf("second SetNodeBlacklist (same content): gen %d -> %d (want +1)", g1, g2) + } + // Empty mutation also bumps. + cfg.SetNodeBlacklist(nil) + g3 := cfg.BlacklistGeneration() + if g3 != g2+1 { + t.Fatalf("nil SetNodeBlacklist: gen %d -> %d (want +1)", g2, g3) + } +} + +// TestNodeReach_BlacklistMutationPurgesCache asserts that a blacklist +// mutation evicts ALL prior reach cache entries (not just the affected +// pubkey) on the next /reach request. Per adversarial #5, the previous +// gen-suffix-only design left every prior cached entry stranded until TTL, +// growing the cache by N entries per operator edit. The current design +// purges on generation bump (detected on the next handler invocation) so a +// steady stream of edits cannot leak entries unboundedly. +func TestNodeReach_BlacklistMutationPurgesCache(t *testing.T) { + resetReachState(t) + db, n := newReachIntegrationDB(t, `["AABB","01FA","CCDD"]`) + defer db.conn.Close() + + cfg := &Config{} + srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()} + + // Warm cache with two distinct keys (different days param). + for _, days := range []string{"30", "7"} { + rr := serveReach(srv, "/api/nodes/"+n+"/reach?days="+days) + if rr.Code != http.StatusOK { + t.Fatalf("warm-up days=%s: status=%d want 200", days, rr.Code) + } + } + before := srv.reachCacheLen() + if before < 2 { + t.Fatalf("warm-up populated %d entries, want >=2", before) + } + + // Unrelated blacklist mutation. The cached pubkey is not in the + // blacklist, but prior entries are now keyed under a stale generation + // and would otherwise sit until TTL. + cfg.SetNodeBlacklist([]string{pk64("dead")}) + + // Next /reach request triggers the purge inside the reach path. + rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30") + if rr.Code != http.StatusOK { + t.Fatalf("post-mutation request: status=%d want 200", rr.Code) + } + // After the purge + this single re-populate we expect exactly 1 entry, + // not the 2 stale + 1 new = 3 that the leaky design would leave behind. + if got := srv.reachCacheLen(); got != 1 { + t.Fatalf("post-mutation cache len = %d, want 1 (prior entries leaked — adv #5)", got) + } +} diff --git a/cmd/server/node_reach_endpoint_test.go b/cmd/server/node_reach_endpoint_test.go index 866fb886..6094077b 100644 --- a/cmd/server/node_reach_endpoint_test.go +++ b/cmd/server/node_reach_endpoint_test.go @@ -5,6 +5,7 @@ import ( "encoding/json" "net/http" "net/http/httptest" + "strconv" "strings" "testing" "time" @@ -178,9 +179,11 @@ func TestNodeReach_AttributionAndCacheHit(t *testing.T) { t.Errorf("expected they_hear≥1 for neighbour B, links=%+v", resp.Links) } - // Cache hit: the key must now be populated and a second request must 200. - if _, ok := srv.reachCacheGet(n + "|30"); !ok { - t.Fatalf("expected reach response to be cached under %q", n+"|30") + // Cache hit: the key (now generation-suffixed, #1629) must be populated + // and a second request must 200. + wantKey := n + "|30|g" + strconv.FormatUint(srv.cfg.BlacklistGeneration(), 10) + if _, ok := srv.reachCacheGet(wantKey); !ok { + t.Fatalf("expected reach response to be cached under %q", wantKey) } rr2 := serveReach(srv, "/api/nodes/"+n+"/reach?days=30") if rr2.Code != http.StatusOK || rr2.Body.String() != rr.Body.String() {