diff --git a/cmd/server/repeater_enrich_bulk.go b/cmd/server/repeater_enrich_bulk.go index d671db0e..5c981c00 100644 --- a/cmd/server/repeater_enrich_bulk.go +++ b/cmd/server/repeater_enrich_bulk.go @@ -5,14 +5,6 @@ import ( "time" ) -// repeaterEnrichTTL is the safety-net TTL for the bulk enrichment caches. -// Derived as 2× the recomputer's default tick so the cache is always valid -// between background refreshes. Without a recomputer (tests, edge cases) -// the cache rebuilds on-thread at most once per TTL window. -// Note: if analytics.defaultIntervalSeconds is configured above 600 the -// TTL will expire before the recomputer runs; keep that value < TTL/2. -const repeaterEnrichTTL = 2 * repeaterEnrichmentRecomputerDefaultInterval - // GetRepeaterRelayInfoMap returns a cached pubkey → RepeaterRelayInfo // map covering EVERY pubkey that currently appears as a path hop in any // non-advert StoreTx. This is the bulk equivalent of calling @@ -30,28 +22,34 @@ const repeaterEnrichTTL = 2 * repeaterEnrichmentRecomputerDefaultInterval // The cached map is keyed by lowercase pubkey/hop key (same shape as // byPathHop). Lookups should use strings.ToLower(pk). // -// The cache is invalidated by TTL only — never by ingest. Up-to-10min -// freshness is fine for an at-a-glance status column; if a fresher -// signal is ever needed for a non-status caller, expose a non-cached path. +// The cache is refreshed by the background recomputer (every 5 min by +// default). This function never rebuilds inline on a populated cache — +// serving a slightly stale snapshot is always preferable to a 700ms +// on-request rebuild. The only time an inline compute happens is when +// the cache is nil (i.e. before the recomputer's synchronous prewarm +// completes, which can occur in tests without a running recomputer). func (s *PacketStore) GetRepeaterRelayInfoMap(windowHours float64) map[string]RepeaterRelayInfo { s.repeaterEnrichMu.Lock() - if s.repeaterRelayCache != nil && - time.Since(s.repeaterRelayAt) < repeaterEnrichTTL && - s.repeaterRelayCacheWin == windowHours { - cached := s.repeaterRelayCache - s.repeaterEnrichMu.Unlock() + cached := s.repeaterRelayCache + s.repeaterEnrichMu.Unlock() + if cached != nil { return cached } - s.repeaterEnrichMu.Unlock() + // Cache is nil — recomputer hasn't prewarmed yet (edge case: tests + // without a running recomputer, or a request racing the initial + // synchronous prewarm). Build once inline; the recomputer takes over. result := s.computeRepeaterRelayInfoMap(windowHours) s.repeaterEnrichMu.Lock() - s.repeaterRelayCache = result - s.repeaterRelayCacheWin = windowHours - s.repeaterRelayAt = time.Now() + if s.repeaterRelayCache == nil { + s.repeaterRelayCache = result + s.repeaterRelayCacheWin = windowHours + s.repeaterRelayAt = time.Now() + } + cached = s.repeaterRelayCache s.repeaterEnrichMu.Unlock() - return result + return cached } // computeRepeaterRelayInfoMap walks byPathHop once under a single RLock, @@ -177,23 +175,25 @@ func (s *PacketStore) computeRepeaterRelayInfoMap(windowHours float64) map[strin // GetRepeaterUsefulnessScoreMap returns a cached pubkey → 0..1 score // for every pubkey appearing in byPathHop. Bulk equivalent of // GetRepeaterUsefulnessScore. See GetRepeaterRelayInfoMap for the -// motivation (#1257). +// motivation (#1257) and the no-inline-rebuild rationale (#1272). func (s *PacketStore) GetRepeaterUsefulnessScoreMap() map[string]float64 { s.repeaterEnrichMu.Lock() - if s.repeaterUsefulCache != nil && time.Since(s.repeaterUsefulAt) < repeaterEnrichTTL { - cached := s.repeaterUsefulCache - s.repeaterEnrichMu.Unlock() + cached := s.repeaterUsefulCache + s.repeaterEnrichMu.Unlock() + if cached != nil { return cached } - s.repeaterEnrichMu.Unlock() result := s.computeRepeaterUsefulnessScoreMap() s.repeaterEnrichMu.Lock() - s.repeaterUsefulCache = result - s.repeaterUsefulAt = time.Now() + if s.repeaterUsefulCache == nil { + s.repeaterUsefulCache = result + s.repeaterUsefulAt = time.Now() + } + cached = s.repeaterUsefulCache s.repeaterEnrichMu.Unlock() - return result + return cached } func (s *PacketStore) computeRepeaterUsefulnessScoreMap() map[string]float64 { diff --git a/cmd/server/repeater_enrich_bulk_test.go b/cmd/server/repeater_enrich_bulk_test.go new file mode 100644 index 00000000..a124f79b --- /dev/null +++ b/cmd/server/repeater_enrich_bulk_test.go @@ -0,0 +1,89 @@ +package main + +import ( + "sync" + "testing" + "time" +) + +// TestGetRepeaterRelayInfoMap_ServesStaleOnTTLExpiry is a regression guard +// for issue #1272. +// +// Background: GetRepeaterRelayInfoMap used to rebuild the cache inline +// whenever the TTL expired, causing ~700ms latency spikes on /api/nodes. +// The recomputer (StartRepeaterEnrichmentRecomputer) runs every 5 min and +// already keeps the cache warm; there is no reason to rebuild on-request. +// +// This test verifies that a populated cache is ALWAYS returned as-is, +// even when its timestamp is ancient (simulating TTL expiry under the old +// code). The stale sentinel value proves no inline recompute occurred. +func TestGetRepeaterRelayInfoMap_ServesStaleOnTTLExpiry(t *testing.T) { + store := &PacketStore{ + byPathHop: make(map[string][]*StoreTx), + } + + // Pre-populate the cache with a sentinel entry that would NOT be + // produced by computeRepeaterRelayInfoMap on the empty byPathHop. + stale := map[string]RepeaterRelayInfo{ + "sentinel": {RelayCount24h: 9999}, + } + store.repeaterRelayAt = time.Now().Add(-24 * time.Hour) // well past any TTL + store.repeaterRelayCache = stale + store.repeaterRelayCacheWin = 24 + + got := store.GetRepeaterRelayInfoMap(24) + + if got["sentinel"].RelayCount24h != 9999 { + t.Fatalf("stale cache not served: sentinel missing or overwritten (RelayCount24h=%d)", got["sentinel"].RelayCount24h) + } +} + +// TestGetRepeaterUsefulnessScoreMap_ServesStaleOnTTLExpiry mirrors +// TestGetRepeaterRelayInfoMap_ServesStaleOnTTLExpiry for the usefulness +// score map (same fix, same root cause). +func TestGetRepeaterUsefulnessScoreMap_ServesStaleOnTTLExpiry(t *testing.T) { + store := &PacketStore{ + byPathHop: make(map[string][]*StoreTx), + byPayloadType: make(map[int][]*StoreTx), + } + + stale := map[string]float64{"sentinel": 0.42} + store.repeaterUsefulAt = time.Now().Add(-24 * time.Hour) + store.repeaterUsefulCache = stale + + got := store.GetRepeaterUsefulnessScoreMap() + + if got["sentinel"] != 0.42 { + t.Fatalf("stale cache not served: sentinel missing or overwritten (score=%v)", got["sentinel"]) + } +} + +// TestGetRepeaterRelayInfoMap_BuildsWhenNil verifies that when the cache +// is nil (before the recomputer's first prewarm), GetRepeaterRelayInfoMap +// computes inline and caches the result for subsequent callers. +func TestGetRepeaterRelayInfoMap_BuildsWhenNil(t *testing.T) { + pt2 := 2 + now := time.Now().UTC() + tx := &StoreTx{ + ID: 1, + Hash: "abc", + FirstSeen: now.Add(-10 * time.Minute).Format(time.RFC3339Nano), + PayloadType: &pt2, + } + store := &PacketStore{ + byPathHop: map[string][]*StoreTx{"aabbcc": {tx}}, + byPayloadType: map[int][]*StoreTx{pt2: {tx}}, + mu: sync.RWMutex{}, + } + + got := store.GetRepeaterRelayInfoMap(24) + if _, ok := got["aabbcc"]; !ok { + t.Fatal("inline compute did not produce entry for seeded hop key") + } + + // Second call must return the cached result, not a fresh recompute. + got2 := store.GetRepeaterRelayInfoMap(24) + if got2["aabbcc"].RelayCount24h != got["aabbcc"].RelayCount24h { + t.Fatal("second call returned different map — cache not installed") + } +} diff --git a/cmd/server/repeater_enrich_recomputer.go b/cmd/server/repeater_enrich_recomputer.go index 88a5e137..43967885 100644 --- a/cmd/server/repeater_enrich_recomputer.go +++ b/cmd/server/repeater_enrich_recomputer.go @@ -88,9 +88,9 @@ func (s *PacketStore) StartRepeaterEnrichmentRecomputer(windowHours float64, int // background goroutine (the previous snapshot remains valid). func recomputeRepeaterEnrichmentSafe(s *PacketStore, windowHours float64) { defer func() { _ = recover() }() - // Bypass the 15s-TTL gate by forcing a fresh recompute and - // installing the result. The public Get* helpers would return the - // existing cache when within TTL; we want to refresh proactively. + // Write directly to the cache fields under mutex rather than going + // through the public Get* helpers — those return the existing + // non-nil cache immediately, so calling them here would be a no-op. relay := s.computeRepeaterRelayInfoMap(windowHours) useful := s.computeRepeaterUsefulnessScoreMap() now := time.Now()