diff --git a/cmd/server/main.go b/cmd/server/main.go index cd65a6eb..ab3e29a8 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -361,6 +361,21 @@ func main() { defer stopAnalyticsRecomp() log.Printf("[analytics-recompute] background recompute enabled (default=%s)", cfg.AnalyticsDefaultRecomputeInterval()) + // Steady-state repeater-enrichment recomputer (issue #1262). + // Prewarms the bulk caches feeding handleNodes so the very first + // /api/nodes?limit=2000 from live.js's SPA bootstrap hits a + // populated cache instead of paying a 15.7s on-thread rebuild. + // Uses the configured RelayActiveHours window and the same + // default recompute interval as the other analytics caches. + relayWindowHours := cfg.GetHealthThresholds().RelayActiveHours + stopRepeaterEnrichRecomp := store.StartRepeaterEnrichmentRecomputer( + relayWindowHours, + cfg.AnalyticsDefaultRecomputeInterval(), + ) + defer stopRepeaterEnrichRecomp() + log.Printf("[repeater-enrich-recompute] background recompute enabled (window=%.1fh, interval=%s)", + relayWindowHours, cfg.AnalyticsDefaultRecomputeInterval()) + // Auto-prune old packets if retention.packetDays is configured vacuumPages := cfg.IncrementalVacuumPages() var stopPrune func() diff --git a/cmd/server/nodes_perf_limit2000_test.go b/cmd/server/nodes_perf_limit2000_test.go new file mode 100644 index 00000000..e03a7ad3 --- /dev/null +++ b/cmd/server/nodes_perf_limit2000_test.go @@ -0,0 +1,119 @@ +package main + +import ( + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +// TestHandleNodesLimit2000ColdMiss is a regression guard for issue #1262. +// +// Background: PR #1260 added a 15s-TTL bulk-cache for repeater +// enrichment in handleNodes (GetRepeaterRelayInfoMap / +// GetRepeaterUsefulnessScoreMap). On warm hits the request is ~40ms. +// On the very first request after server startup (or after the 15s TTL +// expires) the cache rebuild runs on the request-serving goroutine and +// is O(byPathHop + parsed timestamps). On staging (75k tx, 600 nodes) +// the cold rebuild took 15.7s. +// +// /api/nodes?limit=2000 is the SPA's hop-resolver bootstrap call (see +// public/live.js) so EVERY cold SPA load eats the cold-rebuild cost. +// +// Acceptance: /api/nodes?limit=2000 must return in <2s on a +// realistic-shape fleet WITHOUT a prior warmup request — i.e. once the +// store has been initialized and the steady-state repeater-enrichment +// recomputer prewarm has run. +func TestHandleNodesLimit2000ColdMiss(t *testing.T) { + if testing.Short() { + t.Skip("perf test") + } + srv, router := setupTestServer(t) + conn := srv.db.conn + + // Seed 600 nodes — 50 repeaters/rooms with most-recent last_seen so + // they sit at the top of the limit=2000 page, plus 550 stale + // companions. + tx, err := conn.Begin() + if err != nil { + t.Fatal(err) + } + stmt, err := tx.Prepare(`INSERT INTO nodes + (public_key, name, role, lat, lon, last_seen, first_seen, advert_count, foreign_advert) + VALUES (?, ?, ?, 0, 0, ?, '2026-01-01T00:00:00Z', 1, 0)`) + if err != nil { + t.Fatal(err) + } + now := time.Now().UTC() + for i := 0; i < 50; i++ { + pk := fmt.Sprintf("pkrepeat%056x", i) + ts := now.Add(-time.Duration(i) * time.Minute).Format(time.RFC3339Nano) + if _, err := stmt.Exec(pk, fmt.Sprintf("rep%d", i), "repeater", ts); err != nil { + t.Fatal(err) + } + } + for i := 0; i < 550; i++ { + pk := fmt.Sprintf("pkcompan%056x", i) + ts := now.Add(-time.Duration(60+i) * time.Minute).Format(time.RFC3339Nano) + if _, err := stmt.Exec(pk, fmt.Sprintf("comp%d", i), "companion", ts); err != nil { + t.Fatal(err) + } + } + if err := tx.Commit(); err != nil { + t.Fatal(err) + } + + // Seed the in-memory packet store: a non-trivial body of non-advert + // traffic where each repeater appears as a path hop on many txs. + // This is what makes the bulk-cache rebuild expensive. + const numTx = 150000 + const hopsPerTx = 6 + pt2 := 2 + store := srv.store + for i := 0; i < numTx; i++ { + txID := 100000 + i + ts := now.Add(-time.Duration(i) * time.Second).Format(time.RFC3339Nano) + stx := &StoreTx{ + ID: txID, + Hash: fmt.Sprintf("h%d", txID), + FirstSeen: ts, + PayloadType: &pt2, + } + store.byPayloadType[pt2] = append(store.byPayloadType[pt2], stx) + // Shared 1-byte prefix bucket to mirror production hop-prefix + // collisions. + store.byPathHop["pk"] = append(store.byPathHop["pk"], stx) + for h := 0; h < hopsPerTx; h++ { + repIdx := (i + h) % 50 + pk := fmt.Sprintf("pkrepeat%056x", repIdx) + store.byPathHop[pk] = append(store.byPathHop[pk], stx) + } + } + + // Steady-state repeater-enrichment recomputer (the fix for #1262) + // prewarms the bulk caches at startup so the first handler request + // — which is /api/nodes?limit=2000 from live.js on every cold SPA + // load — hits the cache instead of rebuilding it on-thread. + stop := store.StartRepeaterEnrichmentRecomputer(24, 5*time.Minute) + defer stop() + + // NO HTTP warmup — we are explicitly measuring the first + // limit=2000 request, the way live.js sees it. + + start := time.Now() + req := httptest.NewRequest("GET", "/api/nodes?limit=2000", nil) + w := httptest.NewRecorder() + router.ServeHTTP(w, req) + elapsed := time.Since(start) + if w.Code != http.StatusOK { + t.Fatalf("status=%d body=%s", w.Code, w.Body.String()) + } + + const budget = 2 * time.Second + t.Logf("/api/nodes?limit=2000 elapsed=%v on %d nodes, %d tx", elapsed, 600, numTx) + if elapsed > budget { + t.Fatalf("/api/nodes?limit=2000 cold-miss too slow for #1262: %v (budget %v) on %d nodes, %d tx", + elapsed, budget, 600, numTx) + } +} diff --git a/cmd/server/repeater_enrich_recomputer.go b/cmd/server/repeater_enrich_recomputer.go new file mode 100644 index 00000000..0400f853 --- /dev/null +++ b/cmd/server/repeater_enrich_recomputer.go @@ -0,0 +1,104 @@ +package main + +import ( + "sync" + "time" +) + +// repeaterEnrichmentRecomputerInterval is the default tick interval +// for the steady-state recompute of the repeater enrichment bulk +// caches. The on-request 15s-TTL fallback in repeater_enrich_bulk.go +// is kept as a safety net — the recomputer just makes sure the cache +// is populated before any request arrives. +// +// 5min mirrors the analytics_recomputer default from #1240 and is +// plenty fresh for an at-a-glance status column. +const repeaterEnrichmentRecomputerDefaultInterval = 5 * time.Minute + +// StartRepeaterEnrichmentRecomputer is the steady-state background +// recompute loop for the repeater enrichment bulk caches consumed by +// handleNodes (GetRepeaterRelayInfoMap + GetRepeaterUsefulnessScoreMap). +// +// Why this exists (issue #1262): PR #1260 added a 15s-TTL bulk cache, +// but the rebuild itself runs on the request-serving goroutine on the +// first request after startup or after the TTL expires. On staging +// (75k tx, 600 nodes) that cold rebuild took 15.7s and was triggered +// by every cold SPA load via live.js's /api/nodes?limit=2000 call. +// +// On Start this does an initial synchronous compute (so the next +// request hits cache) and then ticks every `interval` to keep the +// snapshot fresh — same pattern as analytics_recomputer.go (#1240). +// +// Returns a stop closure that signals the goroutine and waits for it +// to exit (with a 5s defensive timeout). +// +// Safe to call multiple times: subsequent calls are no-ops and return +// a no-op stop closure (the original goroutine retains ownership). +func (s *PacketStore) StartRepeaterEnrichmentRecomputer(windowHours float64, interval time.Duration) func() { + if interval <= 0 { + interval = repeaterEnrichmentRecomputerDefaultInterval + } + + s.repeaterEnrichRecompMu.Lock() + if s.repeaterEnrichRecompStarted { + s.repeaterEnrichRecompMu.Unlock() + return func() {} + } + s.repeaterEnrichRecompStarted = true + stop := make(chan struct{}) + done := make(chan struct{}) + s.repeaterEnrichRecompStop = stop + s.repeaterEnrichRecompDone = done + s.repeaterEnrichRecompMu.Unlock() + + // Initial synchronous prewarm — the entire point of this recomputer + // is to make sure the very first /api/nodes?limit=2000 from + // live.js's SPA bootstrap (issue #1262) hits a populated cache + // instead of paying the on-thread rebuild cost. + recomputeRepeaterEnrichmentSafe(s, windowHours) + + var stopOnce sync.Once + go func() { + defer close(done) + t := time.NewTicker(interval) + defer t.Stop() + for { + select { + case <-t.C: + recomputeRepeaterEnrichmentSafe(s, windowHours) + case <-stop: + return + } + } + }() + + return func() { + stopOnce.Do(func() { + close(stop) + }) + select { + case <-done: + case <-time.After(5 * time.Second): + } + } +} + +// recomputeRepeaterEnrichmentSafe runs both bulk-cache compute paths +// behind a panic recover — a panic in compute must not kill the +// background goroutine (the previous snapshot remains valid). +func recomputeRepeaterEnrichmentSafe(s *PacketStore, windowHours float64) { + defer func() { _ = recover() }() + // Bypass the 15s-TTL gate by forcing a fresh recompute and + // installing the result. The public Get* helpers would return the + // existing cache when within TTL; we want to refresh proactively. + relay := s.computeRepeaterRelayInfoMap(windowHours) + useful := s.computeRepeaterUsefulnessScoreMap() + now := time.Now() + s.repeaterEnrichMu.Lock() + s.repeaterRelayCache = relay + s.repeaterRelayCacheWin = windowHours + s.repeaterRelayAt = now + s.repeaterUsefulCache = useful + s.repeaterUsefulAt = now + s.repeaterEnrichMu.Unlock() +} diff --git a/cmd/server/store.go b/cmd/server/store.go index a910081a..cd382844 100644 --- a/cmd/server/store.go +++ b/cmd/server/store.go @@ -225,6 +225,15 @@ type PacketStore struct { repeaterUsefulCache map[string]float64 repeaterUsefulAt time.Time + // Steady-state recomputer for the two caches above (#1262). When + // started, an initial sync compute prewarms the caches so the very + // first /api/nodes?limit=2000 from live.js's SPA bootstrap hits a + // populated cache instead of paying the 15.7s on-thread rebuild. + repeaterEnrichRecompMu sync.Mutex + repeaterEnrichRecompStarted bool + repeaterEnrichRecompStop chan struct{} + repeaterEnrichRecompDone chan struct{} + // Precomputed distinct advert pubkey count (refcounted for eviction correctness). // Updated incrementally during Load/Ingest/Evict — avoids JSON parsing in GetPerfStoreStats. advertPubkeys map[string]int // pubkey → number of advert packets referencing it