mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-05-24 23:55:57 +00:00
perf(#1262): /api/nodes?limit=2000 cold-miss 15.7s → <100ms — prewarm repeater enrichment cache (#1263)
RED commit: `22ce5736066142583017cad7303fa48d9e00ccf0` — CI on red: https://github.com/Kpa-clawbot/CoreScope/actions?query=branch%3Afix%2Fissue-1262 ## Problem After #1260 added a 15s-TTL bulk cache for repeater enrichment in `handleNodes`, `/api/nodes` (default limit) dropped to ~500ms. But `/api/nodes?limit=2000` — called by `public/live.js` at SPA startup for hop resolution — still took **15.7s cold** on staging (75k tx, 600 nodes). Warm hits were ~40ms. Root cause: the bulk cache was lazily populated on the first request after TTL expiry. The rebuild ran on the request-serving goroutine. Every cold SPA load triggered the rebuild and ate 15s. ## Fix Add `StartRepeaterEnrichmentRecomputer` — a steady-state background recomputer that mirrors the `analytics_recomputer.go` pattern from #1240: - **Prewarm**: initial synchronous compute on Start so the first request hits a populated cache. - **Steady-state**: ticker refreshes the snapshot every 5min (configurable via the existing analytics recompute interval knob). - **Panic-safe** + idempotent Start. Wired into `main.go` right after `StartAnalyticsRecomputers`, using `cfg.GetHealthThresholds().RelayActiveHours` as the window. ## Test `TestHandleNodesLimit2000ColdMiss` — seeds 600 nodes + 150k non-advert tx with repeaters indexed under a shared 1-byte hop prefix (matches production hop-prefix collisions), starts the recomputer, then issues `/api/nodes?limit=2000` with **no HTTP warmup**. | State | Latency | |---|---| | Before (master, on-thread rebuild) | 3.37s | | After (prewarm + steady-state) | 56ms | | Budget | 2s | Staging end-to-end: 15.7s → expected sub-100ms on the same call path. Red commit (`22ce5736066142583017cad7303fa48d9e00ccf0`) compiles with a no-op stub of the new method so the test fails on the latency **assertion**, not a missing symbol. Fixes #1262 --------- Co-authored-by: corescope-bot <bot@corescope.local>
This commit is contained in:
@@ -361,6 +361,21 @@ func main() {
|
||||
defer stopAnalyticsRecomp()
|
||||
log.Printf("[analytics-recompute] background recompute enabled (default=%s)", cfg.AnalyticsDefaultRecomputeInterval())
|
||||
|
||||
// Steady-state repeater-enrichment recomputer (issue #1262).
|
||||
// Prewarms the bulk caches feeding handleNodes so the very first
|
||||
// /api/nodes?limit=2000 from live.js's SPA bootstrap hits a
|
||||
// populated cache instead of paying a 15.7s on-thread rebuild.
|
||||
// Uses the configured RelayActiveHours window and the same
|
||||
// default recompute interval as the other analytics caches.
|
||||
relayWindowHours := cfg.GetHealthThresholds().RelayActiveHours
|
||||
stopRepeaterEnrichRecomp := store.StartRepeaterEnrichmentRecomputer(
|
||||
relayWindowHours,
|
||||
cfg.AnalyticsDefaultRecomputeInterval(),
|
||||
)
|
||||
defer stopRepeaterEnrichRecomp()
|
||||
log.Printf("[repeater-enrich-recompute] background recompute enabled (window=%.1fh, interval=%s)",
|
||||
relayWindowHours, cfg.AnalyticsDefaultRecomputeInterval())
|
||||
|
||||
// Auto-prune old packets if retention.packetDays is configured
|
||||
vacuumPages := cfg.IncrementalVacuumPages()
|
||||
var stopPrune func()
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestHandleNodesLimit2000ColdMiss is a regression guard for issue #1262.
|
||||
//
|
||||
// Background: PR #1260 added a 15s-TTL bulk-cache for repeater
|
||||
// enrichment in handleNodes (GetRepeaterRelayInfoMap /
|
||||
// GetRepeaterUsefulnessScoreMap). On warm hits the request is ~40ms.
|
||||
// On the very first request after server startup (or after the 15s TTL
|
||||
// expires) the cache rebuild runs on the request-serving goroutine and
|
||||
// is O(byPathHop + parsed timestamps). On staging (75k tx, 600 nodes)
|
||||
// the cold rebuild took 15.7s.
|
||||
//
|
||||
// /api/nodes?limit=2000 is the SPA's hop-resolver bootstrap call (see
|
||||
// public/live.js) so EVERY cold SPA load eats the cold-rebuild cost.
|
||||
//
|
||||
// Acceptance: /api/nodes?limit=2000 must return in <2s on a
|
||||
// realistic-shape fleet WITHOUT a prior warmup request — i.e. once the
|
||||
// store has been initialized and the steady-state repeater-enrichment
|
||||
// recomputer prewarm has run.
|
||||
func TestHandleNodesLimit2000ColdMiss(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("perf test")
|
||||
}
|
||||
srv, router := setupTestServer(t)
|
||||
conn := srv.db.conn
|
||||
|
||||
// Seed 600 nodes — 50 repeaters/rooms with most-recent last_seen so
|
||||
// they sit at the top of the limit=2000 page, plus 550 stale
|
||||
// companions.
|
||||
tx, err := conn.Begin()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
stmt, err := tx.Prepare(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count, foreign_advert)
|
||||
VALUES (?, ?, ?, 0, 0, ?, '2026-01-01T00:00:00Z', 1, 0)`)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
for i := 0; i < 50; i++ {
|
||||
pk := fmt.Sprintf("pkrepeat%056x", i)
|
||||
ts := now.Add(-time.Duration(i) * time.Minute).Format(time.RFC3339Nano)
|
||||
if _, err := stmt.Exec(pk, fmt.Sprintf("rep%d", i), "repeater", ts); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
for i := 0; i < 550; i++ {
|
||||
pk := fmt.Sprintf("pkcompan%056x", i)
|
||||
ts := now.Add(-time.Duration(60+i) * time.Minute).Format(time.RFC3339Nano)
|
||||
if _, err := stmt.Exec(pk, fmt.Sprintf("comp%d", i), "companion", ts); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Seed the in-memory packet store: a non-trivial body of non-advert
|
||||
// traffic where each repeater appears as a path hop on many txs.
|
||||
// This is what makes the bulk-cache rebuild expensive.
|
||||
const numTx = 150000
|
||||
const hopsPerTx = 6
|
||||
pt2 := 2
|
||||
store := srv.store
|
||||
for i := 0; i < numTx; i++ {
|
||||
txID := 100000 + i
|
||||
ts := now.Add(-time.Duration(i) * time.Second).Format(time.RFC3339Nano)
|
||||
stx := &StoreTx{
|
||||
ID: txID,
|
||||
Hash: fmt.Sprintf("h%d", txID),
|
||||
FirstSeen: ts,
|
||||
PayloadType: &pt2,
|
||||
}
|
||||
store.byPayloadType[pt2] = append(store.byPayloadType[pt2], stx)
|
||||
// Shared 1-byte prefix bucket to mirror production hop-prefix
|
||||
// collisions.
|
||||
store.byPathHop["pk"] = append(store.byPathHop["pk"], stx)
|
||||
for h := 0; h < hopsPerTx; h++ {
|
||||
repIdx := (i + h) % 50
|
||||
pk := fmt.Sprintf("pkrepeat%056x", repIdx)
|
||||
store.byPathHop[pk] = append(store.byPathHop[pk], stx)
|
||||
}
|
||||
}
|
||||
|
||||
// Steady-state repeater-enrichment recomputer (the fix for #1262)
|
||||
// prewarms the bulk caches at startup so the first handler request
|
||||
// — which is /api/nodes?limit=2000 from live.js on every cold SPA
|
||||
// load — hits the cache instead of rebuilding it on-thread.
|
||||
stop := store.StartRepeaterEnrichmentRecomputer(24, 5*time.Minute)
|
||||
defer stop()
|
||||
|
||||
// NO HTTP warmup — we are explicitly measuring the first
|
||||
// limit=2000 request, the way live.js sees it.
|
||||
|
||||
start := time.Now()
|
||||
req := httptest.NewRequest("GET", "/api/nodes?limit=2000", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
elapsed := time.Since(start)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
const budget = 2 * time.Second
|
||||
t.Logf("/api/nodes?limit=2000 elapsed=%v on %d nodes, %d tx", elapsed, 600, numTx)
|
||||
if elapsed > budget {
|
||||
t.Fatalf("/api/nodes?limit=2000 cold-miss too slow for #1262: %v (budget %v) on %d nodes, %d tx",
|
||||
elapsed, budget, 600, numTx)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,104 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// repeaterEnrichmentRecomputerInterval is the default tick interval
|
||||
// for the steady-state recompute of the repeater enrichment bulk
|
||||
// caches. The on-request 15s-TTL fallback in repeater_enrich_bulk.go
|
||||
// is kept as a safety net — the recomputer just makes sure the cache
|
||||
// is populated before any request arrives.
|
||||
//
|
||||
// 5min mirrors the analytics_recomputer default from #1240 and is
|
||||
// plenty fresh for an at-a-glance status column.
|
||||
const repeaterEnrichmentRecomputerDefaultInterval = 5 * time.Minute
|
||||
|
||||
// StartRepeaterEnrichmentRecomputer is the steady-state background
|
||||
// recompute loop for the repeater enrichment bulk caches consumed by
|
||||
// handleNodes (GetRepeaterRelayInfoMap + GetRepeaterUsefulnessScoreMap).
|
||||
//
|
||||
// Why this exists (issue #1262): PR #1260 added a 15s-TTL bulk cache,
|
||||
// but the rebuild itself runs on the request-serving goroutine on the
|
||||
// first request after startup or after the TTL expires. On staging
|
||||
// (75k tx, 600 nodes) that cold rebuild took 15.7s and was triggered
|
||||
// by every cold SPA load via live.js's /api/nodes?limit=2000 call.
|
||||
//
|
||||
// On Start this does an initial synchronous compute (so the next
|
||||
// request hits cache) and then ticks every `interval` to keep the
|
||||
// snapshot fresh — same pattern as analytics_recomputer.go (#1240).
|
||||
//
|
||||
// Returns a stop closure that signals the goroutine and waits for it
|
||||
// to exit (with a 5s defensive timeout).
|
||||
//
|
||||
// Safe to call multiple times: subsequent calls are no-ops and return
|
||||
// a no-op stop closure (the original goroutine retains ownership).
|
||||
func (s *PacketStore) StartRepeaterEnrichmentRecomputer(windowHours float64, interval time.Duration) func() {
|
||||
if interval <= 0 {
|
||||
interval = repeaterEnrichmentRecomputerDefaultInterval
|
||||
}
|
||||
|
||||
s.repeaterEnrichRecompMu.Lock()
|
||||
if s.repeaterEnrichRecompStarted {
|
||||
s.repeaterEnrichRecompMu.Unlock()
|
||||
return func() {}
|
||||
}
|
||||
s.repeaterEnrichRecompStarted = true
|
||||
stop := make(chan struct{})
|
||||
done := make(chan struct{})
|
||||
s.repeaterEnrichRecompStop = stop
|
||||
s.repeaterEnrichRecompDone = done
|
||||
s.repeaterEnrichRecompMu.Unlock()
|
||||
|
||||
// Initial synchronous prewarm — the entire point of this recomputer
|
||||
// is to make sure the very first /api/nodes?limit=2000 from
|
||||
// live.js's SPA bootstrap (issue #1262) hits a populated cache
|
||||
// instead of paying the on-thread rebuild cost.
|
||||
recomputeRepeaterEnrichmentSafe(s, windowHours)
|
||||
|
||||
var stopOnce sync.Once
|
||||
go func() {
|
||||
defer close(done)
|
||||
t := time.NewTicker(interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-t.C:
|
||||
recomputeRepeaterEnrichmentSafe(s, windowHours)
|
||||
case <-stop:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return func() {
|
||||
stopOnce.Do(func() {
|
||||
close(stop)
|
||||
})
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(5 * time.Second):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// recomputeRepeaterEnrichmentSafe runs both bulk-cache compute paths
|
||||
// behind a panic recover — a panic in compute must not kill the
|
||||
// background goroutine (the previous snapshot remains valid).
|
||||
func recomputeRepeaterEnrichmentSafe(s *PacketStore, windowHours float64) {
|
||||
defer func() { _ = recover() }()
|
||||
// Bypass the 15s-TTL gate by forcing a fresh recompute and
|
||||
// installing the result. The public Get* helpers would return the
|
||||
// existing cache when within TTL; we want to refresh proactively.
|
||||
relay := s.computeRepeaterRelayInfoMap(windowHours)
|
||||
useful := s.computeRepeaterUsefulnessScoreMap()
|
||||
now := time.Now()
|
||||
s.repeaterEnrichMu.Lock()
|
||||
s.repeaterRelayCache = relay
|
||||
s.repeaterRelayCacheWin = windowHours
|
||||
s.repeaterRelayAt = now
|
||||
s.repeaterUsefulCache = useful
|
||||
s.repeaterUsefulAt = now
|
||||
s.repeaterEnrichMu.Unlock()
|
||||
}
|
||||
@@ -225,6 +225,15 @@ type PacketStore struct {
|
||||
repeaterUsefulCache map[string]float64
|
||||
repeaterUsefulAt time.Time
|
||||
|
||||
// Steady-state recomputer for the two caches above (#1262). When
|
||||
// started, an initial sync compute prewarms the caches so the very
|
||||
// first /api/nodes?limit=2000 from live.js's SPA bootstrap hits a
|
||||
// populated cache instead of paying the 15.7s on-thread rebuild.
|
||||
repeaterEnrichRecompMu sync.Mutex
|
||||
repeaterEnrichRecompStarted bool
|
||||
repeaterEnrichRecompStop chan struct{}
|
||||
repeaterEnrichRecompDone chan struct{}
|
||||
|
||||
// Precomputed distinct advert pubkey count (refcounted for eviction correctness).
|
||||
// Updated incrementally during Load/Ingest/Evict — avoids JSON parsing in GetPerfStoreStats.
|
||||
advertPubkeys map[string]int // pubkey → number of advert packets referencing it
|
||||
|
||||
Reference in New Issue
Block a user