Files
meshcore-analyzer/cmd/server/nodes_perf_limit2000_test.go
Kpa-clawbot ae17a2be12 perf(#1262): /api/nodes?limit=2000 cold-miss 15.7s → <100ms — prewarm repeater enrichment cache (#1263)
RED commit: `22ce5736066142583017cad7303fa48d9e00ccf0` — CI on red:
https://github.com/Kpa-clawbot/CoreScope/actions?query=branch%3Afix%2Fissue-1262

## Problem
After #1260 added a 15s-TTL bulk cache for repeater enrichment in
`handleNodes`, `/api/nodes` (default limit) dropped to ~500ms. But
`/api/nodes?limit=2000` — called by `public/live.js` at SPA startup for
hop resolution — still took **15.7s cold** on staging (75k tx, 600
nodes). Warm hits were ~40ms.

Root cause: the bulk cache was lazily populated on the first request
after TTL expiry, and the rebuild ran on the request-serving goroutine.
Every cold SPA load therefore triggered the rebuild and ate 15s.

## Fix
Add `StartRepeaterEnrichmentRecomputer` — a steady-state background
recomputer that mirrors the `analytics_recomputer.go` pattern from
#1240:

- **Prewarm**: initial synchronous compute on Start so the first request
  hits a populated cache.
- **Steady-state**: ticker refreshes the snapshot every 5 min
  (configurable via the existing analytics recompute interval knob).
- **Panic-safe** + idempotent Start.

Wired into `main.go` right after `StartAnalyticsRecomputers`, using
`cfg.GetHealthThresholds().RelayActiveHours` as the window.

## Test
`TestHandleNodesLimit2000ColdMiss` — seeds 600 nodes + 150k non-advert
tx with repeaters indexed under a shared 1-byte hop prefix (matches
production hop-prefix collisions), starts the recomputer, then issues
`/api/nodes?limit=2000` with **no HTTP warmup**.

| State | Latency |
|---|---|
| Before (master, on-thread rebuild) | 3.37s |
| After (prewarm + steady-state) | 56ms |
| Budget | 2s |

Staging end-to-end: 15.7s → expected sub-100ms on the same call path.

Red commit (`22ce5736066142583017cad7303fa48d9e00ccf0`) compiles with a
no-op stub of the new method so the
test fails on the latency **assertion**, not a missing symbol.

Fixes #1262

---------

Co-authored-by: corescope-bot <bot@corescope.local>
2026-05-18 09:22:27 -07:00

120 lines
4.0 KiB
Go

package main
import (
"fmt"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestHandleNodesLimit2000ColdMiss is a regression guard for issue #1262.
//
// Background: PR #1260 added a 15s-TTL bulk-cache for repeater
// enrichment in handleNodes (GetRepeaterRelayInfoMap /
// GetRepeaterUsefulnessScoreMap). On warm hits the request is ~40ms.
// On the very first request after server startup (or after the 15s TTL
// expires) the cache rebuild runs on the request-serving goroutine and
// is O(byPathHop + parsed timestamps). On staging (75k tx, 600 nodes)
// the cold rebuild took 15.7s.
//
// /api/nodes?limit=2000 is the SPA's hop-resolver bootstrap call (see
// public/live.js) so EVERY cold SPA load eats the cold-rebuild cost.
//
// Acceptance: /api/nodes?limit=2000 must return in <2s on a
// realistic-shape fleet WITHOUT a prior warmup request — i.e. once the
// store has been initialized and the steady-state repeater-enrichment
// recomputer prewarm has run.
//
// The test is skipped under -short because it seeds 150k transactions.
func TestHandleNodesLimit2000ColdMiss(t *testing.T) {
	if testing.Short() {
		t.Skip("perf test")
	}

	// Fleet shape. The node total and the hop-index modulo below are
	// derived from these so the SQL seed, the in-memory seed and the
	// reported figures cannot drift apart.
	const (
		numRepeaters  = 50
		numCompanions = 550
		numNodes      = numRepeaters + numCompanions
		numTx         = 150000
		hopsPerTx     = 6
		budget        = 2 * time.Second
	)

	srv, router := setupTestServer(t)
	conn := srv.db.conn
	now := time.Now().UTC()

	// Repeater public keys are used by both the SQL seed and the
	// in-memory hop index; format each one once instead of once per
	// (tx, hop) pair.
	repeaterKeys := make([]string, numRepeaters)
	for i := range repeaterKeys {
		repeaterKeys[i] = fmt.Sprintf("pkrepeat%056x", i)
	}

	// Seed the nodes table: repeaters get the most recent last_seen so
	// they sit at the top of the limit=2000 page, followed by stale
	// companions.
	tx, err := conn.Begin()
	if err != nil {
		t.Fatal(err)
	}
	// Release the transaction if seeding aborts via t.Fatal (deferred
	// calls still run on Goexit). After a successful Commit the rollback
	// is a no-op whose error is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare(`INSERT INTO nodes
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count, foreign_advert)
VALUES (?, ?, ?, 0, 0, ?, '2026-01-01T00:00:00Z', 1, 0)`)
	if err != nil {
		t.Fatal(err)
	}
	for i, pk := range repeaterKeys {
		ts := now.Add(-time.Duration(i) * time.Minute).Format(time.RFC3339Nano)
		if _, err := stmt.Exec(pk, fmt.Sprintf("rep%d", i), "repeater", ts); err != nil {
			t.Fatal(err)
		}
	}
	for i := 0; i < numCompanions; i++ {
		pk := fmt.Sprintf("pkcompan%056x", i)
		// Companions start 60 minutes back, i.e. older than every
		// repeater seeded above.
		ts := now.Add(-time.Duration(60+i) * time.Minute).Format(time.RFC3339Nano)
		if _, err := stmt.Exec(pk, fmt.Sprintf("comp%d", i), "companion", ts); err != nil {
			t.Fatal(err)
		}
	}
	if err := tx.Commit(); err != nil {
		t.Fatal(err)
	}

	// Seed the in-memory packet store: a non-trivial body of non-advert
	// traffic where each repeater appears as a path hop on many txs.
	// This is what makes the bulk-cache rebuild expensive.
	pt2 := 2
	store := srv.store
	for i := 0; i < numTx; i++ {
		txID := 100000 + i
		stx := &StoreTx{
			ID:          txID,
			Hash:        fmt.Sprintf("h%d", txID),
			FirstSeen:   now.Add(-time.Duration(i) * time.Second).Format(time.RFC3339Nano),
			PayloadType: &pt2,
		}
		store.byPayloadType[pt2] = append(store.byPayloadType[pt2], stx)
		// Shared 1-byte prefix bucket to mirror production hop-prefix
		// collisions.
		store.byPathHop["pk"] = append(store.byPathHop["pk"], stx)
		for h := 0; h < hopsPerTx; h++ {
			pk := repeaterKeys[(i+h)%numRepeaters]
			store.byPathHop[pk] = append(store.byPathHop[pk], stx)
		}
	}

	// Steady-state repeater-enrichment recomputer (the fix for #1262)
	// prewarms the bulk caches at startup so the first handler request
	// — which is /api/nodes?limit=2000 from live.js on every cold SPA
	// load — hits the cache instead of rebuilding it on-thread.
	stop := store.StartRepeaterEnrichmentRecomputer(24, 5*time.Minute)
	defer stop()

	// NO HTTP warmup — we are explicitly measuring the first
	// limit=2000 request, the way live.js sees it.
	req := httptest.NewRequest("GET", "/api/nodes?limit=2000", nil)
	w := httptest.NewRecorder()
	start := time.Now()
	router.ServeHTTP(w, req)
	elapsed := time.Since(start)

	if w.Code != http.StatusOK {
		t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
	}
	t.Logf("/api/nodes?limit=2000 elapsed=%v on %d nodes, %d tx", elapsed, numNodes, numTx)
	if elapsed > budget {
		t.Fatalf("/api/nodes?limit=2000 cold-miss too slow for #1262: %v (budget %v) on %d nodes, %d tx",
			elapsed, budget, numNodes, numTx)
	}
}