Files
Kpa-clawbot 1efe93d7f6 perf(#1257): bulk-cache repeater enrichment in /api/nodes — 32s → <500ms (#1260)
RED commit `a2879e12` — perf regression test; CI run: see Actions tab.

Fixes #1257.

## Root cause

`handleNodes` looped over the response page and called
`store.GetRepeaterRelayInfo(pk, win)` +
`store.GetRepeaterUsefulnessScore(pk)` for every repeater/room. Each
call:

- grabbed its own `s.mu.RLock`,
- walked `byPathHop[pk]` (+ the matching 1-byte raw-prefix bucket, which
on busy networks fans out to nearly the entire non-advert tx set),
- and re-parsed every `tx.FirstSeen` with `parseRelayTS`.

Default page is the 50 most-recently-seen nodes — almost all hot
repeaters — so the request did O(50) lock acquisitions and hundreds of
thousands of timestamp parses on the same set of txs. That's the classic
load-then-paginate / per-row N+1 shape called out in the issue (same
family as #1226).

The `?limit=2000` variant looks faster relatively only because per-node
enrichment dwarfs serialization; on staging both still bottleneck on the
same loop.

## Fix

Two new bulk methods on `PacketStore`:

- `GetRepeaterRelayInfoMap(windowHours)` → `pubkey → RepeaterRelayInfo`
- `GetRepeaterUsefulnessScoreMap()` → `pubkey → 0..1`

Both snapshot `byPathHop` under a single `RLock`, pre-parse each
`FirstSeen` exactly once (a tx that appears in N hop buckets used to be
parsed N times), and emit one entry per hop key. Cached 15s — same TTL
as `GetNodeHashSizeInfo` / `GetMultiByteCapMap`, same status-column
freshness budget.

`handleNodes` is one map-lookup per node; behavior, output schema, and
`RelayActive` / `RelayCount{1h,24h}` / `LastRelayed` /
`usefulness_score` semantics are preserved.

## Why no `limit` default change

The issue mentioned a default-limit knob. Investigated: `queryInt(r,
"limit", 50)` already defaults to 50 — frontends calling `/api/nodes`
(no limit) get a 50-row page today. Capping further would change
behavior (live.js already passes `?limit=2000` when it wants more); the
cost was per-repeater enrichment, not page size. Fixing the N+1 is the
correct lever and preserves backward compat.

## Perf

Regression test `TestHandleNodesPerfLargeFleet` (600 nodes, 150k
non-advert tx, repeaters indexed under `byPathHop`):

| | elapsed | vs 2s budget |
|---|---|---|
| before (master) | 4.72s | ✗ |
| after | ~4ms | ✓ (~1000×) |

## TDD

- RED: `a2879e12` — test fails at 4.72s on master.
- GREEN: `c529d29a` — fix; full `cmd/server` + `cmd/ingestor` suites
green.

---------

Co-authored-by: corescope-bot <bot@corescope>
2026-05-18 07:36:33 -07:00

128 lines
4.4 KiB
Go

package main
import (
"fmt"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestHandleNodesPerfLargeFleet asserts the /api/nodes endpoint (no `limit`
// param — relying on the server-side default) returns in well under 2s on a
// realistic-shape fleet: 600 nodes, ~50 of them repeaters/rooms with rich
// path-hop activity, and a non-trivial byPayloadType + byPathHop index.
//
// Regression guard for issue #1257:
// /api/nodes (no limit) → 32.9s, 30KB on staging (637 nodes)
// /api/nodes?limit=2000 → 4.9s, 360KB
//
// Root cause class: per-repeater enrichment in handleNodes calls
// store.GetRepeaterRelayInfo + GetRepeaterUsefulnessScore separately for
// each node in the page. Each call takes its own RLock and walks
// byPathHop[pk] / byPayloadType, doing expensive timestamp parsing.
// For the default-page case (top-50 by last_seen, mostly hot repeaters)
// that is hundreds of thousands of timestamp parses per request.
//
// Budget: 2s. On the broken implementation with this fixture the
// endpoint blows the budget; with batched/cached per-page enrichment it
// completes in well under 500ms.
func TestHandleNodesPerfLargeFleet(t *testing.T) {
if testing.Short() {
t.Skip("perf test")
}
srv, router := setupTestServer(t)
conn := srv.db.conn
// Seed 600 nodes — 50 repeaters/rooms with most-recent last_seen so
// they land on the default page, plus 550 stale companions.
tx, err := conn.Begin()
if err != nil {
t.Fatal(err)
}
stmt, err := tx.Prepare(`INSERT INTO nodes
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count, foreign_advert)
VALUES (?, ?, ?, 0, 0, ?, '2026-01-01T00:00:00Z', 1, 0)`)
if err != nil {
t.Fatal(err)
}
now := time.Now().UTC()
for i := 0; i < 50; i++ {
pk := fmt.Sprintf("pkrepeat%056x", i)
ts := now.Add(-time.Duration(i) * time.Minute).Format(time.RFC3339Nano)
if _, err := stmt.Exec(pk, fmt.Sprintf("rep%d", i), "repeater", ts); err != nil {
t.Fatal(err)
}
}
for i := 0; i < 550; i++ {
pk := fmt.Sprintf("pkcompan%056x", i)
ts := now.Add(-time.Duration(60+i) * time.Minute).Format(time.RFC3339Nano)
if _, err := stmt.Exec(pk, fmt.Sprintf("comp%d", i), "companion", ts); err != nil {
t.Fatal(err)
}
}
if err := tx.Commit(); err != nil {
t.Fatal(err)
}
// Seed the in-memory packet store: a body of non-advert traffic with
// each repeater appearing as a path hop on many of them. This is what
// makes the per-node GetRepeaterRelayInfo / GetRepeaterUsefulnessScore
// calls expensive on the broken impl.
const numTx = 150000
const hopsPerTx = 6
pt2 := 2 // non-advert payload type
store := srv.store
// Also index every tx under a single shared 1-byte prefix so the
// GetRepeaterRelayInfo prefix-collision branch fans every per-node
// call through the full non-advert tx set (matches production where
// many repeaters share a 1-byte hop prefix).
for i := 0; i < numTx; i++ {
txID := 100000 + i
ts := now.Add(-time.Duration(i) * time.Second).Format(time.RFC3339Nano)
stx := &StoreTx{
ID: txID,
Hash: fmt.Sprintf("h%d", txID),
FirstSeen: ts,
PayloadType: &pt2,
}
store.byPayloadType[pt2] = append(store.byPayloadType[pt2], stx)
store.byPathHop["pk"] = append(store.byPathHop["pk"], stx)
// Index each repeater under byPathHop so per-node enrichment walks
// a non-trivial slice.
for h := 0; h < hopsPerTx; h++ {
repIdx := (i + h) % 50
pk := fmt.Sprintf("pkrepeat%056x", repIdx)
store.byPathHop[pk] = append(store.byPathHop[pk], stx)
}
}
// Warm-up to amortize first-call costs (cache misses, prepare). Note:
// the per-node Repeater* enrichment is NOT cached, so this warmup
// does not hide the perf bug — it only amortizes one-shot prep.
{
req := httptest.NewRequest("GET", "/api/nodes?limit=1", nil)
w := httptest.NewRecorder()
router.ServeHTTP(w, req)
if w.Code != http.StatusOK {
t.Fatalf("warmup status=%d body=%s", w.Code, w.Body.String())
}
}
start := time.Now()
req := httptest.NewRequest("GET", "/api/nodes", nil)
w := httptest.NewRecorder()
router.ServeHTTP(w, req)
elapsed := time.Since(start)
if w.Code != http.StatusOK {
t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
}
const budget = 2 * time.Second
t.Logf("/api/nodes (no limit) elapsed=%v on %d nodes, %d tx", elapsed, 600, numTx)
if elapsed > budget {
t.Fatalf("/api/nodes (no limit) too slow for #1257: %v (budget %v) on %d nodes, %d tx",
elapsed, budget, 600, numTx)
}
}