From 078225a54ea3d647bafe4f22b00129dc8387488b Mon Sep 17 00:00:00 2001 From: Kpa-clawbot Date: Tue, 9 Jun 2026 01:24:46 -0700 Subject: [PATCH] =?UTF-8?q?perf(neighbor=5Fapi):=20fold=20first=5Fseen=20i?= =?UTF-8?q?nto=20cached=20map=20=E2=80=94=20fix=20#1627=20r3=20regression?= =?UTF-8?q?=20(#1632)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## TL;DR Post-merge regression introduced by #1627 r3 (commit `e2212f50`): `buildNodeInfoMap` in `cmd/server/neighbor_api.go` ran an uncached `SELECT … FROM nodes` scan on every call. Folded `first_seen` into the already-cached `getCachedNodesAndPM` (30s TTL) so the 4 hot handlers that call `buildNodeInfoMap` no longer pay for a full table scan per request. ## Before / After `buildNodeInfoMap` is called by **4 hot handlers**: - `cmd/server/neighbor_api.go:130` - `cmd/server/neighbor_api.go:297` - `cmd/server/neighbor_debug.go:83` - `cmd/server/node_reach.go:421` | | Before | After | |---|---|---| | `SELECT … FROM nodes` per call | 1 (uncached) | 0 (cache hit) | | `SELECT … FROM observers` per call | 1 (uncached) | 1 (unchanged) | | At Cascadia scale (~2600 nodes) | full scan × 4 handlers × N req/s | one scan / 30s | ## How - Extended the `getAllNodes` schema probe to also `COALESCE(first_seen, '')`. Falls back through the existing richest → leanest ladder if the column is missing. - `nodeInfo.FirstSeen` is therefore populated for every cached entry in `getCachedNodesAndPM`. - `buildNodeInfoMap` drops its second `SELECT` entirely and just copies `nodeInfo` values out of the cached map. - Public signature of `buildNodeInfoMap` is unchanged. `node_reach.go:421` still sees `nodeInfo.FirstSeen` populated, served from cache. `cmd/server/store.go` is touched because `getAllNodes` is the only sensible owner of the `first_seen` SELECT — adding a parallel cache would duplicate the 30s TTL machinery this fix is designed to leverage. ## Test (red → green) - Commit 1 (`test:`): `TestBuildNodeInfoMap_FirstSeenIsCached` — calls `buildNodeInfoMap`, mutates `first_seen` out-of-band via a separate rw connection, calls it again, and asserts both calls return the same (cached) value. Fails on `origin/master` (call 2 sees the mutated value, proving the uncached scan). - Commit 2 (`perf:`): the fold. Test now passes. ## Refs Post-merge audit identified this as the only MAJOR finding from #1627; recommendation was a follow-up hot-fix PR. This is that PR. --------- Co-authored-by: openclaw-bot Co-authored-by: openclaw-bot --- cmd/server/neighbor_api.go | 25 +------ cmd/server/neighbor_api_test.go | 120 ++++++++++++++++++++++++++++++++ cmd/server/store.go | 41 +++++++---- 3 files changed, 151 insertions(+), 35 deletions(-) diff --git a/cmd/server/neighbor_api.go b/cmd/server/neighbor_api.go index 8fd4cf1b..b2585fa8 100644 --- a/cmd/server/neighbor_api.go +++ b/cmd/server/neighbor_api.go @@ -429,6 +429,9 @@ func (s *Server) buildNodeInfoMap() map[string]nodeInfo { if s.store == nil { return nil } + // FirstSeen is folded into getAllNodes (and therefore into the 30s + // node cache) so callers like /api/nodes/{pk}/reach get the field + // without a per-request SELECT — fixes #1627 r3 regression. nodes, _ := s.store.getCachedNodesAndPM() m := make(map[string]nodeInfo, len(nodes)) for _, n := range nodes { @@ -452,28 +455,6 @@ func (s *Server) buildNodeInfoMap() map[string]nodeInfo { } } } - - // Fold in nodes.first_seen so callers (e.g. /api/nodes/{pk}/reach) - // don't need a per-request single-row SELECT. One bulk scan amortises - // across the whole map; missing/NULL rows are silently skipped (the - // node may be observer-only or pre-first_seen-schema). - fsRows, err := s.db.conn.Query("SELECT LOWER(public_key), COALESCE(first_seen,'') FROM nodes") - if err == nil { - defer fsRows.Close() - for fsRows.Next() { - var pk, fs string - if fsRows.Scan(&pk, &fs) != nil { - continue - } - if fs == "" { - continue - } - if entry, ok := m[pk]; ok { - entry.FirstSeen = fs - m[pk] = entry - } - } - } } return m diff --git a/cmd/server/neighbor_api_test.go b/cmd/server/neighbor_api_test.go index 40ce23c1..a34d1220 100644 --- a/cmd/server/neighbor_api_test.go +++ b/cmd/server/neighbor_api_test.go @@ -525,3 +525,123 @@ func TestBuildNodeInfoMap_ObserverEnrichment(t *testing.T) { } } } + +// TestBuildNodeInfoMap_FirstSeenIsCached asserts the regression introduced by +// #1627 r3 stays fixed: the per-pubkey first_seen field MUST come from the +// already-30s-cached getCachedNodesAndPM path, not from a fresh uncached +// `SELECT … FROM nodes` scan on every call. +// +// Method (no DB-driver wrapper needed): mutate the underlying SQLite file's +// first_seen via a separate rw connection between two consecutive calls to +// buildNodeInfoMap(). If first_seen is read fresh on every call (the +// regression), the second call sees the new value. If folded into the +// existing 30s node cache, both calls return the original value — same as +// every other nodeInfo field that comes from getAllNodes(). +func TestBuildNodeInfoMap_FirstSeenIsCached(t *testing.T) { + tmpDir := t.TempDir() + dbPath := tmpDir + "/test.db" + + // Seed via rw connection. + rw, err := sql.Open("sqlite", dbPath) + if err != nil { + t.Fatal(err) + } + defer rw.Close() + for _, stmt := range []string{ + "CREATE TABLE nodes (public_key TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL, last_seen TEXT, first_seen TEXT, advert_count INTEGER)", + "CREATE TABLE observers (id TEXT, name TEXT, iata TEXT)", + "INSERT INTO nodes VALUES ('AAAA1111', 'Repeater-1', 'repeater', 0, 0, '', '2024-01-01T00:00:00Z', 0)", + } { + if _, err := rw.Exec(stmt); err != nil { + t.Fatalf("seed exec %q: %v", stmt, err) + } + } + + db, err := OpenDB(dbPath) + if err != nil { + t.Fatal(err) + } + defer db.conn.Close() + + store := NewPacketStore(db, nil) + store.Load() + + srv := &Server{ + db: db, + store: store, + perfStats: NewPerfStats(), + } + + // Call 1: warm cache and record observed first_seen. + m1 := srv.buildNodeInfoMap() + first1 := m1["aaaa1111"].FirstSeen + if first1 != "2024-01-01T00:00:00Z" { + t.Fatalf("setup: expected first_seen=2024-01-01T00:00:00Z, got %q", first1) + } + + // Mutate first_seen out-of-band via the rw connection. Any code path + // that re-reads first_seen from disk (uncached) will see this new + // value; a path that folds first_seen into the 30s node cache will + // not, because the cache is well under 30s old. + if _, err := rw.Exec("UPDATE nodes SET first_seen='2099-12-31T23:59:59Z' WHERE public_key='AAAA1111'"); err != nil { + t.Fatalf("mutate: %v", err) + } + + // Call 2: should match call 1 if first_seen is cached. + m2 := srv.buildNodeInfoMap() + first2 := m2["aaaa1111"].FirstSeen + if first2 != first1 { + t.Errorf("buildNodeInfoMap re-scanned nodes.first_seen uncached (#1627 r3 regression): "+ + "call 1 saw %q, call 2 saw %q after out-of-band UPDATE; expected both calls to return "+ + "the cached value because getCachedNodesAndPM has a 30s TTL", + first1, first2) + } +} + +// TestGetAllNodes_FirstSeenSchemaFallback exercises the schema-probe rung that +// fires when nodes.first_seen is missing. The richest SELECT errors out, the +// loop falls through to the next-richest query, and the resulting nodeInfo +// values must have empty FirstSeen with no panic. Regression coverage for the +// existing fallback branch (#1632 review loop 1). +func TestGetAllNodes_FirstSeenSchemaFallback(t *testing.T) { + tmpDir := t.TempDir() + dbPath := tmpDir + "/test.db" + + // Seed a nodes table WITHOUT first_seen (advert_count + last_seen present). + rw, err := sql.Open("sqlite", dbPath) + if err != nil { + t.Fatal(err) + } + defer rw.Close() + for _, stmt := range []string{ + "CREATE TABLE nodes (public_key TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL, last_seen TEXT, advert_count INTEGER)", + "CREATE TABLE observers (id TEXT, name TEXT, iata TEXT)", + "INSERT INTO nodes VALUES ('BBBB2222', 'Repeater-2', 'repeater', 0, 0, '2024-02-02T00:00:00Z', 3)", + } { + if _, err := rw.Exec(stmt); err != nil { + t.Fatalf("seed exec %q: %v", stmt, err) + } + } + + db, err := OpenDB(dbPath) + if err != nil { + t.Fatal(err) + } + defer db.conn.Close() + + store := NewPacketStore(db, nil) + nodes := store.getAllNodes() + if len(nodes) != 1 { + t.Fatalf("expected 1 row from fallback rung, got %d", len(nodes)) + } + n := nodes[0] + if n.PublicKey != "BBBB2222" { + t.Errorf("PublicKey mismatch: got %q", n.PublicKey) + } + if n.FirstSeen != "" { + t.Errorf("FirstSeen should be empty when nodes.first_seen column is missing, got %q", n.FirstSeen) + } + if n.ObservationCount != 3 { + t.Errorf("ObservationCount should still populate from advert_count fallback, got %d", n.ObservationCount) + } +} diff --git a/cmd/server/store.go b/cmd/server/store.go index 870b7d63..7b9e4677 100644 --- a/cmd/server/store.go +++ b/cmd/server/store.go @@ -6064,23 +6064,32 @@ func (s *PacketStore) logSchemaDegradationOnce(msg string) { } func (s *PacketStore) getAllNodes() []nodeInfo { - // Schema probe: try richest → leanest. Logs a one-shot warning when we - // fall back to a thinner schema so operators see that a column is - // missing and the new tiebreak features are degraded. See #1197 - // (adversarial r1 #10). - rows, err := s.db.conn.Query("SELECT public_key, name, role, lat, lon, last_seen, COALESCE(advert_count, 0) FROM nodes") + // Schema probe: try richest → leanest. Logs a one-shot warning at each + // rung when we fall back to a thinner schema so operators see that a + // column is missing and the new tiebreak features are degraded. See + // #1197 (adversarial r1 #10). first_seen was added (#1627 r3) so callers + // like /api/nodes/{pk}/reach can avoid a per-request single-row SELECT; + // we fold it into the cached load (see getCachedNodesAndPM, 30s TTL) so + // the 4 buildNodeInfoMap call sites don't each pay for a fresh scan. + rows, err := s.db.conn.Query("SELECT public_key, name, role, lat, lon, last_seen, COALESCE(advert_count, 0), COALESCE(first_seen, '') FROM nodes") hasLastSeen := true hasAdvertCount := true + hasFirstSeen := true if err != nil { - s.logSchemaDegradationOnce("nodes.advert_count missing — tier-3/4 ObservationCount tiebreak degraded; resolveWithContext will fall back to lex-pubkey order") - rows, err = s.db.conn.Query("SELECT public_key, name, role, lat, lon, last_seen FROM nodes") - hasAdvertCount = false + s.logSchemaDegradationOnce("nodes.first_seen missing — buildNodeInfoMap will not populate FirstSeen for /api/nodes/{pk}/reach and friends") + hasFirstSeen = false + rows, err = s.db.conn.Query("SELECT public_key, name, role, lat, lon, last_seen, COALESCE(advert_count, 0) FROM nodes") if err != nil { - s.logSchemaDegradationOnce("nodes.last_seen missing — node freshness signal unavailable") - rows, err = s.db.conn.Query("SELECT public_key, name, role, lat, lon FROM nodes") - hasLastSeen = false + s.logSchemaDegradationOnce("nodes.advert_count missing — tier-3/4 ObservationCount tiebreak degraded; resolveWithContext will fall back to lex-pubkey order") + rows, err = s.db.conn.Query("SELECT public_key, name, role, lat, lon, last_seen FROM nodes") + hasAdvertCount = false if err != nil { - return nil + s.logSchemaDegradationOnce("nodes.last_seen missing — node freshness signal unavailable") + rows, err = s.db.conn.Query("SELECT public_key, name, role, lat, lon FROM nodes") + hasLastSeen = false + if err != nil { + return nil + } } } } @@ -6092,7 +6101,10 @@ func (s *PacketStore) getAllNodes() []nodeInfo { var lat, lon sql.NullFloat64 var lastSeen sql.NullString var advertCount sql.NullInt64 - if hasAdvertCount { + var firstSeen sql.NullString + if hasFirstSeen { + rows.Scan(&pk, &name, &role, &lat, &lon, &lastSeen, &advertCount, &firstSeen) + } else if hasAdvertCount { rows.Scan(&pk, &name, &role, &lat, &lon, &lastSeen, &advertCount) } else if hasLastSeen { rows.Scan(&pk, &name, &role, &lat, &lon, &lastSeen) @@ -6100,6 +6112,9 @@ func (s *PacketStore) getAllNodes() []nodeInfo { rows.Scan(&pk, &name, &role, &lat, &lon) } n := nodeInfo{PublicKey: pk, Name: nullStrVal(name), Role: nullStrVal(role)} + if hasFirstSeen && firstSeen.Valid { + n.FirstSeen = firstSeen.String + } if lat.Valid && lon.Valid { n.Lat = lat.Float64 n.Lon = lon.Float64