fix(reach): scanReachRows DB errors must surface as 500 not 404 (#1631) (#1635)

Red commit: 67088342ec (CI run: pending)

## Summary

Fixes #1631 — `scanReachRows` swallowed `QueryContext` / `rows.Err()`
failures and returned `nil`. The handler treated that as "genuinely no
reach" and rendered a 200 with empty arrays (or 404 in some flows), so
transient SQLite failures surfaced to operators as "this node has no
reach" — misleading and undiagnosable without log access.

## Fix

`cmd/server/node_reach.go`:
- `scanReachRows` now returns `([]pathRow, error)`; propagates
`QueryContext` + `rows.Err()` failures.
- `computeNodeReach` signature gains an error return: non-nil error
means real backend failure (NOT "unknown node").
- `handleNodeReach` renders **500** on that error path and does **NOT**
cache the failure (next request retries cleanly). Genuinely-empty reach
still renders **200** with empty arrays; unknown/blacklisted nodes still
render 404.

## TDD

- Red commit `67088342`: adds `TestNodeReach_ScanDBErrorReturns500` —
warms the integration DB, drops the `observations` table, asserts
handler returns 500. Pre-fix this got 200 with empty arrays.
- Green commit `5408be3a`: the fix + caller updates. Adds
`TestScanReachRows_ErrorReturn` (unit-level: closed-DB → non-nil err).
- `TestNodeReach_ShapeAndClamp` had to be tightened: the v2 fixture's
`observations` table was missing `observer_idx`; the swallowed error
masked that schema gap. Now rebuilt with the right shape.

## Scope

- `cmd/server/node_reach.go` — fix.
- `cmd/server/node_reach_endpoint_test.go` — new red test +
ShapeAndClamp fixture fix.
- `cmd/server/node_reach_test.go`, `node_reach_bench_test.go` — caller
updates for new signature + one new unit assertion test.

No cache changes (#1629 is separate). No sibling refactors. No frontend.

## Verification

- `go test ./cmd/server/...` — green (48s, all tests).
- pr-preflight — clean (PII, scope, red-commit, CSS vars, LIKE-on-JSON,
async-migration, XSS).

---------

Co-authored-by: clawbot <bot@kpa-clawbot.local>
This commit is contained in:
Kpa-clawbot
2026-06-09 00:27:56 -07:00
committed by GitHub
parent 718e74e8e3
commit 43be1bb76a
4 changed files with 113 additions and 17 deletions
+32 -11
View File
@@ -388,7 +388,13 @@ func (s *Server) handleNodeReach(w http.ResponseWriter, r *http.Request) {
if raw, ok := s.reachCacheGet(cacheKey); ok {
return raw, nil
}
resp, ok := s.computeNodeReach(r.Context(), pubkey, days)
resp, ok, cErr := s.computeNodeReach(r.Context(), pubkey, days)
if cErr != nil {
// Real backend failure (e.g. DB scan exploded) — propagate so the
// caller renders 500 instead of the misleading empty-reach
// response. Do NOT cache. (#1631)
return nil, cErr
}
if !ok {
return []byte(nil), nil
}
@@ -413,15 +419,18 @@ func (s *Server) handleNodeReach(w http.ResponseWriter, r *http.Request) {
w.Write(raw)
}
// computeNodeReach does the read-only scan + assembly. ok=false → 404.
func (s *Server) computeNodeReach(ctx context.Context, pubkey string, days int) (NodeReachResponse, bool) {
// computeNodeReach does the read-only scan + assembly. ok=false → 404
// (target node not present / inputs unavailable). A non-nil error signals a
// real backend failure (e.g. DB scan exploded) — caller should render 500,
// not 404 (issue #1631).
func (s *Server) computeNodeReach(ctx context.Context, pubkey string, days int) (NodeReachResponse, bool, error) {
if s.store == nil || s.db == nil || s.db.conn == nil {
return NodeReachResponse{}, false
return NodeReachResponse{}, false, nil
}
nodeMap := s.buildNodeInfoMap()
self, found := nodeMap[pubkey]
if !found {
return NodeReachResponse{}, false
return NodeReachResponse{}, false, nil
}
_, pm := s.store.getCachedNodesAndPM()
tokens := reliableTokens(pubkey, pm)
@@ -431,7 +440,10 @@ func (s *Server) computeNodeReach(ctx context.Context, pubkey string, days int)
var d dirCounts
if len(tokens) > 0 {
rows := s.scanReachRows(ctx, tokens, sinceEpoch)
rows, err := s.scanReachRows(ctx, tokens, sinceEpoch)
if err != nil {
return NodeReachResponse{}, false, err
}
d = attributeDirections(rows, tokens, pubkey, newResolver(pm))
} else {
d = dirCounts{we: map[string]int{}, they: map[string]int{}, obs: map[string]obsAgg{}}
@@ -520,7 +532,7 @@ func (s *Server) computeNodeReach(ctx context.Context, pubkey string, days int)
},
DirectObservers: directObs,
Links: links,
}, true
}, true, nil
}
// --- neighbor-degree snapshot ---------------------------------------------
@@ -602,9 +614,14 @@ func (s *Server) getDegreeSnapshot(ctx context.Context) *degreeSnapshot {
// id and originator pubkey are lowercased in SQL (not per row), the path slice
// is uppercased in place (no second allocation), and the result is hard-capped
// at reachScanRowLimit.
func (s *Server) scanReachRows(ctx context.Context, tokens map[string]bool, sinceEpoch int64) []pathRow {
//
// Returns a non-nil error if the underlying QueryContext or rows.Err() fails;
// callers MUST treat that as a 500 (issue #1631 — previously the error was
// swallowed, surfacing a transient DB failure as a misleading 404 / empty
// reach to operators).
func (s *Server) scanReachRows(ctx context.Context, tokens map[string]bool, sinceEpoch int64) ([]pathRow, error) {
if len(tokens) == 0 {
return nil // defensive: an empty LIKE chain would render `AND ()` (SQL error)
return nil, nil // defensive: an empty LIKE chain would render `AND ()` (SQL error)
}
likes := make([]string, 0, len(tokens))
args := []interface{}{sinceEpoch}
@@ -630,7 +647,7 @@ func (s *Server) scanReachRows(ctx context.Context, tokens map[string]bool, sinc
rows, err := s.db.conn.QueryContext(ctx, q, args...)
if err != nil {
log.Printf("[reach] scan query failed: %v", err)
return nil
return nil, err
}
defer rows.Close()
// Modest preallocation: most nodes return far fewer than the cap, so seed a
@@ -660,5 +677,9 @@ func (s *Server) scanReachRows(ctx context.Context, tokens map[string]bool, sinc
if skipped > 0 {
log.Printf("[reach] scan discarded %d malformed/empty rows (kept %d)", skipped, len(out))
}
return out
if err := rows.Err(); err != nil {
log.Printf("[reach] scan rows iteration failed: %v", err)
return nil, err
}
return out, nil
}
+28 -4
View File
@@ -77,7 +77,7 @@ func BenchmarkNodeReachScan(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
rows := srv.scanReachRows(context.Background(), tokens, 0)
rows, _ := srv.scanReachRows(context.Background(), tokens, 0)
if len(rows) == 0 {
b.Fatal("expected rows")
}
@@ -95,7 +95,7 @@ func BenchmarkNodeReachScanMixed(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
rows := srv.scanReachRows(context.Background(), tokens, 0)
rows, _ := srv.scanReachRows(context.Background(), tokens, 0)
if len(rows) == 0 {
b.Fatal("expected rows")
}
@@ -113,7 +113,7 @@ func BenchmarkNodeReachScanLowerCase(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
rows := srv.scanReachRows(context.Background(), tokens, 0)
rows, _ := srv.scanReachRows(context.Background(), tokens, 0)
if len(rows) == 0 {
b.Fatal("expected rows")
}
@@ -127,7 +127,7 @@ func BenchmarkNodeReachAttribute(b *testing.B) {
tokens := map[string]bool{"01FA": true}
db := benchReachDB(b, 100000, 1, false)
srv := &Server{db: db}
rows := srv.scanReachRows(context.Background(), tokens, 0)
rows, _ := srv.scanReachRows(context.Background(), tokens, 0)
if len(rows) == 0 {
b.Fatal("expected rows")
}
@@ -149,3 +149,27 @@ func BenchmarkNodeReachAttribute(b *testing.B) {
}
}
}
// TestScanReachRows_ErrorReturn anchors the new ([]pathRow, error) signature
// at the unit-level (issue #1631). Passing a Server whose db.conn is closed
// must surface an error, not a swallowed nil. Lives in this file because
// the bench callers in the same file rely on the same signature.
func TestScanReachRows_ErrorReturn(t *testing.T) {
conn, err := sql.Open("sqlite", ":memory:")
if err != nil {
t.Fatalf("open: %v", err)
}
// PREFLIGHT: async=true reason="test-only in-memory scratch schema, immediately closed"
if _, err := conn.Exec(`CREATE TABLE observations (id INTEGER); CREATE TABLE transmissions (id INTEGER); CREATE TABLE observers (rowid INTEGER, id TEXT)`); err != nil {
t.Fatalf("schema: %v", err)
}
conn.Close() // force QueryContext to fail
srv := &Server{db: &DB{conn: conn}}
rows, err := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0)
if err == nil {
t.Fatalf("expected error from closed DB, got nil (rows=%d)", len(rows))
}
if rows != nil {
t.Fatalf("expected nil rows on error, got %d", len(rows))
}
}
+51
View File
@@ -222,6 +222,21 @@ func TestNodeReach_ShapeAndClamp(t *testing.T) {
const pk = "01fa326b475800a31105abcb9e4cac000b3e5d9e2b5ba0739981ce8d5f3a6754"
mustExecDB(t, db, `INSERT INTO nodes (public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
VALUES ('`+pk+`', 'BE-Test', 'repeater', 50.9, 5.4, '2026-06-07T00:00:00Z', '2026-06-01T00:00:00Z', 3)`)
// scanReachRows joins observations on observer_idx; the v2 schema's
// observations table lacks that column. Previously the scan error was
// swallowed (issue #1631) and the test still saw empty arrays. With the
// fix that returns 500, we rebuild observations to the observer_idx
// shape (empty — no rows needed for shape/clamp assertions).
mustExecDB(t, db, `DROP TABLE observations`)
// PREFLIGHT: async=true reason="test-only in-memory schema rebuild; not a production migration"
mustExecDB(t, db, `CREATE TABLE observations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
transmission_id INTEGER,
observer_idx INTEGER,
snr REAL,
path_json TEXT,
timestamp INTEGER
)`)
cfg := &Config{}
srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()}
@@ -248,6 +263,42 @@ func TestNodeReach_ShapeAndClamp(t *testing.T) {
}
}
// Issue #1631: a DB failure inside scanReachRows must surface as 500, not
// as a misleading "no reach" 200 or 404. We warm the integration DB, drop
// the observations table so the next reach scan query fails inside
// QueryContext, then assert the handler returns 500 (not 200 with empty
// arrays, which is the buggy current behavior — scanReachRows swallows the
// error and returns nil).
func TestNodeReach_ScanDBErrorReturns500(t *testing.T) {
resetReachState(t)
db, n := newReachIntegrationDB(t, `["AABB","01FA","CCDD"]`)
defer db.conn.Close()
cfg := &Config{}
srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()}
// Warm the store's node cache (so buildNodeInfoMap on the failing call
// still finds the target node). One healthy call also primes the
// reach response cache — clear it below so the next call recomputes.
if rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30"); rr.Code != http.StatusOK {
t.Fatalf("warm-up call: status=%d want 200 (body=%s)", rr.Code, rr.Body.String())
}
srv.reach.cacheMu.Lock()
srv.reach.cache = map[string]reachCacheEntry{}
srv.reach.cacheMu.Unlock()
// Break the table that scanReachRows reads from. nodes / observers /
// neighbor_edges remain intact so the failure is isolated to the
// scanReachRows QueryContext path.
if _, err := db.conn.Exec("DROP TABLE observations"); err != nil {
t.Fatalf("drop observations: %v", err)
}
rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30")
if rr.Code != http.StatusInternalServerError {
t.Fatalf("expected 500 on DB error inside scanReachRows, got %d (body=%s)", rr.Code, rr.Body.String())
}
}
func contains(s []string, v string) bool {
for _, x := range s {
if x == v {
+2 -2
View File
@@ -179,7 +179,7 @@ func TestScanReachRows_CapTruncates(t *testing.T) {
db := newReachScanTestDB(t)
defer db.conn.Close()
srv := &Server{db: db}
rows := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0)
rows, _ := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0)
if len(rows) != 1 {
t.Fatalf("scan must hard-cap at reachScanRowLimit (1), got %d rows", len(rows))
}
@@ -264,7 +264,7 @@ func TestScanReachRows_DecodesRows(t *testing.T) {
db := newReachScanTestDB(t)
defer db.conn.Close()
srv := &Server{db: db}
rows := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0)
rows, _ := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0)
if len(rows) != 2 {
t.Fatalf("expected 2 matching rows (non-matching path excluded), got %d", len(rows))
}