diff --git a/cmd/server/node_reach.go b/cmd/server/node_reach.go index 07d3fe2f..77ade07a 100644 --- a/cmd/server/node_reach.go +++ b/cmd/server/node_reach.go @@ -388,7 +388,13 @@ func (s *Server) handleNodeReach(w http.ResponseWriter, r *http.Request) { if raw, ok := s.reachCacheGet(cacheKey); ok { return raw, nil } - resp, ok := s.computeNodeReach(r.Context(), pubkey, days) + resp, ok, cErr := s.computeNodeReach(r.Context(), pubkey, days) + if cErr != nil { + // Real backend failure (e.g. DB scan exploded) — propagate so the + // caller renders 500 instead of the misleading empty-reach + // response. Do NOT cache. (#1631) + return nil, cErr + } if !ok { return []byte(nil), nil } @@ -413,15 +419,18 @@ func (s *Server) handleNodeReach(w http.ResponseWriter, r *http.Request) { w.Write(raw) } -// computeNodeReach does the read-only scan + assembly. ok=false → 404. -func (s *Server) computeNodeReach(ctx context.Context, pubkey string, days int) (NodeReachResponse, bool) { +// computeNodeReach does the read-only scan + assembly. ok=false → 404 +// (target node not present / inputs unavailable). A non-nil error signals a +// real backend failure (e.g. DB scan exploded) — caller should render 500, +// not 404 (issue #1631). +func (s *Server) computeNodeReach(ctx context.Context, pubkey string, days int) (NodeReachResponse, bool, error) { if s.store == nil || s.db == nil || s.db.conn == nil { - return NodeReachResponse{}, false + return NodeReachResponse{}, false, nil } nodeMap := s.buildNodeInfoMap() self, found := nodeMap[pubkey] if !found { - return NodeReachResponse{}, false + return NodeReachResponse{}, false, nil } _, pm := s.store.getCachedNodesAndPM() tokens := reliableTokens(pubkey, pm) @@ -431,7 +440,10 @@ func (s *Server) computeNodeReach(ctx context.Context, pubkey string, days int) var d dirCounts if len(tokens) > 0 { - rows := s.scanReachRows(ctx, tokens, sinceEpoch) + rows, err := s.scanReachRows(ctx, tokens, sinceEpoch) + if err != nil { + return NodeReachResponse{}, false, err + } d = attributeDirections(rows, tokens, pubkey, newResolver(pm)) } else { d = dirCounts{we: map[string]int{}, they: map[string]int{}, obs: map[string]obsAgg{}} @@ -520,7 +532,7 @@ func (s *Server) computeNodeReach(ctx context.Context, pubkey string, days int) }, DirectObservers: directObs, Links: links, - }, true + }, true, nil } // --- neighbor-degree snapshot --------------------------------------------- @@ -602,9 +614,14 @@ func (s *Server) getDegreeSnapshot(ctx context.Context) *degreeSnapshot { // id and originator pubkey are lowercased in SQL (not per row), the path slice // is uppercased in place (no second allocation), and the result is hard-capped // at reachScanRowLimit. -func (s *Server) scanReachRows(ctx context.Context, tokens map[string]bool, sinceEpoch int64) []pathRow { +// +// Returns a non-nil error if the underlying QueryContext or rows.Err() fails; +// callers MUST treat that as a 500 (issue #1631 — previously the error was +// swallowed, surfacing a transient DB failure as a misleading 404 / empty +// reach to operators). +func (s *Server) scanReachRows(ctx context.Context, tokens map[string]bool, sinceEpoch int64) ([]pathRow, error) { if len(tokens) == 0 { - return nil // defensive: an empty LIKE chain would render `AND ()` (SQL error) + return nil, nil // defensive: an empty LIKE chain would render `AND ()` (SQL error) } likes := make([]string, 0, len(tokens)) args := []interface{}{sinceEpoch} @@ -630,7 +647,7 @@ func (s *Server) scanReachRows(ctx context.Context, tokens map[string]bool, sinc rows, err := s.db.conn.QueryContext(ctx, q, args...) if err != nil { log.Printf("[reach] scan query failed: %v", err) - return nil + return nil, err } defer rows.Close() // Modest preallocation: most nodes return far fewer than the cap, so seed a @@ -660,5 +677,9 @@ func (s *Server) scanReachRows(ctx context.Context, tokens map[string]bool, sinc if skipped > 0 { log.Printf("[reach] scan discarded %d malformed/empty rows (kept %d)", skipped, len(out)) } - return out + if err := rows.Err(); err != nil { + log.Printf("[reach] scan rows iteration failed: %v", err) + return nil, err + } + return out, nil } diff --git a/cmd/server/node_reach_bench_test.go b/cmd/server/node_reach_bench_test.go index efce7731..4159c80e 100644 --- a/cmd/server/node_reach_bench_test.go +++ b/cmd/server/node_reach_bench_test.go @@ -77,7 +77,7 @@ func BenchmarkNodeReachScan(b *testing.B) { b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { - rows := srv.scanReachRows(context.Background(), tokens, 0) + rows, _ := srv.scanReachRows(context.Background(), tokens, 0) if len(rows) == 0 { b.Fatal("expected rows") } @@ -95,7 +95,7 @@ func BenchmarkNodeReachScanMixed(b *testing.B) { b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { - rows := srv.scanReachRows(context.Background(), tokens, 0) + rows, _ := srv.scanReachRows(context.Background(), tokens, 0) if len(rows) == 0 { b.Fatal("expected rows") } @@ -113,7 +113,7 @@ func BenchmarkNodeReachScanLowerCase(b *testing.B) { b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { - rows := srv.scanReachRows(context.Background(), tokens, 0) + rows, _ := srv.scanReachRows(context.Background(), tokens, 0) if len(rows) == 0 { b.Fatal("expected rows") } @@ -127,7 +127,7 @@ func BenchmarkNodeReachAttribute(b *testing.B) { tokens := map[string]bool{"01FA": true} db := benchReachDB(b, 100000, 1, false) srv := &Server{db: db} - rows := srv.scanReachRows(context.Background(), tokens, 0) + rows, _ := srv.scanReachRows(context.Background(), tokens, 0) if len(rows) == 0 { b.Fatal("expected rows") } @@ -149,3 +149,27 @@ func BenchmarkNodeReachAttribute(b *testing.B) { } } } + +// TestScanReachRows_ErrorReturn anchors the new ([]pathRow, error) signature +// at the unit-level (issue #1631). Passing a Server whose db.conn is closed +// must surface an error, not a swallowed nil. Lives in this file because +// the bench callers in the same file rely on the same signature. +func TestScanReachRows_ErrorReturn(t *testing.T) { + conn, err := sql.Open("sqlite", ":memory:") + if err != nil { + t.Fatalf("open: %v", err) + } + // PREFLIGHT: async=true reason="test-only in-memory scratch schema, immediately closed" + if _, err := conn.Exec(`CREATE TABLE observations (id INTEGER); CREATE TABLE transmissions (id INTEGER); CREATE TABLE observers (rowid INTEGER, id TEXT)`); err != nil { + t.Fatalf("schema: %v", err) + } + conn.Close() // force QueryContext to fail + srv := &Server{db: &DB{conn: conn}} + rows, err := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0) + if err == nil { + t.Fatalf("expected error from closed DB, got nil (rows=%d)", len(rows)) + } + if rows != nil { + t.Fatalf("expected nil rows on error, got %d", len(rows)) + } +} diff --git a/cmd/server/node_reach_endpoint_test.go b/cmd/server/node_reach_endpoint_test.go index c740c8d4..866fb886 100644 --- a/cmd/server/node_reach_endpoint_test.go +++ b/cmd/server/node_reach_endpoint_test.go @@ -222,6 +222,21 @@ func TestNodeReach_ShapeAndClamp(t *testing.T) { const pk = "01fa326b475800a31105abcb9e4cac000b3e5d9e2b5ba0739981ce8d5f3a6754" mustExecDB(t, db, `INSERT INTO nodes (public_key, name, role, lat, lon, last_seen, first_seen, advert_count) VALUES ('`+pk+`', 'BE-Test', 'repeater', 50.9, 5.4, '2026-06-07T00:00:00Z', '2026-06-01T00:00:00Z', 3)`) + // scanReachRows joins observations on observer_idx; the v2 schema's + // observations table lacks that column. Previously the scan error was + // swallowed (issue #1631) and the test still saw empty arrays. With the + // fix that returns 500, we rebuild observations to the observer_idx + // shape (empty — no rows needed for shape/clamp assertions). + mustExecDB(t, db, `DROP TABLE observations`) + // PREFLIGHT: async=true reason="test-only in-memory schema rebuild; not a production migration" + mustExecDB(t, db, `CREATE TABLE observations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + transmission_id INTEGER, + observer_idx INTEGER, + snr REAL, + path_json TEXT, + timestamp INTEGER + )`) cfg := &Config{} srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()} @@ -248,6 +263,42 @@ func TestNodeReach_ShapeAndClamp(t *testing.T) { } } +// Issue #1631: a DB failure inside scanReachRows must surface as 500, not +// as a misleading "no reach" 200 or 404. We warm the integration DB, drop +// the observations table so the next reach scan query fails inside +// QueryContext, then assert the handler returns 500 (not 200 with empty +// arrays, which is the buggy current behavior — scanReachRows swallows the +// error and returns nil). +func TestNodeReach_ScanDBErrorReturns500(t *testing.T) { + resetReachState(t) + db, n := newReachIntegrationDB(t, `["AABB","01FA","CCDD"]`) + defer db.conn.Close() + cfg := &Config{} + srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()} + + // Warm the store's node cache (so buildNodeInfoMap on the failing call + // still finds the target node). One healthy call also primes the + // reach response cache — clear it below so the next call recomputes. + if rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30"); rr.Code != http.StatusOK { + t.Fatalf("warm-up call: status=%d want 200 (body=%s)", rr.Code, rr.Body.String()) + } + srv.reach.cacheMu.Lock() + srv.reach.cache = map[string]reachCacheEntry{} + srv.reach.cacheMu.Unlock() + + // Break the table that scanReachRows reads from. nodes / observers / + // neighbor_edges remain intact so the failure is isolated to the + // scanReachRows QueryContext path. + if _, err := db.conn.Exec("DROP TABLE observations"); err != nil { + t.Fatalf("drop observations: %v", err) + } + + rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30") + if rr.Code != http.StatusInternalServerError { + t.Fatalf("expected 500 on DB error inside scanReachRows, got %d (body=%s)", rr.Code, rr.Body.String()) + } +} + func contains(s []string, v string) bool { for _, x := range s { if x == v { diff --git a/cmd/server/node_reach_test.go b/cmd/server/node_reach_test.go index f576cf03..606f8e2c 100644 --- a/cmd/server/node_reach_test.go +++ b/cmd/server/node_reach_test.go @@ -179,7 +179,7 @@ func TestScanReachRows_CapTruncates(t *testing.T) { db := newReachScanTestDB(t) defer db.conn.Close() srv := &Server{db: db} - rows := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0) + rows, _ := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0) if len(rows) != 1 { t.Fatalf("scan must hard-cap at reachScanRowLimit (1), got %d rows", len(rows)) } @@ -264,7 +264,7 @@ func TestScanReachRows_DecodesRows(t *testing.T) { db := newReachScanTestDB(t) defer db.conn.Close() srv := &Server{db: db} - rows := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0) + rows, _ := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0) if len(rows) != 2 { t.Fatalf("expected 2 matching rows (non-matching path excluded), got %d", len(rows)) }