Files
meshcore-analyzer/cmd/ingestor/maintenance.go
T
Kpa-clawbot 9383201c07 refactor(db): finish #1283 — Option 4: ingestor owns neighbor-graph + schema migrations; server is read-only (fixes #1287) (#1289)
Red commit:
https://github.com/Kpa-clawbot/CoreScope/commit/eae179b99b5fd34924547632aa8f8025c405aa53
(CI: pending — opens with this PR)

Finishes #1283. RED test `TestServerSourceHasNoCachedRWCalls` goes from
failing (13 writer call-sites) to GREEN (zero). Per #1287 Option 4
(https://github.com/Kpa-clawbot/CoreScope/issues/1287#issuecomment-4485099992):
ingestor owns the neighbor graph build + persist; server reads the
snapshot.

**Category A — Schema migrations** → new `internal/dbschema` package.
`dbschema.Apply(rw)` runs in `cmd/ingestor` startup (in `OpenStore`).
`dbschema.AssertReady(ro)` runs in `cmd/server/main.go` and
FATAL-LOG-EXITS if any expected column/index/table is missing — the
operator must restart the ingestor first. Covers indexes,
`neighbor_edges`, `observations.resolved_path`,
`observers.{inactive,last_packet_at,iata}`,
`(inactive_)nodes.foreign_advert`, `transmissions.from_pubkey`.

**Category B — Backfill** → ingestor.
`BackfillFromPubkey` and observer-blacklist soft-delete moved to
`cmd/ingestor/maintenance.go`. Server keeps an inert
`fromPubkeyBackfillSnapshot` stub for `/api/healthz` API compatibility.

**Category C — Neighbor-graph persistence (Option 4)** → ingestor
writes, server reads.
- Ingestor (`cmd/ingestor/neighbor_builder.go`): every 60s scans
`observations + transmissions`, extracts edges (originator↔first-hop for
ADVERTs; observer↔last-hop for all), resolves hop prefixes via a
node-table prefix index, upserts into `neighbor_edges`.
- Server (`cmd/server/neighbor_recomputer.go`): every 60s re-reads
`neighbor_edges` and atomic-swaps the resulting `NeighborGraph` into
`s.graph`. Initial load is synchronous on startup. All server-side
incremental edge writers (the two `asyncPersistResolvedPathsAndEdges`
paths in `cmd/server/store.go`) are gone.
- Neighbor-edge daily prune (`PruneNeighborEdges`) moved to ingestor.

**Why Option 4**: clean read/write separation, no startup CPU spike
(server loads existing snapshot instead of rebuilding from history), no
IPC/delta-protocol churn. Staleness budget ~60s — same model as the
analytics recomputers in #1240 / #1248 / #672 axis 2.

**Recomputer interval default for neighbor graph**: 60s
(`NeighborGraphRecomputerDefaultInterval`,
`NeighborEdgesBuilderInterval`).

**Invariants added**:
- `TestServerSourceHasNoCachedRWCalls` (RED commit eae179b9): grep
enforces zero `cachedRW(`, `mode=rw`, or `sql.Open(_journal_mode=WAL…)`
in non-test `cmd/server/` sources.
- `TestServerStartupRequiresMigratedSchema`: server refuses to start
against an unmigrated DB.
- `TestNeighborGraphRecomputerLoadsSnapshot`: post-write snapshot is
picked up on the next refresh.
- `TestNeighborEdgesBuilderUpsertsFromObservations`: end-to-end pipeline
writes the expected edge.

`grep cachedRW cmd/server/*.go | grep -v _test.go` → 0 matches.

Fixes #1287.

---------

Co-authored-by: MeshCore Bot <bot@meshcore.local>
Co-authored-by: Kpa-clawbot <Kpa-clawbot@users.noreply.github.com>
Co-authored-by: corescope-bot <bot@corescope.local>
2026-05-19 23:53:41 -07:00

223 lines
6.4 KiB
Go

package main
import (
"database/sql"
"encoding/json"
"fmt"
"log"
"time"
"github.com/meshcore-analyzer/dbschema"
)
// PruneOldPackets deletes transmissions (and their child observations)
// older than `days`. Returns count of transmissions deleted.
//
// Owned by the ingestor per #1283: the writer process is the only one
// allowed to hold the DB write lock; previously this lived in
// cmd/server/db.go and raced ingestor INSERTs (SQLITE_BUSY).
func (s *Store) PruneOldPackets(days int) (int64, error) {
if days <= 0 {
return 0, nil
}
cutoff := time.Now().UTC().AddDate(0, 0, -days).Format(time.RFC3339)
tx, err := s.db.Begin()
if err != nil {
return 0, fmt.Errorf("prune begin: %w", err)
}
defer tx.Rollback()
// Delete child observations first (no CASCADE in SQLite).
if _, err := tx.Exec(`DELETE FROM observations WHERE transmission_id IN (
SELECT id FROM transmissions WHERE first_seen < ?
)`, cutoff); err != nil {
return 0, fmt.Errorf("prune observations: %w", err)
}
res, err := tx.Exec(`DELETE FROM transmissions WHERE first_seen < ?`, cutoff)
if err != nil {
return 0, fmt.Errorf("prune transmissions: %w", err)
}
n, _ := res.RowsAffected()
if err := tx.Commit(); err != nil {
return 0, fmt.Errorf("prune commit: %w", err)
}
if n > 0 {
log.Printf("[prune] deleted %d transmissions older than %d days", n, days)
}
return n, nil
}
// SoftDeleteBlacklistedObservers marks observers in the blacklist as
// inactive=1 so they are hidden from API responses. Owned by ingestor
// per #1287. Runs once at startup.
func (s *Store) SoftDeleteBlacklistedObservers(blacklist []string) {
n, err := dbschema.SoftDeleteBlacklistedObservers(s.db, blacklist)
if err != nil {
log.Printf("[observer-blacklist] warning: soft-delete failed: %v", err)
return
}
if n > 0 {
log.Printf("[observer-blacklist] soft-deleted %d blacklisted observer(s)", n)
}
}
// PruneNeighborEdges deletes rows older than maxAgeDays from
// neighbor_edges. Owned by the ingestor per #1287 (was in cmd/server).
// Returns DB rows deleted.
func (s *Store) PruneNeighborEdges(maxAgeDays int) (int64, error) {
if maxAgeDays <= 0 {
return 0, nil
}
cutoff := time.Now().UTC().Add(-time.Duration(maxAgeDays) * 24 * time.Hour).Format(time.RFC3339)
res, err := s.db.Exec("DELETE FROM neighbor_edges WHERE last_seen < ?", cutoff)
if err != nil {
return 0, fmt.Errorf("prune neighbor_edges: %w", err)
}
n, _ := res.RowsAffected()
if n > 0 {
log.Printf("[neighbor-prune] removed %d DB rows older than %d days", n, maxAgeDays)
}
return n, nil
}
// ─── from_pubkey backfill (#1143) ──────────────────────────────────────────
//
// Moved from cmd/server/from_pubkey_migration.go in #1287. Runs from the
// ingestor's maintenance loop. Populates transmissions.from_pubkey for
// ADVERT rows whose value is still NULL, by parsing decoded_json.pubKey.
// FromPubkeyBackfillStats holds progress for /api/healthz exposure.
// The ingestor exposes these via stats_file.go so the server can read
// them without writing.
type FromPubkeyBackfillStats struct {
Total int64 `json:"total"`
Processed int64 `json:"processed"`
Done bool `json:"done"`
}
// BackfillFromPubkey scans transmissions where from_pubkey IS NULL and
// payload_type = 4 (ADVERT) and populates from_pubkey from decoded_json.
// Chunked + yields between batches. Safe to call repeatedly; once a row
// is set to either "" or hex it never matches the WHERE clause again.
func (s *Store) BackfillFromPubkey(chunkSize int, yieldDuration time.Duration, progress func(total, processed int64, done bool)) {
defer func() {
if r := recover(); r != nil {
log.Printf("[backfill] from_pubkey panic recovered: %v", r)
}
if progress != nil {
progress(0, 0, true) // signal done; values overwritten below if collected
}
}()
if chunkSize <= 0 {
chunkSize = 5000
}
var total int64
if err := s.db.QueryRow(
"SELECT COUNT(*) FROM transmissions WHERE from_pubkey IS NULL AND payload_type = 4",
).Scan(&total); err != nil {
log.Printf("[backfill] from_pubkey count error: %v", err)
return
}
if total == 0 {
log.Println("[backfill] from_pubkey: nothing to do")
if progress != nil {
progress(0, 0, true)
}
return
}
if progress != nil {
progress(total, 0, false)
}
log.Printf("[backfill] from_pubkey starting: %d ADVERT rows", total)
stmt, err := s.db.Prepare("UPDATE transmissions SET from_pubkey = ? WHERE id = ?")
if err != nil {
log.Printf("[backfill] from_pubkey prepare: %v", err)
return
}
defer stmt.Close()
var processed int64
for {
rows, err := s.db.Query(
"SELECT id, decoded_json FROM transmissions WHERE from_pubkey IS NULL AND payload_type = 4 LIMIT ?",
chunkSize)
if err != nil {
log.Printf("[backfill] from_pubkey select: %v", err)
return
}
type row struct {
id int64
pk string
}
batch := make([]row, 0, chunkSize)
for rows.Next() {
var id int64
var dj sql.NullString
if err := rows.Scan(&id, &dj); err != nil {
continue
}
batch = append(batch, row{id: id, pk: extractPubkeyFromAdvertJSON(dj.String)})
}
rows.Close()
if len(batch) == 0 {
break
}
tx, err := s.db.Begin()
if err != nil {
log.Printf("[backfill] from_pubkey begin tx: %v", err)
return
}
txStmt := tx.Stmt(stmt)
for _, b := range batch {
// Sentinel: "" = scanned-no-pubkey (so the WHERE clause
// won't keep rescanning this row). hex = real pubkey.
var val interface{} = ""
if b.pk != "" {
val = b.pk
}
if _, err := txStmt.Exec(val, b.id); err != nil {
log.Printf("[backfill] from_pubkey update id=%d: %v", b.id, err)
}
}
if err := tx.Commit(); err != nil {
log.Printf("[backfill] from_pubkey commit: %v", err)
return
}
processed += int64(len(batch))
if progress != nil {
progress(total, processed, false)
}
if len(batch) < chunkSize {
break
}
if yieldDuration > 0 {
time.Sleep(yieldDuration)
}
}
log.Printf("[backfill] from_pubkey complete: %d rows processed", processed)
if progress != nil {
progress(total, processed, true)
}
}
// extractPubkeyFromAdvertJSON parses an ADVERT decoded_json blob and
// returns the pubKey field, or "" if absent/invalid.
func extractPubkeyFromAdvertJSON(s string) string {
if s == "" {
return ""
}
var m map[string]interface{}
if err := json.Unmarshal([]byte(s), &m); err != nil {
return ""
}
if v, ok := m["pubKey"].(string); ok {
return v
}
return ""
}