mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-06-06 12:31:40 +00:00
9383201c07
Red commit:
https://github.com/Kpa-clawbot/CoreScope/commit/eae179b99b5fd34924547632aa8f8025c405aa53
(CI: pending — opens with this PR)
Finishes #1283. RED test `TestServerSourceHasNoCachedRWCalls` goes from
failing (13 writer call-sites) to GREEN (zero). Per #1287 Option 4
(https://github.com/Kpa-clawbot/CoreScope/issues/1287#issuecomment-4485099992):
ingestor owns the neighbor graph build + persist; server reads the
snapshot.
**Category A — Schema migrations** → new `internal/dbschema` package.
`dbschema.Apply(rw)` runs in `cmd/ingestor` startup (in `OpenStore`).
`dbschema.AssertReady(ro)` runs in `cmd/server/main.go`; it logs a fatal
error and exits if any expected column/index/table is missing — the
operator must restart the ingestor first. Covers indexes,
`neighbor_edges`, `observations.resolved_path`,
`observers.{inactive,last_packet_at,iata}`,
`(inactive_)nodes.foreign_advert`, `transmissions.from_pubkey`.
**Category B — Backfill** → ingestor.
`BackfillFromPubkey` and observer-blacklist soft-delete moved to
`cmd/ingestor/maintenance.go`. Server keeps an inert
`fromPubkeyBackfillSnapshot` stub for `/api/healthz` API compatibility.
**Category C — Neighbor-graph persistence (Option 4)** → ingestor
writes, server reads.
- Ingestor (`cmd/ingestor/neighbor_builder.go`): every 60s scans
`observations + transmissions`, extracts edges (originator↔first-hop for
ADVERTs; observer↔last-hop for all), resolves hop prefixes via a
node-table prefix index, upserts into `neighbor_edges`.
- Server (`cmd/server/neighbor_recomputer.go`): every 60s re-reads
`neighbor_edges` and atomic-swaps the resulting `NeighborGraph` into
`s.graph`. Initial load is synchronous on startup. All server-side
incremental edge writers (the two `asyncPersistResolvedPathsAndEdges`
paths in `cmd/server/store.go`) are gone.
- Neighbor-edge daily prune (`PruneNeighborEdges`) moved to ingestor.
**Why Option 4**: clean read/write separation, no startup CPU spike
(server loads existing snapshot instead of rebuilding from history), no
IPC/delta-protocol churn. Staleness budget ~60s — same model as the
analytics recomputers in #1240 / #1248 / #672 axis 2.
**Recomputer interval default for neighbor graph**: 60s
(`NeighborGraphRecomputerDefaultInterval`,
`NeighborEdgesBuilderInterval`).
**Invariants added**:
- `TestServerSourceHasNoCachedRWCalls` (RED commit eae179b9): grep
enforces zero `cachedRW(`, `mode=rw`, or `sql.Open(_journal_mode=WAL…)`
in non-test `cmd/server/` sources.
- `TestServerStartupRequiresMigratedSchema`: server refuses to start
against an unmigrated DB.
- `TestNeighborGraphRecomputerLoadsSnapshot`: post-write snapshot is
picked up on the next refresh.
- `TestNeighborEdgesBuilderUpsertsFromObservations`: end-to-end pipeline
writes the expected edge.
`grep cachedRW cmd/server/*.go | grep -v _test.go` → 0 matches.
Fixes #1287.
---------
Co-authored-by: MeshCore Bot <bot@meshcore.local>
Co-authored-by: Kpa-clawbot <Kpa-clawbot@users.noreply.github.com>
Co-authored-by: corescope-bot <bot@corescope.local>
223 lines
6.4 KiB
Go
223 lines
6.4 KiB
Go
package main
|
|
|
|
import (
|
|
"database/sql"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log"
|
|
"time"
|
|
|
|
"github.com/meshcore-analyzer/dbschema"
|
|
)
|
|
|
|
// PruneOldPackets deletes transmissions (and their child observations)
|
|
// older than `days`. Returns count of transmissions deleted.
|
|
//
|
|
// Owned by the ingestor per #1283: the writer process is the only one
|
|
// allowed to hold the DB write lock; previously this lived in
|
|
// cmd/server/db.go and raced ingestor INSERTs (SQLITE_BUSY).
|
|
func (s *Store) PruneOldPackets(days int) (int64, error) {
|
|
if days <= 0 {
|
|
return 0, nil
|
|
}
|
|
cutoff := time.Now().UTC().AddDate(0, 0, -days).Format(time.RFC3339)
|
|
|
|
tx, err := s.db.Begin()
|
|
if err != nil {
|
|
return 0, fmt.Errorf("prune begin: %w", err)
|
|
}
|
|
defer tx.Rollback()
|
|
|
|
// Delete child observations first (no CASCADE in SQLite).
|
|
if _, err := tx.Exec(`DELETE FROM observations WHERE transmission_id IN (
|
|
SELECT id FROM transmissions WHERE first_seen < ?
|
|
)`, cutoff); err != nil {
|
|
return 0, fmt.Errorf("prune observations: %w", err)
|
|
}
|
|
|
|
res, err := tx.Exec(`DELETE FROM transmissions WHERE first_seen < ?`, cutoff)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("prune transmissions: %w", err)
|
|
}
|
|
n, _ := res.RowsAffected()
|
|
if err := tx.Commit(); err != nil {
|
|
return 0, fmt.Errorf("prune commit: %w", err)
|
|
}
|
|
if n > 0 {
|
|
log.Printf("[prune] deleted %d transmissions older than %d days", n, days)
|
|
}
|
|
return n, nil
|
|
}
|
|
|
|
// SoftDeleteBlacklistedObservers marks observers in the blacklist as
|
|
// inactive=1 so they are hidden from API responses. Owned by ingestor
|
|
// per #1287. Runs once at startup.
|
|
func (s *Store) SoftDeleteBlacklistedObservers(blacklist []string) {
|
|
n, err := dbschema.SoftDeleteBlacklistedObservers(s.db, blacklist)
|
|
if err != nil {
|
|
log.Printf("[observer-blacklist] warning: soft-delete failed: %v", err)
|
|
return
|
|
}
|
|
if n > 0 {
|
|
log.Printf("[observer-blacklist] soft-deleted %d blacklisted observer(s)", n)
|
|
}
|
|
}
|
|
|
|
// PruneNeighborEdges deletes rows older than maxAgeDays from
|
|
// neighbor_edges. Owned by the ingestor per #1287 (was in cmd/server).
|
|
// Returns DB rows deleted.
|
|
func (s *Store) PruneNeighborEdges(maxAgeDays int) (int64, error) {
|
|
if maxAgeDays <= 0 {
|
|
return 0, nil
|
|
}
|
|
cutoff := time.Now().UTC().Add(-time.Duration(maxAgeDays) * 24 * time.Hour).Format(time.RFC3339)
|
|
res, err := s.db.Exec("DELETE FROM neighbor_edges WHERE last_seen < ?", cutoff)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("prune neighbor_edges: %w", err)
|
|
}
|
|
n, _ := res.RowsAffected()
|
|
if n > 0 {
|
|
log.Printf("[neighbor-prune] removed %d DB rows older than %d days", n, maxAgeDays)
|
|
}
|
|
return n, nil
|
|
}
|
|
|
|
// ─── from_pubkey backfill (#1143) ──────────────────────────────────────────
|
|
//
|
|
// Moved from cmd/server/from_pubkey_migration.go in #1287. Runs from the
|
|
// ingestor's maintenance loop. Populates transmissions.from_pubkey for
|
|
// ADVERT rows whose value is still NULL, by parsing decoded_json.pubKey.
|
|
|
|
// FromPubkeyBackfillStats is a progress snapshot of the from_pubkey
// backfill, serialized to JSON under the keys "total", "processed" and
// "done".
//
// Per the original note it exists for /api/healthz exposure: the ingestor
// publishes it via stats_file.go so the server can read it without
// writing.
type FromPubkeyBackfillStats struct {
	// Total is presumably the number of rows the backfill set out to
	// process — verify against the code that fills this struct.
	Total int64 `json:"total"`
	// Processed is presumably the number of rows handled so far.
	Processed int64 `json:"processed"`
	// Done presumably reports that the backfill has finished.
	Done bool `json:"done"`
}
|
|
|
|
// BackfillFromPubkey scans transmissions where from_pubkey IS NULL and
|
|
// payload_type = 4 (ADVERT) and populates from_pubkey from decoded_json.
|
|
// Chunked + yields between batches. Safe to call repeatedly; once a row
|
|
// is set to either "" or hex it never matches the WHERE clause again.
|
|
func (s *Store) BackfillFromPubkey(chunkSize int, yieldDuration time.Duration, progress func(total, processed int64, done bool)) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
log.Printf("[backfill] from_pubkey panic recovered: %v", r)
|
|
}
|
|
if progress != nil {
|
|
progress(0, 0, true) // signal done; values overwritten below if collected
|
|
}
|
|
}()
|
|
if chunkSize <= 0 {
|
|
chunkSize = 5000
|
|
}
|
|
|
|
var total int64
|
|
if err := s.db.QueryRow(
|
|
"SELECT COUNT(*) FROM transmissions WHERE from_pubkey IS NULL AND payload_type = 4",
|
|
).Scan(&total); err != nil {
|
|
log.Printf("[backfill] from_pubkey count error: %v", err)
|
|
return
|
|
}
|
|
if total == 0 {
|
|
log.Println("[backfill] from_pubkey: nothing to do")
|
|
if progress != nil {
|
|
progress(0, 0, true)
|
|
}
|
|
return
|
|
}
|
|
if progress != nil {
|
|
progress(total, 0, false)
|
|
}
|
|
log.Printf("[backfill] from_pubkey starting: %d ADVERT rows", total)
|
|
|
|
stmt, err := s.db.Prepare("UPDATE transmissions SET from_pubkey = ? WHERE id = ?")
|
|
if err != nil {
|
|
log.Printf("[backfill] from_pubkey prepare: %v", err)
|
|
return
|
|
}
|
|
defer stmt.Close()
|
|
|
|
var processed int64
|
|
for {
|
|
rows, err := s.db.Query(
|
|
"SELECT id, decoded_json FROM transmissions WHERE from_pubkey IS NULL AND payload_type = 4 LIMIT ?",
|
|
chunkSize)
|
|
if err != nil {
|
|
log.Printf("[backfill] from_pubkey select: %v", err)
|
|
return
|
|
}
|
|
type row struct {
|
|
id int64
|
|
pk string
|
|
}
|
|
batch := make([]row, 0, chunkSize)
|
|
for rows.Next() {
|
|
var id int64
|
|
var dj sql.NullString
|
|
if err := rows.Scan(&id, &dj); err != nil {
|
|
continue
|
|
}
|
|
batch = append(batch, row{id: id, pk: extractPubkeyFromAdvertJSON(dj.String)})
|
|
}
|
|
rows.Close()
|
|
if len(batch) == 0 {
|
|
break
|
|
}
|
|
|
|
tx, err := s.db.Begin()
|
|
if err != nil {
|
|
log.Printf("[backfill] from_pubkey begin tx: %v", err)
|
|
return
|
|
}
|
|
txStmt := tx.Stmt(stmt)
|
|
for _, b := range batch {
|
|
// Sentinel: "" = scanned-no-pubkey (so the WHERE clause
|
|
// won't keep rescanning this row). hex = real pubkey.
|
|
var val interface{} = ""
|
|
if b.pk != "" {
|
|
val = b.pk
|
|
}
|
|
if _, err := txStmt.Exec(val, b.id); err != nil {
|
|
log.Printf("[backfill] from_pubkey update id=%d: %v", b.id, err)
|
|
}
|
|
}
|
|
if err := tx.Commit(); err != nil {
|
|
log.Printf("[backfill] from_pubkey commit: %v", err)
|
|
return
|
|
}
|
|
processed += int64(len(batch))
|
|
if progress != nil {
|
|
progress(total, processed, false)
|
|
}
|
|
if len(batch) < chunkSize {
|
|
break
|
|
}
|
|
if yieldDuration > 0 {
|
|
time.Sleep(yieldDuration)
|
|
}
|
|
}
|
|
log.Printf("[backfill] from_pubkey complete: %d rows processed", processed)
|
|
if progress != nil {
|
|
progress(total, processed, true)
|
|
}
|
|
}
|
|
|
|
// extractPubkeyFromAdvertJSON decodes s as a JSON object and returns its
// "pubKey" member when that member is a string. It returns "" for empty
// input, for input that does not decode into an object, and when "pubKey"
// is missing or is not a string.
func extractPubkeyFromAdvertJSON(s string) string {
	if len(s) == 0 {
		return ""
	}

	var fields map[string]interface{}
	if json.Unmarshal([]byte(s), &fields) != nil {
		return ""
	}

	// A failed assertion leaves pubKey as the zero string, which is exactly
	// the "absent/invalid" result.
	pubKey, _ := fields["pubKey"].(string)
	return pubKey
}
|