mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-06-06 10:01:36 +00:00
1da2034341
**Red commit:** f6290b63 — CI run will appear at
https://github.com/Kpa-clawbot/CoreScope/actions
Fixes #1283.
## What
Moves all four DB write operations out of `cmd/server/` into
`cmd/ingestor/`, making the server truly read-only and eliminating the
SQLITE_BUSY VACUUM bug at its root: the server can no longer race the
ingestor for the write lock because the server has no write path.
## The four operations
| # | Was in | Now in |
|---|--------|--------|
| 1 | `cmd/server/vacuum.go` (`checkAutoVacuum`, full VACUUM +
`auto_vacuum=INCREMENTAL` migration) | `cmd/ingestor/db.go`
`Store.CheckAutoVacuum` (already existed; ingestor runs it at startup
**before** the MQTT subscriber starts → no contention) |
| 2 | `cmd/server/db.go` `PruneOldPackets` (`DELETE FROM transmissions`)
| `cmd/ingestor/maintenance.go` `Store.PruneOldPackets` (new) + 24h
ticker in `cmd/ingestor/main.go` |
| 3 | `cmd/server/db.go` `PruneOldMetrics` (`DELETE FROM
observer_metrics`) | `cmd/ingestor/db.go` `Store.PruneOldMetrics`
(already existed) |
| 4 | `cmd/server/db.go` `RemoveStaleObservers` (`UPDATE observers SET
inactive=1`) | `cmd/ingestor/db.go` `Store.RemoveStaleObservers`
(already existed) |
## HTTP surface
- **Removed:** `POST /api/admin/prune` (`handleAdminPrune`, route,
openapi entry). Operators trigger an ad-hoc prune by restarting the
ingestor.
- **Kept:** `GET /api/backup` — uses `VACUUM INTO` which writes to a
separate file, not the live DB; read-only-safe.
## Tests
- `cmd/server/readonly_invariant_test.go` (RED gate) — reflect-asserts
`PruneOldPackets`/`PruneOldMetrics`/`RemoveStaleObservers` are NOT
methods on the server's `*DB`. Fails on master, passes after this PR.
- `cmd/ingestor/issue1283_test.go` — exercises `Store.PruneOldPackets`
and the auto_vacuum=NONE → INCREMENTAL migration through
`Store.CheckAutoVacuum` with `vacuumOnStartup=true`.
## Why the bug is gone
The SQLITE_BUSY VACUUM failure happened because supervisord launched
both ingestor + server in one container; the ingestor took the write
lock for INSERTs and the server's `checkAutoVacuum` then failed to
acquire it within `busy_timeout=5000`. After this PR, only the ingestor
ever opens a writable connection, and it runs `CheckAutoVacuum`
**before** spawning the MQTT subscriber → no contention possible.
## Scope notes
- `cachedRW()` still has three pre-existing callers in `cmd/server/`
(`neighbor_persist.go`, `ensure_indexes.go`,
`from_pubkey_migration.go`). These pre-date #1283 and are not in the
issue's four-operation list. Leaving them for follow-up keeps this PR
honest about scope; AGENTS.md documents the invariant so new write paths
can't sneak in.
- PII preflight reports false positives on the Go method name
`requireAPIKey` in `routes.go` diff context — no real PII.
- Server-side neighbor-edge prune (`PruneNeighborEdges`) intentionally
left in place — out of scope of #1283.
---------
Co-authored-by: MeshCore Bot <bot@meshcore.local>
99 lines
2.8 KiB
Go
99 lines
2.8 KiB
Go
package main
|
|
|
|
import (
|
|
"database/sql"
|
|
"path/filepath"
|
|
"testing"
|
|
"time"
|
|
|
|
_ "modernc.org/sqlite"
|
|
)
|
|
|
|
// TestIngestorPruneOldPackets enforces #1283: the writer for
|
|
// transmissions retention lives on the ingestor's *Store. Before the fix,
|
|
// this lived on cmd/server/*DB and raced with ingestor INSERTs. After
|
|
// the fix, ingestor owns it and runs it on its own write-locked handle.
|
|
func TestIngestorPruneOldPackets(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "prune.db")
|
|
store, err := OpenStore(path)
|
|
if err != nil {
|
|
t.Fatalf("OpenStore: %v", err)
|
|
}
|
|
defer store.Close()
|
|
|
|
old := time.Now().UTC().AddDate(0, 0, -10).Format(time.RFC3339)
|
|
new := time.Now().UTC().Format(time.RFC3339)
|
|
for i, ts := range []string{old, old, new} {
|
|
_, err := store.db.Exec(
|
|
`INSERT INTO transmissions (raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json)
|
|
VALUES (?, ?, ?, 0, 1, 1, '{}')`,
|
|
"AA", "h"+string(rune('a'+i)), ts,
|
|
)
|
|
if err != nil {
|
|
t.Fatalf("seed tx: %v", err)
|
|
}
|
|
}
|
|
|
|
n, err := store.PruneOldPackets(5)
|
|
if err != nil {
|
|
t.Fatalf("PruneOldPackets: %v", err)
|
|
}
|
|
if n != 2 {
|
|
t.Fatalf("expected 2 pruned, got %d", n)
|
|
}
|
|
|
|
var remaining int
|
|
if err := store.db.QueryRow(`SELECT COUNT(*) FROM transmissions`).Scan(&remaining); err != nil {
|
|
t.Fatalf("count: %v", err)
|
|
}
|
|
if remaining != 1 {
|
|
t.Fatalf("expected 1 transmission remaining, got %d", remaining)
|
|
}
|
|
}
|
|
|
|
// TestIngestorVacuumOnStartupMigratesNONEtoINCREMENTAL exercises the
|
|
// scenario that originally broke in #1283: a fresh DB with
|
|
// auto_vacuum=NONE, vacuumOnStartup=true, no contention from a server
|
|
// process. The ingestor must complete the VACUUM and flip auto_vacuum to
|
|
// INCREMENTAL. Before the fix, the migration ran inside cmd/server and
|
|
// hit SQLITE_BUSY because the ingestor (sharing the container) was
|
|
// already writing.
|
|
func TestIngestorVacuumOnStartupMigratesNONEtoINCREMENTAL(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "vac.db")
|
|
|
|
// Create a NONE-auto_vacuum DB (simulates an older deployment).
|
|
seed, err := sql.Open("sqlite", path+"?_pragma=journal_mode(WAL)")
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
seed.SetMaxOpenConns(1)
|
|
if _, err := seed.Exec(`CREATE TABLE dummy(id INTEGER PRIMARY KEY)`); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
var before int
|
|
seed.QueryRow("PRAGMA auto_vacuum").Scan(&before)
|
|
if before != 0 {
|
|
t.Fatalf("precondition: auto_vacuum=%d, want 0", before)
|
|
}
|
|
seed.Close()
|
|
|
|
store, err := OpenStore(path)
|
|
if err != nil {
|
|
t.Fatalf("OpenStore: %v", err)
|
|
}
|
|
defer store.Close()
|
|
|
|
cfg := &Config{DB: &DBConfig{VacuumOnStartup: true}}
|
|
store.CheckAutoVacuum(cfg)
|
|
|
|
var after int
|
|
if err := store.db.QueryRow("PRAGMA auto_vacuum").Scan(&after); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if after != 2 {
|
|
t.Fatalf("expected auto_vacuum=2 after ingestor VACUUM, got %d", after)
|
|
}
|
|
}
|