Files
meshcore-analyzer/cmd/server/main.go
T
MeshCore Bot dbadef3e2f refactor(db): move all server writes to ingestor; server truly read-only (#1283)
Eliminates the SQLITE_BUSY VACUUM bug from #1283 by making cmd/server
truly read-only. The bug surfaced when supervisord launched both
ingestor + server in one container: the ingestor took the write lock for
INSERTs, then the server's VACUUM-on-startup immediately failed with
SQLITE_BUSY. Same race latently affected three other server-side writes.

Four write operations moved out of cmd/server/:

1. VACUUM / auto_vacuum migration (cmd/server/vacuum.go, entire file)
   → cmd/ingestor/db.go Store.CheckAutoVacuum (already existed;
     ingestor runs it BEFORE the MQTT subscriber starts so there is
     no contention with concurrent writes).

2. PruneOldPackets (DELETE FROM transmissions)
   cmd/server/db.go → cmd/ingestor/maintenance.go (new file,
     Store.PruneOldPackets) + main.go scheduler.

3. PruneOldMetrics (DELETE FROM observer_metrics)
   cmd/server/db.go → cmd/ingestor/db.go Store.PruneOldMetrics
     (already existed).

4. RemoveStaleObservers (UPDATE observers SET inactive = 1)
   cmd/server/db.go → cmd/ingestor/db.go Store.RemoveStaleObservers
     (already existed).

Server-side changes:
- vacuum.go deleted; checkAutoVacuum / runIncrementalVacuum gone.
- cmd/server/db.go: PruneOldPackets, PruneOldMetrics, RemoveStaleObservers
  deleted.
- cmd/server/main.go: packet/metrics/observer prune schedulers removed;
  the neighbor-edge prune scheduler (PruneNeighborEdges) is intentionally
  left in place — outside scope of #1283, tracked separately.
- routes.go + openapi.go: /api/admin/prune endpoint removed (prune is
  scheduled by the ingestor now; operators restart the ingestor for an
  ad-hoc pass).

Ingestor changes:
- New cmd/ingestor/maintenance.go with Store.PruneOldPackets.
- cmd/ingestor/config.go gains RetentionConfig.PacketDays and
  Config.PacketDaysOrZero().
- cmd/ingestor/main.go runs PruneOldPackets at startup (if
  packetDays > 0) and on a 24h ticker.

Docs:
- AGENTS.md: documents the read/write separation invariant.
- config.example.json: notes that retention + vacuumOnStartup are
  consumed by the ingestor.

TDD:
- Red: bb1d749a — invariant tests + Store.PruneOldPackets stub.
- Green: this commit — real implementation + server-side removals.

Note: cachedRW() still has three out-of-scope callers in cmd/server
(neighbor_persist.go, ensure_indexes.go, from_pubkey_migration.go).
Those are pre-existing write paths not covered by issue #1283 and are
left untouched per the issue scope. Future work can relocate them
under the same invariant.
2026-05-19 06:31:17 +00:00

530 lines
18 KiB
Go

package main
import (
"context"
"database/sql"
"flag"
"fmt"
"log"
"net/http"
_ "net/http/pprof"
"os"
"os/exec"
"os/signal"
"path/filepath"
"strings"
"sync"
"syscall"
"time"
"github.com/gorilla/mux"
)
// Build metadata. Each value is set via -ldflags at build time and is the
// empty string when the binary was built without them.
var (
	// Version is the release version; see resolveVersion for the fallback.
	Version string
	// Commit is the source revision; see resolveCommit for the fallbacks.
	Commit string
	// BuildTime is the build timestamp; see resolveBuildTime for the fallback.
	BuildTime string
)
// resolveCommit reports the source revision of the running binary.
//
// Sources are consulted in order, first hit wins:
//  1. the Commit variable (set via -ldflags),
//  2. a ".git-commit" file in the working directory (baked by Docker / CI),
//     ignored when it is empty or contains the literal "unknown",
//  3. the output of `git rev-parse --short HEAD`.
//
// It returns "unknown" when the git command fails.
func resolveCommit() string {
	if Commit != "" {
		return Commit
	}

	if raw, readErr := os.ReadFile(".git-commit"); readErr == nil {
		switch baked := strings.TrimSpace(string(raw)); baked {
		case "", "unknown":
			// Placeholder content — fall through to the runtime git lookup.
		default:
			return baked
		}
	}

	out, gitErr := exec.Command("git", "rev-parse", "--short", "HEAD").Output()
	if gitErr != nil {
		return "unknown"
	}
	return strings.TrimSpace(string(out))
}
// resolveVersion returns the build-time Version, or "unknown" when the
// binary was built without one.
func resolveVersion() string {
	if v := Version; v != "" {
		return v
	}
	return "unknown"
}
// resolveBuildTime returns the build-time BuildTime stamp, or "unknown"
// when the binary was built without one.
func resolveBuildTime() string {
	if ts := BuildTime; ts != "" {
		return ts
	}
	return "unknown"
}
// main wires up and runs the CoreScope HTTP/WebSocket server.
//
// Startup order: optional pprof listener, flag + config resolution, Go
// soft memory limit, database open and schema checks, in-memory packet
// store load, column/table migrations, neighbor graph load (or background
// build), background recomputers, the neighbor-edge prune scheduler, and
// finally the HTTP listener.
//
// A SIGINT/SIGTERM handler performs an ordered shutdown (poller, prune
// ticker, analytics recomputers, HTTP drain, WebSocket hub, database).
// main blocks until that handler has finished, so the deferred cleanup
// below never races the HTTP drain.
func main() {
	// pprof profiling — off by default, enable with ENABLE_PPROF=true
	if os.Getenv("ENABLE_PPROF") == "true" {
		pprofPort := os.Getenv("PPROF_PORT")
		if pprofPort == "" {
			pprofPort = "6060"
		}
		go func() {
			log.Printf("[pprof] profiling UI at http://localhost:%s/debug/pprof/", pprofPort)
			if err := http.ListenAndServe(":"+pprofPort, nil); err != nil {
				log.Printf("[pprof] failed to start: %v (non-fatal)", err)
			}
		}()
	}

	var (
		configDir string
		port      int
		dbPath    string
		publicDir string
		pollMs    int
	)

	flag.StringVar(&configDir, "config-dir", ".", "Directory containing config.json")
	flag.IntVar(&port, "port", 0, "HTTP port (overrides config)")
	flag.StringVar(&dbPath, "db", "", "SQLite database path (overrides config/env)")
	flag.StringVar(&publicDir, "public", "public", "Directory to serve static files from")
	flag.IntVar(&pollMs, "poll-ms", 1000, "SQLite poll interval for WebSocket broadcast (ms)")
	flag.Parse()

	// Load config
	cfg, err := LoadConfig(configDir)
	if err != nil {
		log.Printf("[config] warning: %v (using defaults)", err)
	}

	// CLI flags override config
	if port > 0 {
		cfg.Port = port
	}
	if cfg.Port == 0 {
		cfg.Port = 3000
	}
	if dbPath != "" {
		cfg.DBPath = dbPath
	}
	if cfg.APIKey == "" {
		log.Printf("[security] WARNING: no apiKey configured — write endpoints are BLOCKED (set apiKey in config.json to enable them)")
	} else if IsWeakAPIKey(cfg.APIKey) {
		log.Printf("[security] WARNING: API key is weak or a known default — write endpoints are vulnerable")
	}

	// Apply Go runtime soft memory limit (#836).
	// Honors GOMEMLIMIT if set; otherwise derives from packetStore.maxMemoryMB.
	{
		_, envSet := os.LookupEnv("GOMEMLIMIT")
		maxMB := 0
		if cfg.PacketStore != nil {
			maxMB = cfg.PacketStore.MaxMemoryMB
		}
		limit, source := applyMemoryLimit(maxMB, envSet)
		switch source {
		case "env":
			log.Printf("[memlimit] using GOMEMLIMIT from environment (%s)", os.Getenv("GOMEMLIMIT"))
		case "derived":
			log.Printf("[memlimit] derived from packetStore.maxMemoryMB=%d → %d MiB (1.5x headroom)", maxMB, limit/(1024*1024))
		default:
			log.Printf("[memlimit] no soft memory limit set (GOMEMLIMIT unset, packetStore.maxMemoryMB=0); recommend setting one to avoid container OOM-kill")
		}
	}

	// Resolve DB path
	resolvedDB := cfg.ResolveDBPath(configDir)
	log.Printf("[config] port=%d db=%s public=%s", cfg.Port, resolvedDB, publicDir)
	if len(cfg.NodeBlacklist) > 0 {
		log.Printf("[config] nodeBlacklist: %d node(s) will be hidden from API", len(cfg.NodeBlacklist))
		for _, pk := range cfg.NodeBlacklist {
			if trimmed := strings.ToLower(strings.TrimSpace(pk)); trimmed != "" {
				log.Printf("[config] blacklisted: %s", trimmed)
			}
		}
	}

	// Open database
	database, err := OpenDB(resolvedDB)
	if err != nil {
		log.Fatalf("[db] failed to open %s: %v", resolvedDB, err)
	}
	// dbClose is shared by the deferred cleanup and the signal handler; the
	// sync.Once guarantees database.Close runs at most once.
	var dbCloseOnce sync.Once
	dbClose := func() error {
		var err error
		dbCloseOnce.Do(func() { err = database.Close() })
		return err
	}
	defer dbClose()

	// Verify DB has expected tables
	var tableName string
	err = database.conn.QueryRow("SELECT name FROM sqlite_master WHERE type='table' AND name='transmissions'").Scan(&tableName)
	switch {
	case err == sql.ErrNoRows:
		log.Fatalf("[db] table 'transmissions' not found — is this a CoreScope database?")
	case err != nil:
		// Any other failure (I/O, locking, ...) is surfaced instead of being
		// silently dropped, but stays non-fatal as before.
		log.Printf("[db] warning: could not verify 'transmissions' table: %v", err)
	}

	stats, err := database.GetStats()
	if err != nil {
		log.Printf("[db] warning: could not read stats: %v", err)
	} else {
		log.Printf("[db] transmissions=%d observations=%d nodes=%d observers=%d",
			stats.TotalTransmissions, stats.TotalObservations, stats.TotalNodes, stats.TotalObservers)
	}

	// auto_vacuum is checked + migrated by the ingestor (#1283). The
	// server is read-only and must not race the writer for the lock.

	// Ensure indexes the server's SQL fallback path depends on
	// (mirrors ingestor schema for DBs created by old server-only builds).
	if err := ensureServerIndexes(resolvedDB); err != nil {
		log.Printf("[db] warning: could not ensure server indexes: %v", err)
	}

	// In-memory packet store
	store := NewPacketStore(database, cfg.PacketStore, cfg.CacheTTL)
	if err := store.Load(); err != nil {
		log.Fatalf("[store] failed to load: %v", err)
	}
	if store.hotStartupHours > 0 {
		log.Printf("[store] starting background load: filling retentionHours=%gh from hotStartupHours=%gh",
			store.retentionHours, store.hotStartupHours)
		go store.loadBackgroundChunks()
	}

	// Initialize persisted neighbor graph
	dbPath = database.path
	if err := ensureNeighborEdgesTable(dbPath); err != nil {
		log.Printf("[neighbor] warning: could not create neighbor_edges table: %v", err)
	}

	// Add resolved_path column if missing.
	// NOTE on startup ordering (review item #10): ensureResolvedPathColumn runs AFTER
	// OpenDB/detectSchema, so db.hasResolvedPath will be false on first run with a
	// pre-existing DB. This means Load() won't SELECT resolved_path from SQLite.
	// The async backfill (see backfillResolvedPathsAsync below) runs after HTTP
	// starts; presumably it also persists resolved paths to SQLite (the original
	// note is truncated here — confirm). On next restart, detectSchema finds the
	// column and Load() reads it.
	if err := ensureResolvedPathColumn(dbPath); err != nil {
		log.Printf("[store] warning: could not add resolved_path column: %v", err)
	} else {
		database.hasResolvedPath = true // detectSchema ran before column was added; fix the flag
	}

	// Ensure observers.inactive column exists (PR #954 filters on it; ingestor migration
	// adds it but server may run against DBs ingestor never touched, e.g. e2e fixture).
	if err := ensureObserverInactiveColumn(dbPath); err != nil {
		log.Printf("[store] warning: could not add observers.inactive column: %v", err)
	}

	// Ensure observers.last_packet_at column exists (PR #905 reads it; ingestor migration
	// adds it but server may run against DBs ingestor never touched, e.g. e2e fixture).
	if err := ensureLastPacketAtColumn(dbPath); err != nil {
		log.Printf("[store] warning: could not add observers.last_packet_at column: %v", err)
	}

	// Ensure observers.iata column exists (#1188 read paths COALESCE(obs.iata, '')
	// in Store.Load() / IngestNewFromDB / IngestNewObservations; ingestor migration
	// adds it but server may run against DBs ingestor never touched (e2e fixture)
	// OR pre-iata operator DBs upgraded to this build — without this migration
	// the first SELECT crashes with "no such column: obs.iata" (#1189 R1).
	if err := ensureObserverIATAColumn(dbPath); err != nil {
		log.Printf("[store] warning: could not add observers.iata column: %v", err)
	}

	// Ensure nodes.foreign_advert column exists (#730 reads it on every /api/nodes
	// scan; ingestor migration foreign_advert_v1 adds it but server may run against
	// DBs ingestor never touched, e.g. e2e fixture).
	if err := ensureForeignAdvertColumn(dbPath); err != nil {
		log.Printf("[store] warning: could not add nodes.foreign_advert column: %v", err)
	}

	// Ensure transmissions.from_pubkey column + index exists (#1143). Backfill
	// for legacy NULL rows runs async after HTTP starts so it can't block boot
	// even on prod-sized DBs (100K+ transmissions).
	if err := ensureFromPubkeyColumn(dbPath); err != nil {
		log.Printf("[store] warning: could not add transmissions.from_pubkey column: %v", err)
	}

	// Soft-delete observers that are in the blacklist (mark inactive=1) so
	// historical data from a prior unblocked window is hidden too.
	if len(cfg.ObserverBlacklist) > 0 {
		softDeleteBlacklistedObservers(dbPath, cfg.ObserverBlacklist)
	}

	// WaitGroup for background init steps that gate /api/healthz readiness.
	var initWg sync.WaitGroup

	// Load or build neighbor graph
	if neighborEdgesTableExists(database.conn) {
		store.graph.Store(loadNeighborEdgesFromDB(database.conn))
		log.Printf("[neighbor] loaded persisted neighbor graph")
	} else {
		log.Printf("[neighbor] no persisted edges found, will build in background...")
		store.graph.Store(NewNeighborGraph()) // empty graph — gets populated by background goroutine
		initWg.Add(1)
		go func() {
			defer initWg.Done()
			defer func() {
				if r := recover(); r != nil {
					log.Printf("[neighbor] graph build panic recovered: %v", r)
				}
			}()
			rw, rwErr := cachedRW(dbPath)
			if rwErr == nil {
				edgeCount := buildAndPersistEdges(store, rw)
				log.Printf("[neighbor] persisted %d edges", edgeCount)
			} else {
				// Persisting is best-effort; the in-memory graph is still
				// built below, but the failure should be visible.
				log.Printf("[neighbor] warning: cachedRW failed, edges not persisted: %v", rwErr)
			}
			built := BuildFromStore(store)
			store.graph.Store(built)
			log.Printf("[neighbor] graph build complete")
		}()
	}

	// Initial pickBestObservation runs in background — doesn't need to block HTTP.
	// API serves best-effort data until this completes (~10s for 100K txs).
	// Processes in chunks of 5000, releasing the lock between chunks so API
	// handlers remain responsive.
	initWg.Add(1)
	go func() {
		defer initWg.Done()
		defer func() {
			if r := recover(); r != nil {
				log.Printf("[store] pickBestObservation panic recovered: %v", r)
			}
		}()
		const chunkSize = 5000
		store.mu.RLock()
		totalPackets := len(store.packets)
		store.mu.RUnlock()
		for i := 0; i < totalPackets; i += chunkSize {
			end := i + chunkSize
			if end > totalPackets {
				end = totalPackets
			}
			store.mu.Lock()
			// Re-check len(store.packets) under the lock: the slice may have
			// shrunk since totalPackets was sampled.
			for j := i; j < end && j < len(store.packets); j++ {
				pickBestObservation(store.packets[j])
			}
			store.mu.Unlock()
			if end < totalPackets {
				time.Sleep(10 * time.Millisecond) // yield to API handlers
			}
		}
		log.Printf("[store] initial pickBestObservation complete (%d transmissions)", totalPackets)
	}()

	// Mark server ready once all background init completes.
	go func() {
		initWg.Wait()
		readiness.Store(1)
		log.Printf("[server] readiness: ready=true (background init complete)")
	}()

	// WebSocket hub
	hub := NewHub()

	// HTTP server
	srv := NewServer(database, cfg, hub)
	srv.store = store
	router := mux.NewRouter()
	srv.RegisterRoutes(router)

	// WebSocket endpoint
	router.HandleFunc("/ws", hub.ServeWS)

	// Static files + SPA fallback
	absPublic, _ := filepath.Abs(publicDir)
	if _, err := os.Stat(absPublic); err == nil {
		fs := http.FileServer(http.Dir(absPublic))
		router.PathPrefix("/").Handler(wsOrStatic(hub, spaHandler(absPublic, fs)))
		log.Printf("[static] serving %s", absPublic)
	} else {
		log.Printf("[static] directory %s not found — API-only mode", absPublic)
		router.PathPrefix("/").HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.Header().Set("Content-Type", "text/html")
			w.Write([]byte(`<!DOCTYPE html><html><body><h1>CoreScope</h1><p>Frontend not found. API available at /api/</p></body></html>`))
		})
	}

	// Start SQLite poller for WebSocket broadcast
	poller := NewPoller(database, hub, time.Duration(pollMs)*time.Millisecond)
	poller.store = store
	go poller.Start()

	// Start periodic eviction
	stopEviction := store.StartEvictionTicker()
	defer stopEviction()

	// Steady-state analytics recomputers (issue #1240). Replaces the
	// on-request compute-then-cache pattern for the default (region="",
	// zero-window) analytics queries with a background refresh loop so
	// reads always hit cache in <1ms.
	stopAnalyticsRecomp := store.StartAnalyticsRecomputers(
		cfg.AnalyticsDefaultRecomputeInterval(),
		cfg.AnalyticsRecomputeIntervals(),
	)
	defer stopAnalyticsRecomp()
	log.Printf("[analytics-recompute] background recompute enabled (default=%s)", cfg.AnalyticsDefaultRecomputeInterval())

	// Steady-state repeater-enrichment recomputer (issue #1262).
	// Prewarms the bulk caches feeding handleNodes so the very first
	// /api/nodes?limit=2000 from live.js's SPA bootstrap hits a
	// populated cache instead of paying a 15.7s on-thread rebuild.
	// Uses the configured RelayActiveHours window and the same
	// default recompute interval as the other analytics caches.
	relayWindowHours := cfg.GetHealthThresholds().RelayActiveHours
	stopRepeaterEnrichRecomp := store.StartRepeaterEnrichmentRecomputer(
		relayWindowHours,
		cfg.AnalyticsDefaultRecomputeInterval(),
	)
	defer stopRepeaterEnrichRecomp()
	log.Printf("[repeater-enrich-recompute] background recompute enabled (window=%.1fh, interval=%s)",
		relayWindowHours, cfg.AnalyticsDefaultRecomputeInterval())

	// Steady-state bridge-centrality recomputer (issue #672 axis 2).
	// Computes betweenness centrality over the in-memory neighbor
	// graph and stores the per-pubkey score map atomically. Read by
	// handleNodes via a single atomic load.
	stopBridgeRecomp := store.StartBridgeScoreRecomputer(
		cfg.AnalyticsDefaultRecomputeInterval(),
	)
	defer stopBridgeRecomp()
	log.Printf("[bridge-recompute] background recompute enabled (interval=%s)",
		cfg.AnalyticsDefaultRecomputeInterval())

	// Packet / metrics / observer retention moved to the ingestor in
	// #1283 (writes only belong on the writer process). The server no
	// longer schedules any of these; the ingestor's tickers handle them.
	_ = cfg.IncrementalVacuumPages() // kept reachable for config validation; not used here

	// Neighbor-edge prune scheduler: one pass after a 4-minute start delay,
	// then one pass per 24h tick, until stopEdgePrune is called.
	var stopEdgePrune func()
	{
		maxAgeDays := cfg.NeighborMaxAgeDays()
		edgePruneTicker := time.NewTicker(24 * time.Hour)
		edgePruneDone := make(chan struct{})
		stopEdgePrune = func() {
			edgePruneTicker.Stop()
			close(edgePruneDone)
		}
		go func() {
			defer func() {
				if r := recover(); r != nil {
					log.Printf("[neighbor-prune] panic recovered: %v", r)
				}
			}()
			// Delay the first pass so it does not compete with startup work.
			// The wait is interruptible: a stop requested during the delay
			// must not be followed by a prune against a closing database.
			startDelay := time.NewTimer(4 * time.Minute)
			defer startDelay.Stop()
			select {
			case <-startDelay.C:
			case <-edgePruneDone:
				return
			}
			g := store.graph.Load()
			PruneNeighborEdges(dbPath, g, maxAgeDays)
			for {
				select {
				case <-edgePruneTicker.C:
					g := store.graph.Load()
					PruneNeighborEdges(dbPath, g, maxAgeDays)
				case <-edgePruneDone:
					return
				}
			}
		}()
		log.Printf("[neighbor-prune] auto-prune enabled: edges older than %d days", maxAgeDays)
	}

	// Graceful shutdown
	httpServer := &http.Server{
		Addr:         fmt.Sprintf(":%d", cfg.Port),
		Handler:      router,
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 60 * time.Second,
		IdleTimeout:  120 * time.Second,
	}

	// shutdownDone is closed once the signal handler has finished its full
	// shutdown sequence. main waits on it after ListenAndServe returns,
	// because ListenAndServe returns ErrServerClosed as soon as Shutdown is
	// *called*, not when the drain has completed.
	shutdownDone := make(chan struct{})

	go func() {
		sigCh := make(chan os.Signal, 1)
		signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
		sig := <-sigCh
		defer close(shutdownDone)
		log.Printf("[server] received %v, shutting down...", sig)

		// 1. Stop accepting new WebSocket/poll data
		poller.Stop()

		// 1b. Stop auto-prune ticker (server-side packet/metrics/observer
		// prunes were removed in #1283; only neighbor-edge prune remains.)
		if stopEdgePrune != nil {
			stopEdgePrune()
		}

		// 1c. Stop steady-state analytics recomputers (issue #1240).
		// Must happen before dbClose so any in-flight compute that
		// reaches into SQLite has finished.
		if stopAnalyticsRecomp != nil {
			stopAnalyticsRecomp()
		}

		// 2. Gracefully drain HTTP connections (up to 15s)
		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
		defer cancel()
		if err := httpServer.Shutdown(ctx); err != nil {
			log.Printf("[server] HTTP shutdown error: %v", err)
		}

		// 3. Close WebSocket hub
		hub.Close()

		// 4. Close database (release SQLite WAL lock)
		if err := dbClose(); err != nil {
			log.Printf("[server] DB close error: %v", err)
		}
		log.Println("[server] shutdown complete")
	}()

	log.Printf("[server] CoreScope (Go) listening on http://localhost:%d", cfg.Port)

	// Start async backfill in background — HTTP is now available.
	go backfillResolvedPathsAsync(store, dbPath, 5000, 100*time.Millisecond, cfg.BackfillHours())

	// #1143: backfill from_pubkey for legacy ADVERT rows. Async so even
	// 100K+ rows can't block boot; queries handle NULL gracefully.
	// startFromPubkeyBackfill wraps the goroutine dispatch so the async
	// contract is testable (see TestBackfillFromPubkey_DoesNotBlockBoot).
	startFromPubkeyBackfill(dbPath, 5000, 100*time.Millisecond)

	// Migrate old content hashes in background (one-time, idempotent).
	go migrateContentHashesAsync(store, 5000, 100*time.Millisecond)

	if err := httpServer.ListenAndServe(); err != http.ErrServerClosed {
		log.Fatalf("[server] %v", err)
	}

	// ErrServerClosed is only produced by the Shutdown call in the signal
	// handler above. Wait for that handler to finish draining connections
	// and closing the hub/database before returning; otherwise the deferred
	// cleanup (and process exit) would cut the graceful shutdown short.
	<-shutdownDone
}
// spaHandler wraps a static file handler with single-page-app behavior.
//
// index.html is read from root once, when the handler is built, and every
// "__BUST__" placeholder in it is replaced with the current Unix timestamp
// so browsers fetch fresh JS/CSS after each server restart. If index.html
// cannot be read, a small placeholder page is used instead.
//
// Per request:
//   - "/" and "/index.html" get the pre-processed index page;
//   - a path that does not exist under root gets the same page (SPA fallback);
//   - anything else is delegated to fs, with caching disabled for
//     .js, .css and .html files.
func spaHandler(root string, fs http.Handler) http.Handler {
	const noCache = "no-cache, no-store, must-revalidate"

	page, readErr := os.ReadFile(filepath.Join(root, "index.html"))
	if readErr != nil {
		log.Printf("[static] warning: could not read index.html for cache-bust: %v", readErr)
		page = []byte("<!DOCTYPE html><html><body><h1>CoreScope</h1><p>index.html not found</p></body></html>")
	}

	stamp := fmt.Sprintf("%d", time.Now().Unix())
	rendered := []byte(strings.ReplaceAll(string(page), "__BUST__", stamp))
	log.Printf("[static] cache-bust value: %s", stamp)

	// serveIndex writes the pre-processed index page with caching disabled.
	serveIndex := func(w http.ResponseWriter) {
		hdr := w.Header()
		hdr.Set("Content-Type", "text/html; charset=utf-8")
		hdr.Set("Cache-Control", noCache)
		w.Write(rendered)
	}

	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/", "/index.html":
			serveIndex(w)
			return
		}

		target := filepath.Join(root, r.URL.Path)
		if _, statErr := os.Stat(target); os.IsNotExist(statErr) {
			// Unknown path: treat it as a client-side route.
			serveIndex(w)
			return
		}

		// Disable caching for JS/CSS/HTML
		switch filepath.Ext(target) {
		case ".js", ".css", ".html":
			w.Header().Set("Cache-Control", noCache)
		}
		fs.ServeHTTP(w, r)
	})
}