mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-04-26 02:32:08 +00:00
## Summary Fixes #450 — staging deployment flaky due to container not shutting down cleanly. ## Root Causes 1. **Server never closed DB on shutdown** — SQLite WAL lock held indefinitely, blocking new container startup 2. **`httpServer.Close()` instead of `Shutdown()`** — abruptly kills connections instead of draining them 3. **No `stop_grace_period` in compose configs** — Docker sends SIGTERM then immediately SIGKILL (default 10s is often not enough for WAL checkpoint) 4. **Supervisor didn't forward SIGTERM** — missing `stopsignal`/`stopwaitsecs` meant Go processes got SIGKILL instead of graceful shutdown 5. **Deploy scripts used default `docker stop` timeout** — only 10s grace period ## Changes ### Go Server (`cmd/server/`) - **Graceful HTTP shutdown**: `httpServer.Shutdown(ctx)` with 15s context timeout — drains in-flight requests before closing - **WebSocket cleanup**: New `Hub.Close()` method sends `CloseGoingAway` frames to all connected clients - **DB close on shutdown**: Explicitly closes DB after HTTP server stops (was never closed before) - **WAL checkpoint**: `PRAGMA wal_checkpoint(TRUNCATE)` before DB close — flushes WAL to main DB file and removes WAL/SHM lock files ### Go Ingestor (`cmd/ingestor/`) - **WAL checkpoint on shutdown**: New `Store.Checkpoint()` method, called before `Close()` - **Longer MQTT disconnect timeout**: 5s (was 1s) to allow in-flight messages to drain ### Docker Compose (all 4 variants) - Added `stop_grace_period: 30s` and `stop_signal: SIGTERM` ### Supervisor Configs (both variants) - Added `stopsignal=TERM` and `stopwaitsecs=20` to server and ingestor programs ### Deploy Scripts - `deploy-staging.sh`: `docker stop -t 30` with explicit grace period - `deploy-live.sh`: `docker stop -t 30` with explicit grace period ## Shutdown Sequence (after fix) 1. Docker sends SIGTERM to supervisord (PID 1) 2. Supervisord forwards SIGTERM to server + ingestor (waits up to 20s each) 3. 
Server: stops poller → drains HTTP (15s) → closes WS clients → checkpoints WAL → closes DB 4. Ingestor: stops tickers → disconnects MQTT (5s) → checkpoints WAL → closes DB 5. Docker waits up to 30s total before SIGKILL ## Tests All existing tests pass: - `cd cmd/server && go test ./...` ✅ - `cd cmd/ingestor && go test ./...` ✅ --------- Co-authored-by: you <you@example.com> Co-authored-by: Kpa-clawbot <kpabap+clawdbot@gmail.com>
259 lines
7.1 KiB
Go
259 lines
7.1 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"net/http"
|
|
_ "net/http/pprof"
|
|
"os"
|
|
"os/exec"
|
|
"os/signal"
|
|
"sync"
|
|
"path/filepath"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/gorilla/mux"
|
|
)
|
|
|
|
// Build metadata. Each value is empty unless injected at link time, e.g.
// `go build -ldflags "-X main.Version=v1.2.3"`.
var (
	// Version is the release version string.
	Version string
	// Commit is the VCS commit the binary was built from.
	Commit string
	// BuildTime is the build timestamp.
	BuildTime string
)
|
|
|
|
func resolveCommit() string {
|
|
if Commit != "" {
|
|
return Commit
|
|
}
|
|
// Try .git-commit file (baked by Docker / CI)
|
|
if data, err := os.ReadFile(".git-commit"); err == nil {
|
|
if c := strings.TrimSpace(string(data)); c != "" && c != "unknown" {
|
|
return c
|
|
}
|
|
}
|
|
// Try git rev-parse at runtime
|
|
if out, err := exec.Command("git", "rev-parse", "--short", "HEAD").Output(); err == nil {
|
|
return strings.TrimSpace(string(out))
|
|
}
|
|
return "unknown"
|
|
}
|
|
|
|
func resolveVersion() string {
|
|
if Version != "" {
|
|
return Version
|
|
}
|
|
return "unknown"
|
|
}
|
|
|
|
func resolveBuildTime() string {
|
|
if BuildTime != "" {
|
|
return BuildTime
|
|
}
|
|
return "unknown"
|
|
}
|
|
|
|
func main() {
|
|
// pprof profiling — off by default, enable with ENABLE_PPROF=true
|
|
if os.Getenv("ENABLE_PPROF") == "true" {
|
|
pprofPort := os.Getenv("PPROF_PORT")
|
|
if pprofPort == "" {
|
|
pprofPort = "6060"
|
|
}
|
|
go func() {
|
|
log.Printf("[pprof] profiling UI at http://localhost:%s/debug/pprof/", pprofPort)
|
|
if err := http.ListenAndServe(":"+pprofPort, nil); err != nil {
|
|
log.Printf("[pprof] failed to start: %v (non-fatal)", err)
|
|
}
|
|
}()
|
|
}
|
|
|
|
var (
|
|
configDir string
|
|
port int
|
|
dbPath string
|
|
publicDir string
|
|
pollMs int
|
|
)
|
|
|
|
flag.StringVar(&configDir, "config-dir", ".", "Directory containing config.json")
|
|
flag.IntVar(&port, "port", 0, "HTTP port (overrides config)")
|
|
flag.StringVar(&dbPath, "db", "", "SQLite database path (overrides config/env)")
|
|
flag.StringVar(&publicDir, "public", "public", "Directory to serve static files from")
|
|
flag.IntVar(&pollMs, "poll-ms", 1000, "SQLite poll interval for WebSocket broadcast (ms)")
|
|
flag.Parse()
|
|
|
|
// Load config
|
|
cfg, err := LoadConfig(configDir)
|
|
if err != nil {
|
|
log.Printf("[config] warning: %v (using defaults)", err)
|
|
}
|
|
|
|
// CLI flags override config
|
|
if port > 0 {
|
|
cfg.Port = port
|
|
}
|
|
if cfg.Port == 0 {
|
|
cfg.Port = 3000
|
|
}
|
|
if dbPath != "" {
|
|
cfg.DBPath = dbPath
|
|
}
|
|
if cfg.APIKey == "" {
|
|
log.Printf("[security] WARNING: no apiKey configured — write endpoints are BLOCKED (set apiKey in config.json to enable them)")
|
|
}
|
|
|
|
// Resolve DB path
|
|
resolvedDB := cfg.ResolveDBPath(configDir)
|
|
log.Printf("[config] port=%d db=%s public=%s", cfg.Port, resolvedDB, publicDir)
|
|
|
|
// Open database
|
|
database, err := OpenDB(resolvedDB)
|
|
if err != nil {
|
|
log.Fatalf("[db] failed to open %s: %v", resolvedDB, err)
|
|
}
|
|
var dbCloseOnce sync.Once
|
|
dbClose := func() error {
|
|
var err error
|
|
dbCloseOnce.Do(func() { err = database.Close() })
|
|
return err
|
|
}
|
|
defer dbClose()
|
|
|
|
// Verify DB has expected tables
|
|
var tableName string
|
|
err = database.conn.QueryRow("SELECT name FROM sqlite_master WHERE type='table' AND name='transmissions'").Scan(&tableName)
|
|
if err == sql.ErrNoRows {
|
|
log.Fatalf("[db] table 'transmissions' not found — is this a CoreScope database?")
|
|
}
|
|
|
|
stats, err := database.GetStats()
|
|
if err != nil {
|
|
log.Printf("[db] warning: could not read stats: %v", err)
|
|
} else {
|
|
log.Printf("[db] transmissions=%d observations=%d nodes=%d observers=%d",
|
|
stats.TotalTransmissions, stats.TotalObservations, stats.TotalNodes, stats.TotalObservers)
|
|
}
|
|
|
|
// In-memory packet store
|
|
store := NewPacketStore(database, cfg.PacketStore)
|
|
if err := store.Load(); err != nil {
|
|
log.Fatalf("[store] failed to load: %v", err)
|
|
}
|
|
|
|
// WebSocket hub
|
|
hub := NewHub()
|
|
|
|
// HTTP server
|
|
srv := NewServer(database, cfg, hub)
|
|
srv.store = store
|
|
router := mux.NewRouter()
|
|
srv.RegisterRoutes(router)
|
|
|
|
// WebSocket endpoint
|
|
router.HandleFunc("/ws", hub.ServeWS)
|
|
|
|
// Static files + SPA fallback
|
|
absPublic, _ := filepath.Abs(publicDir)
|
|
if _, err := os.Stat(absPublic); err == nil {
|
|
fs := http.FileServer(http.Dir(absPublic))
|
|
router.PathPrefix("/").Handler(wsOrStatic(hub, spaHandler(absPublic, fs)))
|
|
log.Printf("[static] serving %s", absPublic)
|
|
} else {
|
|
log.Printf("[static] directory %s not found — API-only mode", absPublic)
|
|
router.PathPrefix("/").HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
w.Write([]byte(`<!DOCTYPE html><html><body><h1>CoreScope</h1><p>Frontend not found. API available at /api/</p></body></html>`))
|
|
})
|
|
}
|
|
|
|
// Start SQLite poller for WebSocket broadcast
|
|
poller := NewPoller(database, hub, time.Duration(pollMs)*time.Millisecond)
|
|
poller.store = store
|
|
go poller.Start()
|
|
|
|
// Start periodic eviction
|
|
stopEviction := store.StartEvictionTicker()
|
|
defer stopEviction()
|
|
|
|
// Auto-prune old packets if retention.packetDays is configured
|
|
if cfg.Retention != nil && cfg.Retention.PacketDays > 0 {
|
|
days := cfg.Retention.PacketDays
|
|
go func() {
|
|
time.Sleep(1 * time.Minute)
|
|
if n, err := database.PruneOldPackets(days); err != nil {
|
|
log.Printf("[prune] error: %v", err)
|
|
} else {
|
|
log.Printf("[prune] deleted %d transmissions older than %d days", n, days)
|
|
}
|
|
for range time.Tick(24 * time.Hour) {
|
|
if n, err := database.PruneOldPackets(days); err != nil {
|
|
log.Printf("[prune] error: %v", err)
|
|
} else {
|
|
log.Printf("[prune] deleted %d transmissions older than %d days", n, days)
|
|
}
|
|
}
|
|
}()
|
|
log.Printf("[prune] auto-prune enabled: packets older than %d days will be removed daily", days)
|
|
}
|
|
|
|
// Graceful shutdown
|
|
httpServer := &http.Server{
|
|
Addr: fmt.Sprintf(":%d", cfg.Port),
|
|
Handler: router,
|
|
ReadTimeout: 30 * time.Second,
|
|
WriteTimeout: 60 * time.Second,
|
|
IdleTimeout: 120 * time.Second,
|
|
}
|
|
|
|
go func() {
|
|
sigCh := make(chan os.Signal, 1)
|
|
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
|
sig := <-sigCh
|
|
log.Printf("[server] received %v, shutting down...", sig)
|
|
|
|
// 1. Stop accepting new WebSocket/poll data
|
|
poller.Stop()
|
|
|
|
// 2. Gracefully drain HTTP connections (up to 15s)
|
|
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
|
defer cancel()
|
|
if err := httpServer.Shutdown(ctx); err != nil {
|
|
log.Printf("[server] HTTP shutdown error: %v", err)
|
|
}
|
|
|
|
// 3. Close WebSocket hub
|
|
hub.Close()
|
|
|
|
// 4. Close database (release SQLite WAL lock)
|
|
if err := dbClose(); err != nil {
|
|
log.Printf("[server] DB close error: %v", err)
|
|
}
|
|
log.Println("[server] shutdown complete")
|
|
}()
|
|
|
|
log.Printf("[server] CoreScope (Go) listening on http://localhost:%d", cfg.Port)
|
|
if err := httpServer.ListenAndServe(); err != http.ErrServerClosed {
|
|
log.Fatalf("[server] %v", err)
|
|
}
|
|
}
|
|
|
|
// spaHandler serves static files through fs, falling back to root/index.html
// for any request path that does not exist under root (SPA client-side routes).
//
// The request path is normalised as a rooted path before the on-disk existence
// check, so ".." segments cannot make that check probe files outside root.
// Responses for .js, .css and .html files, and the index.html fallback, carry
// a Cache-Control header that disables caching.
func spaHandler(root string, fs http.Handler) http.Handler {
	const noCache = "no-cache, no-store, must-revalidate"
	indexPath := filepath.Join(root, "index.html")

	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Clean on a rooted path discards leading ".." elements, which keeps
		// the joined path inside root.
		path := filepath.Join(root, filepath.Clean("/"+r.URL.Path))
		if _, err := os.Stat(path); os.IsNotExist(err) {
			// The fallback is HTML too, so it gets the same no-cache policy.
			w.Header().Set("Cache-Control", noCache)
			http.ServeFile(w, r, indexPath)
			return
		}

		// Disable caching for JS/CSS/HTML
		switch filepath.Ext(path) {
		case ".js", ".css", ".html":
			w.Header().Set("Cache-Control", noCache)
		}
		fs.ServeHTTP(w, r)
	})
}
|