Files
meshcore-analyzer/cmd/server/main.go
Kpa-clawbot f87eb3601c fix: graceful container shutdown for reliable deployments (#453)
## Summary

Fixes #450 — staging deployment flaky due to container not shutting down
cleanly.

## Root Causes

1. **Server never closed DB on shutdown** — SQLite WAL lock held
indefinitely, blocking new container startup
2. **`httpServer.Close()` instead of `Shutdown()`** — abruptly kills
connections instead of draining them
3. **No `stop_grace_period` in compose configs** — Docker sends SIGTERM
then immediately SIGKILL (default 10s is often not enough for WAL
checkpoint)
4. **Supervisor didn't forward SIGTERM** — missing
`stopsignal`/`stopwaitsecs` meant Go processes got SIGKILL instead of
graceful shutdown
5. **Deploy scripts used default `docker stop` timeout** — only 10s
grace period

## Changes

### Go Server (`cmd/server/`)
- **Graceful HTTP shutdown**: `httpServer.Shutdown(ctx)` with 15s
context timeout — drains in-flight requests before closing
- **WebSocket cleanup**: New `Hub.Close()` method sends `CloseGoingAway`
frames to all connected clients
- **DB close on shutdown**: Explicitly closes DB after HTTP server stops
(was never closed before)
- **WAL checkpoint**: `PRAGMA wal_checkpoint(TRUNCATE)` before DB close
— flushes WAL to main DB file and removes WAL/SHM lock files

### Go Ingestor (`cmd/ingestor/`)
- **WAL checkpoint on shutdown**: New `Store.Checkpoint()` method,
called before `Close()`
- **Longer MQTT disconnect timeout**: 5s (was 1s) to allow in-flight
messages to drain

### Docker Compose (all 4 variants)
- Added `stop_grace_period: 30s` and `stop_signal: SIGTERM`

### Supervisor Configs (both variants)
- Added `stopsignal=TERM` and `stopwaitsecs=20` to server and ingestor
programs

### Deploy Scripts
- `deploy-staging.sh`: `docker stop -t 30` with explicit grace period
- `deploy-live.sh`: `docker stop -t 30` with explicit grace period

## Shutdown Sequence (after fix)

1. Docker sends SIGTERM to supervisord (PID 1)
2. Supervisord forwards SIGTERM to server + ingestor (waits up to 20s
each)
3. Server: stops poller → drains HTTP (15s) → closes WS clients →
checkpoints WAL → closes DB
4. Ingestor: stops tickers → disconnects MQTT (5s) → checkpoints WAL →
closes DB
5. Docker waits up to 30s total before SIGKILL

## Tests

All existing tests pass:
- `cd cmd/server && go test ./...` 
- `cd cmd/ingestor && go test ./...` 

---------

Co-authored-by: you <you@example.com>
Co-authored-by: Kpa-clawbot <kpabap+clawdbot@gmail.com>
2026-04-01 12:19:20 -07:00

259 lines
7.1 KiB
Go

package main
import (
"context"
"database/sql"
"flag"
"fmt"
"log"
"net/http"
_ "net/http/pprof"
"os"
"os/exec"
"os/signal"
"sync"
"path/filepath"
"strings"
"syscall"
"time"
"github.com/gorilla/mux"
)
// Set via -ldflags at build time
var Version string
var Commit string
var BuildTime string
func resolveCommit() string {
if Commit != "" {
return Commit
}
// Try .git-commit file (baked by Docker / CI)
if data, err := os.ReadFile(".git-commit"); err == nil {
if c := strings.TrimSpace(string(data)); c != "" && c != "unknown" {
return c
}
}
// Try git rev-parse at runtime
if out, err := exec.Command("git", "rev-parse", "--short", "HEAD").Output(); err == nil {
return strings.TrimSpace(string(out))
}
return "unknown"
}
func resolveVersion() string {
if Version != "" {
return Version
}
return "unknown"
}
func resolveBuildTime() string {
if BuildTime != "" {
return BuildTime
}
return "unknown"
}
func main() {
// pprof profiling — off by default, enable with ENABLE_PPROF=true
if os.Getenv("ENABLE_PPROF") == "true" {
pprofPort := os.Getenv("PPROF_PORT")
if pprofPort == "" {
pprofPort = "6060"
}
go func() {
log.Printf("[pprof] profiling UI at http://localhost:%s/debug/pprof/", pprofPort)
if err := http.ListenAndServe(":"+pprofPort, nil); err != nil {
log.Printf("[pprof] failed to start: %v (non-fatal)", err)
}
}()
}
var (
configDir string
port int
dbPath string
publicDir string
pollMs int
)
flag.StringVar(&configDir, "config-dir", ".", "Directory containing config.json")
flag.IntVar(&port, "port", 0, "HTTP port (overrides config)")
flag.StringVar(&dbPath, "db", "", "SQLite database path (overrides config/env)")
flag.StringVar(&publicDir, "public", "public", "Directory to serve static files from")
flag.IntVar(&pollMs, "poll-ms", 1000, "SQLite poll interval for WebSocket broadcast (ms)")
flag.Parse()
// Load config
cfg, err := LoadConfig(configDir)
if err != nil {
log.Printf("[config] warning: %v (using defaults)", err)
}
// CLI flags override config
if port > 0 {
cfg.Port = port
}
if cfg.Port == 0 {
cfg.Port = 3000
}
if dbPath != "" {
cfg.DBPath = dbPath
}
if cfg.APIKey == "" {
log.Printf("[security] WARNING: no apiKey configured — write endpoints are BLOCKED (set apiKey in config.json to enable them)")
}
// Resolve DB path
resolvedDB := cfg.ResolveDBPath(configDir)
log.Printf("[config] port=%d db=%s public=%s", cfg.Port, resolvedDB, publicDir)
// Open database
database, err := OpenDB(resolvedDB)
if err != nil {
log.Fatalf("[db] failed to open %s: %v", resolvedDB, err)
}
var dbCloseOnce sync.Once
dbClose := func() error {
var err error
dbCloseOnce.Do(func() { err = database.Close() })
return err
}
defer dbClose()
// Verify DB has expected tables
var tableName string
err = database.conn.QueryRow("SELECT name FROM sqlite_master WHERE type='table' AND name='transmissions'").Scan(&tableName)
if err == sql.ErrNoRows {
log.Fatalf("[db] table 'transmissions' not found — is this a CoreScope database?")
}
stats, err := database.GetStats()
if err != nil {
log.Printf("[db] warning: could not read stats: %v", err)
} else {
log.Printf("[db] transmissions=%d observations=%d nodes=%d observers=%d",
stats.TotalTransmissions, stats.TotalObservations, stats.TotalNodes, stats.TotalObservers)
}
// In-memory packet store
store := NewPacketStore(database, cfg.PacketStore)
if err := store.Load(); err != nil {
log.Fatalf("[store] failed to load: %v", err)
}
// WebSocket hub
hub := NewHub()
// HTTP server
srv := NewServer(database, cfg, hub)
srv.store = store
router := mux.NewRouter()
srv.RegisterRoutes(router)
// WebSocket endpoint
router.HandleFunc("/ws", hub.ServeWS)
// Static files + SPA fallback
absPublic, _ := filepath.Abs(publicDir)
if _, err := os.Stat(absPublic); err == nil {
fs := http.FileServer(http.Dir(absPublic))
router.PathPrefix("/").Handler(wsOrStatic(hub, spaHandler(absPublic, fs)))
log.Printf("[static] serving %s", absPublic)
} else {
log.Printf("[static] directory %s not found — API-only mode", absPublic)
router.PathPrefix("/").HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html><html><body><h1>CoreScope</h1><p>Frontend not found. API available at /api/</p></body></html>`))
})
}
// Start SQLite poller for WebSocket broadcast
poller := NewPoller(database, hub, time.Duration(pollMs)*time.Millisecond)
poller.store = store
go poller.Start()
// Start periodic eviction
stopEviction := store.StartEvictionTicker()
defer stopEviction()
// Auto-prune old packets if retention.packetDays is configured
if cfg.Retention != nil && cfg.Retention.PacketDays > 0 {
days := cfg.Retention.PacketDays
go func() {
time.Sleep(1 * time.Minute)
if n, err := database.PruneOldPackets(days); err != nil {
log.Printf("[prune] error: %v", err)
} else {
log.Printf("[prune] deleted %d transmissions older than %d days", n, days)
}
for range time.Tick(24 * time.Hour) {
if n, err := database.PruneOldPackets(days); err != nil {
log.Printf("[prune] error: %v", err)
} else {
log.Printf("[prune] deleted %d transmissions older than %d days", n, days)
}
}
}()
log.Printf("[prune] auto-prune enabled: packets older than %d days will be removed daily", days)
}
// Graceful shutdown
httpServer := &http.Server{
Addr: fmt.Sprintf(":%d", cfg.Port),
Handler: router,
ReadTimeout: 30 * time.Second,
WriteTimeout: 60 * time.Second,
IdleTimeout: 120 * time.Second,
}
go func() {
sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
sig := <-sigCh
log.Printf("[server] received %v, shutting down...", sig)
// 1. Stop accepting new WebSocket/poll data
poller.Stop()
// 2. Gracefully drain HTTP connections (up to 15s)
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()
if err := httpServer.Shutdown(ctx); err != nil {
log.Printf("[server] HTTP shutdown error: %v", err)
}
// 3. Close WebSocket hub
hub.Close()
// 4. Close database (release SQLite WAL lock)
if err := dbClose(); err != nil {
log.Printf("[server] DB close error: %v", err)
}
log.Println("[server] shutdown complete")
}()
log.Printf("[server] CoreScope (Go) listening on http://localhost:%d", cfg.Port)
if err := httpServer.ListenAndServe(); err != http.ErrServerClosed {
log.Fatalf("[server] %v", err)
}
}
// spaHandler serves static files, falling back to index.html for SPA routes.
func spaHandler(root string, fs http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
path := filepath.Join(root, r.URL.Path)
if _, err := os.Stat(path); os.IsNotExist(err) {
http.ServeFile(w, r, filepath.Join(root, "index.html"))
return
}
// Disable caching for JS/CSS/HTML
if filepath.Ext(path) == ".js" || filepath.Ext(path) == ".css" || filepath.Ext(path) == ".html" {
w.Header().Set("Cache-Control", "no-cache, no-store, must-revalidate")
}
fs.ServeHTTP(w, r)
})
}