Files
meshcore-analyzer/cmd/server/websocket.go
Kpa-clawbot f87eb3601c fix: graceful container shutdown for reliable deployments (#453)
## Summary

Fixes #450 — staging deployment flaky due to container not shutting down
cleanly.

## Root Causes

1. **Server never closed DB on shutdown** — SQLite WAL lock held
indefinitely, blocking new container startup
2. **`httpServer.Close()` instead of `Shutdown()`** — abruptly kills
connections instead of draining them
3. **No `stop_grace_period` in compose configs** — Docker sends SIGTERM
then immediately SIGKILL (default 10s is often not enough for WAL
checkpoint)
4. **Supervisor didn't forward SIGTERM** — missing
`stopsignal`/`stopwaitsecs` meant Go processes got SIGKILL instead of
graceful shutdown
5. **Deploy scripts used default `docker stop` timeout** — only 10s
grace period

## Changes

### Go Server (`cmd/server/`)
- **Graceful HTTP shutdown**: `httpServer.Shutdown(ctx)` with 15s
context timeout — drains in-flight requests before closing
- **WebSocket cleanup**: New `Hub.Close()` method sends `CloseGoingAway`
frames to all connected clients
- **DB close on shutdown**: Explicitly closes DB after HTTP server stops
(was never closed before)
- **WAL checkpoint**: `PRAGMA wal_checkpoint(TRUNCATE)` before DB close
— flushes WAL to main DB file and removes WAL/SHM lock files

### Go Ingestor (`cmd/ingestor/`)
- **WAL checkpoint on shutdown**: New `Store.Checkpoint()` method,
called before `Close()`
- **Longer MQTT disconnect timeout**: 5s (was 1s) to allow in-flight
messages to drain

### Docker Compose (all 4 variants)
- Added `stop_grace_period: 30s` and `stop_signal: SIGTERM`

### Supervisor Configs (both variants)
- Added `stopsignal=TERM` and `stopwaitsecs=20` to server and ingestor
programs

### Deploy Scripts
- `deploy-staging.sh`: `docker stop -t 30` with explicit grace period
- `deploy-live.sh`: `docker stop -t 30` with explicit grace period

## Shutdown Sequence (after fix)

1. Docker sends SIGTERM to supervisord (PID 1)
2. Supervisord forwards SIGTERM to server + ingestor (waits up to 20s
each)
3. Server: stops poller → drains HTTP (15s) → closes WS clients →
checkpoints WAL → closes DB
4. Ingestor: stops tickers → disconnects MQTT (5s) → checkpoints WAL →
closes DB
5. Docker waits up to 30s total before SIGKILL

## Tests

All existing tests pass:
- `cd cmd/server && go test ./...` 
- `cd cmd/ingestor && go test ./...` 

---------

Co-authored-by: you <you@example.com>
Co-authored-by: Kpa-clawbot <kpabap+clawdbot@gmail.com>
2026-04-01 12:19:20 -07:00

274 lines
6.3 KiB
Go

package main
import (
"encoding/json"
"log"
"net/http"
"strings"
"sync"
"time"
"github.com/gorilla/websocket"
)
var upgrader = websocket.Upgrader{
ReadBufferSize: 1024,
WriteBufferSize: 4096,
CheckOrigin: func(r *http.Request) bool { return true },
}
// Hub manages WebSocket clients and broadcasts.
type Hub struct {
mu sync.RWMutex
clients map[*Client]bool
}
// Client is a single WebSocket connection.
type Client struct {
conn *websocket.Conn
send chan []byte
closeOnce sync.Once
}
func NewHub() *Hub {
return &Hub{
clients: make(map[*Client]bool),
}
}
func (h *Hub) ClientCount() int {
h.mu.RLock()
defer h.mu.RUnlock()
return len(h.clients)
}
func (h *Hub) Register(c *Client) {
h.mu.Lock()
h.clients[c] = true
h.mu.Unlock()
log.Printf("[ws] client connected (%d total)", h.ClientCount())
}
func (h *Hub) Unregister(c *Client) {
h.mu.Lock()
if _, ok := h.clients[c]; ok {
delete(h.clients, c)
c.closeOnce.Do(func() { close(c.send) })
}
h.mu.Unlock()
log.Printf("[ws] client disconnected (%d total)", h.ClientCount())
}
// Close gracefully disconnects all WebSocket clients.
func (h *Hub) Close() {
h.mu.Lock()
for c := range h.clients {
c.conn.WriteControl(
websocket.CloseMessage,
websocket.FormatCloseMessage(websocket.CloseGoingAway, "server shutting down"),
time.Now().Add(3*time.Second),
)
c.closeOnce.Do(func() { close(c.send) })
delete(h.clients, c)
}
h.mu.Unlock()
log.Println("[ws] all clients disconnected")
}
// Broadcast sends a message to all connected clients.
func (h *Hub) Broadcast(msg interface{}) {
data, err := json.Marshal(msg)
if err != nil {
log.Printf("[ws] marshal error: %v", err)
return
}
h.mu.RLock()
defer h.mu.RUnlock()
for c := range h.clients {
select {
case c.send <- data:
default:
// Client buffer full — drop
}
}
}
// ServeWS handles the WebSocket upgrade and runs the client.
func (h *Hub) ServeWS(w http.ResponseWriter, r *http.Request) {
conn, err := upgrader.Upgrade(w, r, nil)
if err != nil {
log.Printf("[ws] upgrade error: %v", err)
return
}
client := &Client{
conn: conn,
send: make(chan []byte, 256),
}
h.Register(client)
go client.writePump()
go client.readPump(h)
}
// wsOrStatic upgrades WebSocket requests at any path, serves static files otherwise.
func wsOrStatic(hub *Hub, static http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if strings.EqualFold(r.Header.Get("Upgrade"), "websocket") {
hub.ServeWS(w, r)
return
}
static.ServeHTTP(w, r)
})
}
func (c *Client) readPump(hub *Hub) {
defer func() {
hub.Unregister(c)
c.conn.Close()
}()
c.conn.SetReadLimit(512)
c.conn.SetReadDeadline(time.Now().Add(60 * time.Second))
c.conn.SetPongHandler(func(string) error {
c.conn.SetReadDeadline(time.Now().Add(60 * time.Second))
return nil
})
for {
_, _, err := c.conn.ReadMessage()
if err != nil {
break
}
}
}
func (c *Client) writePump() {
ticker := time.NewTicker(30 * time.Second)
defer func() {
ticker.Stop()
c.conn.Close()
}()
for {
select {
case message, ok := <-c.send:
c.conn.SetWriteDeadline(time.Now().Add(10 * time.Second))
if !ok {
c.conn.WriteMessage(websocket.CloseMessage, []byte{})
return
}
if err := c.conn.WriteMessage(websocket.TextMessage, message); err != nil {
return
}
case <-ticker.C:
c.conn.SetWriteDeadline(time.Now().Add(10 * time.Second))
if err := c.conn.WriteMessage(websocket.PingMessage, nil); err != nil {
return
}
}
}
}
// Poller watches for new transmissions in SQLite and broadcasts them.
type Poller struct {
db *DB
hub *Hub
store *PacketStore // optional: if set, new transmissions are ingested into memory
interval time.Duration
stop chan struct{}
}
func NewPoller(db *DB, hub *Hub, interval time.Duration) *Poller {
return &Poller{db: db, hub: hub, interval: interval, stop: make(chan struct{})}
}
func (p *Poller) Start() {
lastID := p.db.GetMaxTransmissionID()
lastObsID := p.db.GetMaxObservationID()
// If the store already loaded data, use its max IDs as a floor.
// This prevents replaying the entire DB when the DB query fails
// (e.g., corrupted DB returns 0 from COALESCE).
if p.store != nil {
if storeMax := p.store.MaxTransmissionID(); storeMax > lastID {
lastID = storeMax
}
if storeMaxObs := p.store.MaxObservationID(); storeMaxObs > lastObsID {
lastObsID = storeMaxObs
}
}
log.Printf("[poller] starting from transmission ID %d, obs ID %d, interval %v", lastID, lastObsID, p.interval)
ticker := time.NewTicker(p.interval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if p.store != nil {
// Ingest new transmissions into in-memory store and broadcast
newTxs, newMax := p.store.IngestNewFromDB(lastID, 100)
if newMax > lastID {
lastID = newMax
}
// Ingest new observations for existing transmissions (fixes #174)
nextObsID := lastObsID
if err := p.db.conn.QueryRow(`
SELECT COALESCE(MAX(id), ?) FROM (
SELECT id FROM observations
WHERE id > ?
ORDER BY id ASC
LIMIT 500
)`, lastObsID, lastObsID).Scan(&nextObsID); err != nil {
nextObsID = lastObsID
}
newObs := p.store.IngestNewObservations(lastObsID, 500)
if nextObsID > lastObsID {
lastObsID = nextObsID
}
if len(newTxs) > 0 {
log.Printf("[broadcast] sending %d packets to %d clients (lastID now %d)", len(newTxs), p.hub.ClientCount(), lastID)
}
for _, tx := range newTxs {
p.hub.Broadcast(WSMessage{
Type: "packet",
Data: tx,
})
}
for _, obs := range newObs {
p.hub.Broadcast(WSMessage{
Type: "packet",
Data: obs,
})
}
} else {
// Fallback: direct DB query (used when store is nil, e.g. tests)
newTxs, err := p.db.GetNewTransmissionsSince(lastID, 100)
if err != nil {
log.Printf("[poller] error: %v", err)
continue
}
for _, tx := range newTxs {
id, _ := tx["id"].(int)
if id > lastID {
lastID = id
}
// Copy packet fields for the nested packet (avoids circular ref)
pkt := make(map[string]interface{}, len(tx))
for k, v := range tx {
pkt[k] = v
}
tx["packet"] = pkt
p.hub.Broadcast(WSMessage{
Type: "packet",
Data: tx,
})
}
}
case <-p.stop:
return
}
}
}
func (p *Poller) Stop() {
close(p.stop)
}