mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-04-04 06:56:32 +00:00
## Summary Fixes #450 — staging deployment flaky due to container not shutting down cleanly. ## Root Causes 1. **Server never closed DB on shutdown** — SQLite WAL lock held indefinitely, blocking new container startup 2. **`httpServer.Close()` instead of `Shutdown()`** — abruptly kills connections instead of draining them 3. **No `stop_grace_period` in compose configs** — Docker sends SIGTERM then immediately SIGKILL (default 10s is often not enough for WAL checkpoint) 4. **Supervisor didn't forward SIGTERM** — missing `stopsignal`/`stopwaitsecs` meant Go processes got SIGKILL instead of graceful shutdown 5. **Deploy scripts used default `docker stop` timeout** — only 10s grace period ## Changes ### Go Server (`cmd/server/`) - **Graceful HTTP shutdown**: `httpServer.Shutdown(ctx)` with 15s context timeout — drains in-flight requests before closing - **WebSocket cleanup**: New `Hub.Close()` method sends `CloseGoingAway` frames to all connected clients - **DB close on shutdown**: Explicitly closes DB after HTTP server stops (was never closed before) - **WAL checkpoint**: `PRAGMA wal_checkpoint(TRUNCATE)` before DB close — flushes WAL to main DB file and removes WAL/SHM lock files ### Go Ingestor (`cmd/ingestor/`) - **WAL checkpoint on shutdown**: New `Store.Checkpoint()` method, called before `Close()` - **Longer MQTT disconnect timeout**: 5s (was 1s) to allow in-flight messages to drain ### Docker Compose (all 4 variants) - Added `stop_grace_period: 30s` and `stop_signal: SIGTERM` ### Supervisor Configs (both variants) - Added `stopsignal=TERM` and `stopwaitsecs=20` to server and ingestor programs ### Deploy Scripts - `deploy-staging.sh`: `docker stop -t 30` with explicit grace period - `deploy-live.sh`: `docker stop -t 30` with explicit grace period ## Shutdown Sequence (after fix) 1. Docker sends SIGTERM to supervisord (PID 1) 2. Supervisord forwards SIGTERM to server + ingestor (waits up to 20s each) 3. Server: stops poller → drains HTTP (15s) → closes WS clients → checkpoints WAL → closes DB 4. Ingestor: stops tickers → disconnects MQTT (5s) → checkpoints WAL → closes DB 5. Docker waits up to 30s total before SIGKILL ## Tests All existing tests pass: - `cd cmd/server && go test ./...` ✅ - `cd cmd/ingestor && go test ./...` ✅ --------- Co-authored-by: you <you@example.com> Co-authored-by: Kpa-clawbot <kpabap+clawdbot@gmail.com>
274 lines
6.3 KiB
Go
274 lines
6.3 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"log"
|
|
"net/http"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/gorilla/websocket"
|
|
)
|
|
|
|
var upgrader = websocket.Upgrader{
|
|
ReadBufferSize: 1024,
|
|
WriteBufferSize: 4096,
|
|
CheckOrigin: func(r *http.Request) bool { return true },
|
|
}
|
|
|
|
// Hub manages WebSocket clients and broadcasts.
|
|
type Hub struct {
|
|
mu sync.RWMutex
|
|
clients map[*Client]bool
|
|
}
|
|
|
|
// Client is a single WebSocket connection.
|
|
type Client struct {
|
|
conn *websocket.Conn
|
|
send chan []byte
|
|
closeOnce sync.Once
|
|
}
|
|
|
|
func NewHub() *Hub {
|
|
return &Hub{
|
|
clients: make(map[*Client]bool),
|
|
}
|
|
}
|
|
|
|
func (h *Hub) ClientCount() int {
|
|
h.mu.RLock()
|
|
defer h.mu.RUnlock()
|
|
return len(h.clients)
|
|
}
|
|
|
|
func (h *Hub) Register(c *Client) {
|
|
h.mu.Lock()
|
|
h.clients[c] = true
|
|
h.mu.Unlock()
|
|
log.Printf("[ws] client connected (%d total)", h.ClientCount())
|
|
}
|
|
|
|
func (h *Hub) Unregister(c *Client) {
|
|
h.mu.Lock()
|
|
if _, ok := h.clients[c]; ok {
|
|
delete(h.clients, c)
|
|
c.closeOnce.Do(func() { close(c.send) })
|
|
}
|
|
h.mu.Unlock()
|
|
log.Printf("[ws] client disconnected (%d total)", h.ClientCount())
|
|
}
|
|
|
|
// Close gracefully disconnects all WebSocket clients.
|
|
func (h *Hub) Close() {
|
|
h.mu.Lock()
|
|
for c := range h.clients {
|
|
c.conn.WriteControl(
|
|
websocket.CloseMessage,
|
|
websocket.FormatCloseMessage(websocket.CloseGoingAway, "server shutting down"),
|
|
time.Now().Add(3*time.Second),
|
|
)
|
|
c.closeOnce.Do(func() { close(c.send) })
|
|
delete(h.clients, c)
|
|
}
|
|
h.mu.Unlock()
|
|
log.Println("[ws] all clients disconnected")
|
|
}
|
|
|
|
// Broadcast sends a message to all connected clients.
|
|
func (h *Hub) Broadcast(msg interface{}) {
|
|
data, err := json.Marshal(msg)
|
|
if err != nil {
|
|
log.Printf("[ws] marshal error: %v", err)
|
|
return
|
|
}
|
|
h.mu.RLock()
|
|
defer h.mu.RUnlock()
|
|
for c := range h.clients {
|
|
select {
|
|
case c.send <- data:
|
|
default:
|
|
// Client buffer full — drop
|
|
}
|
|
}
|
|
}
|
|
|
|
// ServeWS handles the WebSocket upgrade and runs the client.
|
|
func (h *Hub) ServeWS(w http.ResponseWriter, r *http.Request) {
|
|
conn, err := upgrader.Upgrade(w, r, nil)
|
|
if err != nil {
|
|
log.Printf("[ws] upgrade error: %v", err)
|
|
return
|
|
}
|
|
|
|
client := &Client{
|
|
conn: conn,
|
|
send: make(chan []byte, 256),
|
|
}
|
|
h.Register(client)
|
|
|
|
go client.writePump()
|
|
go client.readPump(h)
|
|
}
|
|
|
|
// wsOrStatic upgrades WebSocket requests at any path, serves static files otherwise.
|
|
func wsOrStatic(hub *Hub, static http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
if strings.EqualFold(r.Header.Get("Upgrade"), "websocket") {
|
|
hub.ServeWS(w, r)
|
|
return
|
|
}
|
|
static.ServeHTTP(w, r)
|
|
})
|
|
}
|
|
|
|
func (c *Client) readPump(hub *Hub) {
|
|
defer func() {
|
|
hub.Unregister(c)
|
|
c.conn.Close()
|
|
}()
|
|
c.conn.SetReadLimit(512)
|
|
c.conn.SetReadDeadline(time.Now().Add(60 * time.Second))
|
|
c.conn.SetPongHandler(func(string) error {
|
|
c.conn.SetReadDeadline(time.Now().Add(60 * time.Second))
|
|
return nil
|
|
})
|
|
for {
|
|
_, _, err := c.conn.ReadMessage()
|
|
if err != nil {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
func (c *Client) writePump() {
|
|
ticker := time.NewTicker(30 * time.Second)
|
|
defer func() {
|
|
ticker.Stop()
|
|
c.conn.Close()
|
|
}()
|
|
for {
|
|
select {
|
|
case message, ok := <-c.send:
|
|
c.conn.SetWriteDeadline(time.Now().Add(10 * time.Second))
|
|
if !ok {
|
|
c.conn.WriteMessage(websocket.CloseMessage, []byte{})
|
|
return
|
|
}
|
|
if err := c.conn.WriteMessage(websocket.TextMessage, message); err != nil {
|
|
return
|
|
}
|
|
case <-ticker.C:
|
|
c.conn.SetWriteDeadline(time.Now().Add(10 * time.Second))
|
|
if err := c.conn.WriteMessage(websocket.PingMessage, nil); err != nil {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Poller watches for new transmissions in SQLite and broadcasts them.
|
|
type Poller struct {
|
|
db *DB
|
|
hub *Hub
|
|
store *PacketStore // optional: if set, new transmissions are ingested into memory
|
|
interval time.Duration
|
|
stop chan struct{}
|
|
}
|
|
|
|
func NewPoller(db *DB, hub *Hub, interval time.Duration) *Poller {
|
|
return &Poller{db: db, hub: hub, interval: interval, stop: make(chan struct{})}
|
|
}
|
|
|
|
func (p *Poller) Start() {
|
|
lastID := p.db.GetMaxTransmissionID()
|
|
lastObsID := p.db.GetMaxObservationID()
|
|
// If the store already loaded data, use its max IDs as a floor.
|
|
// This prevents replaying the entire DB when the DB query fails
|
|
// (e.g., corrupted DB returns 0 from COALESCE).
|
|
if p.store != nil {
|
|
if storeMax := p.store.MaxTransmissionID(); storeMax > lastID {
|
|
lastID = storeMax
|
|
}
|
|
if storeMaxObs := p.store.MaxObservationID(); storeMaxObs > lastObsID {
|
|
lastObsID = storeMaxObs
|
|
}
|
|
}
|
|
log.Printf("[poller] starting from transmission ID %d, obs ID %d, interval %v", lastID, lastObsID, p.interval)
|
|
|
|
ticker := time.NewTicker(p.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
if p.store != nil {
|
|
// Ingest new transmissions into in-memory store and broadcast
|
|
newTxs, newMax := p.store.IngestNewFromDB(lastID, 100)
|
|
if newMax > lastID {
|
|
lastID = newMax
|
|
}
|
|
// Ingest new observations for existing transmissions (fixes #174)
|
|
nextObsID := lastObsID
|
|
if err := p.db.conn.QueryRow(`
|
|
SELECT COALESCE(MAX(id), ?) FROM (
|
|
SELECT id FROM observations
|
|
WHERE id > ?
|
|
ORDER BY id ASC
|
|
LIMIT 500
|
|
)`, lastObsID, lastObsID).Scan(&nextObsID); err != nil {
|
|
nextObsID = lastObsID
|
|
}
|
|
newObs := p.store.IngestNewObservations(lastObsID, 500)
|
|
if nextObsID > lastObsID {
|
|
lastObsID = nextObsID
|
|
}
|
|
if len(newTxs) > 0 {
|
|
log.Printf("[broadcast] sending %d packets to %d clients (lastID now %d)", len(newTxs), p.hub.ClientCount(), lastID)
|
|
}
|
|
for _, tx := range newTxs {
|
|
p.hub.Broadcast(WSMessage{
|
|
Type: "packet",
|
|
Data: tx,
|
|
})
|
|
}
|
|
for _, obs := range newObs {
|
|
p.hub.Broadcast(WSMessage{
|
|
Type: "packet",
|
|
Data: obs,
|
|
})
|
|
}
|
|
} else {
|
|
// Fallback: direct DB query (used when store is nil, e.g. tests)
|
|
newTxs, err := p.db.GetNewTransmissionsSince(lastID, 100)
|
|
if err != nil {
|
|
log.Printf("[poller] error: %v", err)
|
|
continue
|
|
}
|
|
for _, tx := range newTxs {
|
|
id, _ := tx["id"].(int)
|
|
if id > lastID {
|
|
lastID = id
|
|
}
|
|
// Copy packet fields for the nested packet (avoids circular ref)
|
|
pkt := make(map[string]interface{}, len(tx))
|
|
for k, v := range tx {
|
|
pkt[k] = v
|
|
}
|
|
tx["packet"] = pkt
|
|
p.hub.Broadcast(WSMessage{
|
|
Type: "packet",
|
|
Data: tx,
|
|
})
|
|
}
|
|
}
|
|
case <-p.stop:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (p *Poller) Stop() {
|
|
close(p.stop)
|
|
}
|