From fcba2a9f3de9518acb574e255282feeff7a5d294 Mon Sep 17 00:00:00 2001 From: Kpa-clawbot Date: Sat, 11 Apr 2026 21:25:23 -0700 Subject: [PATCH] fix: set PRAGMA busy_timeout on all RW SQLite connections (#707) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem `SQLITE_BUSY` contention between the ingestor and server's async persistence goroutine drops `resolved_path` and `neighbor_edges` updates. The DSN parameter `_busy_timeout=10000` may not be honored by the modernc/sqlite driver. ## Fix - **`openRW()` now sets `PRAGMA busy_timeout = 5000`** after opening the connection, guaranteeing SQLite retries for up to 5 seconds before returning `SQLITE_BUSY` - **Refactored `PruneOldPackets` and `PruneOldMetrics`** to use `openRW()` instead of duplicating connection setup — all RW connections now get consistent busy_timeout handling - Added test verifying the pragma is set correctly ## Changes | File | Change | |------|--------| | `cmd/server/neighbor_persist.go` | `openRW()` sets `PRAGMA busy_timeout = 5000` after open | | `cmd/server/db.go` | `PruneOldPackets` and `PruneOldMetrics` use `openRW()` instead of inline `sql.Open` | | `cmd/server/neighbor_persist_test.go` | `TestOpenRW_BusyTimeout` verifies pragma is set | ## Performance No performance impact — `PRAGMA busy_timeout` is a connection-level setting with zero overhead on uncontended writes. Under contention, it converts immediate `SQLITE_BUSY` failures into brief retries (up to 5s), which is strictly better than dropping data. Fixes #705 --------- Co-authored-by: you --- cmd/server/db.go | 8 ++------ cmd/server/neighbor_persist.go | 8 +++++++- cmd/server/neighbor_persist_test.go | 28 ++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/cmd/server/db.go b/cmd/server/db.go index 719a57e..b0ed653 100644 --- a/cmd/server/db.go +++ b/cmd/server/db.go @@ -1704,12 +1704,10 @@ func nullInt(ni sql.NullInt64) interface{} { // Returns the number of transmissions deleted. // Opens a separate read-write connection since the main connection is read-only. func (db *DB) PruneOldPackets(days int) (int64, error) { - dsn := fmt.Sprintf("file:%s?_journal_mode=WAL&_busy_timeout=10000", db.path) - rw, err := sql.Open("sqlite", dsn) + rw, err := openRW(db.path) if err != nil { return 0, err } - rw.SetMaxOpenConns(1) defer rw.Close() cutoff := time.Now().UTC().AddDate(0, 0, -days).Format(time.RFC3339) @@ -2053,12 +2051,10 @@ func (db *DB) GetMetricsSummary(since string) ([]MetricsSummaryRow, error) { // PruneOldMetrics deletes observer_metrics rows older than retentionDays. func (db *DB) PruneOldMetrics(retentionDays int) (int64, error) { - dsn := fmt.Sprintf("file:%s?_journal_mode=WAL&_busy_timeout=10000", db.path) - rw, err := sql.Open("sqlite", dsn) + rw, err := openRW(db.path) if err != nil { return 0, err } - rw.SetMaxOpenConns(1) defer rw.Close() cutoff := time.Now().UTC().AddDate(0, 0, -retentionDays).Format(time.RFC3339) diff --git a/cmd/server/neighbor_persist.go b/cmd/server/neighbor_persist.go index dfa0e66..4539ec6 100644 --- a/cmd/server/neighbor_persist.go +++ b/cmd/server/neighbor_persist.go @@ -584,12 +584,18 @@ func extractEdgesFromObs(obs *StoreObs, tx *StoreTx, pm *prefixMap) []edgeCandid // openRW opens a read-write SQLite connection (same pattern as PruneOldPackets). func openRW(dbPath string) (*sql.DB, error) { - dsn := fmt.Sprintf("file:%s?_journal_mode=WAL&_busy_timeout=10000", dbPath) + dsn := fmt.Sprintf("file:%s?_journal_mode=WAL", dbPath) rw, err := sql.Open("sqlite", dsn) if err != nil { return nil, err } rw.SetMaxOpenConns(1) + // DSN _busy_timeout may not be honored by all drivers; set via PRAGMA + // to guarantee SQLite retries for up to 5s before returning SQLITE_BUSY. + if _, err := rw.Exec("PRAGMA busy_timeout = 5000"); err != nil { + rw.Close() + return nil, fmt.Errorf("set busy_timeout: %w", err) + } return rw, nil } diff --git a/cmd/server/neighbor_persist_test.go b/cmd/server/neighbor_persist_test.go index 66fe35e..c39f138 100644 --- a/cmd/server/neighbor_persist_test.go +++ b/cmd/server/neighbor_persist_test.go @@ -532,3 +532,31 @@ func TestPersistSemaphoreTryAcquireSkipsBatch(t *testing.T) { <-persistSem // release } + +func TestOpenRW_BusyTimeout(t *testing.T) { + dir := t.TempDir() + dbPath := filepath.Join(dir, "test.db") + + // Create the DB file first + db, err := sql.Open("sqlite", "file:"+dbPath+"?_journal_mode=WAL") + if err != nil { + t.Fatal(err) + } + db.Exec("CREATE TABLE dummy (id INTEGER)") + db.Close() + + // Open via openRW and verify busy_timeout is set + rw, err := openRW(dbPath) + if err != nil { + t.Fatalf("openRW failed: %v", err) + } + defer rw.Close() + + var timeout int + if err := rw.QueryRow("PRAGMA busy_timeout").Scan(&timeout); err != nil { + t.Fatalf("query busy_timeout: %v", err) + } + if timeout != 5000 { + t.Errorf("expected busy_timeout=5000, got %d", timeout) + } +}