fix: set PRAGMA busy_timeout on all RW SQLite connections (#707)

## Problem

`SQLITE_BUSY` contention between the ingestor and server's async
persistence goroutine drops `resolved_path` and `neighbor_edges`
updates. The DSN parameter `_busy_timeout=10000` may not be honored by
the modernc/sqlite driver.

## Fix

- **`openRW()` now sets `PRAGMA busy_timeout = 5000`** after opening the
connection, guaranteeing SQLite retries for up to 5 seconds before
returning `SQLITE_BUSY`
- **Refactored `PruneOldPackets` and `PruneOldMetrics`** to use
`openRW()` instead of duplicating connection setup — all RW connections
now get consistent busy_timeout handling
- Added test verifying the pragma is set correctly

## Changes

| File | Change |
|------|--------|
| `cmd/server/neighbor_persist.go` | `openRW()` sets `PRAGMA
busy_timeout = 5000` after open |
| `cmd/server/db.go` | `PruneOldPackets` and `PruneOldMetrics` use
`openRW()` instead of inline `sql.Open` |
| `cmd/server/neighbor_persist_test.go` | `TestOpenRW_BusyTimeout`
verifies pragma is set |

## Performance

No performance impact — `PRAGMA busy_timeout` is a connection-level
setting with zero overhead on uncontended writes. Under contention, it
converts immediate `SQLITE_BUSY` failures into brief retries (up to 5s),
which is strictly better than dropping data.

Fixes #705

---------

Co-authored-by: you <you@example.com>
This commit is contained in:
Kpa-clawbot
2026-04-11 21:25:23 -07:00
committed by GitHub
parent c6a0f91b07
commit fcba2a9f3d
3 changed files with 37 additions and 7 deletions

View File

@@ -1704,12 +1704,10 @@ func nullInt(ni sql.NullInt64) interface{} {
// Returns the number of transmissions deleted.
// Opens a separate read-write connection since the main connection is read-only.
func (db *DB) PruneOldPackets(days int) (int64, error) {
dsn := fmt.Sprintf("file:%s?_journal_mode=WAL&_busy_timeout=10000", db.path)
rw, err := sql.Open("sqlite", dsn)
rw, err := openRW(db.path)
if err != nil {
return 0, err
}
rw.SetMaxOpenConns(1)
defer rw.Close()
cutoff := time.Now().UTC().AddDate(0, 0, -days).Format(time.RFC3339)
@@ -2053,12 +2051,10 @@ func (db *DB) GetMetricsSummary(since string) ([]MetricsSummaryRow, error) {
// PruneOldMetrics deletes observer_metrics rows older than retentionDays.
func (db *DB) PruneOldMetrics(retentionDays int) (int64, error) {
dsn := fmt.Sprintf("file:%s?_journal_mode=WAL&_busy_timeout=10000", db.path)
rw, err := sql.Open("sqlite", dsn)
rw, err := openRW(db.path)
if err != nil {
return 0, err
}
rw.SetMaxOpenConns(1)
defer rw.Close()
cutoff := time.Now().UTC().AddDate(0, 0, -retentionDays).Format(time.RFC3339)

View File

@@ -584,12 +584,18 @@ func extractEdgesFromObs(obs *StoreObs, tx *StoreTx, pm *prefixMap) []edgeCandid
// openRW opens a read-write SQLite connection (same pattern as PruneOldPackets).
func openRW(dbPath string) (*sql.DB, error) {
dsn := fmt.Sprintf("file:%s?_journal_mode=WAL&_busy_timeout=10000", dbPath)
dsn := fmt.Sprintf("file:%s?_journal_mode=WAL", dbPath)
rw, err := sql.Open("sqlite", dsn)
if err != nil {
return nil, err
}
rw.SetMaxOpenConns(1)
// DSN _busy_timeout may not be honored by all drivers; set via PRAGMA
// to guarantee SQLite retries for up to 5s before returning SQLITE_BUSY.
if _, err := rw.Exec("PRAGMA busy_timeout = 5000"); err != nil {
rw.Close()
return nil, fmt.Errorf("set busy_timeout: %w", err)
}
return rw, nil
}

View File

@@ -532,3 +532,31 @@ func TestPersistSemaphoreTryAcquireSkipsBatch(t *testing.T) {
<-persistSem // release
}
func TestOpenRW_BusyTimeout(t *testing.T) {
dir := t.TempDir()
dbPath := filepath.Join(dir, "test.db")
// Create the DB file first
db, err := sql.Open("sqlite", "file:"+dbPath+"?_journal_mode=WAL")
if err != nil {
t.Fatal(err)
}
db.Exec("CREATE TABLE dummy (id INTEGER)")
db.Close()
// Open via openRW and verify busy_timeout is set
rw, err := openRW(dbPath)
if err != nil {
t.Fatalf("openRW failed: %v", err)
}
defer rw.Close()
var timeout int
if err := rw.QueryRow("PRAGMA busy_timeout").Scan(&timeout); err != nil {
t.Fatalf("query busy_timeout: %v", err)
}
if timeout != 5000 {
t.Errorf("expected busy_timeout=5000, got %d", timeout)
}
}