mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-05-12 16:24:49 +00:00
74dffa2fb7
## Summary Implements per-component disk I/O + write source metrics on the Perf page so operators can self-diagnose write-volume anomalies (cf. the BackfillPathJSON loop debugged in #1119) without SSHing in to run iotop/fatrace. Partial fix for #1120 ## What's done (4/6 ACs) - ✅ `/api/perf/io` — server-process `/proc/self/io` delta rates (read/write bytes per sec, syscalls) - ✅ `/api/perf/sqlite` — WAL size, page count, page size, cache hit rate - ✅ `/api/perf/write-sources` — per-component counters from ingestor (tx/obs/upserts/backfill_*) - ✅ Frontend Perf page — three new sections with anomaly thresholds + per-second rate columns ## What's NOT done (deferred to follow-up) - ❌ `cancelledWriteBytesPerSec` field — issue #1120 lists this under server-process I/O ("writes the kernel discarded — interesting signal"); not exposed in this PR - ❌ Ingestor `/proc/<pid>/io` — issue #1120 says "Both ingestor and server"; only server-process I/O lands here. Adding ingestor I/O requires either a unix socket back to the server, or surfacing the ingestor pid through the stats file. Doable without changing the existing API shape. - ❌ Adaptive baselining — anomaly thresholds remain static (10×, 100 MB, 90%); steady-state baselining can come once we have enough deployed Perf-page telemetry Per AGENTS.md rule 34, this PR uses "Partial fix for #1120" rather than "Fixes #1120" so the issue stays open until the remaining ACs land. ## Backend **Server (`cmd/server/perf_io.go`)** - `GET /api/perf/io` — reads `/proc/self/io` and returns delta-rate `{readBytesPerSec, writeBytesPerSec, syscallsRead, syscallsWrite}` since last call (in-memory tracker, no allocation per sample). - `GET /api/perf/sqlite` — returns `{walSize, walSizeMB, pageCount, pageSize, cacheSize, cacheHitRate}`. `cacheHitRate` is proxied from the in-process row cache (closest available signal under the modernc sqlite driver). - `GET /api/perf/write-sources` — reads the ingestor's stats JSON file and returns a flat `{sources: {...}, sampleAt}` payload. **Ingestor (`cmd/ingestor/`)** - `DBStats` gains `WALCommits atomic.Int64` (incremented on every successful `tx.Commit()` and on every auto-commit `InsertTransmission` write) and `BackfillUpdates sync.Map` keyed by backfill name with `IncBackfill(name)` / `SnapshotBackfills()` helpers. - `BackfillPathJSONAsync` now increments `BackfillUpdates["path_json"]` per row write — the BackfillPathJSON-style infinite loop becomes immediately visible at `backfill_path_json` in the Write Sources table. - New `StartStatsFileWriter` publishes a JSON snapshot to `/tmp/corescope-ingestor-stats.json` (override via `CORESCOPE_INGESTOR_STATS`) every second using atomic tmp+rename. The tmp file is opened with `O_CREATE|O_WRONLY|O_TRUNC|O_NOFOLLOW` mode `0o600` so a pre-planted symlink in a world-writable `/tmp` cannot redirect the write to an arbitrary file. ## Frontend (`public/perf.js`) Three new sections on the Perf page, all auto-refreshed via the existing 5s interval: - **Disk I/O (server process)** — read/write rates (formatted B/KB/MB-per-sec) + syscall counts. Write rate >10 MB/s flags ⚠️. - **Write Sources** — sorted table of per-component counters with a per-second rate column derived from snapshot deltas. Backfill rows show ⚠️ only when `tx_inserted >= 100` (meaningful baseline) AND the backfill's per-second rate exceeds 10× the live tx rate. Avoids the startup-spurious-alarm where cumulative-vs-cumulative was a tautology. - **SQLite (WAL + Cache Hit)** — WAL size (⚠️ when >100 MB), page count, page size, cache hit rate (⚠️ when <90%). ## Tests - **Backend** (`cmd/server/perf_io_test.go`) — `TestPerfIOEndpoint_ReturnsValidJSON`, `TestPerfSqliteEndpoint_ReturnsValidJSON`, `TestPerfWriteSourcesEndpoint_ReturnsSources` exercise the three new endpoints. Skips the `/proc/self/io` non-zero-rate assertion when `/proc` is unavailable. - **Frontend** (`test-perf-disk-io-1120.js`) — vm-sandbox runs `perf.js` with stubbed `fetch`, asserts the three new sections render with their headings + values. E2E assertion added: test-perf-disk-io-1120.js:91 ## TDD 1. Red commit (`21abd22`) — added the three handlers as no-op stubs returning empty values; tests fail on assertion mismatches (non-zero rate, `pageSize > 0`, headings present). 2. Green commit (`d8da54c`) — fills in the real `/proc/self/io` parser, PRAGMA queries, ingestor stats writer, and Perf page rendering. --------- Co-authored-by: corescope-bot <bot@corescope.local> Co-authored-by: Kpa-clawbot <kpa-clawbot@users.noreply.github.com>
97 lines
2.7 KiB
Go
97 lines
2.7 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"os"
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
func TestPerfIOEndpoint_ReturnsValidJSON(t *testing.T) {
|
|
_, router := setupTestServer(t)
|
|
|
|
req := httptest.NewRequest("GET", "/api/perf/io", nil)
|
|
w := httptest.NewRecorder()
|
|
router.ServeHTTP(w, req)
|
|
|
|
if w.Code != http.StatusOK {
|
|
t.Fatalf("expected 200, got %d", w.Code)
|
|
}
|
|
var body map[string]interface{}
|
|
if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
|
|
t.Fatalf("invalid JSON: %v", err)
|
|
}
|
|
for _, field := range []string{"readBytesPerSec", "writeBytesPerSec", "syscallsRead", "syscallsWrite"} {
|
|
if _, ok := body[field]; !ok {
|
|
t.Errorf("missing field %q", field)
|
|
}
|
|
}
|
|
|
|
// /proc/self/io only exists on Linux. When absent (e.g. some test
|
|
// containers) we still expect well-formed JSON but skip the non-zero
|
|
// delta assertion.
|
|
if _, err := os.Stat("/proc/self/io"); err != nil {
|
|
t.Skip("skip non-zero rate assertion: /proc/self/io unavailable")
|
|
}
|
|
|
|
// Drive a second request so the delta-tracker emits a non-zero rate.
|
|
// Generate a small read-bytes signal between the two reads.
|
|
req2 := httptest.NewRequest("GET", "/api/perf/io", nil)
|
|
w2 := httptest.NewRecorder()
|
|
router.ServeHTTP(w2, req2)
|
|
var body2 map[string]interface{}
|
|
json.Unmarshal(w2.Body.Bytes(), &body2)
|
|
any := false
|
|
for _, k := range []string{"readBytesPerSec", "writeBytesPerSec", "syscallsRead", "syscallsWrite"} {
|
|
if v, ok := body2[k].(float64); ok && v > 0 {
|
|
any = true
|
|
break
|
|
}
|
|
}
|
|
if !any {
|
|
t.Errorf("expected at least one non-zero rate after second sample, got %v", body2)
|
|
}
|
|
}
|
|
|
|
func TestPerfSqliteEndpoint_ReturnsValidJSON(t *testing.T) {
|
|
_, router := setupTestServer(t)
|
|
|
|
req := httptest.NewRequest("GET", "/api/perf/sqlite", nil)
|
|
w := httptest.NewRecorder()
|
|
router.ServeHTTP(w, req)
|
|
|
|
if w.Code != http.StatusOK {
|
|
t.Fatalf("expected 200, got %d", w.Code)
|
|
}
|
|
var body map[string]interface{}
|
|
if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
|
|
t.Fatalf("invalid JSON: %v", err)
|
|
}
|
|
for _, field := range []string{"walSize", "pageCount", "pageSize", "cacheHitRate"} {
|
|
if _, ok := body[field]; !ok {
|
|
t.Errorf("missing field %q", field)
|
|
}
|
|
}
|
|
// pageSize must be > 0 for any open SQLite DB
|
|
if v, ok := body["pageSize"].(float64); !ok || v <= 0 {
|
|
t.Errorf("expected pageSize > 0, got %v", body["pageSize"])
|
|
}
|
|
}
|
|
|
|
func TestPerfWriteSourcesEndpoint_ReturnsSources(t *testing.T) {
|
|
_, router := setupTestServer(t)
|
|
|
|
req := httptest.NewRequest("GET", "/api/perf/write-sources", nil)
|
|
w := httptest.NewRecorder()
|
|
router.ServeHTTP(w, req)
|
|
if w.Code != http.StatusOK {
|
|
t.Fatalf("expected 200, got %d", w.Code)
|
|
}
|
|
body := w.Body.String()
|
|
if !strings.Contains(body, "sources") {
|
|
t.Errorf("response missing 'sources' key: %s", body)
|
|
}
|
|
}
|