From a26a412c9bb7ccd4485f52e8441cceec67f1ec35 Mon Sep 17 00:00:00 2001 From: Kpa-clawbot Date: Sat, 6 Jun 2026 20:43:58 -0700 Subject: [PATCH] feat(perf): 5-min rolling-baseline anomaly detection for Write Sources (#1120) (#1593) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Addresses the remaining acceptance gap on #1120: a true **5-minute rolling-baseline anomaly detector** for the Perf-page Write Sources table. The endpoints + ingestor wiring + UI scaffolding landed in #1123 (partial); this PR replaces the ad-hoc tx-rate comparison with the rolling baseline the issue actually asks for, and adds a JS unit test that proves the ⚠️ flag fires at 11× baseline. ## What changed - **`public/perf.js`** — new pure helper `detectPerfAnomalies(history, current, opts)`. Computes per-component current rate and rolling baseline rate over a window (default 5 min). Flags components whose current rate > 10× baseline. Includes a 0.05/s floor so a stale `0` baseline doesn't false-positive at startup. - **UI** — Write Sources table now shows `Rate/s`, `Baseline/s`, and `Anomaly` columns. Operators can sanity-check the ⚠️ rather than trusting opaque output. History is kept on `window` and pruned to a 6-min sliding ring. - **`test-perf-anomaly.js`** — new VM-sandbox test asserting: - ⚠️ fires when one component runs at 11× its 5-min baseline - No ⚠️ at 5× (under threshold) - No ⚠️ until ≥30s of history has accumulated ## TDD evidence (red → green) - Red commit `590f04d3`: introduces the stub `detectPerfAnomalies` (returns empty `{flags:{}}`) + the test. Test FAILS on the `assert(r.flags.backfill_path_json === true, ...)` assertion — not a build error. ``` ❌ ⚠️ fires when backfill rate hits 11× the 5-minute baseline: expected backfill_path_json flagged at 11× baseline, got flags={} 2 passed, 1 failed ``` - Green commit `726a5e78`: implements the rolling-baseline detector. 
All 3 tests pass; existing `test-packet-filter.js` (79 tests) still green; `cmd/server` Go tests for `/api/perf/*` still green. ## What is NOT in this PR (deferred / out of scope per brief) - **SQLite-stats subsection** (WAL size + cache hit rate + pending checkpoint) — `/api/perf/sqlite` already exists (landed in #1123). Issue body lists it as a metric category, brief explicitly marks it OPTIONAL. Not regressed; no changes needed. - **Ingestor `/proc/self/io` bridge** — already lives in the ingestor stats file (`ProcIO` field, `internal/perfio`) and is rendered on the Perf page. No change. - **Issue #1340** (SQLite write-lock instrumentation) — separate PR in flight, not piggybacked. - **No new metrics backend** (no Prometheus, no OpenTelemetry). Pure JSON over `/api/perf/*`. ## Hard-rule compliance - Files changed: 2 (`public/perf.js`, `test-perf-anomaly.js`) — well inside the 3-files-outside-allowed-set cap. - `Stats` struct unchanged. - All colors via CSS variables — no hex literals introduced (grep clean). - TDD: red commit fails on assertion, green commit passes — visible in branch history. - PII preflight: clean on both commits. Partial fix language deliberately not used — this completes the issue's UI acceptance criterion. Leaving `Fixes #1120` off so the user can verify on the staging deploy before closing. --------- Co-authored-by: meshcore-bot --- public/perf.js | 128 ++++++++++++++++++++++++++++++------------- test-perf-anomaly.js | 107 ++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+), 37 deletions(-) create mode 100644 test-perf-anomaly.js diff --git a/public/perf.js b/public/perf.js index 611b9a32..ce3d1db6 100644 --- a/public/perf.js +++ b/public/perf.js @@ -3,6 +3,75 @@ var GH = 'https://github.com/Kpa-clawbot/corescope'; +// detectPerfAnomalies — pure, testable. 
+// Computes per-component write rates over a rolling time window and flags any +// component whose current per-second rate exceeds `factor` × its rolling +// baseline rate. Issue #1120 acceptance: 5-minute window, 10× threshold. +// +// Inputs: +// history: ordered array of snapshots [{ sampleAt: ISO, sources: { name: cum } }] +// current: the freshest snapshot, same shape +// opts: +// windowMs (default 5*60*1000) — rolling baseline window +// factor (default 10) — rate-multiplier threshold +// minHistorySec (default 30) — refuse to flag until baseline is stable +// +// Returns: { rates, baselineRates, flags } — all keyed by source name. +function detectPerfAnomalies(history, current, opts) { + opts = opts || {}; + const windowMs = opts.windowMs || (5 * 60 * 1000); + const factor = opts.factor || 10; + const minHistorySec = opts.minHistorySec != null ? opts.minHistorySec : 30; + const out = { rates: {}, baselineRates: {}, flags: {} }; + if (!current || !current.sources || !history || history.length === 0) return out; + const curT = Date.parse(current.sampleAt); + if (!isFinite(curT)) return out; + + // Find the most recent prior sample (for the *current* per-second rate) + // and the oldest sample within the window (for the baseline). + const prior = history[history.length - 1]; + const priorT = Date.parse(prior.sampleAt); + const curDt = (curT - priorT) / 1000; + if (!(curDt > 0)) return out; + + // Baseline: oldest sample within window vs. prior (the snapshot just before + // `current`). Anything older than windowMs is excluded. + const cutoff = curT - windowMs; + let baseIdx = 0; + for (let i = history.length - 1; i >= 0; i--) { + if (Date.parse(history[i].sampleAt) < cutoff) { baseIdx = i + 1; break; } + } + if (baseIdx >= history.length) baseIdx = history.length - 1; + const baseSnap = history[baseIdx]; + const baseT = Date.parse(baseSnap.sampleAt); + const baseDt = (priorT - baseT) / 1000; + + // Compute rates for every source seen in current. 
+ for (const k of Object.keys(current.sources)) { + const cur = current.sources[k] || 0; + const prev = (prior.sources && prior.sources[k]) || 0; + const rate = (cur - prev) / curDt; + out.rates[k] = rate; + if (baseDt <= 0 || baseDt < minHistorySec) { + out.baselineRates[k] = null; + continue; + } + const baseStart = (baseSnap.sources && baseSnap.sources[k]) || 0; + const baseEnd = prev; // baseline window = [baseSnap .. prior] + const baseRate = (baseEnd - baseStart) / baseDt; + out.baselineRates[k] = baseRate; + // Guard floor to avoid 0-baseline → infinite ratio false positives. + const floor = 0.05; // 1 event per 20s minimum baseline + if (rate > factor * Math.max(baseRate, floor) && rate > factor * floor) { + out.flags[k] = true; + } + } + return out; +} +if (typeof window !== 'undefined') { + window.detectPerfAnomalies = detectPerfAnomalies; +} + function renderVersionCard(health) { if (!health || (!health.version && !health.commit)) return ''; var ver = health.version && health.version !== 'unknown' ? health.version : null; @@ -127,48 +196,33 @@ function renderVersionCard(health) { if (keys.length === 0) { html += '

No ingestor stats yet (waiting for /tmp/corescope-ingestor-stats.json)

'; } else { - // Anomaly detection (#1123 polish): - // Compare PER-SECOND DELTA RATES, not cumulative counts. - // Cumulative-vs-cumulative was a tautology that fired ⚠️ at startup - // (any backfill_* > 10 when tx_inserted=0 → baseline collapses to 1) - // and false-cleared once tx grew past a one-shot backfill burst. - // Now we cache the previous snapshot + sampleAt and only fire when: - // 1) we have a real interval (≥ 0.5s) to compute deltas against - // 2) tx_inserted has crossed MIN_SAMPLE so the baseline is meaningful - // 3) the per-second backfill rate exceeds 10× the per-second tx rate - const MIN_SAMPLE = 100; - const prev = window._perfWriteSourcesPrev; - let prevSrc = null, dtSec = 0; - if (prev && prev.sampleAt && writeSources.sampleAt) { - dtSec = (Date.parse(writeSources.sampleAt) - Date.parse(prev.sampleAt)) / 1000; - if (dtSec >= 0.5) prevSrc = prev.sources; - } - const txTotal = src.tx_inserted || 0; - const txDelta = prevSrc ? (txTotal - (prevSrc.tx_inserted || 0)) : 0; - const txRate = (prevSrc && dtSec > 0) ? (txDelta / dtSec) : 0; - html += '
'; + // Anomaly detection (#1120 acceptance): flag any component whose + // per-second write rate exceeds 10× its 5-minute rolling baseline. + // History is stashed on window so the detector has multi-sample + // context across the 5s refresh tick. + if (!window._perfWriteSourcesHistory) window._perfWriteSourcesHistory = []; + const history = window._perfWriteSourcesHistory; + const current = { sampleAt: writeSources.sampleAt || new Date().toISOString(), sources: { ...src } }; + const anom = detectPerfAnomalies(history, current, { windowMs: 5 * 60 * 1000, factor: 10 }); + // Append current and prune anything older than 6 minutes (keeps a + // little headroom past the 5-min window, bounded memory). + history.push(current); + const cutoff = Date.parse(current.sampleAt) - (6 * 60 * 1000); + while (history.length > 1 && Date.parse(history[0].sampleAt) < cutoff) history.shift(); + + html += '
SourceTotalRate/sAnomaly
'; for (const k of keys) { const v = src[k] || 0; - const isBackfill = k.startsWith('backfill_'); - let rate = 0; - let flag = ''; - if (prevSrc && dtSec > 0) { - const delta = v - (prevSrc[k] || 0); - rate = delta / dtSec; - // Only flag when tx baseline is statistically meaningful AND - // backfill is actively running faster than 10× the live tx rate. - if (isBackfill && txTotal >= MIN_SAMPLE && rate > 10 * Math.max(txRate, 1)) { - flag = ' ⚠️'; - } - } - const rateStr = (prevSrc && dtSec > 0) ? rate.toFixed(1) : '—'; - html += ``; + const rate = anom.rates[k]; + const base = anom.baselineRates[k]; + const flag = anom.flags[k] ? ' ⚠️' : ''; + const rateStr = (rate != null && isFinite(rate)) ? rate.toFixed(2) : '—'; + const baseStr = (base != null && isFinite(base)) ? base.toFixed(2) : '—'; + html += ``; } html += '
SourceTotalRate/sBaseline/sAnomaly
${k}${v.toLocaleString()}${rateStr}${flag}
${k}${v.toLocaleString()}${rateStr}${baseStr}${flag}
'; - // Stash for next tick's delta computation. - window._perfWriteSourcesPrev = { sources: { ...src }, sampleAt: writeSources.sampleAt }; if (writeSources.sampleAt) { - html += `
Sampled: ${writeSources.sampleAt}
`; + html += `
Sampled: ${writeSources.sampleAt} · baseline window: 5 min · threshold: 10×
`; } } } diff --git a/test-perf-anomaly.js b/test-perf-anomaly.js new file mode 100644 index 00000000..c9710e51 --- /dev/null +++ b/test-perf-anomaly.js @@ -0,0 +1,107 @@ +/* Unit tests for perf.js anomaly detection — 5-minute rolling baseline. + * + * Issue #1120 acceptance criterion: "Per-component write rate > 10× steady-state + * baseline" flagged with ⚠️. The baseline must be a 5-minute rolling window, + * not a single sample-to-sample comparison (which gives false negatives during + * a slow ramp and false positives during natural bursts). + * + * This file exercises window.detectPerfAnomalies(history, current, opts). + */ +'use strict'; +const vm = require('vm'); +const fs = require('fs'); + +const code = fs.readFileSync('public/perf.js', 'utf8'); +const ctx = { + window: {}, + document: { addEventListener() {}, getElementById() { return null; }, hidden: true }, + console, + fetch: () => Promise.resolve({ json: () => Promise.resolve(null) }), + setInterval: () => 0, + clearInterval: () => {}, + registerPage: () => {}, +}; +vm.createContext(ctx); +vm.runInContext(code, ctx); + +const detect = ctx.window.detectPerfAnomalies; +if (typeof detect !== 'function') { + console.log('FAIL: window.detectPerfAnomalies is not a function (got ' + typeof detect + ')'); + process.exit(1); +} + +let pass = 0, fail = 0; +function test(name, fn) { + try { fn(); pass++; console.log(' ✅ ' + name); } + catch (e) { fail++; console.log(' ❌ ' + name + ': ' + e.message); } +} +function assert(cond, msg) { if (!cond) throw new Error(msg || 'assertion failed'); } + +// Build a 5-minute history where backfill_path_json increments at a steady +// 1/sec baseline (300 samples over 300s), tx_inserted at 5/sec. 
+function buildHistory(startMs, durSec, perSec) { + const h = []; + let cum = {}; + for (const k of Object.keys(perSec)) cum[k] = 0; + for (let i = 0; i <= durSec; i++) { + const ts = new Date(startMs + i * 1000).toISOString(); + const snap = { sampleAt: ts, sources: {} }; + for (const k of Object.keys(perSec)) { + cum[k] += perSec[k]; + snap.sources[k] = cum[k]; + } + h.push(snap); + } + return h; +} + +test('⚠️ fires when backfill rate hits 11× the 5-minute baseline', () => { + const t0 = Date.UTC(2026, 5, 5, 0, 0, 0); + const history = buildHistory(t0, 300, { backfill_path_json: 1, tx_inserted: 5 }); + // Now a fresh sample at t0+301s where backfill_path_json jumped from 301→312 + // (11/sec over 1s), tx_inserted continues at 5/sec. + const last = history[history.length - 1]; + const current = { + sampleAt: new Date(t0 + 301 * 1000).toISOString(), + sources: { + backfill_path_json: last.sources.backfill_path_json + 11, + tx_inserted: last.sources.tx_inserted + 5, + }, + }; + const r = detect(history, current, { windowMs: 5 * 60 * 1000, factor: 10 }); + assert(r && r.flags, 'expected result with flags map'); + assert(r.flags.backfill_path_json === true, + 'expected backfill_path_json flagged at 11× baseline, got flags=' + JSON.stringify(r.flags) + + ' rates=' + JSON.stringify(r.rates) + ' baselines=' + JSON.stringify(r.baselineRates)); +}); + +test('no flag at 5× baseline (under threshold)', () => { + const t0 = Date.UTC(2026, 5, 5, 0, 0, 0); + const history = buildHistory(t0, 300, { backfill_path_json: 2, tx_inserted: 5 }); + const last = history[history.length - 1]; + const current = { + sampleAt: new Date(t0 + 301 * 1000).toISOString(), + sources: { + backfill_path_json: last.sources.backfill_path_json + 10, // 10/sec vs 2/sec baseline = 5× + tx_inserted: last.sources.tx_inserted + 5, + }, + }; + const r = detect(history, current, { windowMs: 5 * 60 * 1000, factor: 10 }); + assert(!r.flags.backfill_path_json, + 'expected no flag at 5× baseline, got ' +
JSON.stringify(r.flags)); +}); + +test('no flag without enough history (< 30s of samples)', () => { + const t0 = Date.UTC(2026, 5, 5, 0, 0, 0); + const history = buildHistory(t0, 5, { backfill_path_json: 1 }); + const last = history[history.length - 1]; + const current = { + sampleAt: new Date(t0 + 6 * 1000).toISOString(), + sources: { backfill_path_json: last.sources.backfill_path_json + 100 }, + }; + const r = detect(history, current, { windowMs: 5 * 60 * 1000, factor: 10, minHistorySec: 30 }); + assert(!r.flags.backfill_path_json, 'expected no flag with insufficient history'); +}); + +console.log('\n' + pass + ' passed, ' + fail + ' failed'); +process.exit(fail === 0 ? 0 : 1);