feat(perf): 5-min rolling-baseline anomaly detection for Write Sources (#1120) (#1593)

## Summary

Addresses the remaining acceptance gap on #1120: a true **5-minute rolling-baseline anomaly detector** for the Perf-page Write Sources table. The endpoints + ingestor wiring + UI scaffolding landed in #1123 (partial); this PR replaces the ad-hoc tx-rate comparison with the rolling baseline the issue actually asks for, and adds a JS unit test that proves the ⚠️ flag fires at 11× baseline.

## What changed

- **`public/perf.js`** — new pure helper `detectPerfAnomalies(history, current, opts)`. Computes per-component current rate and rolling baseline rate over a window (default 5 min). Flags components whose current rate > 10× baseline. Includes a 0.05/s floor so a stale `0` baseline doesn't false-positive at startup.
- **UI** — Write Sources table now shows `Rate/s`, `Baseline/s`, and `Anomaly` columns. Operators can sanity-check the ⚠️ rather than trusting opaque output. History is kept on `window` and pruned to a 6-min sliding ring.
- **`test-perf-anomaly.js`** — new VM-sandbox test asserting:
  - ⚠️ fires when one component runs at 11× its 5-min baseline
  - No ⚠️ at 5× (under threshold)
  - No ⚠️ until ≥30s of history has accumulated

## TDD evidence (red → green)

- Red commit `590f04d3`: introduces the stub `detectPerfAnomalies` (returns empty `{flags:{}}`) + the test. Test FAILS on the `assert(r.flags.backfill_path_json === true, ...)` assertion — not a build error.

  ```
  ❌ ⚠️ fires when backfill rate hits 11× the 5-minute baseline: expected backfill_path_json flagged at 11× baseline, got flags={}
  2 passed, 1 failed
  ```

- Green commit `726a5e78`: implements the rolling-baseline detector. All 3 tests pass; existing `test-packet-filter.js` (79 tests) still green; `cmd/server` Go tests for `/api/perf/*` still green.

## What is NOT in this PR (deferred / out of scope per brief)

- **SQLite-stats subsection** (WAL size + cache hit rate + pending checkpoint) — `/api/perf/sqlite` already exists (landed in #1123). Issue body lists it as a metric category, brief explicitly marks it OPTIONAL. Not regressed; no changes needed.
- **Ingestor `/proc/self/io` bridge** — already lives in the ingestor stats file (`ProcIO` field, `internal/perfio`) and is rendered on the Perf page. No change.
- **Issue #1340** (SQLite write-lock instrumentation) — separate PR in flight, not piggybacked.
- **No new metrics backend** (no Prometheus, no OpenTelemetry). Pure JSON over `/api/perf/*`.

## Hard-rule compliance

- Files changed: 2 (`public/perf.js`, `test-perf-anomaly.js`) — well inside the 3-files-outside-allowed-set cap.
- `Stats` struct unchanged.
- All colors via CSS variables — no hex literals introduced (grep clean).
- TDD: red commit fails on assertion, green commit passes — visible in branch history.
- PII preflight: clean on both commits.

Partial fix language deliberately not used — this completes the issue's UI acceptance criterion. Leaving `Fixes #1120` off so the user can verify on the staging deploy before closing.

---------

Co-authored-by: meshcore-bot <bot@meshcore>
2026-06-29 09:01:39 +00:00 · 2026-06-06 20:43:58 -07:00
parent d6384c3c59
commit a26a412c9b
2 changed files with 198 additions and 37 deletions
@@ -3,6 +3,75 @@

 var GH = 'https://github.com/Kpa-clawbot/corescope';

+// detectPerfAnomalies — pure, testable (no DOM access, no clock reads).
+// Computes per-component write rates over a rolling time window and flags any
+// component whose current per-second rate exceeds `factor` × its rolling
+// baseline rate. Issue #1120 acceptance: 5-minute window, 10× threshold.
+//
+// Inputs:
+//   history: snapshots ordered oldest → newest, taken before `current`:
+//            [{ sampleAt: ISO, sources: { name: cumulativeCount } }]
+//   current: the freshest snapshot, same shape
+//   opts:
+//     windowMs       (default 5*60*1000) — rolling baseline window
+//     factor         (default 10)        — rate-multiplier threshold
+//     minHistorySec  (default 30)        — refuse to flag until baseline is stable
+//
+// Returns: { rates, baselineRates, flags } — all keyed by source name, all empty
+// when no current rate can be computed. baselineRates[k] is null when there is
+// no trustworthy baseline: too little history, or a counter that went backwards.
+function detectPerfAnomalies(history, current, opts) {
+  opts = opts || {};
+  const windowMs = opts.windowMs || (5 * 60 * 1000);
+  const factor = opts.factor || 10;
+  const minHistorySec = opts.minHistorySec != null ? opts.minHistorySec : 30;
+  const floor = 0.05; // baseline floor (1 event per 20s): keeps a 0 baseline from flagging a trickle
+  const out = { rates: {}, baselineRates: {}, flags: {} };
+  if (!current || !current.sources || !history || history.length === 0) return out;
+  const curT = Date.parse(current.sampleAt);
+  if (!isFinite(curT)) return out;
+
+  // Current rate: delta between the most recent prior sample and `current`.
+  const prior = history[history.length - 1];
+  const priorT = Date.parse(prior.sampleAt);
+  const curDt = (curT - priorT) / 1000;
+  if (!(curDt > 0)) return out; // also rejects NaN from an unparsable prior timestamp
+
+  // Baseline: oldest sample still inside the window vs. prior, so the interval
+  // under test never feeds its own baseline. Scan newest → oldest for the cutoff.
+  const cutoff = curT - windowMs;
+  let baseIdx = 0;
+  for (let i = history.length - 1; i >= 0; i--) {
+    if (Date.parse(history[i].sampleAt) < cutoff) { baseIdx = i + 1; break; }
+  }
+  if (baseIdx >= history.length) baseIdx = history.length - 1;
+  const baseSnap = history[baseIdx];
+  const baseDt = (priorT - Date.parse(baseSnap.sampleAt)) / 1000;
+  // Positive test on purpose: a NaN span (bad timestamp) must count as "no baseline".
+  const haveBaseline = baseDt > 0 && baseDt >= minHistorySec;
+
+  for (const k of Object.keys(current.sources)) {
+    const cur = current.sources[k] || 0;
+    const prev = (prior.sources && prior.sources[k]) || 0;
+    const rate = (cur - prev) / curDt;
+    out.rates[k] = rate;
+    const baseStart = (baseSnap.sources && baseSnap.sources[k]) || 0;
+    // A cumulative counter lower at the end of the window than at its start was
+    // reset; the negative delta would clamp to the floor and flag ordinary traffic.
+    if (!haveBaseline || prev < baseStart) {
+      out.baselineRates[k] = null;
+      continue;
+    }
+    const baseRate = (prev - baseStart) / baseDt; // baseline window = [baseSnap .. prior]
+    out.baselineRates[k] = baseRate;
+    if (rate > factor * Math.max(baseRate, floor)) out.flags[k] = true;
+  }
+  return out;
+}
+// Publish on `window` when that global exists (the unit test loads this file in
+// a VM sandbox that provides one); otherwise skip the export.
+if (typeof window !== 'undefined') window.detectPerfAnomalies = detectPerfAnomalies;
+
 function renderVersionCard(health) {
  if (!health || (!health.version && !health.commit)) return '';
  var ver = health.version && health.version !== 'unknown' ? health.version : null;
@@ -127,48 +196,33 @@ function renderVersionCard(health) {
        if (keys.length === 0) {
          html += '<p style="color:var(--text-muted)">No ingestor stats yet (waiting for /tmp/corescope-ingestor-stats.json)</p>';
        } else {
-          // Anomaly detection (#1123 polish):
-          //   Compare PER-SECOND DELTA RATES, not cumulative counts.
-          //   Cumulative-vs-cumulative was a tautology that fired ⚠️ at startup
-          //   (any backfill_* > 10 when tx_inserted=0 → baseline collapses to 1)
-          //   and false-cleared once tx grew past a one-shot backfill burst.
-          //   Now we cache the previous snapshot + sampleAt and only fire when:
-          //     1) we have a real interval (≥ 0.5s) to compute deltas against
-          //     2) tx_inserted has crossed MIN_SAMPLE so the baseline is meaningful
-          //     3) the per-second backfill rate exceeds 10× the per-second tx rate
-          const MIN_SAMPLE = 100;
-          const prev = window._perfWriteSourcesPrev;
-          let prevSrc = null, dtSec = 0;
-          if (prev && prev.sampleAt && writeSources.sampleAt) {
-            dtSec = (Date.parse(writeSources.sampleAt) - Date.parse(prev.sampleAt)) / 1000;
-            if (dtSec >= 0.5) prevSrc = prev.sources;
-          }
-          const txTotal = src.tx_inserted || 0;
-          const txDelta = prevSrc ? (txTotal - (prevSrc.tx_inserted || 0)) : 0;
-          const txRate = (prevSrc && dtSec > 0) ? (txDelta / dtSec) : 0;
-          html += '<div style="overflow-x:auto"><table class="perf-table"><thead><tr><th scope="col">Source</th><th scope="col">Total</th><th scope="col">Rate/s</th><th scope="col">Anomaly</th></tr></thead><tbody>';
+          // Anomaly detection (#1120 acceptance): flag any component whose
+          // per-second write rate exceeds 10× its 5-minute rolling baseline.
+          // History is stashed on window so the detector has multi-sample
+          // context across the 5s refresh tick.
+          if (!window._perfWriteSourcesHistory) window._perfWriteSourcesHistory = [];
+          const history = window._perfWriteSourcesHistory;
+          const current = { sampleAt: writeSources.sampleAt || new Date().toISOString(), sources: { ...src } };
+          const anom = detectPerfAnomalies(history, current, { windowMs: 5 * 60 * 1000, factor: 10 });
+          // Append current and prune anything older than 6 minutes (keeps a
+          // little headroom past the 5-min window, bounded memory).
+          history.push(current);
+          const cutoff = Date.parse(current.sampleAt) - (6 * 60 * 1000);
+          while (history.length > 1 && Date.parse(history[0].sampleAt) < cutoff) history.shift();
+
+          html += '<div style="overflow-x:auto"><table class="perf-table"><thead><tr><th scope="col">Source</th><th scope="col">Total</th><th scope="col">Rate/s</th><th scope="col">Baseline/s</th><th scope="col">Anomaly</th></tr></thead><tbody>';
          for (const k of keys) {
            const v = src[k] || 0;
-            const isBackfill = k.startsWith('backfill_');
-            let rate = 0;
-            let flag = '';
-            if (prevSrc && dtSec > 0) {
-              const delta = v - (prevSrc[k] || 0);
-              rate = delta / dtSec;
-              // Only flag when tx baseline is statistically meaningful AND
-              // backfill is actively running faster than 10× the live tx rate.
-              if (isBackfill && txTotal >= MIN_SAMPLE && rate > 10 * Math.max(txRate, 1)) {
-                flag = ' ⚠️';
-              }
-            }
-            const rateStr = (prevSrc && dtSec > 0) ? rate.toFixed(1) : '—';
-            html += `<tr><td><code>${k}</code></td><td>${v.toLocaleString()}</td><td>${rateStr}</td><td>${flag}</td></tr>`;
+            const rate = anom.rates[k];
+            const base = anom.baselineRates[k];
+            const flag = anom.flags[k] ? ' ⚠️' : '';
+            const rateStr = (rate != null && isFinite(rate)) ? rate.toFixed(2) : '—';
+            const baseStr = (base != null && isFinite(base)) ? base.toFixed(2) : '—';
+            html += `<tr><td><code>${k}</code></td><td>${v.toLocaleString()}</td><td>${rateStr}</td><td>${baseStr}</td><td>${flag}</td></tr>`;
          }
          html += '</tbody></table></div>';
-          // Stash for next tick's delta computation.
-          window._perfWriteSourcesPrev = { sources: { ...src }, sampleAt: writeSources.sampleAt };
          if (writeSources.sampleAt) {
-            html += `<div style="font-size:11px;color:var(--text-muted);margin-top:4px">Sampled: ${writeSources.sampleAt}</div>`;
+            html += `<div style="font-size:11px;color:var(--text-muted);margin-top:4px">Sampled: ${writeSources.sampleAt} · baseline window: 5 min · threshold: 10×</div>`;
          }
        }
      }
@@ -0,0 +1,107 @@
+/* Unit tests for perf.js anomaly detection — 5-minute rolling baseline.
+ *
+ * Issue #1120 acceptance criterion: "Per-component write rate > 10× steady-state
+ * baseline" flagged with ⚠️. The baseline must be a 5-minute rolling window,
+ * not a single sample-to-sample comparison (which gives false negatives during
+ * a slow ramp and false positives during natural bursts).
+ *
+ * This file exercises window.detectPerfAnomalies(history, current, opts).
+ */
+'use strict';
+const fs = require('fs');
+const path = require('path');
+const vm = require('vm');
+// Resolve perf.js relative to this file so the test runs from any working directory.
+const perfPath = path.join(__dirname, 'public', 'perf.js');
+const ctx = {
+  window: {},
+  document: { addEventListener() {}, getElementById() { return null; }, hidden: true },
+  console,
+  fetch: () => Promise.resolve({ json: () => Promise.resolve(null) }),
+  setInterval: () => 0,
+  clearInterval: () => {},
+  registerPage: () => {},
+};
+vm.runInContext(fs.readFileSync(perfPath, 'utf8'), vm.createContext(ctx), { filename: perfPath });
+
+const detect = ctx.window.detectPerfAnomalies;
+if (typeof detect !== 'function') {
+  console.log('FAIL: window.detectPerfAnomalies is not a function (got ' + typeof detect + ')');
+  process.exit(1);
+}
+
+let pass = 0, fail = 0; // tallies, printed by the summary line at the end of the run
+function test(name, fn) { // runs one case; a throw marks it failed and the run continues
+  try { fn(); } catch (e) { fail++; console.log('  ❌ ' + name + ': ' + e.message); return; }
+  pass++; console.log('  ✅ ' + name);
+}
+function assert(cond, msg) { if (cond) return; throw new Error(msg || 'assertion failed'); } // throws on a falsy cond
+
+// Build a synthetic history: durSec + 1 snapshots spaced 1s apart, starting at
+// startMs. Every counter named in perSec grows by its per-second amount with each
+// snapshot, including the first (which therefore holds perSec[k], not 0).
+function buildHistory(startMs, durSec, perSec) {
+  const names = Object.keys(perSec);
+  const totals = {};
+  names.forEach((name) => { totals[name] = 0; });
+  const snapshots = [];
+  for (let sec = 0; sec <= durSec; sec++) {
+    names.forEach((name) => { totals[name] += perSec[name]; });
+    snapshots.push({
+      sampleAt: new Date(startMs + sec * 1000).toISOString(),
+      sources: { ...totals },
+    });
+  }
+  return snapshots;
+}
+
+test('⚠️ fires when backfill rate hits 11× the 5-minute baseline', () => {
+  const startMs = Date.UTC(2026, 5, 5, 0, 0, 0);
+  // 301 one-second samples: backfill_path_json at 1/sec, tx_inserted at 5/sec.
+  const history = buildHistory(startMs, 300, { backfill_path_json: 1, tx_inserted: 5 });
+  const prevTotals = history[history.length - 1].sources;
+  // One second later: backfill_path_json +11 (11/sec), tx_inserted its usual +5.
+  const current = {
+    sampleAt: new Date(startMs + 301 * 1000).toISOString(),
+    sources: {
+      backfill_path_json: prevTotals.backfill_path_json + 11,
+      tx_inserted: prevTotals.tx_inserted + 5,
+    },
+  };
+  const result = detect(history, current, { windowMs: 5 * 60 * 1000, factor: 10 });
+  assert(result && result.flags, 'expected result with flags map');
+  assert(result.flags.backfill_path_json === true,
+    'expected backfill_path_json flagged at 11× baseline, got flags=' + JSON.stringify(result.flags) +
+    ' rates=' + JSON.stringify(result.rates) + ' baselines=' + JSON.stringify(result.baselineRates));
+});
+
+test('no flag at 5× baseline (under threshold)', () => {
+  const startMs = Date.UTC(2026, 5, 5, 0, 0, 0);
+  // 301 one-second samples: backfill_path_json at 2/sec, tx_inserted at 5/sec.
+  const history = buildHistory(startMs, 300, { backfill_path_json: 2, tx_inserted: 5 });
+  const prevTotals = history[history.length - 1].sources;
+  // Next second: backfill_path_json +10, i.e. 10/sec vs 2/sec baseline = 5×.
+  const sources = {
+    backfill_path_json: prevTotals.backfill_path_json + 10,
+    tx_inserted: prevTotals.tx_inserted + 5,
+  };
+  const current = { sampleAt: new Date(startMs + 301 * 1000).toISOString(), sources };
+  const result = detect(history, current, { windowMs: 5 * 60 * 1000, factor: 10 });
+  assert(!result.flags.backfill_path_json,
+    'expected no flag at 5× baseline, got ' + JSON.stringify(result.flags));
+});
+
+test('no flag without enough history (< 30s of samples)', () => {
+  const startMs = Date.UTC(2026, 5, 5, 0, 0, 0);
+  // Six one-second samples span only 5s, short of the 30s minHistorySec below,
+  // so even a +100 jump in the next second must not raise a flag.
+  const history = buildHistory(startMs, 5, { backfill_path_json: 1 });
+  const prevTotal = history[history.length - 1].sources.backfill_path_json;
+  const sources = { backfill_path_json: prevTotal + 100 };
+  const current = { sampleAt: new Date(startMs + 6 * 1000).toISOString(), sources };
+  const result = detect(history, current, { windowMs: 5 * 60 * 1000, factor: 10, minHistorySec: 30 });
+  assert(!result.flags.backfill_path_json, 'expected no flag with insufficient history');
+});
+
+console.log('\n' + pass + ' passed, ' + fail + ' failed'); // one-line summary of the run
+process.exit(fail > 0 ? 1 : 0); // non-zero exit status when any case failed