mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-06-29 09:01:39 +00:00
## Summary Addresses the remaining acceptance gap on #1120: a true **5-minute rolling-baseline anomaly detector** for the Perf-page Write Sources table. The endpoints + ingestor wiring + UI scaffolding landed in #1123 (partial); this PR replaces the ad-hoc tx-rate comparison with the rolling baseline the issue actually asks for, and adds a JS unit test that proves the ⚠️ flag fires at 11× baseline. ## What changed - **`public/perf.js`** — new pure helper `detectPerfAnomalies(history, current, opts)`. Computes per-component current rate and rolling baseline rate over a window (default 5 min). Flags components whose current rate > 10× baseline. Includes a 0.05/s floor so a stale `0` baseline doesn't false-positive at startup. - **UI** — Write Sources table now shows `Rate/s`, `Baseline/s`, and `Anomaly` columns. Operators can sanity-check the ⚠️ rather than trusting opaque output. History is kept on `window` and pruned to a 6-min sliding ring. - **`test-perf-anomaly.js`** — new VM-sandbox test asserting: - ⚠️ fires when one component runs at 11× its 5-min baseline - No ⚠️ at 5× (under threshold) - No ⚠️ until ≥30s of history has accumulated ## TDD evidence (red → green) - Red commit `590f04d3`: introduces the stub `detectPerfAnomalies` (returns empty `{flags:{}}`) + the test. Test FAILS on the `assert(r.flags.backfill_path_json === true, ...)` assertion — not a build error. ``` ❌ ⚠️ fires when backfill rate hits 11× the 5-minute baseline: expected backfill_path_json flagged at 11× baseline, got flags={} 2 passed, 1 failed ``` - Green commit `726a5e78`: implements the rolling-baseline detector. All 3 tests pass; existing `test-packet-filter.js` (79 tests) still green; `cmd/server` Go tests for `/api/perf/*` still green. ## What is NOT in this PR (deferred / out of scope per brief) - **SQLite-stats subsection** (WAL size + cache hit rate + pending checkpoint) — `/api/perf/sqlite` already exists (landed in #1123). 
Issue body lists it as a metric category, brief explicitly marks it OPTIONAL. Not regressed; no changes needed. - **Ingestor `/proc/self/io` bridge** — already lives in the ingestor stats file (`ProcIO` field, `internal/perfio`) and is rendered on the Perf page. No change. - **Issue #1340** (SQLite write-lock instrumentation) — separate PR in flight, not piggybacked. - **No new metrics backend** (no Prometheus, no OpenTelemetry). Pure JSON over `/api/perf/*`. ## Hard-rule compliance - Files changed: 2 (`public/perf.js`, `test-perf-anomaly.js`) — well inside the 3-files-outside-allowed-set cap. - `Stats` struct unchanged. - All colors via CSS variables — no hex literals introduced (grep clean). - TDD: red commit fails on assertion, green commit passes — visible in branch history. - PII preflight: clean on both commits. Partial fix language deliberately not used — this completes the issue's UI acceptance criterion. Leaving `Fixes #1120` off so the user can verify on the staging deploy before closing. --------- Co-authored-by: meshcore-bot <bot@meshcore>
This commit is contained in:
+91
-37
@@ -3,6 +3,75 @@
|
||||
|
||||
var GH = 'https://github.com/Kpa-clawbot/corescope';
|
||||
|
||||
/**
 * detectPerfAnomalies — pure, testable.
 *
 * Computes per-component write rates from cumulative counters and flags any
 * component whose current per-second rate exceeds `factor` × its rolling
 * baseline rate. Issue #1120 acceptance: 5-minute window, 10× threshold.
 *
 * The current rate is measured between the newest history entry ("prior")
 * and `current`. The baseline is measured from the oldest usable sample
 * inside the window up to "prior", so the tick being judged never
 * contaminates its own baseline.
 *
 * A sample is only usable for a source's baseline if every sample between
 * it and "prior" reports that source with a non-decreasing counter. A source
 * that is missing from a snapshot, or a counter that went backwards (reset),
 * is treated as "unknown" rather than as zero — otherwise a newly appearing
 * source or a counter reset would produce a spurious spike or a negative
 * baseline and raise false flags.
 *
 * @param {Array<{sampleAt: string, sources: Object<string, number>}>} history
 *   Snapshots ordered oldest → newest.
 * @param {{sampleAt: string, sources: Object<string, number>}} current
 *   The freshest snapshot, same shape.
 * @param {Object} [opts]
 * @param {number} [opts.windowMs=300000] Rolling baseline window.
 * @param {number} [opts.factor=10] Rate-multiplier threshold.
 * @param {number} [opts.minHistorySec=30] Refuse to flag until the baseline
 *   spans at least this many seconds.
 * @returns {{rates: Object, baselineRates: Object, flags: Object}}
 *   All keyed by source name. `rates[k]` / `baselineRates[k]` are numbers, or
 *   `null` when they cannot be determined. `flags[k]` is `true` only for
 *   anomalous sources. All three maps are empty when there is no history or
 *   no positive interval between "prior" and `current`.
 */
function detectPerfAnomalies(history, current, opts) {
  opts = opts || {};
  const windowMs = opts.windowMs || (5 * 60 * 1000);
  const factor = opts.factor || 10;
  const minHistorySec = opts.minHistorySec != null ? opts.minHistorySec : 30;
  // Guard floor to avoid 0-baseline → infinite ratio false positives
  // (1 event per 20s minimum baseline).
  const floor = 0.05;
  const out = { rates: {}, baselineRates: {}, flags: {} };
  if (!current || !current.sources || !history || history.length === 0) return out;
  const curT = Date.parse(current.sampleAt);
  if (!isFinite(curT)) return out;

  // Counter value for `name` in `snap`, or null when the snapshot does not
  // carry a finite number for it. Absence must not be read as 0.
  const counterOf = (snap, name) => {
    const v = snap && snap.sources ? snap.sources[name] : undefined;
    return (typeof v === 'number' && isFinite(v)) ? v : null;
  };

  // Parse every timestamp once; malformed or missing entries become NaN.
  const times = history.map((snap) => Date.parse(snap && snap.sampleAt));
  const lastIdx = history.length - 1;
  const prior = history[lastIdx];
  const priorT = times[lastIdx];
  const curDt = (curT - priorT) / 1000;
  if (!(curDt > 0)) return out; // also rejects a NaN interval

  // Oldest sample still inside the window. Anything older than windowMs is
  // excluded from the baseline.
  const cutoff = curT - windowMs;
  let baseIdx = 0;
  for (let i = lastIdx; i >= 0; i--) {
    if (times[i] < cutoff) { baseIdx = i + 1; break; }
  }
  if (baseIdx > lastIdx) baseIdx = lastIdx;

  for (const k of Object.keys(current.sources)) {
    const cur = counterOf(current, k);
    const prev = counterOf(prior, k);
    // Unknown when either endpoint is missing or the counter was reset
    // between the two samples.
    const rate = (cur !== null && prev !== null && cur >= prev)
      ? (cur - prev) / curDt
      : null;
    out.rates[k] = rate;
    out.baselineRates[k] = null;
    if (prev === null) continue;

    // Walk back from `prior` while the source is present and its counter is
    // non-decreasing; stop at the window edge, at a gap, or at a reset.
    let startIdx = lastIdx;
    let startVal = prev;
    for (let i = lastIdx - 1; i >= baseIdx; i--) {
      const v = counterOf(history[i], k);
      if (v === null || v > startVal || !isFinite(times[i])) break;
      startIdx = i;
      startVal = v;
    }
    const baseDt = (priorT - times[startIdx]) / 1000;
    if (!(baseDt > 0) || baseDt < minHistorySec) continue;

    const baseRate = (prev - startVal) / baseDt;
    out.baselineRates[k] = baseRate;
    if (rate !== null && rate > factor * Math.max(baseRate, floor)) {
      out.flags[k] = true;
    }
  }
  return out;
}

// Expose the detector to the page (and to the VM-sandbox unit test).
if (typeof window !== 'undefined') {
  window.detectPerfAnomalies = detectPerfAnomalies;
}
|
||||
|
||||
function renderVersionCard(health) {
|
||||
if (!health || (!health.version && !health.commit)) return '';
|
||||
var ver = health.version && health.version !== 'unknown' ? health.version : null;
|
||||
@@ -127,48 +196,33 @@ function renderVersionCard(health) {
|
||||
if (keys.length === 0) {
|
||||
html += '<p style="color:var(--text-muted)">No ingestor stats yet (waiting for /tmp/corescope-ingestor-stats.json)</p>';
|
||||
} else {
|
||||
// Anomaly detection (#1123 polish):
|
||||
// Compare PER-SECOND DELTA RATES, not cumulative counts.
|
||||
// Cumulative-vs-cumulative was a tautology that fired ⚠️ at startup
|
||||
// (any backfill_* > 10 when tx_inserted=0 → baseline collapses to 1)
|
||||
// and false-cleared once tx grew past a one-shot backfill burst.
|
||||
// Now we cache the previous snapshot + sampleAt and only fire when:
|
||||
// 1) we have a real interval (≥ 0.5s) to compute deltas against
|
||||
// 2) tx_inserted has crossed MIN_SAMPLE so the baseline is meaningful
|
||||
// 3) the per-second backfill rate exceeds 10× the per-second tx rate
|
||||
const MIN_SAMPLE = 100;
|
||||
const prev = window._perfWriteSourcesPrev;
|
||||
let prevSrc = null, dtSec = 0;
|
||||
if (prev && prev.sampleAt && writeSources.sampleAt) {
|
||||
dtSec = (Date.parse(writeSources.sampleAt) - Date.parse(prev.sampleAt)) / 1000;
|
||||
if (dtSec >= 0.5) prevSrc = prev.sources;
|
||||
}
|
||||
const txTotal = src.tx_inserted || 0;
|
||||
const txDelta = prevSrc ? (txTotal - (prevSrc.tx_inserted || 0)) : 0;
|
||||
const txRate = (prevSrc && dtSec > 0) ? (txDelta / dtSec) : 0;
|
||||
html += '<div style="overflow-x:auto"><table class="perf-table"><thead><tr><th scope="col">Source</th><th scope="col">Total</th><th scope="col">Rate/s</th><th scope="col">Anomaly</th></tr></thead><tbody>';
|
||||
// Anomaly detection (#1120 acceptance): flag any component whose
|
||||
// per-second write rate exceeds 10× its 5-minute rolling baseline.
|
||||
// History is stashed on window so the detector has multi-sample
|
||||
// context across the 5s refresh tick.
|
||||
if (!window._perfWriteSourcesHistory) window._perfWriteSourcesHistory = [];
|
||||
const history = window._perfWriteSourcesHistory;
|
||||
const current = { sampleAt: writeSources.sampleAt || new Date().toISOString(), sources: { ...src } };
|
||||
const anom = detectPerfAnomalies(history, current, { windowMs: 5 * 60 * 1000, factor: 10 });
|
||||
// Append current and prune anything older than 6 minutes (keeps a
|
||||
// little headroom past the 5-min window, bounded memory).
|
||||
history.push(current);
|
||||
const cutoff = Date.parse(current.sampleAt) - (6 * 60 * 1000);
|
||||
while (history.length > 1 && Date.parse(history[0].sampleAt) < cutoff) history.shift();
|
||||
|
||||
html += '<div style="overflow-x:auto"><table class="perf-table"><thead><tr><th scope="col">Source</th><th scope="col">Total</th><th scope="col">Rate/s</th><th scope="col">Baseline/s</th><th scope="col">Anomaly</th></tr></thead><tbody>';
|
||||
for (const k of keys) {
|
||||
const v = src[k] || 0;
|
||||
const isBackfill = k.startsWith('backfill_');
|
||||
let rate = 0;
|
||||
let flag = '';
|
||||
if (prevSrc && dtSec > 0) {
|
||||
const delta = v - (prevSrc[k] || 0);
|
||||
rate = delta / dtSec;
|
||||
// Only flag when tx baseline is statistically meaningful AND
|
||||
// backfill is actively running faster than 10× the live tx rate.
|
||||
if (isBackfill && txTotal >= MIN_SAMPLE && rate > 10 * Math.max(txRate, 1)) {
|
||||
flag = ' ⚠️';
|
||||
}
|
||||
}
|
||||
const rateStr = (prevSrc && dtSec > 0) ? rate.toFixed(1) : '—';
|
||||
html += `<tr><td><code>${k}</code></td><td>${v.toLocaleString()}</td><td>${rateStr}</td><td>${flag}</td></tr>`;
|
||||
const rate = anom.rates[k];
|
||||
const base = anom.baselineRates[k];
|
||||
const flag = anom.flags[k] ? ' ⚠️' : '';
|
||||
const rateStr = (rate != null && isFinite(rate)) ? rate.toFixed(2) : '—';
|
||||
const baseStr = (base != null && isFinite(base)) ? base.toFixed(2) : '—';
|
||||
html += `<tr><td><code>${k}</code></td><td>${v.toLocaleString()}</td><td>${rateStr}</td><td>${baseStr}</td><td>${flag}</td></tr>`;
|
||||
}
|
||||
html += '</tbody></table></div>';
|
||||
// Stash for next tick's delta computation.
|
||||
window._perfWriteSourcesPrev = { sources: { ...src }, sampleAt: writeSources.sampleAt };
|
||||
if (writeSources.sampleAt) {
|
||||
html += `<div style="font-size:11px;color:var(--text-muted);margin-top:4px">Sampled: ${writeSources.sampleAt}</div>`;
|
||||
html += `<div style="font-size:11px;color:var(--text-muted);margin-top:4px">Sampled: ${writeSources.sampleAt} · baseline window: 5 min · threshold: 10×</div>`;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,107 @@
|
||||
/* Unit tests for perf.js anomaly detection — 5-minute rolling baseline.
|
||||
*
|
||||
* Issue #1120 acceptance criterion: "Per-component write rate > 10× steady-state
|
||||
* baseline" flagged with ⚠️. The baseline must be a 5-minute rolling window,
|
||||
* not a single sample-to-sample comparison (which gives false negatives during
|
||||
* a slow ramp and false positives during natural bursts).
|
||||
*
|
||||
* This file exercises window.detectPerfAnomalies(history, current, opts).
|
||||
*/
|
||||
'use strict';
|
||||
const vm = require('vm');
const fs = require('fs');
const path = require('path');

// Resolve perf.js relative to this test file rather than the process CWD, so
// the test also works when it is launched from outside the repository root.
// NOTE(review): assumes this test file sits next to the `public/` directory.
const perfPath = path.join(__dirname, 'public', 'perf.js');
const code = fs.readFileSync(perfPath, 'utf8');

// Minimal browser-like sandbox: just enough globals for perf.js to load
// without touching a real DOM, network or timers.
const ctx = {
  window: {},
  document: { addEventListener() {}, getElementById() { return null; }, hidden: true },
  console,
  fetch: () => Promise.resolve({ json: () => Promise.resolve(null) }),
  setInterval: () => 0,
  clearInterval: () => {},
  registerPage: () => {},
};
vm.createContext(ctx);
// Passing a filename makes stack traces point at perf.js instead of an
// anonymous evalmachine script.
vm.runInContext(code, ctx, { filename: perfPath });

// perf.js publishes the detector on `window`; bail out early with a clear
// message if that contract is broken.
const detect = ctx.window.detectPerfAnomalies;
if (typeof detect !== 'function') {
  console.log('FAIL: window.detectPerfAnomalies is not a function (got ' + typeof detect + ')');
  process.exit(1);
}
|
||||
|
||||
let pass = 0, fail = 0;
|
||||
// Runs a single named case. A throw from `fn` is recorded as a failure and
// its message printed; a normal return is recorded as a pass.
function test(name, fn) {
  try {
    fn();
    pass += 1;
    console.log(' ✅ ' + name);
  } catch (err) {
    fail += 1;
    console.log(' ❌ ' + name + ': ' + err.message);
  }
}

// Throws an Error carrying `msg` (or a generic message) when `cond` is falsy.
function assert(cond, msg) {
  if (cond) return;
  throw new Error(msg || 'assertion failed');
}
|
||||
|
||||
/**
 * Build an evenly spaced series of cumulative-counter snapshots.
 *
 * Samples are emitted at startMs, startMs + stepSec, ... up to and including
 * startMs + durSec, so a 300s duration at the default 1s step yields 301
 * snapshots. Each counter is incremented before the snapshot is taken, so the
 * first snapshot already holds one step's worth of events.
 *
 * @param {number} startMs Epoch milliseconds of the first sample.
 * @param {number} durSec Span covered by the series, in seconds.
 * @param {Object<string, number>} perSec Events per second for each source.
 * @param {number} [stepSec=1] Seconds between samples (e.g. 5 to mimic the
 *   Perf page's refresh tick).
 * @returns {Array<{sampleAt: string, sources: Object<string, number>}>}
 * @throws {RangeError} If stepSec is not a positive number (the loop would
 *   never terminate).
 */
function buildHistory(startMs, durSec, perSec, stepSec = 1) {
  if (!(stepSec > 0)) throw new RangeError('stepSec must be a positive number');
  const names = Object.keys(perSec);
  const cum = {};
  for (const k of names) cum[k] = 0;
  const h = [];
  for (let t = 0; t <= durSec; t += stepSec) {
    const sources = {};
    for (const k of names) {
      cum[k] += perSec[k] * stepSec;
      sources[k] = cum[k];
    }
    h.push({ sampleAt: new Date(startMs + t * 1000).toISOString(), sources });
  }
  return h;
}
|
||||
|
||||
// Shared fixtures for the three cases below.
const T0 = Date.UTC(2026, 5, 5, 0, 0, 0);
const FIVE_MIN_MS = 5 * 60 * 1000;

// Builds the "current" snapshot taken `atSec` seconds after T0: each listed
// counter is the last history value plus its delta.
function sampleAfter(history, atSec, deltas) {
  const last = history[history.length - 1];
  const sources = {};
  for (const k of Object.keys(deltas)) {
    sources[k] = last.sources[k] + deltas[k];
  }
  return { sampleAt: new Date(T0 + atSec * 1000).toISOString(), sources };
}

test('⚠️ fires when backfill rate hits 11× the 5-minute baseline', () => {
  // 5 minutes at 1/sec, then one tick where backfill_path_json adds 11 in 1s
  // while tx_inserted keeps its steady 5/sec.
  const history = buildHistory(T0, 300, { backfill_path_json: 1, tx_inserted: 5 });
  const current = sampleAfter(history, 301, { backfill_path_json: 11, tx_inserted: 5 });
  const r = detect(history, current, { windowMs: FIVE_MIN_MS, factor: 10 });
  assert(r && r.flags, 'expected result with flags map');
  const detail =
    'expected backfill_path_json flagged at 11× baseline, got flags=' + JSON.stringify(r.flags) +
    ' rates=' + JSON.stringify(r.rates) + ' baselines=' + JSON.stringify(r.baselineRates);
  assert(r.flags.backfill_path_json === true, detail);
});

test('no flag at 5× baseline (under threshold)', () => {
  // 10/sec against a 2/sec baseline is only 5×.
  const history = buildHistory(T0, 300, { backfill_path_json: 2, tx_inserted: 5 });
  const current = sampleAfter(history, 301, { backfill_path_json: 10, tx_inserted: 5 });
  const r = detect(history, current, { windowMs: FIVE_MIN_MS, factor: 10 });
  assert(!r.flags.backfill_path_json,
    'expected no flag at 5× baseline, got ' + JSON.stringify(r.flags));
});

test('no flag without enough history (< 30s of samples)', () => {
  const history = buildHistory(T0, 5, { backfill_path_json: 1 });
  const current = sampleAfter(history, 6, { backfill_path_json: 100 });
  const r = detect(history, current, { windowMs: FIVE_MIN_MS, factor: 10, minHistorySec: 30 });
  assert(!r.flags.backfill_path_json, 'expected no flag with insufficient history');
});

// Summary line, then a non-zero exit code if anything failed.
console.log(`\n${pass} passed, ${fail} failed`);
process.exit(fail > 0 ? 1 : 0);
|
||||
Reference in New Issue
Block a user