mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-06-28 21:21:51 +00:00
Closes #1552. ## What Make observer `Online` / `Stale` / `Offline` thresholds operator-configurable via `config.json`'s existing `healthThresholds` block — and **raise the defaults** from 10 min / 60 min to **60 min / 1440 min (1 h / 24 h)** so they match the node thresholds and stop producing flap out of the box. ⚠️ **This is a default behavior change.** Operators who want the old aggressive 10-min Online threshold must opt in via: ```json "healthThresholds": { "observerOnlineMinutes": 10 } ``` ## Why Per #1552: the `600000` / `3600000` constants in `public/observers.js` were not tunable, *and* 10 min is wrong as a default. Wide-geo, low-traffic meshes legitimately see observers go quiet for >10 min between reports, and operators behind a CDN (#1551) get cached `last_seen` values that can push the observer 15+ min behind reality — guaranteeing flap at the 10-min threshold. The meshat.se operator (43 observers, v3.8.3) reports exactly this pattern. Defaults raised from 10 / 60 minutes to 60 / 1440 minutes (1 h / 24 h) to match the node thresholds for consistency and eliminate flap on low-traffic / CDN-fronted instances. Operators wanting the old 10-min Online behavior can set `observerOnlineMinutes: 10` in config. ## Changes Backend (`cmd/server/config.go`): - `HealthThresholds` gains `ObserverOnlineMinutes` / `ObserverStaleMinutes` (int). - `GetHealthThresholds()` defaults to **60 / 1440** when zero/absent. - `ToClientMs()` emits `observerOnlineMs` / `observerStaleMs`, picked up by the existing `/api/config-public` → `roles.js` `Object.assign(HEALTH_THRESHOLDS, …)` pipeline. `config.example.json`: new `observerOnlineMinutes` / `observerStaleMinutes` keys (60 / 1440) + `_comment_observerThresholds` explaining the rationale and opt-out. 
Frontend: - `public/observers.js` `healthStatus()` — reads from `window.HEALTH_THRESHOLDS.observerOnlineMs / observerStaleMs`, falls back to **3600000 / 86400000** (matching the new Go defaults for the pre-`/api/config-public` window). - `public/observer-detail.js` — same refactor (was previously hardcoded `600000` + misusing `nodeDegradedMs` for the Stale boundary). ## Backward compat - API shape: unchanged — only adds two optional keys. - Config: unchanged keys / no renames. - Default behavior: **changed** — operators relying on the implicit 10/60 must opt in (one config line). ## TDD - RED 1 (`ee19058f`): assertions on the new fields + `ToClientMs` keys + `healthStatus` reading from `window.HEALTH_THRESHOLDS`. CI: [failure](https://github.com/Kpa-clawbot/CoreScope/actions/runs/26945264822). - GREEN 1 (`30cfbf7a`): configurability landed (defaults still old 10/60). CI: [success](https://github.com/Kpa-clawbot/CoreScope/actions/runs/26945220598). - RED 2 (`2649cf35`): pin new 60/1440 defaults — empty-config Go path + JS `healthStatus` with no `HEALTH_THRESHOLDS`. CI must fail. - GREEN 2 (`5ef85bca`): bump Go defaults to 60/1440, JS fallbacks to 3600000/86400000, `config.example.json` updated. CI must pass. ## Preflight Clean (exit 0). `cross-stack` ack in commit messages — single feature spans Go + JSON + JS readers. ## Not in scope - Customizer UI for editing the thresholds (config-only per issue). - Node/infra thresholds (unchanged). - The deeper observer-flap root cause (#1551 cache-control is a separate PR in flight). --------- Co-authored-by: corescope-bot <bot@corescope> Co-authored-by: mc-bot <bot@meshcore.local>
This commit is contained in:
+23
-4
@@ -325,6 +325,10 @@ type HealthThresholds struct {
|
||||
// repeater to be considered "actively relaying" vs only "alive
|
||||
// (advert-only)". See issue #662. Defaults to 24h.
|
||||
RelayActiveHours float64 `json:"relayActiveHours"`
|
||||
// Issue #1552 — observer health classification thresholds (minutes).
|
||||
// Defaults are 60 / 1440 minutes (1 h / 24 h), raised from the prior hardcoded 10 / 60 in public/observers.js.
|
||||
ObserverOnlineMinutes int `json:"observerOnlineMinutes"`
|
||||
ObserverStaleMinutes int `json:"observerStaleMinutes"`
|
||||
}
|
||||
|
||||
// ThemeFile mirrors theme.json overlay.
|
||||
@@ -415,6 +419,18 @@ func (c *Config) GetHealthThresholds() HealthThresholds {
|
||||
if c.HealthThresholds.RelayActiveHours > 0 {
|
||||
h.RelayActiveHours = c.HealthThresholds.RelayActiveHours
|
||||
}
|
||||
if c.HealthThresholds.ObserverOnlineMinutes > 0 {
|
||||
h.ObserverOnlineMinutes = c.HealthThresholds.ObserverOnlineMinutes
|
||||
}
|
||||
if c.HealthThresholds.ObserverStaleMinutes > 0 {
|
||||
h.ObserverStaleMinutes = c.HealthThresholds.ObserverStaleMinutes
|
||||
}
|
||||
}
|
||||
if h.ObserverOnlineMinutes <= 0 {
|
||||
h.ObserverOnlineMinutes = 60
|
||||
}
|
||||
if h.ObserverStaleMinutes <= 0 {
|
||||
h.ObserverStaleMinutes = 1440
|
||||
}
|
||||
return h
|
||||
}
|
||||
@@ -431,11 +447,14 @@ func (h HealthThresholds) GetHealthMs(role string) (degradedMs, silentMs int) {
|
||||
// ToClientMs returns the thresholds as ms for the frontend.
|
||||
func (h HealthThresholds) ToClientMs() map[string]int {
|
||||
const hourMs = 3600000
|
||||
const minMs = 60000
|
||||
return map[string]int{
|
||||
"infraDegradedMs": int(h.InfraDegradedHours * hourMs),
|
||||
"infraSilentMs": int(h.InfraSilentHours * hourMs),
|
||||
"nodeDegradedMs": int(h.NodeDegradedHours * hourMs),
|
||||
"nodeSilentMs": int(h.NodeSilentHours * hourMs),
|
||||
"infraDegradedMs": int(h.InfraDegradedHours * hourMs),
|
||||
"infraSilentMs": int(h.InfraSilentHours * hourMs),
|
||||
"nodeDegradedMs": int(h.NodeDegradedHours * hourMs),
|
||||
"nodeSilentMs": int(h.NodeSilentHours * hourMs),
|
||||
"observerOnlineMs": h.ObserverOnlineMinutes * minMs,
|
||||
"observerStaleMs": h.ObserverStaleMinutes * minMs,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -387,3 +387,71 @@ func TestObserverDaysOrDefault(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Issue #1552 — observer health thresholds configurable.
|
||||
|
||||
func TestObserverThresholdsOverride(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
cfgData := map[string]interface{}{
|
||||
"healthThresholds": map[string]interface{}{
|
||||
"observerOnlineMinutes": 30,
|
||||
"observerStaleMinutes": 120,
|
||||
},
|
||||
}
|
||||
data, _ := json.Marshal(cfgData)
|
||||
os.WriteFile(filepath.Join(dir, "config.json"), data, 0644)
|
||||
cfg, err := LoadConfig(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
h := cfg.GetHealthThresholds()
|
||||
if h.ObserverOnlineMinutes != 30 {
|
||||
t.Errorf("ObserverOnlineMinutes = %d, want 30", h.ObserverOnlineMinutes)
|
||||
}
|
||||
if h.ObserverStaleMinutes != 120 {
|
||||
t.Errorf("ObserverStaleMinutes = %d, want 120", h.ObserverStaleMinutes)
|
||||
}
|
||||
m := h.ToClientMs()
|
||||
if m["observerOnlineMs"] != 30*60*1000 {
|
||||
t.Errorf("observerOnlineMs = %d, want %d", m["observerOnlineMs"], 30*60*1000)
|
||||
}
|
||||
if m["observerStaleMs"] != 120*60*1000 {
|
||||
t.Errorf("observerStaleMs = %d, want %d", m["observerStaleMs"], 120*60*1000)
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserverThresholdsDefaults(t *testing.T) {
|
||||
cfg := &Config{}
|
||||
h := cfg.GetHealthThresholds()
|
||||
if h.ObserverOnlineMinutes != 60 {
|
||||
t.Errorf("default ObserverOnlineMinutes = %d, want 60", h.ObserverOnlineMinutes)
|
||||
}
|
||||
if h.ObserverStaleMinutes != 1440 {
|
||||
t.Errorf("default ObserverStaleMinutes = %d, want 1440", h.ObserverStaleMinutes)
|
||||
}
|
||||
m := h.ToClientMs()
|
||||
if m["observerOnlineMs"] != 3600000 {
|
||||
t.Errorf("default observerOnlineMs = %d, want 3600000", m["observerOnlineMs"])
|
||||
}
|
||||
if m["observerStaleMs"] != 86400000 {
|
||||
t.Errorf("default observerStaleMs = %d, want 86400000", m["observerStaleMs"])
|
||||
}
|
||||
}
|
||||
|
||||
// Loading a config with no healthThresholds block at all must still produce
|
||||
// the new 60 / 1440 defaults (not zero, not the old 10 / 60).
|
||||
func TestObserverThresholdsDefaultsFromEmptyConfigFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{"port": 3000}`), 0644)
|
||||
cfg, err := LoadConfig(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
h := cfg.GetHealthThresholds()
|
||||
if h.ObserverOnlineMinutes != 60 {
|
||||
t.Errorf("empty-config ObserverOnlineMinutes = %d, want 60 (new default)", h.ObserverOnlineMinutes)
|
||||
}
|
||||
if h.ObserverStaleMinutes != 1440 {
|
||||
t.Errorf("empty-config ObserverStaleMinutes = %d, want 1440 (new default)", h.ObserverStaleMinutes)
|
||||
}
|
||||
}
|
||||
|
||||
+4
-1
@@ -177,7 +177,10 @@
|
||||
"nodeDegradedHours": 1,
|
||||
"nodeSilentHours": 24,
|
||||
"relayActiveHours": 24,
|
||||
"_comment": "How long (hours) before nodes show as degraded/silent. 'infra' = repeaters & rooms, 'node' = companions & others. relayActiveHours: a repeater is shown as 'actively relaying' if its pubkey appeared as a path hop in a non-advert packet within this window (issue #662)."
|
||||
"observerOnlineMinutes": 60,
|
||||
"observerStaleMinutes": 1440,
|
||||
"_comment": "How long (hours) before nodes show as degraded/silent. 'infra' = repeaters & rooms, 'node' = companions & others. relayActiveHours: a repeater is shown as 'actively relaying' if its pubkey appeared as a path hop in a non-advert packet within this window (issue #662).",
|
||||
"_comment_observerThresholds": "Observer health classification. Online: last_seen < observerOnlineMinutes ago. Stale: between Online and observerStaleMinutes. Offline: beyond observerStaleMinutes. Defaults 60 / 1440 (1h / 24h) match the node thresholds for consistency and eliminate flap on low-traffic / CDN-fronted instances (#1552). Operators who want the old aggressive 10-min Online threshold can set observerOnlineMinutes: 10."
|
||||
},
|
||||
"defaultRegion": "SJC",
|
||||
"mapDefaults": {
|
||||
|
||||
@@ -164,10 +164,14 @@ window.ObserverDetailNaiveBanner = {
|
||||
+ ' · BW' + escapeHtml(rp[1] || '?') + ' · CR' + escapeHtml(rp[3] || '?');
|
||||
}
|
||||
|
||||
// Health status
|
||||
// Health status — Issue #1552: thresholds are operator-configurable via
|
||||
// window.HEALTH_THRESHOLDS.observerOnlineMs / observerStaleMs (defaults
|
||||
// 60 min / 1440 min (24h), matching node thresholds — #1552).
|
||||
const ago = obs.last_seen ? Date.now() - new Date(obs.last_seen).getTime() : Infinity;
|
||||
const statusCls = ago < 600000 ? 'health-green' : ago < HEALTH_THRESHOLDS.nodeDegradedMs ? 'health-yellow' : 'health-red';
|
||||
const statusLabel = ago < 600000 ? 'Online' : ago < HEALTH_THRESHOLDS.nodeDegradedMs ? 'Stale' : 'Offline';
|
||||
const _obsOnlineMs = (HEALTH_THRESHOLDS && HEALTH_THRESHOLDS.observerOnlineMs) || 3600000;
|
||||
const _obsStaleMs = (HEALTH_THRESHOLDS && HEALTH_THRESHOLDS.observerStaleMs) || 86400000;
|
||||
const statusCls = ago < _obsOnlineMs ? 'health-green' : ago < _obsStaleMs ? 'health-yellow' : 'health-red';
|
||||
const statusLabel = ago < _obsOnlineMs ? 'Online' : ago < _obsStaleMs ? 'Stale' : 'Offline';
|
||||
|
||||
el.innerHTML = `
|
||||
${window.ObserverDetailNaiveBanner.render(obs)}
|
||||
|
||||
+11
-2
@@ -122,10 +122,19 @@ window.ObserversNaiveChip = {
|
||||
if (!lastSeen) return { cls: 'health-red', label: 'Unknown' };
|
||||
const ago = Date.now() - new Date(lastSeen).getTime();
|
||||
const tolerance = 30000; // 30s tolerance for clock skew
|
||||
if (ago < 600000 + tolerance) return { cls: 'health-green', label: 'Online' }; // < 10 min + tolerance
|
||||
if (ago < 3600000 + tolerance) return { cls: 'health-yellow', label: 'Stale' }; // < 1 hour + tolerance
|
||||
// Issue #1552 — thresholds are operator-configurable via config.json
|
||||
// healthThresholds.observerOnlineMinutes / observerStaleMinutes, surfaced
|
||||
// to the client through window.HEALTH_THRESHOLDS. Defaults are 60 min
|
||||
// Online / 1440 min (24h) Stale, matching node thresholds (#1552).
|
||||
const th = (typeof window !== 'undefined' && window.HEALTH_THRESHOLDS) || {};
|
||||
const onlineMs = th.observerOnlineMs || 3600000;
|
||||
const staleMs = th.observerStaleMs || 86400000;
|
||||
if (ago < onlineMs + tolerance) return { cls: 'health-green', label: 'Online' };
|
||||
if (ago < staleMs + tolerance) return { cls: 'health-yellow', label: 'Stale' };
|
||||
return { cls: 'health-red', label: 'Offline' };
|
||||
}
|
||||
// Issue #1552 — exposed for tests and external callers.
|
||||
window.observerHealthStatus = healthStatus;
|
||||
|
||||
function packetBadge(o) {
|
||||
if (!o.last_packet_at) return '<span title="No packets ever observed">📡⚠ never</span>';
|
||||
|
||||
@@ -6387,3 +6387,64 @@ Promise.allSettled(pendingTests).then(() => {
|
||||
console.error('Failed waiting for async tests:', e);
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
// ===== observers.js: healthStatus (#1552) =====
|
||||
console.log('\n=== observers.js: healthStatus (configurable thresholds) ===');
|
||||
{
|
||||
// Extract the whole healthStatus function declaration from observers.js so we can test
|
||||
// it standalone (the file is an IIFE that depends on many page globals).
|
||||
const src = fs.readFileSync('public/observers.js', 'utf8');
|
||||
const m = src.match(/function healthStatus\s*\([^)]*\)\s*\{[\s\S]*?\n \}/);
|
||||
if (!m) throw new Error('healthStatus not found in public/observers.js');
|
||||
const healthStatusSrc = m[0];
|
||||
|
||||
function runHealthStatus(lastSeen, healthThresholds) {
|
||||
const ctx = { window: {}, Date };
|
||||
if (healthThresholds) ctx.window.HEALTH_THRESHOLDS = healthThresholds;
|
||||
vm.createContext(ctx);
|
||||
vm.runInContext(healthStatusSrc + '\n; result = healthStatus(lastSeen);', Object.assign(ctx, { lastSeen, result: null }));
|
||||
return ctx.result;
|
||||
}
|
||||
|
||||
test('observer 20min old with 30min override → Online', () => {
|
||||
const ts = new Date(Date.now() - 20 * 60 * 1000).toISOString();
|
||||
const r = runHealthStatus(ts, { observerOnlineMs: 30 * 60 * 1000, observerStaleMs: 120 * 60 * 1000 });
|
||||
assert.strictEqual(r.cls, 'health-green', 'expected Online with 30min override, got ' + JSON.stringify(r));
|
||||
});
|
||||
|
||||
test('observer 20min old with default thresholds → Online (new 1h default, #1552)', () => {
|
||||
const ts = new Date(Date.now() - 20 * 60 * 1000).toISOString();
|
||||
const r = runHealthStatus(ts, null);
|
||||
assert.strictEqual(r.cls, 'health-green', 'expected Online with 1h default, got ' + JSON.stringify(r));
|
||||
});
|
||||
|
||||
test('observer 30min old with NO HEALTH_THRESHOLDS → Online (#1552 default raised to 1h)', () => {
|
||||
const ts = new Date(Date.now() - 30 * 60 * 1000).toISOString();
|
||||
const r = runHealthStatus(ts, null);
|
||||
assert.strictEqual(r.cls, 'health-green', 'expected Online with new 1h default, got ' + JSON.stringify(r));
|
||||
});
|
||||
|
||||
test('observer 2h old with default thresholds → Stale (new 24h stale default, #1552)', () => {
|
||||
const ts = new Date(Date.now() - 2 * 60 * 60 * 1000).toISOString();
|
||||
const r = runHealthStatus(ts, null);
|
||||
assert.strictEqual(r.cls, 'health-yellow', 'expected Stale with 24h default, got ' + JSON.stringify(r));
|
||||
});
|
||||
|
||||
test('observer 25h old with default thresholds → Offline (>24h, #1552)', () => {
|
||||
const ts = new Date(Date.now() - 25 * 60 * 60 * 1000).toISOString();
|
||||
const r = runHealthStatus(ts, null);
|
||||
assert.strictEqual(r.cls, 'health-red', 'expected Offline beyond 24h stale default, got ' + JSON.stringify(r));
|
||||
});
|
||||
|
||||
test('observer 90min old with 2h stale override → Stale', () => {
|
||||
const ts = new Date(Date.now() - 90 * 60 * 1000).toISOString();
|
||||
const r = runHealthStatus(ts, { observerOnlineMs: 30 * 60 * 1000, observerStaleMs: 120 * 60 * 1000 });
|
||||
assert.strictEqual(r.cls, 'health-yellow', 'expected Stale (90min < 120min stale), got ' + JSON.stringify(r));
|
||||
});
|
||||
|
||||
test('null lastSeen → Unknown', () => {
|
||||
const r = runHealthStatus(null, null);
|
||||
assert.strictEqual(r.cls, 'health-red');
|
||||
assert.strictEqual(r.label, 'Unknown');
|
||||
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user