feat(config): make observer health thresholds configurable (closes #1552) (#1556)

Closes #1552.

## What

Make the observer `Online` / `Stale` / `Offline` thresholds
operator-configurable via the existing `healthThresholds` block in
`config.json` — and **raise the defaults** from 10 min / 60 min to
**60 min / 1440 min (1 h / 24 h)** so they match the node thresholds and
observer status stops flapping out of the box.

⚠️ **This is a default behavior change.** Operators who want the old
aggressive 10-min Online threshold must opt in via:

```json
"healthThresholds": { "observerOnlineMinutes": 10 }
```

## Why

Per #1552: the `600000` / `3600000` constants in `public/observers.js`
were not tunable, *and* 10 min is wrong as a default. Wide-geo,
low-traffic meshes legitimately see observers go quiet for >10 min
between reports, and operators behind a CDN (#1551) get cached
`last_seen` values that can push the observer 15+ min behind reality —
guaranteeing flap at the 10-min threshold. The meshat.se operator (43
observers, v3.8.3) reports exactly this pattern.

Defaults raised from 10 / 60 minutes to 60 / 1440 minutes (1 h / 24 h)
to match the node thresholds for consistency and eliminate flap on
low-traffic / CDN-fronted instances. Operators wanting the old 10-min
Online behavior can set `observerOnlineMinutes: 10` in config.

## Changes

Backend (`cmd/server/config.go`):
- `HealthThresholds` gains `ObserverOnlineMinutes` /
`ObserverStaleMinutes` (int).
- `GetHealthThresholds()` defaults to **60 / 1440** when zero/absent.
- `ToClientMs()` emits `observerOnlineMs` / `observerStaleMs`, picked up
by the existing `/api/config-public` → `roles.js`
`Object.assign(HEALTH_THRESHOLDS, …)` pipeline.

`config.example.json`: new `observerOnlineMinutes` /
`observerStaleMinutes` keys (60 / 1440) + `_comment_observerThresholds`
explaining the rationale and opt-out.

Frontend:
- `public/observers.js` `healthStatus()` — reads
`window.HEALTH_THRESHOLDS.observerOnlineMs` / `observerStaleMs` and falls
back to **3600000 / 86400000** ms (the new Go defaults) when those values
are absent, e.g. before `/api/config-public` has been applied.
- `public/observer-detail.js` — same refactor (was previously hardcoded
`600000` + misusing `nodeDegradedMs` for the Stale boundary).

## Backward compat

- API shape: unchanged — only adds two optional keys.
- Config: unchanged keys / no renames.
- Default behavior: **changed** — operators relying on the implicit
10/60 must opt in (one config line).

## TDD

- RED 1 (`ee19058f`): assertions on the new fields + `ToClientMs` keys +
`healthStatus` reading from `window.HEALTH_THRESHOLDS`. CI:
[failure](https://github.com/Kpa-clawbot/CoreScope/actions/runs/26945264822).
- GREEN 1 (`30cfbf7a`): configurability landed (defaults still old
10/60). CI:
[success](https://github.com/Kpa-clawbot/CoreScope/actions/runs/26945220598).
- RED 2 (`2649cf35`): pin new 60/1440 defaults — empty-config Go path +
JS `healthStatus` with no `HEALTH_THRESHOLDS`. CI must fail.
- GREEN 2 (`5ef85bca`): bump Go defaults to 60/1440, JS fallbacks to
3600000/86400000, `config.example.json` updated. CI must pass.

## Preflight

Clean (exit 0). `cross-stack` ack in commit messages — single feature
spans Go + JSON + JS readers.

## Not in scope

- Customizer UI for editing the thresholds (config-only per issue).
- Node/infra thresholds (unchanged).
- The deeper observer-flap root cause (#1551 cache-control is a separate
PR in flight).

---------

Co-authored-by: corescope-bot <bot@corescope>
Co-authored-by: mc-bot <bot@meshcore.local>
This commit is contained in:
Kpa-clawbot
2026-06-04 03:56:48 -07:00
committed by GitHub
parent b23640cd69
commit 65bd954b17
6 changed files with 174 additions and 10 deletions
+23 -4
View File
@@ -325,6 +325,10 @@ type HealthThresholds struct {
// repeater to be considered "actively relaying" vs only "alive
// (advert-only)". See issue #662. Defaults to 24h.
RelayActiveHours float64 `json:"relayActiveHours"`
// Issue #1552 — observer health classification thresholds (minutes).
// Defaults are 60 / 1440, applied in GetHealthThresholds; public/observers.js
// previously hardcoded 10 / 60.
ObserverOnlineMinutes int `json:"observerOnlineMinutes"`
ObserverStaleMinutes int `json:"observerStaleMinutes"`
}
// ThemeFile mirrors theme.json overlay.
@@ -415,6 +419,18 @@ func (c *Config) GetHealthThresholds() HealthThresholds {
if c.HealthThresholds.RelayActiveHours > 0 {
h.RelayActiveHours = c.HealthThresholds.RelayActiveHours
}
if c.HealthThresholds.ObserverOnlineMinutes > 0 {
h.ObserverOnlineMinutes = c.HealthThresholds.ObserverOnlineMinutes
}
if c.HealthThresholds.ObserverStaleMinutes > 0 {
h.ObserverStaleMinutes = c.HealthThresholds.ObserverStaleMinutes
}
}
if h.ObserverOnlineMinutes <= 0 {
h.ObserverOnlineMinutes = 60
}
if h.ObserverStaleMinutes <= 0 {
h.ObserverStaleMinutes = 1440
}
return h
}
@@ -431,11 +447,14 @@ func (h HealthThresholds) GetHealthMs(role string) (degradedMs, silentMs int) {
// ToClientMs returns the thresholds as ms for the frontend.
//
// The hour-based infra/node thresholds and the minute-based observer
// thresholds (issue #1552) are each converted to integer milliseconds and
// returned under the "<name>Ms" keys listed below. Every key is emitted
// exactly once.
func (h HealthThresholds) ToClientMs() map[string]int {
	const (
		hourMs = 3600000
		minMs  = 60000
	)
	return map[string]int{
		// Hour-valued thresholds: scale first, then truncate to whole ms.
		"infraDegradedMs": int(h.InfraDegradedHours * hourMs),
		"infraSilentMs":   int(h.InfraSilentHours * hourMs),
		"nodeDegradedMs":  int(h.NodeDegradedHours * hourMs),
		"nodeSilentMs":    int(h.NodeSilentHours * hourMs),
		// Minute-valued observer thresholds are already integers.
		"observerOnlineMs": h.ObserverOnlineMinutes * minMs,
		"observerStaleMs":  h.ObserverStaleMinutes * minMs,
	}
}
+68
View File
@@ -387,3 +387,71 @@ func TestObserverDaysOrDefault(t *testing.T) {
})
}
}
// Issue #1552 — observer health thresholds configurable.
//
// TestObserverThresholdsOverride writes a config.json whose healthThresholds
// block sets observerOnlineMinutes / observerStaleMinutes, loads it, and
// checks that GetHealthThresholds returns those values and that ToClientMs
// reports them in milliseconds.
func TestObserverThresholdsOverride(t *testing.T) {
	dir := t.TempDir()
	cfgData := map[string]interface{}{
		"healthThresholds": map[string]interface{}{
			"observerOnlineMinutes": 30,
			"observerStaleMinutes":  120,
		},
	}
	// Fail loudly on fixture errors so they are not mistaken for a
	// LoadConfig or threshold regression.
	data, err := json.Marshal(cfgData)
	if err != nil {
		t.Fatalf("marshal config fixture: %v", err)
	}
	if err := os.WriteFile(filepath.Join(dir, "config.json"), data, 0644); err != nil {
		t.Fatalf("write config.json: %v", err)
	}

	cfg, err := LoadConfig(dir)
	if err != nil {
		t.Fatal(err)
	}
	h := cfg.GetHealthThresholds()
	if h.ObserverOnlineMinutes != 30 {
		t.Errorf("ObserverOnlineMinutes = %d, want 30", h.ObserverOnlineMinutes)
	}
	if h.ObserverStaleMinutes != 120 {
		t.Errorf("ObserverStaleMinutes = %d, want 120", h.ObserverStaleMinutes)
	}

	m := h.ToClientMs()
	if m["observerOnlineMs"] != 30*60*1000 {
		t.Errorf("observerOnlineMs = %d, want %d", m["observerOnlineMs"], 30*60*1000)
	}
	if m["observerStaleMs"] != 120*60*1000 {
		t.Errorf("observerStaleMs = %d, want %d", m["observerStaleMs"], 120*60*1000)
	}
}
// TestObserverThresholdsDefaults checks that a zero-value Config reports the
// 60 / 1440 minute observer defaults and that ToClientMs exposes them as
// 3600000 / 86400000 ms.
func TestObserverThresholdsDefaults(t *testing.T) {
	thresholds := (&Config{}).GetHealthThresholds()

	if got := thresholds.ObserverOnlineMinutes; got != 60 {
		t.Errorf("default ObserverOnlineMinutes = %d, want 60", got)
	}
	if got := thresholds.ObserverStaleMinutes; got != 1440 {
		t.Errorf("default ObserverStaleMinutes = %d, want 1440", got)
	}

	clientMs := thresholds.ToClientMs()
	if got := clientMs["observerOnlineMs"]; got != 3600000 {
		t.Errorf("default observerOnlineMs = %d, want 3600000", got)
	}
	if got := clientMs["observerStaleMs"]; got != 86400000 {
		t.Errorf("default observerStaleMs = %d, want 86400000", got)
	}
}
// Loading a config with no healthThresholds block at all must still produce
// the new 60 / 1440 defaults (not zero, not the old 10 / 60).
func TestObserverThresholdsDefaultsFromEmptyConfigFile(t *testing.T) {
	dir := t.TempDir()
	// A failed fixture write must abort here rather than surface later as a
	// confusing LoadConfig or assertion failure.
	if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{"port": 3000}`), 0644); err != nil {
		t.Fatalf("write config.json: %v", err)
	}

	cfg, err := LoadConfig(dir)
	if err != nil {
		t.Fatal(err)
	}
	h := cfg.GetHealthThresholds()
	if h.ObserverOnlineMinutes != 60 {
		t.Errorf("empty-config ObserverOnlineMinutes = %d, want 60 (new default)", h.ObserverOnlineMinutes)
	}
	if h.ObserverStaleMinutes != 1440 {
		t.Errorf("empty-config ObserverStaleMinutes = %d, want 1440 (new default)", h.ObserverStaleMinutes)
	}
}
+4 -1
View File
@@ -177,7 +177,10 @@
"nodeDegradedHours": 1,
"nodeSilentHours": 24,
"relayActiveHours": 24,
"_comment": "How long (hours) before nodes show as degraded/silent. 'infra' = repeaters & rooms, 'node' = companions & others. relayActiveHours: a repeater is shown as 'actively relaying' if its pubkey appeared as a path hop in a non-advert packet within this window (issue #662)."
"observerOnlineMinutes": 60,
"observerStaleMinutes": 1440,
"_comment": "How long (hours) before nodes show as degraded/silent. 'infra' = repeaters & rooms, 'node' = companions & others. relayActiveHours: a repeater is shown as 'actively relaying' if its pubkey appeared as a path hop in a non-advert packet within this window (issue #662).",
"_comment_observerThresholds": "Observer health classification. Online: last_seen < observerOnlineMinutes ago. Stale: between Online and observerStaleMinutes. Offline: beyond observerStaleMinutes. Defaults 60 / 1440 (1h / 24h) match the node thresholds for consistency and eliminate flap on low-traffic / CDN-fronted instances (#1552). Operators who want the old aggressive 10-min Online threshold can set observerOnlineMinutes: 10."
},
"defaultRegion": "SJC",
"mapDefaults": {
+7 -3
View File
@@ -164,10 +164,14 @@ window.ObserverDetailNaiveBanner = {
+ ' · BW' + escapeHtml(rp[1] || '?') + ' · CR' + escapeHtml(rp[3] || '?');
}
// Health status
// Health status — Issue #1552: thresholds are operator-configurable via
// window.HEALTH_THRESHOLDS.observerOnlineMs / observerStaleMs (defaults
// 60 min / 1440 min (24h), matching node thresholds — #1552).
const ago = obs.last_seen ? Date.now() - new Date(obs.last_seen).getTime() : Infinity;
const statusCls = ago < 600000 ? 'health-green' : ago < HEALTH_THRESHOLDS.nodeDegradedMs ? 'health-yellow' : 'health-red';
const statusLabel = ago < 600000 ? 'Online' : ago < HEALTH_THRESHOLDS.nodeDegradedMs ? 'Stale' : 'Offline';
const _obsOnlineMs = (HEALTH_THRESHOLDS && HEALTH_THRESHOLDS.observerOnlineMs) || 3600000;
const _obsStaleMs = (HEALTH_THRESHOLDS && HEALTH_THRESHOLDS.observerStaleMs) || 86400000;
const statusCls = ago < _obsOnlineMs ? 'health-green' : ago < _obsStaleMs ? 'health-yellow' : 'health-red';
const statusLabel = ago < _obsOnlineMs ? 'Online' : ago < _obsStaleMs ? 'Stale' : 'Offline';
el.innerHTML = `
${window.ObserverDetailNaiveBanner.render(obs)}
+11 -2
View File
@@ -122,10 +122,19 @@ window.ObserversNaiveChip = {
if (!lastSeen) return { cls: 'health-red', label: 'Unknown' };
const ago = Date.now() - new Date(lastSeen).getTime();
const tolerance = 30000; // 30s tolerance for clock skew
if (ago < 600000 + tolerance) return { cls: 'health-green', label: 'Online' }; // < 10 min + tolerance
if (ago < 3600000 + tolerance) return { cls: 'health-yellow', label: 'Stale' }; // < 1 hour + tolerance
// Issue #1552 — thresholds are operator-configurable via config.json
// healthThresholds.observerOnlineMinutes / observerStaleMinutes, surfaced
// to the client through window.HEALTH_THRESHOLDS. Defaults are 60 min
// Online / 1440 min (24h) Stale, matching node thresholds (#1552).
const th = (typeof window !== 'undefined' && window.HEALTH_THRESHOLDS) || {};
const onlineMs = th.observerOnlineMs || 3600000;
const staleMs = th.observerStaleMs || 86400000;
if (ago < onlineMs + tolerance) return { cls: 'health-green', label: 'Online' };
if (ago < staleMs + tolerance) return { cls: 'health-yellow', label: 'Stale' };
return { cls: 'health-red', label: 'Offline' };
}
// Issue #1552 — exposed for tests and external callers.
window.observerHealthStatus = healthStatus;
function packetBadge(o) {
if (!o.last_packet_at) return '<span title="No packets ever observed">📡⚠ never</span>';
+61
View File
@@ -6387,3 +6387,64 @@ Promise.allSettled(pendingTests).then(() => {
console.error('Failed waiting for async tests:', e);
process.exit(1);
});
// ===== observers.js: healthStatus (#1552) =====
console.log('\n=== observers.js: healthStatus (configurable thresholds) ===');
{
  // observers.js is an IIFE that depends on many page globals, so rather than
  // loading the file we lift only the healthStatus function source out of it
  // and evaluate that in a throwaway vm context.
  const observersSrc = fs.readFileSync('public/observers.js', 'utf8');
  const found = observersSrc.match(/function healthStatus\s*\([^)]*\)\s*\{[\s\S]*?\n \}/);
  if (!found) throw new Error('healthStatus not found in public/observers.js');
  const healthStatusSrc = found[0];

  // Runs healthStatus(lastSeen) in a fresh sandbox. When healthThresholds is
  // truthy it is installed as window.HEALTH_THRESHOLDS; otherwise window is
  // left empty.
  function runHealthStatus(lastSeen, healthThresholds) {
    const sandbox = { window: {}, Date };
    if (healthThresholds) sandbox.window.HEALTH_THRESHOLDS = healthThresholds;
    vm.createContext(sandbox);
    sandbox.lastSeen = lastSeen;
    sandbox.result = null;
    vm.runInContext(healthStatusSrc + '\n; result = healthStatus(lastSeen);', sandbox);
    return sandbox.result;
  }

  const MINUTE_MS = 60 * 1000;
  const HOUR_MS = 60 * MINUTE_MS;
  // ISO timestamp for "ageMs milliseconds before now", computed at call time.
  const isoAgo = (ageMs) => new Date(Date.now() - ageMs).toISOString();
  // Fresh 30 min Online / 120 min Stale override object for each use.
  const override = () => ({ observerOnlineMs: 30 * MINUTE_MS, observerStaleMs: 120 * MINUTE_MS });

  // One row per scenario; thresholds === null means "no HEALTH_THRESHOLDS".
  const scenarios = [
    {
      name: 'observer 20min old with 30min override → Online',
      ageMs: 20 * MINUTE_MS, thresholds: override,
      cls: 'health-green', msg: 'expected Online with 30min override, got ',
    },
    {
      name: 'observer 20min old with default thresholds → Online (new 1h default, #1552)',
      ageMs: 20 * MINUTE_MS, thresholds: null,
      cls: 'health-green', msg: 'expected Online with 1h default, got ',
    },
    {
      name: 'observer 30min old with NO HEALTH_THRESHOLDS → Online (#1552 default raised to 1h)',
      ageMs: 30 * MINUTE_MS, thresholds: null,
      cls: 'health-green', msg: 'expected Online with new 1h default, got ',
    },
    {
      name: 'observer 2h old with default thresholds → Stale (new 24h stale default, #1552)',
      ageMs: 2 * HOUR_MS, thresholds: null,
      cls: 'health-yellow', msg: 'expected Stale with 24h default, got ',
    },
    {
      name: 'observer 25h old with default thresholds → Offline (>24h, #1552)',
      ageMs: 25 * HOUR_MS, thresholds: null,
      cls: 'health-red', msg: 'expected Offline beyond 24h stale default, got ',
    },
    {
      name: 'observer 90min old with 2h stale override → Stale',
      ageMs: 90 * MINUTE_MS, thresholds: override,
      cls: 'health-yellow', msg: 'expected Stale (90min < 120min stale), got ',
    },
  ];

  for (const sc of scenarios) {
    test(sc.name, () => {
      const ts = isoAgo(sc.ageMs);
      const r = runHealthStatus(ts, sc.thresholds ? sc.thresholds() : null);
      assert.strictEqual(r.cls, sc.cls, sc.msg + JSON.stringify(r));
    });
  }

  test('null lastSeen → Unknown', () => {
    const r = runHealthStatus(null, null);
    assert.strictEqual(r.cls, 'health-red');
    assert.strictEqual(r.label, 'Unknown');
  });
}