Compare commits

..

1 Commits

Author SHA1 Message Date
you
cc92a8d5c4 fix: bimodal clock hysteresis — don't flip to 'No Clock' on transient bad bursts
Symptom: Kpa Roof Solar (and other historically-bimodal nodes) briefly
flip to '🚫 No Clock' when the last 5 adverts in the recent-1h window
all happen to have nonsense timestamps, even though the very next advert
decodes with a valid 2026 timestamp.

Root cause: the bimodal classifier from #845 looks at the last 5 samples
within the past hour. When a bimodal node hits a transient burst of bad
RTC samples, recent goodFraction = 0 and severity flips to no_clock
regardless of the long-term picture (16k+ samples, ~38% historically good).

Fix (option C from triage):
1. Widen the recent window: 5 → 20 samples, 1h → 6h time bound
   (more data, less jitter, still recent).
2. Add hysteresis: only drop to no_clock when BOTH recent goodFraction
   AND long-term goodFraction are < 10%. A node with historical good
   samples stays bimodal even when the recent window is 100% bad.
3. When recent has zero good samples, fall back to the long-term good
   median for the displayed skew so the operator sees a meaningful
   number instead of a stale, poisoned value.

API: add longTermGoodFraction to /api/nodes/{pk}/clock-skew so operators
can see the hysteresis input directly.

Tests:
- TestBimodalHysteresis: recent all-bad + long-term mixed → bimodal_clock
- TestNoClock_BothWindowsBad: recent all-bad + long-term all-bad → no_clock
- Updated TestSeverityUsesRecentNotMedian + TestReporterScenario_789 to
  match the new wider window and accept bimodal_clock for nodes with
  massive historical poison (#845's whole premise: bimodal deserves a flag,
  not OK status).
2026-04-22 15:29:40 +00:00
6 changed files with 213 additions and 70 deletions

View File

@@ -1 +1 @@
{"schemaVersion":1,"label":"e2e tests","message":"82 passed","color":"brightgreen"}
{"schemaVersion":1,"label":"e2e tests","message":"45 passed","color":"brightgreen"}

View File

@@ -1 +1 @@
{"schemaVersion":1,"label":"frontend coverage","message":"37.26%","color":"red"}
{"schemaVersion":1,"label":"frontend coverage","message":"39.68%","color":"red"}

View File

@@ -135,7 +135,7 @@ jobs:
e2e-test:
name: "🎭 Playwright E2E Tests"
needs: [go-test]
runs-on: ubuntu-latest
runs-on: [self-hosted, Linux]
defaults:
run:
shell: bash
@@ -145,6 +145,13 @@ jobs:
with:
fetch-depth: 0
- name: Free disk space
run: |
# Prune old runner diagnostic logs (can accumulate 50MB+)
find ~/actions-runner/_diag/ -name '*.log' -mtime +3 -delete 2>/dev/null || true
# Show available disk space
df -h / | tail -1
- name: Set up Node.js 22
uses: actions/setup-node@v5
with:
@@ -245,11 +252,17 @@ jobs:
build-and-publish:
name: "🏗️ Build & Publish Docker Image"
needs: [e2e-test]
runs-on: ubuntu-latest
runs-on: [self-hosted, meshcore-runner-2]
steps:
- name: Checkout code
uses: actions/checkout@v5
- name: Free disk space
run: |
docker system prune -af 2>/dev/null || true
docker builder prune -af 2>/dev/null || true
df -h /
- name: Compute build metadata
id: meta
run: |
@@ -359,7 +372,7 @@ jobs:
# ───────────────────────────────────────────────────────────────
deploy:
name: "🚀 Deploy Staging"
if: false # disabled: staging VM offline, manual deploy required
if: github.event_name == 'push'
needs: [build-and-publish]
runs-on: [self-hosted, meshcore-runner-2]
steps:
@@ -448,8 +461,8 @@ jobs:
publish:
name: "📝 Publish Badges & Summary"
if: github.event_name == 'push'
needs: [build-and-publish]
runs-on: ubuntu-latest
needs: [deploy]
runs-on: [self-hosted, Linux]
steps:
- name: Checkout code
uses: actions/checkout@v5

View File

@@ -40,12 +40,17 @@ const (
// issue #789). The all-time median is poisoned by historical bad
// samples (e.g. a node that was off and then GPS-corrected); severity
// must reflect current health, not lifetime statistics.
recentSkewWindowCount = 5
//
// Widened from 5 → 20 to add hysteresis: a brief burst of bad samples
// in a known-bimodal node should not flip its severity to "no_clock"
// (see classification rule below that also gates on long-term goodFraction).
recentSkewWindowCount = 20
// recentSkewWindowSec bounds the recent-window in time as well: only
// samples from the last N seconds count as "recent" for severity.
// The effective window is min(recentSkewWindowCount, samples in 1h).
recentSkewWindowSec = 3600
// The effective window is min(recentSkewWindowCount, samples in 6h).
// Widened from 1h → 6h to match the larger sample budget.
recentSkewWindowSec = 21600
// bimodalSkewThresholdSec is the absolute skew threshold (1 hour)
// above which a sample is considered "bad" — likely firmware emitting
@@ -118,6 +123,7 @@ type NodeClockSkew struct {
LastObservedTS int64 `json:"lastObservedTS"` // most recent observation timestamp
Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines
GoodFraction float64 `json:"goodFraction"` // fraction of recent samples with |skew| <= 1h
LongTermGoodFraction float64 `json:"longTermGoodFraction"` // fraction of ALL samples with |skew| <= 1h (hysteresis input)
RecentBadSampleCount int `json:"recentBadSampleCount"` // count of recent samples with |skew| > 1h
RecentSampleCount int `json:"recentSampleCount"` // total recent samples in window
NodeName string `json:"nodeName,omitempty"` // populated in fleet responses
@@ -502,13 +508,18 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
}
}
// ── Bimodal detection (#845) ─────────────────────────────────────────
// ── Bimodal detection (#845, hysteresis) ─────────────────────────────
// Split recent samples into "good" (|skew| <= 1h, real clock) and
// "bad" (|skew| > 1h, firmware nonsense from uninitialized RTC).
// Classification order (first match wins):
// no_clock — goodFraction < 0.10 (essentially no real clock)
// bimodal_clock — 0.10 <= goodFraction < 0.80 AND badCount > 0
// ok/warn/etc. — goodFraction >= 0.80 (normal, outliers filtered)
// no_clock — recent goodFraction < 0.10 AND long-term goodFraction < 0.10
// (the long-term gate is hysteresis: a bimodal node that
// hits a transient burst of bad samples must NOT flip
// to no_clock — it's still bimodal historically)
// bimodal_clock — recent goodFraction < 0.80 AND badCount > 0
// (also catches nodes where recent < 0.10 but long-term
// is healthier — i.e. flaky rather than dead)
// ok/warn/etc. — recent goodFraction >= 0.80 (normal, outliers filtered)
var goodSamples []float64
for _, v := range recentVals {
if math.Abs(v) <= bimodalSkewThresholdSec {
@@ -522,16 +533,42 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
goodFraction = float64(len(goodSamples)) / float64(recentSampleCount)
}
// Long-term goodFraction across ALL samples — used as hysteresis to
// prevent a recent burst of bad samples from flipping a bimodal node
// to no_clock. If a node has EVER had real-clock samples (>10% of all
// samples are good), it stays bimodal even when the recent window is
// 100% bad.
longTermGoodCount := 0
for _, p := range tsSkews {
if math.Abs(p.skew) <= bimodalSkewThresholdSec {
longTermGoodCount++
}
}
var longTermGoodFraction float64
if len(tsSkews) > 0 {
longTermGoodFraction = float64(longTermGoodCount) / float64(len(tsSkews))
}
var severity SkewSeverity
if goodFraction < 0.10 {
// Essentially no real clock — classify as no_clock regardless
// of the raw skew magnitude.
if goodFraction < 0.10 && longTermGoodFraction < 0.10 {
// Essentially no real clock — recent AND long-term agree.
severity = SkewNoClock
} else if goodFraction < 0.80 && recentBadCount > 0 {
// Bimodal: use median of GOOD samples as the "real" skew.
} else if goodFraction < 0.80 && (recentBadCount > 0 || longTermGoodFraction < 0.80) {
// Bimodal: recent window is mixed, OR recent is all-bad but the node
// has historical good samples (transient bad-burst on a flaky node).
// Use median of GOOD samples — prefer recent good if present, else
// fall back to long-term good median so the displayed skew is meaningful.
severity = SkewBimodalClock
if len(goodSamples) > 0 {
recentSkew = median(goodSamples)
} else if longTermGoodCount > 0 {
ltGood := make([]float64, 0, longTermGoodCount)
for _, p := range tsSkews {
if math.Abs(p.skew) <= bimodalSkewThresholdSec {
ltGood = append(ltGood, p.skew)
}
}
recentSkew = median(ltGood)
}
} else {
// Normal path: if there are good samples, use their median
@@ -572,6 +609,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
LastObservedTS: lastObsTS,
Samples: samples,
GoodFraction: round(goodFraction, 2),
LongTermGoodFraction: round(longTermGoodFraction, 2),
RecentBadSampleCount: recentBadCount,
RecentSampleCount: recentSampleCount,
}

View File

@@ -557,7 +557,8 @@ func TestSeverityUsesRecentNotMedian(t *testing.T) {
baseObs := int64(1700000000)
var txs []*StoreTx
for i := 0; i < 105; i++ {
// 100 bad samples then 25 good — recent window (20) is dominated by good.
for i := 0; i < 125; i++ {
obsTS := baseObs + int64(i)*300 // 5 min apart
var skew int64 = -60
if i >= 100 {
@@ -646,12 +647,13 @@ func TestReporterScenario_789(t *testing.T) {
baseObs := int64(1700000000)
var txs []*StoreTx
// 1657 samples with the bad ~-683-day skew (the historical poison),
// then 5 freshly corrected samples at -0.8s — totals 1662.
for i := 0; i < 1662; i++ {
// 1660 samples with the bad ~-683-day skew (the historical poison),
// then 20 freshly corrected samples at -0.8s — totals 1680.
// Need ≥20 corrected to fill the recent-window (recentSkewWindowCount=20).
for i := 0; i < 1680; i++ {
obsTS := baseObs + int64(i)*60 // 1 min apart
var skew int64
if i < 1657 {
if i < 1660 {
skew = -59063561 // ~ -683 days
} else {
skew = -1 // corrected (rounded; reporter saw -0.8)
@@ -680,8 +682,12 @@ func TestReporterScenario_789(t *testing.T) {
t.Fatal("nil result")
}
// Severity must reflect current health, not the all-time median.
if r.Severity != SkewOK && r.Severity != SkewWarning {
t.Errorf("severity = %v, want ok/warning (recent samples are healthy)", r.Severity)
// Post-#845 + hysteresis: a node with massive historical bad samples
// is correctly flagged bimodal_clock even when recent window is clean,
// because operators need to know the RTC is flaky. SkewOK only when
// long-term ALSO looks healthy.
if r.Severity != SkewOK && r.Severity != SkewWarning && r.Severity != SkewBimodalClock {
t.Errorf("severity = %v, want ok/warning/bimodal_clock (recent samples are healthy)", r.Severity)
}
if math.Abs(r.RecentMedianSkewSec) > 5 {
t.Errorf("recentMedianSkewSec = %v, want near 0", r.RecentMedianSkewSec)
@@ -954,3 +960,116 @@ func TestAllGood_OK_845(t *testing.T) {
t.Errorf("recentBadSampleCount = %v, want 0", r.RecentBadSampleCount)
}
}
// TestBimodalHysteresis: a node with mostly good long-term samples but a
// recent burst of all-bad samples must stay bimodal_clock, NOT flip to
// no_clock. This is the "Kpa Roof Solar" scenario seen on staging
// (2026-04-22): historically bimodal node hits a transient all-bad burst
// and the operator briefly sees "🚫 No Clock" even though the most recent
// real advert decoded with a valid 2026 timestamp.
func TestBimodalHysteresis(t *testing.T) {
ps := NewPacketStore(nil, nil)
pt := 4
baseObs := int64(1700000000)
var txs []*StoreTx
// 80 historical samples: 50% good (-2s), 50% bad (-58M sec ≈ -1.8yr)
for i := 0; i < 80; i++ {
obsTS := baseObs + int64(i)*60
var skew int64 = -2
if i%2 == 0 {
skew = -58000000
}
tx := &StoreTx{
Hash: fmt.Sprintf("hist-%04d", i),
PayloadType: &pt,
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS+skew) + `}}`,
Observations: []*StoreObs{
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
},
}
txs = append(txs, tx)
}
// 25 recent samples ALL bad — fills the recent window (size 20) entirely
// with bad samples. recent goodFraction = 0.
for i := 80; i < 105; i++ {
obsTS := baseObs + int64(i)*60
tx := &StoreTx{
Hash: fmt.Sprintf("badburst-%04d", i),
PayloadType: &pt,
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
Observations: []*StoreObs{
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
},
}
txs = append(txs, tx)
}
ps.mu.Lock()
ps.byNode["BIHYST"] = txs
for _, tx := range txs {
ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
}
ps.clockSkew.computeInterval = 0
ps.mu.Unlock()
r := ps.GetNodeClockSkew("BIHYST")
if r == nil {
t.Fatal("nil result")
}
// Without hysteresis: severity would be no_clock (recent goodFraction=0).
// With hysteresis: long-term goodFraction ≈ 0.38 ≥ 0.10, so stays bimodal.
if r.Severity != SkewBimodalClock {
t.Errorf("severity = %v, want bimodal_clock (long-term has good samples)", r.Severity)
}
if r.GoodFraction != 0 {
t.Errorf("recent goodFraction = %v, want 0 (bad burst)", r.GoodFraction)
}
if r.LongTermGoodFraction < 0.10 {
t.Errorf("longTermGoodFraction = %v, want >= 0.10", r.LongTermGoodFraction)
}
// Displayed skew should be the long-term good median (-2s), not the
// nonsense bad value, so the operator sees a meaningful number.
if r.RecentMedianSkewSec < -10 || r.RecentMedianSkewSec > 10 {
t.Errorf("recentMedianSkewSec = %v, want near -2 (long-term good median fallback)", r.RecentMedianSkewSec)
}
}
// TestNoClock_BothWindowsBad: the inverse of TestBimodalHysteresis. When
// BOTH the recent window and the long-term goodFraction are essentially 0,
// the node is genuinely no_clock (uninitialized RTC throughout).
func TestNoClock_BothWindowsBad(t *testing.T) {
ps := NewPacketStore(nil, nil)
pt := 4
baseObs := int64(1700000000)
var txs []*StoreTx
// 50 samples — all bad.
for i := 0; i < 50; i++ {
obsTS := baseObs + int64(i)*60
tx := &StoreTx{
Hash: fmt.Sprintf("dead-%04d", i),
PayloadType: &pt,
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
Observations: []*StoreObs{
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
},
}
txs = append(txs, tx)
}
ps.mu.Lock()
ps.byNode["DEADCLOCK"] = txs
for _, tx := range txs {
ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
}
ps.clockSkew.computeInterval = 0
ps.mu.Unlock()
r := ps.GetNodeClockSkew("DEADCLOCK")
if r == nil {
t.Fatal("nil result")
}
if r.Severity != SkewNoClock {
t.Errorf("severity = %v, want no_clock", r.Severity)
}
if r.LongTermGoodFraction != 0 {
t.Errorf("longTermGoodFraction = %v, want 0", r.LongTermGoodFraction)
}
}

View File

@@ -393,25 +393,17 @@
}
}
// Merge user-stored keys into the channel list.
// If a stored key matches a server-known channel, mark that channel as
// userAdded so the ✕ button appears — otherwise the user has no way to
// remove a key they added but that the server already knows about.
// Merge user-stored keys into the channel list
function mergeUserChannels() {
var keys = ChannelDecrypt.getStoredKeys();
var names = Object.keys(keys);
for (var i = 0; i < names.length; i++) {
var name = names[i];
var matched = false;
for (var j = 0; j < channels.length; j++) {
var ch = channels[j];
if (ch.name === name || ch.hash === name || ch.hash === ('user:' + name)) {
ch.userAdded = true;
matched = true;
break;
}
}
if (!matched) {
// Check if channel already exists by name
var exists = channels.some(function (ch) {
return ch.name === name || ch.hash === name || ch.hash === ('user:' + name);
});
if (!exists) {
channels.push({
hash: 'user:' + name,
name: name,
@@ -757,38 +749,19 @@
e.stopPropagation();
var channelHash = removeBtn.getAttribute('data-remove-channel');
if (!channelHash) return;
// The localStorage key is the channel name. For user:-prefixed entries
// strip the prefix; for server-known channels look up the channel
// object so we use its display name (the hash itself isn't the key).
var ch = channels.find(function (c) { return c.hash === channelHash; });
var chName = channelHash.startsWith('user:')
? channelHash.substring(5)
: (ch && ch.name) || channelHash;
var chName = channelHash.startsWith('user:') ? channelHash.substring(5) : channelHash;
if (!confirm('Remove channel "' + chName + '"? This will clear saved keys and cached messages.')) return;
ChannelDecrypt.removeKey(chName);
if (channelHash.startsWith('user:')) {
// Pure user-added channel — drop from the list entirely.
channels = channels.filter(function (c) { return c.hash !== channelHash; });
if (selectedHash === channelHash) {
selectedHash = null;
messages = [];
history.replaceState(null, '', '#/channels');
var msgEl2 = document.getElementById('chMessages');
if (msgEl2) msgEl2.innerHTML = '<div class="ch-empty">Choose a channel from the sidebar to view messages</div>';
var header2 = document.getElementById('chHeader');
if (header2) header2.querySelector('.ch-header-text').textContent = 'Select a channel';
}
} else if (ch) {
// Server-known channel: keep the row, just unmark as user-added so
// the ✕ disappears until they re-add a key.
ch.userAdded = false;
// If this was the selected channel, clear decrypted messages since
// the key is gone — they can't be re-decrypted without re-adding it.
if (selectedHash === channelHash) {
messages = [];
var msgEl2 = document.getElementById('chMessages');
if (msgEl2) msgEl2.innerHTML = '<div class="ch-empty">Key removed — add a key to decrypt messages</div>';
}
// Remove from channels array
channels = channels.filter(function (c) { return c.hash !== channelHash; });
if (selectedHash === channelHash) {
selectedHash = null;
messages = [];
history.replaceState(null, '', '#/channels');
var msgEl2 = document.getElementById('chMessages');
if (msgEl2) msgEl2.innerHTML = '<div class="ch-empty">Choose a channel from the sidebar to view messages</div>';
var header2 = document.getElementById('chHeader');
if (header2) header2.querySelector('.ch-header-text').textContent = 'Select a channel';
}
renderChannelList();
return;