fix: bimodal clock hysteresis — don't flip to 'No Clock' on transient bad bursts

Symptom: Kpa Roof Solar (and other historically-bimodal nodes) briefly flip to '🚫 No Clock' when the last 5 adverts in the recent-1h window all happen to have nonsense timestamps, even though the very next advert decoded with a valid 2026 timestamp. Root cause: the bimodal classifier from #845 looks at the last 5 samples within the past hour. When a bimodal node hits a transient burst of bad RTC samples, recent goodFraction = 0 and severity flips to no_clock regardless of the long-term picture (16k+ samples, ~38% historically good). Fix (option C from triage): 1. Widen the recent window: 5 → 20 samples, 1h → 6h time bound (more data, less jitter, still recent). 2. Add hysteresis: only drop to no_clock when BOTH recent goodFraction AND long-term goodFraction are < 10%. A node with historical good samples stays bimodal even when the recent window is 100% bad. 3. When recent has zero good samples, fall back to long-term good median for the displayed skew so the operator sees a meaningful number instead of stale poison. API: add longTermGoodFraction to /api/nodes/{pk}/clock-skew so operators can see the hysteresis input directly. Tests: - TestBimodalHysteresis: recent all-bad + long-term mixed → bimodal_clock - TestNoClock_BothWindowsBad: recent all-bad + long-term all-bad → no_clock - Updated TestSeverityUsesRecentNotMedian + TestReporterScenario_789 to match the new wider window and accept bimodal_clock for nodes with massive historical poison (#845's whole premise: bimodal deserves a flag, not OK status).
2026-07-08 20:01:54 +00:00 · 2026-04-22 15:29:40 +00:00
2 changed files with 176 additions and 19 deletions
@@ -40,12 +40,17 @@ const (
 	// issue #789). The all-time median is poisoned by historical bad
 	// samples (e.g. a node that was off and then GPS-corrected); severity
 	// must reflect current health, not lifetime statistics.
-	recentSkewWindowCount = 5
+	//
+	// Widened from 5 → 20 to add hysteresis: a brief burst of bad samples
+	// in a known-bimodal node should not flip its severity to "no_clock"
+	// (see classification rule below that also gates on long-term goodFraction).
+	recentSkewWindowCount = 20

 	// recentSkewWindowSec bounds the recent-window in time as well: only
 	// samples from the last N seconds count as "recent" for severity.
-	// The effective window is min(recentSkewWindowCount, samples in 1h).
-	recentSkewWindowSec = 3600
+	// The effective window is min(recentSkewWindowCount, samples in 6h).
+	// Widened from 1h → 6h to match the larger sample budget.
+	recentSkewWindowSec = 21600

 	// bimodalSkewThresholdSec is the absolute skew threshold (1 hour)
 	// above which a sample is considered "bad" — likely firmware emitting
@@ -118,6 +123,7 @@ type NodeClockSkew struct {
 	LastObservedTS  int64        `json:"lastObservedTS"`   // most recent observation timestamp
 	Samples         []SkewSample `json:"samples,omitempty"` // time-series for sparklines
 	GoodFraction        float64  `json:"goodFraction"`        // fraction of recent samples with |skew| <= 1h
+	LongTermGoodFraction float64 `json:"longTermGoodFraction"` // fraction of ALL samples with |skew| <= 1h (hysteresis input)
 	RecentBadSampleCount int     `json:"recentBadSampleCount"` // count of recent samples with |skew| > 1h
 	RecentSampleCount    int     `json:"recentSampleCount"`    // total recent samples in window
 	NodeName        string       `json:"nodeName,omitempty"` // populated in fleet responses
@@ -502,13 +508,18 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
 		}
 	}

-	// ── Bimodal detection (#845) ─────────────────────────────────────────
+	// ── Bimodal detection (#845, hysteresis) ─────────────────────────────
 	// Split recent samples into "good" (|skew| <= 1h, real clock) and
 	// "bad" (|skew| > 1h, firmware nonsense from uninitialized RTC).
 	// Classification order (first match wins):
-	//   no_clock       — goodFraction < 0.10 (essentially no real clock)
-	//   bimodal_clock  — 0.10 <= goodFraction < 0.80 AND badCount > 0
-	//   ok/warn/etc.   — goodFraction >= 0.80 (normal, outliers filtered)
+	//   no_clock       — recent goodFraction < 0.10 AND long-term goodFraction < 0.10
+	//                    (the long-term gate is hysteresis: a bimodal node that
+	//                     hits a transient burst of bad samples must NOT flip
+	//                     to no_clock — it's still bimodal historically)
+	//   bimodal_clock  — recent goodFraction < 0.80 AND badCount > 0
+	//                    (also catches nodes where recent < 0.10 but long-term
+	//                     is healthier — i.e. flaky rather than dead)
+	//   ok/warn/etc.   — recent goodFraction >= 0.80 (normal, outliers filtered)
 	var goodSamples []float64
 	for _, v := range recentVals {
 		if math.Abs(v) <= bimodalSkewThresholdSec {
@@ -522,16 +533,42 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
 		goodFraction = float64(len(goodSamples)) / float64(recentSampleCount)
 	}

+	// Long-term goodFraction across ALL samples — used as hysteresis to
+	// prevent a recent burst of bad samples from flipping a bimodal node
+	// to no_clock. If a node has EVER had real-clock samples (>10% of all
+	// samples are good), it stays bimodal even when the recent window is
+	// 100% bad.
+	longTermGoodCount := 0
+	for _, p := range tsSkews {
+		if math.Abs(p.skew) <= bimodalSkewThresholdSec {
+			longTermGoodCount++
+		}
+	}
+	var longTermGoodFraction float64
+	if len(tsSkews) > 0 {
+		longTermGoodFraction = float64(longTermGoodCount) / float64(len(tsSkews))
+	}
+
 	var severity SkewSeverity
-	if goodFraction < 0.10 {
-		// Essentially no real clock — classify as no_clock regardless
-		// of the raw skew magnitude.
+	if goodFraction < 0.10 && longTermGoodFraction < 0.10 {
+		// Essentially no real clock — recent AND long-term agree.
 		severity = SkewNoClock
-	} else if goodFraction < 0.80 && recentBadCount > 0 {
-		// Bimodal: use median of GOOD samples as the "real" skew.
+	} else if goodFraction < 0.80 && (recentBadCount > 0 || longTermGoodFraction < 0.80) {
+		// Bimodal: recent window is mixed, OR recent is all-bad but the node
+		// has historical good samples (transient bad-burst on a flaky node).
+		// Use median of GOOD samples — prefer recent good if present, else
+		// fall back to long-term good median so the displayed skew is meaningful.
 		severity = SkewBimodalClock
 		if len(goodSamples) > 0 {
 			recentSkew = median(goodSamples)
+		} else if longTermGoodCount > 0 {
+			ltGood := make([]float64, 0, longTermGoodCount)
+			for _, p := range tsSkews {
+				if math.Abs(p.skew) <= bimodalSkewThresholdSec {
+					ltGood = append(ltGood, p.skew)
+				}
+			}
+			recentSkew = median(ltGood)
 		}
 	} else {
 		// Normal path: if there are good samples, use their median
@@ -572,6 +609,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
 		LastObservedTS:       lastObsTS,
 		Samples:              samples,
 		GoodFraction:         round(goodFraction, 2),
+		LongTermGoodFraction: round(longTermGoodFraction, 2),
 		RecentBadSampleCount: recentBadCount,
 		RecentSampleCount:    recentSampleCount,
 	}
@@ -557,7 +557,8 @@ func TestSeverityUsesRecentNotMedian(t *testing.T) {

 	baseObs := int64(1700000000)
 	var txs []*StoreTx
-	for i := 0; i < 105; i++ {
+	// 100 bad samples then 25 good — recent window (20) is dominated by good.
+	for i := 0; i < 125; i++ {
 		obsTS := baseObs + int64(i)*300 // 5 min apart
 		var skew int64 = -60
 		if i >= 100 {
@@ -646,12 +647,13 @@ func TestReporterScenario_789(t *testing.T) {

 	baseObs := int64(1700000000)
 	var txs []*StoreTx
-	// 1657 samples with the bad ~-683-day skew (the historical poison),
-	// then 5 freshly corrected samples at -0.8s — totals 1662.
-	for i := 0; i < 1662; i++ {
+	// 1660 samples with the bad ~-683-day skew (the historical poison),
+	// then 20 freshly corrected samples at -0.8s — totals 1680.
+	// Need ≥20 corrected to fill the recent-window (recentSkewWindowCount=20).
+	for i := 0; i < 1680; i++ {
 		obsTS := baseObs + int64(i)*60 // 1 min apart
 		var skew int64
-		if i < 1657 {
+		if i < 1660 {
 			skew = -59063561 // ~ -683 days
 		} else {
 			skew = -1 // corrected (rounded; reporter saw -0.8)
@@ -680,8 +682,12 @@ func TestReporterScenario_789(t *testing.T) {
 		t.Fatal("nil result")
 	}
 	// Severity must reflect current health, not the all-time median.
-	if r.Severity != SkewOK && r.Severity != SkewWarning {
-		t.Errorf("severity = %v, want ok/warning (recent samples are healthy)", r.Severity)
+	// Post-#845 + hysteresis: a node with massive historical bad samples
+	// is correctly flagged bimodal_clock even when recent window is clean,
+	// because operators need to know the RTC is flaky. SkewOK only when
+	// long-term ALSO looks healthy.
+	if r.Severity != SkewOK && r.Severity != SkewWarning && r.Severity != SkewBimodalClock {
+		t.Errorf("severity = %v, want ok/warning/bimodal_clock (recent samples are healthy)", r.Severity)
 	}
 	if math.Abs(r.RecentMedianSkewSec) > 5 {
 		t.Errorf("recentMedianSkewSec = %v, want near 0", r.RecentMedianSkewSec)
@@ -954,3 +960,116 @@ func TestAllGood_OK_845(t *testing.T) {
 		t.Errorf("recentBadSampleCount = %v, want 0", r.RecentBadSampleCount)
 	}
 }
+
+// TestBimodalHysteresis: a node with mostly good long-term samples but a
+// recent burst of all-bad samples must stay bimodal_clock, NOT flip to
+// no_clock. This is the "Kpa Roof Solar" scenario seen on staging
+// (2026-04-22): historically bimodal node hits a transient all-bad burst
+// and the operator briefly sees "🚫 No Clock" even though the most recent
+// real advert decoded with a valid 2026 timestamp.
+func TestBimodalHysteresis(t *testing.T) {
+	ps := NewPacketStore(nil, nil)
+	pt := 4
+	baseObs := int64(1700000000)
+	var txs []*StoreTx
+	// 80 historical samples: 50% good (-2s), 50% bad (-58M sec ≈ -1.8yr)
+	for i := 0; i < 80; i++ {
+		obsTS := baseObs + int64(i)*60
+		var skew int64 = -2
+		if i%2 == 0 {
+			skew = -58000000
+		}
+		tx := &StoreTx{
+			Hash:        fmt.Sprintf("hist-%04d", i),
+			PayloadType: &pt,
+			DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS+skew) + `}}`,
+			Observations: []*StoreObs{
+				{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
+			},
+		}
+		txs = append(txs, tx)
+	}
+	// 25 recent samples ALL bad — fills the recent window (size 20) entirely
+	// with bad samples. recent goodFraction = 0.
+	for i := 80; i < 105; i++ {
+		obsTS := baseObs + int64(i)*60
+		tx := &StoreTx{
+			Hash:        fmt.Sprintf("badburst-%04d", i),
+			PayloadType: &pt,
+			DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
+			Observations: []*StoreObs{
+				{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
+			},
+		}
+		txs = append(txs, tx)
+	}
+	ps.mu.Lock()
+	ps.byNode["BIHYST"] = txs
+	for _, tx := range txs {
+		ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
+	}
+	ps.clockSkew.computeInterval = 0
+	ps.mu.Unlock()
+
+	r := ps.GetNodeClockSkew("BIHYST")
+	if r == nil {
+		t.Fatal("nil result")
+	}
+	// Without hysteresis: severity would be no_clock (recent goodFraction=0).
+	// With hysteresis: long-term goodFraction ≈ 0.38 ≥ 0.10, so stays bimodal.
+	if r.Severity != SkewBimodalClock {
+		t.Errorf("severity = %v, want bimodal_clock (long-term has good samples)", r.Severity)
+	}
+	if r.GoodFraction != 0 {
+		t.Errorf("recent goodFraction = %v, want 0 (bad burst)", r.GoodFraction)
+	}
+	if r.LongTermGoodFraction < 0.10 {
+		t.Errorf("longTermGoodFraction = %v, want >= 0.10", r.LongTermGoodFraction)
+	}
+	// Displayed skew should be the long-term good median (-2s), not the
+	// nonsense bad value, so the operator sees a meaningful number.
+	if r.RecentMedianSkewSec < -10 || r.RecentMedianSkewSec > 10 {
+		t.Errorf("recentMedianSkewSec = %v, want near -2 (long-term good median fallback)", r.RecentMedianSkewSec)
+	}
+}
+
+// TestNoClock_BothWindowsBad: the inverse of TestBimodalHysteresis. When
+// BOTH the recent window and the long-term goodFraction are essentially 0,
+// the node is genuinely no_clock (uninitialized RTC throughout).
+func TestNoClock_BothWindowsBad(t *testing.T) {
+	ps := NewPacketStore(nil, nil)
+	pt := 4
+	baseObs := int64(1700000000)
+	var txs []*StoreTx
+	// 50 samples — all bad.
+	for i := 0; i < 50; i++ {
+		obsTS := baseObs + int64(i)*60
+		tx := &StoreTx{
+			Hash:        fmt.Sprintf("dead-%04d", i),
+			PayloadType: &pt,
+			DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
+			Observations: []*StoreObs{
+				{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
+			},
+		}
+		txs = append(txs, tx)
+	}
+	ps.mu.Lock()
+	ps.byNode["DEADCLOCK"] = txs
+	for _, tx := range txs {
+		ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
+	}
+	ps.clockSkew.computeInterval = 0
+	ps.mu.Unlock()
+
+	r := ps.GetNodeClockSkew("DEADCLOCK")
+	if r == nil {
+		t.Fatal("nil result")
+	}
+	if r.Severity != SkewNoClock {
+		t.Errorf("severity = %v, want no_clock", r.Severity)
+	}
+	if r.LongTermGoodFraction != 0 {
+		t.Errorf("longTermGoodFraction = %v, want 0", r.LongTermGoodFraction)
+	}
+}