Compare commits

...

1 Commits

Author SHA1 Message Date
you
cc92a8d5c4 fix: bimodal clock hysteresis — don't flip to 'No Clock' on transient bad bursts
Symptom: Kpa Roof Solar (and other historically-bimodal nodes) briefly
flip to '🚫 No Clock' when the last 5 adverts in the recent-1h window
all happen to have nonsense timestamps, even though the very next advert
decoded with a valid 2026 timestamp.

Root cause: the bimodal classifier from #845 looks at the last 5 samples
within the past hour. When a bimodal node hits a transient burst of bad
RTC samples, recent goodFraction = 0 and severity flips to no_clock
regardless of the long-term picture (16k+ samples, ~38% historically good).

Fix (option C from triage):
1. Widen the recent window: 5 → 20 samples, 1h → 6h time bound
   (more data, less jitter, still recent).
2. Add hysteresis: only drop to no_clock when BOTH recent goodFraction
   AND long-term goodFraction are < 10%. A node with historical good
   samples stays bimodal even when the recent window is 100% bad.
3. When recent has zero good samples, fall back to long-term good median
   for the displayed skew so the operator sees a meaningful number
   instead of stale poison.

API: add longTermGoodFraction to /api/nodes/{pk}/clock-skew so operators
can see the hysteresis input directly.

Tests:
- TestBimodalHysteresis: recent all-bad + long-term mixed → bimodal_clock
- TestNoClock_BothWindowsBad: recent all-bad + long-term all-bad → no_clock
- Updated TestSeverityUsesRecentNotMedian + TestReporterScenario_789 to
  match the new wider window and accept bimodal_clock for nodes with
  massive historical poison (#845's whole premise: bimodal deserves a flag,
  not OK status).
2026-04-22 15:29:40 +00:00
2 changed files with 176 additions and 19 deletions

View File

@@ -40,12 +40,17 @@ const (
// issue #789). The all-time median is poisoned by historical bad
// samples (e.g. a node that was off and then GPS-corrected); severity
// must reflect current health, not lifetime statistics.
recentSkewWindowCount = 5
//
// Widened from 5 → 20 to add hysteresis: a brief burst of bad samples
// in a known-bimodal node should not flip its severity to "no_clock"
// (see classification rule below that also gates on long-term goodFraction).
recentSkewWindowCount = 20
// recentSkewWindowSec bounds the recent-window in time as well: only
// samples from the last N seconds count as "recent" for severity.
// The effective window is min(recentSkewWindowCount, samples in 1h).
recentSkewWindowSec = 3600
// The effective window is min(recentSkewWindowCount, samples in 6h).
// Widened from 1h → 6h to match the larger sample budget.
recentSkewWindowSec = 21600
// bimodalSkewThresholdSec is the absolute skew threshold (1 hour)
// above which a sample is considered "bad" — likely firmware emitting
@@ -118,6 +123,7 @@ type NodeClockSkew struct {
LastObservedTS int64 `json:"lastObservedTS"` // most recent observation timestamp
Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines
GoodFraction float64 `json:"goodFraction"` // fraction of recent samples with |skew| <= 1h
LongTermGoodFraction float64 `json:"longTermGoodFraction"` // fraction of ALL samples with |skew| <= 1h (hysteresis input)
RecentBadSampleCount int `json:"recentBadSampleCount"` // count of recent samples with |skew| > 1h
RecentSampleCount int `json:"recentSampleCount"` // total recent samples in window
NodeName string `json:"nodeName,omitempty"` // populated in fleet responses
@@ -502,13 +508,18 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
}
}
// ── Bimodal detection (#845) ─────────────────────────────────────────
// ── Bimodal detection (#845, hysteresis) ─────────────────────────────
// Split recent samples into "good" (|skew| <= 1h, real clock) and
// "bad" (|skew| > 1h, firmware nonsense from uninitialized RTC).
// Classification order (first match wins):
// no_clock — goodFraction < 0.10 (essentially no real clock)
// bimodal_clock — 0.10 <= goodFraction < 0.80 AND badCount > 0
// ok/warn/etc. — goodFraction >= 0.80 (normal, outliers filtered)
// no_clock — recent goodFraction < 0.10 AND long-term goodFraction < 0.10
// (the long-term gate is hysteresis: a bimodal node that
// hits a transient burst of bad samples must NOT flip
// to no_clock — it's still bimodal historically)
// bimodal_clock — recent goodFraction < 0.80 AND badCount > 0
// (also catches nodes where recent < 0.10 but long-term
// is healthier — i.e. flaky rather than dead)
// ok/warn/etc. — recent goodFraction >= 0.80 (normal, outliers filtered)
var goodSamples []float64
for _, v := range recentVals {
if math.Abs(v) <= bimodalSkewThresholdSec {
@@ -522,16 +533,42 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
goodFraction = float64(len(goodSamples)) / float64(recentSampleCount)
}
// Long-term goodFraction across ALL samples — used as hysteresis to
// prevent a recent burst of bad samples from flipping a bimodal node
// to no_clock. If a node has EVER had real-clock samples (>10% of all
// samples are good), it stays bimodal even when the recent window is
// 100% bad.
longTermGoodCount := 0
for _, p := range tsSkews {
if math.Abs(p.skew) <= bimodalSkewThresholdSec {
longTermGoodCount++
}
}
var longTermGoodFraction float64
if len(tsSkews) > 0 {
longTermGoodFraction = float64(longTermGoodCount) / float64(len(tsSkews))
}
var severity SkewSeverity
if goodFraction < 0.10 {
// Essentially no real clock — classify as no_clock regardless
// of the raw skew magnitude.
if goodFraction < 0.10 && longTermGoodFraction < 0.10 {
// Essentially no real clock — recent AND long-term agree.
severity = SkewNoClock
} else if goodFraction < 0.80 && recentBadCount > 0 {
// Bimodal: use median of GOOD samples as the "real" skew.
} else if goodFraction < 0.80 && (recentBadCount > 0 || longTermGoodFraction < 0.80) {
// Bimodal: recent window is mixed, OR recent is all-bad but the node
// has historical good samples (transient bad-burst on a flaky node).
// Use median of GOOD samples — prefer recent good if present, else
// fall back to long-term good median so the displayed skew is meaningful.
severity = SkewBimodalClock
if len(goodSamples) > 0 {
recentSkew = median(goodSamples)
} else if longTermGoodCount > 0 {
ltGood := make([]float64, 0, longTermGoodCount)
for _, p := range tsSkews {
if math.Abs(p.skew) <= bimodalSkewThresholdSec {
ltGood = append(ltGood, p.skew)
}
}
recentSkew = median(ltGood)
}
} else {
// Normal path: if there are good samples, use their median
@@ -572,6 +609,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
LastObservedTS: lastObsTS,
Samples: samples,
GoodFraction: round(goodFraction, 2),
LongTermGoodFraction: round(longTermGoodFraction, 2),
RecentBadSampleCount: recentBadCount,
RecentSampleCount: recentSampleCount,
}

View File

@@ -557,7 +557,8 @@ func TestSeverityUsesRecentNotMedian(t *testing.T) {
baseObs := int64(1700000000)
var txs []*StoreTx
for i := 0; i < 105; i++ {
// 100 bad samples then 25 good — recent window (20) is dominated by good.
for i := 0; i < 125; i++ {
obsTS := baseObs + int64(i)*300 // 5 min apart
var skew int64 = -60
if i >= 100 {
@@ -646,12 +647,13 @@ func TestReporterScenario_789(t *testing.T) {
baseObs := int64(1700000000)
var txs []*StoreTx
// 1657 samples with the bad ~-683-day skew (the historical poison),
// then 5 freshly corrected samples at -0.8s — totals 1662.
for i := 0; i < 1662; i++ {
// 1660 samples with the bad ~-683-day skew (the historical poison),
// then 20 freshly corrected samples at -0.8s — totals 1680.
// Need ≥20 corrected to fill the recent-window (recentSkewWindowCount=20).
for i := 0; i < 1680; i++ {
obsTS := baseObs + int64(i)*60 // 1 min apart
var skew int64
if i < 1657 {
if i < 1660 {
skew = -59063561 // ~ -683 days
} else {
skew = -1 // corrected (rounded; reporter saw -0.8)
@@ -680,8 +682,12 @@ func TestReporterScenario_789(t *testing.T) {
t.Fatal("nil result")
}
// Severity must reflect current health, not the all-time median.
if r.Severity != SkewOK && r.Severity != SkewWarning {
t.Errorf("severity = %v, want ok/warning (recent samples are healthy)", r.Severity)
// Post-#845 + hysteresis: a node with massive historical bad samples
// is correctly flagged bimodal_clock even when recent window is clean,
// because operators need to know the RTC is flaky. SkewOK only when
// long-term ALSO looks healthy.
if r.Severity != SkewOK && r.Severity != SkewWarning && r.Severity != SkewBimodalClock {
t.Errorf("severity = %v, want ok/warning/bimodal_clock (recent samples are healthy)", r.Severity)
}
if math.Abs(r.RecentMedianSkewSec) > 5 {
t.Errorf("recentMedianSkewSec = %v, want near 0", r.RecentMedianSkewSec)
@@ -954,3 +960,116 @@ func TestAllGood_OK_845(t *testing.T) {
t.Errorf("recentBadSampleCount = %v, want 0", r.RecentBadSampleCount)
}
}
// TestBimodalHysteresis: a node with mostly good long-term samples but a
// recent burst of all-bad samples must stay bimodal_clock, NOT flip to
// no_clock. This is the "Kpa Roof Solar" scenario seen on staging
// (2026-04-22): historically bimodal node hits a transient all-bad burst
// and the operator briefly sees "🚫 No Clock" even though the most recent
// real advert decoded with a valid 2026 timestamp.
func TestBimodalHysteresis(t *testing.T) {
ps := NewPacketStore(nil, nil)
pt := 4
baseObs := int64(1700000000)
var txs []*StoreTx
// 80 historical samples: 50% good (-2s), 50% bad (-58M sec ≈ -1.8yr)
for i := 0; i < 80; i++ {
obsTS := baseObs + int64(i)*60
var skew int64 = -2
if i%2 == 0 {
skew = -58000000
}
tx := &StoreTx{
Hash: fmt.Sprintf("hist-%04d", i),
PayloadType: &pt,
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS+skew) + `}}`,
Observations: []*StoreObs{
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
},
}
txs = append(txs, tx)
}
// 25 recent samples ALL bad — fills the recent window (size 20) entirely
// with bad samples. recent goodFraction = 0.
for i := 80; i < 105; i++ {
obsTS := baseObs + int64(i)*60
tx := &StoreTx{
Hash: fmt.Sprintf("badburst-%04d", i),
PayloadType: &pt,
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
Observations: []*StoreObs{
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
},
}
txs = append(txs, tx)
}
ps.mu.Lock()
ps.byNode["BIHYST"] = txs
for _, tx := range txs {
ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
}
ps.clockSkew.computeInterval = 0
ps.mu.Unlock()
r := ps.GetNodeClockSkew("BIHYST")
if r == nil {
t.Fatal("nil result")
}
// Without hysteresis: severity would be no_clock (recent goodFraction=0).
// With hysteresis: long-term goodFraction ≈ 0.38 ≥ 0.10, so stays bimodal.
if r.Severity != SkewBimodalClock {
t.Errorf("severity = %v, want bimodal_clock (long-term has good samples)", r.Severity)
}
if r.GoodFraction != 0 {
t.Errorf("recent goodFraction = %v, want 0 (bad burst)", r.GoodFraction)
}
if r.LongTermGoodFraction < 0.10 {
t.Errorf("longTermGoodFraction = %v, want >= 0.10", r.LongTermGoodFraction)
}
// Displayed skew should be the long-term good median (-2s), not the
// nonsense bad value, so the operator sees a meaningful number.
if r.RecentMedianSkewSec < -10 || r.RecentMedianSkewSec > 10 {
t.Errorf("recentMedianSkewSec = %v, want near -2 (long-term good median fallback)", r.RecentMedianSkewSec)
}
}
// TestNoClock_BothWindowsBad: the inverse of TestBimodalHysteresis. When
// BOTH the recent window and the long-term goodFraction are essentially 0,
// the node is genuinely no_clock (uninitialized RTC throughout).
func TestNoClock_BothWindowsBad(t *testing.T) {
ps := NewPacketStore(nil, nil)
pt := 4
baseObs := int64(1700000000)
var txs []*StoreTx
// 50 samples — all bad.
for i := 0; i < 50; i++ {
obsTS := baseObs + int64(i)*60
tx := &StoreTx{
Hash: fmt.Sprintf("dead-%04d", i),
PayloadType: &pt,
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
Observations: []*StoreObs{
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
},
}
txs = append(txs, tx)
}
ps.mu.Lock()
ps.byNode["DEADCLOCK"] = txs
for _, tx := range txs {
ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
}
ps.clockSkew.computeInterval = 0
ps.mu.Unlock()
r := ps.GetNodeClockSkew("DEADCLOCK")
if r == nil {
t.Fatal("nil result")
}
if r.Severity != SkewNoClock {
t.Errorf("severity = %v, want no_clock", r.Severity)
}
if r.LongTermGoodFraction != 0 {
t.Errorf("longTermGoodFraction = %v, want 0", r.LongTermGoodFraction)
}
}