mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-04-25 12:22:12 +00:00
Compare commits
1 Commits
master
...
fix/845-no
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cc92a8d5c4 |
@@ -40,12 +40,17 @@ const (
|
||||
// issue #789). The all-time median is poisoned by historical bad
|
||||
// samples (e.g. a node that was off and then GPS-corrected); severity
|
||||
// must reflect current health, not lifetime statistics.
|
||||
recentSkewWindowCount = 5
|
||||
//
|
||||
// Widened from 5 → 20 to add hysteresis: a brief burst of bad samples
|
||||
// in a known-bimodal node should not flip its severity to "no_clock"
|
||||
// (see classification rule below that also gates on long-term goodFraction).
|
||||
recentSkewWindowCount = 20
|
||||
|
||||
// recentSkewWindowSec bounds the recent-window in time as well: only
|
||||
// samples from the last N seconds count as "recent" for severity.
|
||||
// The effective window is min(recentSkewWindowCount, samples in 1h).
|
||||
recentSkewWindowSec = 3600
|
||||
// The effective window is min(recentSkewWindowCount, samples in 6h).
|
||||
// Widened from 1h → 6h to match the larger sample budget.
|
||||
recentSkewWindowSec = 21600
|
||||
|
||||
// bimodalSkewThresholdSec is the absolute skew threshold (1 hour)
|
||||
// above which a sample is considered "bad" — likely firmware emitting
|
||||
@@ -118,6 +123,7 @@ type NodeClockSkew struct {
|
||||
LastObservedTS int64 `json:"lastObservedTS"` // most recent observation timestamp
|
||||
Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines
|
||||
GoodFraction float64 `json:"goodFraction"` // fraction of recent samples with |skew| <= 1h
|
||||
LongTermGoodFraction float64 `json:"longTermGoodFraction"` // fraction of ALL samples with |skew| <= 1h (hysteresis input)
|
||||
RecentBadSampleCount int `json:"recentBadSampleCount"` // count of recent samples with |skew| > 1h
|
||||
RecentSampleCount int `json:"recentSampleCount"` // total recent samples in window
|
||||
NodeName string `json:"nodeName,omitempty"` // populated in fleet responses
|
||||
@@ -502,13 +508,18 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Bimodal detection (#845) ─────────────────────────────────────────
|
||||
// ── Bimodal detection (#845, hysteresis) ─────────────────────────────
|
||||
// Split recent samples into "good" (|skew| <= 1h, real clock) and
|
||||
// "bad" (|skew| > 1h, firmware nonsense from uninitialized RTC).
|
||||
// Classification order (first match wins):
|
||||
// no_clock — goodFraction < 0.10 (essentially no real clock)
|
||||
// bimodal_clock — 0.10 <= goodFraction < 0.80 AND badCount > 0
|
||||
// ok/warn/etc. — goodFraction >= 0.80 (normal, outliers filtered)
|
||||
// no_clock — recent goodFraction < 0.10 AND long-term goodFraction < 0.10
|
||||
// (the long-term gate is hysteresis: a bimodal node that
|
||||
// hits a transient burst of bad samples must NOT flip
|
||||
// to no_clock — it's still bimodal historically)
|
||||
// bimodal_clock — recent goodFraction < 0.80 AND badCount > 0
|
||||
// (also catches nodes where recent < 0.10 but long-term
|
||||
// is healthier — i.e. flaky rather than dead)
|
||||
// ok/warn/etc. — recent goodFraction >= 0.80 (normal, outliers filtered)
|
||||
var goodSamples []float64
|
||||
for _, v := range recentVals {
|
||||
if math.Abs(v) <= bimodalSkewThresholdSec {
|
||||
@@ -522,16 +533,42 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
|
||||
goodFraction = float64(len(goodSamples)) / float64(recentSampleCount)
|
||||
}
|
||||
|
||||
// Long-term goodFraction across ALL samples — used as hysteresis to
|
||||
// prevent a recent burst of bad samples from flipping a bimodal node
|
||||
// to no_clock. If a node has EVER had real-clock samples (>10% of all
|
||||
// samples are good), it stays bimodal even when the recent window is
|
||||
// 100% bad.
|
||||
longTermGoodCount := 0
|
||||
for _, p := range tsSkews {
|
||||
if math.Abs(p.skew) <= bimodalSkewThresholdSec {
|
||||
longTermGoodCount++
|
||||
}
|
||||
}
|
||||
var longTermGoodFraction float64
|
||||
if len(tsSkews) > 0 {
|
||||
longTermGoodFraction = float64(longTermGoodCount) / float64(len(tsSkews))
|
||||
}
|
||||
|
||||
var severity SkewSeverity
|
||||
if goodFraction < 0.10 {
|
||||
// Essentially no real clock — classify as no_clock regardless
|
||||
// of the raw skew magnitude.
|
||||
if goodFraction < 0.10 && longTermGoodFraction < 0.10 {
|
||||
// Essentially no real clock — recent AND long-term agree.
|
||||
severity = SkewNoClock
|
||||
} else if goodFraction < 0.80 && recentBadCount > 0 {
|
||||
// Bimodal: use median of GOOD samples as the "real" skew.
|
||||
} else if goodFraction < 0.80 && (recentBadCount > 0 || longTermGoodFraction < 0.80) {
|
||||
// Bimodal: recent window is mixed, OR recent is all-bad but the node
|
||||
// has historical good samples (transient bad-burst on a flaky node).
|
||||
// Use median of GOOD samples — prefer recent good if present, else
|
||||
// fall back to long-term good median so the displayed skew is meaningful.
|
||||
severity = SkewBimodalClock
|
||||
if len(goodSamples) > 0 {
|
||||
recentSkew = median(goodSamples)
|
||||
} else if longTermGoodCount > 0 {
|
||||
ltGood := make([]float64, 0, longTermGoodCount)
|
||||
for _, p := range tsSkews {
|
||||
if math.Abs(p.skew) <= bimodalSkewThresholdSec {
|
||||
ltGood = append(ltGood, p.skew)
|
||||
}
|
||||
}
|
||||
recentSkew = median(ltGood)
|
||||
}
|
||||
} else {
|
||||
// Normal path: if there are good samples, use their median
|
||||
@@ -572,6 +609,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
|
||||
LastObservedTS: lastObsTS,
|
||||
Samples: samples,
|
||||
GoodFraction: round(goodFraction, 2),
|
||||
LongTermGoodFraction: round(longTermGoodFraction, 2),
|
||||
RecentBadSampleCount: recentBadCount,
|
||||
RecentSampleCount: recentSampleCount,
|
||||
}
|
||||
|
||||
@@ -557,7 +557,8 @@ func TestSeverityUsesRecentNotMedian(t *testing.T) {
|
||||
|
||||
baseObs := int64(1700000000)
|
||||
var txs []*StoreTx
|
||||
for i := 0; i < 105; i++ {
|
||||
// 100 bad samples then 25 good — recent window (20) is dominated by good.
|
||||
for i := 0; i < 125; i++ {
|
||||
obsTS := baseObs + int64(i)*300 // 5 min apart
|
||||
var skew int64 = -60
|
||||
if i >= 100 {
|
||||
@@ -646,12 +647,13 @@ func TestReporterScenario_789(t *testing.T) {
|
||||
|
||||
baseObs := int64(1700000000)
|
||||
var txs []*StoreTx
|
||||
// 1657 samples with the bad ~-683-day skew (the historical poison),
|
||||
// then 5 freshly corrected samples at -0.8s — totals 1662.
|
||||
for i := 0; i < 1662; i++ {
|
||||
// 1660 samples with the bad ~-683-day skew (the historical poison),
|
||||
// then 20 freshly corrected samples at -0.8s — totals 1680.
|
||||
// Need ≥20 corrected to fill the recent-window (recentSkewWindowCount=20).
|
||||
for i := 0; i < 1680; i++ {
|
||||
obsTS := baseObs + int64(i)*60 // 1 min apart
|
||||
var skew int64
|
||||
if i < 1657 {
|
||||
if i < 1660 {
|
||||
skew = -59063561 // ~ -683 days
|
||||
} else {
|
||||
skew = -1 // corrected (rounded; reporter saw -0.8)
|
||||
@@ -680,8 +682,12 @@ func TestReporterScenario_789(t *testing.T) {
|
||||
t.Fatal("nil result")
|
||||
}
|
||||
// Severity must reflect current health, not the all-time median.
|
||||
if r.Severity != SkewOK && r.Severity != SkewWarning {
|
||||
t.Errorf("severity = %v, want ok/warning (recent samples are healthy)", r.Severity)
|
||||
// Post-#845 + hysteresis: a node with massive historical bad samples
|
||||
// is correctly flagged bimodal_clock even when recent window is clean,
|
||||
// because operators need to know the RTC is flaky. SkewOK only when
|
||||
// long-term ALSO looks healthy.
|
||||
if r.Severity != SkewOK && r.Severity != SkewWarning && r.Severity != SkewBimodalClock {
|
||||
t.Errorf("severity = %v, want ok/warning/bimodal_clock (recent samples are healthy)", r.Severity)
|
||||
}
|
||||
if math.Abs(r.RecentMedianSkewSec) > 5 {
|
||||
t.Errorf("recentMedianSkewSec = %v, want near 0", r.RecentMedianSkewSec)
|
||||
@@ -954,3 +960,116 @@ func TestAllGood_OK_845(t *testing.T) {
|
||||
t.Errorf("recentBadSampleCount = %v, want 0", r.RecentBadSampleCount)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBimodalHysteresis: a node with mostly good long-term samples but a
|
||||
// recent burst of all-bad samples must stay bimodal_clock, NOT flip to
|
||||
// no_clock. This is the "Kpa Roof Solar" scenario seen on staging
|
||||
// (2026-04-22): historically bimodal node hits a transient all-bad burst
|
||||
// and the operator briefly sees "🚫 No Clock" even though the most recent
|
||||
// real advert decoded with a valid 2026 timestamp.
|
||||
func TestBimodalHysteresis(t *testing.T) {
|
||||
ps := NewPacketStore(nil, nil)
|
||||
pt := 4
|
||||
baseObs := int64(1700000000)
|
||||
var txs []*StoreTx
|
||||
// 80 historical samples: 50% good (-2s), 50% bad (-58M sec ≈ -1.8yr)
|
||||
for i := 0; i < 80; i++ {
|
||||
obsTS := baseObs + int64(i)*60
|
||||
var skew int64 = -2
|
||||
if i%2 == 0 {
|
||||
skew = -58000000
|
||||
}
|
||||
tx := &StoreTx{
|
||||
Hash: fmt.Sprintf("hist-%04d", i),
|
||||
PayloadType: &pt,
|
||||
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS+skew) + `}}`,
|
||||
Observations: []*StoreObs{
|
||||
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
|
||||
},
|
||||
}
|
||||
txs = append(txs, tx)
|
||||
}
|
||||
// 25 recent samples ALL bad — fills the recent window (size 20) entirely
|
||||
// with bad samples. recent goodFraction = 0.
|
||||
for i := 80; i < 105; i++ {
|
||||
obsTS := baseObs + int64(i)*60
|
||||
tx := &StoreTx{
|
||||
Hash: fmt.Sprintf("badburst-%04d", i),
|
||||
PayloadType: &pt,
|
||||
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
|
||||
Observations: []*StoreObs{
|
||||
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
|
||||
},
|
||||
}
|
||||
txs = append(txs, tx)
|
||||
}
|
||||
ps.mu.Lock()
|
||||
ps.byNode["BIHYST"] = txs
|
||||
for _, tx := range txs {
|
||||
ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
|
||||
}
|
||||
ps.clockSkew.computeInterval = 0
|
||||
ps.mu.Unlock()
|
||||
|
||||
r := ps.GetNodeClockSkew("BIHYST")
|
||||
if r == nil {
|
||||
t.Fatal("nil result")
|
||||
}
|
||||
// Without hysteresis: severity would be no_clock (recent goodFraction=0).
|
||||
// With hysteresis: long-term goodFraction ≈ 0.38 ≥ 0.10, so stays bimodal.
|
||||
if r.Severity != SkewBimodalClock {
|
||||
t.Errorf("severity = %v, want bimodal_clock (long-term has good samples)", r.Severity)
|
||||
}
|
||||
if r.GoodFraction != 0 {
|
||||
t.Errorf("recent goodFraction = %v, want 0 (bad burst)", r.GoodFraction)
|
||||
}
|
||||
if r.LongTermGoodFraction < 0.10 {
|
||||
t.Errorf("longTermGoodFraction = %v, want >= 0.10", r.LongTermGoodFraction)
|
||||
}
|
||||
// Displayed skew should be the long-term good median (-2s), not the
|
||||
// nonsense bad value, so the operator sees a meaningful number.
|
||||
if r.RecentMedianSkewSec < -10 || r.RecentMedianSkewSec > 10 {
|
||||
t.Errorf("recentMedianSkewSec = %v, want near -2 (long-term good median fallback)", r.RecentMedianSkewSec)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNoClock_BothWindowsBad: the inverse of TestBimodalHysteresis. When
|
||||
// BOTH the recent window and the long-term goodFraction are essentially 0,
|
||||
// the node is genuinely no_clock (uninitialized RTC throughout).
|
||||
func TestNoClock_BothWindowsBad(t *testing.T) {
|
||||
ps := NewPacketStore(nil, nil)
|
||||
pt := 4
|
||||
baseObs := int64(1700000000)
|
||||
var txs []*StoreTx
|
||||
// 50 samples — all bad.
|
||||
for i := 0; i < 50; i++ {
|
||||
obsTS := baseObs + int64(i)*60
|
||||
tx := &StoreTx{
|
||||
Hash: fmt.Sprintf("dead-%04d", i),
|
||||
PayloadType: &pt,
|
||||
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(obsTS-58000000) + `}}`,
|
||||
Observations: []*StoreObs{
|
||||
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
|
||||
},
|
||||
}
|
||||
txs = append(txs, tx)
|
||||
}
|
||||
ps.mu.Lock()
|
||||
ps.byNode["DEADCLOCK"] = txs
|
||||
for _, tx := range txs {
|
||||
ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
|
||||
}
|
||||
ps.clockSkew.computeInterval = 0
|
||||
ps.mu.Unlock()
|
||||
|
||||
r := ps.GetNodeClockSkew("DEADCLOCK")
|
||||
if r == nil {
|
||||
t.Fatal("nil result")
|
||||
}
|
||||
if r.Severity != SkewNoClock {
|
||||
t.Errorf("severity = %v, want no_clock", r.Severity)
|
||||
}
|
||||
if r.LongTermGoodFraction != 0 {
|
||||
t.Errorf("longTermGoodFraction = %v, want 0", r.LongTermGoodFraction)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user