mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-04-25 19:22:13 +00:00
Compare commits
12 Commits
master
...
feat/clock
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
86ca793b60 | ||
|
|
4291b387f5 | ||
|
|
3cd7186563 | ||
|
|
86a4403136 | ||
|
|
c46a60f78a | ||
|
|
d4b1aa40d0 | ||
|
|
d617a55155 | ||
|
|
2106cc0b8b | ||
|
|
0acbac6fde | ||
|
|
2c675f5ab2 | ||
|
|
545df2788d | ||
|
|
f872fd90bf |
@@ -12,20 +12,28 @@ import (
|
||||
type SkewSeverity string
|
||||
|
||||
const (
|
||||
SkewOK SkewSeverity = "ok" // < 5 min
|
||||
SkewWarning SkewSeverity = "warning" // 5 min – 1 hour
|
||||
SkewCritical SkewSeverity = "critical" // 1 hour – 30 days
|
||||
SkewAbsurd SkewSeverity = "absurd" // > 30 days
|
||||
SkewNoClock SkewSeverity = "no_clock" // > 365 days — uninitialized RTC
|
||||
SkewBimodalClock SkewSeverity = "bimodal_clock" // mixed good+bad recent samples (flaky RTC)
|
||||
SkewDefault SkewSeverity = "default" // firmware-default epoch + uptime
|
||||
SkewOK SkewSeverity = "ok" // |skew| <= 15s
|
||||
SkewDegrading SkewSeverity = "degrading" // 15s < |skew| <= 60s
|
||||
SkewDegraded SkewSeverity = "degraded" // 60s < |skew| <= 600s
|
||||
SkewWrong SkewSeverity = "wrong" // |skew| > 600s and not default
|
||||
)
|
||||
|
||||
// Known firmware default epochs. Nodes with advert_ts in
|
||||
// [epoch, epoch + maxPlausibleUptimeSec] are classified as "default".
|
||||
// See docs/clock-skew-redesign.md for provenance of each value.
|
||||
var defaultEpochs = []int64{0, 1609459200, 1672531200, 1715770351}
|
||||
|
||||
// Default thresholds in seconds.
|
||||
const (
|
||||
skewThresholdWarnSec = 5 * 60 // 5 minutes
|
||||
skewThresholdCriticalSec = 60 * 60 // 1 hour
|
||||
skewThresholdAbsurdSec = 30 * 24 * 3600 // 30 days
|
||||
skewThresholdNoClockSec = 365 * 24 * 3600 // 365 days — uninitialized RTC
|
||||
// maxPlausibleUptimeSec caps how far past a default epoch we still
|
||||
// consider "default + uptime ticking". 730 days ≈ 2 years.
|
||||
maxPlausibleUptimeSec = 1095 * 86400 // 3 years — covers solar repeater deployment lifetimes at firmware default
|
||||
|
||||
// Severity band boundaries (absolute skew in seconds).
|
||||
skewThresholdOKSec = 15
|
||||
skewThresholdDegradingSec = 60
|
||||
skewThresholdDegradedSec = 600
|
||||
|
||||
// minDriftSamples is the minimum number of advert transmissions needed
|
||||
// to compute a meaningful linear drift rate.
|
||||
@@ -35,54 +43,52 @@ const (
|
||||
// drift rates (> 1 day/day) indicate insufficient or outlier samples.
|
||||
maxReasonableDriftPerDay = 86400.0
|
||||
|
||||
// recentSkewWindowCount is the number of most-recent advert samples
|
||||
// used to derive the "current" skew for severity classification (see
|
||||
// issue #789). The all-time median is poisoned by historical bad
|
||||
// samples (e.g. a node that was off and then GPS-corrected); severity
|
||||
// must reflect current health, not lifetime statistics.
|
||||
recentSkewWindowCount = 5
|
||||
|
||||
// recentSkewWindowSec bounds the recent-window in time as well: only
|
||||
// samples from the last N seconds count as "recent" for severity.
|
||||
// The effective window is min(recentSkewWindowCount, samples in 1h).
|
||||
recentSkewWindowSec = 3600
|
||||
|
||||
// bimodalSkewThresholdSec is the absolute skew threshold (1 hour)
|
||||
// above which a sample is considered "bad" — likely firmware emitting
|
||||
// a nonsense timestamp from an uninitialized RTC, not real drift.
|
||||
// Chosen to match the warning/critical severity boundary: real clock
|
||||
// drift rarely exceeds 1 hour, while epoch-0 RTCs produce ~1.7B sec.
|
||||
bimodalSkewThresholdSec = 3600.0
|
||||
|
||||
// maxPlausibleSkewJumpSec is the largest skew change between
|
||||
// consecutive samples that we treat as physical drift. Anything larger
|
||||
// (e.g. a GPS sync that jumps the clock by minutes/days) is rejected
|
||||
// as an outlier when computing drift. Real microcontroller drift is
|
||||
// fractions of a second per advert; 60s is a generous safety factor.
|
||||
// consecutive samples that we treat as physical drift.
|
||||
maxPlausibleSkewJumpSec = 60.0
|
||||
|
||||
// theilSenMaxPoints caps the number of points fed to Theil-Sen
|
||||
// regression (O(n²) in pairs). For nodes with thousands of samples we
|
||||
// keep the most-recent points, which are also the most relevant for
|
||||
// current drift.
|
||||
// regression (O(n²) in pairs).
|
||||
theilSenMaxPoints = 200
|
||||
)
|
||||
|
||||
// classifySkew maps absolute skew (seconds) to a severity level.
|
||||
// Float64 comparison is safe: inputs are rounded to 1 decimal via round(),
|
||||
// and thresholds are integer multiples of 60 — no rounding artifacts.
|
||||
func classifySkew(absSkewSec float64) SkewSeverity {
|
||||
// isDefaultEpoch returns true if the raw advert timestamp falls within
|
||||
// [epoch, epoch + maxPlausibleUptimeSec] for any known firmware default.
|
||||
// If matched, returns the matched epoch; otherwise returns 0.
|
||||
func isDefaultEpoch(advertTS int64) (bool, int64) {
|
||||
// Find the largest epoch <= advertTS (closest match). Since ranges
|
||||
// overlap, picking the closest avoids attributing a 2023-firmware
|
||||
// node's timestamp to the 2024 epoch.
|
||||
bestEpoch := int64(-1)
|
||||
for _, epoch := range defaultEpochs {
|
||||
if epoch <= advertTS && epoch > bestEpoch {
|
||||
bestEpoch = epoch
|
||||
}
|
||||
}
|
||||
if bestEpoch >= 0 && advertTS <= bestEpoch+maxPlausibleUptimeSec {
|
||||
return true, bestEpoch
|
||||
}
|
||||
return false, 0
|
||||
}
|
||||
|
||||
// classifySkew maps a raw advert timestamp and corrected skew (signed)
|
||||
// to a severity level. Takes math.Abs internally so callers may pass
|
||||
// signed values. Default detection runs on the raw advert_ts
|
||||
// (independent of observer calibration).
|
||||
func classifySkew(advertTS int64, skewSec float64) (SkewSeverity, int64) {
|
||||
if ok, epoch := isDefaultEpoch(advertTS); ok {
|
||||
return SkewDefault, epoch
|
||||
}
|
||||
abs := math.Abs(skewSec)
|
||||
switch {
|
||||
case absSkewSec >= skewThresholdNoClockSec:
|
||||
return SkewNoClock
|
||||
case absSkewSec >= skewThresholdAbsurdSec:
|
||||
return SkewAbsurd
|
||||
case absSkewSec >= skewThresholdCriticalSec:
|
||||
return SkewCritical
|
||||
case absSkewSec >= skewThresholdWarnSec:
|
||||
return SkewWarning
|
||||
case abs <= skewThresholdOKSec:
|
||||
return SkewOK, 0
|
||||
case abs <= skewThresholdDegradingSec:
|
||||
return SkewDegrading, 0
|
||||
case abs <= skewThresholdDegradedSec:
|
||||
return SkewDegraded, 0
|
||||
default:
|
||||
return SkewOK
|
||||
return SkewWrong, 0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,38 +96,35 @@ func classifySkew(absSkewSec float64) SkewSeverity {
|
||||
|
||||
// skewSample is a single raw skew measurement from one advert observation.
|
||||
type skewSample struct {
|
||||
advertTS int64 // node's advert Unix timestamp
|
||||
observedTS int64 // observation Unix timestamp
|
||||
observerID string // which observer saw this
|
||||
hash string // transmission hash (for multi-observer grouping)
|
||||
advertTS int64 // node's advert Unix timestamp
|
||||
observedTS int64 // observation Unix timestamp
|
||||
observerID string // which observer saw this
|
||||
hash string // transmission hash (for multi-observer grouping)
|
||||
}
|
||||
|
||||
// ObserverCalibration holds the computed clock offset for an observer.
|
||||
type ObserverCalibration struct {
|
||||
ObserverID string `json:"observerID"`
|
||||
OffsetSec float64 `json:"offsetSec"` // positive = observer clock ahead
|
||||
Samples int `json:"samples"` // number of multi-observer packets used
|
||||
OffsetSec float64 `json:"offsetSec"` // positive = observer clock ahead
|
||||
Samples int `json:"samples"` // number of multi-observer packets used
|
||||
}
|
||||
|
||||
// NodeClockSkew is the API response for a single node's clock skew data.
|
||||
type NodeClockSkew struct {
|
||||
Pubkey string `json:"pubkey"`
|
||||
MeanSkewSec float64 `json:"meanSkewSec"` // corrected mean skew (positive = node ahead)
|
||||
MedianSkewSec float64 `json:"medianSkewSec"` // corrected median skew
|
||||
LastSkewSec float64 `json:"lastSkewSec"` // most recent corrected skew
|
||||
RecentMedianSkewSec float64 `json:"recentMedianSkewSec"` // median across most-recent samples (drives severity, see #789)
|
||||
DriftPerDaySec float64 `json:"driftPerDaySec"` // linear drift rate (sec/day)
|
||||
Severity SkewSeverity `json:"severity"`
|
||||
SampleCount int `json:"sampleCount"`
|
||||
Calibrated bool `json:"calibrated"` // true if observer calibration was applied
|
||||
LastAdvertTS int64 `json:"lastAdvertTS"` // most recent advert timestamp
|
||||
LastObservedTS int64 `json:"lastObservedTS"` // most recent observation timestamp
|
||||
Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines
|
||||
GoodFraction float64 `json:"goodFraction"` // fraction of recent samples with |skew| <= 1h
|
||||
RecentBadSampleCount int `json:"recentBadSampleCount"` // count of recent samples with |skew| > 1h
|
||||
RecentSampleCount int `json:"recentSampleCount"` // total recent samples in window
|
||||
NodeName string `json:"nodeName,omitempty"` // populated in fleet responses
|
||||
NodeRole string `json:"nodeRole,omitempty"` // populated in fleet responses
|
||||
Pubkey string `json:"pubkey"`
|
||||
MeanSkewSec float64 `json:"meanSkewSec"` // corrected mean skew (positive = node ahead)
|
||||
MedianSkewSec float64 `json:"medianSkewSec"` // corrected median skew
|
||||
LastSkewSec float64 `json:"lastSkewSec"` // most recent corrected skew
|
||||
DriftPerDaySec float64 `json:"driftPerDaySec"` // linear drift rate (sec/day)
|
||||
Severity SkewSeverity `json:"severity"`
|
||||
SampleCount int `json:"sampleCount"`
|
||||
Calibrated bool `json:"calibrated"` // true if observer calibration was applied
|
||||
LastAdvertTS int64 `json:"lastAdvertTS"` // most recent advert timestamp
|
||||
LastObservedTS int64 `json:"lastObservedTS"` // most recent observation timestamp
|
||||
DefaultEpoch *int64 `json:"defaultEpoch,omitempty"` // matched epoch when severity=default
|
||||
Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines
|
||||
NodeName string `json:"nodeName,omitempty"` // populated in fleet responses
|
||||
NodeRole string `json:"nodeRole,omitempty"` // populated in fleet responses
|
||||
}
|
||||
|
||||
// SkewSample is a single (timestamp, skew) point for sparkline rendering.
|
||||
@@ -130,28 +133,26 @@ type SkewSample struct {
|
||||
SkewSec float64 `json:"skew"` // corrected skew in seconds
|
||||
}
|
||||
|
||||
// txSkewResult maps tx hash → per-transmission skew stats. This is an
|
||||
// intermediate result keyed by hash (not pubkey); the store maps hash → pubkey
|
||||
// when building the final per-node view.
|
||||
// txSkewResult maps tx hash → per-transmission skew stats.
|
||||
type txSkewResult = map[string]*NodeClockSkew
|
||||
|
||||
// ── Clock Skew Engine ──────────────────────────────────────────────────────────
|
||||
|
||||
// ClockSkewEngine computes and caches clock skew data for nodes and observers.
|
||||
type ClockSkewEngine struct {
|
||||
mu sync.RWMutex
|
||||
observerOffsets map[string]float64 // observerID → calibrated offset (seconds)
|
||||
observerSamples map[string]int // observerID → number of multi-observer packets used
|
||||
nodeSkew txSkewResult
|
||||
lastComputed time.Time
|
||||
computeInterval time.Duration
|
||||
mu sync.RWMutex
|
||||
observerOffsets map[string]float64 // observerID → calibrated offset (seconds)
|
||||
observerSamples map[string]int // observerID → number of multi-observer packets used
|
||||
nodeSkew txSkewResult
|
||||
lastComputed time.Time
|
||||
computeInterval time.Duration
|
||||
}
|
||||
|
||||
func NewClockSkewEngine() *ClockSkewEngine {
|
||||
return &ClockSkewEngine{
|
||||
observerOffsets: make(map[string]float64),
|
||||
observerOffsets: make(map[string]float64),
|
||||
observerSamples: make(map[string]int),
|
||||
nodeSkew: make(txSkewResult),
|
||||
nodeSkew: make(txSkewResult),
|
||||
computeInterval: 30 * time.Second,
|
||||
}
|
||||
}
|
||||
@@ -188,7 +189,6 @@ func (e *ClockSkewEngine) Recompute(store *PacketStore) {
|
||||
|
||||
// Swap results under brief write lock.
|
||||
e.mu.Lock()
|
||||
// Re-check: another goroutine may have computed while we were working.
|
||||
if time.Since(e.lastComputed) < e.computeInterval {
|
||||
e.mu.Unlock()
|
||||
return
|
||||
@@ -214,13 +214,13 @@ func collectSamples(store *PacketStore) []skewSample {
|
||||
if decoded == nil {
|
||||
continue
|
||||
}
|
||||
// Extract advert timestamp from decoded JSON.
|
||||
advertTS := extractTimestamp(decoded)
|
||||
if advertTS <= 0 {
|
||||
if advertTS < 0 {
|
||||
continue
|
||||
}
|
||||
// Sanity: skip timestamps before year 2020 or after year 2100.
|
||||
if advertTS < 1577836800 || advertTS > 4102444800 {
|
||||
// Allow epoch 0 and above (needed for default-epoch detection).
|
||||
// Upper bound: year 2100.
|
||||
if advertTS > 4102444800 {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -240,21 +240,43 @@ func collectSamples(store *PacketStore) []skewSample {
|
||||
return samples
|
||||
}
|
||||
|
||||
// timestampMissing is the sentinel returned by extractTimestamp when no
|
||||
// timestamp field is present in the decoded advert. Using -1 lets us
|
||||
// distinguish "field absent" from a real epoch-0 timestamp (ts == 0).
|
||||
const timestampMissing int64 = -1
|
||||
|
||||
// extractTimestamp gets the Unix timestamp from a decoded ADVERT payload.
|
||||
// Returns timestampMissing (-1) if no timestamp field is found.
|
||||
func extractTimestamp(decoded map[string]interface{}) int64 {
|
||||
// Try payload.timestamp first (nested in "payload" key).
|
||||
if payload, ok := decoded["payload"]; ok {
|
||||
if pm, ok := payload.(map[string]interface{}); ok {
|
||||
if ts := jsonNumber(pm, "timestamp"); ts > 0 {
|
||||
if ts, ok := jsonNumberOk(pm, "timestamp"); ok {
|
||||
return ts
|
||||
}
|
||||
}
|
||||
}
|
||||
// Fallback: top-level timestamp.
|
||||
if ts := jsonNumber(decoded, "timestamp"); ts > 0 {
|
||||
if ts, ok := jsonNumberOk(decoded, "timestamp"); ok {
|
||||
return ts
|
||||
}
|
||||
return 0
|
||||
return timestampMissing
|
||||
}
|
||||
|
||||
// jsonNumberOk extracts an int64 from a JSON-parsed map, returning (value, true)
|
||||
// if the key exists and is numeric, or (0, false) otherwise.
|
||||
func jsonNumberOk(m map[string]interface{}, key string) (int64, bool) {
|
||||
v, ok := m[key]
|
||||
if !ok || v == nil {
|
||||
return 0, false
|
||||
}
|
||||
switch n := v.(type) {
|
||||
case float64:
|
||||
return int64(n), true
|
||||
case int64:
|
||||
return n, true
|
||||
case int:
|
||||
return int64(n), true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// jsonNumber extracts an int64 from a JSON-parsed map (handles float64 and json.Number).
|
||||
@@ -281,7 +303,6 @@ func parseISO(s string) int64 {
|
||||
}
|
||||
t, err := time.Parse(time.RFC3339, s)
|
||||
if err != nil {
|
||||
// Try with fractional seconds.
|
||||
t, err = time.Parse("2006-01-02T15:04:05.999999999Z07:00", s)
|
||||
if err != nil {
|
||||
return 0
|
||||
@@ -295,19 +316,16 @@ func parseISO(s string) int64 {
|
||||
// calibrateObservers computes each observer's clock offset using multi-observer
|
||||
// packets. Returns offset map and sample count map.
|
||||
func calibrateObservers(samples []skewSample) (map[string]float64, map[string]int) {
|
||||
// Group observations by packet hash.
|
||||
byHash := make(map[string][]skewSample)
|
||||
for _, s := range samples {
|
||||
byHash[s.hash] = append(byHash[s.hash], s)
|
||||
}
|
||||
|
||||
// For each multi-observer packet, compute per-observer deviation from median.
|
||||
deviations := make(map[string][]float64) // observerID → list of deviations
|
||||
deviations := make(map[string][]float64)
|
||||
for _, group := range byHash {
|
||||
if len(group) < 2 {
|
||||
continue // single-observer packet, can't calibrate
|
||||
continue
|
||||
}
|
||||
// Compute median observation timestamp for this packet.
|
||||
obsTimes := make([]float64, len(group))
|
||||
for i, s := range group {
|
||||
obsTimes[i] = float64(s.observedTS)
|
||||
@@ -319,7 +337,6 @@ func calibrateObservers(samples []skewSample) (map[string]float64, map[string]in
|
||||
}
|
||||
}
|
||||
|
||||
// Each observer's offset = median of its deviations.
|
||||
offsets := make(map[string]float64, len(deviations))
|
||||
counts := make(map[string]int, len(deviations))
|
||||
for obsID, devs := range deviations {
|
||||
@@ -333,8 +350,6 @@ func calibrateObservers(samples []skewSample) (map[string]float64, map[string]in
|
||||
|
||||
// computeNodeSkew calculates corrected skew statistics for each node.
|
||||
func computeNodeSkew(samples []skewSample, obsOffsets map[string]float64) txSkewResult {
|
||||
// Compute corrected skew per sample, grouped by hash (each hash = one
|
||||
// node's advert transmission). The caller maps hash → pubkey via byNode.
|
||||
type correctedSample struct {
|
||||
skew float64
|
||||
observedTS int64
|
||||
@@ -349,8 +364,6 @@ func computeNodeSkew(samples []skewSample, obsOffsets map[string]float64) txSkew
|
||||
rawSkew := float64(s.advertTS - s.observedTS)
|
||||
corrected := rawSkew
|
||||
if hasCal {
|
||||
// Observer offset = obs_ts - median(all_obs_ts). If observer is ahead,
|
||||
// its obs_ts is inflated, making raw_skew too low. Add offset to correct.
|
||||
corrected = rawSkew + obsOffset
|
||||
}
|
||||
byHash[s.hash] = append(byHash[s.hash], correctedSample{
|
||||
@@ -361,10 +374,7 @@ func computeNodeSkew(samples []skewSample, obsOffsets map[string]float64) txSkew
|
||||
hashAdvertTS[s.hash] = s.advertTS
|
||||
}
|
||||
|
||||
// Each hash represents one advert from one node. Compute median corrected
|
||||
// skew per hash (across multiple observers).
|
||||
|
||||
result := make(map[string]*NodeClockSkew) // keyed by hash for now
|
||||
result := make(map[string]*NodeClockSkew)
|
||||
for hash, cs := range byHash {
|
||||
skews := make([]float64, len(cs))
|
||||
for i, c := range cs {
|
||||
@@ -373,29 +383,37 @@ func computeNodeSkew(samples []skewSample, obsOffsets map[string]float64) txSkew
|
||||
medSkew := median(skews)
|
||||
meanSkew := mean(skews)
|
||||
|
||||
// Find latest observation.
|
||||
var latestObsTS int64
|
||||
// Pick the skew from the most recent observation (max observedTS),
|
||||
// not the last-appended sample which may be non-chronological.
|
||||
var latest correctedSample
|
||||
var anyCal bool
|
||||
for _, c := range cs {
|
||||
if c.observedTS > latestObsTS {
|
||||
latestObsTS = c.observedTS
|
||||
if c.observedTS > latest.observedTS {
|
||||
latest = c
|
||||
}
|
||||
if c.calibrated {
|
||||
anyCal = true
|
||||
}
|
||||
}
|
||||
lastCorrectedSkew := latest.skew
|
||||
advTS := hashAdvertTS[hash]
|
||||
severity, matchedEpoch := classifySkew(advTS, lastCorrectedSkew)
|
||||
|
||||
absMedian := math.Abs(medSkew)
|
||||
result[hash] = &NodeClockSkew{
|
||||
ncs := &NodeClockSkew{
|
||||
MeanSkewSec: round(meanSkew, 1),
|
||||
MedianSkewSec: round(medSkew, 1),
|
||||
LastSkewSec: round(cs[len(cs)-1].skew, 1),
|
||||
Severity: classifySkew(absMedian),
|
||||
LastSkewSec: round(lastCorrectedSkew, 1),
|
||||
Severity: severity,
|
||||
SampleCount: len(cs),
|
||||
Calibrated: anyCal,
|
||||
LastAdvertTS: hashAdvertTS[hash],
|
||||
LastObservedTS: latestObsTS,
|
||||
LastAdvertTS: advTS,
|
||||
LastObservedTS: latest.observedTS,
|
||||
}
|
||||
if severity == SkewDefault {
|
||||
ep := matchedEpoch
|
||||
ncs.DefaultEpoch = &ep
|
||||
}
|
||||
result[hash] = ncs
|
||||
}
|
||||
return result
|
||||
}
|
||||
@@ -457,124 +475,45 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
|
||||
medSkew := median(allSkews)
|
||||
meanSkew := mean(allSkews)
|
||||
|
||||
// Severity is derived from RECENT samples only (issue #789). The
|
||||
// all-time median is poisoned by historical bad data — a node that
|
||||
// was off for hours and then GPS-corrected can have median = -59M sec
|
||||
// while its current skew is -0.8s. Operators need severity to reflect
|
||||
// current health, so they trust the dashboard.
|
||||
//
|
||||
// Sort tsSkews by time and take the last recentSkewWindowCount samples
|
||||
// (or all samples within recentSkewWindowSec of the latest, whichever
|
||||
// gives FEWER samples — we want the more-current view; a chatty node
|
||||
// can fit dozens of samples in 1h, in which case the count cap wins).
|
||||
sort.Slice(tsSkews, func(i, j int) bool { return tsSkews[i].ts < tsSkews[j].ts })
|
||||
// Classify using the most recent advert's raw timestamp and
|
||||
// the most recent corrected skew. No windowing or median-driven
|
||||
// severity — per-advert classification per the spec.
|
||||
severity, matchedEpoch := classifySkew(lastAdvTS, lastSkew)
|
||||
|
||||
recentSkew := lastSkew
|
||||
var recentVals []float64
|
||||
if n := len(tsSkews); n > 0 {
|
||||
latestTS := tsSkews[n-1].ts
|
||||
// Index-based window: last K samples.
|
||||
startByCount := n - recentSkewWindowCount
|
||||
if startByCount < 0 {
|
||||
startByCount = 0
|
||||
}
|
||||
// Time-based window: samples newer than latestTS - windowSec.
|
||||
startByTime := n - 1
|
||||
for i := n - 1; i >= 0; i-- {
|
||||
if latestTS-tsSkews[i].ts <= recentSkewWindowSec {
|
||||
startByTime = i
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
// Pick the narrower (larger-index) of the two windows — the most
|
||||
// current view of the node's clock health.
|
||||
start := startByCount
|
||||
if startByTime > start {
|
||||
start = startByTime
|
||||
}
|
||||
recentVals = make([]float64, 0, n-start)
|
||||
for i := start; i < n; i++ {
|
||||
recentVals = append(recentVals, tsSkews[i].skew)
|
||||
}
|
||||
if len(recentVals) > 0 {
|
||||
recentSkew = median(recentVals)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Bimodal detection (#845) ─────────────────────────────────────────
|
||||
// Split recent samples into "good" (|skew| <= 1h, real clock) and
|
||||
// "bad" (|skew| > 1h, firmware nonsense from uninitialized RTC).
|
||||
// Classification order (first match wins):
|
||||
// no_clock — goodFraction < 0.10 (essentially no real clock)
|
||||
// bimodal_clock — 0.10 <= goodFraction < 0.80 AND badCount > 0
|
||||
// ok/warn/etc. — goodFraction >= 0.80 (normal, outliers filtered)
|
||||
var goodSamples []float64
|
||||
for _, v := range recentVals {
|
||||
if math.Abs(v) <= bimodalSkewThresholdSec {
|
||||
goodSamples = append(goodSamples, v)
|
||||
}
|
||||
}
|
||||
recentSampleCount := len(recentVals)
|
||||
recentBadCount := recentSampleCount - len(goodSamples)
|
||||
var goodFraction float64
|
||||
if recentSampleCount > 0 {
|
||||
goodFraction = float64(len(goodSamples)) / float64(recentSampleCount)
|
||||
}
|
||||
|
||||
var severity SkewSeverity
|
||||
if goodFraction < 0.10 {
|
||||
// Essentially no real clock — classify as no_clock regardless
|
||||
// of the raw skew magnitude.
|
||||
severity = SkewNoClock
|
||||
} else if goodFraction < 0.80 && recentBadCount > 0 {
|
||||
// Bimodal: use median of GOOD samples as the "real" skew.
|
||||
severity = SkewBimodalClock
|
||||
if len(goodSamples) > 0 {
|
||||
recentSkew = median(goodSamples)
|
||||
}
|
||||
} else {
|
||||
// Normal path: if there are good samples, use their median
|
||||
// (filters out rare outliers in ≥80% good case).
|
||||
if len(goodSamples) > 0 && recentBadCount > 0 {
|
||||
recentSkew = median(goodSamples)
|
||||
}
|
||||
severity = classifySkew(math.Abs(recentSkew))
|
||||
}
|
||||
|
||||
// For no_clock / bimodal_clock nodes, skip drift when data is unreliable.
|
||||
// Drift: display only, not a classifier input.
|
||||
var drift float64
|
||||
if severity != SkewNoClock && severity != SkewBimodalClock && len(tsSkews) >= minDriftSamples {
|
||||
if severity != SkewDefault && len(tsSkews) >= minDriftSamples {
|
||||
drift = computeDrift(tsSkews)
|
||||
// Cap physically impossible drift rates.
|
||||
if math.Abs(drift) > maxReasonableDriftPerDay {
|
||||
drift = 0
|
||||
}
|
||||
}
|
||||
|
||||
// Build sparkline samples from tsSkews (already sorted by time above).
|
||||
// Build sparkline samples.
|
||||
sort.Slice(tsSkews, func(i, j int) bool { return tsSkews[i].ts < tsSkews[j].ts })
|
||||
samples := make([]SkewSample, len(tsSkews))
|
||||
for i, p := range tsSkews {
|
||||
samples[i] = SkewSample{Timestamp: p.ts, SkewSec: round(p.skew, 1)}
|
||||
}
|
||||
|
||||
return &NodeClockSkew{
|
||||
Pubkey: pubkey,
|
||||
MeanSkewSec: round(meanSkew, 1),
|
||||
MedianSkewSec: round(medSkew, 1),
|
||||
LastSkewSec: round(lastSkew, 1),
|
||||
RecentMedianSkewSec: round(recentSkew, 1),
|
||||
DriftPerDaySec: round(drift, 2),
|
||||
Severity: severity,
|
||||
SampleCount: totalSamples,
|
||||
Calibrated: anyCal,
|
||||
LastAdvertTS: lastAdvTS,
|
||||
LastObservedTS: lastObsTS,
|
||||
Samples: samples,
|
||||
GoodFraction: round(goodFraction, 2),
|
||||
RecentBadSampleCount: recentBadCount,
|
||||
RecentSampleCount: recentSampleCount,
|
||||
result := &NodeClockSkew{
|
||||
Pubkey: pubkey,
|
||||
MeanSkewSec: round(meanSkew, 1),
|
||||
MedianSkewSec: round(medSkew, 1),
|
||||
LastSkewSec: round(lastSkew, 1),
|
||||
DriftPerDaySec: round(drift, 2),
|
||||
Severity: severity,
|
||||
SampleCount: totalSamples,
|
||||
Calibrated: anyCal,
|
||||
LastAdvertTS: lastAdvTS,
|
||||
LastObservedTS: lastObsTS,
|
||||
Samples: samples,
|
||||
}
|
||||
if severity == SkewDefault {
|
||||
ep := matchedEpoch
|
||||
result.DefaultEpoch = &ep
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// GetFleetClockSkew returns clock skew data for all nodes that have skew data.
|
||||
@@ -583,7 +522,6 @@ func (s *PacketStore) GetFleetClockSkew() []*NodeClockSkew {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
|
||||
// Build name/role lookup from DB cache (requires s.mu held).
|
||||
allNodes, _ := s.getCachedNodesAndPM()
|
||||
nameMap := make(map[string]nodeInfo, len(allNodes))
|
||||
for _, ni := range allNodes {
|
||||
@@ -596,12 +534,10 @@ func (s *PacketStore) GetFleetClockSkew() []*NodeClockSkew {
|
||||
if cs == nil {
|
||||
continue
|
||||
}
|
||||
// Enrich with node name/role.
|
||||
if ni, ok := nameMap[pubkey]; ok {
|
||||
cs.NodeName = ni.Name
|
||||
cs.NodeRole = ni.Role
|
||||
}
|
||||
// Omit samples in fleet response (too much data).
|
||||
cs.Samples = nil
|
||||
results = append(results, cs)
|
||||
}
|
||||
@@ -626,7 +562,6 @@ func (s *PacketStore) GetObserverCalibrations() []ObserverCalibration {
|
||||
Samples: s.clockSkew.observerSamples[obsID],
|
||||
})
|
||||
}
|
||||
// Sort by absolute offset descending.
|
||||
sort.Slice(result, func(i, j int) bool {
|
||||
return math.Abs(result[i].OffsetSec) > math.Abs(result[j].OffsetSec)
|
||||
})
|
||||
@@ -667,38 +602,20 @@ type tsSkewPair struct {
|
||||
}
|
||||
|
||||
// computeDrift estimates linear drift in seconds per day from time-ordered
|
||||
// (timestamp, skew) pairs. Issue #789: a single GPS-correction event (huge
|
||||
// skew jump in seconds) used to dominate ordinary least squares and produce
|
||||
// absurd drift like 1.7M sec/day. We now:
|
||||
//
|
||||
// 1. Drop pairs whose consecutive skew jump exceeds maxPlausibleSkewJumpSec
|
||||
// (clock corrections, not physical drift). This protects both OLS-style
|
||||
// consumers and Theil-Sen.
|
||||
// 2. Use Theil-Sen regression — the slope is the median of all pairwise
|
||||
// slopes, naturally robust to remaining outliers (breakdown point ~29%).
|
||||
//
|
||||
// For very small samples after filtering we fall back to a simple slope
|
||||
// between first and last calibrated samples.
|
||||
// (timestamp, skew) pairs using Theil-Sen regression with outlier filtering.
|
||||
func computeDrift(pairs []tsSkewPair) float64 {
|
||||
if len(pairs) < 2 {
|
||||
return 0
|
||||
}
|
||||
// Sort by timestamp.
|
||||
sort.Slice(pairs, func(i, j int) bool {
|
||||
return pairs[i].ts < pairs[j].ts
|
||||
})
|
||||
|
||||
// Time span too short? Skip.
|
||||
spanSec := float64(pairs[len(pairs)-1].ts - pairs[0].ts)
|
||||
if spanSec < 3600 { // need at least 1 hour of data
|
||||
if spanSec < 3600 {
|
||||
return 0
|
||||
}
|
||||
|
||||
// Outlier filter: drop samples where the skew jumps more than
|
||||
// maxPlausibleSkewJumpSec from the running "stable" baseline.
|
||||
// We anchor on the first sample, then accept each subsequent point
|
||||
// that's within the threshold of the most recent accepted point —
|
||||
// this preserves a slow drift while rejecting correction events.
|
||||
filtered := make([]tsSkewPair, 0, len(pairs))
|
||||
filtered = append(filtered, pairs[0])
|
||||
for i := 1; i < len(pairs); i++ {
|
||||
@@ -707,30 +624,23 @@ func computeDrift(pairs []tsSkewPair) float64 {
|
||||
filtered = append(filtered, pairs[i])
|
||||
}
|
||||
}
|
||||
// If the filter killed too much (e.g. unstable node), fall back to the
|
||||
// raw series so we at least produce *something* — it'll be capped by
|
||||
// maxReasonableDriftPerDay downstream.
|
||||
if len(filtered) < 2 || float64(filtered[len(filtered)-1].ts-filtered[0].ts) < 3600 {
|
||||
filtered = pairs
|
||||
}
|
||||
|
||||
// Cap point count for Theil-Sen (O(n²) on pairs). Keep most-recent.
|
||||
if len(filtered) > theilSenMaxPoints {
|
||||
filtered = filtered[len(filtered)-theilSenMaxPoints:]
|
||||
}
|
||||
|
||||
return theilSenSlope(filtered) * 86400 // sec/sec → sec/day
|
||||
return theilSenSlope(filtered) * 86400
|
||||
}
|
||||
|
||||
// theilSenSlope returns the Theil-Sen estimator: median of all pairwise
|
||||
// slopes (yj - yi) / (tj - ti) for i < j. Naturally robust to outliers.
|
||||
// Pairs must be sorted by timestamp ascending.
|
||||
// theilSenSlope returns the Theil-Sen estimator: median of all pairwise slopes.
|
||||
func theilSenSlope(pairs []tsSkewPair) float64 {
|
||||
n := len(pairs)
|
||||
if n < 2 {
|
||||
return 0
|
||||
}
|
||||
// Pre-allocate: n*(n-1)/2 pairs.
|
||||
slopes := make([]float64, 0, n*(n-1)/2)
|
||||
for i := 0; i < n; i++ {
|
||||
for j := i + 1; j < n; j++ {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
241
docs/clock-skew-redesign.md
Normal file
241
docs/clock-skew-redesign.md
Normal file
@@ -0,0 +1,241 @@
|
||||
# Clock Skew Classifier — Redesign
|
||||
|
||||
**Status:** spec, pre-implementation
|
||||
**Supersedes:** parts of #690 / #789 / #845 / PR #894
|
||||
**Date drafted:** 2026-04-24
|
||||
|
||||
## Problem
|
||||
|
||||
The current classifier (`cmd/server/clock_skew.go`) uses windowed medians, hysteresis, "good fraction" floors, and a 365-day `no_clock` threshold. It produces:
|
||||
|
||||
- False `no_clock` flags on nodes whose clocks are working today but had garbage timestamps in recent samples.
|
||||
- Symmetric severity bands that conflate "clock at firmware default" with "operator set the clock wrong by a year" — completely different operator actions required.
|
||||
- Compounding over-engineering as each operator complaint added a new tier or window.
|
||||
|
||||
The actual physical reality of these devices is much simpler than the classifier assumes.
|
||||
|
||||
## Hardware reality
|
||||
|
||||
Most MeshCore nodes have **no auto-updating RTC**. There are two hardware paths:
|
||||
|
||||
1. **Volatile RTC nodes** (`firmware/src/helpers/ArduinoHelpers.h:11` — `VolatileRTCClock`):
|
||||
- On boot, `base_time` is hardcoded to a firmware-build constant (currently `1715770351` = 2024-05-15 20:52:31 UTC).
|
||||
- `getCurrentTime()` returns `base_time + millis()/1000`.
|
||||
- On reboot the value snaps back to the constant.
|
||||
- User must manually sync via companion app (`set time` CLI invokes `setCurrentTime(...)`) to set a real wall-clock time, which then ticks until the next reboot.
|
||||
|
||||
2. **Hardware-RTC nodes** (`firmware/src/helpers/AutoDiscoverRTCClock.cpp` — DS3231 / RV3028 / PCF8563):
|
||||
- Real-time chip with battery backup. Holds the time across reboots.
|
||||
- Behaves correctly once set; no default-snap behavior.
|
||||
|
||||
The `set time RESET` CLI command (`firmware/src/helpers/CommonCLI.cpp:215`) explicitly calls `setCurrentTime(1715770351)` regardless of hardware — so even hardware-RTC nodes can be deliberately reset to the default epoch.
|
||||
|
||||
**Therefore every node is in exactly one of these states:**
|
||||
|
||||
| State | Description |
|
||||
|---|---|
|
||||
| **Default / never set** | RTC is at a firmware-default epoch + ticking up since the last boot. |
|
||||
| **Set, drifting normally** | RTC was synced; small skew accumulating at ~0.8s/day per #789 reports. |
|
||||
| **Set, drifted past tolerance** | Like above but skew has grown beyond what's useful. |
|
||||
| **Wrong** | Operator-set incorrect time, or genuine RTC malfunction not matching any known default. |
|
||||
|
||||
There is no "bimodal RTC bug" — what looked bimodal in #845 is just a sequence of `defaulted → user sync → reboot → defaulted again`. The "bad" timestamps are not noise; they're a constant (the default epoch + a small uptime).
|
||||
|
||||
## Production data analysis (2026-04-24)
|
||||
|
||||
### 00id.net (this deployment, 416 nodes, commit `abd9c46`)
|
||||
|
||||
`lastSkewSec` (advert_ts − observed_ts) distribution:
|
||||
|
||||
| Bucket | Count | Pct |
|
||||
|---|---:|---:|
|
||||
| OK ≤15s | 90 | 22% |
|
||||
| Degrading ≤60s | 93 | 22% |
|
||||
| Degraded ≤10m | 13 | 3% |
|
||||
| off ≤1d | 5 | 1% |
|
||||
| off ≤1y | 110 | 26% |
|
||||
| absurd >1y | 105 | 25% |
|
||||
|
||||
Per-node `lastAdvertTS` raw timestamp distribution shows a sharp default cluster:
|
||||
|
||||
```
|
||||
+0 days count=19 samples=114969 ← exactly at 1715770351 (just rebooted)
|
||||
+1d count=9 samples=24766
|
||||
+2d count=7 samples=58101
|
||||
+3d count=2 samples=360
|
||||
... ← decay through ~110 days
|
||||
+113d count=2 samples=53776
|
||||
```
|
||||
|
||||
103 of 416 nodes (25%) have `lastAdvertTS` between `1715770351` and `1715770351 + 1095 days`, consistent with the volatile-RTC-default-ticking-up pattern.
|
||||
|
||||
A second cluster of 5 nodes has `lastAdvertTS = 1672531542` = `1672531200 + 342s` (≈6 min) = **2023-01-01 00:00:00 UTC** + small uptime. This is a *different* firmware-default epoch from an older firmware version.
|
||||
|
||||
### Cascadia (analyzer.cascadiamesh.org, 433 nodes in 5000-packet sample, commit `111b03c` v3.5.0)
|
||||
|
||||
ADVERT timestamp by year-month:
|
||||
|
||||
```
|
||||
1970-01 1 ← epoch zero (ESP32 native fallback OR ancient firmware)
|
||||
2021-01 1 ← possible third default epoch
|
||||
2023-01 2 ← old firmware default (matches 00id)
|
||||
2024-05 60 ← current VolatileRTCClock + days uptime
|
||||
2024-06 39 ← same default + weeks uptime
|
||||
2024-07 21
|
||||
2024-08 10
|
||||
2024-09 2
|
||||
2024-10 1
|
||||
2024-11 2 ← decays out as fewer nodes have multi-month uptime since reboot
|
||||
2025-10 1 ← pre-current-now miscellany
|
||||
2025-11 2
|
||||
2026-03 4
|
||||
2026-04 285 ← currently set clocks (this is "now-ish")
|
||||
2027-04 1 ← operator set wrong by ~1 year (typo?)
|
||||
2067-12 1 ← operator set wildly wrong / corrupted RTC
|
||||
```
|
||||
|
||||
Confirms the model: ~67% of nodes have a current clock, ~32% are at known firmware defaults at varying uptime offsets, ~3 outliers represent genuine misconfigurations.
|
||||
|
||||
## Known firmware default epochs
|
||||
|
||||
These are the values discovered in production data so far:
|
||||
|
||||
| Epoch (unix) | UTC | Source |
|
||||
|---:|---|---|
|
||||
| `0` | 1970-01-01 | Likely ESP32 boot when no RTC initialization runs (`time(NULL)` returns 0). |
|
||||
| `1609459200` | 2021-01-01 | Speculation — single-sample evidence, validate as more data arrives. |
|
||||
| `1672531200` | 2023-01-01 | Older firmware `VolatileRTCClock::base_time` value. |
|
||||
| `1715770351` | 2024-05-15 20:52:31 | **Current** `VolatileRTCClock` constructor + `set time RESET` CLI. |
|
||||
|
||||
Treat the table as data, not fixed code. New firmware versions will introduce new defaults; expect to add to the list over time.
|
||||
|
||||
## Reconciliation with #690 — the four timestamps
|
||||
|
||||
#690 lists three timestamps; in practice there are four signals worth distinguishing:
|
||||
|
||||
| Signal | Source | Used for |
|
||||
|---|---|---|
|
||||
| `advert_ts` | Inside MeshCore packet, set by sending node | Per-node classification (THE signal). |
|
||||
| `mqtt_envelope_ts` | Set by observer when it forwards via MQTT | Observer-side calibration only — *not* a direct node-skew signal because observer clock can itself be wrong. |
|
||||
| `corescope_received_ts` | Wall clock when CoreScope ingested the message | Reference "now"; calibration cross-check. |
|
||||
| `same_packet_across_observers` | Multiple observers seeing the same hash | Phase 2 calibration (triangulation). |
|
||||
|
||||
**Inputs flow:**
|
||||
|
||||
1. **Phase 2 (existing, kept):** for each packet hash seen by ≥2 observers, compute each observer's deviation from the per-packet median observed_ts → `observerOffset`. This is the triangulation #690 calls for ("Same packet observed by more than one (ideally 3+) observers gives good indication if one observer is off"). Observer offsets are the calibration table.
|
||||
2. **Per-advert correction (existing, kept):** `correctedSkew = (advert_ts - observed_ts) + observerOffset[observer_id]`. If no calibration exists for an observer, fall back to raw skew with `calibrated: false`.
|
||||
3. **Default detection (new):** runs on RAW `advert_ts`, not corrected. The firmware default is a fixed wall-clock value; observer offsets are seconds-to-minutes scale and cannot move `advert_ts` from 2024 to 2026. Default check is independent of calibration.
|
||||
4. **Severity classification (new):** if `is_default(advert_ts)` → `default`; else classify by `|correctedSkew|` band.
|
||||
|
||||
This keeps everything #690 asks for (observer detection, bias subtraction, triangulation), and adds the firmware-default cluster as a new pre-empting tier.
|
||||
|
||||
## UI: explain WHY (#690 requirement)
|
||||
|
||||
The classifier alone doesn't satisfy #690's "present on the UI why clock skew is obvious or suspected." The evidence panel from PR #906 (per-hash observer breakdown showing raw vs corrected skew per observer) is the WHY.
|
||||
|
||||
For each per-node clock card the UI must show:
|
||||
|
||||
- **Tier badge** (default / ok / degrading / degraded / wrong) + magnitude.
|
||||
- **Plain-English reason line**: e.g. "Last advert at 2024-05-15 + 3.2 days uptime — matches firmware default (volatile RTC, not yet user-set)" or "Last advert −12s vs wall clock — within OK tolerance."
|
||||
- **Calibration footnote**: "Skew corrected using observer X offset +1.7s (computed from 412 multi-observer packets)" or "Single-observer measurement, no calibration available."
|
||||
- **Evidence accordion** (PR #906 shape, retained): for the most recent N hashes, each observer's raw vs corrected skew + the observer's offset.
|
||||
|
||||
For the per-observer page (also from PR #906): show the observer's offset, the multi-observer sample count, and a tier badge using the same scale (treating `|observerOffset|` as the skew).
|
||||
|
||||
## Proposed classifier
|
||||
|
||||
Per-advert classification, no windowing:
|
||||
|
||||
```python
|
||||
DEFAULT_EPOCHS = [0, 1609459200, 1672531200, 1715770351]
|
||||
MAX_PLAUSIBLE_UPTIME_SEC = 730 * 86400  # 2 years (must agree with the edge case below: default + 731 days exceeds the cap)
|
||||
|
||||
def is_default(ts):
|
||||
return any(d <= ts <= d + MAX_PLAUSIBLE_UPTIME_SEC for d in DEFAULT_EPOCHS)
|
||||
|
||||
def classify(advert_ts, corrected_skew_sec):
|
||||
if is_default(advert_ts):
|
||||
return "default" # gray
|
||||
abs_skew = abs(corrected_skew_sec)
|
||||
if abs_skew <= 15: return "ok" # green
|
||||
if abs_skew <= 60: return "degrading" # yellow
|
||||
if abs_skew <= 600: return "degraded" # orange
|
||||
return "wrong" # red
|
||||
```
|
||||
|
||||
`corrected_skew_sec` is the observer-bias-subtracted skew per Phase 2 calibration. Default detection is independent of calibration (runs on raw `advert_ts`).
|
||||
|
||||
Per-node state = classification of the node's most-recent advert (per hash, picking the most recent observation across all observers). No medians, no good-fraction, no hysteresis.
|
||||
|
||||
## Severity tier definitions
|
||||
|
||||
| Tier | Condition | Color | UI label | Meaning |
|
||||
|---|---|---|---|---|
|
||||
| `default` | Advert ts within `[default, default + 2y]` of any known epoch | Gray | "Default" | Volatile RTC at firmware boot constant; never set or rebooted and not re-synced. |
|
||||
| `ok` | abs(skew) ≤ 15s | Green | "OK" | Working clock. |
|
||||
| `degrading` | 15s < abs(skew) ≤ 60s | Yellow | "Degrading" | Real but accumulating drift. |
|
||||
| `degraded` | 60s < abs(skew) ≤ 600s | Orange | "Degraded" | Off by minutes — needs re-sync. |
|
||||
| `wrong` | abs(skew) > 600s and not `default` | Red | "Wrong" | Operator-set error or RTC malfunction. |
|
||||
|
||||
## What this kills
|
||||
|
||||
- The 365-day `no_clock` threshold and the entire `recentSkewWindow{Count,Sec}` machinery.
|
||||
- The hysteresis / `goodFraction` / `longTermGoodFraction` logic from PR #894.
|
||||
- The proposed `bimodal_clock` tier from #845 — the pattern is not bimodal, it's defaulted vs set.
|
||||
- All Theil-Sen drift calculations as classifier inputs (drift remains a derived display value).
|
||||
|
||||
## What this preserves
|
||||
|
||||
- **Phase 2 observer calibration** (`calibrateObservers()`) — kept verbatim. It's what powers the "subtract observer bias" requirement from #690 and provides the triangulation evidence the UI needs.
|
||||
- **Drift display** (computed but not classifying).
|
||||
- **PR #906 evidence UI** — orthogonal to the classifier; it is in fact the implementation of #690's "explain WHY" requirement. Only label strings change to match the new tier names.
|
||||
- **`/api/observers/clock-skew`** — unchanged shape.
|
||||
|
||||
## API impact
|
||||
|
||||
`/api/nodes/{pubkey}/clock-skew` response changes:
|
||||
|
||||
- `severity` enum: `default | ok | degrading | degraded | wrong` (replacing the old `ok | warning | critical | absurd | no_clock | bimodal_clock` values).
|
||||
- New field `defaultEpoch` (int, optional): if `severity == "default"`, the matched epoch.
|
||||
- Drop fields: `recentMedianSkewSec`, `goodFraction`, `recentBadSampleCount`, `longTermGoodFraction`.
|
||||
- Keep: `lastSkewSec`, `medianSkewSec`, `meanSkewSec`, `driftPerDaySec`, `sampleCount`, `calibrated`, `lastAdvertTS`, `lastObservedTS`, `nodeName`, `nodeRole`.
|
||||
|
||||
`/api/nodes/clock-skew` (fleet) shape unchanged except severity enum values.
|
||||
|
||||
## UI impact
|
||||
|
||||
- New CSS classes `skew-badge--default`, `skew-badge--degrading`, `skew-badge--degraded`, `skew-badge--wrong`. Drop `--no_clock`, `--severe`, `--warn`, `--absurd`, `--bimodal_clock`.
|
||||
- Tooltip text updated per tier.
|
||||
- "Default" badge tooltip should explain the clock is at firmware default plus uptime since boot, and the operator hasn't set it yet (or hasn't re-set it since the last reboot).
|
||||
|
||||
## Migration
|
||||
|
||||
Single PR replaces the classifier in `clock_skew.go` and updates the frontend badges/labels. No database schema change, no data migration — all per-call computation.
|
||||
|
||||
## Open issues to close
|
||||
|
||||
- **#789** (median hides corrected clocks) — resolved by per-advert classification.
|
||||
- **#845** (bimodal_clock tier) — replaced by `default` tier; the pattern that motivated it is correctly captured.
|
||||
- **PR #894** — close without merging; this design supersedes Option C entirely.
|
||||
- **#690** UI completion (PR #906) — keeps moving in parallel; only label updates needed.
|
||||
|
||||
## Validation plan
|
||||
|
||||
1. Hand-run the classifier against a snapshot of `/api/nodes/clock-skew` from 00id and cascadia. Confirm:
|
||||
- All 103 00id "absurd" nodes reclassify as `default`.
|
||||
- All 5 cascadia 2023-01 nodes reclassify as `default`.
|
||||
- The 2027 / 2067 cascadia outliers reclassify as `wrong`.
|
||||
- The 285 cascadia 2026-04 nodes reclassify as `ok` (or `degrading` if drift exceeds 15s).
|
||||
2. Add per-tier unit tests in `cmd/server/clock_skew_test.go`.
|
||||
3. Add a regression test for each known default epoch (synthesize advert at `default + 0s`, `default + 1d`, `default + 2y - 1s` → all classify as `default`).
|
||||
4. Edge cases:
|
||||
- `advert_ts == 0` → matches default epoch 0.
|
||||
- `advert_ts == 1715770351 + 731 days` → no longer matches (uptime cap exceeded) — should fall through to time-based classification, likely `wrong`.
|
||||
- Future timestamps beyond `now + 600s` → `wrong`.
|
||||
|
||||
## Out of scope (follow-ups)
|
||||
|
||||
- Per-firmware-version known-default lookup (when `firmware_version` field becomes reliable on adverts).
|
||||
- Reboot-count / flakiness indicator ("this node has hit default N times in last 30d").
|
||||
- Auto-discovery of new default epochs from clustering analysis (could detect a 4th default emerging in the wild).
|
||||
- Filtering defaulted-clock adverts out of time-windowed analytics queries (separate spec — affects path attribution).
|
||||
@@ -3495,12 +3495,12 @@ function destroy() { _analyticsData = {}; _channelData = null; if (_ngState && _
|
||||
});
|
||||
|
||||
// Summary
|
||||
var counts = { ok: 0, warning: 0, critical: 0, absurd: 0 };
|
||||
var counts = { ok: 0, degrading: 0, degraded: 0, wrong: 0, default: 0 };
|
||||
data.forEach(function(n) { if (counts[n.severity] !== undefined) counts[n.severity]++; });
|
||||
|
||||
// Filter buttons (also serve as summary — no separate stats pills needed)
|
||||
var filterColors = { ok: 'var(--status-green)', warning: 'var(--status-yellow)', critical: 'var(--status-orange)', absurd: 'var(--status-purple)', no_clock: 'var(--text-muted)' };
|
||||
var filters = ['all', 'ok', 'warning', 'critical', 'absurd', 'no_clock'];
|
||||
var filterColors = { ok: 'var(--status-green)', degrading: 'var(--status-yellow)', degraded: 'var(--status-orange)', wrong: 'var(--status-red)', default: 'var(--text-muted)' };
|
||||
var filters = ['all', 'ok', 'degrading', 'degraded', 'wrong', 'default'];
|
||||
var filterHtml = '<div style="margin-bottom:10px">' + filters.map(function(f) {
|
||||
var dot = f !== 'all' ? '<span style="display:inline-block;width:8px;height:8px;border-radius:50%;background:' + filterColors[f] + ';margin-right:4px;vertical-align:middle"></span>' : '';
|
||||
return '<button class="clock-filter-btn' + (activeFilter === f ? ' active' : '') + '" data-filter="' + f + '">' +
|
||||
@@ -3513,8 +3513,8 @@ function destroy() { _analyticsData = {}; _channelData = null; if (_ngState && _
|
||||
var rowClass = 'clock-fleet-row--' + (n.severity || 'ok');
|
||||
var lastAdv = n.lastObservedTS ? new Date(n.lastObservedTS * 1000).toISOString().replace('T', ' ').replace(/\.\d+Z/, ' UTC') : '—';
|
||||
var skewVal = window.currentSkewValue(n);
|
||||
var skewText = n.severity === 'no_clock' ? 'No Clock' : formatSkew(skewVal);
|
||||
var driftText = n.severity === 'no_clock' || !n.driftPerDaySec ? '–' : formatDrift(n.driftPerDaySec);
|
||||
var skewText = n.severity === 'default' ? 'Default' : formatSkew(skewVal);
|
||||
var driftText = n.severity === 'default' || !n.driftPerDaySec ? '–' : formatDrift(n.driftPerDaySec);
|
||||
return '<tr class="' + rowClass + '" data-pubkey="' + esc(n.pubkey) + '" style="cursor:pointer">' +
|
||||
'<td><strong>' + esc(n.nodeName || n.pubkey.slice(0, 12)) + '</strong></td>' +
|
||||
'<td style="font-family:var(--mono,monospace)">' + skewText + '</td>' +
|
||||
|
||||
@@ -808,7 +808,7 @@
|
||||
let _themeRefreshHandler = null;
|
||||
|
||||
let _allNodes = null; // cached full node list
|
||||
let _fleetSkew = null; // cached clock skew map: pubkey → {severity, recentMedianSkewSec, medianSkewSec, ...}
|
||||
let _fleetSkew = null; // cached clock skew map: pubkey → {severity, medianSkewSec, ...}
|
||||
|
||||
/**
|
||||
* Fetch per-node clock skew and render into the given container element.
|
||||
@@ -824,14 +824,28 @@
|
||||
var driftHtml = cs.driftPerDaySec ? '<div style="font-size:12px;color:var(--text-muted);margin-top:2px">Drift: ' + formatDrift(cs.driftPerDaySec) + '</div>' : '';
|
||||
var sparkHtml = renderSkewSparkline(cs.samples, 200, 32);
|
||||
var skewVal = window.currentSkewValue(cs);
|
||||
var skewDisplay = cs.severity === 'no_clock'
|
||||
? '<span style="font-size:18px;font-weight:700;color:var(--text-muted)">No Clock</span>'
|
||||
var skewDisplay = cs.severity === 'default'
|
||||
? '<span style="font-size:18px;font-weight:700;color:var(--text-muted)">Default</span>'
|
||||
: '<span style="font-size:18px;font-weight:700;font-family:var(--mono)">' + formatSkew(skewVal) + '</span>';
|
||||
var bimodalWarning = '';
|
||||
if (cs.severity === 'bimodal_clock') {
|
||||
var totalRecent = cs.recentSampleCount || 0;
|
||||
bimodalWarning = '<div style="font-size:12px;color:var(--status-amber-text);margin-top:4px">⚠️ ' + (cs.recentBadSampleCount || '?') + ' of last ' + (totalRecent || '?') + ' adverts had nonsense timestamps (likely RTC reset)</div>';
|
||||
|
||||
// Per-tier explainer line (plain English reason).
|
||||
var explainer = '';
|
||||
var absSkew = Math.abs(cs.lastSkewSec || 0);
|
||||
var skewStr = Math.round(absSkew) + 's';
|
||||
if (cs.severity === 'default') {
|
||||
var isoAdv = cs.lastAdvertTS ? new Date(cs.lastAdvertTS * 1000).toISOString() : '?';
|
||||
explainer = 'Last advert at ' + isoAdv + ' — matches firmware default (volatile RTC, not user-set since boot)';
|
||||
} else if (cs.severity === 'ok') {
|
||||
explainer = 'Last advert ' + skewStr + ' vs wall clock — within OK tolerance (≤15s)';
|
||||
} else if (cs.severity === 'degrading') {
|
||||
explainer = 'Last advert ' + skewStr + ' vs wall clock — drift accumulating (≤60s)';
|
||||
} else if (cs.severity === 'degraded') {
|
||||
explainer = 'Last advert ' + skewStr + ' vs wall clock — significantly off (≤10m)';
|
||||
} else if (cs.severity === 'wrong') {
|
||||
explainer = 'Last advert ' + skewStr + ' vs wall clock — clock incorrect (operator-set or RTC failure)';
|
||||
}
|
||||
var explainerHtml = explainer ? '<div style="font-size:12px;color:var(--text-muted);margin-top:4px">' + explainer + '</div>' : '';
|
||||
|
||||
container.innerHTML =
|
||||
'<h4 style="margin:0 0 6px">⏰ Clock Skew</h4>' +
|
||||
'<div style="display:flex;align-items:center;gap:12px;flex-wrap:wrap">' +
|
||||
@@ -839,9 +853,9 @@
|
||||
renderSkewBadge(cs.severity, skewVal, cs) +
|
||||
(cs.calibrated ? ' <span style="font-size:10px;color:var(--text-muted)" title="Observer-calibrated">✓ calibrated</span>' : '') +
|
||||
'</div>' +
|
||||
explainerHtml +
|
||||
driftHtml +
|
||||
(sparkHtml ? '<div class="skew-sparkline-wrap" style="margin-top:8px">' + sparkHtml + '<div style="font-size:10px;color:var(--text-muted)">Skew over time (' + (cs.samples || []).length + ' samples)</div></div>' : '') +
|
||||
bimodalWarning;
|
||||
(sparkHtml ? '<div class="skew-sparkline-wrap" style="margin-top:8px">' + sparkHtml + '<div style="font-size:10px;color:var(--text-muted)">Skew over time (' + (cs.samples || []).length + ' samples)</div></div>' : '');
|
||||
} catch (e) {
|
||||
// Non-fatal — section stays hidden
|
||||
}
|
||||
|
||||
@@ -397,17 +397,16 @@
|
||||
|
||||
// #690 — Clock Skew shared helpers
|
||||
var SKEW_SEVERITY_COLORS = {
|
||||
default: 'var(--text-muted)',
|
||||
ok: 'var(--status-green)',
|
||||
warning: 'var(--status-yellow)',
|
||||
critical: 'var(--status-orange)',
|
||||
absurd: 'var(--status-purple)',
|
||||
bimodal_clock: 'var(--status-amber)',
|
||||
no_clock: 'var(--text-muted)'
|
||||
degrading: 'var(--status-yellow)',
|
||||
degraded: 'var(--status-orange)',
|
||||
wrong: 'var(--status-red)'
|
||||
};
|
||||
var SKEW_SEVERITY_LABELS = {
|
||||
ok: 'OK', warning: 'Warning', critical: 'Critical', absurd: 'Absurd', bimodal_clock: 'Bimodal', no_clock: 'No Clock'
|
||||
default: 'Default', ok: 'OK', degrading: 'Degrading', degraded: 'Degraded', wrong: 'Wrong'
|
||||
};
|
||||
var SKEW_SEVERITY_ORDER = { no_clock: 0, bimodal_clock: 1, absurd: 2, critical: 3, warning: 4, ok: 5 };
|
||||
var SKEW_SEVERITY_ORDER = { default: 0, wrong: 1, degraded: 2, degrading: 3, ok: 4 };
|
||||
|
||||
window.SKEW_SEVERITY_COLORS = SKEW_SEVERITY_COLORS;
|
||||
window.SKEW_SEVERITY_LABELS = SKEW_SEVERITY_LABELS;
|
||||
@@ -430,26 +429,19 @@
|
||||
return (secPerDay >= 0 ? '+' : '') + secPerDay.toFixed(1) + ' s/day';
|
||||
};
|
||||
|
||||
/** Pick the skew value that drives current-health UI: prefer the
|
||||
* recent-window median (#789, current health) over the all-time median
|
||||
* (poisoned by historical bad samples). Falls back gracefully if the
|
||||
* field isn't present (older API responses). */
|
||||
/** Pick the skew value that drives current-health UI. Uses lastSkewSec
|
||||
* (most recent corrected skew) when available, falls back to medianSkewSec. */
|
||||
window.currentSkewValue = function(cs) {
|
||||
if (!cs) return null;
|
||||
return cs.recentMedianSkewSec != null ? cs.recentMedianSkewSec : cs.medianSkewSec;
|
||||
return cs.lastSkewSec != null ? cs.lastSkewSec : cs.medianSkewSec;
|
||||
};
|
||||
|
||||
/** Render a clock skew badge HTML */
|
||||
window.renderSkewBadge = function(severity, skewSec, cs) {
|
||||
if (!severity) return '';
|
||||
var cls = 'skew-badge skew-badge--' + severity;
|
||||
if (severity === 'no_clock') {
|
||||
return '<span class="' + cls + '" title="Uninitialized RTC — no valid clock">🚫 No Clock</span>';
|
||||
}
|
||||
if (severity === 'bimodal_clock' && cs) {
|
||||
var badPct = cs.goodFraction != null ? Math.round((1 - cs.goodFraction) * 100) : '?';
|
||||
var label = '⏰ ' + window.formatSkew(skewSec);
|
||||
return '<span class="' + cls + '" title="Clock skew: ' + window.formatSkew(skewSec) + ' (bimodal: ' + badPct + '% of recent adverts have nonsense timestamps)">' + label + '</span>';
|
||||
if (severity === 'default') {
|
||||
return '<span class="' + cls + '" title="Firmware default clock — volatile RTC not yet user-set since boot">⏰ Default</span>';
|
||||
}
|
||||
var label = severity === 'ok' ? '⏰' : '⏰ ' + window.formatSkew(skewSec);
|
||||
return '<span class="' + cls + '" title="Clock skew: ' + window.formatSkew(skewSec) + ' (' + (SKEW_SEVERITY_LABELS[severity] || severity) + ')">' + label + '</span>';
|
||||
|
||||
@@ -2291,22 +2291,21 @@ th.sort-active { color: var(--accent, #60a5fa); }
|
||||
|
||||
/* #690 — Clock Skew badges & fleet table */
|
||||
.skew-badge { display: inline-block; font-size: 10px; padding: 1px 5px; border-radius: 3px; margin-left: 4px; font-weight: 600; white-space: nowrap; }
|
||||
.skew-badge--default { background: var(--text-muted); color: #fff; }
|
||||
.skew-badge--ok { background: var(--status-green); color: #fff; }
|
||||
.skew-badge--warning { background: var(--status-yellow); color: #000; }
|
||||
.skew-badge--critical { background: var(--status-orange); color: #fff; }
|
||||
.skew-badge--absurd { background: var(--status-purple); color: #fff; }
|
||||
.skew-badge--no_clock { background: var(--text-muted); color: #fff; }
|
||||
.skew-badge--bimodal_clock { background: var(--status-amber-light); color: var(--status-amber-text); border: 1px solid var(--status-amber); }
|
||||
.skew-badge--degrading { background: var(--status-yellow); color: #000; }
|
||||
.skew-badge--degraded { background: var(--status-orange); color: #fff; }
|
||||
.skew-badge--wrong { background: var(--status-red); color: #fff; }
|
||||
|
||||
.skew-detail-section { padding: 10px 16px; margin-bottom: 8px; }
|
||||
.skew-sparkline-wrap { margin-top: 6px; }
|
||||
.skew-sparkline-wrap svg { display: block; }
|
||||
|
||||
|
||||
.clock-fleet-row--warning { background: color-mix(in srgb, var(--status-yellow) 10%, transparent); }
|
||||
.clock-fleet-row--critical { background: color-mix(in srgb, var(--status-orange) 10%, transparent); }
|
||||
.clock-fleet-row--absurd { background: color-mix(in srgb, var(--status-purple) 10%, transparent); }
|
||||
.clock-fleet-row--no_clock { background: color-mix(in srgb, var(--text-muted) 10%, transparent); }
|
||||
.clock-fleet-row--degrading { background: color-mix(in srgb, var(--status-yellow) 10%, transparent); }
|
||||
.clock-fleet-row--degraded { background: color-mix(in srgb, var(--status-orange) 10%, transparent); }
|
||||
.clock-fleet-row--wrong { background: color-mix(in srgb, var(--status-red) 10%, transparent); }
|
||||
.clock-fleet-row--default { background: color-mix(in srgb, var(--text-muted) 10%, transparent); }
|
||||
|
||||
.clock-filter-btn { font-size: 12px; padding: 3px 8px; border: 1px solid var(--border); border-radius: 4px; background: var(--card-bg, #fff); color: var(--text); cursor: pointer; margin-right: 4px; }
|
||||
.clock-filter-btn.active { background: var(--accent); color: #fff; border-color: var(--accent); }
|
||||
|
||||
@@ -5904,12 +5904,11 @@ console.log('\n=== channel-decrypt.js: key derivation, MAC, parsing, storage ===
|
||||
assert.strictEqual(ctx.window.renderSkewBadge(null, 0), '');
|
||||
});
|
||||
|
||||
test('renderSkewBadge renders bimodal_clock badge with tooltip (#845)', () => {
|
||||
var cs = { goodFraction: 0.6, recentBadSampleCount: 4, recentSampleCount: 10 };
|
||||
var html = ctx.window.renderSkewBadge('bimodal_clock', -5, cs);
|
||||
assert.ok(html.includes('skew-badge--bimodal_clock'), 'should contain bimodal_clock class');
|
||||
assert.ok(html.includes('bimodal'), 'tooltip should mention bimodal');
|
||||
assert.ok(html.includes('40%'), 'tooltip should show bad percentage');
|
||||
test('renderSkewBadge renders default badge with tooltip', () => {
|
||||
var cs = {};
|
||||
var html = ctx.window.renderSkewBadge('default', 0, cs);
|
||||
assert.ok(html.includes('skew-badge--default'), 'should contain default class');
|
||||
assert.ok(html.toLowerCase().includes('firmware default'), 'tooltip should mention firmware default');
|
||||
assert.ok(html.includes('⏰'), 'should contain clock emoji');
|
||||
});
|
||||
|
||||
@@ -5933,9 +5932,9 @@ console.log('\n=== channel-decrypt.js: key derivation, MAC, parsing, storage ===
|
||||
|
||||
test('SKEW_SEVERITY_ORDER sorts worst first', () => {
|
||||
var order = ctx.window.SKEW_SEVERITY_ORDER;
|
||||
assert.ok(order.absurd < order.critical, 'absurd should sort before critical');
|
||||
assert.ok(order.critical < order.warning, 'critical should sort before warning');
|
||||
assert.ok(order.warning < order.ok, 'warning should sort before ok');
|
||||
assert.ok(order.wrong < order.degraded, 'wrong should sort before degraded');
|
||||
assert.ok(order.degraded < order.degrading, 'degraded should sort before degrading');
|
||||
assert.ok(order.degrading < order.ok, 'degrading should sort before ok');
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user