Compare commits

...

12 Commits

Author SHA1 Message Date
you
86ca793b60 fix: bump default-epoch uptime cap to 3 years for solar repeater lifetimes 2026-04-25 00:08:09 +00:00
you
4291b387f5 fix: classifySkew defensive absolute value 2026-04-24 23:49:21 +00:00
you
3cd7186563 test: integration tests for epoch-0 and missing-timestamp adverts
TestGetNodeClockSkew_EpochZeroAdvert: verifies advert with timestamp==0
flows through PacketStore and classifies as severity=default, epoch=0.

TestGetNodeClockSkew_MissingTimestamp: verifies advert with no timestamp
field is skipped (extractTimestamp returns -1, filtered by collectSamples).

Review item #4 on PR #907.
2026-04-24 23:42:56 +00:00
you
86a4403136 fix: computeNodeSkew picks chronologically-latest observation
Uses max observedTS instead of last-appended slice element to
determine the most recent skew sample per hash. Consolidates
the latestObsTS and anyCal loop into a single pass.

Review item #3 on PR #907.
2026-04-24 23:42:56 +00:00
you
c46a60f78a fix: rename CSS fleet row classes to match severity names
.clock-fleet-row--warning  → .clock-fleet-row--degrading
.clock-fleet-row--critical → .clock-fleet-row--degraded

The JS in analytics.js builds classes as 'clock-fleet-row--<severity>'
so the CSS must match the actual severity strings.

Review item #2 on PR #907.
2026-04-24 23:42:56 +00:00
you
d4b1aa40d0 fix: extractTimestamp returns -1 sentinel for missing timestamp
Distinguishes 'no timestamp field' (returns -1) from real epoch-0
(returns 0). Adds jsonNumberOk helper that returns (value, bool).
The collectSamples guard 'advertTS < 0' correctly filters missing
timestamps while allowing epoch-0 through to isDefaultEpoch.

Updates TestExtractTimestamp to verify both cases.

Review item #1 on PR #907.
2026-04-24 23:42:56 +00:00
you
d617a55155 polish: remove leftover recentMedianSkewSec comment, add overlapping-epoch test
- Remove stale 'recentMedianSkewSec' reference in nodes.js comment
- Add TestIsDefault_OverlappingWindowsPicksLargest covering epoch
  selection when default ranges overlap
2026-04-24 23:33:49 +00:00
you
2106cc0b8b ui: per-tier explainer line in node clock card 2026-04-24 23:29:26 +00:00
you
0acbac6fde ui: rename clock skew tiers to default/ok/degrading/degraded/wrong 2026-04-24 23:27:16 +00:00
you
2c675f5ab2 test: cover default-detection classifier tiers and edge cases 2026-04-24 23:25:35 +00:00
you
545df2788d feat: replace clock skew classifier with default-detection model 2026-04-24 23:20:12 +00:00
you
f872fd90bf docs: clock-skew classifier redesign spec 2026-04-24 23:18:15 +00:00
8 changed files with 779 additions and 906 deletions

View File

@@ -12,20 +12,28 @@ import (
type SkewSeverity string
const (
SkewOK SkewSeverity = "ok" // < 5 min
SkewWarning SkewSeverity = "warning" // 5 min – 1 hour
SkewCritical SkewSeverity = "critical" // 1 hour – 30 days
SkewAbsurd SkewSeverity = "absurd" // > 30 days
SkewNoClock SkewSeverity = "no_clock" // > 365 days — uninitialized RTC
SkewBimodalClock SkewSeverity = "bimodal_clock" // mixed good+bad recent samples (flaky RTC)
SkewDefault SkewSeverity = "default" // firmware-default epoch + uptime
SkewOK SkewSeverity = "ok" // |skew| <= 15s
SkewDegrading SkewSeverity = "degrading" // 15s < |skew| <= 60s
SkewDegraded SkewSeverity = "degraded" // 60s < |skew| <= 600s
SkewWrong SkewSeverity = "wrong" // |skew| > 600s and not default
)
// Known firmware default epochs. Nodes with advert_ts in
// [epoch, epoch + maxPlausibleUptimeSec] are classified as "default".
// See docs/clock-skew-redesign.md for provenance of each value.
var defaultEpochs = []int64{0, 1609459200, 1672531200, 1715770351}
// Default thresholds in seconds.
const (
skewThresholdWarnSec = 5 * 60 // 5 minutes
skewThresholdCriticalSec = 60 * 60 // 1 hour
skewThresholdAbsurdSec = 30 * 24 * 3600 // 30 days
skewThresholdNoClockSec = 365 * 24 * 3600 // 365 days — uninitialized RTC
// maxPlausibleUptimeSec caps how far past a default epoch we still
// consider "default + uptime ticking". 1095 days ≈ 3 years.
maxPlausibleUptimeSec = 1095 * 86400 // 3 years — covers solar repeater deployment lifetimes at firmware default
// Severity band boundaries (absolute skew in seconds).
skewThresholdOKSec = 15
skewThresholdDegradingSec = 60
skewThresholdDegradedSec = 600
// minDriftSamples is the minimum number of advert transmissions needed
// to compute a meaningful linear drift rate.
@@ -35,54 +43,52 @@ const (
// drift rates (> 1 day/day) indicate insufficient or outlier samples.
maxReasonableDriftPerDay = 86400.0
// recentSkewWindowCount is the number of most-recent advert samples
// used to derive the "current" skew for severity classification (see
// issue #789). The all-time median is poisoned by historical bad
// samples (e.g. a node that was off and then GPS-corrected); severity
// must reflect current health, not lifetime statistics.
recentSkewWindowCount = 5
// recentSkewWindowSec bounds the recent-window in time as well: only
// samples from the last N seconds count as "recent" for severity.
// The effective window is min(recentSkewWindowCount, samples in 1h).
recentSkewWindowSec = 3600
// bimodalSkewThresholdSec is the absolute skew threshold (1 hour)
// above which a sample is considered "bad" — likely firmware emitting
// a nonsense timestamp from an uninitialized RTC, not real drift.
// Chosen to match the warning/critical severity boundary: real clock
// drift rarely exceeds 1 hour, while epoch-0 RTCs produce ~1.7B sec.
bimodalSkewThresholdSec = 3600.0
// maxPlausibleSkewJumpSec is the largest skew change between
// consecutive samples that we treat as physical drift. Anything larger
// (e.g. a GPS sync that jumps the clock by minutes/days) is rejected
// as an outlier when computing drift. Real microcontroller drift is
// fractions of a second per advert; 60s is a generous safety factor.
// consecutive samples that we treat as physical drift.
maxPlausibleSkewJumpSec = 60.0
// theilSenMaxPoints caps the number of points fed to Theil-Sen
// regression (O(n²) in pairs). For nodes with thousands of samples we
// keep the most-recent points, which are also the most relevant for
// current drift.
// regression (O(n²) in pairs).
theilSenMaxPoints = 200
)
// classifySkew maps absolute skew (seconds) to a severity level.
// Float64 comparison is safe: inputs are rounded to 1 decimal via round(),
// and thresholds are integer multiples of 60 — no rounding artifacts.
func classifySkew(absSkewSec float64) SkewSeverity {
// isDefaultEpoch reports whether a raw advert timestamp lies inside
// [epoch, epoch + maxPlausibleUptimeSec] for any known firmware default.
// On a match it returns (true, matchedEpoch); otherwise (false, 0).
func isDefaultEpoch(advertTS int64) (bool, int64) {
	// Default ranges can overlap, so choose the largest epoch that does
	// not exceed advertTS — the closest candidate from below. This keeps
	// a 2023-firmware node's timestamp from being credited to the 2024
	// epoch.
	closest := int64(-1)
	for _, e := range defaultEpochs {
		if e <= advertTS && e > closest {
			closest = e
		}
	}
	if closest < 0 || advertTS > closest+maxPlausibleUptimeSec {
		return false, 0
	}
	return true, closest
}
// classifySkew maps a raw advert timestamp and corrected skew (signed)
// to a severity level. Takes math.Abs internally so callers may pass
// signed values. Default detection runs on the raw advert_ts
// (independent of observer calibration).
func classifySkew(advertTS int64, skewSec float64) (SkewSeverity, int64) {
if ok, epoch := isDefaultEpoch(advertTS); ok {
return SkewDefault, epoch
}
abs := math.Abs(skewSec)
switch {
case absSkewSec >= skewThresholdNoClockSec:
return SkewNoClock
case absSkewSec >= skewThresholdAbsurdSec:
return SkewAbsurd
case absSkewSec >= skewThresholdCriticalSec:
return SkewCritical
case absSkewSec >= skewThresholdWarnSec:
return SkewWarning
case abs <= skewThresholdOKSec:
return SkewOK, 0
case abs <= skewThresholdDegradingSec:
return SkewDegrading, 0
case abs <= skewThresholdDegradedSec:
return SkewDegraded, 0
default:
return SkewOK
return SkewWrong, 0
}
}
@@ -90,38 +96,35 @@ func classifySkew(absSkewSec float64) SkewSeverity {
// skewSample is a single raw skew measurement from one advert observation.
type skewSample struct {
advertTS int64 // node's advert Unix timestamp
observedTS int64 // observation Unix timestamp
observerID string // which observer saw this
hash string // transmission hash (for multi-observer grouping)
advertTS int64 // node's advert Unix timestamp
observedTS int64 // observation Unix timestamp
observerID string // which observer saw this
hash string // transmission hash (for multi-observer grouping)
}
// ObserverCalibration holds the computed clock offset for an observer.
type ObserverCalibration struct {
ObserverID string `json:"observerID"`
OffsetSec float64 `json:"offsetSec"` // positive = observer clock ahead
Samples int `json:"samples"` // number of multi-observer packets used
OffsetSec float64 `json:"offsetSec"` // positive = observer clock ahead
Samples int `json:"samples"` // number of multi-observer packets used
}
// NodeClockSkew is the API response for a single node's clock skew data.
type NodeClockSkew struct {
Pubkey string `json:"pubkey"`
MeanSkewSec float64 `json:"meanSkewSec"` // corrected mean skew (positive = node ahead)
MedianSkewSec float64 `json:"medianSkewSec"` // corrected median skew
LastSkewSec float64 `json:"lastSkewSec"` // most recent corrected skew
RecentMedianSkewSec float64 `json:"recentMedianSkewSec"` // median across most-recent samples (drives severity, see #789)
DriftPerDaySec float64 `json:"driftPerDaySec"` // linear drift rate (sec/day)
Severity SkewSeverity `json:"severity"`
SampleCount int `json:"sampleCount"`
Calibrated bool `json:"calibrated"` // true if observer calibration was applied
LastAdvertTS int64 `json:"lastAdvertTS"` // most recent advert timestamp
LastObservedTS int64 `json:"lastObservedTS"` // most recent observation timestamp
Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines
GoodFraction float64 `json:"goodFraction"` // fraction of recent samples with |skew| <= 1h
RecentBadSampleCount int `json:"recentBadSampleCount"` // count of recent samples with |skew| > 1h
RecentSampleCount int `json:"recentSampleCount"` // total recent samples in window
NodeName string `json:"nodeName,omitempty"` // populated in fleet responses
NodeRole string `json:"nodeRole,omitempty"` // populated in fleet responses
Pubkey string `json:"pubkey"`
MeanSkewSec float64 `json:"meanSkewSec"` // corrected mean skew (positive = node ahead)
MedianSkewSec float64 `json:"medianSkewSec"` // corrected median skew
LastSkewSec float64 `json:"lastSkewSec"` // most recent corrected skew
DriftPerDaySec float64 `json:"driftPerDaySec"` // linear drift rate (sec/day)
Severity SkewSeverity `json:"severity"`
SampleCount int `json:"sampleCount"`
Calibrated bool `json:"calibrated"` // true if observer calibration was applied
LastAdvertTS int64 `json:"lastAdvertTS"` // most recent advert timestamp
LastObservedTS int64 `json:"lastObservedTS"` // most recent observation timestamp
DefaultEpoch *int64 `json:"defaultEpoch,omitempty"` // matched epoch when severity=default
Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines
NodeName string `json:"nodeName,omitempty"` // populated in fleet responses
NodeRole string `json:"nodeRole,omitempty"` // populated in fleet responses
}
// SkewSample is a single (timestamp, skew) point for sparkline rendering.
@@ -130,28 +133,26 @@ type SkewSample struct {
SkewSec float64 `json:"skew"` // corrected skew in seconds
}
// txSkewResult maps tx hash → per-transmission skew stats. This is an
// intermediate result keyed by hash (not pubkey); the store maps hash → pubkey
// when building the final per-node view.
// txSkewResult maps tx hash → per-transmission skew stats.
type txSkewResult = map[string]*NodeClockSkew
// ── Clock Skew Engine ──────────────────────────────────────────────────────────
// ClockSkewEngine computes and caches clock skew data for nodes and observers.
type ClockSkewEngine struct {
mu sync.RWMutex
observerOffsets map[string]float64 // observerID → calibrated offset (seconds)
observerSamples map[string]int // observerID → number of multi-observer packets used
nodeSkew txSkewResult
lastComputed time.Time
computeInterval time.Duration
mu sync.RWMutex
observerOffsets map[string]float64 // observerID → calibrated offset (seconds)
observerSamples map[string]int // observerID → number of multi-observer packets used
nodeSkew txSkewResult
lastComputed time.Time
computeInterval time.Duration
}
func NewClockSkewEngine() *ClockSkewEngine {
return &ClockSkewEngine{
observerOffsets: make(map[string]float64),
observerOffsets: make(map[string]float64),
observerSamples: make(map[string]int),
nodeSkew: make(txSkewResult),
nodeSkew: make(txSkewResult),
computeInterval: 30 * time.Second,
}
}
@@ -188,7 +189,6 @@ func (e *ClockSkewEngine) Recompute(store *PacketStore) {
// Swap results under brief write lock.
e.mu.Lock()
// Re-check: another goroutine may have computed while we were working.
if time.Since(e.lastComputed) < e.computeInterval {
e.mu.Unlock()
return
@@ -214,13 +214,13 @@ func collectSamples(store *PacketStore) []skewSample {
if decoded == nil {
continue
}
// Extract advert timestamp from decoded JSON.
advertTS := extractTimestamp(decoded)
if advertTS <= 0 {
if advertTS < 0 {
continue
}
// Sanity: skip timestamps before year 2020 or after year 2100.
if advertTS < 1577836800 || advertTS > 4102444800 {
// Allow epoch 0 and above (needed for default-epoch detection).
// Upper bound: year 2100.
if advertTS > 4102444800 {
continue
}
@@ -240,21 +240,43 @@ func collectSamples(store *PacketStore) []skewSample {
return samples
}
// timestampMissing is the sentinel returned by extractTimestamp when no
// timestamp field is present in the decoded advert. Using -1 lets us
// distinguish "field absent" from a real epoch-0 timestamp (ts == 0).
const timestampMissing int64 = -1
// extractTimestamp gets the Unix timestamp from a decoded ADVERT payload.
// Returns timestampMissing (-1) if no timestamp field is found.
func extractTimestamp(decoded map[string]interface{}) int64 {
// Try payload.timestamp first (nested in "payload" key).
if payload, ok := decoded["payload"]; ok {
if pm, ok := payload.(map[string]interface{}); ok {
if ts := jsonNumber(pm, "timestamp"); ts > 0 {
if ts, ok := jsonNumberOk(pm, "timestamp"); ok {
return ts
}
}
}
// Fallback: top-level timestamp.
if ts := jsonNumber(decoded, "timestamp"); ts > 0 {
if ts, ok := jsonNumberOk(decoded, "timestamp"); ok {
return ts
}
return 0
return timestampMissing
}
// jsonNumberOk extracts an int64 from a JSON-parsed map, returning (value, true)
// if the key exists and is numeric, or (0, false) otherwise.
func jsonNumberOk(m map[string]interface{}, key string) (int64, bool) {
v, ok := m[key]
if !ok || v == nil {
return 0, false
}
switch n := v.(type) {
case float64:
return int64(n), true
case int64:
return n, true
case int:
return int64(n), true
}
return 0, false
}
// jsonNumber extracts an int64 from a JSON-parsed map (handles float64 and json.Number).
@@ -281,7 +303,6 @@ func parseISO(s string) int64 {
}
t, err := time.Parse(time.RFC3339, s)
if err != nil {
// Try with fractional seconds.
t, err = time.Parse("2006-01-02T15:04:05.999999999Z07:00", s)
if err != nil {
return 0
@@ -295,19 +316,16 @@ func parseISO(s string) int64 {
// calibrateObservers computes each observer's clock offset using multi-observer
// packets. Returns offset map and sample count map.
func calibrateObservers(samples []skewSample) (map[string]float64, map[string]int) {
// Group observations by packet hash.
byHash := make(map[string][]skewSample)
for _, s := range samples {
byHash[s.hash] = append(byHash[s.hash], s)
}
// For each multi-observer packet, compute per-observer deviation from median.
deviations := make(map[string][]float64) // observerID → list of deviations
deviations := make(map[string][]float64)
for _, group := range byHash {
if len(group) < 2 {
continue // single-observer packet, can't calibrate
continue
}
// Compute median observation timestamp for this packet.
obsTimes := make([]float64, len(group))
for i, s := range group {
obsTimes[i] = float64(s.observedTS)
@@ -319,7 +337,6 @@ func calibrateObservers(samples []skewSample) (map[string]float64, map[string]in
}
}
// Each observer's offset = median of its deviations.
offsets := make(map[string]float64, len(deviations))
counts := make(map[string]int, len(deviations))
for obsID, devs := range deviations {
@@ -333,8 +350,6 @@ func calibrateObservers(samples []skewSample) (map[string]float64, map[string]in
// computeNodeSkew calculates corrected skew statistics for each node.
func computeNodeSkew(samples []skewSample, obsOffsets map[string]float64) txSkewResult {
// Compute corrected skew per sample, grouped by hash (each hash = one
// node's advert transmission). The caller maps hash → pubkey via byNode.
type correctedSample struct {
skew float64
observedTS int64
@@ -349,8 +364,6 @@ func computeNodeSkew(samples []skewSample, obsOffsets map[string]float64) txSkew
rawSkew := float64(s.advertTS - s.observedTS)
corrected := rawSkew
if hasCal {
// Observer offset = obs_ts - median(all_obs_ts). If observer is ahead,
// its obs_ts is inflated, making raw_skew too low. Add offset to correct.
corrected = rawSkew + obsOffset
}
byHash[s.hash] = append(byHash[s.hash], correctedSample{
@@ -361,10 +374,7 @@ func computeNodeSkew(samples []skewSample, obsOffsets map[string]float64) txSkew
hashAdvertTS[s.hash] = s.advertTS
}
// Each hash represents one advert from one node. Compute median corrected
// skew per hash (across multiple observers).
result := make(map[string]*NodeClockSkew) // keyed by hash for now
result := make(map[string]*NodeClockSkew)
for hash, cs := range byHash {
skews := make([]float64, len(cs))
for i, c := range cs {
@@ -373,29 +383,37 @@ func computeNodeSkew(samples []skewSample, obsOffsets map[string]float64) txSkew
medSkew := median(skews)
meanSkew := mean(skews)
// Find latest observation.
var latestObsTS int64
// Pick the skew from the most recent observation (max observedTS),
// not the last-appended sample which may be non-chronological.
var latest correctedSample
var anyCal bool
for _, c := range cs {
if c.observedTS > latestObsTS {
latestObsTS = c.observedTS
if c.observedTS > latest.observedTS {
latest = c
}
if c.calibrated {
anyCal = true
}
}
lastCorrectedSkew := latest.skew
advTS := hashAdvertTS[hash]
severity, matchedEpoch := classifySkew(advTS, lastCorrectedSkew)
absMedian := math.Abs(medSkew)
result[hash] = &NodeClockSkew{
ncs := &NodeClockSkew{
MeanSkewSec: round(meanSkew, 1),
MedianSkewSec: round(medSkew, 1),
LastSkewSec: round(cs[len(cs)-1].skew, 1),
Severity: classifySkew(absMedian),
LastSkewSec: round(lastCorrectedSkew, 1),
Severity: severity,
SampleCount: len(cs),
Calibrated: anyCal,
LastAdvertTS: hashAdvertTS[hash],
LastObservedTS: latestObsTS,
LastAdvertTS: advTS,
LastObservedTS: latest.observedTS,
}
if severity == SkewDefault {
ep := matchedEpoch
ncs.DefaultEpoch = &ep
}
result[hash] = ncs
}
return result
}
@@ -457,124 +475,45 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
medSkew := median(allSkews)
meanSkew := mean(allSkews)
// Severity is derived from RECENT samples only (issue #789). The
// all-time median is poisoned by historical bad data — a node that
// was off for hours and then GPS-corrected can have median = -59M sec
// while its current skew is -0.8s. Operators need severity to reflect
// current health, so they trust the dashboard.
//
// Sort tsSkews by time and take the last recentSkewWindowCount samples
// (or all samples within recentSkewWindowSec of the latest, whichever
// gives FEWER samples — we want the more-current view; a chatty node
// can fit dozens of samples in 1h, in which case the count cap wins).
sort.Slice(tsSkews, func(i, j int) bool { return tsSkews[i].ts < tsSkews[j].ts })
// Classify using the most recent advert's raw timestamp and
// the most recent corrected skew. No windowing or median-driven
// severity — per-advert classification per the spec.
severity, matchedEpoch := classifySkew(lastAdvTS, lastSkew)
recentSkew := lastSkew
var recentVals []float64
if n := len(tsSkews); n > 0 {
latestTS := tsSkews[n-1].ts
// Index-based window: last K samples.
startByCount := n - recentSkewWindowCount
if startByCount < 0 {
startByCount = 0
}
// Time-based window: samples newer than latestTS - windowSec.
startByTime := n - 1
for i := n - 1; i >= 0; i-- {
if latestTS-tsSkews[i].ts <= recentSkewWindowSec {
startByTime = i
} else {
break
}
}
// Pick the narrower (larger-index) of the two windows — the most
// current view of the node's clock health.
start := startByCount
if startByTime > start {
start = startByTime
}
recentVals = make([]float64, 0, n-start)
for i := start; i < n; i++ {
recentVals = append(recentVals, tsSkews[i].skew)
}
if len(recentVals) > 0 {
recentSkew = median(recentVals)
}
}
// ── Bimodal detection (#845) ─────────────────────────────────────────
// Split recent samples into "good" (|skew| <= 1h, real clock) and
// "bad" (|skew| > 1h, firmware nonsense from uninitialized RTC).
// Classification order (first match wins):
// no_clock — goodFraction < 0.10 (essentially no real clock)
// bimodal_clock — 0.10 <= goodFraction < 0.80 AND badCount > 0
// ok/warn/etc. — goodFraction >= 0.80 (normal, outliers filtered)
var goodSamples []float64
for _, v := range recentVals {
if math.Abs(v) <= bimodalSkewThresholdSec {
goodSamples = append(goodSamples, v)
}
}
recentSampleCount := len(recentVals)
recentBadCount := recentSampleCount - len(goodSamples)
var goodFraction float64
if recentSampleCount > 0 {
goodFraction = float64(len(goodSamples)) / float64(recentSampleCount)
}
var severity SkewSeverity
if goodFraction < 0.10 {
// Essentially no real clock — classify as no_clock regardless
// of the raw skew magnitude.
severity = SkewNoClock
} else if goodFraction < 0.80 && recentBadCount > 0 {
// Bimodal: use median of GOOD samples as the "real" skew.
severity = SkewBimodalClock
if len(goodSamples) > 0 {
recentSkew = median(goodSamples)
}
} else {
// Normal path: if there are good samples, use their median
// (filters out rare outliers in ≥80% good case).
if len(goodSamples) > 0 && recentBadCount > 0 {
recentSkew = median(goodSamples)
}
severity = classifySkew(math.Abs(recentSkew))
}
// For no_clock / bimodal_clock nodes, skip drift when data is unreliable.
// Drift: display only, not a classifier input.
var drift float64
if severity != SkewNoClock && severity != SkewBimodalClock && len(tsSkews) >= minDriftSamples {
if severity != SkewDefault && len(tsSkews) >= minDriftSamples {
drift = computeDrift(tsSkews)
// Cap physically impossible drift rates.
if math.Abs(drift) > maxReasonableDriftPerDay {
drift = 0
}
}
// Build sparkline samples from tsSkews (already sorted by time above).
// Build sparkline samples.
sort.Slice(tsSkews, func(i, j int) bool { return tsSkews[i].ts < tsSkews[j].ts })
samples := make([]SkewSample, len(tsSkews))
for i, p := range tsSkews {
samples[i] = SkewSample{Timestamp: p.ts, SkewSec: round(p.skew, 1)}
}
return &NodeClockSkew{
Pubkey: pubkey,
MeanSkewSec: round(meanSkew, 1),
MedianSkewSec: round(medSkew, 1),
LastSkewSec: round(lastSkew, 1),
RecentMedianSkewSec: round(recentSkew, 1),
DriftPerDaySec: round(drift, 2),
Severity: severity,
SampleCount: totalSamples,
Calibrated: anyCal,
LastAdvertTS: lastAdvTS,
LastObservedTS: lastObsTS,
Samples: samples,
GoodFraction: round(goodFraction, 2),
RecentBadSampleCount: recentBadCount,
RecentSampleCount: recentSampleCount,
result := &NodeClockSkew{
Pubkey: pubkey,
MeanSkewSec: round(meanSkew, 1),
MedianSkewSec: round(medSkew, 1),
LastSkewSec: round(lastSkew, 1),
DriftPerDaySec: round(drift, 2),
Severity: severity,
SampleCount: totalSamples,
Calibrated: anyCal,
LastAdvertTS: lastAdvTS,
LastObservedTS: lastObsTS,
Samples: samples,
}
if severity == SkewDefault {
ep := matchedEpoch
result.DefaultEpoch = &ep
}
return result
}
// GetFleetClockSkew returns clock skew data for all nodes that have skew data.
@@ -583,7 +522,6 @@ func (s *PacketStore) GetFleetClockSkew() []*NodeClockSkew {
s.mu.RLock()
defer s.mu.RUnlock()
// Build name/role lookup from DB cache (requires s.mu held).
allNodes, _ := s.getCachedNodesAndPM()
nameMap := make(map[string]nodeInfo, len(allNodes))
for _, ni := range allNodes {
@@ -596,12 +534,10 @@ func (s *PacketStore) GetFleetClockSkew() []*NodeClockSkew {
if cs == nil {
continue
}
// Enrich with node name/role.
if ni, ok := nameMap[pubkey]; ok {
cs.NodeName = ni.Name
cs.NodeRole = ni.Role
}
// Omit samples in fleet response (too much data).
cs.Samples = nil
results = append(results, cs)
}
@@ -626,7 +562,6 @@ func (s *PacketStore) GetObserverCalibrations() []ObserverCalibration {
Samples: s.clockSkew.observerSamples[obsID],
})
}
// Sort by absolute offset descending.
sort.Slice(result, func(i, j int) bool {
return math.Abs(result[i].OffsetSec) > math.Abs(result[j].OffsetSec)
})
@@ -667,38 +602,20 @@ type tsSkewPair struct {
}
// computeDrift estimates linear drift in seconds per day from time-ordered
// (timestamp, skew) pairs. Issue #789: a single GPS-correction event (huge
// skew jump in seconds) used to dominate ordinary least squares and produce
// absurd drift like 1.7M sec/day. We now:
//
// 1. Drop pairs whose consecutive skew jump exceeds maxPlausibleSkewJumpSec
// (clock corrections, not physical drift). This protects both OLS-style
// consumers and Theil-Sen.
// 2. Use Theil-Sen regression — the slope is the median of all pairwise
// slopes, naturally robust to remaining outliers (breakdown point ~29%).
//
// For very small samples after filtering we fall back to a simple slope
// between first and last calibrated samples.
// (timestamp, skew) pairs using Theil-Sen regression with outlier filtering.
func computeDrift(pairs []tsSkewPair) float64 {
if len(pairs) < 2 {
return 0
}
// Sort by timestamp.
sort.Slice(pairs, func(i, j int) bool {
return pairs[i].ts < pairs[j].ts
})
// Time span too short? Skip.
spanSec := float64(pairs[len(pairs)-1].ts - pairs[0].ts)
if spanSec < 3600 { // need at least 1 hour of data
if spanSec < 3600 {
return 0
}
// Outlier filter: drop samples where the skew jumps more than
// maxPlausibleSkewJumpSec from the running "stable" baseline.
// We anchor on the first sample, then accept each subsequent point
// that's within the threshold of the most recent accepted point —
// this preserves a slow drift while rejecting correction events.
filtered := make([]tsSkewPair, 0, len(pairs))
filtered = append(filtered, pairs[0])
for i := 1; i < len(pairs); i++ {
@@ -707,30 +624,23 @@ func computeDrift(pairs []tsSkewPair) float64 {
filtered = append(filtered, pairs[i])
}
}
// If the filter killed too much (e.g. unstable node), fall back to the
// raw series so we at least produce *something* — it'll be capped by
// maxReasonableDriftPerDay downstream.
if len(filtered) < 2 || float64(filtered[len(filtered)-1].ts-filtered[0].ts) < 3600 {
filtered = pairs
}
// Cap point count for Theil-Sen (O(n²) on pairs). Keep most-recent.
if len(filtered) > theilSenMaxPoints {
filtered = filtered[len(filtered)-theilSenMaxPoints:]
}
return theilSenSlope(filtered) * 86400 // sec/sec → sec/day
return theilSenSlope(filtered) * 86400
}
// theilSenSlope returns the Theil-Sen estimator: median of all pairwise
// slopes (yj - yi) / (tj - ti) for i < j. Naturally robust to outliers.
// Pairs must be sorted by timestamp ascending.
// theilSenSlope returns the Theil-Sen estimator: median of all pairwise slopes.
func theilSenSlope(pairs []tsSkewPair) float64 {
n := len(pairs)
if n < 2 {
return 0
}
// Pre-allocate: n*(n-1)/2 pairs.
slopes := make([]float64, 0, n*(n-1)/2)
for i := 0; i < n; i++ {
for j := i + 1; j < n; j++ {

File diff suppressed because it is too large Load Diff

241
docs/clock-skew-redesign.md Normal file
View File

@@ -0,0 +1,241 @@
# Clock Skew Classifier — Redesign
**Status:** spec, pre-implementation
**Supersedes:** parts of #690 / #789 / #845 / PR #894
**Date drafted:** 2026-04-24
## Problem
The current classifier (`cmd/server/clock_skew.go`) uses windowed medians, hysteresis, "good fraction" floors, and a 365-day `no_clock` threshold. It produces:
- False `no_clock` flags on nodes whose clocks are working today but had garbage timestamps in recent samples.
- Symmetric severity bands that conflate "clock at firmware default" with "operator set the clock wrong by a year" — completely different operator actions required.
- Compounding over-engineering as each operator complaint added a new tier or window.
The actual physical reality of these devices is much simpler than the classifier assumes.
## Hardware reality
Most MeshCore nodes have **no auto-updating RTC**. There are two hardware paths:
1. **Volatile RTC nodes** (`firmware/src/helpers/ArduinoHelpers.h:11` → `VolatileRTCClock`):
- On boot, `base_time` is hardcoded to a firmware-build constant (currently `1715770351` = 2024-05-15 20:52:31 UTC).
- `getCurrentTime()` returns `base_time + millis()/1000`.
- On reboot the value snaps back to the constant.
- User must manually sync via companion app (`set time` CLI invokes `setCurrentTime(...)`) to set a real wall-clock time, which then ticks until the next reboot.
2. **Hardware-RTC nodes** (`firmware/src/helpers/AutoDiscoverRTCClock.cpp` — DS3231 / RV3028 / PCF8563):
- Real-time chip with battery backup. Holds the time across reboots.
- Behaves correctly once set; no default-snap behavior.
The `set time RESET` CLI command (`firmware/src/helpers/CommonCLI.cpp:215`) explicitly calls `setCurrentTime(1715770351)` regardless of hardware — so even hardware-RTC nodes can be deliberately reset to the default epoch.
**Therefore every node is in exactly one of these states:**
| State | Description |
|---|---|
| **Default / never set** | RTC is at a firmware-default epoch + ticking up since the last boot. |
| **Set, drifting normally** | RTC was synced; small skew accumulating at ~0.8s/day per #789 reports. |
| **Set, drifted past tolerance** | Like above but skew has grown beyond what's useful. |
| **Wrong** | Operator-set incorrect time, or genuine RTC malfunction not matching any known default. |
There is no "bimodal RTC bug" — what looked bimodal in #845 is just a sequence of `defaulted → user sync → reboot → defaulted again`. The "bad" timestamps are not noise; they're a constant (the default epoch + a small uptime).
## Production data analysis (2026-04-24)
### 00id.net (this deployment, 416 nodes, commit `abd9c46`)
`lastSkewSec` (advert_ts − observed_ts) distribution:
| Bucket | Count | Pct |
|---|---:|---:|
| OK ≤15s | 90 | 22% |
| Degrading ≤60s | 93 | 22% |
| Degraded ≤10m | 13 | 3% |
| off ≤1d | 5 | 1% |
| off ≤1y | 110 | 26% |
| absurd >1y | 105 | 25% |
Per-node `lastAdvertTS` raw timestamp distribution shows a sharp default cluster:
```
+0 days count=19 samples=114969 ← exactly at 1715770351 (just rebooted)
+1d count=9 samples=24766
+2d count=7 samples=58101
+3d count=2 samples=360
... ← decay through ~110 days
+113d count=2 samples=53776
```
103 of 416 nodes (25%) have `lastAdvertTS` between `1715770351` and `1715770351 + 1095 days`, consistent with the volatile-RTC-default-ticking-up pattern.
A second cluster of 5 nodes has `lastAdvertTS = 1672531542 ≈ 1672531200 + 5min` = **2023-01-01 00:00:00 UTC** + small uptime. This is a *different* firmware-default epoch from an older firmware version.
### Cascadia (analyzer.cascadiamesh.org, 433 nodes in 5000-packet sample, commit `111b03c` v3.5.0)
ADVERT timestamp by year-month:
```
1970-01 1 ← epoch zero (ESP32 native fallback OR ancient firmware)
2021-01 1 ← possible third default epoch
2023-01 2 ← old firmware default (matches 00id)
2024-05 60 ← current VolatileRTCClock + days uptime
2024-06 39 ← same default + weeks uptime
2024-07 21
2024-08 10
2024-09 2
2024-10 1
2024-11 2 ← decays out as fewer nodes have multi-month uptime since reboot
2025-10 1 ← pre-current-now miscellany
2025-11 2
2026-03 4
2026-04 285 ← currently set clocks (this is "now-ish")
2027-04 1 ← operator set wrong by ~1 year (typo?)
2067-12 1 ← operator set wildly wrong / corrupted RTC
```
Confirms the model: ~67% of nodes have a current clock, ~32% are at known firmware defaults at varying uptime offsets, ~3 outliers represent genuine misconfigurations.
## Known firmware default epochs
These are the values discovered in production data so far:
| Epoch (unix) | UTC | Source |
|---:|---|---|
| `0` | 1970-01-01 | Likely ESP32 boot when no RTC initialization runs (`time(NULL)` returns 0). |
| `1609459200` | 2021-01-01 | Speculation — single-sample evidence, validate as more data arrives. |
| `1672531200` | 2023-01-01 | Older firmware `VolatileRTCClock::base_time` value. |
| `1715770351` | 2024-05-15 20:52:31 | **Current** `VolatileRTCClock` constructor + `set time RESET` CLI. |
Treat the table as data, not fixed code. New firmware versions will introduce new defaults; expect to add to the list over time.
## Reconciliation with #690 — the four timestamps
#690 lists three timestamps; in practice there are four signals worth distinguishing:
| Signal | Source | Used for |
|---|---|---|
| `advert_ts` | Inside MeshCore packet, set by sending node | Per-node classification (THE signal). |
| `mqtt_envelope_ts` | Set by observer when it forwards via MQTT | Observer-side calibration only — *not* a direct node-skew signal because observer clock can itself be wrong. |
| `corescope_received_ts` | Wall clock when CoreScope ingested the message | Reference "now"; calibration cross-check. |
| `same_packet_across_observers` | Multiple observers seeing the same hash | Phase 2 calibration (triangulation). |
**Inputs flow:**
1. **Phase 2 (existing, kept):** for each packet hash seen by ≥2 observers, compute each observer's deviation from the per-packet median observed_ts → `observerOffset`. This is the triangulation #690 calls for ("Same packet observed by more than one (ideally 3+) observers gives good indication if one observer is off"). Observer offsets are the calibration table.
2. **Per-advert correction (existing, kept):** `correctedSkew = (advert_ts - observed_ts) + observerOffset[observer_id]`. If no calibration exists for an observer, fall back to raw skew with `calibrated: false`.
3. **Default detection (new):** runs on RAW `advert_ts`, not corrected. The firmware default is a fixed wall-clock value; observer offsets are seconds-to-minutes scale and cannot move `advert_ts` from 2024 to 2026. Default check is independent of calibration.
4. **Severity classification (new):** if `is_default(advert_ts)` → `default`; else classify by `|correctedSkew|` band.
This keeps everything #690 asks for (observer detection, bias subtraction, triangulation), and adds the firmware-default cluster as a new pre-empting tier.
## UI: explain WHY (#690 requirement)
The classifier alone doesn't satisfy #690's "present on the UI why clock skew is obvious or suspected." The evidence panel from PR #906 (per-hash observer breakdown showing raw vs corrected skew per observer) is the WHY.
For each per-node clock card the UI must show:
- **Tier badge** (default / ok / degrading / degraded / wrong) + magnitude.
- **Plain-English reason line**: e.g. "Last advert at 2024-05-15 + 3.2 days uptime — matches firmware default (volatile RTC, not yet user-set)" or "Last advert 12s vs wall clock — within OK tolerance."
- **Calibration footnote**: "Skew corrected using observer X offset +1.7s (computed from 412 multi-observer packets)" or "Single-observer measurement, no calibration available."
- **Evidence accordion** (PR #906 shape, retained): for the most recent N hashes, each observer's raw vs corrected skew + the observer's offset.
For the per-observer page (also from PR #906): show the observer's offset, the multi-observer sample count, and a tier badge using the same scale (treating `|observerOffset|` as the skew).
## Proposed classifier
Per-advert classification, no windowing:
```python
# Known firmware default epochs (unix seconds). Treat as data, not fixed code:
# new firmware versions may introduce additional defaults over time.
DEFAULT_EPOCHS = [0, 1609459200, 1672531200, 1715770351]
MAX_PLAUSIBLE_UPTIME_SEC = 1095 * 86400  # 3 years of continuous uptime

def is_default(ts):
    """True if ts lies within [epoch, epoch + max plausible uptime] of any
    known firmware-default epoch (i.e. default wall clock + uptime ticks)."""
    return any(d <= ts <= d + MAX_PLAUSIBLE_UPTIME_SEC for d in DEFAULT_EPOCHS)

def classify(advert_ts, corrected_skew_sec):
    """Per-advert severity tier.

    Default detection runs on the RAW advert_ts (observer calibration cannot
    move a timestamp across years); otherwise classify by |corrected skew|.
    """
    if is_default(advert_ts):
        return "default"                       # gray
    abs_skew = abs(corrected_skew_sec)
    if abs_skew <= 15: return "ok"             # green
    if abs_skew <= 60: return "degrading"      # yellow
    if abs_skew <= 600: return "degraded"      # orange
    return "wrong"                             # red
```
`corrected_skew_sec` is the observer-bias-subtracted skew per Phase 2 calibration. Default detection is independent of calibration (runs on raw `advert_ts`).
Per-node state = classification of the node's most-recent advert (per hash, picking the most recent observation across all observers). No medians, no good-fraction, no hysteresis.
## Severity tier definitions
| Tier | Condition | Color | UI label | Meaning |
|---|---|---|---|---|
| `default` | Advert ts within `[default, default + 3y]` of any known epoch | Gray | "Default" | Volatile RTC at firmware boot constant; never set or rebooted and not re-synced. |
| `ok` | abs(skew) ≤ 15s | Green | "OK" | Working clock. |
| `degrading` | 15s < abs(skew) ≤ 60s | Yellow | "Degrading" | Real but accumulating drift. |
| `degraded` | 60s < abs(skew) ≤ 600s | Orange | "Degraded" | Off by minutes — needs re-sync. |
| `wrong` | abs(skew) > 600s and not `default` | Red | "Wrong" | Operator-set error or RTC malfunction. |
## What this kills
- The 365-day `no_clock` threshold and the entire `recentSkewWindow{Count,Sec}` machinery.
- The hysteresis / `goodFraction` / `longTermGoodFraction` logic from PR #894.
- The proposed `bimodal_clock` tier from #845 — the pattern is not bimodal, it's defaulted vs set.
- All Theil-Sen drift calculations as classifier inputs (drift remains a derived display value).
## What this preserves
- **Phase 2 observer calibration** (`calibrateObservers()`) — kept verbatim. It's what powers the "subtract observer bias" requirement from #690 and provides the triangulation evidence the UI needs.
- **Drift display** (computed but not classifying).
- **PR #906 evidence UI** — orthogonal to the classifier; it is in fact the implementation of #690's "explain WHY" requirement. Only label strings change to match the new tier names.
- **`/api/observers/clock-skew`** — unchanged shape.
## API impact
`/api/nodes/{pubkey}/clock-skew` response changes:
- `severity` enum: `default | ok | degrading | degraded | wrong` (no more `no_clock | severe | warn | absurd`).
- New field `defaultEpoch` (int, optional): if `severity == "default"`, the matched epoch.
- Drop fields: `recentMedianSkewSec`, `goodFraction`, `recentBadSampleCount`, `longTermGoodFraction`.
- Keep: `lastSkewSec`, `medianSkewSec`, `meanSkewSec`, `driftPerDaySec`, `sampleCount`, `calibrated`, `lastAdvertTS`, `lastObservedTS`, `nodeName`, `nodeRole`.
`/api/nodes/clock-skew` (fleet) shape unchanged except severity enum values.
## UI impact
- New CSS classes `skew-badge--default`, `skew-badge--degrading`, `skew-badge--degraded`, `skew-badge--wrong`. Drop `--no_clock`, `--severe`, `--warn`, `--absurd`, `--bimodal_clock`.
- Tooltip text updated per tier.
- "Default" badge tooltip should explain the clock is at firmware default plus uptime since boot, and the operator hasn't set it yet (or hasn't re-set it since the last reboot).
## Migration
Single PR replaces the classifier in `clock_skew.go` and updates the frontend badges/labels. No database schema change, no data migration — all per-call computation.
## Open issues to close
- **#789** (median hides corrected clocks) — resolved by per-advert classification.
- **#845** (bimodal_clock tier) — replaced by `default` tier; the pattern that motivated it is correctly captured.
- **PR #894** — close without merging; this design supersedes Option C entirely.
- **#690** UI completion (PR #906) — keeps moving in parallel; only label updates needed.
## Validation plan
1. Hand-run the classifier against a snapshot of `/api/nodes/clock-skew` from 00id and cascadia. Confirm:
- All 103 00id "absurd" nodes reclassify as `default`.
- All 5 cascadia 2023-01 nodes reclassify as `default`.
- The 2027 / 2067 cascadia outliers reclassify as `wrong`.
- The 285 cascadia 2026-04 nodes reclassify as `ok` (or `degrading` if corrected skew exceeds 15s).
2. Add per-tier unit tests in `cmd/server/clock_skew_test.go`.
3. Add a regression test for each known default epoch (synthesize advert at `default + 0s`, `default + 1d`, `default + 3y - 1s` → all classify as `default`).
4. Edge cases:
- `advert_ts == 0` → matches default epoch 0.
- `advert_ts == 1715770351 + 1096 days` → no longer matches (3-year uptime cap exceeded) — should fall through to time-based classification, likely `wrong`.
- Future timestamps beyond `now + 600s` → `wrong`.
## Out of scope (follow-ups)
- Per-firmware-version known-default lookup (when `firmware_version` field becomes reliable on adverts).
- Reboot-count / flakiness indicator ("this node has hit default N times in last 30d").
- Auto-discovery of new default epochs from clustering analysis (could detect a 4th default emerging in the wild).
- Filtering defaulted-clock adverts out of time-windowed analytics queries (separate spec — affects path attribution).

View File

@@ -3495,12 +3495,12 @@ function destroy() { _analyticsData = {}; _channelData = null; if (_ngState && _
});
// Summary
var counts = { ok: 0, warning: 0, critical: 0, absurd: 0 };
var counts = { ok: 0, degrading: 0, degraded: 0, wrong: 0, default: 0 };
data.forEach(function(n) { if (counts[n.severity] !== undefined) counts[n.severity]++; });
// Filter buttons (also serve as summary — no separate stats pills needed)
var filterColors = { ok: 'var(--status-green)', warning: 'var(--status-yellow)', critical: 'var(--status-orange)', absurd: 'var(--status-purple)', no_clock: 'var(--text-muted)' };
var filters = ['all', 'ok', 'warning', 'critical', 'absurd', 'no_clock'];
var filterColors = { ok: 'var(--status-green)', degrading: 'var(--status-yellow)', degraded: 'var(--status-orange)', wrong: 'var(--status-red)', default: 'var(--text-muted)' };
var filters = ['all', 'ok', 'degrading', 'degraded', 'wrong', 'default'];
var filterHtml = '<div style="margin-bottom:10px">' + filters.map(function(f) {
var dot = f !== 'all' ? '<span style="display:inline-block;width:8px;height:8px;border-radius:50%;background:' + filterColors[f] + ';margin-right:4px;vertical-align:middle"></span>' : '';
return '<button class="clock-filter-btn' + (activeFilter === f ? ' active' : '') + '" data-filter="' + f + '">' +
@@ -3513,8 +3513,8 @@ function destroy() { _analyticsData = {}; _channelData = null; if (_ngState && _
var rowClass = 'clock-fleet-row--' + (n.severity || 'ok');
var lastAdv = n.lastObservedTS ? new Date(n.lastObservedTS * 1000).toISOString().replace('T', ' ').replace(/\.\d+Z/, ' UTC') : '—';
var skewVal = window.currentSkewValue(n);
var skewText = n.severity === 'no_clock' ? 'No Clock' : formatSkew(skewVal);
var driftText = n.severity === 'no_clock' || !n.driftPerDaySec ? '' : formatDrift(n.driftPerDaySec);
var skewText = n.severity === 'default' ? 'Default' : formatSkew(skewVal);
var driftText = n.severity === 'default' || !n.driftPerDaySec ? '' : formatDrift(n.driftPerDaySec);
return '<tr class="' + rowClass + '" data-pubkey="' + esc(n.pubkey) + '" style="cursor:pointer">' +
'<td><strong>' + esc(n.nodeName || n.pubkey.slice(0, 12)) + '</strong></td>' +
'<td style="font-family:var(--mono,monospace)">' + skewText + '</td>' +

View File

@@ -808,7 +808,7 @@
let _themeRefreshHandler = null;
let _allNodes = null; // cached full node list
let _fleetSkew = null; // cached clock skew map: pubkey → {severity, recentMedianSkewSec, medianSkewSec, ...}
let _fleetSkew = null; // cached clock skew map: pubkey → {severity, medianSkewSec, ...}
/**
* Fetch per-node clock skew and render into the given container element.
@@ -824,14 +824,28 @@
var driftHtml = cs.driftPerDaySec ? '<div style="font-size:12px;color:var(--text-muted);margin-top:2px">Drift: ' + formatDrift(cs.driftPerDaySec) + '</div>' : '';
var sparkHtml = renderSkewSparkline(cs.samples, 200, 32);
var skewVal = window.currentSkewValue(cs);
var skewDisplay = cs.severity === 'no_clock'
? '<span style="font-size:18px;font-weight:700;color:var(--text-muted)">No Clock</span>'
var skewDisplay = cs.severity === 'default'
? '<span style="font-size:18px;font-weight:700;color:var(--text-muted)">Default</span>'
: '<span style="font-size:18px;font-weight:700;font-family:var(--mono)">' + formatSkew(skewVal) + '</span>';
var bimodalWarning = '';
if (cs.severity === 'bimodal_clock') {
var totalRecent = cs.recentSampleCount || 0;
bimodalWarning = '<div style="font-size:12px;color:var(--status-amber-text);margin-top:4px">⚠️ ' + (cs.recentBadSampleCount || '?') + ' of last ' + (totalRecent || '?') + ' adverts had nonsense timestamps (likely RTC reset)</div>';
// Per-tier explainer line (plain English reason).
var explainer = '';
var absSkew = Math.abs(cs.lastSkewSec || 0);
var skewStr = Math.round(absSkew) + 's';
if (cs.severity === 'default') {
var isoAdv = cs.lastAdvertTS ? new Date(cs.lastAdvertTS * 1000).toISOString() : '?';
explainer = 'Last advert at ' + isoAdv + ' — matches firmware default (volatile RTC, not user-set since boot)';
} else if (cs.severity === 'ok') {
explainer = 'Last advert ' + skewStr + ' vs wall clock — within OK tolerance (≤15s)';
} else if (cs.severity === 'degrading') {
explainer = 'Last advert ' + skewStr + ' vs wall clock — drift accumulating (≤60s)';
} else if (cs.severity === 'degraded') {
explainer = 'Last advert ' + skewStr + ' vs wall clock — significantly off (≤10m)';
} else if (cs.severity === 'wrong') {
explainer = 'Last advert ' + skewStr + ' vs wall clock — clock incorrect (operator-set or RTC failure)';
}
var explainerHtml = explainer ? '<div style="font-size:12px;color:var(--text-muted);margin-top:4px">' + explainer + '</div>' : '';
container.innerHTML =
'<h4 style="margin:0 0 6px">⏰ Clock Skew</h4>' +
'<div style="display:flex;align-items:center;gap:12px;flex-wrap:wrap">' +
@@ -839,9 +853,9 @@
renderSkewBadge(cs.severity, skewVal, cs) +
(cs.calibrated ? ' <span style="font-size:10px;color:var(--text-muted)" title="Observer-calibrated">✓ calibrated</span>' : '') +
'</div>' +
explainerHtml +
driftHtml +
(sparkHtml ? '<div class="skew-sparkline-wrap" style="margin-top:8px">' + sparkHtml + '<div style="font-size:10px;color:var(--text-muted)">Skew over time (' + (cs.samples || []).length + ' samples)</div></div>' : '') +
bimodalWarning;
(sparkHtml ? '<div class="skew-sparkline-wrap" style="margin-top:8px">' + sparkHtml + '<div style="font-size:10px;color:var(--text-muted)">Skew over time (' + (cs.samples || []).length + ' samples)</div></div>' : '');
} catch (e) {
// Non-fatal — section stays hidden
}

View File

@@ -397,17 +397,16 @@
// #690 — Clock Skew shared helpers
var SKEW_SEVERITY_COLORS = {
default: 'var(--text-muted)',
ok: 'var(--status-green)',
warning: 'var(--status-yellow)',
critical: 'var(--status-orange)',
absurd: 'var(--status-purple)',
bimodal_clock: 'var(--status-amber)',
no_clock: 'var(--text-muted)'
degrading: 'var(--status-yellow)',
degraded: 'var(--status-orange)',
wrong: 'var(--status-red)'
};
var SKEW_SEVERITY_LABELS = {
ok: 'OK', warning: 'Warning', critical: 'Critical', absurd: 'Absurd', bimodal_clock: 'Bimodal', no_clock: 'No Clock'
default: 'Default', ok: 'OK', degrading: 'Degrading', degraded: 'Degraded', wrong: 'Wrong'
};
var SKEW_SEVERITY_ORDER = { no_clock: 0, bimodal_clock: 1, absurd: 2, critical: 3, warning: 4, ok: 5 };
var SKEW_SEVERITY_ORDER = { default: 0, wrong: 1, degraded: 2, degrading: 3, ok: 4 };
window.SKEW_SEVERITY_COLORS = SKEW_SEVERITY_COLORS;
window.SKEW_SEVERITY_LABELS = SKEW_SEVERITY_LABELS;
@@ -430,26 +429,19 @@
return (secPerDay >= 0 ? '+' : '') + secPerDay.toFixed(1) + ' s/day';
};
/** Pick the skew value that drives current-health UI: prefer the
* recent-window median (#789, current health) over the all-time median
* (poisoned by historical bad samples). Falls back gracefully if the
* field isn't present (older API responses). */
/** Pick the skew value that drives current-health UI. Uses lastSkewSec
* (most recent corrected skew) when available, falls back to medianSkewSec. */
window.currentSkewValue = function(cs) {
if (!cs) return null;
return cs.recentMedianSkewSec != null ? cs.recentMedianSkewSec : cs.medianSkewSec;
return cs.lastSkewSec != null ? cs.lastSkewSec : cs.medianSkewSec;
};
/** Render a clock skew badge HTML */
window.renderSkewBadge = function(severity, skewSec, cs) {
if (!severity) return '';
var cls = 'skew-badge skew-badge--' + severity;
if (severity === 'no_clock') {
return '<span class="' + cls + '" title="Uninitialized RTC — no valid clock">🚫 No Clock</span>';
}
if (severity === 'bimodal_clock' && cs) {
var badPct = cs.goodFraction != null ? Math.round((1 - cs.goodFraction) * 100) : '?';
var label = '⏰ ' + window.formatSkew(skewSec);
return '<span class="' + cls + '" title="Clock skew: ' + window.formatSkew(skewSec) + ' (bimodal: ' + badPct + '% of recent adverts have nonsense timestamps)">' + label + '</span>';
if (severity === 'default') {
return '<span class="' + cls + '" title="Firmware default clock — volatile RTC not yet user-set since boot">⏰ Default</span>';
}
var label = severity === 'ok' ? '⏰' : '⏰ ' + window.formatSkew(skewSec);
return '<span class="' + cls + '" title="Clock skew: ' + window.formatSkew(skewSec) + ' (' + (SKEW_SEVERITY_LABELS[severity] || severity) + ')">' + label + '</span>';

View File

@@ -2291,22 +2291,21 @@ th.sort-active { color: var(--accent, #60a5fa); }
/* #690 — Clock Skew badges & fleet table */
.skew-badge { display: inline-block; font-size: 10px; padding: 1px 5px; border-radius: 3px; margin-left: 4px; font-weight: 600; white-space: nowrap; }
.skew-badge--default { background: var(--text-muted); color: #fff; }
.skew-badge--ok { background: var(--status-green); color: #fff; }
.skew-badge--warning { background: var(--status-yellow); color: #000; }
.skew-badge--critical { background: var(--status-orange); color: #fff; }
.skew-badge--absurd { background: var(--status-purple); color: #fff; }
.skew-badge--no_clock { background: var(--text-muted); color: #fff; }
.skew-badge--bimodal_clock { background: var(--status-amber-light); color: var(--status-amber-text); border: 1px solid var(--status-amber); }
.skew-badge--degrading { background: var(--status-yellow); color: #000; }
.skew-badge--degraded { background: var(--status-orange); color: #fff; }
.skew-badge--wrong { background: var(--status-red); color: #fff; }
.skew-detail-section { padding: 10px 16px; margin-bottom: 8px; }
.skew-sparkline-wrap { margin-top: 6px; }
.skew-sparkline-wrap svg { display: block; }
.clock-fleet-row--warning { background: color-mix(in srgb, var(--status-yellow) 10%, transparent); }
.clock-fleet-row--critical { background: color-mix(in srgb, var(--status-orange) 10%, transparent); }
.clock-fleet-row--absurd { background: color-mix(in srgb, var(--status-purple) 10%, transparent); }
.clock-fleet-row--no_clock { background: color-mix(in srgb, var(--text-muted) 10%, transparent); }
.clock-fleet-row--degrading { background: color-mix(in srgb, var(--status-yellow) 10%, transparent); }
.clock-fleet-row--degraded { background: color-mix(in srgb, var(--status-orange) 10%, transparent); }
.clock-fleet-row--wrong { background: color-mix(in srgb, var(--status-red) 10%, transparent); }
.clock-fleet-row--default { background: color-mix(in srgb, var(--text-muted) 10%, transparent); }
.clock-filter-btn { font-size: 12px; padding: 3px 8px; border: 1px solid var(--border); border-radius: 4px; background: var(--card-bg, #fff); color: var(--text); cursor: pointer; margin-right: 4px; }
.clock-filter-btn.active { background: var(--accent); color: #fff; border-color: var(--accent); }

View File

@@ -5904,12 +5904,11 @@ console.log('\n=== channel-decrypt.js: key derivation, MAC, parsing, storage ===
assert.strictEqual(ctx.window.renderSkewBadge(null, 0), '');
});
test('renderSkewBadge renders bimodal_clock badge with tooltip (#845)', () => {
var cs = { goodFraction: 0.6, recentBadSampleCount: 4, recentSampleCount: 10 };
var html = ctx.window.renderSkewBadge('bimodal_clock', -5, cs);
assert.ok(html.includes('skew-badge--bimodal_clock'), 'should contain bimodal_clock class');
assert.ok(html.includes('bimodal'), 'tooltip should mention bimodal');
assert.ok(html.includes('40%'), 'tooltip should show bad percentage');
test('renderSkewBadge renders default badge with tooltip', () => {
var cs = {};
var html = ctx.window.renderSkewBadge('default', 0, cs);
assert.ok(html.includes('skew-badge--default'), 'should contain default class');
assert.ok(html.toLowerCase().includes('firmware default'), 'tooltip should mention firmware default');
assert.ok(html.includes('⏰'), 'should contain clock emoji');
});
@@ -5933,9 +5932,9 @@ console.log('\n=== channel-decrypt.js: key derivation, MAC, parsing, storage ===
test('SKEW_SEVERITY_ORDER sorts worst first', () => {
var order = ctx.window.SKEW_SEVERITY_ORDER;
assert.ok(order.absurd < order.critical, 'absurd should sort before critical');
assert.ok(order.critical < order.warning, 'critical should sort before warning');
assert.ok(order.warning < order.ok, 'warning should sort before ok');
assert.ok(order.wrong < order.degraded, 'wrong should sort before degraded');
assert.ok(order.degraded < order.degrading, 'degraded should sort before degrading');
assert.ok(order.degrading < order.ok, 'degrading should sort before ok');
});
}