mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-06-01 20:14:04 +00:00
38eb7103b3
## Problem
`handleNodes` enriches each repeater/room node by calling
`GetRepeaterRelayInfo` and `GetRepeaterUsefulnessScore` **per node**
inside a loop. `GetRepeaterUsefulnessScore` acquires `s.mu.RLock()` and
then iterates **all** `byPayloadType` entries to compute the non-advert
denominator — once per node.
On a deployment with ~1500 repeater/room nodes and ~145K transmissions
in memory, this is **~220M iterations per `/api/nodes` request**, plus
~3000 separate lock acquisitions. Response times of 18–44 seconds have
been observed in production, especially during startup backfill when
write-lock contention compounds the issue.
## Fix
Add `GetRepeaterNodeStatsBatch(pubkeys []string, windowHours float64)
map[string]RepeaterNodeStats` to `repeater_usefulness.go`:
- Takes **one** `s.mu.RLock()` for the entire node list
- Computes the non-advert denominator **once** (shared across all nodes)
- Snapshots `byPathHop` slice headers for all requested pubkeys under
that single lock
- Processes timestamps and counts **outside** the lock
Update `handleNodes` to collect repeater/room pubkeys first, call the
batch method once, and apply results.
**Complexity: O(M + N) instead of O(N × M)** per request (M = total
transmissions, N = repeater nodes).
`GetRepeaterRelayInfo` and `GetRepeaterUsefulnessScore` are unchanged —
they are still correct for single-node calls (e.g. `handleNodeDetail`).
## Test plan
- [ ] `go build ./cmd/server` passes
- [ ] `/api/nodes` response is correct (relay_active,
relay_count_1h/24h, usefulness_score fields present for repeaters)
- [ ] No change in output for `/api/nodes/{pubkey}` (uses existing
single-node methods)
- [ ] CI passes
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-authored-by: openclaw-bot <bot@openclaw.local>
191 lines
6.5 KiB
Go
191 lines
6.5 KiB
Go
package main
|
|
|
|
import (
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// RepeaterRelayInfo describes whether a repeater has been observed
// relaying traffic (appearing as a path hop in non-advert packets) and
// when. This is distinct from advert-based liveness (last_seen / last_heard),
// which only proves the repeater can transmit its own adverts.
//
// See issue #662.
type RepeaterRelayInfo struct {
	// LastRelayed is the ISO-8601 timestamp of the most recent non-advert
	// packet where this pubkey appeared as a relay hop. Empty if never,
	// in which case it is omitted from the JSON encoding.
	LastRelayed string `json:"lastRelayed,omitempty"`

	// RelayActive is true if LastRelayed falls within the configured
	// activity window (default 24h).
	RelayActive bool `json:"relayActive"`

	// WindowHours is the active-window threshold actually used.
	WindowHours float64 `json:"windowHours"`

	// RelayCount1h is the count of distinct non-advert packets where this
	// pubkey appeared as a relay hop in the last 1 hour.
	RelayCount1h int `json:"relayCount1h"`

	// RelayCount24h is the count of distinct non-advert packets where this
	// pubkey appeared as a relay hop in the last 24 hours.
	RelayCount24h int `json:"relayCount24h"`
}
|
|
|
|
// payloadTypeAdvert is the MeshCore payload type for ADVERT packets.
// See firmware/src/Mesh.h. Adverts are NOT considered relay activity:
// a repeater that only sends adverts proves it is alive, not that it
// is forwarding traffic for other nodes.
const payloadTypeAdvert = 4
|
|
|
|
// parseRelayTS attempts to parse a packet first-seen timestamp using the
// formats CoreScope writes in practice. It returns the parsed time and
// true on success, or the zero time and false when ts is empty or matches
// none of the layouts.
//
// Layouts are tried in this order and the first match wins:
//   - time.RFC3339Nano — Go's default UTC marshal output
//   - time.RFC3339 — second-precision ISO-8601 with offset
//   - "2006-01-02T15:04:05.000Z" — millisecond-precision Z form used by ingest
func parseRelayTS(ts string) (time.Time, bool) {
	if ts == "" {
		return time.Time{}, false
	}
	layouts := [...]string{
		time.RFC3339Nano,
		time.RFC3339,
		"2006-01-02T15:04:05.000Z",
	}
	for _, layout := range layouts {
		if t, err := time.Parse(layout, ts); err == nil {
			return t, true
		}
	}
	return time.Time{}, false
}
|
|
|
|
// relayEntry is a minimal snapshot of a StoreTx taken while the store
// read-lock is held. Copying only the fields we need lets us release the
// lock before doing timestamp parsing and comparison work.
type relayEntry struct {
	// ts is the packet's first-seen timestamp, kept as the raw string so
	// that parsing can happen after the lock is released.
	ts string
	// pt is the packet's payload type, or -1 when the source StoreTx had
	// no payload type set.
	pt int
}
|
|
|
|
// collectRelayEntriesLocked returns deduplicated relayEntry snapshots for
|
|
// all StoreTx entries indexed under key (full pubkey) and its 1-byte wire
|
|
// prefix. Caller MUST hold s.mu at least for reading.
|
|
//
|
|
// byPathHop is keyed by both full resolved pubkey AND raw 1-byte hop
|
|
// prefix (e.g. "a3"). Many ingested non-advert packets only carry the
|
|
// raw hop on the wire — resolution to the full pubkey happens later via
|
|
// neighbor affinity. Looking up both keys and de-duping by tx ID matches
|
|
// what the "Paths seen through node" view shows.
|
|
//
|
|
// The 1-byte prefix lookup CAN over-count when multiple nodes share the
|
|
// same first byte. This trades a possible over-count for clearly false
|
|
// zeros (issue #662).
|
|
func (s *PacketStore) collectRelayEntriesLocked(key string) []relayEntry {
|
|
txList := s.byPathHop[key]
|
|
var prefixList []*StoreTx
|
|
if len(key) >= 2 {
|
|
// key[:2] is the first 2 hex characters — exactly 1 byte of raw
|
|
// hop data, matching addTxToPathHopIndex for wire-level hops.
|
|
prefix := key[:2]
|
|
if prefix != key {
|
|
prefixList = s.byPathHop[prefix]
|
|
}
|
|
}
|
|
|
|
// Capacity hint: upper-bound is len(txList)+len(prefixList). The
|
|
// collect() pass below uses `seen` for true dedup, so we don't need
|
|
// a separate prepass (PR #1164 CR item 3: dead `uniq` map removed).
|
|
hint := len(txList) + len(prefixList)
|
|
entries := make([]relayEntry, 0, hint)
|
|
seen := make(map[int]bool, hint)
|
|
collect := func(list []*StoreTx) {
|
|
for _, tx := range list {
|
|
if tx == nil || seen[tx.ID] {
|
|
continue
|
|
}
|
|
seen[tx.ID] = true
|
|
pt := -1
|
|
if tx.PayloadType != nil {
|
|
pt = *tx.PayloadType
|
|
}
|
|
entries = append(entries, relayEntry{ts: tx.FirstSeen, pt: pt})
|
|
}
|
|
}
|
|
collect(txList)
|
|
collect(prefixList)
|
|
return entries
|
|
}
|
|
|
|
// computeRelayInfoFromEntries derives RepeaterRelayInfo from pre-snapshotted
|
|
// relayEntry values. Safe to call without any lock held.
|
|
func computeRelayInfoFromEntries(entries []relayEntry, windowHours float64) RepeaterRelayInfo {
|
|
info := RepeaterRelayInfo{WindowHours: windowHours}
|
|
|
|
now := time.Now().UTC()
|
|
cutoff1h := now.Add(-time.Hour)
|
|
cutoff24h := now.Add(-24 * time.Hour)
|
|
|
|
var latest time.Time
|
|
var latestRaw string
|
|
for _, e := range entries {
|
|
// Self-originated adverts are not relay activity.
|
|
if e.pt == payloadTypeAdvert {
|
|
continue
|
|
}
|
|
t, ok := parseRelayTS(e.ts)
|
|
if !ok {
|
|
continue
|
|
}
|
|
if t.After(latest) {
|
|
latest = t
|
|
latestRaw = e.ts
|
|
}
|
|
if t.After(cutoff24h) {
|
|
info.RelayCount24h++
|
|
if t.After(cutoff1h) {
|
|
info.RelayCount1h++
|
|
}
|
|
}
|
|
}
|
|
if latestRaw == "" {
|
|
return info
|
|
}
|
|
info.LastRelayed = latestRaw
|
|
|
|
if windowHours > 0 {
|
|
cutoff := now.Add(-time.Duration(windowHours * float64(time.Hour)))
|
|
if latest.After(cutoff) {
|
|
info.RelayActive = true
|
|
}
|
|
}
|
|
return info
|
|
}
|
|
|
|
// GetRepeaterRelayInfo returns relay-activity information for a node by
|
|
// scanning the byPathHop index for non-advert packets that name the
|
|
// pubkey as a hop. It computes the most recent appearance timestamp,
|
|
// 1h/24h hop counts, and whether the latest appearance falls within
|
|
// windowHours.
|
|
//
|
|
// Cost: O(N) over the indexed entries for `pubkey`. The byPathHop index
|
|
// is bounded by store eviction; on real data this is small per-node.
|
|
//
|
|
// Note on self-as-source: byPathHop is keyed by every hop in a packet's
|
|
// resolved path, including the originator. For ADVERT packets that's the
|
|
// node itself, which is filtered above by the payloadTypeAdvert check.
|
|
// For non-advert packets a node "originates" rather than "relays" only
|
|
// when it is the source; we don't currently have a clean signal for that
|
|
// distinction, so the count here is *path-hop appearances in non-advert
|
|
// packets*. In practice for a repeater nearly all such appearances are
|
|
// relay hops (the firmware doesn't originate user traffic), so this is
|
|
// the right approximation for issue #662.
|
|
func (s *PacketStore) GetRepeaterRelayInfo(pubkey string, windowHours float64) RepeaterRelayInfo {
|
|
if pubkey == "" {
|
|
return RepeaterRelayInfo{WindowHours: windowHours}
|
|
}
|
|
key := strings.ToLower(pubkey)
|
|
|
|
s.mu.RLock()
|
|
entries := s.collectRelayEntriesLocked(key)
|
|
s.mu.RUnlock()
|
|
|
|
return computeRelayInfoFromEntries(entries, windowHours)
|
|
}
|