Files
meshcore-analyzer/cmd/server/repeater_usefulness.go
T
efiten 38eb7103b3 perf(nodes): batch relay stats to fix O(N×M) /api/nodes regression (#1164)
## Problem

`handleNodes` enriches each repeater/room node by calling
`GetRepeaterRelayInfo` and `GetRepeaterUsefulnessScore` **per node**
inside a loop. `GetRepeaterUsefulnessScore` acquires `s.mu.RLock()` and
then iterates **all** `byPayloadType` entries to compute the non-advert
denominator — once per node.

On a deployment with ~1500 repeater/room nodes and ~145K transmissions
in memory, this is **~220M iterations per `/api/nodes` request**, plus
~3000 separate lock acquisitions. Response times of 18–44 seconds have
been observed in production, especially during startup backfill when
write-lock contention compounds the issue.

## Fix

Add `GetRepeaterNodeStatsBatch(pubkeys []string, windowHours float64)
map[string]RepeaterNodeStats` to `repeater_usefulness.go`:

- Takes **one** `s.mu.RLock()` for the entire node list
- Computes the non-advert denominator **once** (shared across all nodes)
- Snapshots `byPathHop` slice headers for all requested pubkeys under
that single lock
- Processes timestamps and counts **outside** the lock

Update `handleNodes` to collect repeater/room pubkeys first, call the
batch method once, and apply results.

**Complexity: O(M + N) instead of O(N × M)** per request (M = total
transmissions, N = repeater nodes).

`GetRepeaterRelayInfo` and `GetRepeaterUsefulnessScore` are unchanged —
they are still correct for single-node calls (e.g. `handleNodeDetail`).

## Test plan

- [ ] `go build ./cmd/server` passes
- [ ] `/api/nodes` response is correct (relay_active,
relay_count_1h/24h, usefulness_score fields present for repeaters)
- [ ] No change in output for `/api/nodes/{pubkey}` (uses existing
single-node methods)
- [ ] CI passes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-authored-by: openclaw-bot <bot@openclaw.local>
2026-05-20 20:57:02 -07:00

177 lines
5.0 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"sort"
"strings"
"time"
)
// GetRepeaterUsefulnessScore reports, as a value in [0,1], the share of
// non-advert traffic held by the store that lists pubkey as a relay hop.
// This is the Traffic axis of issue #672; the bridge, coverage, and
// redundancy axes are deferred to follow-up work.
//
// The ratio is built from two counts taken under s.mu's read lock:
//
//   - numerator: non-nil entries of byPathHop[lowercase(pubkey)] whose
//     payload type is not payloadTypeAdvert (entries with a nil
//     PayloadType are counted);
//   - denominator: the summed lengths of every byPayloadType list except
//     the one keyed by payloadTypeAdvert.
//
// The result is 0 when pubkey is empty, when the store holds no
// non-advert traffic, or when the repeater has no qualifying hop entries.
// The ratio is clamped to [0,1] as a defensive bound.
//
// Cost is one pass over the byPayloadType keys (only list lengths are
// read) plus one pass over this pubkey's hop list. Callers enriching many
// nodes at once should use GetRepeaterNodeStatsBatch, which computes the
// denominator a single time.
func (s *PacketStore) GetRepeaterUsefulnessScore(pubkey string) float64 {
	if pubkey == "" {
		return 0
	}
	hopKey := strings.ToLower(pubkey)

	s.mu.RLock()
	defer s.mu.RUnlock()

	// Denominator: every stored packet that is not an advert.
	var nonAdvertTotal int
	for payloadType, txs := range s.byPayloadType {
		if payloadType != payloadTypeAdvert {
			nonAdvertTotal += len(txs)
		}
	}
	if nonAdvertTotal == 0 {
		return 0
	}

	// Numerator: non-advert packets that passed through this repeater.
	var hits int
	for _, tx := range s.byPathHop[hopKey] {
		if tx == nil || (tx.PayloadType != nil && *tx.PayloadType == payloadTypeAdvert) {
			continue
		}
		hits++
	}

	ratio := float64(hits) / float64(nonAdvertTotal)
	switch {
	case ratio < 0:
		return 0
	case ratio > 1:
		return 1
	default:
		return ratio
	}
}
// RepeaterNodeStats bundles relay-activity and usefulness data for a single node.
// It is the per-pubkey value type of the map returned by GetRepeaterNodeStatsBatch.
type RepeaterNodeStats struct {
// Info is the relay-activity summary that computeRelayInfoFromEntries
// derives from the node's relay entries for the requested window.
Info RepeaterRelayInfo
// Score is the node's non-advert hop count divided by the store's total
// non-advert packet count, capped at 1; it is 0 when either count is 0.
Score float64
}
// GetRepeaterNodeStatsBatch computes relay info and a usefulness score for
// every pubkey in pubkeys using a single acquisition of s.mu's read lock.
// The non-advert denominator (sum of byPayloadType list lengths, excluding
// payloadTypeAdvert) is computed once and shared by all nodes, replacing the
// per-node GetRepeaterRelayInfo + GetRepeaterUsefulnessScore loop in
// handleNodes.
//
// Parameters:
//   - pubkeys: node public keys. Store lookups use the lowercased key; the
//     returned map is keyed by each pubkey exactly as supplied. A pubkey that
//     appears more than once is processed only once.
//   - windowHours: forwarded unchanged to computeRelayInfoFromEntries.
//
// Returns a non-nil map with one entry per distinct supplied pubkey. Score is
// relayed/totalNonAdvert capped at 1, and is 0 when either count is 0.
//
// Locking: the read lock is released with defer, so a panic raised while the
// store is being scanned cannot leave s.mu read-locked and block writers.
// Only the relay-entry snapshots and integer counts gathered under the lock
// are used after it is released. (collectRelayEntriesLocked is expected to
// copy what it needs out of StoreTx — verify against its definition.)
func (s *PacketStore) GetRepeaterNodeStatsBatch(pubkeys []string, windowHours float64) map[string]RepeaterNodeStats {
	result := make(map[string]RepeaterNodeStats, len(pubkeys))
	if len(pubkeys) == 0 {
		return result
	}

	type nodeSnap struct {
		entries []relayEntry
		relayed int // non-advert count in the full-key byPathHop list only (for the usefulness score)
	}

	totalNonAdvert := 0
	snaps := make(map[string]nodeSnap, len(pubkeys))

	// Everything that touches store indexes runs inside this closure so the
	// deferred RUnlock fires on every exit path, including panics.
	func() {
		s.mu.RLock()
		defer s.mu.RUnlock()

		for pt, list := range s.byPayloadType {
			if pt != payloadTypeAdvert {
				totalNonAdvert += len(list)
			}
		}

		for _, pk := range pubkeys {
			if _, seen := snaps[pk]; seen {
				// Duplicate input: the snapshot would be identical, skip the rescan.
				continue
			}
			key := strings.ToLower(pk)
			entries := s.collectRelayEntriesLocked(key)
			relayed := 0
			for _, tx := range s.byPathHop[key] {
				// A nil PayloadType is treated as non-advert, matching
				// GetRepeaterUsefulnessScore.
				if tx != nil && (tx.PayloadType == nil || *tx.PayloadType != payloadTypeAdvert) {
					relayed++
				}
			}
			snaps[pk] = nodeSnap{entries: entries, relayed: relayed}
		}
	}()

	// Lock-free phase: derive the per-node results from the snapshots.
	for pk, snap := range snaps {
		info := computeRelayInfoFromEntries(snap.entries, windowHours)
		var score float64
		if totalNonAdvert > 0 && snap.relayed > 0 {
			score = float64(snap.relayed) / float64(totalNonAdvert)
			if score > 1 {
				score = 1
			}
		}
		result[pk] = RepeaterNodeStats{Info: info, Score: score}
	}
	return result
}
// GetRepeaterNodeStatsBatchCached is a memoizing front for
// GetRepeaterNodeStatsBatch. A single result is retained and reused while
// all of the following hold: the pubkey set (compared via pubkeySig) is the
// same, windowHours is the same, and the entry is younger than 300 seconds.
// Any mismatch or expiry triggers a fresh batch computation that replaces
// the retained entry.
//
// handleNodes calls this for every map/live/node request; without caching,
// the full batch over ~1900 repeaters takes 20-30s on large datasets.
// With a 300s TTL the cold compute (~25s) runs at most once per 5min
// (~8% duty cycle), versus ~82% with the previous 30s TTL.
//
// Notes for callers:
//   - On a hit the retained map itself is returned, not a copy, so the
//     result must be treated as read-only.
//   - relayStatsCacheMu is not held while the batch is computed, so
//     concurrent misses each run their own computation and the last one
//     to finish is the one that stays cached.
func (s *PacketStore) GetRepeaterNodeStatsBatchCached(pubkeys []string, windowHours float64) map[string]RepeaterNodeStats {
	const ttl = 300 * time.Second

	wantSig := pubkeySig(pubkeys)

	// Fast path: inspect the retained entry under the cache mutex.
	s.relayStatsCacheMu.Lock()
	held := s.relayStatsCache
	usable := held != nil &&
		s.relayStatsCacheSig == wantSig &&
		s.relayStatsCacheWindow == windowHours &&
		time.Since(s.relayStatsCacheAt) < ttl
	s.relayStatsCacheMu.Unlock()
	if usable {
		return held
	}

	// Slow path: compute without holding the cache mutex, then publish.
	fresh := s.GetRepeaterNodeStatsBatch(pubkeys, windowHours)

	s.relayStatsCacheMu.Lock()
	s.relayStatsCache = fresh
	s.relayStatsCacheAt = time.Now()
	s.relayStatsCacheWindow = windowHours
	s.relayStatsCacheSig = wantSig
	s.relayStatsCacheMu.Unlock()

	return fresh
}
// pubkeySig builds a cache signature for a set of pubkeys: the keys sorted
// lexicographically and joined with commas. Because the keys are sorted
// first, any ordering of the same keys yields the same signature. The
// comparison is case-sensitive and duplicates are kept. The caller's slice
// is left untouched (a copy is sorted). An empty or nil input yields "".
func pubkeySig(pubkeys []string) string {
	n := len(pubkeys)
	if n == 0 {
		return ""
	}
	ordered := append(make([]string, 0, n), pubkeys...)
	sort.Strings(ordered)
	return strings.Join(ordered, ",")
}