mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-06-05 08:41:26 +00:00
ae17a2be12
RED commit: `22ce5736066142583017cad7303fa48d9e00ccf0` — CI on red: https://github.com/Kpa-clawbot/CoreScope/actions?query=branch%3Afix%2Fissue-1262

## Problem

After #1260 added a 15s-TTL bulk cache for repeater enrichment in `handleNodes`, `/api/nodes` (default limit) dropped to ~500ms. But `/api/nodes?limit=2000` — called by `public/live.js` at SPA startup for hop resolution — still took **15.7s cold** on staging (75k tx, 600 nodes). Warm hits were ~40ms.

Root cause: the bulk cache was lazily populated on the first request after TTL expiry. The rebuild ran on the request-serving goroutine. Every cold SPA load triggered the rebuild and ate 15s.

## Fix

Add `StartRepeaterEnrichmentRecomputer` — a steady-state background recomputer that mirrors the `analytics_recomputer.go` pattern from #1240:

- **Prewarm**: initial synchronous compute on Start so the first request hits a populated cache.
- **Steady-state**: ticker refreshes the snapshot every 5min (configurable via the existing analytics recompute interval knob).
- **Panic-safe** + idempotent Start.

Wired into `main.go` right after `StartAnalyticsRecomputers`, using `cfg.GetHealthThresholds().RelayActiveHours` as the window.

## Test

`TestHandleNodesLimit2000ColdMiss` — seeds 600 nodes + 150k non-advert tx with repeaters indexed under a shared 1-byte hop prefix (matches production hop-prefix collisions), starts the recomputer, then issues `/api/nodes?limit=2000` with **no HTTP warmup**.

| State | Latency |
|---|---|
| Before (master, on-thread rebuild) | 3.37s |
| After (prewarm + steady-state) | 56ms |
| Budget | 2s |

Staging end-to-end: 15.7s → expected sub-100ms on the same call path.

Red commit (`22ce5736066142583017cad7303fa48d9e00ccf0`) compiles with a no-op stub of the new method so the test fails on the latency **assertion**, not a missing symbol.

Fixes #1262

---------

Co-authored-by: corescope-bot <bot@corescope.local>
105 lines
3.5 KiB
Go
105 lines
3.5 KiB
Go
package main
|
|
|
|
import (
	"log"
	"sync"
	"time"
)
|
|
|
|
// repeaterEnrichmentRecomputerDefaultInterval is the default tick
// interval for the steady-state recompute of the repeater enrichment
// bulk caches. The on-request 15s-TTL fallback in
// repeater_enrich_bulk.go is kept as a safety net — the recomputer
// just makes sure the cache is populated before any request arrives.
//
// 5min mirrors the analytics_recomputer default from #1240 and is
// plenty fresh for an at-a-glance status column.
const repeaterEnrichmentRecomputerDefaultInterval = 5 * time.Minute
|
|
|
|
// StartRepeaterEnrichmentRecomputer is the steady-state background
|
|
// recompute loop for the repeater enrichment bulk caches consumed by
|
|
// handleNodes (GetRepeaterRelayInfoMap + GetRepeaterUsefulnessScoreMap).
|
|
//
|
|
// Why this exists (issue #1262): PR #1260 added a 15s-TTL bulk cache,
|
|
// but the rebuild itself runs on the request-serving goroutine on the
|
|
// first request after startup or after the TTL expires. On staging
|
|
// (75k tx, 600 nodes) that cold rebuild took 15.7s and was triggered
|
|
// by every cold SPA load via live.js's /api/nodes?limit=2000 call.
|
|
//
|
|
// On Start this does an initial synchronous compute (so the next
|
|
// request hits cache) and then ticks every `interval` to keep the
|
|
// snapshot fresh — same pattern as analytics_recomputer.go (#1240).
|
|
//
|
|
// Returns a stop closure that signals the goroutine and waits for it
|
|
// to exit (with a 5s defensive timeout).
|
|
//
|
|
// Safe to call multiple times: subsequent calls are no-ops and return
|
|
// a no-op stop closure (the original goroutine retains ownership).
|
|
func (s *PacketStore) StartRepeaterEnrichmentRecomputer(windowHours float64, interval time.Duration) func() {
|
|
if interval <= 0 {
|
|
interval = repeaterEnrichmentRecomputerDefaultInterval
|
|
}
|
|
|
|
s.repeaterEnrichRecompMu.Lock()
|
|
if s.repeaterEnrichRecompStarted {
|
|
s.repeaterEnrichRecompMu.Unlock()
|
|
return func() {}
|
|
}
|
|
s.repeaterEnrichRecompStarted = true
|
|
stop := make(chan struct{})
|
|
done := make(chan struct{})
|
|
s.repeaterEnrichRecompStop = stop
|
|
s.repeaterEnrichRecompDone = done
|
|
s.repeaterEnrichRecompMu.Unlock()
|
|
|
|
// Initial synchronous prewarm — the entire point of this recomputer
|
|
// is to make sure the very first /api/nodes?limit=2000 from
|
|
// live.js's SPA bootstrap (issue #1262) hits a populated cache
|
|
// instead of paying the on-thread rebuild cost.
|
|
recomputeRepeaterEnrichmentSafe(s, windowHours)
|
|
|
|
var stopOnce sync.Once
|
|
go func() {
|
|
defer close(done)
|
|
t := time.NewTicker(interval)
|
|
defer t.Stop()
|
|
for {
|
|
select {
|
|
case <-t.C:
|
|
recomputeRepeaterEnrichmentSafe(s, windowHours)
|
|
case <-stop:
|
|
return
|
|
}
|
|
}
|
|
}()
|
|
|
|
return func() {
|
|
stopOnce.Do(func() {
|
|
close(stop)
|
|
})
|
|
select {
|
|
case <-done:
|
|
case <-time.After(5 * time.Second):
|
|
}
|
|
}
|
|
}
|
|
|
|
// recomputeRepeaterEnrichmentSafe runs both bulk-cache compute paths
|
|
// behind a panic recover — a panic in compute must not kill the
|
|
// background goroutine (the previous snapshot remains valid).
|
|
func recomputeRepeaterEnrichmentSafe(s *PacketStore, windowHours float64) {
|
|
defer func() { _ = recover() }()
|
|
// Bypass the 15s-TTL gate by forcing a fresh recompute and
|
|
// installing the result. The public Get* helpers would return the
|
|
// existing cache when within TTL; we want to refresh proactively.
|
|
relay := s.computeRepeaterRelayInfoMap(windowHours)
|
|
useful := s.computeRepeaterUsefulnessScoreMap()
|
|
now := time.Now()
|
|
s.repeaterEnrichMu.Lock()
|
|
s.repeaterRelayCache = relay
|
|
s.repeaterRelayCacheWin = windowHours
|
|
s.repeaterRelayAt = now
|
|
s.repeaterUsefulCache = useful
|
|
s.repeaterUsefulAt = now
|
|
s.repeaterEnrichMu.Unlock()
|
|
}
|