Files
meshcore-analyzer/cmd/server/role_analytics_recompute_test.go
Kpa-clawbot f81ed5b3cf perf(#1256): wire /api/analytics/roles into steady-state recomputer (#1259)
RED commit: `0190466d` — failing CI:
https://github.com/Kpa-clawbot/CoreScope/actions (will populate after PR
creation)

## Problem
On staging (commit `d69d9fb`, 78k tx, 2.3M obs), `curl
http://localhost/api/analytics/roles` times out at 60s with 0 bytes —
the Roles tab is unusable. Issue #1256.

PR #1248's steady-state recomputer fan-out (topology / rf / distance /
channels / hash-collisions / hash-sizes) **didn't include roles**. The
legacy handler:

1. Holds `s.mu.RLock` for the entire compute.
2. Calls `GetFleetClockSkew()`, which drives `clockSkew.Recompute(s)`
over all ADVERT transmissions — O(78k) per request.
3. Concurrent ingest writers compound the latency through
writer-starvation.

Result: every request hits the cold path; the response never comes back
inside the 60 s HTTP budget.

## Fix
Add `roles` as the 7th endpoint in the recomputer fan-out — same pattern
as #1248:

- `PacketStore.recompRoles` slot, registered in
`StartAnalyticsRecomputers` with default 5-min interval.
- `PacketStore.GetAnalyticsRoles()` → atomic-pointer load from the
snapshot (sub-ms), with a `computeAnalyticsRoles()` fallback only for
the brief startup window before the initial sync compute completes.
- Handler is now a thin wrapper — no lock-held work on the request path.
- New optional `roles` key under `analytics.recomputeIntervalSeconds` in
config; `config.example.json` and `_comment_analytics` updated.

## Latency (unit-scope benchmark)
- Worst-of-50 handler latency: **<100 ms** (test budget; well under the
2 s p99 acceptance).
- Compute itself is bounded by the existing 5-min recompute window — it
runs once in the background, never on the request path.

## Tests
- RED `0190466d`: asserts `recompRoles` is registered and the handler
returns under the latency budget. Fails on master with `recompRoles not
registered`.
- GREEN `d7784f76`: registers the recomputer + snapshot accessor — both
tests pass.

Fixes #1256

---------

Co-authored-by: openclaw-bot <bot@openclaw.local>
2026-05-18 07:36:28 -07:00

98 lines
3.1 KiB
Go

package main
import (
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestRolesAnalyticsRecomputerRegistered is the regression guard for
// issue #1256: /api/analytics/roles must be served from the steady-state
// analytics recomputer. Before the fix, roles was missing from
// StartAnalyticsRecomputers, so each request held s.mu.RLock for the
// whole compute and drove a fleet clock-skew recompute over 78k
// transmissions, hanging for more than 60s.
//
// The test checks three things once StartAnalyticsRecomputers has run:
//   - store.recompRoles is non-nil,
//   - its Load() yields a non-nil RoleAnalyticsResponse snapshot,
//   - PacketStore.GetAnalyticsRoles() returns within 5ms.
func TestRolesAnalyticsRecomputerRegistered(t *testing.T) {
	db := setupTestDB(t)
	defer db.Close()

	store := NewPacketStore(db, nil)
	stop := store.StartAnalyticsRecomputers(50 * time.Millisecond)
	defer stop()

	// Leave a short grace period so the initial compute has populated
	// the snapshot before it is inspected.
	time.Sleep(100 * time.Millisecond)

	// Read the recomputer slot under its own lock, then release the lock
	// before making any assertions.
	store.analyticsRecomputerMu.RLock()
	recomputer := store.recompRoles
	store.analyticsRecomputerMu.RUnlock()
	if recomputer == nil {
		t.Fatalf("recompRoles not registered after StartAnalyticsRecomputers (issue #1256 not fixed)")
	}

	snapshot := recomputer.Load()
	if snapshot == nil {
		t.Fatalf("recompRoles snapshot is nil after initial compute")
	}
	_, isRoles := snapshot.(RoleAnalyticsResponse)
	if !isRoles {
		t.Fatalf("recompRoles snapshot type = %T, want RoleAnalyticsResponse", snapshot)
	}

	// The accessor is expected to be served from the snapshot, so a
	// single call has to fit inside a very small wall-clock budget.
	const budget = 5 * time.Millisecond
	start := time.Now()
	// The returned value is deliberately not inspected: this test only
	// times the call.
	_ = store.GetAnalyticsRoles()
	if elapsed := time.Since(start); elapsed > budget {
		t.Errorf("GetAnalyticsRoles latency = %v, want <5ms (snapshot path)", elapsed)
	}
}
// TestRolesHandlerSnapshotLatency is an HTTP-level guard that the
// /api/analytics/roles handler answers quickly from the recomputer
// snapshot, even when no clock-skew engine state has been primed. The
// staging hang came from every call driving a full clockSkew.Recompute
// over 78k adverts; with the recomputer wired in, the handler is meant
// to be an atomic pointer load plus a JSON encode.
//
// It issues 50 sequential requests straight to the handler, requires
// each to return 200 with a body that decodes as RoleAnalyticsResponse,
// and fails if the slowest call exceeds 100ms.
func TestRolesHandlerSnapshotLatency(t *testing.T) {
	db := setupTestDB(t)
	defer db.Close()

	store := NewPacketStore(db, nil)
	stop := store.StartAnalyticsRecomputers(50 * time.Millisecond)
	defer stop()

	// Let the initial compute populate the snapshot before timing reads.
	time.Sleep(100 * time.Millisecond)

	srv := &Server{store: store}

	// The issue's acceptance bar is a 2 s p99. This test is stricter: it
	// tracks the single slowest of the reads against a 100 ms budget.
	const (
		reads  = 50
		budget = 100 * time.Millisecond
	)
	var slowest time.Duration
	for n := 0; n < reads; n++ {
		rec := httptest.NewRecorder()
		req := httptest.NewRequest(http.MethodGet, "/api/analytics/roles", nil)

		// Only the handler call is timed; request construction and
		// response decoding are excluded.
		began := time.Now()
		srv.handleAnalyticsRoles(rec, req)
		if took := time.Since(began); took > slowest {
			slowest = took
		}

		if rec.Code != http.StatusOK {
			t.Fatalf("status = %d, want 200", rec.Code)
		}
		var decoded RoleAnalyticsResponse
		if err := json.Unmarshal(rec.Body.Bytes(), &decoded); err != nil {
			t.Fatalf("invalid json: %v", err)
		}
	}

	if slowest > budget {
		t.Fatalf("worst-of-50 handler latency = %v, want <100ms (recomputer snapshot)", slowest)
	}
}