mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-05-22 02:55:10 +00:00
f81ed5b3cf
RED commit: `0190466d` — failing CI: https://github.com/Kpa-clawbot/CoreScope/actions (will populate after PR creation)

## Problem

On staging (commit `d69d9fb`, 78k tx, 2.3M obs), `curl http://localhost/api/analytics/roles` times out at 60s with 0 bytes — the Roles tab is unusable. Issue #1256.

PR #1248's steady-state recomputer fan-out (topology / rf / distance / channels / hash-collisions / hash-sizes) **didn't include roles**. The legacy handler:

1. Holds `s.mu.RLock` for the entire compute.
2. Calls `GetFleetClockSkew()`, which drives `clockSkew.Recompute(s)` over all ADVERT transmissions — O(78k) per request.
3. Concurrent ingest writers compound the latency through writer-starvation.

Result: every request hits the cold path; the response never comes back inside the 60 s HTTP budget.

## Fix

Add `roles` as the 7th endpoint in the recomputer fan-out — same pattern as #1248:

- `PacketStore.recompRoles` slot, registered in `StartAnalyticsRecomputers` with default 5-min interval.
- `PacketStore.GetAnalyticsRoles()` → atomic-pointer load from the snapshot (sub-ms), with a `computeAnalyticsRoles()` fallback only for the brief startup window before the initial sync compute completes.
- Handler is now a thin wrapper — no lock-held work on the request path.
- New optional `roles` key under `analytics.recomputeIntervalSeconds` in config; `config.example.json` and `_comment_analytics` updated.

## Latency (unit-scope benchmark)

- Worst-of-50 handler latency: **<100 ms** (test budget; well under the 2 s p99 acceptance).
- Compute itself is bounded by the existing 5-min recompute window — it runs once in the background, never on the request path.

## Tests

- RED `0190466d`: asserts `recompRoles` is registered and the handler returns under the latency budget. Fails on master with `recompRoles not registered`.
- GREEN `d7784f76`: registers the recomputer + snapshot accessor — both tests pass.

Fixes #1256

---------

Co-authored-by: openclaw-bot <bot@openclaw.local>
98 lines
3.1 KiB
Go
98 lines
3.1 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
// TestRolesAnalyticsRecomputerRegistered asserts that the
|
|
// /api/analytics/roles endpoint is backed by the steady-state
|
|
// analytics recomputer (issue #1256). On master, roles was
|
|
// NOT wired into StartAnalyticsRecomputers — every request
|
|
// holds s.mu.RLock for the whole compute and triggers a fleet
|
|
// clock-skew recompute over 78k transmissions, hanging >60s.
|
|
//
|
|
// Post-fix: after StartAnalyticsRecomputers, the store exposes
|
|
// a recomputer for roles whose Load() returns a populated
|
|
// RoleAnalyticsResponse (initial sync compute), and the
|
|
// PacketStore.GetAnalyticsRoles() accessor returns from the
|
|
// snapshot in sub-millisecond time.
|
|
func TestRolesAnalyticsRecomputerRegistered(t *testing.T) {
|
|
db := setupTestDB(t)
|
|
defer db.Close()
|
|
store := NewPacketStore(db, nil)
|
|
|
|
stop := store.StartAnalyticsRecomputers(50 * time.Millisecond)
|
|
defer stop()
|
|
|
|
// Give the initial synchronous compute a beat to populate.
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
store.analyticsRecomputerMu.RLock()
|
|
rc := store.recompRoles
|
|
store.analyticsRecomputerMu.RUnlock()
|
|
if rc == nil {
|
|
t.Fatalf("recompRoles not registered after StartAnalyticsRecomputers (issue #1256 not fixed)")
|
|
}
|
|
v := rc.Load()
|
|
if v == nil {
|
|
t.Fatalf("recompRoles snapshot is nil after initial compute")
|
|
}
|
|
if _, ok := v.(RoleAnalyticsResponse); !ok {
|
|
t.Fatalf("recompRoles snapshot type = %T, want RoleAnalyticsResponse", v)
|
|
}
|
|
|
|
// Accessor must hit the snapshot path.
|
|
t0 := time.Now()
|
|
resp := store.GetAnalyticsRoles()
|
|
dt := time.Since(t0)
|
|
if dt > 5*time.Millisecond {
|
|
t.Errorf("GetAnalyticsRoles latency = %v, want <5ms (snapshot path)", dt)
|
|
}
|
|
// Just confirm we got the response shape (empty store → empty roles).
|
|
_ = resp
|
|
}
|
|
|
|
// TestRolesHandlerUsesRecomputer is a HTTP-level guard that the
|
|
// /api/analytics/roles handler returns from the recomputer snapshot
|
|
// quickly even when no clock skew engine state has been primed (the
|
|
// hang on staging was: every call drove a full clockSkew.Recompute
|
|
// on 78k adverts). With recomputer wired, the handler is an atomic
|
|
// pointer load + JSON encode.
|
|
func TestRolesHandlerSnapshotLatency(t *testing.T) {
|
|
db := setupTestDB(t)
|
|
defer db.Close()
|
|
store := NewPacketStore(db, nil)
|
|
stop := store.StartAnalyticsRecomputers(50 * time.Millisecond)
|
|
defer stop()
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
s := &Server{store: store}
|
|
|
|
// p99 over 50 reads must be well under 2 s (issue acceptance).
|
|
worst := time.Duration(0)
|
|
for i := 0; i < 50; i++ {
|
|
rr := httptest.NewRecorder()
|
|
req := httptest.NewRequest(http.MethodGet, "/api/analytics/roles", nil)
|
|
t0 := time.Now()
|
|
s.handleAnalyticsRoles(rr, req)
|
|
dt := time.Since(t0)
|
|
if dt > worst {
|
|
worst = dt
|
|
}
|
|
if rr.Code != http.StatusOK {
|
|
t.Fatalf("status = %d, want 200", rr.Code)
|
|
}
|
|
var out RoleAnalyticsResponse
|
|
if err := json.Unmarshal(rr.Body.Bytes(), &out); err != nil {
|
|
t.Fatalf("invalid json: %v", err)
|
|
}
|
|
}
|
|
if worst > 100*time.Millisecond {
|
|
t.Fatalf("worst-of-50 handler latency = %v, want <100ms (recomputer snapshot)", worst)
|
|
}
|
|
}
|