mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-06-13 11:51:37 +00:00
Compare commits
286 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3c440c0049 | |||
| d954ea7444 | |||
| e96f0f9f9f | |||
| 547b141530 | |||
| a4af0285fd | |||
| 6dfe589b57 | |||
| 79cf453660 | |||
| dd2b3d2e21 | |||
| a8c99c61fd | |||
| e4be735e02 | |||
| 048143f54f | |||
| d910ea0208 | |||
| a2004351d3 | |||
| 6aa5146b93 | |||
| efd66ea3f5 | |||
| 2ef7d2437d | |||
| 626900a22a | |||
| 653d47e03c | |||
| 2d59f15a07 | |||
| edc6d5da02 | |||
| f0addfdabf | |||
| f06359d739 | |||
| b0996047ef | |||
| ef13b22291 | |||
| bb3fd21f9f | |||
| e3a3f93f7b | |||
| 3114be7a52 | |||
| d0b60b372d | |||
| e74e860725 | |||
| 037dc8c400 | |||
| 0712c5ff31 | |||
| 938153dd92 | |||
| 825b26485c | |||
| e04c7113cb | |||
| fb6bb085a5 | |||
| 89eade6e7b | |||
| 1116801b2f | |||
| 2b6809cd28 | |||
| b812a98a71 | |||
| 3062745437 | |||
| 55e4d957b1 | |||
| 167af54eb8 | |||
| c93ae67ed0 | |||
| 531bc8acb3 | |||
| d72ab69f87 | |||
| 8894d760f2 | |||
| 8909fbe060 | |||
| 9436c05799 | |||
| 66bc4a2d53 | |||
| 0a27dd9ce2 | |||
| 9002b25bce | |||
| c5414b33b7 | |||
| 440cf3ec40 | |||
| e3ac2ce28a | |||
| 2cc6cb25b8 | |||
| cb3d7652fc | |||
| 7fed20be71 | |||
| 7575ad54e0 | |||
| 0444dfe2ce | |||
| bd441a7bdd | |||
| d7793aa590 | |||
| 8295c2115c | |||
| 59d664692d | |||
| ef26d5d548 | |||
| 58d6670db1 | |||
| 890a03f95c | |||
| 76b406f70a | |||
| fc106adbf2 | |||
| 078225a54e | |||
| 8540b01cb1 | |||
| 52cb7b0806 | |||
| f2fa62a0ff | |||
| 18de61769f | |||
| 9c044f5e89 | |||
| 43be1bb76a | |||
| 718e74e8e3 | |||
| 1e51727c46 | |||
| a4b1b3662d | |||
| a7a2d79c9e | |||
| 97cfe2fc3f | |||
| e2212f5015 | |||
| 5cf9681242 | |||
| c029003814 | |||
| 9b8cac2bc4 | |||
| 8709453b14 | |||
| 78a55d5de7 | |||
| 9c5faab1e4 | |||
| 4572ce8b98 | |||
| 218f13e39c | |||
| c23ee30221 | |||
| 9e30da1fcc | |||
| 4d7ed3d582 | |||
| 47f85f6c4c | |||
| efd6464204 | |||
| 5d415bff6e | |||
| 20b137c6ea | |||
| 95ca7a6acc | |||
| f3749425fb | |||
| a4776557ae | |||
| fa02f23a40 | |||
| b7e99d9ec5 | |||
| e6f71f496f | |||
| ad9da1b61d | |||
| 12b121d4d2 | |||
| 3d12266595 | |||
| 4165d9e17e | |||
| 7afa5983ff | |||
| e45c696562 | |||
| a0b15e3bf0 | |||
| 55dc370462 | |||
| e9aed641bd | |||
| 064d142cb9 | |||
| 44c14b1180 | |||
| 330636f9b3 | |||
| a41b9a5ac7 | |||
| 83f3ba462d | |||
| bc1822e46c | |||
| 5fd23727ef | |||
| 7dc6b998f1 | |||
| 30aad0e772 | |||
| 185f9aa958 | |||
| 2140dfe6a4 | |||
| 824d6617a9 | |||
| 076106f7cf | |||
| 12e545e2ad | |||
| 20a535dfb0 | |||
| b074beb99e | |||
| f66ff40a54 | |||
| 7421ead9b0 | |||
| 16c7ea4b82 | |||
| 1bdb92de88 | |||
| 1179d3c7ef | |||
| 28a2c87fcc | |||
| 192f906e62 | |||
| b2456e44ff | |||
| 930c78928b | |||
| ad41b9bb7b | |||
| 8dc67f9dc2 | |||
| eb459fa0b6 | |||
| 43ccc05a82 | |||
| 0b050f1b06 | |||
| 9d1ab29c15 | |||
| 222bfdf6cf | |||
| 1b112f0b08 | |||
| 18810b5c13 | |||
| 9612f08e46 | |||
| df61660a5e | |||
| 3898688d6d | |||
| a26a412c9b | |||
| d6384c3c59 | |||
| f6b70ae786 | |||
| 945226fff2 | |||
| cc5304b381 | |||
| 682e9a77f5 | |||
| 559b40d66a | |||
| 37a7a92730 | |||
| dc433e417f | |||
| ecec3d6d33 | |||
| 3bb82aae72 | |||
| 839a81ce4e | |||
| 51d1996bc3 | |||
| 0abda61954 | |||
| 26105748ff | |||
| 1be0aec808 | |||
| 1f65d7811b | |||
| ac6415eca6 | |||
| c2cb4b297d | |||
| a29b62cba2 | |||
| 294fdafc95 | |||
| a1e0328517 | |||
| 571c960ca0 | |||
| 5629a489b2 | |||
| 69c6a3d030 | |||
| 74b99beb7c | |||
| 1faf0928a8 | |||
| 076ca7d4a1 | |||
| 240b7792ee | |||
| 3df8924114 | |||
| 373ee81641 | |||
| 1a2b8c48be | |||
| af669438ff | |||
| 113fef5bc2 | |||
| 4ad0d8323c | |||
| 3a8ee7fa8e | |||
| dc79467679 | |||
| 7d553a2cd6 | |||
| 6a027b03f1 | |||
| 116efe4bd7 | |||
| 7533b3b67b | |||
| a529b5feab | |||
| 8e7da791e3 | |||
| 9ee53520e6 | |||
| 3c3b762d2a | |||
| 676a48f569 | |||
| f7571a261e | |||
| ee1ff9202d | |||
| fe81bdccfc | |||
| fe758adfb9 | |||
| afb546b7fe | |||
| 158237dfbf | |||
| f03421e8b6 | |||
| 2cf82cb428 | |||
| 1d994b13a7 | |||
| 3c7d1b19a5 | |||
| 0cc993a1b3 | |||
| 9465949e79 | |||
| 7292d60fbe | |||
| 545013d360 | |||
| 9b36b7c487 | |||
| 35b4bd8323 | |||
| 124353be9b | |||
| 802feba641 | |||
| b7db713c47 | |||
| ba809a99b7 | |||
| 892eb2c02a | |||
| 1c5f552459 | |||
| 1d805c8c34 | |||
| 95b42d97dd | |||
| 166a8ad64a | |||
| 3698db9e5b | |||
| a6728f2c45 | |||
| 754b4837a1 | |||
| 3ad61b8783 | |||
| 4f19572ba3 | |||
| e14d888841 | |||
| d7bd9d57b8 | |||
| c57c912c60 | |||
| 60522a6297 | |||
| 34e6806c07 | |||
| 192b6ccc03 | |||
| ff2231bb8c | |||
| cd19285f7f | |||
| 5fd8900cfc | |||
| 0af968811f | |||
| f554af1e21 | |||
| 27096e86c7 | |||
| ac1122e843 | |||
| 9be375d823 | |||
| 05af6c6ee5 | |||
| 2b45f7872c | |||
| 5fa6568835 | |||
| 262391a7f8 | |||
| 881ea0ffb4 | |||
| 7d9bd92065 | |||
| 3f0268f422 | |||
| a7ad2be142 | |||
| f538420ff1 | |||
| 11dea54e56 | |||
| 241aca27aa | |||
| b234c5c82a | |||
| 700917c809 | |||
| 3feb97f16f | |||
| 23f292d03b | |||
| 0aa64a5c9a | |||
| 7ef743fd21 | |||
| 586c5594aa | |||
| bb19c28dda | |||
| d7cd9203ca | |||
| be36cd4adb | |||
| 4c7aab3bc2 | |||
| 3fac7398ae | |||
| 397362f2f2 | |||
| 7b0adbb07a | |||
| 63bfa3d910 | |||
| 715c4623ac | |||
| 431963df32 | |||
| 657e2b3fff | |||
| 91aa8c2abd | |||
| ed0fd8b342 | |||
| 65bd954b17 | |||
| b23640cd69 | |||
| e0ff097d42 | |||
| b72b2dbb21 | |||
| a0ca69d67d | |||
| c9a7bad747 | |||
| 0c908d2bca | |||
| 8d2b42574b | |||
| cbab7eabd3 | |||
| 1543c2a7a3 | |||
| e7f07b16e6 | |||
| a03d728842 | |||
| 9370f6b511 | |||
| e231ac1c45 | |||
| 9df4f68b42 | |||
| 15c0ed2cda | |||
| 31de27a249 |
@@ -1 +1 @@
|
||||
{"schemaVersion":1,"label":"e2e tests","message":"786 passed","color":"brightgreen"}
|
||||
{"schemaVersion":1,"label":"e2e tests","message":"821 passed","color":"brightgreen"}
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"schemaVersion":1,"label":"frontend coverage","message":"35.38%","color":"red"}
|
||||
{"schemaVersion":1,"label":"frontend coverage","message":"36.64%","color":"red"}
|
||||
|
||||
@@ -209,6 +209,7 @@
|
||||
"escapeHtml": "readonly",
|
||||
"exports": "readonly",
|
||||
"favStar": "readonly",
|
||||
"fetchAllNodes": "readonly",
|
||||
"filterPacketsByRoute": "readonly",
|
||||
"formatAbsoluteTimestamp": "readonly",
|
||||
"formatChartAxisLabel": "readonly",
|
||||
|
||||
@@ -3,7 +3,6 @@ name: CI/CD Pipeline
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
tags: ['v*']
|
||||
pull_request:
|
||||
branches: [master]
|
||||
workflow_dispatch:
|
||||
@@ -57,7 +56,7 @@ jobs:
|
||||
go build .
|
||||
# -race gates PR #1208's atomic.Pointer migration: the race-detector
|
||||
# is what makes path_inspect_atomic_race_test.go actually assert.
|
||||
go test -race -coverprofile=server-coverage.out ./... 2>&1 | tee server-test.log
|
||||
go test -timeout 15m -race -coverprofile=server-coverage.out ./... 2>&1 | tee server-test.log
|
||||
echo "--- Go Server Coverage ---"
|
||||
go tool cover -func=server-coverage.out | tail -1
|
||||
|
||||
@@ -66,7 +65,7 @@ jobs:
|
||||
set -e -o pipefail
|
||||
cd cmd/ingestor
|
||||
go build .
|
||||
go test -coverprofile=ingestor-coverage.out ./... 2>&1 | tee ingestor-test.log
|
||||
go test -timeout 15m -coverprofile=ingestor-coverage.out ./... 2>&1 | tee ingestor-test.log
|
||||
echo "--- Go Ingestor Coverage ---"
|
||||
go tool cover -func=ingestor-coverage.out | tail -1
|
||||
|
||||
@@ -84,6 +83,9 @@ jobs:
|
||||
- name: Verify Dockerfile COPY invariants (issue #1316)
|
||||
run: bash scripts/check-dockerfile-internal-pkgs.sh
|
||||
|
||||
- name: Staging disk-monitor unit tests (issue #1684)
|
||||
run: bash scripts/staging/test-disk-monitor.sh
|
||||
|
||||
- name: Lint CSS variables (issue #1128)
|
||||
run: |
|
||||
set -e
|
||||
@@ -95,7 +97,10 @@ jobs:
|
||||
set -e
|
||||
node test-packet-filter.js
|
||||
node test-packet-filter-time.js
|
||||
node test-confidence-indicator.js
|
||||
node test-1659-analytics-warmup.js
|
||||
node test-channels-merge-1498-unit.js
|
||||
node test-issue-1518-home-url.js
|
||||
node test-channel-decrypt-insecure-context.js
|
||||
node test-live-region-filter.js
|
||||
node test-issue-1136-observer-iata-map.js
|
||||
@@ -116,6 +121,8 @@ jobs:
|
||||
node test-issue-1364-pill-no-clamp.js
|
||||
node test-issue-1375-scope-stats-fetch.js
|
||||
node test-issue-1361-cb-presets.js
|
||||
node test-issue-1380-cb-sim-overlay.js
|
||||
node test-issue-1380-cb-reset-button.js
|
||||
node test-issue-1407-cb-preset-propagation.js
|
||||
node test-issue-1412-customizer-no-override.js
|
||||
node test-issue-1418-raw-hex-extraction.js
|
||||
@@ -125,10 +132,26 @@ jobs:
|
||||
node test-issue-1418-deeplink-hops-channels.js
|
||||
node test-issue-1418-polish-review.js
|
||||
node test-issue-1420-tile-providers.js
|
||||
node test-issue-1614-tile-url-function.js
|
||||
node test-issue-1438-marker-css-vars.js
|
||||
node test-issue-1562-observers-summary.js
|
||||
node test-issue-1509-nav-active-bg.js
|
||||
node test-issue-1509-detect-preset.js
|
||||
node test-live.js
|
||||
node test-issue-1107-live-layout.js
|
||||
node test-issue-1532-live-fullscreen.js
|
||||
node test-issue-1619-feed-detail-card-draggable.js
|
||||
node test-xss-escape-sinks.js
|
||||
node test-preflight-xss-gate.js
|
||||
node test-traces.js
|
||||
node test-issue-1648-m4-emoji-scan.js
|
||||
node test-issue-1668-m3-typography.js
|
||||
node test-mqtt-status-panel.js
|
||||
node test-issue-1697-mqtt-mobile-e2e.js
|
||||
node test-warmup-banner.js
|
||||
node test-issue-1633-hide-1byte-hops.js
|
||||
node test-issue-1668-m4-per-route.js
|
||||
node test-a11y-axe-1668-selftest.js
|
||||
|
||||
- name: 🛡️ Preflight XSS gate — actual --diff check (PR only)
|
||||
# The fixture self-test above (test-preflight-xss-gate.js) only
|
||||
@@ -340,11 +363,18 @@ jobs:
|
||||
- name: Run Playwright E2E tests (fail-fast)
|
||||
run: |
|
||||
BASE_URL=http://localhost:13581 node test-e2e-playwright.js 2>&1 | tee e2e-output.txt
|
||||
# M5 of #1668 — axe-core CI gate (color-contrast AA).
|
||||
# Real browser run; fails on any net violation (raw − allowlist).
|
||||
# Allowlist: tests/a11y-allowlist.yaml (0 entries at M5 baseline).
|
||||
BASE_URL=http://localhost:13581 AXE_SCREENSHOT_DIR=/tmp/axe-1668 \
|
||||
node test-a11y-axe-1668.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-filter-ux-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-channel-issue-1087-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-channel-issue-1111-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-map-modal-fluid-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-map-nodes-pagination-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-observer-iata-1188-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1639-observers-sort-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-nav-fluid-1055-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-nav-priority-1102-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-nav-priority-1311-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
@@ -361,6 +391,7 @@ jobs:
|
||||
BASE_URL=http://localhost:13581 node test-table-fluid-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-charts-fluid-1058-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-slideover-1056-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1692-packets-init-parallel-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-slideover-1168-munger-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-logo-pulse-1173-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1122-packets-filter-ux-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
@@ -384,6 +415,13 @@ jobs:
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1206-vcr-overlap-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1244-live-vcr-row-hints-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1510-live-nav-pin-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-live-fullscreen-1572-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1599-replay-freeze-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1648-m1-icons-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1648-m2-icons-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1648-m3-icons-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1648-m4-icons-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1657-analytics-channels-group-sprites-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1224-channels-mobile-ux-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1367-channels-chat-app-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1236-map-mobile-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
@@ -405,6 +443,7 @@ jobs:
|
||||
BASE_URL=http://localhost:13581 node test-customize-display-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-customize-export-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-drag-manager-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1567-corner-clears-drag-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1306-collisions-terminology-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1374-route-map-a11y-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-channels-list-render-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
@@ -414,6 +453,28 @@ jobs:
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-channels-ws-batch-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-channels-ws-race-1498-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1487-byop-modal-layout-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1630-reach-mobile-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1640-compare-discovery-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
|
||||
# #1616: slide-over focus-restore flake-gate. Runs the slide-over
|
||||
# E2E 3 consecutive times against the SAME backend instance so
|
||||
# the Chromium-headless focus race documented in #1172/#1616 has
|
||||
# a 3× shot at firing. Any single non-zero exit aborts. This is
|
||||
# the architectural-fix gate — if it ever turns red post-merge,
|
||||
# the focused-but-hidden state has crept back in.
|
||||
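#
# NOTE(review): "any single non-zero exit aborts" holds only if pipefail
# is in effect — the step runs `set -e` but pipes node into `tee`, so
# without `-o pipefail` the pipeline status is tee's. Confirm that the
# job's shell enables pipefail.
#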
|
||||
# PERMANENT step. Adds ~3-4 min to the e2e-test job in exchange
|
||||
# for closing out a flake family that was blocking ~8 unrelated
|
||||
# PRs at a time. If profiling pressures the budget later, drop
|
||||
# repeat count first; do not delete.
|
||||
- name: Slide-over E2E flake-gate (#1616, --repeat-each=3)
|
||||
run: |
|
||||
set -e
|
||||
for i in $(seq 1 3); do
|
||||
echo "--- slide-over E2E run $i/20 ---"
|
||||
BASE_URL=http://localhost:13581 node test-slideover-1056-e2e.js 2>&1 | tee -a slideover-repeat-output.txt
|
||||
done
|
||||
echo "3 passed"
|
||||
|
||||
- name: Collect frontend coverage (parallel)
|
||||
if: success() && github.event_name == 'push'
|
||||
|
||||
@@ -0,0 +1,111 @@
|
||||
name: Release Fast-Path
|
||||
|
||||
# Issue #1677: re-tag :edge as :vX.Y.Z when the tag SHA matches :edge's
|
||||
# org.opencontainers.image.revision label. Skips ~30 min of Go test +
|
||||
# Playwright + Docker rebuild because the bytes are identical — only the
|
||||
# manifest name changes. Falls back to deploy.yml when SHAs differ so
|
||||
# tags on older commits still go through full validation.
|
||||
#
|
||||
# This workflow is the SOLE consumer of push.tags. deploy.yml's tag
|
||||
# trigger has been removed to prevent double-fire.
|
||||
|
||||
on:
|
||||
push:
|
||||
tags: ['v[0-9]+.[0-9]+.[0-9]+']
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
concurrency:
|
||||
group: release-fast-path-${{ github.ref }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
retag-or-fallback:
|
||||
name: "🏷️ Re-tag :edge → :vX.Y.Z (fast) or dispatch deploy.yml (fallback)"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Log in to GHCR
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Install crane
|
||||
uses: imjasonh/setup-crane@v0.4
|
||||
|
||||
- name: Parse semver from tag
|
||||
id: semver
|
||||
run: |
|
||||
set -euo pipefail
|
||||
TAG="${GITHUB_REF#refs/tags/}"
|
||||
# Expect vMAJOR.MINOR.PATCH (workflow trigger already enforces this).
|
||||
if [[ ! "$TAG" =~ ^v([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then
|
||||
echo "Tag $TAG does not match vMAJOR.MINOR.PATCH" >&2
|
||||
exit 1
|
||||
fi
|
||||
MAJOR="${BASH_REMATCH[1]}"
|
||||
MINOR="${BASH_REMATCH[2]}"
|
||||
{
|
||||
echo "tag=$TAG"
|
||||
echo "vMajor=v$MAJOR"
|
||||
echo "vMajorMinor=v$MAJOR.$MINOR"
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
echo "Parsed: $TAG → v$MAJOR / v$MAJOR.$MINOR / $TAG"
|
||||
|
||||
- name: Inspect :edge revision label
|
||||
id: edge
|
||||
run: |
|
||||
set -euo pipefail
|
||||
IMAGE="ghcr.io/kpa-clawbot/corescope"
|
||||
EDGE_REF="${IMAGE}:edge"
|
||||
# crane config returns the OCI image config JSON; the revision label
|
||||
# is set by docker/metadata-action on the master-edge build.
|
||||
# If :edge doesn't exist yet (first run on a fresh registry), fall
|
||||
# through to the slow path.
|
||||
if ! CONFIG="$(crane config "$EDGE_REF" 2>/dev/null)"; then
|
||||
echo "edge_revision=" >> "$GITHUB_OUTPUT"
|
||||
echo "no_edge=true" >> "$GITHUB_OUTPUT"
|
||||
echo ":edge not found in registry — will use fallback path"
|
||||
exit 0
|
||||
fi
|
||||
REV="$(echo "$CONFIG" | jq -r '.config.Labels["org.opencontainers.image.revision"] // ""')"
|
||||
echo "edge_revision=$REV" >> "$GITHUB_OUTPUT"
|
||||
echo "no_edge=false" >> "$GITHUB_OUTPUT"
|
||||
echo ":edge org.opencontainers.image.revision = $REV"
|
||||
echo "tag SHA (github.sha) = ${{ github.sha }}"
|
||||
|
||||
# ─────────── FAST PATH: SHAs match, metadata-only retag ───────────
|
||||
- name: Re-tag :edge → :vX.Y.Z + :vX.Y + :vX + :latest (fast path)
|
||||
if: steps.edge.outputs.no_edge == 'false' && steps.edge.outputs.edge_revision == github.sha
|
||||
run: |
|
||||
set -euo pipefail
|
||||
IMAGE="ghcr.io/kpa-clawbot/corescope"
|
||||
SRC="${IMAGE}:edge"
|
||||
echo "SHA match — fast-path re-tag from $SRC"
|
||||
for NEW_TAG in \
|
||||
"${{ steps.semver.outputs.tag }}" \
|
||||
"${{ steps.semver.outputs.vMajorMinor }}" \
|
||||
"${{ steps.semver.outputs.vMajor }}" \
|
||||
"latest"; do
|
||||
echo " crane tag $SRC $NEW_TAG"
|
||||
crane tag "$SRC" "$NEW_TAG"
|
||||
done
|
||||
echo "Fast-path complete — all tags point at the :edge manifest digest."
|
||||
|
||||
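# ─────────── FALLBACK: SHAs differ, run the full pipeline ───────────
# NOTE(review): `gh workflow run` calls the workflow-dispatch API, which
# requires `actions: write` on GITHUB_TOKEN. This workflow's `permissions:`
# block grants only `contents: read` and `packages: write`, and unlisted
# scopes default to none — confirm and add `actions: write`.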
|
||||
- name: Dispatch full deploy.yml pipeline (fallback)
|
||||
if: steps.edge.outputs.no_edge == 'true' || steps.edge.outputs.edge_revision != github.sha
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
echo "SHA mismatch (or no :edge) — falling back to full pipeline"
|
||||
echo " :edge revision = '${{ steps.edge.outputs.edge_revision }}'"
|
||||
echo " tag SHA = '${{ github.sha }}'"
|
||||
gh workflow run deploy.yml \
|
||||
--repo "${{ github.repository }}" \
|
||||
--ref "${{ github.ref }}"
|
||||
echo "Dispatched deploy.yml against ${{ github.ref }}"
|
||||
+26
-1
@@ -2,7 +2,32 @@
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### 📝 Documentation Corrections
|
||||
## [3.9.1] — 2026-06-12
|
||||
|
||||
Patch release on top of v3.9.0 — v3.9.0's container image was never published (a Playwright flake gated the Docker build). See [docs/release-notes/v3.9.1.md](docs/release-notes/v3.9.1.md).
|
||||
|
||||
### 🎨 Accessibility
|
||||
- **WCAG AA contrast pass** (#1676, f0addfda) — two-tier CSS palette; muted-text ≥4.5:1 in both themes; unknown-repeater chip fixed (2.75:1 → 4.95:1). Closes #1671. Partial fix for #1668.
|
||||
|
||||
### 🧪 Test stability
|
||||
- **Slideover E2E flake fix** (#1663+followups, f06359d7) — tightened selectors, bumped data-row wait. Fixes #1662.
|
||||
|
||||
## [3.9.0] — 2026-06-12
|
||||
|
||||
See [docs/release-notes/v3.9.0.md](docs/release-notes/v3.9.0.md) for the full notes. 257 commits since v3.8.3 (72 substantive + 185 coverage bumps).
|
||||
|
||||
### ✨ Highlights
|
||||
- **Relay timelines survive an ingestor restart** (#1643) — relay-hop attribution is rebuilt from `path_json` on cold load.
|
||||
- **Observer Compare is first-class** (#1642, #1645, #1647) — three new entry points + Tufte-grade compare page with state-preserving multi-select.
|
||||
- **Emoji → Phosphor icon migration** (#1648, #1649–#1654) — every UI emoji replaced with theme-tinted Phosphor sprites, lint-gated.
|
||||
- **Per-node Reach page + API** (#1627) — `GET /api/nodes/{pubkey}/reach` with cache invalidation on blacklist changes (#1636).
|
||||
- **Hashtag channels catalogue integration** (#1656) — public hashtag channels appear without manual config.
|
||||
- **Operator-customizable name-prefix hiding** (#1655) — new `hiddenNamePrefixes` config (default `["🚫"]`).
|
||||
|
||||
### ⚙️ Config
|
||||
- New: `hiddenNamePrefixes`, `liveMap.maxNodes`, `runtime.maxMemoryMB`, configurable observer-health thresholds, `branding.homeUrl`, customizer disabled-tabs.
|
||||
|
||||
### 📝 Documentation Corrections (carried from prior [Unreleased])
|
||||
- **PR #1324 historical record correction** (#1387) — the merged PR #1324 body referenced four tests that do NOT exist in master: `TestMultibyteCapPersistRoundTrip`, `TestMultibyteCapPersistSkipsUnknown`, `TestMaybePersistCoalesces`, and a `TryLock` coalescing test. The actual tests that landed are `TestRunMultibyteCapPersist_AppliesSnapshot` and `TestRunMultibyteCapPersist_NoSnapshot_NoOp`. See issue #1386 for the corrective test additions (round-trip, unknown-key skip, coalescing).
|
||||
|
||||
## [3.7.2] — 2026-05-06
|
||||
|
||||
@@ -129,3 +129,98 @@ docker compose pull && docker compose up -d
|
||||
| `./manage.sh setup` | Copy `docker-compose.example.yml`, edit env vars |
|
||||
|
||||
`manage.sh` remains available for advanced use cases (building from source, custom patches, development). Pre-built images are recommended for most production deployments.
|
||||
|
||||
## Staging VM — disk-usage monitor & cleanup (#1684)
|
||||
|
||||
The staging VM ran out of disk during a hot-patch (#1684). To prevent
|
||||
repeats, two scripts live in `scripts/staging/`:
|
||||
|
||||
- `disk-monitor.sh <mount>` — reads `df -P`, classifies usage against
|
||||
`<80 ok / >=80 warn / >=90 error / >=95 alert`, emits to stderr +
|
||||
journald (via `logger`). Returns non-zero on `error|alert` so
|
||||
systemd surfaces the unit as failed.
|
||||
- `disk-cleanup.sh` — removes `/tmp` snapshot files (`*.db`,
|
||||
`staging-snap.*`, `cs-*`, `node-compile-cache`) older than 7 days
|
||||
and runs `docker builder prune` + `docker image prune` with
|
||||
`--filter "until=72h" --filter "label!=keep"`. Set
|
||||
`CORESCOPE_CLEANUP_DRY_RUN=1` to log without deleting.
|
||||
|
||||
### Install on the staging host
|
||||
|
||||
SSH to `<STAGING_HOST>` as the staging operator user and:
|
||||
|
||||
```bash
|
||||
sudo install -m 0755 scripts/staging/disk-monitor.sh /usr/local/bin/corescope-disk-monitor
|
||||
sudo install -m 0755 scripts/staging/disk-cleanup.sh /usr/local/bin/corescope-disk-cleanup
|
||||
|
||||
# 15-minute monitor
|
||||
sudo tee /etc/systemd/system/corescope-disk-monitor.service >/dev/null <<'UNIT'
|
||||
[Unit]
|
||||
Description=CoreScope staging disk-usage monitor (issue #1684)
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/corescope-disk-monitor /
|
||||
UNIT
|
||||
|
||||
sudo tee /etc/systemd/system/corescope-disk-monitor.timer >/dev/null <<'UNIT'
|
||||
[Unit]
|
||||
Description=Run CoreScope disk-usage monitor every 15 minutes
|
||||
[Timer]
|
||||
OnBootSec=5min
|
||||
OnUnitActiveSec=15min
|
||||
Unit=corescope-disk-monitor.service
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
UNIT
|
||||
|
||||
# Daily cleanup at 03:30 local
|
||||
sudo tee /etc/systemd/system/corescope-disk-cleanup.service >/dev/null <<'UNIT'
|
||||
[Unit]
|
||||
Description=CoreScope staging disk cleanup (issue #1684)
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/corescope-disk-cleanup
|
||||
UNIT
|
||||
|
||||
sudo tee /etc/systemd/system/corescope-disk-cleanup.timer >/dev/null <<'UNIT'
|
||||
[Unit]
|
||||
Description=Run CoreScope disk cleanup daily at off-peak
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 03:30:00
|
||||
Persistent=true
|
||||
Unit=corescope-disk-cleanup.service
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
UNIT
|
||||
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now corescope-disk-monitor.timer corescope-disk-cleanup.timer
|
||||
```
|
||||
|
||||
`<STAGING_HOST>` is the staging VM hostname/IP — operator supplies it,
|
||||
not committed to the repo.
|
||||
|
||||
### Inspecting alerts
|
||||
|
||||
```bash
|
||||
journalctl -t corescope-disk-monitor --since '-1d'
|
||||
journalctl -t corescope-disk-cleanup --since '-7d'
|
||||
systemctl list-timers | grep corescope-disk
|
||||
```
|
||||
|
||||
`logger` priorities map: `ok→info`, `warn→warning`, `error→err`,
|
||||
`alert→alert` (syslog severity 1, second only to `emerg`). Wire
|
||||
`journalctl -p alert ...` to whatever ops channel the operator
|
||||
prefers; use `-p err` to also catch the `error` tier.
|
||||
|
||||
### Notes on `staging-snap.db` root cause (#1684 phase 3)
|
||||
|
||||
`grep -rn staging-snap.db cmd/ public/ scripts/` returns **zero**
|
||||
hits in the repo. The 4.4 GB orphan was a manual debugging artifact,
|
||||
not produced by any committed code. The `disk-cleanup.sh` retention
|
||||
rule (anything matching `staging-snap.*` in `/tmp` older than 7 days)
|
||||
prevents recurrence without needing source-side TTL changes.
|
||||
|
||||
If a future feature legitimately needs persistent snapshot DBs, put
|
||||
them under `/var/lib/corescope/snapshots/` with explicit rotation —
|
||||
not in `/tmp`, which is ephemeral by definition.
|
||||
|
||||
@@ -21,6 +21,7 @@ The Go backend serves all 40+ API endpoints from an in-memory packet store with
|
||||
| Memory (56K packets) | **~300 MB** (vs 1.3 GB on Node.js) |
|
||||
| WebSocket broadcast | **Real-time** to all connected browsers |
|
||||
| Channel decryption | **AES-128-ECB** with rainbow table |
|
||||
| GOMEMLIMIT (memory-constrained hosts) | **set to ≥1.5× working set** (e.g. 1536 MiB on a 2 GB Pi for a ~1 GB store). Lower values trigger a GC death-spiral. Configure via the `GOMEMLIMIT` env var or `runtime.maxMemoryMB` in `config.json`; env wins. Applies to both server and ingestor. See [#1010](https://github.com/Kpa-clawbot/CoreScope/issues/1010). |
|
||||
|
||||
See [PERFORMANCE.md](PERFORMANCE.md) for full benchmarks.
|
||||
|
||||
|
||||
@@ -53,6 +53,7 @@ type Config struct {
|
||||
HashRegions []string `json:"hashRegions,omitempty"`
|
||||
Retention *RetentionConfig `json:"retention,omitempty"`
|
||||
Metrics *MetricsConfig `json:"metrics,omitempty"`
|
||||
Runtime *RuntimeConfig `json:"runtime,omitempty"`
|
||||
GeoFilter *GeoFilterConfig `json:"geo_filter,omitempty"`
|
||||
ForeignAdverts *ForeignAdvertConfig `json:"foreignAdverts,omitempty"`
|
||||
ValidateSignatures *bool `json:"validateSignatures,omitempty"`
|
||||
@@ -80,6 +81,12 @@ type Config struct {
|
||||
// NeighborEdgesMaxAgeDays controls neighbor_edges row retention
|
||||
// (#1287 — moved from cmd/server). 0 = default 5.
|
||||
NeighborEdgesMaxAgeDays int `json:"neighborEdgesMaxAgeDays,omitempty"`
|
||||
|
||||
// IngestBufferSize caps the in-memory queue (number of MQTT messages) held
|
||||
// while the single SQLite writer is blocked by startup migrations/prunes
|
||||
// (#1608). Received messages are drained once the write path is ready.
|
||||
// 0 / unset => default. Bounded memory.
|
||||
IngestBufferSize int `json:"ingestBufferSize,omitempty"`
|
||||
}
|
||||
|
||||
// NeighborEdgesDaysOrDefault returns the configured pruning window or 5.
|
||||
@@ -90,6 +97,17 @@ func (c *Config) NeighborEdgesDaysOrDefault() int {
|
||||
return c.NeighborEdgesMaxAgeDays
|
||||
}
|
||||
|
||||
// IngestBufferSizeOrDefault returns the ingest buffer capacity. Default 50000:
|
||||
// at typical mesh rates (~1-2 msg/s) that is many minutes of headroom while a
|
||||
// startup migration holds the writer; each queued item is a small closure, so
|
||||
// worst-case memory stays in the tens of MB.
|
||||
func (c *Config) IngestBufferSizeOrDefault() int {
|
||||
if c.IngestBufferSize > 0 {
|
||||
return c.IngestBufferSize
|
||||
}
|
||||
return 50000
|
||||
}
|
||||
|
||||
// GeoFilterConfig is an alias for the shared geofilter.Config type.
|
||||
type GeoFilterConfig = geofilter.Config
|
||||
|
||||
@@ -134,6 +152,15 @@ type MetricsConfig struct {
|
||||
SampleIntervalSec int `json:"sampleIntervalSec"`
|
||||
}
|
||||
|
||||
// RuntimeConfig holds Go runtime tuning knobs (#1010).
|
||||
type RuntimeConfig struct {
|
||||
// MaxMemoryMB is the soft memory limit (GOMEMLIMIT) in MiB applied via
|
||||
// runtime/debug.SetMemoryLimit at startup. The GOMEMLIMIT environment
|
||||
// variable, when set, takes precedence over this value. 0/unset means
|
||||
// no limit is applied and default Go runtime behavior is preserved.
|
||||
MaxMemoryMB int `json:"maxMemoryMB"`
|
||||
}
|
||||
|
||||
// DBConfig is the shared SQLite vacuum/maintenance config (#919, #921).
|
||||
type DBConfig = dbconfig.DBConfig
|
||||
|
||||
|
||||
@@ -484,3 +484,15 @@ func TestLoadConfigWSSource(t *testing.T) {
|
||||
t.Errorf("ResolvedSources wss broker=%s, want unchanged", sources[1].Broker)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBufferSizeOrDefault(t *testing.T) {
|
||||
if got := (&Config{}).IngestBufferSizeOrDefault(); got != 50000 {
|
||||
t.Fatalf("default: want 50000, got %d", got)
|
||||
}
|
||||
if got := (&Config{IngestBufferSize: 10}).IngestBufferSizeOrDefault(); got != 10 {
|
||||
t.Fatalf("override: want 10, got %d", got)
|
||||
}
|
||||
if got := (&Config{IngestBufferSize: -5}).IngestBufferSizeOrDefault(); got != 50000 {
|
||||
t.Fatalf("invalid negative should fall back to default, got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
+449
-21
@@ -8,6 +8,7 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
@@ -70,6 +71,7 @@ type Store struct {
|
||||
stmtGetTxByHash *sql.Stmt
|
||||
stmtInsertTransmission *sql.Stmt
|
||||
stmtUpdateTxFirstSeen *sql.Stmt
|
||||
stmtBumpTxLastSeen *sql.Stmt
|
||||
stmtInsertObservation *sql.Stmt
|
||||
stmtUpsertNode *sql.Stmt
|
||||
stmtIncrementAdvertCount *sql.Stmt
|
||||
@@ -81,6 +83,16 @@ type Store struct {
|
||||
|
||||
sampleIntervalSec int
|
||||
backfillWg sync.WaitGroup
|
||||
|
||||
// prefixIdx holds the prefix → pubkey index used by the
|
||||
// resolved_path writer (#1547). Rebuilt on startup and once per
|
||||
// neighbor-edges builder tick (60s).
|
||||
prefixIdx prefixIdxHolder
|
||||
|
||||
// neighborGraph holds the in-memory NeighborGraph snapshot used
|
||||
// by the context-aware resolver (#1560). Rebuilt on startup and
|
||||
// once per neighbor-edges builder tick (60s).
|
||||
neighborGraph neighborGraphHolder
|
||||
}
|
||||
|
||||
// OpenStore opens or creates a SQLite DB at the given path, applying the
|
||||
@@ -146,6 +158,32 @@ func OpenStoreWithInterval(dbPath string, sampleIntervalSec int) (*Store, error)
|
||||
}
|
||||
}
|
||||
|
||||
// #1690: backfill transmissions.last_seen from MAX(observations.timestamp)
|
||||
// per transmission. The column is added inline by dbschema.Apply (cheap
|
||||
// metadata-only ALTER); the populate query is potentially expensive
|
||||
// (full obs scan + group) so we run it async. Subsequent observation
|
||||
// inserts maintain the column inline (see InsertTransmission below).
|
||||
// PREFLIGHT: async=true reason="full-table backfill JOIN (1.9M+ obs × 86k+ tx in prod) — must not block ingestor boot"
|
||||
if err := s.RunAsyncMigration(context.Background(), "tx_last_seen_backfill_v1",
|
||||
func(ctx context.Context, d *sql.DB) error {
|
||||
log.Println("[migration/async] Backfilling transmissions.last_seen from MAX(observations.timestamp)...")
|
||||
res, err := d.ExecContext(ctx, `
|
||||
UPDATE transmissions
|
||||
SET last_seen = COALESCE((
|
||||
SELECT MAX(timestamp) FROM observations WHERE transmission_id = transmissions.id
|
||||
), last_seen)
|
||||
WHERE last_seen = 0
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
log.Printf("[migration/async] transmissions.last_seen backfill complete: %d rows updated", n)
|
||||
return nil
|
||||
}); err != nil {
|
||||
log.Printf("[migration/async] scheduling tx_last_seen_backfill_v1 failed: %v", err)
|
||||
}
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
@@ -186,7 +224,9 @@ func applySchema(db *sql.DB) error {
|
||||
last_packet_at TEXT DEFAULT NULL,
|
||||
clock_skew_seconds INTEGER DEFAULT NULL,
|
||||
clock_skew_count_24h INTEGER DEFAULT 0,
|
||||
clock_last_naive_at TEXT DEFAULT NULL
|
||||
clock_last_naive_at TEXT DEFAULT NULL,
|
||||
can_relay INTEGER DEFAULT 1,
|
||||
can_relay_seen INTEGER DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_nodes_last_seen ON nodes(last_seen);
|
||||
@@ -218,6 +258,7 @@ func applySchema(db *sql.DB) error {
|
||||
payload_version INTEGER,
|
||||
decoded_json TEXT,
|
||||
from_pubkey TEXT,
|
||||
last_seen INTEGER NOT NULL DEFAULT 0,
|
||||
created_at TEXT DEFAULT (datetime('now'))
|
||||
);
|
||||
|
||||
@@ -226,6 +267,10 @@ func applySchema(db *sql.DB) error {
|
||||
CREATE INDEX IF NOT EXISTS idx_transmissions_payload_type ON transmissions(payload_type);
|
||||
-- idx_transmissions_from_pubkey is created by the from_pubkey_v1
|
||||
-- migration after the column is added on legacy DBs (#1143).
|
||||
-- idx_tx_last_seen is created by dbschema.Apply after ensuring
|
||||
-- the last_seen column exists (#1690) — keep it OUT of this base
|
||||
-- schema block so legacy DBs (table-exists, column-missing) don't
|
||||
-- trip on the CREATE INDEX before the ALTER runs.
|
||||
`
|
||||
if _, err := db.Exec(schema); err != nil {
|
||||
return fmt.Errorf("base schema: %w", err)
|
||||
@@ -668,8 +713,8 @@ func (s *Store) prepareStatements() error {
|
||||
}
|
||||
|
||||
s.stmtInsertTransmission, err = s.db.Prepare(`
|
||||
INSERT INTO transmissions (raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json, channel_hash, scope_name, from_pubkey)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO transmissions (raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json, channel_hash, scope_name, from_pubkey, last_seen)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -680,14 +725,29 @@ func (s *Store) prepareStatements() error {
|
||||
return err
|
||||
}
|
||||
|
||||
// #1690: bump transmissions.last_seen to MAX(current, ?) on every
|
||||
// observation insert so cold-load can filter on effective recency.
|
||||
// This is NOT a migration — it's the steady-state writer path. The
|
||||
// one-time backfill (BackfillPathJSONAsync-shaped) runs via
|
||||
// RunAsyncMigration above; this prepared-statement UPDATE is the
|
||||
// per-row maintenance that keeps the column current after the
|
||||
// backfill completes. Recorded in _migrations under
|
||||
// "tx_last_seen_backfill_v1".
|
||||
// PREFLIGHT: async=true reason="prepared-statement row-level UPDATE BY PRIMARY KEY (transmissions.id) — single-row touch per observation, indexed by PK, constant-time at any scale. Not a migration."
|
||||
s.stmtBumpTxLastSeen, err = s.db.Prepare("UPDATE transmissions SET last_seen = ? WHERE id = ? AND last_seen < ?")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
s.stmtInsertObservation, err = s.db.Prepare(`
|
||||
INSERT INTO observations (transmission_id, observer_idx, direction, snr, rssi, score, path_json, timestamp, raw_hex)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO observations (transmission_id, observer_idx, direction, snr, rssi, score, path_json, timestamp, raw_hex, resolved_path)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(transmission_id, observer_idx, COALESCE(path_json, '')) DO UPDATE SET
|
||||
snr = COALESCE(excluded.snr, snr),
|
||||
rssi = COALESCE(excluded.rssi, rssi),
|
||||
score = COALESCE(excluded.score, score),
|
||||
raw_hex = COALESCE(excluded.raw_hex, raw_hex)
|
||||
snr = COALESCE(excluded.snr, snr),
|
||||
rssi = COALESCE(excluded.rssi, rssi),
|
||||
score = COALESCE(excluded.score, score),
|
||||
raw_hex = COALESCE(excluded.raw_hex, raw_hex),
|
||||
resolved_path = COALESCE(excluded.resolved_path, resolved_path)
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -715,8 +775,8 @@ func (s *Store) prepareStatements() error {
|
||||
}
|
||||
|
||||
s.stmtUpsertObserver, err = s.db.Prepare(`
|
||||
INSERT INTO observers (id, name, iata, last_seen, first_seen, packet_count, model, firmware, client_version, radio, battery_mv, uptime_secs, noise_floor)
|
||||
VALUES (?, ?, ?, ?, ?, 1, ?, ?, ?, ?, ?, ?, ?)
|
||||
INSERT INTO observers (id, name, iata, last_seen, first_seen, packet_count, model, firmware, client_version, radio, battery_mv, uptime_secs, noise_floor, can_relay, can_relay_seen)
|
||||
VALUES (?, ?, ?, ?, ?, 1, ?, ?, ?, ?, ?, ?, ?, COALESCE(?, 1), CASE WHEN ? IS NULL THEN 0 ELSE 1 END)
|
||||
ON CONFLICT(id) DO UPDATE SET
|
||||
name = COALESCE(?, name),
|
||||
iata = COALESCE(?, iata),
|
||||
@@ -728,7 +788,9 @@ func (s *Store) prepareStatements() error {
|
||||
radio = COALESCE(?, radio),
|
||||
battery_mv = COALESCE(?, battery_mv),
|
||||
uptime_secs = COALESCE(?, uptime_secs),
|
||||
noise_floor = COALESCE(?, noise_floor)
|
||||
noise_floor = COALESCE(?, noise_floor),
|
||||
can_relay = COALESCE(?, can_relay),
|
||||
can_relay_seen = CASE WHEN ? IS NULL THEN can_relay_seen ELSE 1 END
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -780,6 +842,21 @@ func (s *Store) InsertTransmission(data *PacketData) (bool, error) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Wait/hold instrumentation (#1340). The hot path uses prepared
|
||||
// statements that auto-commit; gate the whole function under
|
||||
// writerMu so concurrent mqtt_handler inserts queue behind any
|
||||
// other writer (vacuum, prune, neighbor-builder) and the wait is
|
||||
// Go-visible.
|
||||
mqttWaitStart := time.Now()
|
||||
writerMu.Lock()
|
||||
mqttWait := time.Since(mqttWaitStart)
|
||||
mqttHoldStart := time.Now()
|
||||
defer func() {
|
||||
mqttHold := time.Since(mqttHoldStart)
|
||||
writerMu.Unlock()
|
||||
recordWriterTiming("mqtt_handler", mqttWait, mqttHold, "InsertTransmission")
|
||||
}()
|
||||
|
||||
rxTime := data.Timestamp
|
||||
ingestNow := time.Now().UTC().Format(time.RFC3339)
|
||||
if rxTime == "" {
|
||||
@@ -808,6 +885,7 @@ func (s *Store) InsertTransmission(data *PacketData) (bool, error) {
|
||||
data.DecodedJSON, nilIfEmpty(data.ChannelHash),
|
||||
scopeNameForDB(data),
|
||||
nilIfEmpty(data.FromPubkey),
|
||||
epochSecondsForLastSeen(rxTime),
|
||||
)
|
||||
if err != nil {
|
||||
s.Stats.WriteErrors.Add(1)
|
||||
@@ -842,16 +920,37 @@ func (s *Store) InsertTransmission(data *PacketData) (bool, error) {
|
||||
epochTs = t.Unix()
|
||||
}
|
||||
|
||||
// Resolve hop prefixes to full pubkeys for `observations.resolved_path`.
|
||||
// Per #1547: this writer was lost in the #1289 refactor and lives in
|
||||
// the ingestor now. Per #1560: use the context-aware resolver so
|
||||
// 1-byte prefix collisions are disambiguated via NeighborGraph
|
||||
// adjacency (anchored on from_pubkey for ADVERTs, previous hop
|
||||
// otherwise). Empty resolved JSON → NULL via nilIfEmpty.
|
||||
resolved := resolvePathWithContext(
|
||||
parsePathArray(data.PathJSON),
|
||||
strings.ToLower(data.FromPubkey),
|
||||
s.neighborGraph.load(),
|
||||
s.prefixIdx.load(),
|
||||
)
|
||||
resolvedJSON := marshalResolvedPath(resolved)
|
||||
|
||||
_, err = s.stmtInsertObservation.Exec(
|
||||
txID, observerIdx, data.Direction,
|
||||
data.SNR, data.RSSI, data.Score,
|
||||
data.PathJSON, epochTs, nilIfEmpty(data.RawHex),
|
||||
nilIfEmpty(resolvedJSON),
|
||||
)
|
||||
if err != nil {
|
||||
s.Stats.WriteErrors.Add(1)
|
||||
log.Printf("[db] observation insert (non-fatal): %v", err)
|
||||
} else {
|
||||
s.Stats.ObservationsInserted.Add(1)
|
||||
// #1690: bump transmissions.last_seen so cold-load can filter on
|
||||
// effective recency. Conditional `last_seen < ?` so we never go
|
||||
// backwards on out-of-order ingest.
|
||||
if _, err := s.stmtBumpTxLastSeen.Exec(epochTs, txID, epochTs); err != nil {
|
||||
log.Printf("[db] tx last_seen bump (non-fatal): %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Each prepared-stmt Exec auto-commits. Count one WAL commit per
|
||||
@@ -931,6 +1030,13 @@ type ObserverMeta struct {
|
||||
RecvErrors *int // cumulative CRC/decode failures since boot
|
||||
PacketsSent *int // cumulative packets sent since boot
|
||||
PacketsRecv *int // cumulative packets received since boot
|
||||
// CanRelay reflects the firmware 1.16 /status `repeat` flag (#1290).
|
||||
// nil means the firmware did not send the field — caller must
|
||||
// preserve the existing observers.can_relay value (default 1).
|
||||
// true → relay-capable (`repeat:on`); false → listener-only
|
||||
// (`repeat:off`), which causes the server-side disambiguator to
|
||||
// exclude this observer's pubkey from path-hop candidate sets.
|
||||
CanRelay *bool
|
||||
}
|
||||
|
||||
// UpsertObserver inserts or updates an observer using the current wall-clock
|
||||
@@ -953,7 +1059,7 @@ func (s *Store) UpsertObserverAt(id, name, iata string, meta *ObserverMeta, last
|
||||
normalizedIATA := strings.TrimSpace(strings.ToUpper(iata))
|
||||
|
||||
var model, firmware, clientVersion, radio interface{}
|
||||
var batteryMv, uptimeSecs, noiseFloor interface{}
|
||||
var batteryMv, uptimeSecs, noiseFloor, canRelay interface{}
|
||||
if meta != nil {
|
||||
if meta.Model != nil {
|
||||
model = *meta.Model
|
||||
@@ -976,11 +1082,22 @@ func (s *Store) UpsertObserverAt(id, name, iata string, meta *ObserverMeta, last
|
||||
if meta.NoiseFloor != nil {
|
||||
noiseFloor = *meta.NoiseFloor
|
||||
}
|
||||
// Issue #1290: nil → leave DB column unchanged (COALESCE in
|
||||
// the prepared stmt); 0/1 written when firmware provided
|
||||
// the `repeat` field. INSERT branch defaults to 1 via the
|
||||
// COALESCE in the VALUES clause.
|
||||
if meta.CanRelay != nil {
|
||||
if *meta.CanRelay {
|
||||
canRelay = 1
|
||||
} else {
|
||||
canRelay = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_, err := s.stmtUpsertObserver.Exec(
|
||||
id, name, normalizedIATA, lastSeen, lastSeen, model, firmware, clientVersion, radio, batteryMv, uptimeSecs, noiseFloor,
|
||||
name, normalizedIATA, ingestNow, lastSeen, model, firmware, clientVersion, radio, batteryMv, uptimeSecs, noiseFloor,
|
||||
id, name, normalizedIATA, lastSeen, lastSeen, model, firmware, clientVersion, radio, batteryMv, uptimeSecs, noiseFloor, canRelay, canRelay,
|
||||
name, normalizedIATA, ingestNow, lastSeen, model, firmware, clientVersion, radio, batteryMv, uptimeSecs, noiseFloor, canRelay, canRelay,
|
||||
)
|
||||
if err != nil {
|
||||
s.Stats.WriteErrors.Add(1)
|
||||
@@ -1062,7 +1179,8 @@ func (s *Store) InsertMetrics(data *MetricsData) error {
|
||||
// PruneOldMetrics deletes observer_metrics rows older than retentionDays.
|
||||
func (s *Store) PruneOldMetrics(retentionDays int) (int64, error) {
|
||||
cutoff := time.Now().UTC().AddDate(0, 0, -retentionDays).Format(time.RFC3339)
|
||||
result, err := s.db.Exec(`DELETE FROM observer_metrics WHERE timestamp < ?`, cutoff)
|
||||
// Tagged for /api/perf writer-lock visibility (#1340).
|
||||
result, err := s.instrumentedExec("prune_metrics", `DELETE FROM observer_metrics WHERE timestamp < ?`, cutoff)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("prune metrics: %w", err)
|
||||
}
|
||||
@@ -1103,11 +1221,11 @@ func (s *Store) CheckAutoVacuum(cfg *Config) {
|
||||
log.Printf("[db] vacuumOnStartup=true — starting one-time full VACUUM (ensure 2x DB size free disk space)...")
|
||||
start := time.Now()
|
||||
|
||||
if _, err := s.db.Exec("PRAGMA auto_vacuum = INCREMENTAL"); err != nil {
|
||||
if _, err := s.instrumentedExec("vacuum", "PRAGMA auto_vacuum = INCREMENTAL"); err != nil {
|
||||
log.Printf("[db] VACUUM failed: could not set auto_vacuum: %v", err)
|
||||
return
|
||||
}
|
||||
if _, err := s.db.Exec("VACUUM"); err != nil {
|
||||
if _, err := s.instrumentedExec("vacuum", "VACUUM"); err != nil {
|
||||
log.Printf("[db] VACUUM failed: %v", err)
|
||||
return
|
||||
}
|
||||
@@ -1120,7 +1238,8 @@ func (s *Store) CheckAutoVacuum(cfg *Config) {
|
||||
// RunIncrementalVacuum returns free pages to the OS (#919).
|
||||
// Safe to call on auto_vacuum=NONE databases (noop).
|
||||
func (s *Store) RunIncrementalVacuum(pages int) {
|
||||
if _, err := s.db.Exec(fmt.Sprintf("PRAGMA incremental_vacuum(%d)", pages)); err != nil {
|
||||
// Tagged for /api/perf writer-lock visibility (#1340).
|
||||
if _, err := s.instrumentedExec("vacuum", fmt.Sprintf("PRAGMA incremental_vacuum(%d)", pages)); err != nil {
|
||||
log.Printf("[vacuum] incremental_vacuum error: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -1335,14 +1454,15 @@ func (s *Store) RemoveStaleObservers(observerDays int) (int64, error) {
|
||||
return 0, nil // keep forever
|
||||
}
|
||||
cutoff := time.Now().UTC().AddDate(0, 0, -observerDays).Format(time.RFC3339)
|
||||
result, err := s.db.Exec(`UPDATE observers SET inactive = 1 WHERE last_seen < ? AND (inactive IS NULL OR inactive = 0)`, cutoff)
|
||||
// Tagged for /api/perf writer-lock visibility (#1340).
|
||||
result, err := s.instrumentedExec("prune_observers", `UPDATE observers SET inactive = 1 WHERE last_seen < ? AND (inactive IS NULL OR inactive = 0)`, cutoff)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("mark stale observers inactive: %w", err)
|
||||
}
|
||||
removed, _ := result.RowsAffected()
|
||||
if removed > 0 {
|
||||
// Clean up orphaned metrics for now-inactive observers
|
||||
s.db.Exec(`DELETE FROM observer_metrics WHERE observer_id IN (SELECT id FROM observers WHERE inactive = 1)`)
|
||||
_, _ = s.instrumentedExec("prune_observers", `DELETE FROM observer_metrics WHERE observer_id IN (SELECT id FROM observers WHERE inactive = 1)`)
|
||||
log.Printf("Marked %d observer(s) as inactive (not seen in %d days)", removed, observerDays)
|
||||
}
|
||||
return removed, nil
|
||||
@@ -1437,7 +1557,15 @@ func scopeNameForDB(data *PacketData) *string {
|
||||
// node. Skips the UPDATE when the stored value already matches to avoid
|
||||
// redundant writes on the hot MQTT ingest path. Updates both nodes and
|
||||
// inactive_nodes to stay consistent.
|
||||
//
|
||||
// Defense-in-depth (#1534): an empty scope is treated as a no-op. The call
|
||||
// site at handleMessage is the primary guard (shouldUpdateDefaultScope),
|
||||
// but this layer refuses the invalid write so a future caller cannot
|
||||
// reintroduce the bug by passing "" directly.
|
||||
func (s *Store) UpdateNodeDefaultScope(pubkey, scope string) error {
|
||||
if scope == "" {
|
||||
return nil
|
||||
}
|
||||
// Short-circuit: skip if already stored.
|
||||
var cur sql.NullString
|
||||
row := s.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = ?`, pubkey)
|
||||
@@ -1574,3 +1702,303 @@ func BuildPacketData(msg *MQTTPacketMessage, decoded *DecodedPacket, observerID,
|
||||
|
||||
return pd
|
||||
}
|
||||
|
||||
|
||||
// ─── Writer-lock instrumentation (issue #1340) ────────────────────────────
|
||||
//
|
||||
// Make SQLite writer-lock starvation visible to operators. Per-component
|
||||
// wait_ms / hold_ms / contention_total histograms, surfaced via
|
||||
// /api/perf/write-sources under the "writer_perf" key. Component tags:
|
||||
// neighbor_builder, mqtt_handler, prune_packets, prune_observers,
|
||||
// prune_metrics, mbcap_persist (deferred — see PR body), vacuum.
|
||||
//
|
||||
// The single writer connection (SetMaxOpenConns(1)) means writes serialise
|
||||
// inside the driver and the wait is invisible to Go. writerMu measures the
|
||||
// wait Go can see (everyone queueing behind the current holder) by gating
|
||||
// every wrapped call site through the same package-level mutex.
|
||||
|
||||
// WriterStatsSnapshot is one component's view of writer-lock latency,
// exposed through /api/perf so operators can see SQLite writer-lock
// starvation (issue #1340). Every *Ms field is in milliseconds.
type WriterStatsSnapshot struct {
	// Sample counters.
	Count           int64 `json:"count"`
	ContentionTotal int64 `json:"contention_total"`

	// Time spent waiting to acquire the writer.
	WaitMsP50 float64 `json:"wait_ms_p50"`
	WaitMsP95 float64 `json:"wait_ms_p95"`
	WaitMsP99 float64 `json:"wait_ms_p99"`
	WaitMsMax float64 `json:"wait_ms_max"`

	// Time spent holding the writer.
	HoldMsP50 float64 `json:"hold_ms_p50"`
	HoldMsP95 float64 `json:"hold_ms_p95"`
	HoldMsP99 float64 `json:"hold_ms_p99"`
	HoldMsMax float64 `json:"hold_ms_max"`
}
|
||||
|
||||
const (
	// writerSampleWindow bounds the per-component rolling window so a
	// long-running ingestor doesn't grow this unbounded.
	writerSampleWindow = 1024
	// contentionThresholdMs: wait_ms above this counts as a "contended"
	// write (per #1340 spec).
	contentionThresholdMs = 100.0
	// defaultSlowWriterMs is the [db-slow-writer] hold_ms threshold used
	// when no override has been stored.
	defaultSlowWriterMs = 500.0
)

// slowWriterThresholdMsAtomic — hold_ms threshold above which writes
// emit a [db-slow-writer] log line. Read on the hot path; written once
// at startup by SetSlowWriterThresholdMs. The value is stored as whole
// milliseconds; 0 is the "never set" sentinel that the getter maps to
// defaultSlowWriterMs.
var slowWriterThresholdMsAtomic atomic.Uint64

// SetSlowWriterThresholdMs sets the [db-slow-writer] log threshold.
// ms<=0 (or NaN) restores the 500ms default. Operators can also set
// CORESCOPE_DB_SLOW_WRITER_MS at process start — see initSlowWriterFromEnv.
//
// The threshold is stored at whole-millisecond granularity (fractions
// are truncated). Positive values below 1ms are raised to 1ms: truncating
// them would store 0, which the getter reads as "unset" and would turn a
// very tight threshold into the 500ms default. Very large values
// (including +Inf) are clamped so the float→uint64 conversion stays in
// range.
func SetSlowWriterThresholdMs(ms float64) {
	// Largest value stored; exactly representable as both float64 and uint64.
	const maxSlowWriterMs = 1 << 53

	switch {
	case !(ms > 0): // covers ms <= 0 and NaN
		ms = defaultSlowWriterMs
	case ms < 1:
		ms = 1
	case ms > maxSlowWriterMs:
		ms = maxSlowWriterMs
	}
	slowWriterThresholdMsAtomic.Store(uint64(ms))
}

// getSlowWriterThresholdMs returns the current [db-slow-writer]
// threshold in milliseconds, or defaultSlowWriterMs when nothing has
// been stored yet.
func getSlowWriterThresholdMs() float64 {
	v := slowWriterThresholdMsAtomic.Load()
	if v == 0 {
		return defaultSlowWriterMs
	}
	return float64(v)
}
|
||||
|
||||
// initSlowWriterFromEnv is called once from package init so operators can
|
||||
// override the threshold via CORESCOPE_DB_SLOW_WRITER_MS without a
|
||||
// Go-side Config change.
|
||||
func initSlowWriterFromEnv() {
|
||||
v := os.Getenv("CORESCOPE_DB_SLOW_WRITER_MS")
|
||||
if v == "" {
|
||||
return
|
||||
}
|
||||
var ms float64
|
||||
if _, err := fmt.Sscanf(v, "%f", &ms); err == nil && ms > 0 {
|
||||
SetSlowWriterThresholdMs(ms)
|
||||
}
|
||||
}
|
||||
|
||||
func init() { initSlowWriterFromEnv() }
|
||||
|
||||
// writerComponentStats accumulates the wait/hold samples for a single
// writer component. All fields are guarded by mu.
type writerComponentStats struct {
	mu sync.Mutex

	// Running totals over every recorded sample.
	count           int64
	contentionTotal int64

	// Most recent samples, in milliseconds, oldest first.
	waitMs []float64
	holdMs []float64

	// Largest wait/hold seen so far, in milliseconds.
	waitMax float64
	holdMax float64
}
|
||||
|
||||
func (c *writerComponentStats) record(waitMs, holdMs float64) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
c.count++
|
||||
if waitMs > contentionThresholdMs {
|
||||
c.contentionTotal++
|
||||
}
|
||||
if waitMs > c.waitMax {
|
||||
c.waitMax = waitMs
|
||||
}
|
||||
if holdMs > c.holdMax {
|
||||
c.holdMax = holdMs
|
||||
}
|
||||
c.waitMs = appendBoundedFloat(c.waitMs, waitMs, writerSampleWindow)
|
||||
c.holdMs = appendBoundedFloat(c.holdMs, holdMs, writerSampleWindow)
|
||||
}
|
||||
|
||||
// appendBoundedFloat appends v to s while keeping at most max elements.
// While s is shorter than max it simply appends. Once the window is full
// it shifts every element one slot towards the front in place (dropping
// the oldest) and writes v into the last slot, so the backing array is
// reused rather than grown.
//
// A non-positive max describes a window that can hold nothing: the
// result is s truncated to length zero. Without this guard the shift
// below would slice s[1:] on an empty slice and panic.
func appendBoundedFloat(s []float64, v float64, max int) []float64 {
	if max <= 0 {
		return s[:0]
	}
	if len(s) < max {
		return append(s, v)
	}
	copy(s, s[1:])
	s[len(s)-1] = v
	return s
}
|
||||
|
||||
func (c *writerComponentStats) snapshot() WriterStatsSnapshot {
|
||||
c.mu.Lock()
|
||||
wait := append([]float64(nil), c.waitMs...)
|
||||
hold := append([]float64(nil), c.holdMs...)
|
||||
snap := WriterStatsSnapshot{
|
||||
Count: c.count,
|
||||
ContentionTotal: c.contentionTotal,
|
||||
WaitMsMax: c.waitMax,
|
||||
HoldMsMax: c.holdMax,
|
||||
}
|
||||
c.mu.Unlock()
|
||||
sort.Float64s(wait)
|
||||
sort.Float64s(hold)
|
||||
snap.WaitMsP50 = nearestRankPercentile(wait, 0.50)
|
||||
snap.WaitMsP95 = nearestRankPercentile(wait, 0.95)
|
||||
snap.WaitMsP99 = nearestRankPercentile(wait, 0.99)
|
||||
snap.HoldMsP50 = nearestRankPercentile(hold, 0.50)
|
||||
snap.HoldMsP95 = nearestRankPercentile(hold, 0.95)
|
||||
snap.HoldMsP99 = nearestRankPercentile(hold, 0.99)
|
||||
return snap
|
||||
}
|
||||
|
||||
// nearestRankPercentile picks the element of an ascending-sorted slice
// at fraction p (0..1). The index is p*(len-1) rounded half-up and then
// clamped to the valid range. An empty slice yields 0.
func nearestRankPercentile(sorted []float64, p float64) float64 {
	last := len(sorted) - 1
	switch {
	case last < 0:
		return 0
	case last == 0:
		return sorted[0]
	}

	pos := int(p*float64(last) + 0.5)
	switch {
	case pos < 0:
		pos = 0
	case pos > last:
		pos = last
	}
	return sorted[pos]
}
|
||||
|
||||
type writerStatsAggregator struct {
|
||||
mu sync.Mutex
|
||||
components map[string]*writerComponentStats
|
||||
}
|
||||
|
||||
var writerStatsAgg = &writerStatsAggregator{
|
||||
components: make(map[string]*writerComponentStats),
|
||||
}
|
||||
|
||||
func (a *writerStatsAggregator) get(component string) *writerComponentStats {
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
c, ok := a.components[component]
|
||||
if !ok {
|
||||
c = &writerComponentStats{}
|
||||
a.components[component] = c
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// reset clears all per-component samples. Test-only: lets a single
|
||||
// scenario assert against a clean aggregator without prior-test noise
|
||||
// in the same package run (TestWriterStarvationVisibleInPerf would
|
||||
// otherwise mix this run's 5 starved samples with thousands of fast
|
||||
// InsertTransmission samples from earlier tests and the p99 would
|
||||
// collapse below the 50s threshold).
|
||||
func (a *writerStatsAggregator) reset() {
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
a.components = make(map[string]*writerComponentStats)
|
||||
}
|
||||
|
||||
// ResetWriterStatsForTest wipes the per-component writer stats
|
||||
// aggregator. Test-only; not safe to call from production code paths.
|
||||
func ResetWriterStatsForTest() { writerStatsAgg.reset() }
|
||||
|
||||
func (a *writerStatsAggregator) snapshot() map[string]WriterStatsSnapshot {
|
||||
a.mu.Lock()
|
||||
keys := make([]string, 0, len(a.components))
|
||||
stats := make([]*writerComponentStats, 0, len(a.components))
|
||||
for k, v := range a.components {
|
||||
keys = append(keys, k)
|
||||
stats = append(stats, v)
|
||||
}
|
||||
a.mu.Unlock()
|
||||
out := make(map[string]WriterStatsSnapshot, len(keys))
|
||||
for i, k := range keys {
|
||||
out[k] = stats[i].snapshot()
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// WriterStatsSnapshot returns a per-component wait/hold/contention
|
||||
// snapshot for exposure on /api/perf/write-sources (issue #1340).
|
||||
func (s *Store) WriterStatsSnapshot() map[string]WriterStatsSnapshot {
|
||||
return writerStatsAgg.snapshot()
|
||||
}
|
||||
|
||||
// recordWriterTiming aggregates a single sample under component and
|
||||
// emits [db-slow-writer] if hold_ms > configured threshold (default
|
||||
// 500ms). queryForLog is truncated to 200 chars.
|
||||
func recordWriterTiming(component string, wait, hold time.Duration, queryForLog string) {
|
||||
waitMs := float64(wait.Nanoseconds()) / 1e6
|
||||
holdMs := float64(hold.Nanoseconds()) / 1e6
|
||||
writerStatsAgg.get(component).record(waitMs, holdMs)
|
||||
if holdMs > getSlowWriterThresholdMs() {
|
||||
q := queryForLog
|
||||
if len(q) > 200 {
|
||||
q = q[:200]
|
||||
}
|
||||
log.Printf("[db-slow-writer] component=%s duration=%.1fms query=%s", component, holdMs, q)
|
||||
}
|
||||
}
|
||||
|
||||
// writerMu is the package-level lock every wrapped writer call takes
// before touching the database. The SQLite driver already serialises
// writes (SetMaxOpenConns(1)), but time spent queueing inside the driver
// cannot be observed from Go; queueing on writerMu instead makes that
// wait measurable, so the perf snapshot can attribute it to the caller
// that experienced it.
var writerMu sync.Mutex
|
||||
|
||||
// WriterExec wraps s.db.Exec with per-component wait/hold/contention
|
||||
// instrumentation (issue #1340).
|
||||
func (s *Store) WriterExec(component, query string, args ...interface{}) (sql.Result, error) {
|
||||
waitStart := time.Now()
|
||||
writerMu.Lock()
|
||||
wait := time.Since(waitStart)
|
||||
holdStart := time.Now()
|
||||
res, err := s.db.Exec(query, args...)
|
||||
hold := time.Since(holdStart)
|
||||
writerMu.Unlock()
|
||||
recordWriterTiming(component, wait, hold, query)
|
||||
return res, err
|
||||
}
|
||||
|
||||
// WriterTx wraps Begin → fn → Commit under component tagging.
|
||||
// hold_ms covers the whole tx so a slow body counts against its owner.
|
||||
func (s *Store) WriterTx(component string, fn func(*sql.Tx) error) error {
|
||||
waitStart := time.Now()
|
||||
writerMu.Lock()
|
||||
wait := time.Since(waitStart)
|
||||
holdStart := time.Now()
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
hold := time.Since(holdStart)
|
||||
writerMu.Unlock()
|
||||
recordWriterTiming(component, wait, hold, "BEGIN")
|
||||
return err
|
||||
}
|
||||
if err := fn(tx); err != nil {
|
||||
_ = tx.Rollback()
|
||||
hold := time.Since(holdStart)
|
||||
writerMu.Unlock()
|
||||
recordWriterTiming(component, wait, hold, "tx-body")
|
||||
return err
|
||||
}
|
||||
err = tx.Commit()
|
||||
hold := time.Since(holdStart)
|
||||
writerMu.Unlock()
|
||||
recordWriterTiming(component, wait, hold, "COMMIT")
|
||||
return err
|
||||
}
|
||||
|
||||
// Wrap helpers below tag existing call sites with the canonical
|
||||
// component names so the call sites read naturally. These keep the
|
||||
// instrumentation out of the hot-path business logic.
|
||||
|
||||
// instrumentedExec is the package-internal pass-through used by call
|
||||
// sites already inside db.go (PruneOldMetrics, RemoveStaleObservers,
|
||||
// vacuum). Equivalent to WriterExec, kept short for readability.
|
||||
func (s *Store) instrumentedExec(component, query string, args ...interface{}) (sql.Result, error) {
|
||||
return s.WriterExec(component, query, args...)
|
||||
}
|
||||
|
||||
// epochSecondsForLastSeen converts an RFC3339 timestamp into unix
// seconds for the denormalized transmissions.last_seen column (#1690).
// If the string does not parse, the current time is used instead so a
// brand-new row is never seeded with 0.
func epochSecondsForLastSeen(rfc3339 string) int64 {
	parsed, err := time.Parse(time.RFC3339, rfc3339)
	if err != nil {
		return time.Now().UTC().Unix()
	}
	return parsed.Unix()
}
|
||||
|
||||
@@ -2917,3 +2917,46 @@ func TestSchemaMultibyteSupColumns(t *testing.T) {
|
||||
}
|
||||
store2.Close()
|
||||
}
|
||||
|
||||
// TestUpdateNodeDefaultScope_EmptyScopeIsNoop is the DB-layer defense-in-depth
|
||||
// regression test for #1534. Even if the call-site guard at main.go:720 is
|
||||
// later removed or refactored, the DB function MUST refuse to overwrite a
|
||||
// previously-correct default_scope with the empty string. This is the
|
||||
// belt-and-braces guard recommended by adversarial review (MAJOR-2) and
|
||||
// dijkstra review (MINOR-2).
|
||||
func TestUpdateNodeDefaultScope_EmptyScopeIsNoop(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, default_scope) VALUES ('pk1', 'Node1', '#belgium')`); err != nil {
|
||||
t.Fatalf("insert node: %v", err)
|
||||
}
|
||||
if _, err := store.db.Exec(`INSERT INTO inactive_nodes (public_key, name, default_scope) VALUES ('pk1', 'Node1', '#belgium')`); err != nil {
|
||||
t.Fatalf("insert inactive node: %v", err)
|
||||
}
|
||||
|
||||
// Empty-scope call must be a silent no-op (return nil), NOT overwrite.
|
||||
if err := store.UpdateNodeDefaultScope("pk1", ""); err != nil {
|
||||
t.Fatalf("UpdateNodeDefaultScope(\"\") returned error: %v (want nil)", err)
|
||||
}
|
||||
|
||||
var got string
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = 'pk1'`).Scan(&got); err != nil {
|
||||
t.Fatalf("read nodes.default_scope: %v", err)
|
||||
}
|
||||
if got != "#belgium" {
|
||||
t.Errorf("nodes.default_scope after empty-scope call = %q, want #belgium (DB-layer guard missing — #1534)", got)
|
||||
}
|
||||
var gotInactive string
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM inactive_nodes WHERE public_key = 'pk1'`).Scan(&gotInactive); err != nil {
|
||||
t.Fatalf("read inactive_nodes.default_scope: %v", err)
|
||||
}
|
||||
if gotInactive != "#belgium" {
|
||||
t.Errorf("inactive_nodes.default_scope after empty-scope call = %q, want #belgium (DB-layer guard missing — #1534)", gotInactive)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,115 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestWriterStarvationVisibleInPerf reproduces the #1339 class of bug:
|
||||
// one component (neighbor_builder) holds the writer connection for an
|
||||
// extended period; a second component (mqtt_handler) firing concurrent
|
||||
// writes must show observable wait_ms in the perf snapshot.
|
||||
//
|
||||
// This is the gate test for issue #1340: SQLite write-lock instrumentation
|
||||
// per component. If the wait_ms percentile collapses to zero, the
|
||||
// observability gap remains and the regression class is invisible again.
|
||||
//
|
||||
// Runs ~60s — guarded by testing.Short() so fast unit-test passes can
|
||||
// skip it locally, but CI runs `go test ./...` without -short.
|
||||
func TestWriterStarvationVisibleInPerf(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping 60s starvation test in short mode")
|
||||
}
|
||||
|
||||
// Isolate from samples accumulated by earlier tests in the same
|
||||
// package run — without this the mqtt_handler component already
|
||||
// has ~thousand fast InsertTransmission samples and the 5 slow
|
||||
// follower samples can't move p99 above 50s.
|
||||
ResetWriterStatsForTest()
|
||||
|
||||
s, err := OpenStore(tempDBPath(t))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s.Close()
|
||||
|
||||
const blockDur = 60 * time.Second
|
||||
|
||||
// Blocker: acquire the writer via the wrapped Tx path, tag as
|
||||
// neighbor_builder, sleep 60s while holding the single conn,
|
||||
// then commit. This monopolises the writer for the duration.
|
||||
blockStarted := make(chan struct{})
|
||||
blockerDone := make(chan struct{})
|
||||
go func() {
|
||||
defer close(blockerDone)
|
||||
err := s.WriterTx("neighbor_builder", func(tx *sql.Tx) error {
|
||||
if _, err := tx.Exec(`UPDATE nodes SET name = name WHERE 0`); err != nil {
|
||||
return err
|
||||
}
|
||||
close(blockStarted)
|
||||
time.Sleep(blockDur)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("blocker tx: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for the blocker to be inside its transaction.
|
||||
<-blockStarted
|
||||
// Small safety margin so the blocker is firmly holding the conn.
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
// Now fire several mqtt_handler writes. Each will block on the
|
||||
// single writer connection until the blocker commits.
|
||||
const followers = 5
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(followers)
|
||||
for i := 0; i < followers; i++ {
|
||||
i := i
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
_, err := s.WriterExec(
|
||||
"mqtt_handler",
|
||||
`INSERT OR IGNORE INTO _migrations (name) VALUES (?)`,
|
||||
fmt.Sprintf("writer_starvation_test_%d", i),
|
||||
)
|
||||
if err != nil {
|
||||
t.Errorf("mqtt follower %d: %v", i, err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
<-blockerDone
|
||||
|
||||
snap := s.WriterStatsSnapshot()
|
||||
mqtt, ok := snap["mqtt_handler"]
|
||||
if !ok {
|
||||
t.Fatalf("no perf snapshot for mqtt_handler component (got components: %v)", componentKeys(snap))
|
||||
}
|
||||
if mqtt.Count < followers {
|
||||
t.Fatalf("expected at least %d mqtt_handler samples, got %d", followers, mqtt.Count)
|
||||
}
|
||||
// This is the gate assertion. With instrumentation present the
|
||||
// follower writes should each register ~60s of wait_ms; p99 must
|
||||
// be well above 50_000ms. With instrumentation missing or broken
|
||||
// the percentile collapses to zero and this fails — which is the
|
||||
// exact regression class #1340 is meant to prevent.
|
||||
if mqtt.WaitMsP99 <= 50_000 {
|
||||
t.Fatalf("mqtt_handler wait_ms p99 = %.1fms, want > 50000ms; "+
|
||||
"writer starvation is invisible to /api/perf — issue #1340 not fixed",
|
||||
mqtt.WaitMsP99)
|
||||
}
|
||||
}
|
||||
|
||||
func componentKeys(m map[string]WriterStatsSnapshot) []string {
|
||||
out := make([]string, 0, len(m))
|
||||
for k := range m {
|
||||
out = append(out, k)
|
||||
}
|
||||
return out
|
||||
}
|
||||
+48
-1
@@ -109,6 +109,15 @@ type Payload struct {
|
||||
MAC string `json:"mac,omitempty"`
|
||||
EncryptedData string `json:"encryptedData,omitempty"`
|
||||
ExtraHash string `json:"extraHash,omitempty"`
|
||||
// Extended ACK fields per firmware 1.16.0 (issue #1610) —
|
||||
// firmware/src/helpers/BaseChatMesh.cpp:218-234. ACK payloads grew from
|
||||
// always-4 bytes to 4/5/6 (4-byte truncated sha256 CRC, optional 1-byte
|
||||
// attempt counter, optional 1-byte RNG byte added in commit a130a95a).
|
||||
// AckLen is the wire payload length; AckAttempt/AckRand are surfaced
|
||||
// only when the sender included them (legacy 4-byte ACKs leave them nil).
|
||||
AckLen *int `json:"ackLen,omitempty"`
|
||||
AckAttempt *int `json:"ackAttempt,omitempty"`
|
||||
AckRand *int `json:"ackRand,omitempty"`
|
||||
PubKey string `json:"pubKey,omitempty"`
|
||||
Timestamp uint32 `json:"timestamp,omitempty"`
|
||||
TimestampISO string `json:"timestampISO,omitempty"`
|
||||
@@ -148,6 +157,12 @@ type Payload struct {
|
||||
InnerType *int `json:"innerType,omitempty"`
|
||||
InnerTypeName string `json:"innerTypeName,omitempty"`
|
||||
InnerAckCrc string `json:"innerAckCrc,omitempty"`
|
||||
// Extended ACK inner fields (issue #1610) — when the multipart inner
|
||||
// blob is a v1.16+ extended ACK (5 or 6 bytes after the byte0 header),
|
||||
// surface the same attempt/rand bytes as the top-level decoder.
|
||||
InnerAckLen *int `json:"innerAckLen,omitempty"`
|
||||
InnerAckAttempt *int `json:"innerAckAttempt,omitempty"`
|
||||
InnerAckRand *int `json:"innerAckRand,omitempty"`
|
||||
InnerPayload string `json:"innerPayload,omitempty"`
|
||||
// CONTROL (PAYLOAD_TYPE_CONTROL=0x0B) byte0 flags, per
|
||||
// firmware/src/Mesh.cpp:69 — byte0 high-bit marks zero-hop direct subset.
|
||||
@@ -266,10 +281,27 @@ func decodeAck(buf []byte) Payload {
|
||||
return Payload{Type: "ACK", Error: "too short", RawHex: hex.EncodeToString(buf)}
|
||||
}
|
||||
checksum := binary.LittleEndian.Uint32(buf[0:4])
|
||||
return Payload{
|
||||
ackLen := len(buf)
|
||||
if ackLen > 6 {
|
||||
ackLen = 6
|
||||
}
|
||||
p := Payload{
|
||||
Type: "ACK",
|
||||
ExtraHash: fmt.Sprintf("%08x", checksum),
|
||||
AckLen: &ackLen,
|
||||
}
|
||||
// Firmware 1.16.0 extended ACK (issue #1610): 5th byte is the attempt
|
||||
// counter (commit f6e6fdaa), 6th byte is a random byte added so identical
|
||||
// attempts still hash uniquely (commit a130a95a).
|
||||
if len(buf) >= 5 {
|
||||
attempt := int(buf[4])
|
||||
p.AckAttempt = &attempt
|
||||
}
|
||||
if len(buf) >= 6 {
|
||||
rnd := int(buf[5])
|
||||
p.AckRand = &rnd
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
func decodeAdvert(buf []byte, validateSignatures bool) Payload {
|
||||
@@ -664,6 +696,21 @@ func decodeMultipart(buf []byte) Payload {
|
||||
// to match decodeAck's extraHash convention.
|
||||
crc := binary.LittleEndian.Uint32(buf[1:5])
|
||||
p.InnerAckCrc = fmt.Sprintf("%08x", crc)
|
||||
// Firmware 1.16.0 extended ACK (issue #1610): inner ACK blob may be
|
||||
// 5 or 6 bytes (payload_len = 1 + ack_len) instead of always 4.
|
||||
ackLen := len(buf) - 1
|
||||
if ackLen > 6 {
|
||||
ackLen = 6
|
||||
}
|
||||
p.InnerAckLen = &ackLen
|
||||
if len(buf) >= 6 {
|
||||
attempt := int(buf[5])
|
||||
p.InnerAckAttempt = &attempt
|
||||
}
|
||||
if len(buf) >= 7 {
|
||||
rnd := int(buf[6])
|
||||
p.InnerAckRand = &rnd
|
||||
}
|
||||
} else if len(buf) > 1 {
|
||||
p.InnerPayload = hex.EncodeToString(buf[1:])
|
||||
}
|
||||
|
||||
@@ -0,0 +1,202 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
// IngestBuffer sits between MQTT message receipt and the DB write path
// (#1608).
//
// The ingestor has to subscribe to MQTT as soon as it boots, yet the single
// SQLite writer (#1283) may be occupied for minutes by a startup migration
// (e.g. a large CREATE INDEX) or a prune. Every QoS-0 packet arriving in that
// window would otherwise be lost. IngestBuffer parks incoming work in a
// bounded FIFO; one consumer goroutine drains it after Ready() is called,
// i.e. once the write path is free.
//
// Using exactly one consumer keeps the single-writer invariant: jobs execute
// one at a time, as they did under paho's in-order handler. Submit never
// blocks the MQTT delivery goroutine — when the queue is full the job is
// dropped and counted, which bounds memory. Because the original messages
// are replayed, buffering adds NO duplicates (unlike a QoS-1 broker queue).
type IngestBuffer struct {
	// jobs is the bounded FIFO of pending work.
	jobs chan func()

	// ready is closed by Ready(), stop by Stop(), and done by the consumer
	// goroutine when it returns.
	ready, stop, done chan struct{}

	// dropped counts jobs rejected because jobs was full.
	dropped atomic.Int64

	// The Once values make Start, Ready and Stop idempotent.
	startOnce sync.Once
	readyOnce sync.Once
	stopOnce  sync.Once

	// dropLogMu protects the time-based drop-log throttle state below
	// (PR #1623 round-1 fix to #1609 M1). Logging every drop during a
	// sustained stall could flood the log at MQTT inbound rate, so the
	// FIRST drop of a stall is always logged and later drops are
	// summarized at most once per second until the stall ends.
	dropLogMu      sync.Mutex
	stallActive    bool      // set on the first drop, cleared by the next successful Submit
	stallStart     time.Time // start of the current stall
	stallStartDrop int64     // dropped-counter value just before the stall began
	lastSummaryAt  time.Time // when the most recent drop line was written
}
|
||||
|
||||
// dropLogSummaryInterval bounds how often a summary line may be written
// while a stall persists. It is a variable rather than a constant so tests
// can shorten it.
var dropLogSummaryInterval = 1 * time.Second
|
||||
|
||||
// NewIngestBuffer returns a buffer holding up to capacity pending jobs.
|
||||
// Non-positive capacity is clamped to 1 and a WARN is logged so the
|
||||
// misconfiguration is visible (PR #1609 m2 — silent clamp hid bad
|
||||
// ingestBufferSize values).
|
||||
func NewIngestBuffer(capacity int) *IngestBuffer {
|
||||
if capacity < 1 {
|
||||
log.Printf("[ingest-buffer] WARN: requested capacity %d < 1, clamping to 1 — check ingestBufferSize config; default is 50000", capacity)
|
||||
capacity = 1
|
||||
}
|
||||
return &IngestBuffer{
|
||||
jobs: make(chan func(), capacity),
|
||||
ready: make(chan struct{}),
|
||||
stop: make(chan struct{}),
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Submit enqueues a job without blocking. If the buffer is full the job is
|
||||
// dropped and the dropped counter is incremented. Safe for concurrent callers.
|
||||
//
|
||||
// Ordering invariant: callers MUST call Start() before the first Submit().
|
||||
// Submit only enqueues — without a running consumer, jobs sit in the channel
|
||||
// and (once cap is reached) are silently dropped until Start()+Ready() run.
|
||||
//
|
||||
// Drop logging (PR #1623 round-1 fix to #1609 M1) uses a time-based
|
||||
// throttle to stay loud-on-stall-start without flooding under sustained
|
||||
// stalls:
|
||||
// - the FIRST drop of a stall logs immediately
|
||||
// - subsequent drops are summarized at most once per second
|
||||
// - when the next Submit succeeds, a "drained" recovery line is
|
||||
// emitted so operators can quantify the burst
|
||||
//
|
||||
// All log lines include the buffer capacity for operator triage.
|
||||
func (b *IngestBuffer) Submit(job func()) {
|
||||
select {
|
||||
case b.jobs <- job:
|
||||
b.maybeLogRecovery()
|
||||
default:
|
||||
n := b.dropped.Add(1)
|
||||
b.logDrop(n)
|
||||
}
|
||||
}
|
||||
|
||||
// logDrop emits a drop log line under the time-based throttle. The first
|
||||
// drop of a stall always logs; subsequent drops summarize at most once
|
||||
// per dropLogSummaryInterval.
|
||||
func (b *IngestBuffer) logDrop(n int64) {
|
||||
b.dropLogMu.Lock()
|
||||
defer b.dropLogMu.Unlock()
|
||||
now := time.Now()
|
||||
if !b.stallActive {
|
||||
b.stallActive = true
|
||||
b.stallStart = now
|
||||
b.stallStartDrop = n - 1 // last successful Submit -> this is the 1st drop of the stall
|
||||
b.lastSummaryAt = now
|
||||
log.Printf("[ingest-buffer] WARNING: buffer full (cap %d), dropped %d message(s) total — write path stalled, raise ingestBufferSize or investigate slow writer", cap(b.jobs), n)
|
||||
return
|
||||
}
|
||||
if now.Sub(b.lastSummaryAt) >= dropLogSummaryInterval {
|
||||
b.lastSummaryAt = now
|
||||
stallDrops := n - b.stallStartDrop
|
||||
log.Printf("[ingest-buffer] WARNING: buffer full (cap %d), %d drop(s) in current stall, %d total — write path still stalled", cap(b.jobs), stallDrops, n)
|
||||
}
|
||||
}
|
||||
|
||||
// maybeLogRecovery is called from the success branch of Submit. If a
|
||||
// stall was active, it logs a recovery line summarizing the burst and
|
||||
// clears the stall state.
|
||||
func (b *IngestBuffer) maybeLogRecovery() {
|
||||
b.dropLogMu.Lock()
|
||||
defer b.dropLogMu.Unlock()
|
||||
if !b.stallActive {
|
||||
return
|
||||
}
|
||||
stallDrops := b.dropped.Load() - b.stallStartDrop
|
||||
dur := time.Since(b.stallStart)
|
||||
log.Printf("[ingest-buffer] INFO: buffer drained, %d drop(s) over %s (cap %d) — write path recovered", stallDrops, dur.Round(time.Millisecond), cap(b.jobs))
|
||||
b.stallActive = false
|
||||
}
|
||||
|
||||
// Start launches the consumer goroutine. It blocks until Ready() is called
|
||||
// (or Stop() fires, whichever comes first), then drains buffered jobs and
|
||||
// runs newly-submitted ones serially, in FIFO order. Idempotent.
|
||||
//
|
||||
// Lifecycle: Stop() closes b.stop, which causes the consumer to exit via
|
||||
// the stop-select arm (after draining any queued jobs if Ready() had
|
||||
// already fired). The b.jobs channel is never closed — closing it would
|
||||
// race with concurrent Submit() callers and panic; instead jobs is
|
||||
// garbage-collected with the buffer once all references drop. Done() is
|
||||
// closed when the consumer goroutine returns.
|
||||
func (b *IngestBuffer) Start() {
|
||||
b.startOnce.Do(func() {
|
||||
go func() {
|
||||
defer close(b.done)
|
||||
select {
|
||||
case <-b.ready:
|
||||
case <-b.stop:
|
||||
// Stopped before Ready — exit immediately. Pending jobs
|
||||
// are discarded; the buffer was never authorized to drain.
|
||||
return
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case job := <-b.jobs:
|
||||
job()
|
||||
case <-b.stop:
|
||||
// Stop after Ready — drain whatever is queued so
|
||||
// shutdown is graceful, then exit. b.jobs is never
|
||||
// closed (see Start godoc), so a default-case
|
||||
// non-blocking receive is the correct drain idiom.
|
||||
for {
|
||||
select {
|
||||
case job := <-b.jobs:
|
||||
job()
|
||||
default:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
}
|
||||
|
||||
// Ready signals that the write path is available; the consumer begins
|
||||
// draining. Idempotent.
|
||||
//
|
||||
// Ordering invariant: Start() MUST have been called before Ready() takes
|
||||
// effect. Calling Ready() without a prior Start() simply closes the ready
|
||||
// channel — nothing drains until a later Start() runs its consumer goroutine.
|
||||
func (b *IngestBuffer) Ready() {
|
||||
b.readyOnce.Do(func() { close(b.ready) })
|
||||
}
|
||||
|
||||
// Dropped returns the number of jobs dropped due to a full buffer.
|
||||
func (b *IngestBuffer) Dropped() int64 { return b.dropped.Load() }
|
||||
|
||||
// Pending returns the current queue depth (best-effort; for observability).
|
||||
func (b *IngestBuffer) Pending() int { return len(b.jobs) }
|
||||
|
||||
// Stop signals the consumer goroutine to exit. Test-hygiene helper so unit
|
||||
// tests don't leak the goroutine that Start() spawns. Idempotent / safe to
|
||||
// call without a prior Start(). After Stop() the consumer exits and Done()
|
||||
// is closed.
|
||||
func (b *IngestBuffer) Stop() {
|
||||
b.stopOnce.Do(func() { close(b.stop) })
|
||||
}
|
||||
|
||||
// Done returns a channel that is closed after the consumer goroutine has
|
||||
// exited. If Start() was never called, Done() never closes.
|
||||
func (b *IngestBuffer) Done() <-chan struct{} {
|
||||
return b.done
|
||||
}
|
||||
@@ -0,0 +1,274 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestIngestBuffer_BuffersUntilReady(t *testing.T) {
|
||||
b := NewIngestBuffer(10)
|
||||
t.Cleanup(b.Stop)
|
||||
var ran atomic.Int64
|
||||
b.Start()
|
||||
for i := 0; i < 3; i++ {
|
||||
b.Submit(func() { ran.Add(1) })
|
||||
}
|
||||
time.Sleep(30 * time.Millisecond)
|
||||
if ran.Load() != 0 {
|
||||
t.Fatalf("jobs ran before Ready(): %d", ran.Load())
|
||||
}
|
||||
b.Ready()
|
||||
deadline := time.Now().Add(time.Second)
|
||||
for ran.Load() < 3 && time.Now().Before(deadline) {
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
}
|
||||
if ran.Load() != 3 {
|
||||
t.Fatalf("want 3 ran after Ready, got %d", ran.Load())
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_FIFOOrder(t *testing.T) {
|
||||
b := NewIngestBuffer(10)
|
||||
t.Cleanup(b.Stop)
|
||||
out := make(chan int, 5)
|
||||
b.Start()
|
||||
for i := 0; i < 5; i++ {
|
||||
i := i
|
||||
b.Submit(func() { out <- i })
|
||||
}
|
||||
b.Ready()
|
||||
for want := 0; want < 5; want++ {
|
||||
select {
|
||||
case got := <-out:
|
||||
if got != want {
|
||||
t.Fatalf("order: want %d got %d", want, got)
|
||||
}
|
||||
case <-time.After(time.Second):
|
||||
t.Fatalf("timeout waiting for job %d", want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_DropsWhenFull(t *testing.T) {
|
||||
b := NewIngestBuffer(2)
|
||||
t.Cleanup(b.Stop) // never Ready()'d -> nothing drains
|
||||
for i := 0; i < 5; i++ {
|
||||
b.Submit(func() {})
|
||||
}
|
||||
if got := b.Dropped(); got != 3 {
|
||||
t.Fatalf("want 3 dropped (cap 2, 5 submitted), got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_ProcessesAfterReady(t *testing.T) {
|
||||
b := NewIngestBuffer(10)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Start()
|
||||
b.Ready()
|
||||
done := make(chan struct{})
|
||||
b.Submit(func() { close(done) })
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("job submitted after Ready was not processed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_SerialExecution(t *testing.T) {
|
||||
b := NewIngestBuffer(50)
|
||||
t.Cleanup(b.Stop)
|
||||
var inFlight atomic.Int32
|
||||
var overlap atomic.Bool
|
||||
var wg sync.WaitGroup
|
||||
b.Start()
|
||||
const n = 20
|
||||
wg.Add(n)
|
||||
for i := 0; i < n; i++ {
|
||||
b.Submit(func() {
|
||||
if inFlight.Add(1) > 1 {
|
||||
overlap.Store(true)
|
||||
}
|
||||
time.Sleep(time.Millisecond)
|
||||
inFlight.Add(-1)
|
||||
wg.Done()
|
||||
})
|
||||
}
|
||||
b.Ready()
|
||||
wg.Wait()
|
||||
if overlap.Load() {
|
||||
t.Fatal("jobs overlapped — consumer is not serial (violates single-writer)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_ConcurrentSubmitSafe(t *testing.T) {
|
||||
b := NewIngestBuffer(20000)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Start()
|
||||
var wg sync.WaitGroup
|
||||
for g := 0; g < 8; g++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < 1000; i++ {
|
||||
b.Submit(func() {})
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
b.Ready()
|
||||
// Assertion is the absence of a race/panic; run under -race in CI.
|
||||
}
|
||||
|
||||
// TestIngestBuffer_StopUnblocksConsumer guards the consumer-goroutine leak
|
||||
// described in PR #1609 review m1: Start() blocks on <-b.ready forever if
|
||||
// Ready() is never called, leaking the goroutine in test runs. Stop() must
|
||||
// signal the consumer to exit cleanly without requiring Ready().
|
||||
func TestIngestBuffer_StopUnblocksConsumer(t *testing.T) {
|
||||
b := NewIngestBuffer(10)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Start()
|
||||
// Do NOT call Ready(). The consumer must exit purely because of Stop().
|
||||
b.Stop()
|
||||
select {
|
||||
case <-b.Done():
|
||||
// good — consumer goroutine returned
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("Stop() did not unblock the consumer goroutine within 1s (Done() never closed)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewIngestBuffer_WarnsOnSubOneClamp asserts that constructing the
|
||||
// buffer with a non-positive capacity emits a WARN log line. Silent
|
||||
// clamping (PR #1609 review m2) hid misconfigurations like
|
||||
// ingestBufferSize=-1 or 0-from-default-not-applied paths.
|
||||
func TestNewIngestBuffer_WarnsOnSubOneClamp(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
oldOut := log.Writer()
|
||||
oldFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(oldOut)
|
||||
log.SetFlags(oldFlags)
|
||||
})
|
||||
|
||||
b := NewIngestBuffer(0)
|
||||
t.Cleanup(b.Stop)
|
||||
|
||||
got := buf.String()
|
||||
if !strings.Contains(got, "WARN") || !strings.Contains(got, "ingest-buffer") {
|
||||
t.Fatalf("expected WARN log on sub-one clamp, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIngestBuffer_DropLogThrottle asserts the time-based throttle (PR
|
||||
// #1623 round-1 fix to #1609 M1): the FIRST drop of a stall logs
|
||||
// immediately (loud), then subsequent drops within the same stall are
|
||||
// rate-limited to at most one summary line per second, and a recovery
|
||||
// line is emitted when Submit succeeds again. This prevents log-flood
|
||||
// under sustained stalls (potentially hundreds of MB/min) while
|
||||
// preserving "loud the instant the stall starts".
|
||||
func TestIngestBuffer_DropLogThrottle(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
oldOut := log.Writer()
|
||||
oldFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(oldOut)
|
||||
log.SetFlags(oldFlags)
|
||||
})
|
||||
|
||||
b := NewIngestBuffer(2)
|
||||
t.Cleanup(b.Stop)
|
||||
// Fill to capacity (no Ready() — nothing drains).
|
||||
for i := 0; i < 2; i++ {
|
||||
b.Submit(func() {})
|
||||
}
|
||||
// 100 drops in tight loop (well under 1s).
|
||||
for i := 0; i < 100; i++ {
|
||||
b.Submit(func() {})
|
||||
}
|
||||
|
||||
got := buf.String()
|
||||
lines := strings.Count(got, "buffer full")
|
||||
if lines < 1 {
|
||||
t.Fatalf("expected the FIRST drop to log immediately; got 0 'buffer full' lines:\n%s", got)
|
||||
}
|
||||
if lines > 2 {
|
||||
t.Fatalf("expected at most 2 'buffer full' lines for 100 drops in <1s (first + at-most-one summary), got %d:\n%s", lines, got)
|
||||
}
|
||||
// Every line must include the capacity for operator triage.
|
||||
if !strings.Contains(got, "cap 2") {
|
||||
t.Fatalf("expected every drop log line to include 'cap 2', got:\n%s", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIngestBuffer_DropLogFirstAlwaysImmediate guards the "loud the
|
||||
// instant the stall starts" half of the throttle contract from PR
|
||||
// #1623: even a single drop must log immediately, not be silently
|
||||
// absorbed by the per-second summary window.
|
||||
func TestIngestBuffer_DropLogFirstAlwaysImmediate(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
oldOut := log.Writer()
|
||||
oldFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(oldOut)
|
||||
log.SetFlags(oldFlags)
|
||||
})
|
||||
|
||||
b := NewIngestBuffer(1)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Submit(func() {}) // fills cap=1
|
||||
b.Submit(func() {}) // first drop
|
||||
got := buf.String()
|
||||
if !strings.Contains(got, "buffer full") {
|
||||
t.Fatalf("expected FIRST drop to log immediately; got:\n%s", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIngestBuffer_DropLogRecoveryAfterDrain guards the recovery-line
|
||||
// half of the throttle contract: once Submit succeeds again after one
|
||||
// or more drops, a "recovered" / "drained" line must be emitted so
|
||||
// operators can quantify the burst (PR #1623).
|
||||
func TestIngestBuffer_DropLogRecoveryAfterDrain(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
oldOut := log.Writer()
|
||||
oldFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(oldOut)
|
||||
log.SetFlags(oldFlags)
|
||||
})
|
||||
|
||||
b := NewIngestBuffer(1)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Submit(func() {}) // fills cap=1
|
||||
for i := 0; i < 3; i++ {
|
||||
b.Submit(func() {}) // drops
|
||||
}
|
||||
// Drain: start consumer and Ready(), wait for queue to empty.
|
||||
b.Start()
|
||||
b.Ready()
|
||||
deadline := time.Now().Add(time.Second)
|
||||
for b.Pending() > 0 && time.Now().Before(deadline) {
|
||||
time.Sleep(2 * time.Millisecond)
|
||||
}
|
||||
// Now a successful Submit should trigger the recovery line.
|
||||
b.Submit(func() {})
|
||||
// Give the goroutine + log a moment.
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
got := buf.String()
|
||||
if !strings.Contains(got, "drained") && !strings.Contains(got, "recovered") {
|
||||
t.Fatalf("expected a 'drained'/'recovered' log line after stall ended; got:\n%s", got)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,134 @@
|
||||
package main
|
||||
|
||||
// Tests for issue #1610: firmware 1.16.0 extended ACK support.
|
||||
//
|
||||
// Wire vectors are synthetic, derived by hand from the firmware spec:
|
||||
// - Variable-length ACK on the wire:
|
||||
// firmware/src/Mesh.cpp:545-575 createAck/createMultiAck (commit f6e6fdaa)
|
||||
// - 5-byte ACK = 4-byte truncated sha256 CRC + 1-byte attempt counter:
|
||||
// firmware/src/helpers/BaseChatMesh.cpp:218-232 (commit f6e6fdaa)
|
||||
// - 6-byte ACK = 5-byte + 1-byte RNG (so identical attempts get unique hash):
|
||||
// firmware/src/helpers/BaseChatMesh.cpp:219-234 (commit a130a95a)
|
||||
// - Multipart ACK inner blob: firmware/src/Mesh.cpp:292-307 — byte0 then
|
||||
// ack bytes, payload_len = 1 + ack_len.
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// --- top-level ACK (decodeAck) ---
|
||||
|
||||
func TestDecodeAckLegacy4Byte(t *testing.T) {
|
||||
// Backwards-compat: 4-byte ACK leaves the new optional fields nil.
|
||||
buf := []byte{0xAA, 0xBB, 0xCC, 0xDD}
|
||||
p := decodeAck(buf)
|
||||
if p.ExtraHash != "ddccbbaa" {
|
||||
t.Errorf("extraHash=%q want ddccbbaa", p.ExtraHash)
|
||||
}
|
||||
if p.AckLen == nil || *p.AckLen != 4 {
|
||||
t.Errorf("ackLen=%v want 4", p.AckLen)
|
||||
}
|
||||
if p.AckAttempt != nil {
|
||||
t.Errorf("ackAttempt=%v want nil for legacy 4-byte ACK", *p.AckAttempt)
|
||||
}
|
||||
if p.AckRand != nil {
|
||||
t.Errorf("ackRand=%v want nil for legacy 4-byte ACK", *p.AckRand)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeAck5ByteExtended(t *testing.T) {
|
||||
// v1.16 sender (commit f6e6fdaa): 4-byte CRC + 1-byte attempt.
|
||||
buf := []byte{0xAA, 0xBB, 0xCC, 0xDD, 0x07}
|
||||
p := decodeAck(buf)
|
||||
if p.ExtraHash != "ddccbbaa" {
|
||||
t.Errorf("extraHash=%q want ddccbbaa", p.ExtraHash)
|
||||
}
|
||||
if p.AckLen == nil || *p.AckLen != 5 {
|
||||
t.Errorf("ackLen=%v want 5", p.AckLen)
|
||||
}
|
||||
if p.AckAttempt == nil || *p.AckAttempt != 7 {
|
||||
t.Errorf("ackAttempt=%v want 7", p.AckAttempt)
|
||||
}
|
||||
if p.AckRand != nil {
|
||||
t.Errorf("ackRand=%v want nil for 5-byte ACK", *p.AckRand)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeAck6ByteExtended(t *testing.T) {
|
||||
// v1.16 sender (commit a130a95a): 4-byte CRC + 1-byte attempt + 1-byte RNG.
|
||||
buf := []byte{0xAA, 0xBB, 0xCC, 0xDD, 0x02, 0x5A}
|
||||
p := decodeAck(buf)
|
||||
if p.ExtraHash != "ddccbbaa" {
|
||||
t.Errorf("extraHash=%q want ddccbbaa", p.ExtraHash)
|
||||
}
|
||||
if p.AckLen == nil || *p.AckLen != 6 {
|
||||
t.Errorf("ackLen=%v want 6", p.AckLen)
|
||||
}
|
||||
if p.AckAttempt == nil || *p.AckAttempt != 2 {
|
||||
t.Errorf("ackAttempt=%v want 2", p.AckAttempt)
|
||||
}
|
||||
if p.AckRand == nil || *p.AckRand != 0x5A {
|
||||
t.Errorf("ackRand=%v want 90", p.AckRand)
|
||||
}
|
||||
}
|
||||
|
||||
// --- multipart-with-ACK (decodeMultipart) ---
|
||||
|
||||
// buildMultipartAckByte0: remaining<<4 | PayloadACK (0x02).
|
||||
func buildMultipartAckByte0(remaining int) byte {
|
||||
return byte((remaining<<4)&0xF0) | byte(PayloadACK&0x0F)
|
||||
}
|
||||
|
||||
func TestDecodeMultipartAck4ByteLegacy(t *testing.T) {
|
||||
// Pre-1.16 inner ACK is 4 bytes → ackLen=4, attempt/rand nil.
|
||||
buf := []byte{buildMultipartAckByte0(3), 0xAA, 0xBB, 0xCC, 0xDD}
|
||||
p := decodeMultipart(buf)
|
||||
if p.InnerAckCrc != "ddccbbaa" {
|
||||
t.Errorf("innerAckCrc=%q want ddccbbaa", p.InnerAckCrc)
|
||||
}
|
||||
if p.InnerAckLen == nil || *p.InnerAckLen != 4 {
|
||||
t.Errorf("innerAckLen=%v want 4", p.InnerAckLen)
|
||||
}
|
||||
if p.InnerAckAttempt != nil {
|
||||
t.Errorf("innerAckAttempt=%v want nil", *p.InnerAckAttempt)
|
||||
}
|
||||
if p.InnerAckRand != nil {
|
||||
t.Errorf("innerAckRand=%v want nil", *p.InnerAckRand)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeMultipartAck5Byte(t *testing.T) {
|
||||
// v1.16: byte0 + 4-byte CRC + 1-byte attempt → payload_len = 6.
|
||||
buf := []byte{buildMultipartAckByte0(1), 0xAA, 0xBB, 0xCC, 0xDD, 0x09}
|
||||
p := decodeMultipart(buf)
|
||||
if p.InnerAckCrc != "ddccbbaa" {
|
||||
t.Errorf("innerAckCrc=%q want ddccbbaa", p.InnerAckCrc)
|
||||
}
|
||||
if p.InnerAckLen == nil || *p.InnerAckLen != 5 {
|
||||
t.Errorf("innerAckLen=%v want 5", p.InnerAckLen)
|
||||
}
|
||||
if p.InnerAckAttempt == nil || *p.InnerAckAttempt != 9 {
|
||||
t.Errorf("innerAckAttempt=%v want 9", p.InnerAckAttempt)
|
||||
}
|
||||
if p.InnerAckRand != nil {
|
||||
t.Errorf("innerAckRand=%v want nil for 5-byte inner ACK", *p.InnerAckRand)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeMultipartAck6Byte(t *testing.T) {
|
||||
// v1.16: byte0 + 4-byte CRC + 1-byte attempt + 1-byte RNG → payload_len = 7.
|
||||
buf := []byte{buildMultipartAckByte0(0), 0xAA, 0xBB, 0xCC, 0xDD, 0x04, 0xC3}
|
||||
p := decodeMultipart(buf)
|
||||
if p.InnerAckCrc != "ddccbbaa" {
|
||||
t.Errorf("innerAckCrc=%q want ddccbbaa", p.InnerAckCrc)
|
||||
}
|
||||
if p.InnerAckLen == nil || *p.InnerAckLen != 6 {
|
||||
t.Errorf("innerAckLen=%v want 6", p.InnerAckLen)
|
||||
}
|
||||
if p.InnerAckAttempt == nil || *p.InnerAckAttempt != 4 {
|
||||
t.Errorf("innerAckAttempt=%v want 4", p.InnerAckAttempt)
|
||||
}
|
||||
if p.InnerAckRand == nil || *p.InnerAckRand != 0xC3 {
|
||||
t.Errorf("innerAckRand=%v want 195", p.InnerAckRand)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
package main
|
||||
|
||||
// Test for issue #1690 — every observation insert must denormalize the
|
||||
// transmission's last_seen so cold-load can filter on effective recency.
|
||||
//
|
||||
// Setup: insert a transmission whose first/last seen are both 7 days ago.
|
||||
// Then insert a fresh observation against the same hash. Post-fix the
|
||||
// transmissions.last_seen column must reflect the new observation time.
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestIssue1690_LastSeenUpdatedOnObservation(t *testing.T) {
|
||||
s, err := OpenStore(tempDBPath(t))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s.Close()
|
||||
|
||||
hash := "abcdef1690cafebabe"
|
||||
weekAgo := time.Now().UTC().Add(-7 * 24 * time.Hour).Format(time.RFC3339)
|
||||
snr, rssi := 5.5, -100.0
|
||||
|
||||
first := &PacketData{
|
||||
RawHex: "0A00",
|
||||
Timestamp: weekAgo,
|
||||
ObserverID: "obs1",
|
||||
Hash: hash,
|
||||
RouteType: 2,
|
||||
PayloadType: 2,
|
||||
PayloadVersion: 0,
|
||||
PathJSON: "[]",
|
||||
DecodedJSON: `{"type":"TXT_MSG"}`,
|
||||
SNR: &snr,
|
||||
RSSI: &rssi,
|
||||
}
|
||||
if _, err := s.InsertTransmission(first); err != nil {
|
||||
t.Fatalf("seed insert: %v", err)
|
||||
}
|
||||
|
||||
// Sanity: confirm the seed last_seen is the 7d-ago time.
|
||||
var seededLastSeen int64
|
||||
if err := s.db.QueryRow(`SELECT COALESCE(last_seen, 0) FROM transmissions WHERE hash = ?`, hash).Scan(&seededLastSeen); err != nil {
|
||||
t.Fatalf("seed select last_seen: %v (column missing? post-fix must add it)", err)
|
||||
}
|
||||
weekAgoUnix, _ := time.Parse(time.RFC3339, weekAgo)
|
||||
if seededLastSeen != weekAgoUnix.Unix() {
|
||||
t.Logf("seed last_seen=%d expected %d (allowed for fresh column)", seededLastSeen, weekAgoUnix.Unix())
|
||||
}
|
||||
|
||||
// New observation: nowSec timestamp.
|
||||
nowSec := time.Now().UTC().Unix()
|
||||
nowStr := time.Unix(nowSec, 0).UTC().Format(time.RFC3339)
|
||||
second := &PacketData{
|
||||
RawHex: "0A00",
|
||||
Timestamp: nowStr,
|
||||
ObserverID: "obs2", // different observer → new observation row
|
||||
Hash: hash,
|
||||
RouteType: 2,
|
||||
PayloadType: 2,
|
||||
PayloadVersion: 0,
|
||||
PathJSON: "[]",
|
||||
DecodedJSON: `{"type":"TXT_MSG"}`,
|
||||
SNR: &snr,
|
||||
RSSI: &rssi,
|
||||
}
|
||||
if _, err := s.InsertTransmission(second); err != nil {
|
||||
t.Fatalf("second insert: %v", err)
|
||||
}
|
||||
|
||||
var ls int64
|
||||
if err := s.db.QueryRow(`SELECT last_seen FROM transmissions WHERE hash = ?`, hash).Scan(&ls); err != nil {
|
||||
t.Fatalf("post-insert select last_seen: %v", err)
|
||||
}
|
||||
// The post-fix writer must bump last_seen to at least the new observation's
|
||||
// epoch second. We allow ±2s slack for the unix-second round trip.
|
||||
if ls < nowSec-2 {
|
||||
t.Errorf("transmissions.last_seen=%d after fresh observation; expected ≥ %d (a recent unix-second). "+
|
||||
"Pre-fix the column is never updated on re-observation — the original cold-load bug (#1690).",
|
||||
ls, nowSec)
|
||||
}
|
||||
}
|
||||
+229
-133
@@ -51,6 +51,25 @@ func main() {
|
||||
log.Fatalf("config: %v", err)
|
||||
}
|
||||
|
||||
// Apply Go runtime soft memory limit (GOMEMLIMIT). See #1010.
|
||||
// Precedence: GOMEMLIMIT env > runtime.maxMemoryMB > unset (default).
|
||||
{
|
||||
_, envSet := os.LookupEnv("GOMEMLIMIT")
|
||||
runtimeMaxMB := 0
|
||||
if cfg.Runtime != nil {
|
||||
runtimeMaxMB = cfg.Runtime.MaxMemoryMB
|
||||
}
|
||||
limit, source := applyMemoryLimit(runtimeMaxMB, envSet)
|
||||
switch source {
|
||||
case "env":
|
||||
log.Printf("[memlimit] using GOMEMLIMIT from environment (%s)", os.Getenv("GOMEMLIMIT"))
|
||||
case "config":
|
||||
log.Printf("[memlimit] runtime.maxMemoryMB=%d → SetMemoryLimit(%d MiB)", runtimeMaxMB, limit/(1024*1024))
|
||||
default:
|
||||
log.Printf("[memlimit] unset → default (no soft memory limit; recommend setting GOMEMLIMIT or runtime.maxMemoryMB to ≥1.5× working set to avoid OOM-kill)")
|
||||
}
|
||||
}
|
||||
|
||||
sources := cfg.ResolvedSources()
|
||||
|
||||
store, err := OpenStoreWithInterval(cfg.DBPath, cfg.MetricsSampleInterval())
|
||||
@@ -75,6 +94,160 @@ func main() {
|
||||
// Check auto_vacuum mode and optionally migrate (#919)
|
||||
store.CheckAutoVacuum(cfg)
|
||||
|
||||
channelKeys := loadChannelKeys(cfg, *configPath)
|
||||
if len(channelKeys) > 0 {
|
||||
log.Printf("Loaded %d channel keys for GRP_TXT decryption", len(channelKeys))
|
||||
} else {
|
||||
log.Printf("No channel keys loaded — GRP_TXT packets will not be decrypted")
|
||||
}
|
||||
|
||||
regionKeys := loadRegionKeys(cfg)
|
||||
store.BackfillDefaultScopeAsync(regionKeys)
|
||||
|
||||
// Subscribe-early + buffer (#1608): the MQTT subscription is brought up
|
||||
// before startup maintenance so no packets are missed while the single
|
||||
// SQLite writer is blocked (e.g. a large CREATE INDEX migration). Received
|
||||
// messages are buffered here and drained once Ready() is called below.
|
||||
ingestBuffer := NewIngestBuffer(cfg.IngestBufferSizeOrDefault())
|
||||
ingestBuffer.Start()
|
||||
|
||||
// Connect to each MQTT source
|
||||
var clients []mqtt.Client
|
||||
connectedCount := 0
|
||||
for _, source := range sources {
|
||||
tag := source.Name
|
||||
if tag == "" {
|
||||
tag = source.Broker
|
||||
}
|
||||
|
||||
opts := buildMQTTOpts(source)
|
||||
connectTimeout := source.ConnectTimeoutOrDefault()
|
||||
log.Printf("MQTT [%s] connect timeout: %ds", tag, connectTimeout)
|
||||
|
||||
// Pre-allocate the liveness pointer so OnConnect can reset its
|
||||
// stale-message clock on reconnect (PR #1216 r1 item 2). IsConnectedFn
|
||||
// is wired below once the client exists.
|
||||
liveness := &SourceLivenessState{
|
||||
Tag: tag,
|
||||
Broker: source.Broker,
|
||||
}
|
||||
|
||||
// #1043: per-source status registry. Idempotent — repeated
|
||||
// registration across reconnects returns the same state so
|
||||
// counters accumulate across the process lifetime.
|
||||
status := RegisterSourceStatus(tag, source.Broker)
|
||||
|
||||
opts.SetOnConnectHandler(func(c mqtt.Client) {
|
||||
log.Printf("MQTT [%s] connected to %s", tag, source.Broker)
|
||||
status.MarkConnect(time.Now())
|
||||
// PR #1216 r1 item 2: clear the stale LastMessageUnix from
|
||||
// before the outage so the watchdog doesn't immediately scream
|
||||
// "stalled for 2h". Also restarts the cold-start grace window
|
||||
// and clears the alert cooldown so a fresh stall edge can fire.
|
||||
liveness.MarkReconnected(time.Now())
|
||||
topics := source.Topics
|
||||
if len(topics) == 0 {
|
||||
topics = []string{"meshcore/#"}
|
||||
}
|
||||
for _, t := range topics {
|
||||
token := c.Subscribe(t, 0, nil)
|
||||
token.Wait()
|
||||
if token.Error() != nil {
|
||||
log.Printf("MQTT [%s] subscribe error for %s: %v", tag, t, token.Error())
|
||||
} else {
|
||||
log.Printf("MQTT [%s] subscribed to %s", tag, t)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
opts.SetConnectionLostHandler(func(c mqtt.Client, err error) {
|
||||
log.Printf("MQTT [%s] disconnected from %s: %v", tag, source.Broker, err)
|
||||
status.MarkDisconnect(time.Now(), err)
|
||||
})
|
||||
|
||||
opts.SetReconnectingHandler(func(c mqtt.Client, options *mqtt.ClientOptions) {
|
||||
log.Printf("MQTT [%s] reconnecting to %s", tag, source.Broker)
|
||||
})
|
||||
|
||||
// Capture source for closure
|
||||
src := source
|
||||
opts.SetDefaultPublishHandler(func(c mqtt.Client, m mqtt.Message) {
|
||||
// PR #1609 M1: stamp the RECEIPT clock here (broker liveness)
|
||||
// independently of the post-write clock that handleMessage
|
||||
// stamps. Without separation the watchdog/healthz could
|
||||
// report "fresh" while the writer was stalled and the
|
||||
// buffer was filling.
|
||||
markReceiptForTag(tag, time.Now())
|
||||
status.MarkPacket(time.Now())
|
||||
ingestBuffer.Submit(func() {
|
||||
handleMessage(store, tag, src, m, channelKeys, regionKeys, cfg)
|
||||
})
|
||||
})
|
||||
|
||||
client := mqtt.NewClient(opts)
|
||||
// Wire IsConnectedFn now that the client exists, then register.
|
||||
// Registration BEFORE Connect so the attempt counter is available
|
||||
// to OnConnectAttempt on the very first dial.
|
||||
liveness.IsConnectedFn = client.IsConnected
|
||||
// #1335: wire force-reconnect so the watchdog can drop a
|
||||
// half-open TCP socket and re-dial when paho.IsConnected==true
|
||||
// but no messages have flowed past the stall threshold. Throttled
|
||||
// per source by the watchdog itself (forceReconnectThrottle).
|
||||
// Disconnect(250) gives in-flight publishes 250ms to drain;
|
||||
// Connect() returns immediately and paho's reconnect machinery
|
||||
// takes over from there. Captured-by-value `client` is the same
|
||||
// pointer used everywhere else for this source.
|
||||
liveness.ForceReconnectFn = func() {
|
||||
client.Disconnect(250)
|
||||
client.Connect()
|
||||
}
|
||||
// PR #1216 r2 item 3: tag collisions used to log.Fatalf, which
|
||||
// killed the entire ingestor over one config typo and recreated
|
||||
// the #1212 total-ingest-stop class this PR exists to prevent.
|
||||
// registerLivenessOrSkip logs ERROR + skips liveness registration
|
||||
// for the duplicate; the MQTT source still attempts to connect,
|
||||
// it just isn't tracked by the watchdog. First registration
|
||||
// remains authoritative.
|
||||
registerLivenessOrSkip(liveness)
|
||||
token := client.Connect()
|
||||
// With ConnectRetry=true, token.Wait() blocks forever for unreachable brokers.
|
||||
// WaitTimeout lets startup proceed; the client keeps retrying in the background
|
||||
// and OnConnect fires (subscribing) when it eventually connects (#910).
|
||||
if !token.WaitTimeout(time.Duration(connectTimeout) * time.Second) {
|
||||
log.Printf("MQTT [%s] initial connection timed out — retrying in background", tag)
|
||||
clients = append(clients, client)
|
||||
continue
|
||||
}
|
||||
if token.Error() != nil {
|
||||
log.Printf("MQTT [%s] connection failed (non-fatal): %v", tag, token.Error())
|
||||
// BL1 fix: Disconnect to stop Paho's internal retry goroutines.
|
||||
// With ConnectRetry=true, Connect() spawns background goroutines
|
||||
// that leak if the client is simply discarded.
|
||||
client.Disconnect(0)
|
||||
continue
|
||||
}
|
||||
connectedCount++
|
||||
clients = append(clients, client)
|
||||
}
|
||||
|
||||
// BL2 fix: require at least one immediately-connected source. Timed-out
|
||||
// clients are retrying in background (tracked in clients) but don't count
|
||||
// as "connected" — a single unreachable broker must not silently run with
|
||||
// zero active connections.
|
||||
if connectedCount == 0 {
|
||||
// Clean up any timed-out clients still retrying
|
||||
for _, c := range clients {
|
||||
c.Disconnect(0)
|
||||
}
|
||||
log.Fatal("no MQTT sources connected — all timed out or failed. Check broker is running (default: mqtt://localhost:1883). Set MQTT_BROKER env var or configure mqttSources in config.json")
|
||||
}
|
||||
|
||||
if connectedCount < len(clients) {
|
||||
log.Printf("Running — %d MQTT source(s) connected, %d retrying in background", connectedCount, len(clients)-connectedCount)
|
||||
} else {
|
||||
log.Printf("Running — %d MQTT source(s) connected", connectedCount)
|
||||
}
|
||||
|
||||
// Node retention: move stale nodes to inactive_nodes on startup
|
||||
nodeDays := cfg.NodeDaysOrDefault()
|
||||
store.MoveStaleNodes(nodeDays)
|
||||
@@ -103,6 +276,18 @@ func main() {
|
||||
vacuumPages := cfg.IncrementalVacuumPages()
|
||||
store.RunIncrementalVacuum(vacuumPages)
|
||||
|
||||
// Gate open: the synchronous startup writes above cannot return until the
|
||||
// single SQLite writer is free, which means any blocking async migration
|
||||
// (e.g. the CREATE INDEX) has finished. WaitForAsyncMigrations() makes that
|
||||
// explicit. Now drain everything the subscription buffered during startup.
|
||||
store.WaitForAsyncMigrations()
|
||||
ingestBuffer.Ready()
|
||||
if d := ingestBuffer.Dropped(); d > 0 {
|
||||
log.Printf("[ingest-buffer] write path ready; draining backlog (dropped %d during startup — consider raising ingestBufferSize)", d)
|
||||
} else {
|
||||
log.Printf("[ingest-buffer] write path ready; draining backlog (0 dropped)")
|
||||
}
|
||||
|
||||
// Daily ticker for node retention
|
||||
retentionTicker := time.NewTicker(1 * time.Hour)
|
||||
go func() {
|
||||
@@ -192,6 +377,9 @@ func main() {
|
||||
go func() {
|
||||
for range statsTicker.C {
|
||||
store.LogStats()
|
||||
if d := ingestBuffer.Dropped(); d > 0 || ingestBuffer.Pending() > 0 {
|
||||
log.Printf("[ingest-buffer] pending=%d dropped_total=%d", ingestBuffer.Pending(), d)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -238,137 +426,6 @@ func main() {
|
||||
defer stopNeighborBuilder()
|
||||
log.Printf("[neighbor-build] enabled (interval=%s)", NeighborEdgesBuilderInterval)
|
||||
|
||||
channelKeys := loadChannelKeys(cfg, *configPath)
|
||||
if len(channelKeys) > 0 {
|
||||
log.Printf("Loaded %d channel keys for GRP_TXT decryption", len(channelKeys))
|
||||
} else {
|
||||
log.Printf("No channel keys loaded — GRP_TXT packets will not be decrypted")
|
||||
}
|
||||
|
||||
regionKeys := loadRegionKeys(cfg)
|
||||
store.BackfillDefaultScopeAsync(regionKeys)
|
||||
|
||||
// Connect to each MQTT source
|
||||
var clients []mqtt.Client
|
||||
connectedCount := 0
|
||||
for _, source := range sources {
|
||||
tag := source.Name
|
||||
if tag == "" {
|
||||
tag = source.Broker
|
||||
}
|
||||
|
||||
opts := buildMQTTOpts(source)
|
||||
connectTimeout := source.ConnectTimeoutOrDefault()
|
||||
log.Printf("MQTT [%s] connect timeout: %ds", tag, connectTimeout)
|
||||
|
||||
// Pre-allocate the liveness pointer so OnConnect can reset its
|
||||
// stale-message clock on reconnect (PR #1216 r1 item 2). IsConnectedFn
|
||||
// is wired below once the client exists.
|
||||
liveness := &SourceLivenessState{
|
||||
Tag: tag,
|
||||
Broker: source.Broker,
|
||||
}
|
||||
|
||||
opts.SetOnConnectHandler(func(c mqtt.Client) {
|
||||
log.Printf("MQTT [%s] connected to %s", tag, source.Broker)
|
||||
// PR #1216 r1 item 2: clear the stale LastMessageUnix from
|
||||
// before the outage so the watchdog doesn't immediately scream
|
||||
// "stalled for 2h". Also restarts the cold-start grace window
|
||||
// and clears the alert cooldown so a fresh stall edge can fire.
|
||||
liveness.MarkReconnected(time.Now())
|
||||
topics := source.Topics
|
||||
if len(topics) == 0 {
|
||||
topics = []string{"meshcore/#"}
|
||||
}
|
||||
for _, t := range topics {
|
||||
token := c.Subscribe(t, 0, nil)
|
||||
token.Wait()
|
||||
if token.Error() != nil {
|
||||
log.Printf("MQTT [%s] subscribe error for %s: %v", tag, t, token.Error())
|
||||
} else {
|
||||
log.Printf("MQTT [%s] subscribed to %s", tag, t)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
opts.SetConnectionLostHandler(func(c mqtt.Client, err error) {
|
||||
log.Printf("MQTT [%s] disconnected from %s: %v", tag, source.Broker, err)
|
||||
})
|
||||
|
||||
opts.SetReconnectingHandler(func(c mqtt.Client, options *mqtt.ClientOptions) {
|
||||
log.Printf("MQTT [%s] reconnecting to %s", tag, source.Broker)
|
||||
})
|
||||
|
||||
// Capture source for closure
|
||||
src := source
|
||||
opts.SetDefaultPublishHandler(func(c mqtt.Client, m mqtt.Message) {
|
||||
handleMessage(store, tag, src, m, channelKeys, regionKeys, cfg)
|
||||
})
|
||||
|
||||
client := mqtt.NewClient(opts)
|
||||
// Wire IsConnectedFn now that the client exists, then register.
|
||||
// Registration BEFORE Connect so the attempt counter is available
|
||||
// to OnConnectAttempt on the very first dial.
|
||||
liveness.IsConnectedFn = client.IsConnected
|
||||
// #1335: wire force-reconnect so the watchdog can drop a
|
||||
// half-open TCP socket and re-dial when paho.IsConnected==true
|
||||
// but no messages have flowed past the stall threshold. Throttled
|
||||
// per source by the watchdog itself (forceReconnectThrottle).
|
||||
// Disconnect(250) gives in-flight publishes 250ms to drain;
|
||||
// Connect() returns immediately and paho's reconnect machinery
|
||||
// takes over from there. Captured-by-value `client` is the same
|
||||
// pointer used everywhere else for this source.
|
||||
liveness.ForceReconnectFn = func() {
|
||||
client.Disconnect(250)
|
||||
client.Connect()
|
||||
}
|
||||
// PR #1216 r2 item 3: tag collisions used to log.Fatalf, which
|
||||
// killed the entire ingestor over one config typo and recreated
|
||||
// the #1212 total-ingest-stop class this PR exists to prevent.
|
||||
// registerLivenessOrSkip logs ERROR + skips liveness registration
|
||||
// for the duplicate; the MQTT source still attempts to connect,
|
||||
// it just isn't tracked by the watchdog. First registration
|
||||
// remains authoritative.
|
||||
registerLivenessOrSkip(liveness)
|
||||
token := client.Connect()
|
||||
// With ConnectRetry=true, token.Wait() blocks forever for unreachable brokers.
|
||||
// WaitTimeout lets startup proceed; the client keeps retrying in the background
|
||||
// and OnConnect fires (subscribing) when it eventually connects (#910).
|
||||
if !token.WaitTimeout(time.Duration(connectTimeout) * time.Second) {
|
||||
log.Printf("MQTT [%s] initial connection timed out — retrying in background", tag)
|
||||
clients = append(clients, client)
|
||||
continue
|
||||
}
|
||||
if token.Error() != nil {
|
||||
log.Printf("MQTT [%s] connection failed (non-fatal): %v", tag, token.Error())
|
||||
// BL1 fix: Disconnect to stop Paho's internal retry goroutines.
|
||||
// With ConnectRetry=true, Connect() spawns background goroutines
|
||||
// that leak if the client is simply discarded.
|
||||
client.Disconnect(0)
|
||||
continue
|
||||
}
|
||||
connectedCount++
|
||||
clients = append(clients, client)
|
||||
}
|
||||
|
||||
// BL2 fix: require at least one immediately-connected source. Timed-out
|
||||
// clients are retrying in background (tracked in clients) but don't count
|
||||
// as "connected" — a single unreachable broker must not silently run with
|
||||
// zero active connections.
|
||||
if connectedCount == 0 {
|
||||
// Clean up any timed-out clients still retrying
|
||||
for _, c := range clients {
|
||||
c.Disconnect(0)
|
||||
}
|
||||
log.Fatal("no MQTT sources connected — all timed out or failed. Check broker is running (default: mqtt://localhost:1883). Set MQTT_BROKER env var or configure mqttSources in config.json")
|
||||
}
|
||||
|
||||
if connectedCount < len(clients) {
|
||||
log.Printf("Running — %d MQTT source(s) connected, %d retrying in background", connectedCount, len(clients)-connectedCount)
|
||||
} else {
|
||||
log.Printf("Running — %d MQTT source(s) connected", connectedCount)
|
||||
}
|
||||
|
||||
// #1212: per-source stall watchdog. Detects "silently dead" sources
|
||||
// where the client reports connected but no messages have flowed. Logs
|
||||
// a WARN line every minute for any source silent for >5m. Scan every
|
||||
@@ -715,8 +772,8 @@ func handleMessage(store *Store, tag string, source MQTTSource, m mqtt.Message,
|
||||
log.Printf("MQTT [%s] node telemetry update error: %v", tag, err)
|
||||
}
|
||||
}
|
||||
// Update default_scope when advert carries a matched transport scope (#899)
|
||||
if pktData.IsTransportScoped {
|
||||
// Update default_scope when advert carries a matched transport scope (#899, #1534)
|
||||
if shouldUpdateDefaultScope(pktData) {
|
||||
if err := store.UpdateNodeDefaultScope(decoded.Payload.PubKey, pktData.ScopeName); err != nil {
|
||||
log.Printf("MQTT [%s] node default_scope update error: %v", tag, err)
|
||||
}
|
||||
@@ -1075,6 +1132,37 @@ func extractObserverMeta(msg map[string]interface{}) *ObserverMeta {
|
||||
}
|
||||
}
|
||||
|
||||
// Issue #1290: firmware 1.16 publishes a `repeat` flag at the top
|
||||
// level of the /status JSON (MQTTMessageBuilder.cpp:58 — see
|
||||
// agessaman/MeshCore mqtt-bridge-implementation-flex). Accept
|
||||
// either a boolean or a case-insensitive `on|off|true|false|1|0`
|
||||
// string. Missing field → leave CanRelay nil; the writer preserves
|
||||
// the prior column value (default 1, back-compat).
|
||||
if v, ok := msg["repeat"]; ok && v != nil {
|
||||
switch t := v.(type) {
|
||||
case bool:
|
||||
b := t
|
||||
meta.CanRelay = &b
|
||||
hasData = true
|
||||
case string:
|
||||
s := strings.ToLower(strings.TrimSpace(t))
|
||||
switch s {
|
||||
case "on", "true", "1", "yes":
|
||||
b := true
|
||||
meta.CanRelay = &b
|
||||
hasData = true
|
||||
case "off", "false", "0", "no":
|
||||
b := false
|
||||
meta.CanRelay = &b
|
||||
hasData = true
|
||||
}
|
||||
case float64:
|
||||
b := t != 0
|
||||
meta.CanRelay = &b
|
||||
hasData = true
|
||||
}
|
||||
}
|
||||
|
||||
if !hasData {
|
||||
return nil
|
||||
}
|
||||
@@ -1356,3 +1444,11 @@ func init() {
|
||||
os.Exit(0)
|
||||
}
|
||||
}
|
||||
|
||||
// shouldUpdateDefaultScope returns true when the packet carries a transport
|
||||
// scope whose region key matched (#1534). Without the ScopeName non-empty
|
||||
// guard, transport-scoped adverts from non-matching regions would overwrite
|
||||
// previously-correct default_scope values with the empty string.
|
||||
func shouldUpdateDefaultScope(pktData *PacketData) bool {
|
||||
return pktData.IsTransportScoped && pktData.ScopeName != ""
|
||||
}
|
||||
|
||||
@@ -2,8 +2,10 @@ package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"database/sql"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@@ -1053,3 +1055,133 @@ func TestHandleMessageObserverIATAWhitelist(t *testing.T) {
|
||||
t.Errorf("observer from whitelisted IATA ARN should be accepted, got count=%d", count)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildPacketDataScopeMatchingNoMatch covers the #1534 regression: a
|
||||
// transport-scoped advert from a non-matching region carries
|
||||
// IsTransportScoped=true and ScopeName="". The default_scope update guard
|
||||
// must skip these packets so previously-correct scopes aren't overwritten
|
||||
// with the empty string.
|
||||
func TestBuildPacketDataScopeMatchingNoMatch(t *testing.T) {
|
||||
// Code1=2AB5 is the precomputed code for region "#test" (payload="hello",
|
||||
// payloadType=5). Build a region-key map for a DIFFERENT region so
|
||||
// matchScope() finds no match and returns "".
|
||||
const rawHex = "142AB500000068656C6C6F"
|
||||
otherKey, _ := hex.DecodeString("aabbccddeeff00112233445566778899")
|
||||
regionKeys := map[string][]byte{"#other": otherKey}
|
||||
|
||||
decoded, err := DecodePacket(rawHex, nil, false)
|
||||
if err != nil {
|
||||
t.Fatalf("DecodePacket: %v", err)
|
||||
}
|
||||
msg := &MQTTPacketMessage{Raw: rawHex}
|
||||
pktData := BuildPacketData(msg, decoded, "obs1", "region1", regionKeys)
|
||||
|
||||
if !pktData.IsTransportScoped {
|
||||
t.Fatalf("precondition: IsTransportScoped should be true (Code1 != 0000)")
|
||||
}
|
||||
if pktData.ScopeName != "" {
|
||||
t.Fatalf("precondition: ScopeName should be empty (no region match), got %q", pktData.ScopeName)
|
||||
}
|
||||
|
||||
// Regression assertion: when ScopeName is empty, the guard must skip the
|
||||
// UpdateNodeDefaultScope call so an empty value never overwrites a
|
||||
// previously-correct default_scope (#1534).
|
||||
if shouldUpdateDefaultScope(pktData) {
|
||||
t.Errorf("shouldUpdateDefaultScope = true for empty ScopeName; want false (would overwrite default_scope with \"\")")
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleMessageAdvert_EmptyScopeSkipsDefaultScopeUpdate is the call-site
|
||||
// regression test for #1534. It drives a transport-scoped ADVERT whose
|
||||
// region key does NOT match any configured region (so ScopeName=="") through
|
||||
// handleMessage end-to-end and asserts that a pre-existing default_scope on
|
||||
// the node is NOT overwritten with the empty string. This anchors the
|
||||
// call-site guard at main.go:720 — a future refactor that drops the
|
||||
// `if shouldUpdateDefaultScope(...)` wrapper and calls
|
||||
// `store.UpdateNodeDefaultScope(pubkey, pktData.ScopeName)` unconditionally
|
||||
// would re-introduce the #1534 bug and fail this test.
|
||||
func TestHandleMessageAdvert_EmptyScopeSkipsDefaultScopeUpdate(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
// A transport-scoped ADVERT: header byte 0x10 = route_type 0
|
||||
// (TRANSPORT_FLOOD) + payload_type 4 (ADVERT). Code1=AABB (non-zero, so
|
||||
// IsTransportScoped becomes true), Code2=0000, path_byte=00, then a
|
||||
// 100-byte ADVERT payload (32-byte pubkey starting 46D62D… + 4-byte ts
|
||||
// + 64-byte signature) reused from TestHandleMessageAdvertWithTelemetry.
|
||||
const rawHex = "10AABB00000046D62DE27D4C5194D7821FC5A34A45565DCC2537B300B9AB6275255CEFB65D840CE5C169C94C9AED39E8BCB6CB6EB0335497A198B33A1A610CD3B03D8DCFC160900E5244280323EE0B44CACAB8F02B5B38B91CFA18BD067B0B5E63E94CFC85F758A8530B9240933402E0E6B8F84D5252322D52"
|
||||
const pubkey = "46d62de27d4c5194d7821fc5a34a45565dcc2537b300b9ab6275255cefb65d84"
|
||||
|
||||
// Pre-seed the node with a non-empty default_scope so we can detect an
|
||||
// erroneous overwrite with "".
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, default_scope) VALUES (?, 'Node1', '#belgium')`, pubkey); err != nil {
|
||||
t.Fatalf("seed node: %v", err)
|
||||
}
|
||||
|
||||
// Empty regionKeys → matchScope() returns "" for any Code1 → ScopeName "".
|
||||
msg := &mockMessage{
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, map[string][]byte{}, &Config{})
|
||||
|
||||
var got sql.NullString
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = ?`, pubkey).Scan(&got); err != nil {
|
||||
t.Fatalf("read default_scope: %v", err)
|
||||
}
|
||||
if !got.Valid || got.String != "#belgium" {
|
||||
t.Errorf("default_scope after empty-scope advert = %q (valid=%v), want #belgium — call-site guard at main.go:720 is missing or broken (#1534)", got.String, got.Valid)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleMessageAdvert_MatchedScopeUpdatesDefaultScope is the positive
|
||||
// counterpart: a transport-scoped ADVERT whose Code1 matches a configured
|
||||
// region key MUST cause default_scope to be updated to the matched region
|
||||
// name. Together with the empty-scope test above this proves the call-site
|
||||
// branch routes correctly for both ScopeName states.
|
||||
func TestHandleMessageAdvert_MatchedScopeUpdatesDefaultScope(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
// Same ADVERT bytes; this time we compute the matching region key for
|
||||
// the (payloadType=4, payload=<advert bytes>) tuple so matchScope() will
|
||||
// return "#de".
|
||||
const advertBytes = "46D62DE27D4C5194D7821FC5A34A45565DCC2537B300B9AB6275255CEFB65D840CE5C169C94C9AED39E8BCB6CB6EB0335497A198B33A1A610CD3B03D8DCFC160900E5244280323EE0B44CACAB8F02B5B38B91CFA18BD067B0B5E63E94CFC85F758A8530B9240933402E0E6B8F84D5252322D52"
|
||||
const pubkey = "46d62de27d4c5194d7821fc5a34a45565dcc2537b300b9ab6275255cefb65d84"
|
||||
|
||||
advertRaw, _ := hex.DecodeString(advertBytes)
|
||||
// Derive the region key whose HMAC produces Code1 we can plant in the
|
||||
// header. Choose key = first 16 bytes of HMAC-SHA256(zeros, advertBytes)
|
||||
// is non-deterministic to find; instead pick an arbitrary key and
|
||||
// compute Code1 from it, then build the packet around that Code1.
|
||||
regionKey, _ := hex.DecodeString("0123456789abcdef0123456789abcdef")
|
||||
mac := hmacSHA256(regionKey, append([]byte{4}, advertRaw...))
|
||||
// Per firmware (#1534 helper logic): Code1 is the first 2 bytes of the
|
||||
// HMAC, sentinel-shifted so 0x0000 → 0x0001 and 0xFFFF → 0xFFFE.
|
||||
code := uint16(mac[0]) | (uint16(mac[1]) << 8)
|
||||
if code == 0x0000 {
|
||||
code = 0x0001
|
||||
} else if code == 0xFFFF {
|
||||
code = 0xFFFE
|
||||
}
|
||||
code1 := fmt.Sprintf("%02X%02X", byte(code&0xFF), byte(code>>8))
|
||||
rawHex := "10" + code1 + "000000" + advertBytes
|
||||
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, default_scope) VALUES (?, 'Node1', '#old')`, pubkey); err != nil {
|
||||
t.Fatalf("seed node: %v", err)
|
||||
}
|
||||
|
||||
msg := &mockMessage{
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, map[string][]byte{"#de": regionKey}, &Config{})
|
||||
|
||||
var got sql.NullString
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = ?`, pubkey).Scan(&got); err != nil {
|
||||
t.Fatalf("read default_scope: %v", err)
|
||||
}
|
||||
if !got.Valid || got.String != "#de" {
|
||||
t.Errorf("default_scope after matched-scope advert = %q (valid=%v), want #de", got.String, got.Valid)
|
||||
}
|
||||
}
|
||||
|
||||
+17
-18
@@ -22,26 +22,25 @@ func (s *Store) PruneOldPackets(days int) (int64, error) {
|
||||
}
|
||||
cutoff := time.Now().UTC().AddDate(0, 0, -days).Format(time.RFC3339)
|
||||
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("prune begin: %w", err)
|
||||
}
|
||||
defer tx.Rollback()
|
||||
// Tagged for writer-perf visibility (#1340).
|
||||
var n int64
|
||||
err := s.WriterTx("prune_packets", func(tx *sql.Tx) error {
|
||||
// Delete child observations first (no CASCADE in SQLite).
|
||||
if _, err := tx.Exec(`DELETE FROM observations WHERE transmission_id IN (
|
||||
SELECT id FROM transmissions WHERE first_seen < ?
|
||||
)`, cutoff); err != nil {
|
||||
return fmt.Errorf("prune observations: %w", err)
|
||||
}
|
||||
|
||||
// Delete child observations first (no CASCADE in SQLite).
|
||||
if _, err := tx.Exec(`DELETE FROM observations WHERE transmission_id IN (
|
||||
SELECT id FROM transmissions WHERE first_seen < ?
|
||||
)`, cutoff); err != nil {
|
||||
return 0, fmt.Errorf("prune observations: %w", err)
|
||||
}
|
||||
|
||||
res, err := tx.Exec(`DELETE FROM transmissions WHERE first_seen < ?`, cutoff)
|
||||
res, err := tx.Exec(`DELETE FROM transmissions WHERE first_seen < ?`, cutoff)
|
||||
if err != nil {
|
||||
return fmt.Errorf("prune transmissions: %w", err)
|
||||
}
|
||||
n, _ = res.RowsAffected()
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("prune transmissions: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if err := tx.Commit(); err != nil {
|
||||
return 0, fmt.Errorf("prune commit: %w", err)
|
||||
return 0, err
|
||||
}
|
||||
if n > 0 {
|
||||
log.Printf("[prune] deleted %d transmissions older than %d days", n, days)
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
package main
|
||||
|
||||
import "runtime/debug"
|
||||
|
||||
// applyMemoryLimit configures Go's soft memory limit (GOMEMLIMIT) for the
// ingestor process. See #1010.
//
// Precedence:
//  1. GOMEMLIMIT env var (parsed by the runtime at startup) — we do not
//     override; report source="env" with limit=0.
//  2. runtimeMaxMB > 0 (from config runtime.maxMemoryMB) — set a limit of
//     runtimeMaxMB MiB via debug.SetMemoryLimit; source="config".
//  3. Otherwise no limit applied; source="none" (default behavior).
//
// Returns the limit (bytes) we set, or 0 if we did not set one. A
// runtimeMaxMB so large that the byte count would overflow int64 is
// saturated at the maximum int64 value instead of wrapping to a negative
// number (which SetMemoryLimit would silently ignore while we still
// reported source="config").
func applyMemoryLimit(runtimeMaxMB int, envSet bool) (int64, string) {
	const (
		bytesPerMiB = int64(1) << 20
		maxLimit    = int64(1<<63 - 1) // math.MaxInt64, spelled out to avoid a new import
	)

	switch {
	case envSet:
		return 0, "env"
	case runtimeMaxMB <= 0:
		return 0, "none"
	}

	limit := maxLimit
	if mb := int64(runtimeMaxMB); mb <= maxLimit/bytesPerMiB {
		limit = mb * bytesPerMiB
	}
	debug.SetMemoryLimit(limit)
	return limit, "config"
}
|
||||
@@ -0,0 +1,71 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"runtime/debug"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestApplyMemoryLimit_FromEnv: when GOMEMLIMIT env var is set, the runtime
|
||||
// already parsed it. Our function MUST NOT override and MUST report env source.
|
||||
func TestApplyMemoryLimit_FromEnv(t *testing.T) {
|
||||
t.Setenv("GOMEMLIMIT", "850MiB")
|
||||
defer debug.SetMemoryLimit(-1)
|
||||
|
||||
limit, source := applyMemoryLimit(512, true /* envSet */)
|
||||
if source != "env" {
|
||||
t.Fatalf("expected source=env, got %q", source)
|
||||
}
|
||||
if limit != 0 {
|
||||
t.Fatalf("expected limit=0 (not set by us), got %d", limit)
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyMemoryLimit_FromConfig: when env is unset and runtime.maxMemoryMB
|
||||
// is set, derive a limit of exactly runtimeMaxMB * 1 MiB (no headroom — the
|
||||
// ingestor's working set is bounded by MQTT batch decode, not packet store).
|
||||
func TestApplyMemoryLimit_FromConfig(t *testing.T) {
|
||||
defer debug.SetMemoryLimit(-1)
|
||||
|
||||
limit, source := applyMemoryLimit(512, false /* envSet */)
|
||||
if source != "config" {
|
||||
t.Fatalf("expected source=config, got %q", source)
|
||||
}
|
||||
want := int64(512) * 1024 * 1024
|
||||
if limit != want {
|
||||
t.Fatalf("expected limit=%d, got %d", want, limit)
|
||||
}
|
||||
cur := debug.SetMemoryLimit(-1)
|
||||
if cur != want {
|
||||
t.Fatalf("runtime memory limit not set: want=%d got=%d", want, cur)
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyMemoryLimit_None: neither env nor config — no limit applied,
|
||||
// default behavior preserved.
|
||||
func TestApplyMemoryLimit_None(t *testing.T) {
|
||||
defer debug.SetMemoryLimit(-1)
|
||||
debug.SetMemoryLimit(int64(1<<63 - 1)) // math.MaxInt64 = "no limit"
|
||||
|
||||
limit, source := applyMemoryLimit(0, false)
|
||||
if source != "none" {
|
||||
t.Fatalf("expected source=none, got %q", source)
|
||||
}
|
||||
if limit != 0 {
|
||||
t.Fatalf("expected limit=0, got %d", limit)
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyMemoryLimit_EnvWinsOverConfig: env set AND config set → env wins,
|
||||
// our function does not override. Locks the precedence triage specified.
|
||||
func TestApplyMemoryLimit_EnvWinsOverConfig(t *testing.T) {
|
||||
t.Setenv("GOMEMLIMIT", "1GiB")
|
||||
defer debug.SetMemoryLimit(-1)
|
||||
|
||||
limit, source := applyMemoryLimit(512, true /* envSet */)
|
||||
if source != "env" {
|
||||
t.Fatalf("expected source=env when both set, got %q", source)
|
||||
}
|
||||
if limit != 0 {
|
||||
t.Fatalf("expected limit=0 when env wins, got %d", limit)
|
||||
}
|
||||
}
|
||||
@@ -57,7 +57,12 @@ const (
|
||||
type SourceLivenessState struct {
|
||||
Tag string
|
||||
Broker string
|
||||
LastMessageUnix int64 // atomic; unix seconds of last successfully received MQTT message
|
||||
LastMessageUnix int64 // atomic; unix seconds of last successfully WRITTEN MQTT message (handleMessage post-write)
|
||||
// LastReceiptUnix (PR #1609 M1) is stamped at MQTT receipt time —
|
||||
// BEFORE the message is handed to the buffer/writer. STUB: unused
|
||||
// in production until the green commit wires MarkReceipt at the
|
||||
// receipt callsite and surfaces it in stats/healthz.
|
||||
LastReceiptUnix int64 // atomic; unix seconds of last RECEIPT (broker liveness)
|
||||
// FirstConnectedAt (PR #1216 r2 item 2) is stamped ONCE at
|
||||
// registerLivenessState time and never reset. Cold-start grace
|
||||
// checks against this so a flapping broker (CONNECT ok, SUBSCRIBE
|
||||
@@ -95,6 +100,16 @@ func (s *SourceLivenessState) MarkMessage(now time.Time) {
|
||||
atomic.StoreInt64(&s.LastMessageUnix, now.Unix())
|
||||
}
|
||||
|
||||
// MarkReceipt records the time of an MQTT message receipt — stamped at the
|
||||
// paho receipt callback BEFORE the message enters the ingest buffer. PR
|
||||
// #1609 M1: kept separate from LastMessageUnix so the watchdog/healthz can
|
||||
// distinguish "broker alive, write path stuck" (LastReceiptUnix fresh,
|
||||
// LastMessageUnix stale) from "everything stalled" (both stale). Cheap;
|
||||
// safe to call from the message-handling hot path.
|
||||
func (s *SourceLivenessState) MarkReceipt(now time.Time) {
|
||||
atomic.StoreInt64(&s.LastReceiptUnix, now.Unix())
|
||||
}
|
||||
|
||||
// MarkReconnected clears stale liveness state so the watchdog does not
|
||||
// false-alarm on a pre-outage timestamp after paho re-establishes the
|
||||
// connection (PR #1216 r1 item 2). Resets LastMessageUnix, re-stamps
|
||||
@@ -217,7 +232,8 @@ func registerLivenessOrSkip(s *SourceLivenessState) bool {
|
||||
}
|
||||
|
||||
// markLivenessForTag is the hot-path entry point: O(1) map lookup +
|
||||
// atomic store. Safe to call for unknown tags (no-op).
|
||||
// atomic store. Safe to call for unknown tags (no-op). Updates
|
||||
// LastMessageUnix (post-write clock).
|
||||
func markLivenessForTag(tag string, now time.Time) {
|
||||
livenessRegistryMu.RLock()
|
||||
s := livenessRegistry[tag]
|
||||
@@ -227,6 +243,38 @@ func markLivenessForTag(tag string, now time.Time) {
|
||||
}
|
||||
}
|
||||
|
||||
// markReceiptForTag is the hot-path entry point used at MQTT receipt
|
||||
// (BEFORE the message is buffered/written). Updates LastReceiptUnix only.
|
||||
// PR #1609 M1 — separates broker-liveness signal from write-path
|
||||
// liveness so /healthz can show a stalled writer with a live broker.
|
||||
func markReceiptForTag(tag string, now time.Time) {
|
||||
livenessRegistryMu.RLock()
|
||||
s := livenessRegistry[tag]
|
||||
livenessRegistryMu.RUnlock()
|
||||
if s != nil {
|
||||
s.MarkReceipt(now)
|
||||
}
|
||||
}
|
||||
|
||||
// SnapshotLivenessClocks returns the per-source receipt vs write-path
|
||||
// liveness pair for every registered source. Read-only; safe to call
|
||||
// from the stats-file writer. PR #1609 M1.
|
||||
func SnapshotLivenessClocks() map[string]SourceLivenessSnapshot {
|
||||
livenessRegistryMu.RLock()
|
||||
defer livenessRegistryMu.RUnlock()
|
||||
if len(livenessRegistry) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]SourceLivenessSnapshot, len(livenessRegistry))
|
||||
for tag, s := range livenessRegistry {
|
||||
out[tag] = SourceLivenessSnapshot{
|
||||
LastReceiptUnix: atomic.LoadInt64(&s.LastReceiptUnix),
|
||||
LastMessageUnix: atomic.LoadInt64(&s.LastMessageUnix),
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// runLivenessWatchdog starts a goroutine that scans the registry every
|
||||
// `interval` and logs a warning for any source that has been silent while
|
||||
// connected for more than `threshold`. Returns a stop function that halts
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestSourceLivenessState_ReceiptVsWriteSeparate asserts that the receipt-
|
||||
// time and post-write liveness clocks are independent (PR #1609 review
|
||||
// MAJOR M1): stamping at receipt must NOT advance the post-write clock so
|
||||
// the watchdog/healthz can distinguish "broker alive, write path stuck"
|
||||
// from "everything fine". Without separation, /healthz reports "fresh"
|
||||
// while the writer is stalled and the ingest buffer is filling.
|
||||
func TestSourceLivenessState_ReceiptVsWriteSeparate(t *testing.T) {
|
||||
s := &SourceLivenessState{Tag: "t"}
|
||||
now := time.Now()
|
||||
|
||||
// Receipt at T0; post-write never happens (writer stalled).
|
||||
s.MarkReceipt(now)
|
||||
|
||||
gotReceipt := atomic.LoadInt64(&s.LastReceiptUnix)
|
||||
gotWrite := atomic.LoadInt64(&s.LastMessageUnix)
|
||||
if gotReceipt != now.Unix() {
|
||||
t.Fatalf("LastReceiptUnix: want %d, got %d", now.Unix(), gotReceipt)
|
||||
}
|
||||
if gotWrite != 0 {
|
||||
t.Fatalf("LastMessageUnix MUST stay 0 while writer stalled (only MarkReceipt called); got %d — receipt is double-stamping the write clock and /healthz will lie about ingestion freshness", gotWrite)
|
||||
}
|
||||
|
||||
// Write completes later: only MarkMessage advances LastMessageUnix.
|
||||
later := now.Add(5 * time.Second)
|
||||
s.MarkMessage(later)
|
||||
|
||||
gotReceipt2 := atomic.LoadInt64(&s.LastReceiptUnix)
|
||||
gotWrite2 := atomic.LoadInt64(&s.LastMessageUnix)
|
||||
if gotReceipt2 != now.Unix() {
|
||||
t.Fatalf("MarkMessage must not move LastReceiptUnix backwards or forwards; want %d, got %d", now.Unix(), gotReceipt2)
|
||||
}
|
||||
if gotWrite2 != later.Unix() {
|
||||
t.Fatalf("LastMessageUnix after MarkMessage: want %d, got %d", later.Unix(), gotWrite2)
|
||||
}
|
||||
}
|
||||
@@ -63,6 +63,16 @@ func (s *Store) StartNeighborEdgesBuilder(interval time.Duration) func() {
|
||||
// returning — first server load needs a fully-populated table.
|
||||
wuStart := time.Now()
|
||||
var wuTotal int
|
||||
// Prime the prefix index (#1547) so the very first
|
||||
// InsertTransmission after startup can resolve hop prefixes.
|
||||
if err := s.RefreshPrefixIndex(); err != nil {
|
||||
log.Printf("[neighbor-build] initial prefix-index refresh error: %v", err)
|
||||
}
|
||||
// Prime the neighbor graph (#1560) so the context-aware resolver
|
||||
// has adjacency data on the very first InsertTransmission.
|
||||
if err := s.RefreshNeighborGraph(); err != nil {
|
||||
log.Printf("[neighbor-build] initial neighbor-graph refresh error: %v", err)
|
||||
}
|
||||
for {
|
||||
n, err := s.buildAndPersistNeighborEdges()
|
||||
if err != nil {
|
||||
@@ -85,7 +95,18 @@ func (s *Store) StartNeighborEdgesBuilder(interval time.Duration) func() {
|
||||
select {
|
||||
case <-t.C:
|
||||
start := time.Now()
|
||||
// Refresh the prefix index alongside the edges build
|
||||
// (#1547) so new nodes become resolvable within a tick.
|
||||
if err := s.RefreshPrefixIndex(); err != nil {
|
||||
log.Printf("[neighbor-build] prefix-index refresh error: %v", err)
|
||||
}
|
||||
n, err := s.buildAndPersistNeighborEdges()
|
||||
// Refresh the neighbor-graph snapshot after the edges
|
||||
// build (#1560) so the context-aware resolver picks up
|
||||
// newly persisted adjacencies on the next ingest.
|
||||
if grErr := s.RefreshNeighborGraph(); grErr != nil {
|
||||
log.Printf("[neighbor-build] neighbor-graph refresh error: %v", grErr)
|
||||
}
|
||||
dur := time.Since(start)
|
||||
if err != nil {
|
||||
log.Printf("[neighbor-build] tick error after %s: %v", dur, err)
|
||||
@@ -213,33 +234,36 @@ func (s *Store) buildAndPersistNeighborEdges() (int, error) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("begin: %w", err)
|
||||
}
|
||||
defer tx.Rollback()
|
||||
stmt, err := tx.Prepare(`INSERT INTO neighbor_edges (node_a, node_b, count, last_seen)
|
||||
VALUES (?, ?, 1, ?)
|
||||
ON CONFLICT(node_a, node_b) DO UPDATE SET
|
||||
count = count + 1,
|
||||
last_seen = MAX(last_seen, excluded.last_seen)`)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("prepare: %w", err)
|
||||
}
|
||||
defer stmt.Close()
|
||||
var firstErr error
|
||||
for _, e := range edges {
|
||||
if _, err := stmt.Exec(e.a, e.b, e.ts); err != nil && firstErr == nil {
|
||||
firstErr = err
|
||||
// Wrap the whole edge-persist tx under writer-perf instrumentation
|
||||
// (#1340). Slow neighbor-builder ticks (the #1339 root cause) now
|
||||
// show up on /api/perf under component=neighbor_builder.
|
||||
var inserted int
|
||||
err = s.WriterTx("neighbor_builder", func(tx *sql.Tx) error {
|
||||
stmt, err := tx.Prepare(`INSERT INTO neighbor_edges (node_a, node_b, count, last_seen)
|
||||
VALUES (?, ?, 1, ?)
|
||||
ON CONFLICT(node_a, node_b) DO UPDATE SET
|
||||
count = count + 1,
|
||||
last_seen = MAX(last_seen, excluded.last_seen)`)
|
||||
if err != nil {
|
||||
return fmt.Errorf("prepare: %w", err)
|
||||
}
|
||||
defer stmt.Close()
|
||||
var firstErr error
|
||||
for _, e := range edges {
|
||||
if _, err := stmt.Exec(e.a, e.b, e.ts); err != nil && firstErr == nil {
|
||||
firstErr = err
|
||||
}
|
||||
}
|
||||
if firstErr != nil {
|
||||
return fmt.Errorf("upsert: %w", firstErr)
|
||||
}
|
||||
inserted = len(edges)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if firstErr != nil {
|
||||
return 0, fmt.Errorf("upsert: %w", firstErr)
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
return 0, fmt.Errorf("commit: %w", err)
|
||||
}
|
||||
return len(edges), nil
|
||||
return inserted, nil
|
||||
}
|
||||
|
||||
// canonEdge orders the pair so node_a <= node_b (matches the existing
|
||||
|
||||
@@ -0,0 +1,225 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// Context-aware hop resolver — full restore of pre-#1289 hop
|
||||
// disambiguation semantics, ported into the ingestor (where the
|
||||
// neighbor graph + node directory now live, per #1283).
|
||||
//
|
||||
// Why this exists (issues #1547 / #1560):
|
||||
// The naive `resolvePath` only resolves hops whose prefix is unique
|
||||
// in the node table. On a >2K-node mesh the dominant case is 1-byte
|
||||
// prefix collisions (multiple candidates per prefix). Without
|
||||
// adjacency disambiguation those hops always serialize as `nil`
|
||||
// and the resolved_path remains effectively empty for the largest
|
||||
// meshes — the very deployments that need it most.
|
||||
//
|
||||
// Algorithm (ported from cmd/server/store.go @ commit 450236d5
|
||||
// `pm.resolveWithContext`, intersected with the disambiguation gating
|
||||
// from PR #1144 / #1352):
|
||||
//
|
||||
// For each hop:
|
||||
// 1. Collect candidate pubkeys by prefix-match (existing prefixIndex).
|
||||
// 2. len==0 → nil.
|
||||
// 3. len==1 → that pubkey.
|
||||
// 4. len>1 → filter by NeighborGraph adjacency to the anchor:
|
||||
// - hop 0 anchor = fromPubkey (ADVERT originator) if known;
|
||||
// - hop i (i>0) anchor = previous resolved hop's pubkey;
|
||||
// if the previous hop did not resolve, the chain breaks
|
||||
// and subsequent >1-candidate hops fall to nil.
|
||||
// Surviving candidates after filter:
|
||||
// - exactly 1 → use it
|
||||
// - 0 or >1 → nil (cannot disambiguate further)
|
||||
//
|
||||
// This is the conservative tier-1 variant. Pre-#1289 also carried
|
||||
// tier-2 (geo proximity), tier-3 (GPS preference), tier-4 (obs-count
|
||||
// fallback) — those were noisy in practice and are intentionally NOT
|
||||
// ported here; this PR is a regression restore, not an enhancement.
|
||||
|
||||
// NeighborGraph is the in-memory adjacency snapshot used by the
// context-aware resolver. Keys are lowercased internally. The zero value is
// an empty graph that is ready to use.
type NeighborGraph struct {
	// adj maps a lowercased node key to the set of its lowercased neighbors.
	adj map[string]map[string]struct{}
}

// NewNeighborGraph returns an empty graph.
func NewNeighborGraph() *NeighborGraph {
	return &NeighborGraph{adj: make(map[string]map[string]struct{})}
}

// AddEdge adds an undirected adjacency a↔b. Both endpoints are lowercased
// first. Self-loops and empty endpoints are ignored.
func (g *NeighborGraph) AddEdge(a, b string) {
	a = strings.ToLower(a)
	b = strings.ToLower(b)
	if a == "" || b == "" || a == b {
		return
	}
	// Lazily create the outer map so a zero-value NeighborGraph does not
	// panic on its first insert (writing to a nil map panics in Go).
	if g.adj == nil {
		g.adj = make(map[string]map[string]struct{})
	}
	if g.adj[a] == nil {
		g.adj[a] = make(map[string]struct{})
	}
	if g.adj[b] == nil {
		g.adj[b] = make(map[string]struct{})
	}
	// Record both directions so lookups work regardless of argument order.
	g.adj[a][b] = struct{}{}
	g.adj[b][a] = struct{}{}
}

// IsAdjacent reports whether a and b appear together in any neighbor edge.
// The comparison is case-insensitive. A nil graph or an empty endpoint
// yields false.
func (g *NeighborGraph) IsAdjacent(a, b string) bool {
	if g == nil {
		return false
	}
	a = strings.ToLower(a)
	b = strings.ToLower(b)
	if a == "" || b == "" {
		return false
	}
	nbrs, ok := g.adj[a]
	if !ok {
		return false
	}
	_, present := nbrs[b]
	return present
}
|
||||
|
||||
// neighborGraphHolder caches the graph for the InsertTransmission hot
|
||||
// path. atomic.Value lets the 60s rebuild publish without a read-side
|
||||
// lock.
|
||||
type neighborGraphHolder struct {
|
||||
v atomic.Value // holds *NeighborGraph
|
||||
}
|
||||
|
||||
func (h *neighborGraphHolder) load() *NeighborGraph {
|
||||
if v := h.v.Load(); v != nil {
|
||||
return v.(*NeighborGraph)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *neighborGraphHolder) store(g *NeighborGraph) {
|
||||
h.v.Store(g)
|
||||
}
|
||||
|
||||
// loadNeighborGraph reads neighbor_edges and returns an in-memory
|
||||
// adjacency snapshot. Safe to call against a fresh DB (returns an
|
||||
// empty graph).
|
||||
func loadNeighborGraph(db *sql.DB) (*NeighborGraph, error) {
|
||||
rows, err := db.Query(`SELECT node_a, node_b FROM neighbor_edges`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
g := NewNeighborGraph()
|
||||
for rows.Next() {
|
||||
var a, b string
|
||||
if err := rows.Scan(&a, &b); err != nil {
|
||||
continue
|
||||
}
|
||||
g.AddEdge(a, b)
|
||||
}
|
||||
return g, nil
|
||||
}
|
||||
|
||||
// resolveHopWithContext resolves a single hop using NeighborGraph
|
||||
// adjacency to the anchor. Returns nil when the hop cannot be
|
||||
// disambiguated.
|
||||
//
|
||||
// exclude is a set of pubkeys to discard from the candidate pool
|
||||
// (typically the prior hops already resolved on the path — a packet
|
||||
// does not revisit a node).
|
||||
//
|
||||
// Behavior matrix:
|
||||
// len(candidates) | anchor | graph | result
|
||||
// 0 | — | — | nil
|
||||
// 1 | — | — | candidates[0]
|
||||
// >1 | "" or no graph|— | nil
|
||||
// >1 | non-empty | set | unique adjacent candidate
|
||||
// (or nil if 0 or >1 survive)
|
||||
func resolveHopWithContext(hop string, anchor string, graph *NeighborGraph, idx prefixIndex, exclude map[string]struct{}) *string {
|
||||
if idx == nil {
|
||||
return nil
|
||||
}
|
||||
h := strings.ToLower(hop)
|
||||
candidates := idx[h]
|
||||
switch len(candidates) {
|
||||
case 0:
|
||||
return nil
|
||||
case 1:
|
||||
pk := candidates[0]
|
||||
if _, skip := exclude[pk]; skip {
|
||||
return nil
|
||||
}
|
||||
return &pk
|
||||
}
|
||||
if graph == nil || anchor == "" {
|
||||
return nil
|
||||
}
|
||||
var match string
|
||||
survivors := 0
|
||||
for _, cand := range candidates {
|
||||
if _, skip := exclude[cand]; skip {
|
||||
continue
|
||||
}
|
||||
if graph.IsAdjacent(anchor, cand) {
|
||||
survivors++
|
||||
if survivors > 1 {
|
||||
return nil
|
||||
}
|
||||
match = cand
|
||||
}
|
||||
}
|
||||
if survivors == 1 {
|
||||
return &match
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// resolvePathWithContext walks the hop list, anchoring hop 0 on
|
||||
// fromPubkey (for ADVERTs) and each subsequent hop on the previous
|
||||
// resolved hop. Previously-resolved pubkeys (plus the originator) are
|
||||
// excluded from later candidate pools so the walk doesn't revisit a
|
||||
// node. Returns a `[]*string` shape compatible with
|
||||
// marshalResolvedPath (and the all-nil clobber-guard from PR #1548).
|
||||
func resolvePathWithContext(hops []string, fromPubkey string, graph *NeighborGraph, idx prefixIndex) []*string {
|
||||
if len(hops) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]*string, len(hops))
|
||||
if idx == nil {
|
||||
return out
|
||||
}
|
||||
prevAnchor := strings.ToLower(fromPubkey)
|
||||
seen := make(map[string]struct{}, len(hops)+1)
|
||||
if prevAnchor != "" {
|
||||
seen[prevAnchor] = struct{}{}
|
||||
}
|
||||
for i, hop := range hops {
|
||||
r := resolveHopWithContext(hop, prevAnchor, graph, idx, seen)
|
||||
out[i] = r
|
||||
if r != nil {
|
||||
lc := strings.ToLower(*r)
|
||||
seen[lc] = struct{}{}
|
||||
prevAnchor = lc
|
||||
} else {
|
||||
prevAnchor = ""
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// RefreshNeighborGraph loads the latest neighbor_edges snapshot and
|
||||
// publishes it atomically. Called on startup and once per neighbor-
|
||||
// edges builder tick (60s) alongside RefreshPrefixIndex.
|
||||
func (s *Store) RefreshNeighborGraph() error {
|
||||
g, err := loadNeighborGraph(s.db)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.neighborGraph.store(g)
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,113 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// Issue #1547 — resolved_path writer (ingestor-owned).
|
||||
//
|
||||
// Per the #1283 refactor (server is read-only; ingestor owns the
|
||||
// neighbor graph + node directory), the writer that populated
|
||||
// `observations.resolved_path` must live here in the ingestor. PR #1289
|
||||
// removed the server-side writer without porting it — this restores it.
|
||||
//
|
||||
// Approach:
|
||||
// - `resolvePath` is a pure function: hop prefixes → full pubkeys
|
||||
// using the in-memory prefix index built from `nodes.public_key`.
|
||||
// - Unique-prefix hops resolve to the full pubkey; ambiguous or
|
||||
// unknown hops resolve to `nil`. The output shape is `[]*string`
|
||||
// (with nulls for unresolved positions) — the JSON serialization
|
||||
// matches what the server's `unmarshalResolvedPath` /
|
||||
// frontend `getResolvedPath` already consume.
|
||||
// - The prefix index is rebuilt on startup and once per neighbor-
|
||||
// builder tick (60s) so new nodes start resolving within a minute
|
||||
// without blocking the MQTT ingest path.
|
||||
|
||||
// resolvePath maps each hop prefix to a full pubkey when the index
|
||||
// has exactly one candidate; returns nil at that position otherwise.
|
||||
// Returns nil for empty/no hops.
|
||||
func resolvePath(hops []string, idx prefixIndex) []*string {
|
||||
if len(hops) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]*string, len(hops))
|
||||
if idx == nil {
|
||||
return out
|
||||
}
|
||||
for i, hop := range hops {
|
||||
h := strings.ToLower(hop)
|
||||
candidates := idx[h]
|
||||
if len(candidates) == 1 {
|
||||
pk := candidates[0]
|
||||
out[i] = &pk
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// marshalResolvedPath JSON-encodes a resolved path. It returns "" when the
// input is empty, when every element is nil, or when encoding fails (the
// writer treats "" as SQL NULL).
//
// The all-nil case matters because of the UPSERT in InsertTransmission:
//
//	resolved_path = COALESCE(excluded.resolved_path, resolved_path)
//
// Emitting "[null,null]" would pass through nilIfEmpty() as a non-NULL
// string, and the COALESCE would then OVERWRITE a previously stored good
// resolved_path on re-ingest. Returning "" lets nilIfEmpty produce SQL NULL
// so the COALESCE falls through to the existing value.
// See issue #1547 / PR #1548 reviewer findings.
func marshalResolvedPath(rp []*string) string {
	// One scan covers both "empty" and "all nil": neither has a resolved hop.
	hasResolved := false
	for i := range rp {
		if rp[i] != nil {
			hasResolved = true
			break
		}
	}
	if !hasResolved {
		return ""
	}

	encoded, err := json.Marshal(rp)
	if err != nil {
		return ""
	}
	return string(encoded)
}
|
||||
|
||||
// prefixIdxHolder caches the prefix index for the InsertTransmission
|
||||
// hot path. atomic.Value lets the 60s rebuild happen without a lock on
|
||||
// the read side.
|
||||
type prefixIdxHolder struct {
|
||||
v atomic.Value // holds prefixIndex
|
||||
}
|
||||
|
||||
func (h *prefixIdxHolder) load() prefixIndex {
|
||||
if v := h.v.Load(); v != nil {
|
||||
return v.(prefixIndex)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *prefixIdxHolder) store(idx prefixIndex) {
|
||||
h.v.Store(idx)
|
||||
}
|
||||
|
||||
// RefreshPrefixIndex rebuilds the in-memory prefix index from the
|
||||
// nodes table and publishes it atomically. Called on startup and from
|
||||
// the neighbor-edges builder tick (60s) so new nodes become resolvable
|
||||
// without per-insert DB scans.
|
||||
func (s *Store) RefreshPrefixIndex() error {
|
||||
idx, err := buildPrefixIndex(s.db)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.prefixIdx.store(idx)
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,446 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// unmarshalResolvedPathLocal decodes a JSON resolved_path string into a
// []*string (JSON null becomes a nil element). It returns nil for an empty
// string or for input that is not valid JSON of that shape.
func unmarshalResolvedPathLocal(s string) []*string {
	if len(s) == 0 {
		return nil
	}
	var decoded []*string
	if err := json.Unmarshal([]byte(s), &decoded); err != nil {
		return nil
	}
	return decoded
}
|
||||
|
||||
// TestResolvePathPureFunction is a unit test for the pure resolvePath
|
||||
// helper. Asserts:
|
||||
// - unique-prefix hops resolve to the full pubkey
|
||||
// - ambiguous-prefix hops resolve to nil
|
||||
// - unknown-prefix hops resolve to nil
|
||||
// - return slice length equals input hop count
|
||||
//
|
||||
// Regression gate for #1547 (resolved_path stopped being written).
|
||||
func TestResolvePathPureFunction(t *testing.T) {
|
||||
idx := prefixIndex{
|
||||
// "aa" → exactly one pubkey
|
||||
"aa": {"aaaaaaaaaa"},
|
||||
"aaaaaaaaaa": {"aaaaaaaaaa"},
|
||||
// "bb" → exactly one pubkey
|
||||
"bb": {"bbbbbbbbbb"},
|
||||
"bbbbbbbbbb": {"bbbbbbbbbb"},
|
||||
// "cc" → ambiguous (2 candidates)
|
||||
"cc": {"cccccccccc", "ccdddddddd"},
|
||||
"cccccccccc": {"cccccccccc"},
|
||||
}
|
||||
|
||||
got := resolvePath([]string{"aa", "cc", "ff", "bb"}, idx)
|
||||
if len(got) != 4 {
|
||||
t.Fatalf("expected len 4, got %d", len(got))
|
||||
}
|
||||
if got[0] == nil || *got[0] != "aaaaaaaaaa" {
|
||||
t.Errorf("hop[0] aa: want aaaaaaaaaa, got %v", deref(got[0]))
|
||||
}
|
||||
if got[1] != nil {
|
||||
t.Errorf("hop[1] cc: want nil (ambiguous), got %v", deref(got[1]))
|
||||
}
|
||||
if got[2] != nil {
|
||||
t.Errorf("hop[2] ff: want nil (unknown), got %v", deref(got[2]))
|
||||
}
|
||||
if got[3] == nil || *got[3] != "bbbbbbbbbb" {
|
||||
t.Errorf("hop[3] bb: want bbbbbbbbbb, got %v", deref(got[3]))
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathEmptyHops asserts empty/no-path produces nil.
|
||||
func TestResolvePathEmptyHops(t *testing.T) {
|
||||
if got := resolvePath(nil, prefixIndex{}); got != nil {
|
||||
t.Errorf("nil hops: want nil, got %v", got)
|
||||
}
|
||||
if got := resolvePath([]string{}, prefixIndex{}); got != nil {
|
||||
t.Errorf("empty hops: want nil, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMarshalResolvedPathRoundtrip asserts the JSON shape matches the
|
||||
// server's marshal/unmarshal contract: `[]*string` with nulls for
|
||||
// unresolved hops.
|
||||
func TestMarshalResolvedPathRoundtrip(t *testing.T) {
|
||||
a := "aaaaaaaaaa"
|
||||
b := "bbbbbbbbbb"
|
||||
in := []*string{&a, nil, &b}
|
||||
s := marshalResolvedPath(in)
|
||||
want := `["aaaaaaaaaa",null,"bbbbbbbbbb"]`
|
||||
if s != want {
|
||||
t.Errorf("marshal: want %s, got %s", want, s)
|
||||
}
|
||||
}
|
||||
|
||||
// TestInsertTransmissionWritesResolvedPath is the integration test that
|
||||
// gates the regression introduced by PR #1289 (issue #1547).
|
||||
//
|
||||
// Setup: seed two nodes + one observer + invoke InsertTransmission with
|
||||
// a PacketData whose PathJSON references one of the seeded nodes by
|
||||
// unique 1-byte (2-hex) prefix.
|
||||
//
|
||||
// Assert: the inserted observations row has a non-NULL resolved_path
|
||||
// whose JSON-decoded length equals the hop count, and the resolved
|
||||
// element matches the seeded node's full pubkey.
|
||||
func TestInsertTransmissionWritesResolvedPath(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "ingest.db")
|
||||
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Seed nodes with unique 1-byte prefixes.
|
||||
if _, err := store.db.Exec(
|
||||
`INSERT INTO nodes (public_key, name) VALUES (?, ?), (?, ?)`,
|
||||
"aaaaaaaaaa", "from-node",
|
||||
"bbbbbbbbbb", "first-hop",
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Seed one observer (needed so InsertTransmission resolves observer_idx).
|
||||
if err := store.UpsertObserver("obs-1", "observer-1", "", nil); err != nil {
|
||||
t.Fatalf("UpsertObserver: %v", err)
|
||||
}
|
||||
|
||||
// Force the prefix index to be (re)built from the seeded nodes so
|
||||
// the InsertTransmission path has something to resolve against.
|
||||
if err := store.RefreshPrefixIndex(); err != nil {
|
||||
t.Fatalf("RefreshPrefixIndex: %v", err)
|
||||
}
|
||||
|
||||
pkt := &PacketData{
|
||||
RawHex: "deadbeef",
|
||||
Timestamp: "2026-06-01T00:00:00Z",
|
||||
ObserverID: "obs-1",
|
||||
Hash: "h-1547",
|
||||
RouteType: 0,
|
||||
PayloadType: int(payloadADVERT),
|
||||
PathJSON: `["bb"]`,
|
||||
DecodedJSON: "{}",
|
||||
FromPubkey: "aaaaaaaaaa",
|
||||
}
|
||||
if _, err := store.InsertTransmission(pkt); err != nil {
|
||||
t.Fatalf("InsertTransmission: %v", err)
|
||||
}
|
||||
|
||||
var rp sql.NullString
|
||||
if err := store.db.QueryRow(
|
||||
`SELECT resolved_path FROM observations WHERE transmission_id = (SELECT id FROM transmissions WHERE hash = ?)`,
|
||||
"h-1547",
|
||||
).Scan(&rp); err != nil {
|
||||
t.Fatalf("query: %v", err)
|
||||
}
|
||||
if !rp.Valid || rp.String == "" {
|
||||
t.Fatalf("expected non-nil resolved_path, got NULL/empty (regression: #1547)")
|
||||
}
|
||||
got := unmarshalResolvedPathLocal(rp.String)
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("resolved_path length: want 1, got %d (value=%s)", len(got), rp.String)
|
||||
}
|
||||
if got[0] == nil || *got[0] != "bbbbbbbbbb" {
|
||||
t.Errorf("resolved_path[0]: want bbbbbbbbbb, got %v (raw=%s)", deref(got[0]), rp.String)
|
||||
}
|
||||
}
|
||||
|
||||
// deref returns the string p points to, or the placeholder "<nil>" when p is
// nil. Used to print optional hop values in test failure messages.
func deref(p *string) string {
	if p != nil {
		return *p
	}
	return "<nil>"
}
|
||||
|
||||
// ─── #1560: context-aware resolution tests ─────────────────────────────────
|
||||
//
|
||||
// These exercise the post-fix behavior of resolveHopWithContext +
|
||||
// resolvePathWithContext. Until the green commit lands they MUST fail
|
||||
// on assertions (the stub falls back to naive `len==1` and returns nil
|
||||
// on every >1-candidate prefix), proving the gate is real.
|
||||
|
||||
// build5NodeAmbiguousIndex returns a prefixIndex where 3 of 5 nodes
|
||||
// share the 1-byte prefix 0x5c. Pubkeys are the "fingerprints":
|
||||
//
|
||||
// A = "5c000000000000000000000000000000aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
// B = "5c000000000000000000000000000000bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
|
||||
// C = "5c000000000000000000000000000000cccccccccccccccccccccccccccccccc"
|
||||
// D = "dd000000000000000000000000000000dddddddddddddddddddddddddddddddd"
|
||||
// E = "ee000000000000000000000000000000eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"
|
||||
func build5NodeAmbiguousIndex() (idx prefixIndex, A, B, C, D, E string) {
|
||||
A = "5c000000000000000000000000000000aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
B = "5c000000000000000000000000000000bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
|
||||
C = "5c000000000000000000000000000000cccccccccccccccccccccccccccccccc"
|
||||
D = "dd000000000000000000000000000000dddddddddddddddddddddddddddddddd"
|
||||
E = "ee000000000000000000000000000000eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"
|
||||
idx = prefixIndex{
|
||||
// 1-byte: 5c → A,B,C (collision); dd → D; ee → E
|
||||
"5c": {A, B, C},
|
||||
"dd": {D},
|
||||
"ee": {E},
|
||||
// full-key entries (so exact-match lookups still resolve)
|
||||
A: {A}, B: {B}, C: {C}, D: {D}, E: {E},
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// TestResolveHopWithContext_OneByteCollision_AdjacencyResolves
|
||||
// asserts the dominant production case (#1560): three nodes share the
|
||||
// 1-byte prefix 0x5c, but NeighborGraph adjacency narrows to exactly
|
||||
// one. The naive resolver returns nil; the context-aware resolver
|
||||
// MUST return the right pubkey.
|
||||
func TestResolveHopWithContext_OneByteCollision_AdjacencyResolves(t *testing.T) {
|
||||
idx, A, B, C, D, E := build5NodeAmbiguousIndex()
|
||||
g := NewNeighborGraph()
|
||||
// chain: A↔B, B↔C, C↔D, D↔E
|
||||
g.AddEdge(A, B)
|
||||
g.AddEdge(B, C)
|
||||
g.AddEdge(C, D)
|
||||
g.AddEdge(D, E)
|
||||
|
||||
// Anchored on A, the only 5c neighbor of A is B.
|
||||
got := resolveHopWithContext("5c", A, g, idx, nil)
|
||||
if got == nil {
|
||||
t.Fatalf("anchor=A, hop=5c: want B (%s), got <nil>", B)
|
||||
}
|
||||
if *got != B {
|
||||
t.Errorf("anchor=A, hop=5c: want %s, got %s", B, *got)
|
||||
}
|
||||
|
||||
// Anchored on B, the only 5c neighbors of B are A and C — but A is
|
||||
// the originator anchor in a path-walk; here we just assert that
|
||||
// 2 surviving candidates → nil (cannot disambiguate further).
|
||||
got = resolveHopWithContext("5c", B, g, idx, nil)
|
||||
if got != nil {
|
||||
t.Errorf("anchor=B, hop=5c: ambiguous (A and C both adjacent); want <nil>, got %s", *got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathWithContext_TwoHopChainAnchoredOnFromNode covers the
|
||||
// canonical 1-byte collision case end-to-end: path = [5c, 5c],
|
||||
// from_node = A → expect [B, C].
|
||||
func TestResolvePathWithContext_TwoHopChainAnchoredOnFromNode(t *testing.T) {
|
||||
idx, A, B, C, _, _ := build5NodeAmbiguousIndex()
|
||||
g := NewNeighborGraph()
|
||||
g.AddEdge(A, B)
|
||||
g.AddEdge(B, C)
|
||||
|
||||
got := resolvePathWithContext([]string{"5c", "5c"}, A, g, idx)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d, want 2 (raw=%v)", len(got), got)
|
||||
}
|
||||
if got[0] == nil || *got[0] != B {
|
||||
t.Errorf("hop[0]: want %s, got %v", B, deref(got[0]))
|
||||
}
|
||||
if got[1] == nil || *got[1] != C {
|
||||
t.Errorf("hop[1]: want %s, got %v", C, deref(got[1]))
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveHopWithContext_NoAdjacencyContext_ReturnsNil asserts the
|
||||
// negative gate: 3 nodes with shared prefix, no edges between them in
|
||||
// the graph, hop=[5c] with no usable anchor → nil. Guards against an
|
||||
// over-eager resolver that just picks the first candidate.
|
||||
func TestResolveHopWithContext_NoAdjacencyContext_ReturnsNil(t *testing.T) {
|
||||
idx, _, _, _, _, _ := build5NodeAmbiguousIndex()
|
||||
g := NewNeighborGraph() // empty: no edges
|
||||
got := resolveHopWithContext("5c", "", g, idx, nil)
|
||||
if got != nil {
|
||||
t.Errorf("no anchor + empty graph: want <nil>, got %s", *got)
|
||||
}
|
||||
|
||||
// With an anchor that's not adjacent to any candidate, also nil.
|
||||
got = resolveHopWithContext("5c", "deadbeefdeadbeef", g, idx, nil)
|
||||
if got != nil {
|
||||
t.Errorf("non-adjacent anchor: want <nil>, got %s", *got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathWithContext_AdvertAnchoring asserts ADVERT-style
|
||||
// anchoring: from_pubkey is the originator, hop[0] is one of its
|
||||
// 1-byte-prefix neighbors → resolved.
|
||||
func TestResolvePathWithContext_AdvertAnchoring(t *testing.T) {
|
||||
idx, A, B, _, _, _ := build5NodeAmbiguousIndex()
|
||||
g := NewNeighborGraph()
|
||||
g.AddEdge(A, B) // only B is adjacent to A among the 5c candidates
|
||||
|
||||
got := resolvePathWithContext([]string{"5c"}, A, g, idx)
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d, want 1", len(got))
|
||||
}
|
||||
if got[0] == nil || *got[0] != B {
|
||||
t.Errorf("ADVERT anchored on A, hop=5c: want %s, got %v", B, deref(got[0]))
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathWithContext_RegressionMultiByteStillWorks asserts no
|
||||
// regression in the 2/3/4-byte prefix path that PR #1548 already
|
||||
// handled — unique prefixes resolve regardless of graph context.
|
||||
func TestResolvePathWithContext_RegressionMultiByteStillWorks(t *testing.T) {
|
||||
idx, _, _, _, D, E := build5NodeAmbiguousIndex()
|
||||
// dd and ee are unique 1-byte prefixes — naive path still works.
|
||||
got := resolvePathWithContext([]string{"dd", "ee"}, "", nil, idx)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d, want 2", len(got))
|
||||
}
|
||||
if got[0] == nil || *got[0] != D {
|
||||
t.Errorf("hop[0] dd: want %s, got %v", D, deref(got[0]))
|
||||
}
|
||||
if got[1] == nil || *got[1] != E {
|
||||
t.Errorf("hop[1] ee: want %s, got %v", E, deref(got[1]))
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathWithContext_AllNilContractPreserved asserts the
|
||||
// all-nil → empty-string clobber-guard contract from PR #1548 still
|
||||
// holds: an unresolvable path through the context resolver, when fed
|
||||
// to marshalResolvedPath, MUST yield "" (so nilIfEmpty → SQL NULL
|
||||
// → COALESCE preserves existing).
|
||||
func TestResolvePathWithContext_AllNilContractPreserved(t *testing.T) {
|
||||
// Empty index → every hop nil.
|
||||
got := resolvePathWithContext([]string{"5c", "dd"}, "", nil, prefixIndex{})
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d, want 2", len(got))
|
||||
}
|
||||
for i, p := range got {
|
||||
if p != nil {
|
||||
t.Errorf("hop[%d]: want <nil>, got %s", i, *p)
|
||||
}
|
||||
}
|
||||
if s := marshalResolvedPath(got); s != "" {
|
||||
t.Errorf("all-nil marshal: want \"\", got %q (clobber-guard regression)", s)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMarshalResolvedPathAllNilReturnsEmpty is a regression gate for
|
||||
// the data-loss clobber bug surfaced in PR #1548 review.
|
||||
//
|
||||
// When resolvePath fails to resolve ANY hop (every element nil),
|
||||
// marshalResolvedPath previously emitted "[null,null,...]" — a
|
||||
// non-empty string that bypassed nilIfEmpty and then OVERWROTE the
|
||||
// existing resolved_path via the COALESCE(excluded, current) UPSERT
|
||||
// on re-ingest. The fix returns "" so nilIfEmpty produces SQL NULL and
|
||||
// the COALESCE preserves the existing good value.
|
||||
func TestMarshalResolvedPathAllNilReturnsEmpty(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in []*string
|
||||
}{
|
||||
{"one-nil", []*string{nil}},
|
||||
{"two-nils", []*string{nil, nil}},
|
||||
{"three-nils", []*string{nil, nil, nil}},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := marshalResolvedPath(tc.in)
|
||||
if got != "" {
|
||||
t.Errorf("all-nil input must return \"\" (so nilIfEmpty → SQL NULL → COALESCE preserves existing); got %q", got)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Mixed (at least one non-nil) MUST still marshal normally so we
|
||||
// don't lose partial resolutions.
|
||||
a := "aaaaaaaaaa"
|
||||
mixed := marshalResolvedPath([]*string{&a, nil})
|
||||
if mixed != `["aaaaaaaaaa",null]` {
|
||||
t.Errorf("partial resolution must still serialize; got %q", mixed)
|
||||
}
|
||||
}
|
||||
|
||||
// TestInsertTransmissionDoesNotClobberResolvedPathOnAllNil is the
|
||||
// integration-level regression test for the data-loss bug.
|
||||
//
|
||||
// Setup: insert a transmission whose first ingest resolves cleanly to
|
||||
// a known pubkey. Then re-ingest the SAME transmission after the
|
||||
// prefix index has been cleared (simulating an empty NeighborGraph /
|
||||
// all-nil resolution path) and assert the previously stored
|
||||
// resolved_path is PRESERVED (NOT overwritten to "[null]" or NULL).
|
||||
//
|
||||
// Pre-fix behavior: marshalResolvedPath emitted "[null]", nilIfEmpty
|
||||
// kept it non-NULL, and COALESCE(excluded.resolved_path, resolved_path)
|
||||
// clobbered the original "bbbbbbbbbb".
|
||||
func TestInsertTransmissionDoesNotClobberResolvedPathOnAllNil(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "ingest.db")
|
||||
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
if _, err := store.db.Exec(
|
||||
`INSERT INTO nodes (public_key, name) VALUES (?, ?), (?, ?)`,
|
||||
"aaaaaaaaaa", "from-node",
|
||||
"bbbbbbbbbb", "first-hop",
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := store.UpsertObserver("obs-1", "observer-1", "", nil); err != nil {
|
||||
t.Fatalf("UpsertObserver: %v", err)
|
||||
}
|
||||
if err := store.RefreshPrefixIndex(); err != nil {
|
||||
t.Fatalf("RefreshPrefixIndex: %v", err)
|
||||
}
|
||||
|
||||
pkt := &PacketData{
|
||||
RawHex: "deadbeef",
|
||||
Timestamp: "2026-06-01T00:00:00Z",
|
||||
ObserverID: "obs-1",
|
||||
Hash: "h-clobber",
|
||||
RouteType: 0,
|
||||
PayloadType: int(payloadADVERT),
|
||||
PathJSON: `["bb"]`,
|
||||
DecodedJSON: "{}",
|
||||
FromPubkey: "aaaaaaaaaa",
|
||||
}
|
||||
if _, err := store.InsertTransmission(pkt); err != nil {
|
||||
t.Fatalf("first InsertTransmission: %v", err)
|
||||
}
|
||||
|
||||
// Sanity: first write populated resolved_path.
|
||||
var first sql.NullString
|
||||
if err := store.db.QueryRow(
|
||||
`SELECT resolved_path FROM observations WHERE transmission_id = (SELECT id FROM transmissions WHERE hash = ?)`,
|
||||
"h-clobber",
|
||||
).Scan(&first); err != nil {
|
||||
t.Fatalf("first query: %v", err)
|
||||
}
|
||||
if !first.Valid || first.String == "" {
|
||||
t.Fatalf("precondition failed: first ingest left resolved_path NULL/empty; cannot test clobber")
|
||||
}
|
||||
wantPreserved := first.String
|
||||
|
||||
// Now wipe the prefix index so re-ingest produces an all-nil
|
||||
// resolution — exactly the scenario where the bug clobbers data.
|
||||
store.prefixIdx.store(prefixIndex{})
|
||||
|
||||
if _, err := store.InsertTransmission(pkt); err != nil {
|
||||
t.Fatalf("re-ingest InsertTransmission: %v", err)
|
||||
}
|
||||
|
||||
var after sql.NullString
|
||||
if err := store.db.QueryRow(
|
||||
`SELECT resolved_path FROM observations WHERE transmission_id = (SELECT id FROM transmissions WHERE hash = ?)`,
|
||||
"h-clobber",
|
||||
).Scan(&after); err != nil {
|
||||
t.Fatalf("post-reingest query: %v", err)
|
||||
}
|
||||
if !after.Valid {
|
||||
t.Fatalf("data loss: resolved_path was NULL'd by re-ingest (was %q)", wantPreserved)
|
||||
}
|
||||
if after.String != wantPreserved {
|
||||
t.Errorf("data loss: resolved_path was clobbered by all-nil re-ingest\n before: %s\n after: %s", wantPreserved, after.String)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,187 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
// SourceStatusSnapshot is the per-MQTT-source connection state and counter
// view written to the ingestor stats file (under "source_statuses") and
// consumed by cmd/server's /api/mqtt/status handler (#1043).
//
// The Last*Unix fields are unix seconds (0 = "never"). ConnectCount,
// DisconnectCount and PacketsTotal are cumulative counters. PacketsLast5m
// is a sliding 5-minute count derived from a per-second ring buffer.
// LastError is the text of the most recent disconnect error, or "" when
// there is none (it is then omitted from the JSON).
type SourceStatusSnapshot struct {
	Name               string `json:"name"`
	Broker             string `json:"broker"`
	Connected          bool   `json:"connected"`
	LastConnectUnix    int64  `json:"lastConnectUnix"`
	LastDisconnectUnix int64  `json:"lastDisconnectUnix"`
	LastPacketUnix     int64  `json:"lastPacketUnix"`
	ConnectCount       int64  `json:"connectCount"`
	DisconnectCount    int64  `json:"disconnectCount"`
	PacketsTotal       int64  `json:"packetsTotal"`
	PacketsLast5m      int64  `json:"packetsLast5m"`
	LastError          string `json:"lastError,omitempty"`
}

// sourceStatusState is the in-memory per-source counter set.
//
// The scalar fields are sync/atomic values, so MarkConnect, MarkDisconnect
// and MarkPacket update them without taking a lock. The 5-minute sliding
// window is a 300-slot per-second ring (one slot per second); every read
// and write of the ring happens under ringMu, including the per-packet
// increment in MarkPacket.
//
// Memory: one state per source (typically 1-5 in production). Two
// 300-element int64 arrays ≈ 4.8KB/source — fine.
type sourceStatusState struct {
	name   string
	broker string // raw broker URL — server-side handler masks the password

	connected          atomic.Bool
	lastConnectUnix    atomic.Int64
	lastDisconnectUnix atomic.Int64
	lastPacketUnix     atomic.Int64
	connectCount       atomic.Int64
	disconnectCount    atomic.Int64
	packetsTotal       atomic.Int64

	// 5-minute sliding window: per-second buckets keyed by unix second.
	// Stored as parallel arrays so we can both zero-out a stale slot AND
	// know whether a slot's contents are still inside the window.
	ringMu    sync.Mutex
	ringSec   [300]int64 // unix second this slot represents (0 = unused)
	ringCount [300]int64 // packets received in that second

	// lastError is rare-write/rare-read, so a simple lock (no atomics)
	// is sufficient.
	errMu     sync.RWMutex
	lastError string
}

// MarkConnect records a successful (re)connection to the broker.
// Clears any stale lastError from a prior disconnect — otherwise the UI
// shows "connected=true, lastError='connection refused'" after a successful
// reconnect, which is a lie (#1682 munger review r1).
func (s *sourceStatusState) MarkConnect(now time.Time) {
	s.connected.Store(true)
	s.lastConnectUnix.Store(now.Unix())
	s.connectCount.Add(1)

	s.errMu.Lock()
	s.lastError = ""
	s.errMu.Unlock()
}

// MarkDisconnect records the broker dropping the connection at now. A
// non-nil err replaces lastError with err.Error(); a nil err leaves the
// previously stored lastError untouched.
func (s *sourceStatusState) MarkDisconnect(now time.Time, err error) {
	s.connected.Store(false)
	s.lastDisconnectUnix.Store(now.Unix())
	s.disconnectCount.Add(1)

	if err == nil {
		return
	}
	s.errMu.Lock()
	s.lastError = err.Error()
	s.errMu.Unlock()
}

// MarkPacket records receipt of an MQTT message at now. Hot path: two
// atomic updates plus one short ringMu critical section.
func (s *sourceStatusState) MarkPacket(now time.Time) {
	nowSec := now.Unix()
	s.lastPacketUnix.Store(nowSec)
	s.packetsTotal.Add(1)

	// Go's % keeps the sign of the dividend, so a pre-epoch timestamp
	// would produce a negative slot and panic on the array index below.
	// Fold the remainder back into [0, ringLen).
	ringLen := int64(len(s.ringSec))
	slot := nowSec % ringLen
	if slot < 0 {
		slot += ringLen
	}

	s.ringMu.Lock()
	if s.ringSec[slot] != nowSec {
		// The slot still holds a different second (at least one full
		// ring revolution away, or unused): recycle it.
		s.ringSec[slot] = nowSec
		s.ringCount[slot] = 0
	}
	s.ringCount[slot]++
	s.ringMu.Unlock()
}

// sumLast5m returns the count of MarkPacket calls in the last 300s, i.e.
// in the inclusive second range [now-299, now]. Slots whose stored second
// falls outside the window are ignored (no stale leak).
func (s *sourceStatusState) sumLast5m(now time.Time) int64 {
	nowSec := now.Unix()
	cutoff := nowSec - int64(len(s.ringSec)) + 1

	s.ringMu.Lock()
	defer s.ringMu.Unlock()

	var total int64
	for i := range s.ringSec {
		if sec := s.ringSec[i]; sec >= cutoff && sec <= nowSec {
			total += s.ringCount[i]
		}
	}
	return total
}

// snapshot copies the state into a serializable view. now is the reference
// instant used to evaluate the PacketsLast5m sliding window.
func (s *sourceStatusState) snapshot(now time.Time) SourceStatusSnapshot {
	s.errMu.RLock()
	errStr := s.lastError
	s.errMu.RUnlock()

	return SourceStatusSnapshot{
		Name:               s.name,
		Broker:             s.broker,
		Connected:          s.connected.Load(),
		LastConnectUnix:    s.lastConnectUnix.Load(),
		LastDisconnectUnix: s.lastDisconnectUnix.Load(),
		LastPacketUnix:     s.lastPacketUnix.Load(),
		ConnectCount:       s.connectCount.Load(),
		DisconnectCount:    s.disconnectCount.Load(),
		PacketsTotal:       s.packetsTotal.Load(),
		PacketsLast5m:      s.sumLast5m(now),
		LastError:          errStr,
	}
}
|
||||
|
||||
// sourceStatusRegistry holds one sourceStatusState per source. Keyed by
|
||||
// tag (which is the source Name, or the Broker URL if the operator left
|
||||
// the name blank).
|
||||
var (
|
||||
sourceStatusRegistryMu sync.RWMutex
|
||||
sourceStatusRegistry = map[string]*sourceStatusState{}
|
||||
)
|
||||
|
||||
// RegisterSourceStatus creates (or returns the existing) state for the
|
||||
// given source. Safe for cold-start use; idempotent — re-registering the
|
||||
// same tag returns the existing state so counters aren't reset across
|
||||
// reconnects.
|
||||
func RegisterSourceStatus(tag, broker string) *sourceStatusState {
|
||||
sourceStatusRegistryMu.Lock()
|
||||
defer sourceStatusRegistryMu.Unlock()
|
||||
if s, ok := sourceStatusRegistry[tag]; ok {
|
||||
return s
|
||||
}
|
||||
s := &sourceStatusState{name: tag, broker: broker}
|
||||
sourceStatusRegistry[tag] = s
|
||||
return s
|
||||
}
|
||||
|
||||
// lookupSourceStatus returns the state for tag, or nil if unregistered.
|
||||
func lookupSourceStatus(tag string) *sourceStatusState {
|
||||
sourceStatusRegistryMu.RLock()
|
||||
defer sourceStatusRegistryMu.RUnlock()
|
||||
return sourceStatusRegistry[tag]
|
||||
}
|
||||
|
||||
// SnapshotSourceStatuses returns a slice of every registered source's
|
||||
// current snapshot. Surfaced via the ingestor stats file under
|
||||
// "source_statuses" so /api/mqtt/status can serve it (#1043).
|
||||
func SnapshotSourceStatuses(now time.Time) []SourceStatusSnapshot {
|
||||
sourceStatusRegistryMu.RLock()
|
||||
defer sourceStatusRegistryMu.RUnlock()
|
||||
out := make([]SourceStatusSnapshot, 0, len(sourceStatusRegistry))
|
||||
for _, s := range sourceStatusRegistry {
|
||||
out = append(out, s.snapshot(now))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// resetSourceStatusRegistry clears the registry. Test-only helper.
|
||||
func resetSourceStatusRegistry() {
|
||||
sourceStatusRegistryMu.Lock()
|
||||
defer sourceStatusRegistryMu.Unlock()
|
||||
sourceStatusRegistry = map[string]*sourceStatusState{}
|
||||
}
|
||||
@@ -0,0 +1,116 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestSourceStatus_BasicLifecycle exercises the counter wiring used by
|
||||
// the /api/mqtt/status server-side endpoint (#1043).
|
||||
func TestSourceStatus_BasicLifecycle(t *testing.T) {
|
||||
resetSourceStatusRegistry()
|
||||
defer resetSourceStatusRegistry()
|
||||
|
||||
s := RegisterSourceStatus("local", "mqtt://broker.example.com:1883")
|
||||
if s == nil {
|
||||
t.Fatal("RegisterSourceStatus returned nil")
|
||||
}
|
||||
// Re-registration is idempotent.
|
||||
if s2 := RegisterSourceStatus("local", "mqtt://other"); s2 != s {
|
||||
t.Fatal("RegisterSourceStatus not idempotent")
|
||||
}
|
||||
|
||||
now := time.Unix(1_700_000_000, 0)
|
||||
s.MarkConnect(now)
|
||||
s.MarkPacket(now)
|
||||
s.MarkPacket(now.Add(1 * time.Second))
|
||||
s.MarkPacket(now.Add(2 * time.Second))
|
||||
|
||||
snap := s.snapshot(now.Add(3 * time.Second))
|
||||
if !snap.Connected {
|
||||
t.Error("snapshot.Connected = false, want true after MarkConnect")
|
||||
}
|
||||
if snap.PacketsTotal != 3 {
|
||||
t.Errorf("PacketsTotal = %d, want 3", snap.PacketsTotal)
|
||||
}
|
||||
if snap.PacketsLast5m != 3 {
|
||||
t.Errorf("PacketsLast5m = %d, want 3", snap.PacketsLast5m)
|
||||
}
|
||||
if snap.ConnectCount != 1 {
|
||||
t.Errorf("ConnectCount = %d, want 1", snap.ConnectCount)
|
||||
}
|
||||
if snap.LastConnectUnix != now.Unix() {
|
||||
t.Errorf("LastConnectUnix = %d, want %d", snap.LastConnectUnix, now.Unix())
|
||||
}
|
||||
if snap.Broker != "mqtt://broker.example.com:1883" {
|
||||
t.Errorf("Broker = %q, want raw URL passthrough (server masks)", snap.Broker)
|
||||
}
|
||||
|
||||
// After 5 minutes idle, sliding window must be empty.
|
||||
snap2 := s.snapshot(now.Add(6 * time.Minute))
|
||||
if snap2.PacketsLast5m != 0 {
|
||||
t.Errorf("PacketsLast5m after 6m idle = %d, want 0", snap2.PacketsLast5m)
|
||||
}
|
||||
if snap2.PacketsTotal != 3 {
|
||||
t.Errorf("PacketsTotal must be lifetime-cumulative, got %d", snap2.PacketsTotal)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSourceStatus_Disconnect(t *testing.T) {
|
||||
resetSourceStatusRegistry()
|
||||
defer resetSourceStatusRegistry()
|
||||
|
||||
s := RegisterSourceStatus("disco", "mqtt://x:1883")
|
||||
now := time.Unix(1_700_000_100, 0)
|
||||
s.MarkConnect(now)
|
||||
s.MarkDisconnect(now.Add(time.Minute), nil)
|
||||
|
||||
snap := s.snapshot(now.Add(2 * time.Minute))
|
||||
if snap.Connected {
|
||||
t.Error("snapshot.Connected = true after MarkDisconnect, want false")
|
||||
}
|
||||
if snap.DisconnectCount != 1 {
|
||||
t.Errorf("DisconnectCount = %d, want 1", snap.DisconnectCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSnapshotSourceStatuses_ReturnsAll(t *testing.T) {
|
||||
resetSourceStatusRegistry()
|
||||
defer resetSourceStatusRegistry()
|
||||
|
||||
RegisterSourceStatus("a", "mqtt://a")
|
||||
RegisterSourceStatus("b", "mqtt://b")
|
||||
snaps := SnapshotSourceStatuses(time.Now())
|
||||
if len(snaps) != 2 {
|
||||
t.Errorf("len(snaps) = %d, want 2", len(snaps))
|
||||
}
|
||||
}
|
||||
|
||||
// TestSourceStatus_MarkConnectClearsLastError asserts MarkConnect wipes
|
||||
// any prior sticky error (#1682 munger r1 review). Otherwise the UI sees
|
||||
// connected=true alongside a stale "connection refused" string.
|
||||
func TestSourceStatus_MarkConnectClearsLastError(t *testing.T) {
|
||||
resetSourceStatusRegistry()
|
||||
defer resetSourceStatusRegistry()
|
||||
|
||||
s := RegisterSourceStatus("sticky", "mqtt://x:1883")
|
||||
now := time.Unix(1_700_000_200, 0)
|
||||
s.MarkConnect(now)
|
||||
s.MarkDisconnect(now.Add(time.Second), errors.New("connection refused"))
|
||||
|
||||
snap := s.snapshot(now.Add(2 * time.Second))
|
||||
if snap.LastError == "" {
|
||||
t.Fatalf("precondition: expected lastError after MarkDisconnect, got empty")
|
||||
}
|
||||
|
||||
// Reconnect — lastError must clear.
|
||||
s.MarkConnect(now.Add(3 * time.Second))
|
||||
snap = s.snapshot(now.Add(4 * time.Second))
|
||||
if snap.LastError != "" {
|
||||
t.Errorf("snapshot.LastError = %q after MarkConnect, want empty (sticky-error regression)", snap.LastError)
|
||||
}
|
||||
if !snap.Connected {
|
||||
t.Errorf("snapshot.Connected = false after MarkConnect, want true")
|
||||
}
|
||||
}
|
||||
@@ -43,6 +43,32 @@ type IngestorStatsSnapshot struct {
|
||||
// the server's /api/perf/io endpoint under .ingestor (#1120 — "Both
|
||||
// ingestor and server"). Optional; absent on non-Linux hosts.
|
||||
ProcIO *PerfIOSample `json:"procIO,omitempty"`
|
||||
// WriterPerf is the per-component SQLite writer-lock latency
|
||||
// snapshot (#1340) — wait_ms / hold_ms / contention_total tagged
|
||||
// by component (neighbor_builder, mqtt_handler, prune_packets,
|
||||
// prune_observers, prune_metrics, vacuum). Surfaced by the server
|
||||
// via /api/perf/write-sources under .writer_perf. Optional —
|
||||
// older ingestor builds don't publish this field.
|
||||
WriterPerf map[string]WriterStatsSnapshot `json:"writer_perf,omitempty"`
|
||||
// SourceLiveness (PR #1609 M1) is the per-MQTT-source receipt vs
|
||||
// write-path liveness snapshot. Keyed by source Tag. Surfaced by
|
||||
// the server via /api/healthz under .ingest_liveness so operators
|
||||
// can see "broker alive, write path stuck" (lastReceiptUnix recent,
|
||||
// lastMessageUnix stale) distinct from "everything stalled" (both
|
||||
// stale). Additive: omitempty so older server builds ignore it
|
||||
// gracefully.
|
||||
SourceLiveness map[string]SourceLivenessSnapshot `json:"source_liveness,omitempty"`
|
||||
// SourceStatuses (#1043) is the per-MQTT-source connection state and
|
||||
// counter view consumed by cmd/server's /api/mqtt/status handler.
|
||||
// Additive; omitempty so older server builds ignore it.
|
||||
SourceStatuses []SourceStatusSnapshot `json:"source_statuses,omitempty"`
|
||||
}
|
||||
|
||||
// SourceLivenessSnapshot is the per-source two-clock view exposed for
// /api/healthz consumers. unixSeconds for both fields; 0 means "never".
type SourceLivenessSnapshot struct {
	// LastReceiptUnix is the receipt-side clock — presumably the unix
	// second of the last message received from the broker; confirm
	// against the code that populates it.
	LastReceiptUnix int64 `json:"lastReceiptUnix"`
	// LastMessageUnix is the write-path clock — presumably the unix
	// second of the last message that made it through the write path;
	// confirm against the code that populates it.
	LastMessageUnix int64 `json:"lastMessageUnix"`
}
|
||||
|
||||
// statsFilePath returns the writable path the ingestor will publish stats to.
|
||||
@@ -61,6 +87,25 @@ func statsFilePath() string {
|
||||
|
||||
// writeStatsAtomic writes b to path via a tmp-then-rename, refusing to follow
|
||||
// symlinks on the tmp file. Returns nil on success, an error otherwise.
|
||||
//
|
||||
// Symlink semantics (refs #1170):
|
||||
//
|
||||
// - tmp side (path+".tmp"): protected by O_NOFOLLOW below. If tmp is a
|
||||
// pre-planted symlink, openat fails with ELOOP instead of writing
|
||||
// through it. This is the defensive-coding path that matters when the
|
||||
// default stats path lives under world-writable /tmp.
|
||||
//
|
||||
// - rename side (path): NOT protected by O_NOFOLLOW. Instead, os.Rename's
|
||||
// semantics are relied upon — rename atomically replaces any existing
|
||||
// entry at path (including a symlink) with the new regular file. The
|
||||
// symlink's target is NEVER written through, because all writes happened
|
||||
// to the unrelated tmp file before rename. Post-rename, path is a
|
||||
// regular file (not a symlink) and any prior symlink target's contents
|
||||
// are unchanged. The regression guardrail
|
||||
// TestWriteStatsAtomic_SymlinkAtDestIsReplaced pins this behavior so a
|
||||
// future refactor that swaps os.Rename for a destination-symlink-
|
||||
// following primitive (e.g. an open(path, O_WRONLY) without O_NOFOLLOW)
|
||||
// fails loudly.
|
||||
func writeStatsAtomic(path string, b []byte) error {
|
||||
tmp := path + ".tmp"
|
||||
// O_NOFOLLOW: if tmp is a pre-existing symlink, openat fails with ELOOP
|
||||
@@ -204,6 +249,9 @@ func StartStatsFileWriter(s *Store, interval time.Duration) {
|
||||
GroupCommitFlushes: 0, // group commit reverted (refs #1129)
|
||||
BackfillUpdates: s.Stats.SnapshotBackfills(),
|
||||
ProcIO: ioRate,
|
||||
WriterPerf: s.WriterStatsSnapshot(),
|
||||
SourceLiveness: SnapshotLivenessClocks(),
|
||||
SourceStatuses: SnapshotSourceStatuses(tickAt),
|
||||
}
|
||||
buf.Reset()
|
||||
if err := enc.Encode(&snap); err != nil {
|
||||
|
||||
@@ -96,3 +96,73 @@ func TestStatsFileWriter_PublishesProcIO(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestWriteStatsAtomic_SymlinkAtDestIsReplaced is a regression guardrail for
|
||||
// #1170. The tmp side of writeStatsAtomic uses O_NOFOLLOW so a pre-planted
|
||||
// symlink at path+".tmp" cannot redirect the write — but the rename target
|
||||
// (`path` itself) is not protected by O_NOFOLLOW. Instead, os.Rename's
|
||||
// semantics are relied upon: rename atomically replaces any existing entry
|
||||
// at the destination, including a symlink, with the new regular file. The
|
||||
// original symlink's target is never written through (because the write
|
||||
// happened to the unrelated tmp file).
|
||||
//
|
||||
// This test pre-plants a symlink at `path` pointing to an unrelated target
|
||||
// file and asserts:
|
||||
// (a) post-write, path is a regular file (not a symlink), and
|
||||
// (b) the original target's contents are unchanged.
|
||||
//
|
||||
// If a future refactor swaps os.Rename for something that follows the
|
||||
// destination symlink (e.g. ioutil.WriteFile, or an open(path, O_WRONLY)
|
||||
// without O_NOFOLLOW), this test will fail loudly.
|
||||
func TestWriteStatsAtomic_SymlinkAtDestIsReplaced(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
// Unrelated target file with sentinel bytes. If writeStatsAtomic ever
|
||||
// followed the symlink at `path`, it would overwrite this file.
|
||||
target := filepath.Join(dir, "unrelated-target.bin")
|
||||
sentinel := []byte("DO-NOT-OVERWRITE-ME-#1170")
|
||||
if err := os.WriteFile(target, sentinel, 0o600); err != nil {
|
||||
t.Fatalf("seed target: %v", err)
|
||||
}
|
||||
|
||||
// Pre-plant a symlink at the destination path.
|
||||
path := filepath.Join(dir, "stats.json")
|
||||
if err := os.Symlink(target, path); err != nil {
|
||||
t.Fatalf("symlink: %v", err)
|
||||
}
|
||||
|
||||
payload := []byte(`{"sampledAt":"2026-01-01T00:00:00Z"}`)
|
||||
if err := writeStatsAtomic(path, payload); err != nil {
|
||||
t.Fatalf("writeStatsAtomic: %v", err)
|
||||
}
|
||||
|
||||
// (a) post-write, path must NOT be a symlink.
|
||||
info, err := os.Lstat(path)
|
||||
if err != nil {
|
||||
t.Fatalf("lstat path: %v", err)
|
||||
}
|
||||
if info.Mode()&os.ModeSymlink != 0 {
|
||||
t.Errorf("post-write path is still a symlink (mode=%v); os.Rename should have atomically replaced it with a regular file", info.Mode())
|
||||
}
|
||||
if !info.Mode().IsRegular() {
|
||||
t.Errorf("post-write path is not a regular file (mode=%v)", info.Mode())
|
||||
}
|
||||
|
||||
// Path now contains the new payload.
|
||||
got, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read path: %v", err)
|
||||
}
|
||||
if string(got) != string(payload) {
|
||||
t.Errorf("path contents: want %q, got %q", payload, got)
|
||||
}
|
||||
|
||||
// (b) the original symlink target must be unchanged.
|
||||
gotTarget, err := os.ReadFile(target)
|
||||
if err != nil {
|
||||
t.Fatalf("read target: %v", err)
|
||||
}
|
||||
if string(gotTarget) != string(sentinel) {
|
||||
t.Errorf("symlink target was clobbered: want %q, got %q", sentinel, gotTarget)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,14 @@ type analyticsRecomputer struct {
|
||||
// Stats (atomic).
|
||||
computeRuns atomic.Int64
|
||||
lastComputeNs atomic.Int64 // duration of last compute in nanoseconds
|
||||
|
||||
// Issue #1659 (PR #1688 r1) — warmup gate state, inlined here so
|
||||
// hot-path readers (IsWarmingUp_1659) do lock-free atomic loads
|
||||
// only (replaces the r0 package-level map + chanLock). See
|
||||
// analytics_warmup_1659.go for full design notes.
|
||||
firstPassDoneNs atomic.Int64
|
||||
warmupStartedNs atomic.Int64
|
||||
warmupReadyGate atomic.Value // *func() bool — gate must return true for markFirstPassDone to take effect
|
||||
}
|
||||
|
||||
// newAnalyticsRecomputer constructs an unstarted recomputer.
|
||||
@@ -68,6 +76,11 @@ func newAnalyticsRecomputer(name string, interval time.Duration, compute func()
|
||||
// Calling Start multiple times is a no-op after the first call.
|
||||
func (r *analyticsRecomputer) Start() {
|
||||
r.startOnce.Do(func() {
|
||||
// Issue #1659 (#1688 munger #2): record warmup-start before
|
||||
// the first compute, so IsWarmingUp_1659's fallback timeout
|
||||
// is measured from "recomputer started" — not "first pass
|
||||
// returned", which never happens if compute() hangs.
|
||||
r.noteWarmupStart_1659()
|
||||
// Initial synchronous compute — first read must NOT see empty
|
||||
// or uninitialized data (acceptance criterion #1240).
|
||||
r.runOnce()
|
||||
@@ -95,7 +108,10 @@ func (r *analyticsRecomputer) runOnce() {
|
||||
}
|
||||
defer func() {
|
||||
// Don't let a compute panic kill the background goroutine.
|
||||
// The previous snapshot remains valid.
|
||||
// The previous snapshot remains valid. Even on panic, we
|
||||
// still want IsWarmingUp_1659's fallback timeout to be the
|
||||
// safety net (a perpetually panicking compute would never
|
||||
// reach markFirstPassDone otherwise).
|
||||
_ = recover()
|
||||
}()
|
||||
t0 := time.Now()
|
||||
@@ -105,6 +121,16 @@ func (r *analyticsRecomputer) runOnce() {
|
||||
if result != nil {
|
||||
r.cache.Store(result)
|
||||
}
|
||||
// Issue #1659: mark the first-pass clock so the warmup gate
|
||||
// in GetAnalyticsRFWithWindow / Topology / Channels handlers
|
||||
// can flip from 503-Retry-After to serving the cache.
|
||||
//
|
||||
// PR #1688 r1: called on EVERY successful pass (even nil
|
||||
// result) so a compute that returns nil but doesn't panic
|
||||
// still lifts the gate — banner-stuck-forever fix (munger #2).
|
||||
// The markFirstPassDone helper is idempotent and additionally
|
||||
// consults the chunked-loader readiness gate (munger #5).
|
||||
r.markFirstPassDone_1659()
|
||||
}
|
||||
|
||||
// Load returns the most recently computed snapshot, or nil if Start
|
||||
@@ -242,6 +268,19 @@ func (s *PacketStore) StartAnalyticsRecomputers(defaultInterval time.Duration, o
|
||||
}
|
||||
s.analyticsRecomputerMu.Unlock()
|
||||
|
||||
// Issue #1659 (PR #1688 r1, munger #5): wire the chunked-loader
|
||||
// readiness gate on the three warmup-gated recomputers (RF,
|
||||
// Topology, Channels). markFirstPassDone_1659 will refuse to
|
||||
// flip first-pass-done until s.LoadComplete() reports true —
|
||||
// i.e. the cold-load has populated all observations. Otherwise
|
||||
// the FIRST recomputer pass runs against the post-restart in-RAM
|
||||
// slice and the gate opens on partial data (the original #1659
|
||||
// bug class).
|
||||
loadCompleteGate := s.LoadComplete
|
||||
s.recompRF.setWarmupReadyGate_1659(loadCompleteGate)
|
||||
s.recompTopology.setWarmupReadyGate_1659(loadCompleteGate)
|
||||
s.recompChannels.setWarmupReadyGate_1659(loadCompleteGate)
|
||||
|
||||
for _, rc := range all {
|
||||
rc.Start()
|
||||
}
|
||||
|
||||
@@ -0,0 +1,212 @@
|
||||
// Package main: issue #1659 — analytics warmup gating.
|
||||
//
|
||||
// Problem: after server restart, recompRF (and recompTopology /
|
||||
// recompChannels) cache the FIRST computation, which immediately after
|
||||
// boot is just the small in-RAM-observations slice (background
|
||||
// chunk-loader has not yet backfilled history). The recomputer then
|
||||
// serves that small slice from GetAnalyticsRFWithWindow's default
|
||||
// shortcut for an entire recompute interval, while the client pins it
|
||||
// via CLIENT_TTL.analyticsRF. UX: cards show a tiny "post-restart"
|
||||
// window even when the user selects "All data".
|
||||
//
|
||||
// Fix (r1 — addresses #1688 review munger #5):
|
||||
//
|
||||
// The first-pass-done signal is NOT enough on its own — the FIRST
|
||||
// recomputer pass at boot can complete against the post-restart slice
|
||||
// BEFORE the chunked loader (#1008 / chunked_load.go) has populated
|
||||
// the full observation set. Marking the gate ready in that window
|
||||
// reproduces the original #1659 bug.
|
||||
//
|
||||
// Two correctness invariants:
|
||||
//
|
||||
// 1. (#1688 munger #5) Only mark first-pass-done when BOTH:
|
||||
// a. a recomputer pass has completed, AND
|
||||
// b. the chunked loader has finished (s.LoadComplete()).
|
||||
// The gate's `readyGate` callback is wired by
|
||||
// StartAnalyticsRecomputers to `store.LoadComplete`. Passes that
|
||||
// complete while loadComplete is still false leave the gate in
|
||||
// the warming-up state; the NEXT pass after loadComplete flips
|
||||
// true is the one that opens the gate.
|
||||
//
|
||||
// 2. (#1688 munger #2 + kent-beck #2) The gate MUST lift in bounded
|
||||
// time. If compute() panics on every pass, hangs indefinitely,
|
||||
// or returns nil forever, an unguarded gate would leave the
|
||||
// 503 banner permanent. Two safeguards:
|
||||
// a. compute() panics are already caught by runOnce()'s
|
||||
// defer recover(); we additionally call markFirstPassDone
|
||||
// on EVERY pass (even nil-result), so a recomputer that
|
||||
// returns nil but doesn't panic still flips the gate.
|
||||
// b. A hard fallback timeout (warmupForceTimeout, 60s by
|
||||
// default) elapsed since the recomputer was started (the clock is set by noteWarmupStart_1659)
|
||||
// forces IsWarmingUp_1659() to false — degraded mode
|
||||
// (serve whatever cache exists, possibly empty) is
|
||||
// strictly better than a permanent 503.
|
||||
//
|
||||
// Concurrency (#1688 munger #3):
|
||||
//
|
||||
// The previous r0 design used a package-level map keyed by recomputer
|
||||
// pointer, guarded by a global chanLock. Every default-shape analytics
|
||||
// request acquired that lock — a serialization point on a hot path.
|
||||
//
|
||||
// r1 inlines the warmup fields directly on `analyticsRecomputer`:
|
||||
// - firstPassDoneNs atomic.Int64
|
||||
// - warmupStartedNs atomic.Int64
|
||||
// - warmupReadyGate atomic.Value (holds *func() bool; a nil pointer means "no gate")
|
||||
//
|
||||
// Reads on the hot path are lock-free atomic loads. No package-level
|
||||
// state, no map lookups, no mutex.
|
||||
//
|
||||
// Tests: analytics_warmup_1659_test.go.
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// warmupForceTimeout bounds how long IsWarmingUp_1659() may keep
// reporting true when no qualifying first pass has been recorded.
// Once this much time has elapsed since the warmup start was noted,
// the gate is treated as open so clients get degraded analytics
// (possibly empty) instead of a permanent 503 banner.
//
// Declared as a variable rather than a constant so tests can shorten it.
var warmupForceTimeout = time.Minute
|
||||
|
||||
// setWarmupReadyGate wires a callback that the recomputer consults
|
||||
// before honoring a markFirstPassDone_1659() request. When the gate
|
||||
// returns false, the warmup state is preserved across the pass —
|
||||
// equivalent to "this pass doesn't count; we need at least one pass
|
||||
// AFTER the gate flips true".
|
||||
//
|
||||
// nil callback means "no extra gating" (legacy behavior).
|
||||
//
|
||||
// Called from StartAnalyticsRecomputers; safe to call before Start().
|
||||
func (r *analyticsRecomputer) setWarmupReadyGate_1659(gate func() bool) {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
if gate == nil {
|
||||
r.warmupReadyGate.Store((*func() bool)(nil))
|
||||
return
|
||||
}
|
||||
r.warmupReadyGate.Store(&gate)
|
||||
}
|
||||
|
||||
func (r *analyticsRecomputer) loadWarmupReadyGate_1659() func() bool {
|
||||
v := r.warmupReadyGate.Load()
|
||||
if v == nil {
|
||||
return nil
|
||||
}
|
||||
p, ok := v.(*func() bool)
|
||||
if !ok || p == nil {
|
||||
return nil
|
||||
}
|
||||
return *p
|
||||
}
|
||||
|
||||
// markFirstPassDone_1659 is called from analyticsRecomputer.runOnce()
|
||||
// after every compute attempt (success OR nil result; panics are
|
||||
// caught upstream and never reach here).
|
||||
//
|
||||
// The gate flip is conditional on the readyGate (when set) reporting
|
||||
// true — this implements the munger #5 fix: first-pass-done must
|
||||
// require BOTH a recomputer pass complete AND the chunked loader to
|
||||
// have finished populating the in-RAM observation set.
|
||||
//
|
||||
// Idempotent: only the FIRST successful flip wins; subsequent calls
|
||||
// observe a non-zero firstPassDoneNs and return immediately.
|
||||
func (r *analyticsRecomputer) markFirstPassDone_1659() {
|
||||
if r.firstPassDoneNs.Load() != 0 {
|
||||
return
|
||||
}
|
||||
if gate := r.loadWarmupReadyGate_1659(); gate != nil && !gate() {
|
||||
return
|
||||
}
|
||||
r.firstPassDoneNs.CompareAndSwap(0, time.Now().UnixNano())
|
||||
}
|
||||
|
||||
// FirstPassDoneAt_1659 reports the time the first full compute pass
|
||||
// completed (subject to the readyGate). Returns zero time if no
|
||||
// qualifying pass has completed yet.
|
||||
func (r *analyticsRecomputer) FirstPassDoneAt_1659() time.Time {
|
||||
if r == nil {
|
||||
return time.Time{}
|
||||
}
|
||||
ns := r.firstPassDoneNs.Load()
|
||||
if ns == 0 {
|
||||
return time.Time{}
|
||||
}
|
||||
return time.Unix(0, ns)
|
||||
}
|
||||
|
||||
// IsWarmingUp_1659 reports true when the recomputer has not yet
|
||||
// completed a qualifying first pass AND the fallback timeout has not
|
||||
// yet elapsed. Handlers for the default-shape request must return
|
||||
// 503 + Retry-After: 5 while this is true.
|
||||
//
|
||||
// Fallback timeout (warmupForceTimeout) prevents a permanent 503 in
|
||||
// pathological compute paths (perpetual panic, perpetual nil, hang).
|
||||
//
|
||||
// Lock-free: pure atomic loads.
|
||||
func (r *analyticsRecomputer) IsWarmingUp_1659() bool {
|
||||
if r == nil {
|
||||
// No recomputer registered → treat as ready; the handler
|
||||
// falls through to the legacy compute path.
|
||||
return false
|
||||
}
|
||||
if r.firstPassDoneNs.Load() != 0 {
|
||||
return false
|
||||
}
|
||||
startedNs := r.warmupStartedNs.Load()
|
||||
if startedNs != 0 {
|
||||
if time.Since(time.Unix(0, startedNs)) >= warmupForceTimeout {
|
||||
// Forced-ready: gate has been stuck too long. Stop
|
||||
// serving 503; let the handler serve whatever is in
|
||||
// the cache (possibly empty).
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// noteWarmupStart_1659 records the moment the recomputer was launched
|
||||
// (called once from Start). Used by IsWarmingUp_1659 to compute the
|
||||
// fallback-timeout elapsed window.
|
||||
func (r *analyticsRecomputer) noteWarmupStart_1659() {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
r.warmupStartedNs.CompareAndSwap(0, time.Now().UnixNano())
|
||||
}
|
||||
|
||||
// writeAnalyticsWarmup503 writes the standard "still warming up"
// response: HTTP 503 with Retry-After: 5, a JSON content type, and a
// body carrying an error string plus retry_after_s. The write error
// is deliberately ignored; there is nothing useful to do if the
// client has gone away.
func writeAnalyticsWarmup503(w http.ResponseWriter) {
	const body = `{"error":"analytics warming up","retry_after_s":5}`

	hdr := w.Header()
	hdr.Set("Retry-After", "5")
	hdr.Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusServiceUnavailable)
	_, _ = w.Write([]byte(body))
}
|
||||
|
||||
// installWarmupBlocker_1659 is a test-only helper that registers the
|
||||
// RF / topology / channels recomputers with a compute function that
|
||||
// blocks on the supplied channel. firstPassDoneNs therefore stays
|
||||
// zero, simulating the post-restart warmup window for the warmup test.
|
||||
//
|
||||
// We bypass StartAnalyticsRecomputers entirely and wire the
|
||||
// recomputers manually so the background goroutines never fire. The
|
||||
// test only needs the *analyticsRecomputer pointers to be non-nil and
|
||||
// in the warmup state.
|
||||
func (s *PacketStore) installWarmupBlocker_1659(block <-chan struct{}) {
|
||||
blockCompute := func() interface{} {
|
||||
<-block
|
||||
return nil
|
||||
}
|
||||
s.analyticsRecomputerMu.Lock()
|
||||
defer s.analyticsRecomputerMu.Unlock()
|
||||
s.recompRF = newAnalyticsRecomputer("rf-test-block", time.Hour, blockCompute)
|
||||
s.recompTopology = newAnalyticsRecomputer("topo-test-block", time.Hour, blockCompute)
|
||||
s.recompChannels = newAnalyticsRecomputer("chan-test-block", time.Hour, blockCompute)
|
||||
// Do NOT call Start() — leaving firstPassDoneNs at zero is exactly
|
||||
// the warmup state the test wants to exercise.
|
||||
}
|
||||
@@ -0,0 +1,330 @@
|
||||
// Package main: issue #1659 — analytics warmup gating.
|
||||
//
|
||||
// After a server restart, the analytics recomputer caches the FIRST
|
||||
// computation (a small in-RAM slice) and serves it via the default
|
||||
// region="", zero-window shortcut in GetAnalyticsRFWithWindow until the
|
||||
// next periodic recompute fires. The client-side CLIENT_TTL.analyticsRF
|
||||
// then pins that small slice on the page even after the server flips
|
||||
// to steady-state.
|
||||
//
|
||||
// Fix: each recomputer carries a firstPassDoneAt timestamp set ONLY
|
||||
// after a full-range compute completes. While firstPassDoneAt is zero
|
||||
// AND the request is the default-shape (region="" && area="" &&
|
||||
// window.IsZero()), the handler returns 503 + Retry-After: 5 with a
|
||||
// JSON body the client recognizes and retries with backoff.
|
||||
//
|
||||
// These tests are the RED contract: they must FAIL on the assertion
|
||||
// (not a build error) when the warmup gate is absent, and PASS once
|
||||
// the fix lands.
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// TestAnalyticsRF_WarmupReturns503 asserts that immediately after the
|
||||
// server starts — before any analytics recomputer has finished its
|
||||
// first full-range pass — GET /api/analytics/rf returns 503 with
|
||||
// Retry-After: 5 and a JSON body shaped as
|
||||
// {"error":"analytics warming up","retry_after_s":5}.
|
||||
//
|
||||
// This is the core acceptance criterion (c) from #1659.
|
||||
func TestAnalyticsRF_WarmupReturns503(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
// Register recomputers but DO NOT let them complete a first pass.
|
||||
// We install a compute func that blocks until we release it, so the
|
||||
// recomputer's firstPassDoneAt stays zero.
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
store.installWarmupBlocker_1659(block) // helper added in GREEN
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/rf", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("expected 503 during warmup, got %d (body=%s)", w.Code, w.Body.String())
|
||||
}
|
||||
if got := w.Header().Get("Retry-After"); got != "5" {
|
||||
t.Fatalf("expected Retry-After: 5, got %q", got)
|
||||
}
|
||||
var resp map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("invalid JSON body: %v (raw=%s)", err, w.Body.String())
|
||||
}
|
||||
if resp["error"] != "analytics warming up" {
|
||||
t.Fatalf("expected error='analytics warming up', got %v", resp["error"])
|
||||
}
|
||||
if v, ok := resp["retry_after_s"].(float64); !ok || v != 5 {
|
||||
t.Fatalf("expected retry_after_s=5, got %v", resp["retry_after_s"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestAnalyticsRF_AfterFirstPassReturns200 asserts the post-warmup
|
||||
// happy path: once the recomputer's first full-range compute completes,
|
||||
// the handler serves the cached snapshot as 200.
|
||||
func TestAnalyticsRF_AfterFirstPassReturns200(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
// #1688 r1: the warmup gate now ALSO requires LoadComplete() to be
|
||||
// true before first-pass-done flips (munger #5). Tests that don't
|
||||
// exercise the chunked loader must flip it manually to model a
|
||||
// production server that has finished cold-loading.
|
||||
store.loadComplete.Store(true)
|
||||
|
||||
stop := store.StartAnalyticsRecomputers(50 * time.Millisecond)
|
||||
defer stop()
|
||||
|
||||
// Wait for the synchronous first-pass to complete. Start() runs
|
||||
// the initial compute synchronously, so by the time it returns
|
||||
// firstPassDoneAt should be set. We poll a brief moment to keep
|
||||
// the test robust to scheduling.
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if store.recompRF != nil && !store.recompRF.FirstPassDoneAt_1659().IsZero() {
|
||||
break
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
if store.recompRF == nil || store.recompRF.FirstPassDoneAt_1659().IsZero() {
|
||||
t.Fatal("recompRF.firstPassDoneAt never flipped after Start()")
|
||||
}
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/rf", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200 after first pass, got %d (body=%s)", w.Code, w.Body.String())
|
||||
}
|
||||
if got := w.Header().Get("Retry-After"); got != "" {
|
||||
t.Fatalf("expected no Retry-After header on 200, got %q", got)
|
||||
}
|
||||
// Body should be a valid JSON object (the RF analytics map).
|
||||
var resp map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("invalid JSON body: %v", err)
|
||||
}
|
||||
if len(resp) == 0 {
|
||||
t.Fatal("expected non-empty RF analytics response after first pass")
|
||||
}
|
||||
}
|
||||
|
||||
// TestAnalyticsRF_WindowedRequestNotGated asserts that even during
|
||||
// warmup, a request with an explicit time window (?since=/?until=) or
|
||||
// region/area filter is NOT gated by the warmup flag — those queries
|
||||
// bypass the recomputer entirely and hit the legacy compute-then-cache
|
||||
// path, which is unaffected by the first-pass bug.
|
||||
func TestAnalyticsRF_WindowedRequestNotGated(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
store.installWarmupBlocker_1659(block)
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
// Explicit window — should bypass warmup gate.
|
||||
req := httptest.NewRequest("GET", "/api/analytics/rf?window=1h", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code == http.StatusServiceUnavailable {
|
||||
t.Fatalf("windowed request must NOT be gated by warmup (got 503)")
|
||||
}
|
||||
}
|
||||
|
||||
// === PR #1688 r1 — new test cases ===
|
||||
|
||||
// TestAnalyticsTopology_WarmupReturns503 — kent-beck #1: topology
|
||||
// gate is symmetric with RF; assert the same 503 contract.
|
||||
func TestAnalyticsTopology_WarmupReturns503(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
store.installWarmupBlocker_1659(block)
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/topology", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("topology: expected 503 during warmup, got %d", w.Code)
|
||||
}
|
||||
if got := w.Header().Get("Retry-After"); got != "5" {
|
||||
t.Fatalf("topology: expected Retry-After: 5, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAnalyticsChannels_WarmupReturns503 — kent-beck #1: channels
|
||||
// gate is symmetric with RF; assert the same 503 contract.
|
||||
func TestAnalyticsChannels_WarmupReturns503(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
store.installWarmupBlocker_1659(block)
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/channels", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("channels: expected 503 during warmup, got %d", w.Code)
|
||||
}
|
||||
if got := w.Header().Get("Retry-After"); got != "5" {
|
||||
t.Fatalf("channels: expected Retry-After: 5, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarmup_GateBlockedUntilLoadComplete — munger #5 correctness:
|
||||
// the chunked loader readiness MUST gate first-pass-done. A recomputer
|
||||
// pass that completes while LoadComplete() is false must NOT lift the
|
||||
// gate; a SUBSEQUENT pass after LoadComplete() flips true must lift it.
|
||||
func TestWarmup_GateBlockedUntilLoadComplete(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
// LoadComplete starts false — chunked loader still running.
|
||||
|
||||
called := make(chan struct{}, 16)
|
||||
rc := newAnalyticsRecomputer("test-rf", time.Hour, func() interface{} {
|
||||
called <- struct{}{}
|
||||
return map[string]int{"x": 1}
|
||||
})
|
||||
rc.setWarmupReadyGate_1659(store.LoadComplete)
|
||||
rc.Start()
|
||||
defer rc.Stop()
|
||||
|
||||
// First pass already ran synchronously in Start(). Gate must still
|
||||
// be warming up because LoadComplete() is false.
|
||||
<-called
|
||||
if !rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected IsWarmingUp_1659=true while LoadComplete()=false (munger #5 bug)")
|
||||
}
|
||||
if !rc.FirstPassDoneAt_1659().IsZero() {
|
||||
t.Fatalf("expected FirstPassDoneAt zero while LoadComplete()=false")
|
||||
}
|
||||
|
||||
// Now flip the loader and trigger another pass.
|
||||
store.loadComplete.Store(true)
|
||||
rc.runOnce()
|
||||
if rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected gate to lift after LoadComplete()=true + another pass")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarmup_NilResultStillLiftsGate — munger #2 / kent-beck #2:
|
||||
// a compute that returns nil but doesn't panic must still flip the
|
||||
// gate (the cache stays empty but the banner does NOT get stuck).
|
||||
func TestWarmup_NilResultStillLiftsGate(t *testing.T) {
|
||||
rc := newAnalyticsRecomputer("test-nil", time.Hour, func() interface{} {
|
||||
return nil
|
||||
})
|
||||
rc.Start()
|
||||
defer rc.Stop()
|
||||
|
||||
if rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("nil-result compute must still lift warmup gate after first pass")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarmup_PanicEventuallyLiftsGate — munger #2 / kent-beck #2:
|
||||
// a compute that ALWAYS panics must not leave the gate stuck forever.
|
||||
// The fallback timeout (warmupForceTimeout) is the safety net.
|
||||
func TestWarmup_PanicEventuallyLiftsGate(t *testing.T) {
|
||||
prev := warmupForceTimeout
|
||||
warmupForceTimeout = 50 * time.Millisecond
|
||||
defer func() { warmupForceTimeout = prev }()
|
||||
|
||||
rc := newAnalyticsRecomputer("test-panic", time.Hour, func() interface{} {
|
||||
panic("compute boom")
|
||||
})
|
||||
rc.Start()
|
||||
defer rc.Stop()
|
||||
|
||||
// Panic was recovered inside runOnce; firstPassDoneNs is still 0.
|
||||
if rc.FirstPassDoneAt_1659().IsZero() == false {
|
||||
t.Fatalf("panicking compute should not have set firstPassDoneNs")
|
||||
}
|
||||
// But after warmupForceTimeout elapses, the gate must lift.
|
||||
time.Sleep(80 * time.Millisecond)
|
||||
if rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected fallback timeout to lift gate after warmupForceTimeout (got still-warming)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarmup_TimeoutLiftsHangingCompute — munger #2 / kent-beck #2:
|
||||
// hung compute (blocks indefinitely on a channel) must not result in
|
||||
// permanent 503. Fallback timeout lifts it.
|
||||
func TestWarmup_TimeoutLiftsHangingCompute(t *testing.T) {
|
||||
prev := warmupForceTimeout
|
||||
warmupForceTimeout = 50 * time.Millisecond
|
||||
defer func() { warmupForceTimeout = prev }()
|
||||
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
rc := newAnalyticsRecomputer("test-hang", time.Hour, func() interface{} {
|
||||
<-block
|
||||
return nil
|
||||
})
|
||||
// Don't call Start (would block forever on synchronous initial
|
||||
// compute). Just simulate "we noted warmup start, compute is
|
||||
// hanging in another goroutine".
|
||||
rc.noteWarmupStart_1659()
|
||||
go rc.runOnce()
|
||||
|
||||
if !rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected initial state to be warming-up")
|
||||
}
|
||||
time.Sleep(80 * time.Millisecond)
|
||||
if rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected fallback timeout to lift hung-compute warmup")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
package main
|
||||
|
||||
// Issue #1551: /api/* responses must emit Cache-Control: no-store so
|
||||
// CDNs (Cloudflare, nginx, Varnish) do not cache JSON. Static assets
|
||||
// (app.js, /, etc.) intentionally remain CDN-cacheable.
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// TestAPIRoutesEmitNoStoreCacheControl asserts every covered /api/*
|
||||
// endpoint sets Cache-Control: no-store. This is a black-box test
|
||||
// against the real router, exercising whatever middleware chain is
|
||||
// wired by RegisterRoutes.
|
||||
func TestAPIRoutesEmitNoStoreCacheControl(t *testing.T) {
|
||||
_, router := setupTestServer(t)
|
||||
|
||||
apiPaths := []string{
|
||||
"/api/stats",
|
||||
"/api/observers",
|
||||
"/api/packets?limit=10",
|
||||
"/api/nodes?limit=10",
|
||||
}
|
||||
|
||||
for _, p := range apiPaths {
|
||||
t.Run(p, func(t *testing.T) {
|
||||
req := httptest.NewRequest("GET", p, nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("%s: expected 200, got %d (body: %s)", p, w.Code, w.Body.String())
|
||||
}
|
||||
cc := w.Header().Get("Cache-Control")
|
||||
if cc != "no-store" {
|
||||
t.Errorf("%s: expected Cache-Control: no-store, got %q", p, cc)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestStaticAssetsDoNotEmitNoStore guards against scope creep: the
|
||||
// no-store middleware must be scoped to /api/* only. Static assets
|
||||
// (HTML, JS, CSS) keep their existing browser-cache headers
|
||||
// ("no-cache, no-store, must-revalidate" today via spaHandler) and
|
||||
// must NOT be downgraded to bare "no-store" by the API middleware —
|
||||
// i.e. the API middleware must not run on these paths. If a future
|
||||
// change moves static assets behind no-store middleware, CDN caching
|
||||
// of immutable hashed assets breaks; assert the contract explicitly.
|
||||
func TestStaticAssetsDoNotEmitBareNoStore(t *testing.T) {
|
||||
// Build a temp public dir so spaHandler has real files to serve.
|
||||
dir := t.TempDir()
|
||||
if err := os.WriteFile(filepath.Join(dir, "index.html"), []byte("<html>SPA</html>"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(dir, "app.js"), []byte("console.log('app')"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
_, router := setupTestServer(t)
|
||||
// Wire the SPA handler exactly the way main.go does for non-/api paths.
|
||||
fs := http.FileServer(http.Dir(dir))
|
||||
router.PathPrefix("/").Handler(spaHandler(dir, fs))
|
||||
|
||||
cases := []struct {
|
||||
path string
|
||||
wantCacheCC string
|
||||
}{
|
||||
// spaHandler sets this exact value for HTML/JS/CSS.
|
||||
{"/app.js", "no-cache, no-store, must-revalidate"},
|
||||
{"/", "no-cache, no-store, must-revalidate"},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run(c.path, func(t *testing.T) {
|
||||
req := httptest.NewRequest("GET", c.path, nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
cc := w.Header().Get("Cache-Control")
|
||||
if cc == "no-store" {
|
||||
t.Errorf("%s: API no-store middleware leaked onto static asset (got bare %q, expected %q)", c.path, cc, c.wantCacheCC)
|
||||
}
|
||||
if cc != c.wantCacheCC {
|
||||
t.Errorf("%s: expected Cache-Control %q, got %q", c.path, c.wantCacheCC, cc)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure mux import used (test compiles even if setupTestServer signature
|
||||
// changes).
|
||||
var _ = mux.NewRouter
|
||||
@@ -0,0 +1,87 @@
|
||||
package main
|
||||
|
||||
// Issue #1561: detect CDN-fronted deployments and warn ONCE.
|
||||
//
|
||||
// When operators put CoreScope behind Cloudflare/Fastly without
|
||||
// configuring a /api/* cache bypass, dashboards go stale — the origin
|
||||
// emits Cache-Control: no-store (#1551), but the CDN's zone-level
|
||||
// caching policy can still cache JSON responses for hours
|
||||
// (cf-cache-status: HIT, age > 0). We can't fix the CDN config from
|
||||
// the server side; the best we can do is detect the situation and
|
||||
// loudly tell the operator at the logs.
|
||||
//
|
||||
// Detection: presence of any CDN-specific request header
|
||||
// (CF-Connecting-IP, CF-Ray, Fastly-Client-IP, True-Client-IP).
|
||||
// We deliberately exclude X-Forwarded-For and X-Real-IP: every
|
||||
// generic reverse proxy (nginx, Caddy, Traefik, k8s ingress) sets
|
||||
// those, so including them would warn operators who aren't behind
|
||||
// a CDN at all and train them to ignore the warning entirely
|
||||
// (defeating the point of #1561).
|
||||
//
|
||||
// Side effects: a single log line per process boot — never blocks
|
||||
// the request, never modifies the response, never logs again.
|
||||
|
||||
import (
|
||||
"log"
|
||||
"net/http"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
var (
	// cdnWarnOnce ensures the CDN warning is logged at most once per
	// process, even if several first requests race.
	cdnWarnOnce sync.Once

	// cdnWarned becomes true once the first CDN-fronted request has
	// been seen and logged. cdnDetectionMiddleware checks it first so
	// that, after the warning, requests skip the per-request header
	// scan in firstCDNHeader.
	cdnWarned atomic.Bool
)
|
||||
|
||||
// cdnHeaders lists the request headers treated as evidence that a
// request arrived through a CDN. Lookups go through http.Header.Get,
// which canonicalizes the name, so casing here is cosmetic.
//
// X-Forwarded-For and X-Real-IP are deliberately absent: generic
// reverse proxies (nginx, Caddy, Traefik, k8s ingress) set them too,
// so they would produce a false positive on every reverse-proxied
// install (issue #1561 round-1 review).
//
// Order matters only for which header name appears in the warning.
var cdnHeaders = []string{
	// Cloudflare
	"CF-Connecting-IP",
	"CF-Ray",
	// Fastly
	"Fastly-Client-IP",
	// Akamai (also set by Cloudflare Enterprise)
	"True-Client-IP",
}
|
||||
|
||||
// cdnDetectionMiddleware inspects each incoming request for CDN
|
||||
// headers and, on the FIRST one observed, logs a single warning
|
||||
// pointing the operator at docs/deployment-behind-cdn.md. The
|
||||
// middleware always calls next; it never blocks or rewrites.
|
||||
func cdnDetectionMiddleware(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Fast path: once we've warned, skip the per-request header
|
||||
// scan entirely. Steady state for any CDN-fronted deploy is
|
||||
// ~every request hitting this branch.
|
||||
if cdnWarned.Load() {
|
||||
next.ServeHTTP(w, r)
|
||||
return
|
||||
}
|
||||
if hdr := firstCDNHeader(r.Header); hdr != "" {
|
||||
cdnWarnOnce.Do(func() {
|
||||
log.Printf("[security] WARNING: detected request via CDN (%s header present). "+
|
||||
"Ensure /api/* is bypassed in your CDN config — see docs/deployment-behind-cdn.md. "+
|
||||
"Cached API responses cause observer-flap and incorrect dashboards.", hdr)
|
||||
cdnWarned.Store(true)
|
||||
})
|
||||
}
|
||||
next.ServeHTTP(w, r)
|
||||
})
|
||||
}
|
||||
|
||||
func firstCDNHeader(h http.Header) string {
|
||||
for _, name := range cdnHeaders {
|
||||
if h.Get(name) != "" {
|
||||
return name
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
@@ -0,0 +1,276 @@
|
||||
package main
|
||||
|
||||
// Issue #1561: When the server is fronted by a CDN (Cloudflare, Fastly,
|
||||
// Akamai) we cannot guarantee /api/* responses are not cached unless
|
||||
// the operator configures a bypass rule. Detect CDN-specific request
|
||||
// headers at the first such request and log a one-shot warning
|
||||
// pointing the operator at the bypass doc.
|
||||
//
|
||||
// Contract:
|
||||
// - Warning logs ONLY when a CDN-specific header is present
|
||||
// (CF-Connecting-IP, CF-Ray, Fastly-Client-IP, True-Client-IP).
|
||||
// - Generic reverse-proxy headers (X-Forwarded-For, X-Real-IP) MUST
|
||||
// NOT trigger the warning — every nginx/Caddy/Traefik/k8s install
|
||||
// sets those, so warning on them defeats the entire signal.
|
||||
// - Warning logs at most ONCE per process boot (sync.Once), even
|
||||
// under concurrent first-request load.
|
||||
// - Middleware NEVER blocks the request — it always calls
|
||||
// next.ServeHTTP.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// resetCDNDetectionOnce restores a fresh sync.Once so each test starts
|
||||
// from a clean "have not warned yet" state.
|
||||
func resetCDNDetectionOnce() {
|
||||
cdnWarnOnce = sync.Once{}
|
||||
cdnWarned.Store(false)
|
||||
}
|
||||
|
||||
// runWithCDNMiddleware fires the request through the middleware and
|
||||
// returns (log output, whether next was called). The sentinel proves
|
||||
// the middleware did not silently drop the request.
|
||||
func runWithCDNMiddleware(t *testing.T, req *http.Request) (string, bool) {
|
||||
t.Helper()
|
||||
var buf bytes.Buffer
|
||||
prev := log.Writer()
|
||||
log.SetOutput(&buf)
|
||||
defer log.SetOutput(prev)
|
||||
|
||||
nextCalled := false
|
||||
h := cdnDetectionMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
nextCalled = true
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
w := httptest.NewRecorder()
|
||||
h.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("middleware must not block request; got status %d", w.Code)
|
||||
}
|
||||
return buf.String(), nextCalled
|
||||
}
|
||||
|
||||
func TestCDNDetection_LogsOnCFRayHeader(t *testing.T) {
|
||||
resetCDNDetectionOnce()
|
||||
req := httptest.NewRequest("GET", "/api/observers", nil)
|
||||
req.Header.Set("CF-Ray", "abc123-LAX")
|
||||
|
||||
out, nextCalled := runWithCDNMiddleware(t, req)
|
||||
|
||||
if !nextCalled {
|
||||
t.Fatal("middleware did not call next handler")
|
||||
}
|
||||
if !strings.Contains(out, "detected request via CDN") {
|
||||
t.Errorf("expected log to contain 'detected request via CDN', got: %q", out)
|
||||
}
|
||||
if !strings.Contains(out, "deployment-behind-cdn") {
|
||||
t.Errorf("expected log to reference deployment-behind-cdn doc, got: %q", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCDNDetection_SilentWithoutCDNHeader(t *testing.T) {
|
||||
resetCDNDetectionOnce()
|
||||
req := httptest.NewRequest("GET", "/api/observers", nil)
|
||||
// No CDN-typical headers set.
|
||||
|
||||
out, nextCalled := runWithCDNMiddleware(t, req)
|
||||
|
||||
if !nextCalled {
|
||||
t.Fatal("middleware did not call next handler")
|
||||
}
|
||||
if strings.Contains(out, "detected request via CDN") {
|
||||
t.Errorf("expected no CDN warning without CDN headers, got: %q", out)
|
||||
}
|
||||
}
|
||||
|
||||
// Regression for round-1 adversarial finding: generic reverse-proxy
|
||||
// headers must NOT trigger the warning. Every nginx/Caddy/Traefik/
|
||||
// k8s-ingress reverse proxy sets X-Forwarded-For and X-Real-IP, so
|
||||
// flagging them produces a false positive on every reverse-proxied
|
||||
// install and trains operators to ignore the warning.
|
||||
func TestCDNDetection_SilentOnReverseProxyHeadersAlone(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
header string
|
||||
}{
|
||||
{"x-forwarded-for-alone", "X-Forwarded-For"},
|
||||
{"x-real-ip-alone", "X-Real-IP"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
resetCDNDetectionOnce()
|
||||
req := httptest.NewRequest("GET", "/api/observers", nil)
|
||||
req.Header.Set(tc.header, "10.0.0.1")
|
||||
// No CDN-specific headers — just the generic reverse-proxy one.
|
||||
|
||||
out, nextCalled := runWithCDNMiddleware(t, req)
|
||||
|
||||
if !nextCalled {
|
||||
t.Fatal("middleware did not call next handler")
|
||||
}
|
||||
if strings.Contains(out, "detected request via CDN") {
|
||||
t.Errorf("header %s alone must NOT trigger CDN warning (would false-positive every nginx/k8s deploy); got: %q", tc.header, out)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// When a CDN-specific header is present alongside generic proxy
|
||||
// headers (common: Cloudflare → nginx → app), the warning still fires.
|
||||
func TestCDNDetection_LogsWhenCDNHeaderAccompaniesProxyHeaders(t *testing.T) {
|
||||
resetCDNDetectionOnce()
|
||||
req := httptest.NewRequest("GET", "/api/observers", nil)
|
||||
req.Header.Set("X-Forwarded-For", "10.0.0.1")
|
||||
req.Header.Set("X-Real-IP", "10.0.0.1")
|
||||
req.Header.Set("CF-Connecting-IP", "1.2.3.4")
|
||||
|
||||
out, nextCalled := runWithCDNMiddleware(t, req)
|
||||
|
||||
if !nextCalled {
|
||||
t.Fatal("middleware did not call next handler")
|
||||
}
|
||||
if !strings.Contains(out, "detected request via CDN") {
|
||||
t.Errorf("expected CDN warning when CF-Connecting-IP present alongside proxy headers; got: %q", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCDNDetection_LogsOnlyOnce(t *testing.T) {
|
||||
resetCDNDetectionOnce()
|
||||
|
||||
var buf bytes.Buffer
|
||||
prev := log.Writer()
|
||||
log.SetOutput(&buf)
|
||||
defer log.SetOutput(prev)
|
||||
|
||||
nextCalled := 0
|
||||
h := cdnDetectionMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
nextCalled++
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
|
||||
for i := 0; i < 3; i++ {
|
||||
req := httptest.NewRequest("GET", "/api/observers", nil)
|
||||
req.Header.Set("CF-Ray", "abc123")
|
||||
w := httptest.NewRecorder()
|
||||
h.ServeHTTP(w, req)
|
||||
}
|
||||
|
||||
if nextCalled != 3 {
|
||||
t.Fatalf("middleware must call next on every request; got %d calls, want 3", nextCalled)
|
||||
}
|
||||
got := strings.Count(buf.String(), "detected request via CDN")
|
||||
if got != 1 {
|
||||
t.Errorf("expected CDN warning exactly once across multiple requests; got %d in output: %q", got, buf.String())
|
||||
}
|
||||
}
|
||||
|
||||
// Each genuinely CDN-specific header should trip the detector on its
|
||||
// own. X-Forwarded-For / X-Real-IP are NOT in this set — see the
|
||||
// negative test TestCDNDetection_SilentOnReverseProxyHeadersAlone.
|
||||
func TestCDNDetection_RecognizesAllCommonCDNHeaders(t *testing.T) {
|
||||
headers := []string{
|
||||
"CF-Connecting-IP",
|
||||
"CF-Ray",
|
||||
"Fastly-Client-IP",
|
||||
"True-Client-IP",
|
||||
}
|
||||
for _, h := range headers {
|
||||
t.Run(h, func(t *testing.T) {
|
||||
resetCDNDetectionOnce()
|
||||
req := httptest.NewRequest("GET", "/api/observers", nil)
|
||||
req.Header.Set(h, "1.2.3.4")
|
||||
out, nextCalled := runWithCDNMiddleware(t, req)
|
||||
if !nextCalled {
|
||||
t.Fatal("middleware did not call next handler")
|
||||
}
|
||||
if !strings.Contains(out, "detected request via CDN") {
|
||||
t.Errorf("header %s should trip CDN detection; log was: %q", h, out)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Round-1 KB finding #2: sync.Once is what keeps the log from
|
||||
// spamming — verify it holds under concurrent first-request load.
|
||||
// CI runs `go test -race`, so this also stresses the underlying
|
||||
// primitive for data races. Without -race, the assertion still
|
||||
// catches a plain bool / non-atomic implementation.
|
||||
func TestCDNDetectionMiddlewareConcurrentFirstRequestLogsOnce(t *testing.T) {
|
||||
resetCDNDetectionOnce()
|
||||
|
||||
var buf bytes.Buffer
|
||||
var bufMu sync.Mutex
|
||||
prev := log.Writer()
|
||||
// log.Printf can be called concurrently; serialize writes to buf
|
||||
// so we never race the test's own assertion read.
|
||||
log.SetOutput(writerFunc(func(p []byte) (int, error) {
|
||||
bufMu.Lock()
|
||||
defer bufMu.Unlock()
|
||||
return buf.Write(p)
|
||||
}))
|
||||
defer log.SetOutput(prev)
|
||||
|
||||
var nextCalls int64
|
||||
h := cdnDetectionMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
atomic.AddInt64(&nextCalls, 1)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
|
||||
const n = 50
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(n)
|
||||
for i := 0; i < n; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
req := httptest.NewRequest("GET", "/api/observers", nil)
|
||||
req.Header.Set("CF-Ray", "abc123-LAX")
|
||||
w := httptest.NewRecorder()
|
||||
h.ServeHTTP(w, req)
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
if got := atomic.LoadInt64(&nextCalls); got != n {
|
||||
t.Fatalf("middleware must call next on every concurrent request; got %d, want %d", got, n)
|
||||
}
|
||||
|
||||
bufMu.Lock()
|
||||
out := buf.String()
|
||||
bufMu.Unlock()
|
||||
got := strings.Count(out, "detected request via CDN")
|
||||
if got != 1 {
|
||||
t.Errorf("expected sync.Once to admit exactly ONE warning under %d concurrent first-requests; got %d. Output:\n%s", n, got, out)
|
||||
}
|
||||
}
|
||||
|
||||
// writerFunc lets a plain function with the Write signature be used
// wherever an io.Writer is expected.
type writerFunc func(p []byte) (int, error)

// Write forwards p to the underlying function and returns its results
// unchanged.
func (f writerFunc) Write(p []byte) (int, error) {
	n, err := f(p)
	return n, err
}
|
||||
|
||||
// Round-2 MAJOR finding: sync.Once only short-circuits the log.Printf,
|
||||
// not the per-request header scan. firstCDNHeader still iterates 4
|
||||
// http.Header.Get lookups on every /api request after warning fires.
|
||||
// The fix is an atomic.Bool fast-path checked BEFORE firstCDNHeader.
|
||||
// This test gates that the flag is actually set on the first CDN
|
||||
// request — without it, the middleware would have no signal to
|
||||
// short-circuit on, and the optimization would be a dead store.
|
||||
func TestCDNDetection_CdnWarnedFlagSet(t *testing.T) {
|
||||
resetCDNDetectionOnce()
|
||||
req := httptest.NewRequest("GET", "/api/x", nil)
|
||||
req.Header.Set("CF-Ray", "x")
|
||||
if _, nextCalled := runWithCDNMiddleware(t, req); !nextCalled {
|
||||
t.Fatal("middleware did not call next handler")
|
||||
}
|
||||
if !cdnWarned.Load() {
|
||||
t.Fatal("cdnWarned must be true after first CDN request (fast-path flag not set)")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,526 @@
|
||||
package main
|
||||
|
||||
// Chunked startup load + early HTTP readiness for issue #1009.
|
||||
//
|
||||
// Design:
|
||||
// * LoadChunked paginates transmissions in id-ordered chunks of
|
||||
// `chunkSize` (default 10000 via Config.DBLoadChunkSize). After the
|
||||
// first chunk is merged into the store, FirstChunkReady is closed.
|
||||
// main.go binds the HTTP listener on that signal and serves
|
||||
// partial data while remaining chunks stream in the background.
|
||||
// * loadStatusMiddleware stamps X-CoreScope-Load-Status on every
|
||||
// response: "loading; progress=<rows>" until LoadComplete()
|
||||
// reports true, then "ready". Dashboards and probes can read the
|
||||
// header without parsing JSON.
|
||||
// * OnChunkLoaded registers a per-chunk callback for progress
|
||||
// logging / tests.
|
||||
//
|
||||
// Concurrency: each chunk acquires s.mu.Lock() ONLY while merging the
|
||||
// chunk's rows into store-shared maps. SQLite reads run lock-free so
|
||||
// HTTP handlers (which take s.mu.RLock) stay responsive.
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/meshcore-analyzer/dbconfig"
|
||||
)
|
||||
|
||||
// dbLoadConfig is the server-package alias for dbconfig.LoadConfig (#1009).
|
||||
type dbLoadConfig = dbconfig.LoadConfig
|
||||
|
||||
// DBLoadChunkSize returns the configured chunk size for chunked
|
||||
// startup load (config: db.load.chunkSize), or 10000 default (#1009).
|
||||
func (c *Config) DBLoadChunkSize() int {
|
||||
return c.DB.GetLoadChunkSize()
|
||||
}
|
||||
|
||||
// chunkedLoadState holds the runtime gates for LoadChunked. It lives
|
||||
// on PacketStore via embedded fields — see store.go additions in the
|
||||
// same commit.
|
||||
|
||||
// FirstChunkReady returns a channel closed once the first chunk has
|
||||
// been merged into the store, signalling the HTTP listener can bind.
|
||||
func (s *PacketStore) FirstChunkReady() <-chan struct{} {
|
||||
s.chunkedLoadInit()
|
||||
return s.firstChunkReady
|
||||
}
|
||||
|
||||
// LoadComplete reports whether LoadChunked has finished all chunks.
|
||||
func (s *PacketStore) LoadComplete() bool {
|
||||
return s.loadComplete.Load()
|
||||
}
|
||||
|
||||
// LoadProgress reports the number of transmission rows processed by
|
||||
// the in-flight (or completed) LoadChunked call.
|
||||
func (s *PacketStore) LoadProgress() int64 {
|
||||
return s.loadProgressRows.Load()
|
||||
}
|
||||
|
||||
// OnChunkLoaded registers a callback fired once per chunk after that
|
||||
// chunk has been merged into the store. The callback receives the
|
||||
// number of transmission rows in that chunk and the running total.
|
||||
// Multiple registrations chain.
|
||||
func (s *PacketStore) OnChunkLoaded(fn func(rowsThisChunk, totalRows int)) {
|
||||
s.chunkedLoadInit()
|
||||
s.chunkCBMu.Lock()
|
||||
defer s.chunkCBMu.Unlock()
|
||||
s.chunkCallbacks = append(s.chunkCallbacks, fn)
|
||||
}
|
||||
|
||||
// chunkedLoadInit lazily initialises the readiness channel + callback
|
||||
// list under a mutex so concurrent first callers don't race.
|
||||
func (s *PacketStore) chunkedLoadInit() {
|
||||
s.chunkInitOnce.Do(func() {
|
||||
s.firstChunkReady = make(chan struct{})
|
||||
})
|
||||
}
|
||||
|
||||
func (s *PacketStore) signalFirstChunk() {
|
||||
if s.firstChunkSignaled.CompareAndSwap(false, true) {
|
||||
close(s.firstChunkReady)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *PacketStore) fireChunkCallbacks(rowsThisChunk, totalRows int) {
|
||||
s.chunkCBMu.Lock()
|
||||
cbs := append([]func(int, int){}, s.chunkCallbacks...)
|
||||
s.chunkCBMu.Unlock()
|
||||
for _, cb := range cbs {
|
||||
func() {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("[store] OnChunkLoaded callback panic: %v", r)
|
||||
}
|
||||
}()
|
||||
cb(rowsThisChunk, totalRows)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// LoadChunked streams transmissions + observations from SQLite into
|
||||
// the in-memory store in id-ordered chunks of `chunkSize` rows. Pass
|
||||
// 0 to use the default (10000).
|
||||
//
|
||||
// After the first chunk is merged, FirstChunkReady is closed and the
|
||||
// HTTP listener may bind. Remaining chunks stream while handlers run
|
||||
// against partially-populated data; loadStatusMiddleware advertises
|
||||
// loading status until LoadComplete() returns true.
|
||||
//
|
||||
// Re-entrancy: LoadChunked is NOT safe to call concurrently with
|
||||
// itself on the same PacketStore — it resets loadComplete /
|
||||
// loadProgressRows and mutates store-shared maps under s.mu. In
|
||||
// production it is invoked exactly once from main.go boot. Tests that
|
||||
// open a fresh store per test are also safe. If a future caller needs
|
||||
// repeat or concurrent loads, add a top-level mutex first.
|
||||
func (s *PacketStore) LoadChunked(chunkSize int) error {
|
||||
if chunkSize <= 0 {
|
||||
chunkSize = 10000
|
||||
}
|
||||
// Startup-ordering invariant (PR #1643 R1 munger #2). Mirror the
|
||||
// guard in Load() so the production async path also fast-fails when
|
||||
// neighbor_edges has rows but the graph is missing. See Load() for
|
||||
// the full rationale.
|
||||
if neighborEdgesTableExists(s.db.conn) && s.graph.Load() == nil {
|
||||
panic("packet store LoadChunked(): neighbor_edges table has rows but s.graph is nil — graph must be loaded before packet load (see main.go #1643 invariant)")
|
||||
}
|
||||
s.chunkedLoadInit()
|
||||
// Reset state for repeat calls in tests.
|
||||
s.loadComplete.Store(false)
|
||||
s.loadProgressRows.Store(0)
|
||||
|
||||
// On any return — error OR success — unblock listeners that gate on
|
||||
// the readiness signal so an empty/failed DB does not deadlock the
|
||||
// caller. Note: loadComplete is set on the success path only (see
|
||||
// the end of this function) so probes do NOT see ready=true after a
|
||||
// failed load.
|
||||
defer s.signalFirstChunk()
|
||||
|
||||
t0 := time.Now()
|
||||
|
||||
// Build the retention/memory filter the legacy Load() uses so
|
||||
// behavior is preserved when callers migrate from Load → LoadChunked.
|
||||
// Built against the `t2` alias used inside the chunk subquery so we
|
||||
// don't need brittle post-hoc string rewrites.
|
||||
var loadConditions []string
|
||||
hotCutoffHours := s.retentionHours
|
||||
if s.hotStartupHours > 0 {
|
||||
hotCutoffHours = s.hotStartupHours
|
||||
}
|
||||
var hotCutoffStr string
|
||||
var hotCutoffUnix int64
|
||||
if hotCutoffHours > 0 {
|
||||
hotCutoffT := time.Now().UTC().Add(-time.Duration(hotCutoffHours * float64(time.Hour)))
|
||||
hotCutoffStr = hotCutoffT.Format(time.RFC3339)
|
||||
hotCutoffUnix = hotCutoffT.Unix()
|
||||
_ = hotCutoffUnix
|
||||
// #1690: filter on the denormalized last_seen (effective recency)
|
||||
// rather than first_seen, so long-lived hashes with recent traffic
|
||||
// load on cold-start. first_seen is set once and never updated, so
|
||||
// the prior `t2.first_seen >= cutoff` query loaded only hashes
|
||||
// first-inserted within the window (0.3% of DB on prod).
|
||||
//
|
||||
// Test/legacy DBs without the column (PRAGMA-detected as
|
||||
// hasLastSeen=false) fall back to the legacy first_seen axis to
|
||||
// keep existing fixtures green. Production goes through
|
||||
// dbschema.AssertReady which fail-fasts when the column is
|
||||
// missing — so the fallback is only ever hit in tests.
|
||||
if s.db.hasLastSeen {
|
||||
loadConditions = append(loadConditions, fmt.Sprintf("t2.last_seen >= %d", hotCutoffUnix))
|
||||
} else {
|
||||
loadConditions = append(loadConditions, fmt.Sprintf("t2.first_seen >= '%s'", hotCutoffStr))
|
||||
}
|
||||
}
|
||||
|
||||
// COUNT honours the same retention/hot-startup filter the chunk
|
||||
// loop applies, so the logged "DB total" matches the rows the
|
||||
// loop will actually walk. Use a `t2` alias to share the WHERE
|
||||
// builder above. If the count fails (e.g. empty DB, locked WAL),
|
||||
// fall through with -1 — it's only used for the post-load log line.
|
||||
totalInDB := -1
|
||||
countSQL := "SELECT COUNT(*) FROM transmissions t2"
|
||||
if len(loadConditions) > 0 {
|
||||
countSQL += " WHERE " + strings.Join(loadConditions, " AND ")
|
||||
}
|
||||
if err := s.db.conn.QueryRow(countSQL).Scan(&totalInDB); err != nil {
|
||||
totalInDB = -1
|
||||
}
|
||||
|
||||
// Memory cap honoured by clamping the maximum cursor walk.
|
||||
var maxPackets int64
|
||||
if s.maxMemoryMB > 0 {
|
||||
avgBytes := int64(1000)
|
||||
if sample := estimateStoreTxBytesTypical(10); sample > avgBytes {
|
||||
avgBytes = sample
|
||||
}
|
||||
maxPackets = (int64(s.maxMemoryMB) * 1048576) / avgBytes
|
||||
if maxPackets < 1000 {
|
||||
maxPackets = 1000
|
||||
}
|
||||
}
|
||||
|
||||
chunkIdx := 0
|
||||
totalLoaded := 0
|
||||
// Start the id cursor BELOW the minimum possible row id so the
|
||||
// first chunk's `t2.id > cursorID` predicate includes id=0. The
|
||||
// e2e fixture seed for issue #1486 inserts the grouped-packet row
|
||||
// with id=0 (so it sorts LAST in the default packets view via
|
||||
// `ORDER BY id DESC` / oldest first_seen). Seeding the cursor at
|
||||
// 0 silently excluded that row, leaving the page with no
|
||||
// tr[data-hash] and timing out the playwright wait. Legacy Load()
|
||||
// had no id cursor and loaded id=0 unconditionally — we restore
|
||||
// that semantic by starting one below SQLite's minimum rowid (-1).
|
||||
var cursorID int64 = -1
|
||||
|
||||
// Relay-hop fallback inputs, fetched ONCE before the chunk-query loop.
|
||||
// getCachedNodesAndPM issues its own DB query, so calling it while a
|
||||
// chunk cursor is open would deadlock on a single-connection SQLite
|
||||
// pool. resolved_path is never persisted post-#1287, so scanAndMergeChunk
|
||||
// re-resolves relay hops from path_json using these snapshots.
|
||||
// PR #1643 R1 munger #1: cold load uses unique_prefix-only gate, so
|
||||
// the neighbor graph is no longer consulted here (affinity-tier
|
||||
// resolution against ≤168h-old observations would silently mis-attribute).
|
||||
s.mu.RLock()
|
||||
_, relayPM := s.getCachedNodesAndPM()
|
||||
s.mu.RUnlock()
|
||||
var coldLoadAmbiguousHopsSkipped int
|
||||
|
||||
for {
|
||||
conds := append([]string{}, loadConditions...)
|
||||
conds = append(conds, fmt.Sprintf("t2.id > %d", cursorID))
|
||||
whereClause := "WHERE " + strings.Join(conds, " AND ")
|
||||
|
||||
rpCol := ""
|
||||
if s.db.hasResolvedPath {
|
||||
rpCol = ", o.resolved_path"
|
||||
}
|
||||
obsRawHexCol := ""
|
||||
if s.db.hasObsRawHex {
|
||||
obsRawHexCol = ", o.raw_hex"
|
||||
}
|
||||
|
||||
var chunkSQL string
|
||||
if s.db.isV3 {
|
||||
chunkSQL = `SELECT t.id, t.raw_hex, t.hash, t.first_seen, t.route_type,
|
||||
t.payload_type, t.payload_version, t.decoded_json,
|
||||
o.id, obs.id, obs.name, COALESCE(obs.iata, ''), o.direction,
|
||||
o.snr, o.rssi, o.score, o.path_json, strftime('%Y-%m-%dT%H:%M:%fZ', o.timestamp, 'unixepoch')` + obsRawHexCol + rpCol + `
|
||||
FROM (SELECT * FROM transmissions t2 ` + whereClause + ` ORDER BY t2.id ASC LIMIT ` + fmt.Sprintf("%d", chunkSize) + `) AS t
|
||||
LEFT JOIN observations o ON o.transmission_id = t.id
|
||||
LEFT JOIN observers obs ON obs.rowid = o.observer_idx
|
||||
ORDER BY t.id ASC, o.timestamp DESC`
|
||||
} else {
|
||||
chunkSQL = `SELECT t.id, t.raw_hex, t.hash, t.first_seen, t.route_type,
|
||||
t.payload_type, t.payload_version, t.decoded_json,
|
||||
o.id, o.observer_id, o.observer_name, COALESCE(obs.iata, ''), o.direction,
|
||||
o.snr, o.rssi, o.score, o.path_json, o.timestamp` + obsRawHexCol + rpCol + `
|
||||
FROM (SELECT * FROM transmissions t2 ` + whereClause + ` ORDER BY t2.id ASC LIMIT ` + fmt.Sprintf("%d", chunkSize) + `) AS t
|
||||
LEFT JOIN observations o ON o.transmission_id = t.id
|
||||
LEFT JOIN observers obs ON obs.id = o.observer_id
|
||||
ORDER BY t.id ASC, o.timestamp DESC`
|
||||
}
|
||||
|
||||
rows, err := s.db.conn.Query(chunkSQL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("chunk %d: query: %w", chunkIdx, err)
|
||||
}
|
||||
|
||||
chunkTxCount, lastID, err := s.scanAndMergeChunk(rows, relayPM, &coldLoadAmbiguousHopsSkipped)
|
||||
rows.Close()
|
||||
if err != nil {
|
||||
return fmt.Errorf("chunk %d: scan: %w", chunkIdx, err)
|
||||
}
|
||||
|
||||
if chunkTxCount == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
cursorID = lastID
|
||||
totalLoaded += chunkTxCount
|
||||
chunkIdx++
|
||||
s.loadProgressRows.Store(int64(totalLoaded))
|
||||
s.signalFirstChunk()
|
||||
s.fireChunkCallbacks(chunkTxCount, totalLoaded)
|
||||
|
||||
if maxPackets > 0 && int64(totalLoaded) >= maxPackets {
|
||||
break
|
||||
}
|
||||
if chunkTxCount < chunkSize {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Post-load: pick best observation, build indexes — same shape as
|
||||
// legacy Load().
|
||||
s.mu.Lock()
|
||||
for _, tx := range s.packets {
|
||||
pickBestObservation(tx)
|
||||
s.indexByNode(tx)
|
||||
}
|
||||
// Restore the "s.packets sorted oldest-first by FirstSeen" invariant
|
||||
// that legacy Load() got for free from "ORDER BY t.first_seen ASC".
|
||||
// LoadChunked walks chunks in id-ASC order so the slice ends up
|
||||
// id-ordered, which only equals first_seen-ordered when ids and
|
||||
// timestamps are correlated. After tools/freshen-fixture.sh (or any
|
||||
// real-world out-of-order ingest) they're not, leaving
|
||||
// s.packets[0].FirstSeen pointing at the newest row — which then
|
||||
// poisons oldestLoaded below and routes legitimate in-memory queries
|
||||
// to the SQL fallback. GetTimestamps (store.go) and QueryPackets
|
||||
// both rely on this invariant. See PR #1596 / mobile e2e regression.
|
||||
sort.SliceStable(s.packets, func(i, j int) bool {
|
||||
return s.packets[i].FirstSeen < s.packets[j].FirstSeen
|
||||
})
|
||||
s.buildSubpathIndex()
|
||||
s.buildPathHopIndex()
|
||||
s.buildDistanceIndex()
|
||||
if s.hotStartupHours > 0 {
|
||||
s.oldestLoaded = hotCutoffStr
|
||||
} else if len(s.packets) > 0 {
|
||||
s.oldestLoaded = s.packets[0].FirstSeen
|
||||
}
|
||||
s.loaded = true
|
||||
s.mu.Unlock()
|
||||
|
||||
// #1009 / PR #1596: flip the subpath + pathHop ready flags now that
|
||||
// the chunk loader has built both indexes synchronously above.
|
||||
// Without this, WaitIndexesReady (used by
|
||||
// StartRepeaterEnrichmentRecomputer at boot) blocks for up to
|
||||
// repeaterEnrichmentPrewarmWait (60s), delaying HTTP listener bind
|
||||
// past CI's 30s /api/healthz deadline.
|
||||
s.markIndexesReadySync()
|
||||
|
||||
elapsed := time.Since(t0)
|
||||
log.Printf("[store] LoadChunked: %d transmissions (%d observations) across %d chunk(s) in %v (chunkSize=%d, DB total=%d)",
|
||||
totalLoaded, s.totalObs, chunkIdx, elapsed, chunkSize, totalInDB)
|
||||
if coldLoadAmbiguousHopsSkipped > 0 {
|
||||
log.Printf("[store] LoadChunked: skipped %d ambiguous-prefix relay hops (unique_prefix gate, PR #1643 R1)",
|
||||
coldLoadAmbiguousHopsSkipped)
|
||||
}
|
||||
s.loadMultibyteCapFromDB()
|
||||
// Mark complete on the success path only — see the function-level
|
||||
// defer above for why this is NOT in a deferred call. Probes that
|
||||
// read LoadComplete()==true after a failed load would otherwise
|
||||
// see ready=true for a half-loaded store.
|
||||
s.loadComplete.Store(true)
|
||||
return nil
|
||||
}
|
||||
|
||||
// scanAndMergeChunk consumes one chunk's rows under s.mu.Lock and
|
||||
// returns the number of distinct transmissions seen + the max
|
||||
// transmission id (cursor for the next chunk).
|
||||
func (s *PacketStore) scanAndMergeChunk(rows *sql.Rows, relayPM *prefixMap, coldLoadAmbiguousHopsSkipped *int) (int, int64, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
hopsSeen := make(map[string]bool)
|
||||
seenTxIDs := make(map[int]bool)
|
||||
var maxID int64
|
||||
|
||||
for rows.Next() {
|
||||
var txID int
|
||||
var rawHex, hash, firstSeen, decodedJSON sql.NullString
|
||||
var routeType, payloadType, payloadVersion sql.NullInt64
|
||||
var obsID sql.NullInt64
|
||||
var observerID, observerName, observerIATA, direction, pathJSON, obsTimestamp sql.NullString
|
||||
var snr, rssi sql.NullFloat64
|
||||
var score sql.NullInt64
|
||||
var obsRawHex sql.NullString
|
||||
var resolvedPathStr sql.NullString
|
||||
|
||||
scanArgs := []interface{}{&txID, &rawHex, &hash, &firstSeen, &routeType, &payloadType,
|
||||
&payloadVersion, &decodedJSON,
|
||||
&obsID, &observerID, &observerName, &observerIATA, &direction,
|
||||
&snr, &rssi, &score, &pathJSON, &obsTimestamp}
|
||||
if s.db.hasObsRawHex {
|
||||
scanArgs = append(scanArgs, &obsRawHex)
|
||||
}
|
||||
if s.db.hasResolvedPath {
|
||||
scanArgs = append(scanArgs, &resolvedPathStr)
|
||||
}
|
||||
if err := rows.Scan(scanArgs...); err != nil {
|
||||
log.Printf("[store] LoadChunked scan error: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if int64(txID) > maxID {
|
||||
maxID = int64(txID)
|
||||
}
|
||||
seenTxIDs[txID] = true
|
||||
|
||||
hashStr := nullStrVal(hash)
|
||||
tx := s.byHash[hashStr]
|
||||
if tx == nil {
|
||||
tx = &StoreTx{
|
||||
ID: txID,
|
||||
RawHex: nullStrVal(rawHex),
|
||||
Hash: hashStr,
|
||||
FirstSeen: nullStrVal(firstSeen),
|
||||
LatestSeen: nullStrVal(firstSeen),
|
||||
RouteType: nullIntPtr(routeType),
|
||||
PayloadType: nullIntPtr(payloadType),
|
||||
DecodedJSON: nullStrVal(decodedJSON),
|
||||
obsKeys: make(map[string]bool),
|
||||
observerSet: make(map[string]bool),
|
||||
}
|
||||
s.byHash[hashStr] = tx
|
||||
s.packets = append(s.packets, tx)
|
||||
s.byTxID[txID] = tx
|
||||
if txID > s.maxTxID {
|
||||
s.maxTxID = txID
|
||||
}
|
||||
s.indexByNode(tx)
|
||||
if tx.PayloadType != nil {
|
||||
pt := *tx.PayloadType
|
||||
s.byPayloadType[pt] = append(s.byPayloadType[pt], tx)
|
||||
}
|
||||
s.trackAdvertPubkey(tx)
|
||||
s.trackedBytes += estimateStoreTxBytes(tx)
|
||||
}
|
||||
|
||||
if obsID.Valid {
|
||||
oid := int(obsID.Int64)
|
||||
obsIDStr := nullStrVal(observerID)
|
||||
obsPJ := nullStrVal(pathJSON)
|
||||
|
||||
dk := obsIDStr + "|" + obsPJ
|
||||
if tx.obsKeys[dk] {
|
||||
continue
|
||||
}
|
||||
|
||||
obs := &StoreObs{
|
||||
ID: oid,
|
||||
TransmissionID: txID,
|
||||
ObserverID: obsIDStr,
|
||||
ObserverName: nullStrVal(observerName),
|
||||
ObserverIATA: nullStrVal(observerIATA),
|
||||
Direction: nullStrVal(direction),
|
||||
SNR: nullFloatPtr(snr),
|
||||
RSSI: nullFloatPtr(rssi),
|
||||
Score: nullIntPtr(score),
|
||||
PathJSON: obsPJ,
|
||||
RawHex: nullStrVal(obsRawHex),
|
||||
Timestamp: normalizeTimestamp(nullStrVal(obsTimestamp)),
|
||||
}
|
||||
|
||||
rpStr := nullStrVal(resolvedPathStr)
|
||||
if rpStr != "" {
|
||||
rp := unmarshalResolvedPath(rpStr)
|
||||
pks := extractResolvedPubkeys(rp)
|
||||
s.indexResolvedPathHops(tx, pks, hopsSeen)
|
||||
} else if relayPM != nil && obsPJ != "" && obsPJ != "[]" {
|
||||
// resolved_path is NULL on live (since #1287 relay data is
|
||||
// persisted as neighbor_edges, not per-observation). Re-resolve
|
||||
// relay-hop attribution from path_json so relay nodes keep their
|
||||
// analytics history across a restart instead of rebuilding only
|
||||
// from post-restart live traffic. relayPM is passed in from
|
||||
// LoadChunked (fetched before any chunk cursor opened).
|
||||
// byNode ONLY — see the Load() counterpart for why the
|
||||
// resolved_path/path-hop indexes must NOT be populated here.
|
||||
// PR #1643 R1 munger #1: unique_prefix-only gate.
|
||||
rp := resolvePathForObsColdLoad(obsPJ, obsIDStr, tx, relayPM, coldLoadAmbiguousHopsSkipped)
|
||||
for _, pk := range extractResolvedPubkeys(rp) {
|
||||
s.addToByNode(tx, pk)
|
||||
}
|
||||
}
|
||||
|
||||
tx.Observations = append(tx.Observations, obs)
|
||||
tx.obsKeys[dk] = true
|
||||
if obs.ObserverID != "" && !tx.observerSet[obs.ObserverID] {
|
||||
tx.observerSet[obs.ObserverID] = true
|
||||
tx.UniqueObserverCount++
|
||||
}
|
||||
tx.ObservationCount++
|
||||
if obs.Timestamp > tx.LatestSeen {
|
||||
tx.LatestSeen = obs.Timestamp
|
||||
}
|
||||
|
||||
s.byObsID[oid] = obs
|
||||
if oid > s.maxObsID {
|
||||
s.maxObsID = oid
|
||||
}
|
||||
if obsIDStr != "" {
|
||||
s.byObserver[obsIDStr] = append(s.byObserver[obsIDStr], obs)
|
||||
}
|
||||
s.totalObs++
|
||||
s.trackedBytes += estimateStoreObsBytes(obs)
|
||||
}
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return len(seenTxIDs), maxID, err
|
||||
}
|
||||
return len(seenTxIDs), maxID, nil
|
||||
}
|
||||
|
||||
// loadStatusMiddleware sets X-CoreScope-Load-Status on every response.
|
||||
// While LoadChunked is in flight the header reports
|
||||
// "loading; progress=<rows>"; after completion it reports "ready".
|
||||
// The header is set BEFORE calling the next handler so probes can
|
||||
// observe it on any response (including streaming bodies).
|
||||
func loadStatusMiddleware(s *PacketStore, next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if s != nil && s.LoadComplete() {
|
||||
w.Header().Set("X-CoreScope-Load-Status", "ready")
|
||||
} else if s != nil {
|
||||
w.Header().Set("X-CoreScope-Load-Status",
|
||||
fmt.Sprintf("loading; progress=%d", s.LoadProgress()))
|
||||
} else {
|
||||
w.Header().Set("X-CoreScope-Load-Status", "loading")
|
||||
}
|
||||
next.ServeHTTP(w, r)
|
||||
})
|
||||
}
|
||||
|
||||
// --- runtime state stitched into PacketStore via store_chunked.go ---
|
||||
|
||||
// Blank-identifier references to the sync and sync/atomic packages.
// They declare nothing usable; the PacketStore fields used above are
// not defined in this file. The references keep both imports in use
// here.
var (
	_ = sync.Once{}
	_ atomic.Bool
)
|
||||
@@ -0,0 +1,63 @@
|
||||
package main
|
||||
|
||||
// Issue #1009 follow-up tests for PR #1596:
|
||||
//
|
||||
// (A) LoadChunked must flip subpath + pathHop index ready flags
|
||||
// after building those indexes. Otherwise WaitIndexesReady (used
|
||||
// by StartRepeaterEnrichmentRecomputer at boot) blocks the
|
||||
// caller for up to repeaterEnrichmentPrewarmWait (60s), which is
|
||||
// why CI's "Start Go server" step times out before /api/healthz
|
||||
// can answer within its 30s deadline.
|
||||
//
|
||||
// (B) LoadChunked must NOT report LoadComplete()==true when it
|
||||
// returns an error. Today a defer unconditionally calls
|
||||
// s.loadComplete.Store(true), so a failed load appears "ready"
|
||||
// to probes and the load-status middleware.
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// (A) Indexes must be marked ready by LoadChunked.
|
||||
func TestLoadChunked_MarksIndexesReady(t *testing.T) {
|
||||
store := openChunkedTestStore(t, 100)
|
||||
defer store.db.conn.Close()
|
||||
|
||||
if store.SubpathIndexReady() || store.PathHopIndexReady() {
|
||||
t.Fatal("indexes must start NOT ready")
|
||||
}
|
||||
|
||||
if err := store.LoadChunked(50); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
|
||||
if !store.SubpathIndexReady() {
|
||||
t.Fatal("SubpathIndexReady() must be true after LoadChunked builds the index")
|
||||
}
|
||||
if !store.PathHopIndexReady() {
|
||||
t.Fatal("PathHopIndexReady() must be true after LoadChunked builds the index")
|
||||
}
|
||||
}
|
||||
|
||||
// (B) LoadChunked errors must not flip LoadComplete=true.
|
||||
func TestLoadChunked_ErrorDoesNotMarkComplete(t *testing.T) {
|
||||
store := openChunkedTestStore(t, 100)
|
||||
|
||||
// Close the underlying DB so the very first chunk query fails.
|
||||
if err := store.db.conn.Close(); err != nil {
|
||||
t.Fatalf("close DB: %v", err)
|
||||
}
|
||||
|
||||
err := store.LoadChunked(50)
|
||||
if err == nil {
|
||||
t.Fatal("LoadChunked must return an error when the DB query fails")
|
||||
}
|
||||
if !errors.Is(err, err) { // satisfy linters; the assertion below is what matters
|
||||
t.Fatalf("unexpected error shape: %v", err)
|
||||
}
|
||||
|
||||
if store.LoadComplete() {
|
||||
t.Fatal("LoadComplete() must remain false after LoadChunked returns an error")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
package main
|
||||
|
||||
// Regression for PR #1596 / issue #1486 e2e: LoadChunked uses
|
||||
// `cursorID = 0` with a `t2.id > cursorID` predicate, which silently
|
||||
// excludes any transmission with id=0. The e2e seed for #1486 inserts
|
||||
// the grouped-packet row with id=0 (so it sorts LAST in the default
|
||||
// packets view), and the page deep-links to /packets?hash=<seed>.
|
||||
// With the chunked loader skipping id=0, the in-memory store never
|
||||
// learns about the row; QueryGroupedPackets returns 0; the page
|
||||
// renders no `tr[data-hash]` and the e2e times out at 12s.
|
||||
//
|
||||
// Legacy Load() walked all transmissions unconditionally (no id
|
||||
// cursor) and therefore included id=0. Restoring that semantic — by
|
||||
// using a non-existent sentinel (-1) on the first iteration, or by
|
||||
// switching the predicate to `>=` for the initial pass — fixes the
|
||||
// regression.
|
||||
//
|
||||
// This test inserts a transmission with id=0 plus a handful of
|
||||
// id>=1 transmissions and asserts that LoadChunked loads the id=0
|
||||
// row into s.byHash.
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// createTestDBWithIDZero creates a SQLite database at dbPath with the
// transmissions/observations/observers/nodes/schema_version tables and
// seeds it with:
//   - one transmission + observation pair whose id is 0 (hash
//     "fae0c9e6d357a814", first_seen one minute ago), and
//   - extraTx further pairs with ids 1..extraTx, each one minute older
//     than the previous.
//
// Every setup step is checked: a failed open, DDL statement, prepare or
// insert aborts the calling test via tb.Fatal/tb.Fatalf instead of leaving
// a half-seeded database behind for the assertions to trip over.
func createTestDBWithIDZero(tb testing.TB, dbPath string, extraTx int) {
	tb.Helper()
	conn, err := sql.Open("sqlite", dbPath+"?_journal_mode=WAL")
	if err != nil {
		tb.Fatal(err)
	}
	defer conn.Close()

	stmts := []string{
		`CREATE TABLE IF NOT EXISTS transmissions (
			id INTEGER PRIMARY KEY,
			raw_hex TEXT, hash TEXT, first_seen TEXT,
			route_type INTEGER, payload_type INTEGER,
			payload_version INTEGER, decoded_json TEXT
		)`,
		`CREATE TABLE IF NOT EXISTS observations (
			id INTEGER PRIMARY KEY,
			transmission_id INTEGER, observer_id TEXT, observer_name TEXT,
			direction TEXT, snr REAL, rssi REAL, score INTEGER,
			path_json TEXT, timestamp TEXT, raw_hex TEXT
		)`,
		`CREATE TABLE IF NOT EXISTS observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`,
		`CREATE TABLE IF NOT EXISTS nodes (
			pubkey TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL,
			last_seen TEXT, first_seen TEXT, frequency REAL
		)`,
		`CREATE TABLE IF NOT EXISTS schema_version (version INTEGER)`,
		`INSERT INTO schema_version (version) VALUES (1)`,
		`CREATE INDEX IF NOT EXISTS idx_tx_first_seen ON transmissions(first_seen)`,
	}
	for _, s := range stmts {
		if _, err := conn.Exec(s); err != nil {
			tb.Fatalf("setup exec: %v\nSQL: %s", err, s)
		}
	}

	// A failed Prepare returns a nil *sql.Stmt; bail out before the
	// deferred Close or any Exec can dereference it.
	txStmt, err := conn.Prepare("INSERT INTO transmissions (id, raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)")
	if err != nil {
		tb.Fatalf("prepare transmissions insert: %v", err)
	}
	defer txStmt.Close()
	obsStmt, err := conn.Prepare("INSERT INTO observations (id, transmission_id, observer_id, observer_name, direction, snr, rssi, score, path_json, timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
	if err != nil {
		tb.Fatalf("prepare observations insert: %v", err)
	}
	defer obsStmt.Close()

	// mustExec runs one prepared insert and fails the test on error.
	mustExec := func(stmt *sql.Stmt, what string, args ...interface{}) {
		tb.Helper()
		if _, err := stmt.Exec(args...); err != nil {
			tb.Fatalf("insert %s: %v", what, err)
		}
	}

	now := time.Now().UTC().Truncate(time.Second)

	// id=0: the #1486-style seed row, within retention window.
	seedSeen := now.Add(-1 * time.Minute)
	mustExec(txStmt, "transmission id=0",
		0, "1500", "fae0c9e6d357a814", seedSeen.Format(time.RFC3339), 1, 5, 0, `{"type":"CHAN"}`)
	mustExec(obsStmt, "observation id=0",
		0, 0, "obs1", "Obs1", "rx", 5.0, -95.0, 0, `["AA"]`, seedSeen.Unix())

	for i := 1; i <= extraTx; i++ {
		seen := now.Add(-time.Duration(i+1) * time.Minute)
		ts := seen.Format(time.RFC3339)
		unixTs := seen.Unix()
		hash := fmt.Sprintf("h%04d", i)
		mustExec(txStmt, fmt.Sprintf("transmission id=%d", i),
			i, "aabb", hash, ts, 0, 4, 1, fmt.Sprintf(`{"pubKey":"pk%04d"}`, i))
		mustExec(obsStmt, fmt.Sprintf("observation id=%d", i),
			i, i, "obs1", "Obs1", "rx", -10.0, -80.0, 5, `["aa","bb"]`, unixTs)
	}
}
|
||||
|
||||
// TestLoadChunked_IncludesIDZero: LoadChunked must load transmissions
|
||||
// with id=0. The legacy Load() (since-replaced by LoadChunked) walked
|
||||
// transmissions unconditionally; LoadChunked uses an id-cursor that
|
||||
// starts at 0 with a strict `t2.id > cursorID` predicate, so id=0
|
||||
// rows are silently dropped. This breaks the #1486 e2e fixture seed
|
||||
// which uses id=0 to sort the grouped row last in the default view.
|
||||
func TestLoadChunked_IncludesIDZero(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "idzero.db")
|
||||
createTestDBWithIDZero(t, dbPath, 10)
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenDB: %v", err)
|
||||
}
|
||||
cfg := &PacketStoreConfig{}
|
||||
store := NewPacketStore(db, cfg)
|
||||
defer store.db.conn.Close()
|
||||
|
||||
if err := store.LoadChunked(5); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
|
||||
if _, ok := store.byHash["fae0c9e6d357a814"]; !ok {
|
||||
t.Fatalf("LoadChunked dropped the id=0 transmission: "+
|
||||
"byHash[fae0c9e6d357a814] missing; loaded %d packets total "+
|
||||
"(id-cursor starts at 0 with strict `t2.id > cursorID`, "+
|
||||
"so id=0 is excluded — this is the #1486 e2e regression)",
|
||||
len(store.packets))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,154 @@
|
||||
package main
|
||||
|
||||
// Regression for PR #1596 (issue #1009) chunked load: when transmission
|
||||
// ids are anti-correlated with first_seen (e.g. id=1 has the NEWEST
|
||||
// timestamp), LoadChunked walks id-ASC and the post-load
|
||||
// `s.oldestLoaded = s.packets[0].FirstSeen` line set oldestLoaded to
|
||||
// the NEWEST first_seen. QueryPackets then mis-routed any
|
||||
// `since>=oldestLoaded` query to the SQL fallback, hiding fresh
|
||||
// in-memory rows. This shows up in real life on the e2e fixture after
|
||||
// tools/freshen-fixture.sh shifts timestamps so id=1 (originally
|
||||
// loaded first) carries the most recent first_seen.
|
||||
//
|
||||
// The mobile e2e test test-observer-iata-1188-e2e.js fails as a
|
||||
// result: with the default 15-minute time window, /api/packets returns
|
||||
// 0 rows and the mobile DOM has no `tr[data-hash]` to tap.
|
||||
//
|
||||
// This test asserts the in-memory invariant: after LoadChunked,
|
||||
// oldestLoaded must equal the actual oldest FirstSeen across loaded
|
||||
// transmissions, not the FirstSeen of the first row in s.packets.
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// createTestDBReverseTime builds numTx transmissions whose ids run
|
||||
// 1..numTx ASC while first_seen runs newest..oldest (id=1 = newest).
|
||||
// This mirrors the freshen-fixture-shifted e2e DB exactly.
|
||||
func createTestDBReverseTime(tb testing.TB, dbPath string, numTx int) {
	tb.Helper()
	conn, err := sql.Open("sqlite", dbPath+"?_journal_mode=WAL")
	if err != nil {
		tb.Fatal(err)
	}
	defer conn.Close()

	stmts := []string{
		`CREATE TABLE IF NOT EXISTS transmissions (
			id INTEGER PRIMARY KEY,
			raw_hex TEXT, hash TEXT, first_seen TEXT,
			route_type INTEGER, payload_type INTEGER,
			payload_version INTEGER, decoded_json TEXT
		)`,
		`CREATE TABLE IF NOT EXISTS observations (
			id INTEGER PRIMARY KEY,
			transmission_id INTEGER, observer_id TEXT, observer_name TEXT,
			direction TEXT, snr REAL, rssi REAL, score INTEGER,
			path_json TEXT, timestamp TEXT, raw_hex TEXT
		)`,
		`CREATE TABLE IF NOT EXISTS observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`,
		`CREATE TABLE IF NOT EXISTS nodes (
			pubkey TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL,
			last_seen TEXT, first_seen TEXT, frequency REAL
		)`,
		`CREATE TABLE IF NOT EXISTS schema_version (version INTEGER)`,
		`INSERT INTO schema_version (version) VALUES (1)`,
		`CREATE INDEX IF NOT EXISTS idx_tx_first_seen ON transmissions(first_seen)`,
	}
	for _, s := range stmts {
		if _, err := conn.Exec(s); err != nil {
			tb.Fatalf("setup exec: %v\nSQL: %s", err, s)
		}
	}

	// A failed Prepare returns a nil *sql.Stmt; stop here rather than let
	// the deferred Close or a later Exec dereference it.
	txStmt, err := conn.Prepare("INSERT INTO transmissions (id, raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)")
	if err != nil {
		tb.Fatalf("prepare transmissions insert: %v", err)
	}
	defer txStmt.Close()
	obsStmt, err := conn.Prepare("INSERT INTO observations (id, transmission_id, observer_id, observer_name, direction, snr, rssi, score, path_json, timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
	if err != nil {
		tb.Fatalf("prepare observations insert: %v", err)
	}
	defer obsStmt.Close()

	// id=1 is the NEWEST (now); id=numTx is the OLDEST (numTx-1 minutes ago).
	now := time.Now().UTC().Truncate(time.Second)
	for i := 1; i <= numTx; i++ {
		seen := now.Add(-time.Duration(i-1) * time.Minute)
		ts := seen.Format(time.RFC3339)
		unixTs := seen.Unix()
		hash := fmt.Sprintf("h%04d", i)
		// Insert failures are fatal: a silently missing row would surface
		// later as a misleading ordering/oldest-row assertion failure.
		if _, err := txStmt.Exec(i, "aabb", hash, ts, 0, 4, 1, fmt.Sprintf(`{"pubKey":"pk%04d"}`, i)); err != nil {
			tb.Fatalf("insert transmission id=%d: %v", i, err)
		}
		if _, err := obsStmt.Exec(i, i, "obs1", "Obs1", "RX", -10.0, -80.0, 5, `["aa","bb"]`, unixTs); err != nil {
			tb.Fatalf("insert observation id=%d: %v", i, err)
		}
	}
}
|
||||
|
||||
func openReverseTimeStore(t *testing.T, numTx int) *PacketStore {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "rev.db")
|
||||
createTestDBReverseTime(t, dbPath, numTx)
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenDB: %v", err)
|
||||
}
|
||||
cfg := &PacketStoreConfig{}
|
||||
return NewPacketStore(db, cfg)
|
||||
}
|
||||
|
||||
// TestLoadChunked_OldestLoadedIsActualOldest: when LoadChunked walks
|
||||
// transmissions in id-ASC order but timestamps are anti-correlated
|
||||
// with id (PR #1596 regression scenario), oldestLoaded MUST be the
|
||||
// minimum FirstSeen across loaded packets, not the first row's
|
||||
// FirstSeen. Otherwise QueryPackets routes "since=15min ago" to SQL
|
||||
// fallback, hiding fresh rows.
|
||||
func TestLoadChunked_OldestLoadedIsActualOldest(t *testing.T) {
|
||||
store := openReverseTimeStore(t, 50)
|
||||
defer store.db.conn.Close()
|
||||
|
||||
if err := store.LoadChunked(20); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
|
||||
// Compute the actual oldest first_seen across what got loaded.
|
||||
if len(store.packets) == 0 {
|
||||
t.Fatal("no packets loaded")
|
||||
}
|
||||
actualOldest := store.packets[0].FirstSeen
|
||||
for _, p := range store.packets {
|
||||
if p.FirstSeen < actualOldest {
|
||||
actualOldest = p.FirstSeen
|
||||
}
|
||||
}
|
||||
|
||||
if store.oldestLoaded != actualOldest {
|
||||
t.Fatalf("oldestLoaded=%q must equal actual MIN(FirstSeen)=%q "+
|
||||
"(id-ordered chunk walk with anti-correlated timestamps "+
|
||||
"left oldestLoaded pointing at the newest row, which makes "+
|
||||
"QueryPackets mis-route since-windowed queries to SQL fallback "+
|
||||
"and the mobile e2e test renders 0 rows)",
|
||||
store.oldestLoaded, actualOldest)
|
||||
}
|
||||
}
|
||||
|
||||
// TestLoadChunked_PacketsSortedByFirstSeenASC: QueryPackets and
|
||||
// GetTimestamps both assume s.packets is "sorted oldest-first" (see
|
||||
// store.go:2125 comment on GetTimestamps). LoadChunked walks rows
|
||||
// id-ASC which only equals first_seen-ASC when ids and timestamps
|
||||
// are correlated — not true after fixture freshen, not true after
|
||||
// any out-of-order ingest. Assert the invariant directly.
|
||||
func TestLoadChunked_PacketsSortedByFirstSeenASC(t *testing.T) {
|
||||
store := openReverseTimeStore(t, 25)
|
||||
defer store.db.conn.Close()
|
||||
|
||||
if err := store.LoadChunked(10); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
for i := 1; i < len(store.packets); i++ {
|
||||
if store.packets[i-1].FirstSeen > store.packets[i].FirstSeen {
|
||||
t.Fatalf("s.packets must be sorted by FirstSeen ASC; "+
|
||||
"packets[%d].FirstSeen=%q > packets[%d].FirstSeen=%q",
|
||||
i-1, store.packets[i-1].FirstSeen,
|
||||
i, store.packets[i].FirstSeen)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,150 @@
|
||||
package main
|
||||
|
||||
// Issue #1009: chunked Load with early HTTP readiness.
|
||||
//
|
||||
// These tests gate three behaviors:
|
||||
// (a) FirstChunkReady() unblocks BEFORE LoadChunked returns, so the
|
||||
// HTTP listener can bind after the first chunk completes while
|
||||
// remaining rows continue loading in the background.
|
||||
// (b) loadStatusMiddleware stamps an X-CoreScope-Load-Status header
|
||||
// with "loading" + progress while a load is in flight, flipping
|
||||
// to "ready" once LoadComplete() reports true.
|
||||
// (c) LoadChunked honors the configured chunkSize: the per-chunk
|
||||
// progress callback fires once per chunk, so a 2500-row DB with
|
||||
// chunkSize=1000 must yield 3 callbacks (1000 + 1000 + 500).
|
||||
//
|
||||
// Each subtest fails on an assertion (not a build error) when the
|
||||
// production code is absent — that is the red-commit contract.
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func openChunkedTestStore(t *testing.T, numTx int) *PacketStore {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "chunked.db")
|
||||
createTestDBAt(t, dbPath, numTx)
|
||||
t.Cleanup(func() { os.RemoveAll(dir) })
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenDB: %v", err)
|
||||
}
|
||||
cfg := &PacketStoreConfig{}
|
||||
return NewPacketStore(db, cfg)
|
||||
}
|
||||
|
||||
// (a) FirstChunkReady fires before LoadChunked returns.
|
||||
func TestLoadChunked_FirstChunkReadyBeforeComplete(t *testing.T) {
|
||||
store := openChunkedTestStore(t, 2500)
|
||||
defer store.db.conn.Close()
|
||||
|
||||
doneCh := make(chan error, 1)
|
||||
go func() { doneCh <- store.LoadChunked(500) }()
|
||||
|
||||
select {
|
||||
case <-store.FirstChunkReady():
|
||||
// Good: first chunk signaled. Load may or may not have completed
|
||||
// for tiny test DBs, but the gate must have fired without
|
||||
// requiring the full load.
|
||||
case err := <-doneCh:
|
||||
// If load completed before we could observe the signal, the
|
||||
// signal still must be closed.
|
||||
if err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
select {
|
||||
case <-store.FirstChunkReady():
|
||||
default:
|
||||
t.Fatal("FirstChunkReady channel must be closed after LoadChunked completes")
|
||||
}
|
||||
case <-time.After(10 * time.Second):
|
||||
t.Fatal("FirstChunkReady did not fire within 10s — listener would never bind")
|
||||
}
|
||||
|
||||
// Drain background completion.
|
||||
select {
|
||||
case err := <-doneCh:
|
||||
if err != nil {
|
||||
t.Fatalf("LoadChunked returned error: %v", err)
|
||||
}
|
||||
case <-time.After(30 * time.Second):
|
||||
t.Fatal("LoadChunked never returned")
|
||||
}
|
||||
|
||||
if !store.LoadComplete() {
|
||||
t.Fatal("LoadComplete() must report true after LoadChunked returns")
|
||||
}
|
||||
}
|
||||
|
||||
// (b) Middleware stamps X-CoreScope-Load-Status correctly across the
|
||||
// loading→ready transition.
|
||||
func TestLoadStatusMiddleware_HeaderTransition(t *testing.T) {
|
||||
store := openChunkedTestStore(t, 100)
|
||||
defer store.db.conn.Close()
|
||||
|
||||
handler := loadStatusMiddleware(store, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
|
||||
// Pre-load: header must report "loading".
|
||||
req := httptest.NewRequest("GET", "/api/healthz", nil)
|
||||
w := httptest.NewRecorder()
|
||||
handler.ServeHTTP(w, req)
|
||||
if got := w.Header().Get("X-CoreScope-Load-Status"); got == "" || got == "ready" {
|
||||
t.Fatalf("expected loading status header before Load, got %q", got)
|
||||
}
|
||||
|
||||
if err := store.LoadChunked(50); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
|
||||
// Post-load: header must report "ready".
|
||||
req2 := httptest.NewRequest("GET", "/api/healthz", nil)
|
||||
w2 := httptest.NewRecorder()
|
||||
handler.ServeHTTP(w2, req2)
|
||||
if got := w2.Header().Get("X-CoreScope-Load-Status"); got != "ready" {
|
||||
t.Fatalf("expected X-CoreScope-Load-Status=ready after load, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// (c) LoadChunked honors the chunkSize argument — progress callback
|
||||
// fires once per chunk.
|
||||
func TestLoadChunked_ChunkSizeHonored(t *testing.T) {
|
||||
store := openChunkedTestStore(t, 2500)
|
||||
defer store.db.conn.Close()
|
||||
|
||||
var chunks []int
|
||||
store.OnChunkLoaded(func(rowsThisChunk, totalRows int) {
|
||||
chunks = append(chunks, rowsThisChunk)
|
||||
})
|
||||
|
||||
if err := store.LoadChunked(1000); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
|
||||
if len(chunks) != 3 {
|
||||
t.Fatalf("expected 3 chunks for 2500 rows @ chunkSize=1000, got %d (sizes=%v)", len(chunks), chunks)
|
||||
}
|
||||
if chunks[0] != 1000 || chunks[1] != 1000 || chunks[2] != 500 {
|
||||
t.Fatalf("expected chunk sizes [1000,1000,500], got %v", chunks)
|
||||
}
|
||||
}
|
||||
|
||||
// (d) Config plumbing: DB.Load.ChunkSize threads through.
|
||||
func TestConfig_DBLoadChunkSize(t *testing.T) {
|
||||
c := &Config{}
|
||||
if got := c.DBLoadChunkSize(); got != 10000 {
|
||||
t.Fatalf("DBLoadChunkSize() default = %d, want 10000", got)
|
||||
}
|
||||
c.DB = &DBConfig{Load: &dbLoadConfig{ChunkSize: 2500}}
|
||||
if got := c.DBLoadChunkSize(); got != 2500 {
|
||||
t.Fatalf("DBLoadChunkSize() configured = %d, want 2500", got)
|
||||
}
|
||||
}
|
||||
@@ -33,4 +33,3 @@ func clampLimit(raw string, def, max int) int {
|
||||
func queryLimit(r *http.Request, def, max int) int {
|
||||
return clampLimit(r.URL.Query().Get("limit"), def, max)
|
||||
}
|
||||
|
||||
|
||||
@@ -133,6 +133,7 @@ type NodeClockSkew struct {
|
||||
Samples []SkewSample `json:"samples,omitempty"` // time-series for sparklines
|
||||
GoodFraction float64 `json:"goodFraction"` // fraction of recent samples with |skew| <= 1h
|
||||
RecentBadSampleCount int `json:"recentBadSampleCount"` // count of recent samples with |skew| > 1h
|
||||
RecentBadSamples []BadSample `json:"recentBadSamples,omitempty"` // #1094: per-bad-sample evidence (hash + bad advertTS)
|
||||
RecentSampleCount int `json:"recentSampleCount"` // total recent samples in window
|
||||
RecentHashEvidence []HashEvidence `json:"recentHashEvidence,omitempty"`
|
||||
CalibrationSummary *CalibrationSummary `json:"calibrationSummary,omitempty"`
|
||||
@@ -146,6 +147,15 @@ type SkewSample struct {
|
||||
SkewSec float64 `json:"skew"` // corrected skew in seconds
|
||||
}
|
||||
|
||||
// BadSample is a single recent advert flagged as having a nonsense timestamp
|
||||
// (|corrected skew| in the bimodal-bad band — > 1h, <= 24h). #1094: surfaced
|
||||
// so the UI can link each offender to its packet detail page.
|
||||
type BadSample struct {
	// Hash is the transmission hash, used for the packet-detail deep-link.
	Hash string `json:"hash"`
	// AdvertTS is the offending advert's Unix timestamp.
	AdvertTS int64 `json:"advertTS"`
	// SkewSec is the corrected skew versus the observer at observation time.
	SkewSec float64 `json:"skewSec"`
}
|
||||
|
||||
// HashEvidenceObserver is one observer's contribution to a per-hash evidence entry.
|
||||
type HashEvidenceObserver struct {
|
||||
ObserverID string `json:"observerID"`
|
||||
@@ -512,7 +522,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
|
||||
lastSkew = cs.LastSkewSec
|
||||
lastAdvTS = cs.LastAdvertTS
|
||||
}
|
||||
tsSkews = append(tsSkews, tsSkewPair{ts: cs.LastObservedTS, skew: cs.MedianSkewSec})
|
||||
tsSkews = append(tsSkews, tsSkewPair{ts: cs.LastObservedTS, skew: cs.MedianSkewSec, hash: tx.Hash, advertTS: cs.LastAdvertTS})
|
||||
}
|
||||
|
||||
if len(allSkews) == 0 {
|
||||
@@ -536,6 +546,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
|
||||
|
||||
recentSkew := lastSkew
|
||||
var recentVals []float64
|
||||
var recentPairs []tsSkewPair
|
||||
if n := len(tsSkews); n > 0 {
|
||||
latestTS := tsSkews[n-1].ts
|
||||
// Index-based window: last K samples.
|
||||
@@ -559,6 +570,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
|
||||
start = startByTime
|
||||
}
|
||||
recentVals = make([]float64, 0, n-start)
|
||||
recentPairs = tsSkews[start:n]
|
||||
for i := start; i < n; i++ {
|
||||
recentVals = append(recentVals, tsSkews[i].skew)
|
||||
}
|
||||
@@ -583,13 +595,25 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
|
||||
// adverts had nonsense timestamps") on otherwise-healthy nodes.
|
||||
var goodSamples []float64
|
||||
var rtcResetCount int
|
||||
for _, v := range recentVals {
|
||||
var recentBadSamples []BadSample // #1094: per-bad-sample evidence (hash + advertTS)
|
||||
for i, v := range recentVals {
|
||||
absV := math.Abs(v)
|
||||
switch {
|
||||
case absV > rtcResetOutlierThresholdSec:
|
||||
rtcResetCount++ // ignored for good/bad classification
|
||||
case absV <= bimodalSkewThresholdSec:
|
||||
goodSamples = append(goodSamples, v)
|
||||
default:
|
||||
// Bimodal-bad: 1h < |skew| <= 24h. Capture hash + advertTS so
|
||||
// the UI can link each offender to its packet detail page
|
||||
// instead of showing a count without evidence (#1094).
|
||||
if i < len(recentPairs) && recentPairs[i].hash != "" {
|
||||
recentBadSamples = append(recentBadSamples, BadSample{
|
||||
Hash: recentPairs[i].hash,
|
||||
AdvertTS: recentPairs[i].advertTS,
|
||||
SkewSec: round(v, 1),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
recentSampleCount := len(recentVals) - rtcResetCount
|
||||
@@ -715,6 +739,7 @@ func (s *PacketStore) getNodeClockSkewLocked(pubkey string) *NodeClockSkew {
|
||||
Samples: samples,
|
||||
GoodFraction: round(goodFraction, 2),
|
||||
RecentBadSampleCount: recentBadCount,
|
||||
RecentBadSamples: recentBadSamples,
|
||||
RecentSampleCount: recentSampleCount,
|
||||
RecentHashEvidence: recentEvidence,
|
||||
CalibrationSummary: &calSummary,
|
||||
@@ -875,10 +900,16 @@ func mean(vals []float64) float64 {
|
||||
return sum / float64(len(vals))
|
||||
}
|
||||
|
||||
// tsSkewPair is a (timestamp, skew) pair for drift estimation.
|
||||
// tsSkewPair is a (timestamp, skew) pair for drift estimation. Also carries
|
||||
// the source hash + advertTS so callers building per-sample evidence (e.g.
|
||||
// recentBadSamples for #1094) can identify the offending packet without a
|
||||
// second pass. Drift code reads only ts/skew; the extra fields are inert
|
||||
// there.
|
||||
type tsSkewPair struct {
|
||||
ts int64
|
||||
skew float64
|
||||
ts int64
|
||||
skew float64
|
||||
hash string
|
||||
advertTS int64
|
||||
}
|
||||
|
||||
// computeDrift estimates linear drift in seconds per day from time-ordered
|
||||
|
||||
@@ -0,0 +1,109 @@
|
||||
package main
|
||||
|
||||
// Regression test for #1094: the bimodal-clock warning currently exposes only
|
||||
// RecentBadSampleCount, leaving the UI to render "⚠️ N of M adverts had
|
||||
// nonsense timestamps" without telling the operator WHICH packets were bad.
|
||||
//
|
||||
// This test pins the additive API contract: alongside the count, the response
|
||||
// must expose RecentBadSamples — a slice of (hash, advertTS, skewSec) — so the
|
||||
// frontend can render each offending hash as a clickable link with its bad
|
||||
// timestamp.
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Seeds 5 recent adverts: 3 healthy (~-20s skew) and 2 with a "nonsense"
|
||||
// bimodal-bad timestamp (|skew| in (1h, 24h]). The recent window is exactly
|
||||
// 5 samples, so all five are inside it.
|
||||
func seedIssue1094Repro(t *testing.T) (*PacketStore, []string, []int64) {
|
||||
t.Helper()
|
||||
ps := NewPacketStore(nil, nil)
|
||||
pt := 4 // ADVERT
|
||||
|
||||
const pubkey = "BADTS1094"
|
||||
baseObs := int64(1779000000)
|
||||
|
||||
var txs []*StoreTx
|
||||
var badHashes []string
|
||||
var badAdvertTSs []int64
|
||||
|
||||
// 3 healthy adverts (skew = -20s).
|
||||
for i := 0; i < 3; i++ {
|
||||
obsTS := baseObs + int64(i)*60
|
||||
advTS := obsTS - 20
|
||||
txs = append(txs, &StoreTx{
|
||||
Hash: "healthy-1094-" + formatInt64(int64(i)),
|
||||
PayloadType: &pt,
|
||||
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(advTS) + `}}`,
|
||||
Observations: []*StoreObs{
|
||||
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// 2 nonsense-timestamp adverts (skew = -7200s = -2h — bimodal-bad,
|
||||
// below the 24h RTC-reset exclusion so they DO count in recentBadCount).
|
||||
for i := 0; i < 2; i++ {
|
||||
obsTS := baseObs + int64(3+i)*60
|
||||
advTS := obsTS - 7200
|
||||
hash := "bad-1094-" + formatInt64(int64(i))
|
||||
txs = append(txs, &StoreTx{
|
||||
Hash: hash,
|
||||
PayloadType: &pt,
|
||||
DecodedJSON: `{"payload":{"timestamp":` + formatInt64(advTS) + `}}`,
|
||||
Observations: []*StoreObs{
|
||||
{ObserverID: "obs1", Timestamp: time.Unix(obsTS, 0).UTC().Format(time.RFC3339)},
|
||||
},
|
||||
})
|
||||
badHashes = append(badHashes, hash)
|
||||
badAdvertTSs = append(badAdvertTSs, advTS)
|
||||
}
|
||||
|
||||
ps.mu.Lock()
|
||||
ps.byNode[pubkey] = txs
|
||||
for _, tx := range txs {
|
||||
ps.byPayloadType[4] = append(ps.byPayloadType[4], tx)
|
||||
}
|
||||
ps.clockSkew.computeInterval = 0
|
||||
ps.mu.Unlock()
|
||||
return ps, badHashes, badAdvertTSs
|
||||
}
|
||||
|
||||
func TestIssue1094_RecentBadSamples_ExposesHashAndTimestamp(t *testing.T) {
|
||||
ps, wantHashes, wantAdvertTSs := seedIssue1094Repro(t)
|
||||
r := ps.GetNodeClockSkew("BADTS1094")
|
||||
if r == nil {
|
||||
t.Fatal("expected clock skew result")
|
||||
}
|
||||
|
||||
// Pre-condition: count must already be 2 (gates the test against the
|
||||
// existing field — if this drops we'd be measuring the wrong thing).
|
||||
if r.RecentBadSampleCount != 2 {
|
||||
t.Fatalf("RecentBadSampleCount = %d, want 2 (seed bug, not the field-under-test)",
|
||||
r.RecentBadSampleCount)
|
||||
}
|
||||
|
||||
if len(r.RecentBadSamples) != 2 {
|
||||
t.Fatalf("RecentBadSamples len = %d, want 2 — operators need to see which "+
|
||||
"adverts had nonsense timestamps, not just the count",
|
||||
len(r.RecentBadSamples))
|
||||
}
|
||||
|
||||
gotByHash := map[string]int64{}
|
||||
for _, bs := range r.RecentBadSamples {
|
||||
gotByHash[bs.Hash] = bs.AdvertTS
|
||||
}
|
||||
for i, h := range wantHashes {
|
||||
ts, ok := gotByHash[h]
|
||||
if !ok {
|
||||
t.Errorf("RecentBadSamples missing hash %q", h)
|
||||
continue
|
||||
}
|
||||
if ts != wantAdvertTSs[i] {
|
||||
t.Errorf("RecentBadSamples[%q].AdvertTS = %d, want %d (the bad advertTS)",
|
||||
h, ts, wantAdvertTSs[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
+320
-33
@@ -8,6 +8,7 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/meshcore-analyzer/dbconfig"
|
||||
@@ -24,11 +25,21 @@ type AreaEntry struct {
|
||||
LonMax *float64 `json:"lonMax,omitempty"`
|
||||
}
|
||||
|
||||
// ListLimitsConfig defines maximum row limits for list endpoints to prevent DoS.
|
||||
type ListLimitsConfig struct {
	// One integer ceiling per list family; each JSON key is the field
	// name in lowerCamelCase.
	// NOTE(review): which handler consumes each limit, and how a zero
	// value is treated, is not visible here — confirm against callers.

	PacketsMax         int `json:"packetsMax"`
	NodesMax           int `json:"nodesMax"`
	AnalyticsMax       int `json:"analyticsMax"`
	ChannelMessagesMax int `json:"channelMessagesMax"`
	BulkHealthMax      int `json:"bulkHealthMax"`
}
|
||||
|
||||
// Config mirrors the Node.js config.json structure (read-only fields).
|
||||
type Config struct {
|
||||
Port int `json:"port"`
|
||||
APIKey string `json:"apiKey"`
|
||||
DBPath string `json:"dbPath"`
|
||||
Port int `json:"port"`
|
||||
APIKey string `json:"apiKey"`
|
||||
DBPath string `json:"dbPath"`
|
||||
ListLimits *ListLimitsConfig `json:"listLimits"`
|
||||
|
||||
// NodeBlacklist is a list of public keys to exclude from all API responses.
|
||||
// Blacklisted nodes are hidden from node lists, search, detail, map, and stats.
|
||||
@@ -37,9 +48,40 @@ type Config struct {
|
||||
// operator refuses to fix.
|
||||
NodeBlacklist []string `json:"nodeBlacklist"`
|
||||
|
||||
// blacklistSetCached is the lazily-built set version of NodeBlacklist.
|
||||
blacklistSetCached map[string]bool
|
||||
blacklistOnce sync.Once
|
||||
// HiddenNamePrefixes is a list of name prefixes that mark a node as
|
||||
// hidden from API responses (issue #1181). The default `["🚫"]` mirrors
|
||||
// a convention used by other MeshCore map dashboards: operators who
|
||||
// rename their node with the prefix get hidden from the map without
|
||||
// waiting for normal retention to clear stale data. DB rows are
|
||||
// preserved — the filter is applied at the API layer only, so the
|
||||
// underlying observation history remains intact.
|
||||
HiddenNamePrefixes []string `json:"hiddenNamePrefixes"`
|
||||
|
||||
// hiddenPrefixesPtr holds the active prefix slice as an atomic pointer.
|
||||
// Read path (IsNameHidden) is a single atomic load — no mutex, no
|
||||
// sync.Once. Writers always replace the whole slice; readers see either
|
||||
// the old or the new slice as a single value, never a partial state.
|
||||
// Mirrors blacklistSetPtr.
|
||||
hiddenPrefixesPtr atomic.Pointer[[]string]
|
||||
|
||||
// hiddenPrefixesGen is a monotonic counter bumped every time the
|
||||
// hidden-prefix list mutates via SetHiddenNamePrefixes. Cache wiring
|
||||
// is left for follow-up; the counter is the prerequisite primitive
|
||||
// callers will key on (mirrors blacklistGen / #1629).
|
||||
hiddenPrefixesGen atomic.Uint64
|
||||
|
||||
// blacklistSetPtr holds the active lookup set as an atomic pointer.
|
||||
// Read path is a single atomic load — no mutex, no sync.Once. Writers
|
||||
// always replace the whole map; readers see either the old or the new
|
||||
// map as a single value, never a partially-built one.
|
||||
blacklistSetPtr atomic.Pointer[map[string]bool]
|
||||
|
||||
// blacklistGen is a monotonic generation counter bumped every time the
|
||||
// blacklist mutates via SetNodeBlacklist. Callers that cache responses
|
||||
// keyed by pubkey (e.g. /api/nodes/{pubkey}/reach, #1629) include this
|
||||
// generation in their cache key so any blacklist change naturally
|
||||
// invalidates prior entries on the next request.
|
||||
blacklistGen atomic.Uint64
|
||||
|
||||
Branding map[string]interface{} `json:"branding"`
|
||||
Theme map[string]interface{} `json:"theme"`
|
||||
@@ -63,7 +105,8 @@ type Config struct {
|
||||
|
||||
Roles map[string]interface{} `json:"roles"`
|
||||
HealthThresholds *HealthThresholds `json:"healthThresholds"`
|
||||
Tiles map[string]interface{} `json:"tiles"`
|
||||
Map map[string]interface{} `json:"map"`
|
||||
Tiles map[string]interface{} `json:"tiles"` // deprecated
|
||||
SnrThresholds map[string]interface{} `json:"snrThresholds"`
|
||||
DistThresholds map[string]interface{} `json:"distThresholds"`
|
||||
MaxHopDist *float64 `json:"maxHopDist"`
|
||||
@@ -75,6 +118,7 @@ type Config struct {
|
||||
|
||||
LiveMap struct {
|
||||
PropagationBufferMs int `json:"propagationBufferMs"`
|
||||
MaxNodes int `json:"maxNodes"`
|
||||
} `json:"liveMap"`
|
||||
|
||||
CacheTTL map[string]interface{} `json:"cacheTTL"`
|
||||
@@ -85,6 +129,11 @@ type Config struct {
|
||||
|
||||
PacketStore *PacketStoreConfig `json:"packetStore,omitempty"`
|
||||
|
||||
// Runtime holds Go runtime tuning knobs (#1010).
|
||||
// Currently exposes runtime.maxMemoryMB which sets a soft memory limit
|
||||
// (GOMEMLIMIT) via runtime/debug.SetMemoryLimit at startup. The
|
||||
// GOMEMLIMIT environment variable, when set, takes precedence.
|
||||
Runtime *RuntimeConfig `json:"runtime,omitempty"`
|
||||
GeoFilter *GeoFilterConfig `json:"geo_filter,omitempty"`
|
||||
|
||||
Areas map[string]AreaEntry `json:"areas,omitempty"`
|
||||
@@ -99,10 +148,7 @@ type Config struct {
|
||||
DebugAffinity bool `json:"debugAffinity,omitempty"`
|
||||
|
||||
// MapDarkTileProvider selects the default dark-mode basemap provider for
|
||||
// new visitors. The client may override per-browser via the customizer
|
||||
// (persisted to localStorage). Allowed values: "carto-dark" (default),
|
||||
// "esri-darkgray-labels", "voyager-inverted", "positron-inverted". See
|
||||
// public/map-tile-providers.js for the registry. #1420.
|
||||
// new visitors. Deprecated: use Map.Tiles.DarkDefault instead.
|
||||
MapDarkTileProvider string `json:"mapDarkTileProvider,omitempty"`
|
||||
|
||||
// ObserverBlacklist is a list of observer public keys to exclude from API
|
||||
@@ -126,6 +172,26 @@ type Config struct {
|
||||
|
||||
// BatteryThresholds: voltage cutoffs for low/critical alerts (#663).
|
||||
BatteryThresholds *BatteryThresholdsConfig `json:"batteryThresholds,omitempty"`
|
||||
|
||||
// Customizer controls operator-side knobs for the in-app customizer modal
|
||||
// (theme/branding/etc.). See CustomizerConfig and issue #1508.
|
||||
Customizer *CustomizerConfig `json:"customizer,omitempty"`
|
||||
|
||||
// Known-channels catalogue integration (issue #1323).
|
||||
// URL of a JSON catalogue file (channels-by-country shape) fetched
|
||||
// periodically and exposed via /api/known-channels. Empty disables.
|
||||
KnownChannelsURL string `json:"knownChannelsUrl,omitempty"`
|
||||
// Refresh interval in milliseconds. 0/missing => default 24h.
|
||||
KnownChannelsRefreshMs int64 `json:"knownChannelsRefreshMs,omitempty"`
|
||||
}
|
||||
|
||||
// CustomizerConfig holds operator-side knobs for the in-app customizer modal.
|
||||
// Today only DisabledTabs is exposed: a list of tab ids the operator wants to
|
||||
// hide from end users (e.g. ["branding","geofilter","export"]). The frontend
|
||||
// (public/customize-v2.js _renderTabs) reads this from /api/config/client and
|
||||
// filters those tabs out before rendering. Issue #1508.
|
||||
type CustomizerConfig struct {
|
||||
DisabledTabs []string `json:"disabledTabs"`
|
||||
}
|
||||
|
||||
// weakAPIKeys is the blocklist of known default/example API keys that must be rejected.
|
||||
@@ -226,6 +292,16 @@ type PacketStoreConfig struct {
|
||||
// GeoFilterConfig is an alias for the shared geofilter.Config type.
|
||||
type GeoFilterConfig = geofilter.Config
|
||||
|
||||
// RuntimeConfig holds Go runtime tuning knobs (#1010).
|
||||
type RuntimeConfig struct {
|
||||
// MaxMemoryMB sets the Go soft memory limit (GOMEMLIMIT) in MiB via
|
||||
// runtime/debug.SetMemoryLimit at startup. Takes precedence over the
|
||||
// implicit limit derived from packetStore.maxMemoryMB. The GOMEMLIMIT
|
||||
// environment variable, when set, takes precedence over this value.
|
||||
// 0/unset preserves default behavior.
|
||||
MaxMemoryMB int `json:"maxMemoryMB"`
|
||||
}
|
||||
|
||||
type RetentionConfig struct {
|
||||
NodeDays int `json:"nodeDays"`
|
||||
ObserverDays int `json:"observerDays"`
|
||||
@@ -325,6 +401,10 @@ type HealthThresholds struct {
|
||||
// repeater to be considered "actively relaying" vs only "alive
|
||||
// (advert-only)". See issue #662. Defaults to 24h.
|
||||
RelayActiveHours float64 `json:"relayActiveHours"`
|
||||
// Issue #1552 — observer health classification thresholds (minutes).
|
||||
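// Defaults are 60 (online) and 1440 (stale), applied by GetHealthThresholds when a value is unset or <= 0; the values previously hardcoded in public/observers.js were 10/60.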
|
||||
ObserverOnlineMinutes int `json:"observerOnlineMinutes"`
|
||||
ObserverStaleMinutes int `json:"observerStaleMinutes"`
|
||||
}
|
||||
|
||||
// ThemeFile mirrors theme.json overlay.
|
||||
@@ -359,14 +439,71 @@ func LoadConfig(baseDirs ...string) (*Config, error) {
|
||||
continue
|
||||
}
|
||||
cfg.NormalizeTimestampConfig()
|
||||
cfg.migrateDeprecatedConfig()
|
||||
cfg.applyListLimitsDefaults()
|
||||
applyCORSEnv(cfg)
|
||||
return cfg, nil
|
||||
}
|
||||
cfg.NormalizeTimestampConfig()
|
||||
cfg.migrateDeprecatedConfig()
|
||||
cfg.applyListLimitsDefaults()
|
||||
applyCORSEnv(cfg)
|
||||
return cfg, nil // defaults
|
||||
}
|
||||
|
||||
func (c *Config) applyListLimitsDefaults() {
|
||||
if c.ListLimits == nil {
|
||||
c.ListLimits = &ListLimitsConfig{}
|
||||
}
|
||||
if c.ListLimits.PacketsMax <= 0 {
|
||||
c.ListLimits.PacketsMax = 10000
|
||||
}
|
||||
if c.ListLimits.NodesMax <= 0 {
|
||||
c.ListLimits.NodesMax = 2000
|
||||
}
|
||||
if c.ListLimits.AnalyticsMax <= 0 {
|
||||
c.ListLimits.AnalyticsMax = 200
|
||||
}
|
||||
if c.ListLimits.ChannelMessagesMax <= 0 {
|
||||
c.ListLimits.ChannelMessagesMax = 500
|
||||
}
|
||||
if c.ListLimits.BulkHealthMax <= 0 {
|
||||
c.ListLimits.BulkHealthMax = 200
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Config) migrateDeprecatedConfig() {
|
||||
migrated := false
|
||||
if c.Map == nil {
|
||||
c.Map = make(map[string]interface{})
|
||||
}
|
||||
if c.Map["tiles"] == nil {
|
||||
c.Map["tiles"] = make(map[string]interface{})
|
||||
}
|
||||
tilesMap, ok := c.Map["tiles"].(map[string]interface{})
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
if c.MapDarkTileProvider != "" {
|
||||
if tilesMap["darkDefault"] == nil {
|
||||
tilesMap["darkDefault"] = c.MapDarkTileProvider
|
||||
}
|
||||
migrated = true
|
||||
}
|
||||
if len(c.Tiles) > 0 {
|
||||
for k, v := range c.Tiles {
|
||||
if tilesMap[k] == nil {
|
||||
tilesMap[k] = v
|
||||
}
|
||||
}
|
||||
migrated = true
|
||||
}
|
||||
if migrated {
|
||||
fmt.Fprintf(os.Stderr, "[deprecated] Top-level 'mapDarkTileProvider' and 'tiles' keys in config.json are deprecated and will be ignored in v3.5.0 (see #1165). Please move them into 'map': { 'tiles': { ... } }.\n")
|
||||
}
|
||||
}
|
||||
|
||||
func LoadTheme(baseDirs ...string) *ThemeFile {
|
||||
if len(baseDirs) == 0 {
|
||||
baseDirs = []string{"."}
|
||||
@@ -415,6 +552,18 @@ func (c *Config) GetHealthThresholds() HealthThresholds {
|
||||
if c.HealthThresholds.RelayActiveHours > 0 {
|
||||
h.RelayActiveHours = c.HealthThresholds.RelayActiveHours
|
||||
}
|
||||
if c.HealthThresholds.ObserverOnlineMinutes > 0 {
|
||||
h.ObserverOnlineMinutes = c.HealthThresholds.ObserverOnlineMinutes
|
||||
}
|
||||
if c.HealthThresholds.ObserverStaleMinutes > 0 {
|
||||
h.ObserverStaleMinutes = c.HealthThresholds.ObserverStaleMinutes
|
||||
}
|
||||
}
|
||||
if h.ObserverOnlineMinutes <= 0 {
|
||||
h.ObserverOnlineMinutes = 60
|
||||
}
|
||||
if h.ObserverStaleMinutes <= 0 {
|
||||
h.ObserverStaleMinutes = 1440
|
||||
}
|
||||
return h
|
||||
}
|
||||
@@ -431,11 +580,14 @@ func (h HealthThresholds) GetHealthMs(role string) (degradedMs, silentMs int) {
|
||||
// ToClientMs returns the thresholds as ms for the frontend.
|
||||
func (h HealthThresholds) ToClientMs() map[string]int {
|
||||
const hourMs = 3600000
|
||||
const minMs = 60000
|
||||
return map[string]int{
|
||||
"infraDegradedMs": int(h.InfraDegradedHours * hourMs),
|
||||
"infraSilentMs": int(h.InfraSilentHours * hourMs),
|
||||
"nodeDegradedMs": int(h.NodeDegradedHours * hourMs),
|
||||
"nodeSilentMs": int(h.NodeSilentHours * hourMs),
|
||||
"infraDegradedMs": int(h.InfraDegradedHours * hourMs),
|
||||
"infraSilentMs": int(h.InfraSilentHours * hourMs),
|
||||
"nodeDegradedMs": int(h.NodeDegradedHours * hourMs),
|
||||
"nodeSilentMs": int(h.NodeSilentHours * hourMs),
|
||||
"observerOnlineMs": h.ObserverOnlineMinutes * minMs,
|
||||
"observerStaleMs": h.ObserverStaleMinutes * minMs,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -502,31 +654,166 @@ func (c *Config) PropagationBufferMs() int {
|
||||
return 5000
|
||||
}
|
||||
|
||||
// blacklistSet lazily builds and caches the nodeBlacklist as a set for O(1) lookups.
|
||||
// Uses sync.Once to eliminate the data race on first concurrent access.
|
||||
func (c *Config) blacklistSet() map[string]bool {
|
||||
c.blacklistOnce.Do(func() {
|
||||
if len(c.NodeBlacklist) == 0 {
|
||||
return
|
||||
// LiveMapMaxNodes returns the operator-configured cap on how many nodes
|
||||
// the live map fetches (and thus renders) in a single page. Default is
|
||||
// 2000; values are clamped to [100, 20000] to defang misconfig.
|
||||
// Negative/zero falls back to default. See #1574.
|
||||
func (c *Config) LiveMapMaxNodes() int {
|
||||
const def = 2000
|
||||
const min = 100
|
||||
const max = 20000
|
||||
if c == nil || c.LiveMap.MaxNodes <= 0 {
|
||||
return def
|
||||
}
|
||||
v := c.LiveMap.MaxNodes
|
||||
if v < min {
|
||||
return min
|
||||
}
|
||||
if v > max {
|
||||
return max
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// buildBlacklistSet builds the lookup set for the given public keys.
//
// Each key is trimmed of surrounding whitespace and lowercased before being
// added; entries that are empty after trimming are skipped. The function
// returns nil when pks is empty or when no usable entry remains, so callers
// can short-circuit with len(m) == 0.
func buildBlacklistSet(pks []string) map[string]bool {
	if len(pks) == 0 {
		return nil
	}
	set := make(map[string]bool, len(pks))
	for _, pk := range pks {
		key := strings.ToLower(strings.TrimSpace(pk))
		if key != "" {
			set[key] = true
		}
	}
	if len(set) == 0 {
		return nil
	}
	return set
}
|
||||
|
||||
// SetNodeBlacklist atomically replaces NodeBlacklist with pks, rebuilds the
|
||||
// lookup set, and bumps the generation counter so any cache keyed on the
|
||||
// generation invalidates on the next request (#1629). Safe for concurrent
|
||||
// use with IsBlacklisted / BlacklistGeneration.
|
||||
func (c *Config) SetNodeBlacklist(pks []string) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
// Copy so callers can mutate their slice without affecting us.
|
||||
cp := make([]string, len(pks))
|
||||
copy(cp, pks)
|
||||
c.NodeBlacklist = cp
|
||||
m := buildBlacklistSet(cp)
|
||||
c.blacklistSetPtr.Store(&m)
|
||||
c.blacklistGen.Add(1)
|
||||
}
|
||||
|
||||
// BlacklistGeneration returns a monotonic counter that increments on every
|
||||
// SetNodeBlacklist call. Response caches keyed per-pubkey embed this value
|
||||
// in their cache key so any blacklist mutation invalidates prior entries on
|
||||
// the next request (#1629).
|
||||
func (c *Config) BlacklistGeneration() uint64 {
|
||||
if c == nil {
|
||||
return 0
|
||||
}
|
||||
return c.blacklistGen.Load()
|
||||
}
|
||||
|
||||
// IsBlacklisted returns true if the given public key is in the nodeBlacklist.
|
||||
// Hot read path: a single atomic pointer load + map lookup. No locks, no
|
||||
// sync.Once. The in-memory set is populated either via SetNodeBlacklist or
|
||||
// lazily on first read from c.NodeBlacklist (covering the JSON-load path
|
||||
// where the setter was never called).
|
||||
func (c *Config) IsBlacklisted(pubkey string) bool {
|
||||
if c == nil || len(c.NodeBlacklist) == 0 {
|
||||
if c == nil {
|
||||
return false
|
||||
}
|
||||
return c.blacklistSet()[strings.ToLower(strings.TrimSpace(pubkey))]
|
||||
mp := c.blacklistSetPtr.Load()
|
||||
if mp == nil {
|
||||
// Lazy first-read materialisation from the JSON-loaded slice.
|
||||
// CAS-style: if another goroutine wins the race, drop ours.
|
||||
built := buildBlacklistSet(c.NodeBlacklist)
|
||||
if c.blacklistSetPtr.CompareAndSwap(nil, &built) {
|
||||
mp = &built
|
||||
} else {
|
||||
mp = c.blacklistSetPtr.Load()
|
||||
}
|
||||
}
|
||||
if mp == nil || len(*mp) == 0 {
|
||||
return false
|
||||
}
|
||||
return (*mp)[strings.ToLower(strings.TrimSpace(pubkey))]
|
||||
}
|
||||
|
||||
// IsNameHidden returns true if the given node name starts with any of the
|
||||
// operator-configured HiddenNamePrefixes (issue #1181). Empty/whitespace
|
||||
// prefixes are ignored. Used to drop nodes from /api/nodes, /api/nodes/search
|
||||
// and /api/nodes/{pubkey} without deleting the underlying DB row, so observer
|
||||
// history stays intact even after the operator hides the node.
|
||||
//
|
||||
// Hot read path: a single atomic pointer load. No locks, no sync.Once.
|
||||
// Writers always replace the whole slice; readers see either the old or
|
||||
// the new slice as a single value, never a partially-built one. Mirrors
|
||||
// IsBlacklisted's CAS-style lazy first-read materialisation for the
|
||||
// JSON-load path where SetHiddenNamePrefixes was never called.
|
||||
func (c *Config) IsNameHidden(name string) bool {
|
||||
if c == nil {
|
||||
return false
|
||||
}
|
||||
pp := c.hiddenPrefixesPtr.Load()
|
||||
if pp == nil {
|
||||
// Lazy first-read materialisation from the JSON-loaded slice.
|
||||
// CAS-style: if another goroutine wins the race, drop ours.
|
||||
built := make([]string, len(c.HiddenNamePrefixes))
|
||||
copy(built, c.HiddenNamePrefixes)
|
||||
if c.hiddenPrefixesPtr.CompareAndSwap(nil, &built) {
|
||||
pp = &built
|
||||
} else {
|
||||
pp = c.hiddenPrefixesPtr.Load()
|
||||
}
|
||||
}
|
||||
if pp == nil || len(*pp) == 0 {
|
||||
return false
|
||||
}
|
||||
for _, p := range *pp {
|
||||
if p == "" {
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(name, p) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// SetHiddenNamePrefixes atomically replaces HiddenNamePrefixes with the
|
||||
// given slice and bumps the generation counter. Safe for concurrent use
|
||||
// with IsNameHidden / HiddenNamePrefixesGeneration. Mirrors
|
||||
// SetNodeBlacklist (#1629).
|
||||
func (c *Config) SetHiddenNamePrefixes(prefixes []string) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
cp := make([]string, len(prefixes))
|
||||
copy(cp, prefixes)
|
||||
c.HiddenNamePrefixes = cp
|
||||
c.hiddenPrefixesPtr.Store(&cp)
|
||||
c.hiddenPrefixesGen.Add(1)
|
||||
}
|
||||
|
||||
// HiddenNamePrefixesGeneration returns a monotonic counter that increments
|
||||
// on every SetHiddenNamePrefixes call. Response caches keyed per-pubkey can
|
||||
// embed this value in their cache key so any prefix mutation invalidates
|
||||
// prior entries on the next request — same pattern as BlacklistGeneration.
|
||||
func (c *Config) HiddenNamePrefixesGeneration() uint64 {
|
||||
if c == nil {
|
||||
return 0
|
||||
}
|
||||
return c.hiddenPrefixesGen.Load()
|
||||
}
|
||||
|
||||
// SaveGeoFilter writes the geo_filter section back to config.json on disk.
|
||||
|
||||
@@ -387,3 +387,131 @@ func TestObserverDaysOrDefault(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Issue #1552 — observer health thresholds configurable.
|
||||
|
||||
func TestObserverThresholdsOverride(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
cfgData := map[string]interface{}{
|
||||
"healthThresholds": map[string]interface{}{
|
||||
"observerOnlineMinutes": 30,
|
||||
"observerStaleMinutes": 120,
|
||||
},
|
||||
}
|
||||
data, _ := json.Marshal(cfgData)
|
||||
os.WriteFile(filepath.Join(dir, "config.json"), data, 0644)
|
||||
cfg, err := LoadConfig(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
h := cfg.GetHealthThresholds()
|
||||
if h.ObserverOnlineMinutes != 30 {
|
||||
t.Errorf("ObserverOnlineMinutes = %d, want 30", h.ObserverOnlineMinutes)
|
||||
}
|
||||
if h.ObserverStaleMinutes != 120 {
|
||||
t.Errorf("ObserverStaleMinutes = %d, want 120", h.ObserverStaleMinutes)
|
||||
}
|
||||
m := h.ToClientMs()
|
||||
if m["observerOnlineMs"] != 30*60*1000 {
|
||||
t.Errorf("observerOnlineMs = %d, want %d", m["observerOnlineMs"], 30*60*1000)
|
||||
}
|
||||
if m["observerStaleMs"] != 120*60*1000 {
|
||||
t.Errorf("observerStaleMs = %d, want %d", m["observerStaleMs"], 120*60*1000)
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserverThresholdsDefaults(t *testing.T) {
|
||||
cfg := &Config{}
|
||||
h := cfg.GetHealthThresholds()
|
||||
if h.ObserverOnlineMinutes != 60 {
|
||||
t.Errorf("default ObserverOnlineMinutes = %d, want 60", h.ObserverOnlineMinutes)
|
||||
}
|
||||
if h.ObserverStaleMinutes != 1440 {
|
||||
t.Errorf("default ObserverStaleMinutes = %d, want 1440", h.ObserverStaleMinutes)
|
||||
}
|
||||
m := h.ToClientMs()
|
||||
if m["observerOnlineMs"] != 3600000 {
|
||||
t.Errorf("default observerOnlineMs = %d, want 3600000", m["observerOnlineMs"])
|
||||
}
|
||||
if m["observerStaleMs"] != 86400000 {
|
||||
t.Errorf("default observerStaleMs = %d, want 86400000", m["observerStaleMs"])
|
||||
}
|
||||
}
|
||||
|
||||
// Loading a config with no healthThresholds block at all must still produce
|
||||
// the new 60 / 1440 defaults (not zero, not the old 10 / 60).
|
||||
func TestObserverThresholdsDefaultsFromEmptyConfigFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{"port": 3000}`), 0644)
|
||||
cfg, err := LoadConfig(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
h := cfg.GetHealthThresholds()
|
||||
if h.ObserverOnlineMinutes != 60 {
|
||||
t.Errorf("empty-config ObserverOnlineMinutes = %d, want 60 (new default)", h.ObserverOnlineMinutes)
|
||||
}
|
||||
if h.ObserverStaleMinutes != 1440 {
|
||||
t.Errorf("empty-config ObserverStaleMinutes = %d, want 1440 (new default)", h.ObserverStaleMinutes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyListLimitsDefaults(t *testing.T) {
|
||||
t.Run("defaults when block is absent", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
os.WriteFile(filepath.Join(dir, "config.json"), []byte(`{"port": 3000}`), 0644)
|
||||
cfg, err := LoadConfig(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if cfg.ListLimits.PacketsMax != 10000 {
|
||||
t.Errorf("expected 10000, got %d", cfg.ListLimits.PacketsMax)
|
||||
}
|
||||
if cfg.ListLimits.NodesMax != 2000 {
|
||||
t.Errorf("expected 2000, got %d", cfg.ListLimits.NodesMax)
|
||||
}
|
||||
if cfg.ListLimits.AnalyticsMax != 200 {
|
||||
t.Errorf("expected 200, got %d", cfg.ListLimits.AnalyticsMax)
|
||||
}
|
||||
if cfg.ListLimits.ChannelMessagesMax != 500 {
|
||||
t.Errorf("expected 500, got %d", cfg.ListLimits.ChannelMessagesMax)
|
||||
}
|
||||
if cfg.ListLimits.BulkHealthMax != 200 {
|
||||
t.Errorf("expected 200, got %d", cfg.ListLimits.BulkHealthMax)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("operator overrides honored", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
cfgData := map[string]interface{}{
|
||||
"listLimits": map[string]interface{}{
|
||||
"packetsMax": 50000,
|
||||
"nodesMax": 5000,
|
||||
"analyticsMax": 500,
|
||||
"channelMessagesMax": 1000,
|
||||
"bulkHealthMax": 300,
|
||||
},
|
||||
}
|
||||
data, _ := json.Marshal(cfgData)
|
||||
os.WriteFile(filepath.Join(dir, "config.json"), data, 0644)
|
||||
cfg, err := LoadConfig(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if cfg.ListLimits.PacketsMax != 50000 {
|
||||
t.Errorf("expected 50000, got %d", cfg.ListLimits.PacketsMax)
|
||||
}
|
||||
if cfg.ListLimits.NodesMax != 5000 {
|
||||
t.Errorf("expected 5000, got %d", cfg.ListLimits.NodesMax)
|
||||
}
|
||||
if cfg.ListLimits.AnalyticsMax != 500 {
|
||||
t.Errorf("expected 500, got %d", cfg.ListLimits.AnalyticsMax)
|
||||
}
|
||||
if cfg.ListLimits.ChannelMessagesMax != 1000 {
|
||||
t.Errorf("expected 1000, got %d", cfg.ListLimits.ChannelMessagesMax)
|
||||
}
|
||||
if cfg.ListLimits.BulkHealthMax != 300 {
|
||||
t.Errorf("expected 300, got %d", cfg.ListLimits.BulkHealthMax)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -2289,6 +2289,10 @@ func TestSubpathPrecomputedIndex(t *testing.T) {
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
store.Load()
|
||||
// #1008: indexes built in background goroutine; wait before reading.
|
||||
if !store.WaitIndexesReady(5 * time.Second) {
|
||||
t.Fatal("indexes never became ready")
|
||||
}
|
||||
|
||||
// After Load(), the precomputed index must be populated.
|
||||
if len(store.spIndex) == 0 {
|
||||
@@ -2343,6 +2347,10 @@ func TestSubpathTxIndexPopulated(t *testing.T) {
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
store.Load()
|
||||
// #1008: indexes built in background goroutine; wait before reading.
|
||||
if !store.WaitIndexesReady(5 * time.Second) {
|
||||
t.Fatal("indexes never became ready")
|
||||
}
|
||||
|
||||
// spTxIndex must be populated alongside spIndex
|
||||
if len(store.spTxIndex) == 0 {
|
||||
@@ -2387,6 +2395,10 @@ func TestSubpathDetailMixedCaseHops(t *testing.T) {
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
store.Load()
|
||||
// #1008: indexes built in background goroutine; wait before reading.
|
||||
if !store.WaitIndexesReady(5 * time.Second) {
|
||||
t.Fatal("indexes never became ready")
|
||||
}
|
||||
|
||||
// Query with lowercase hops to establish baseline
|
||||
lower := store.GetSubpathDetail([]string{"eeff", "0011"})
|
||||
@@ -2701,6 +2713,17 @@ func TestHandleAnalyticsDistanceWithStore(t *testing.T) {
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
// #1011: lazy distance index — first request returns 202; trigger
|
||||
// the build and wait for it before asserting the 200 shape.
|
||||
store.TriggerDistanceIndexBuild()
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for !store.DistanceIndexBuilt() {
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatal("distance index did not finish building within 5s")
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/distance", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// TestConfigClientExposesCustomizerDisabledTabs verifies that the
|
||||
// /api/config/client endpoint surfaces the operator-set list of customizer
|
||||
// tabs to hide, so the customize-v2 frontend can filter them out of
|
||||
// _renderTabs(). Issue #1508.
|
||||
func TestConfigClientExposesCustomizerDisabledTabs(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
seedTestData(t, db)
|
||||
cfg := &Config{
|
||||
Port: 3000,
|
||||
Customizer: &CustomizerConfig{
|
||||
DisabledTabs: []string{"branding", "geofilter", "export"},
|
||||
},
|
||||
}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
store := NewPacketStore(db, nil)
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatalf("store.Load failed: %v", err)
|
||||
}
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/config/client", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d (body=%s)", w.Code, w.Body.String())
|
||||
}
|
||||
var body map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
custRaw, ok := body["customizer"].(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("expected body.customizer object, got %T (body=%s)", body["customizer"], w.Body.String())
|
||||
}
|
||||
tabsRaw, ok := custRaw["disabledTabs"].([]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("expected body.customizer.disabledTabs array, got %T", custRaw["disabledTabs"])
|
||||
}
|
||||
got := make([]string, 0, len(tabsRaw))
|
||||
for _, v := range tabsRaw {
|
||||
s, ok := v.(string)
|
||||
if !ok {
|
||||
t.Fatalf("disabledTabs element not a string: %T", v)
|
||||
}
|
||||
got = append(got, s)
|
||||
}
|
||||
want := []string{"branding", "export", "geofilter"}
|
||||
sort.Strings(got)
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Errorf("disabledTabs: got %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// TestConfigClientDefaultsCustomizerDisabledTabsEmpty verifies the backward-
|
||||
// compat default: when no customizer block is configured, the field is still
|
||||
// present and is an empty array (so the frontend can blindly call .includes()).
|
||||
func TestConfigClientDefaultsCustomizerDisabledTabsEmpty(t *testing.T) {
|
||||
_, router := setupTestServer(t)
|
||||
req := httptest.NewRequest("GET", "/api/config/client", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d", w.Code)
|
||||
}
|
||||
var body map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
custRaw, ok := body["customizer"].(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("expected body.customizer object, got %T", body["customizer"])
|
||||
}
|
||||
tabsRaw, ok := custRaw["disabledTabs"].([]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("expected body.customizer.disabledTabs array, got %T", custRaw["disabledTabs"])
|
||||
}
|
||||
if len(tabsRaw) != 0 {
|
||||
t.Errorf("default disabledTabs should be empty, got %v", tabsRaw)
|
||||
}
|
||||
}
|
||||
+107
-4
@@ -12,6 +12,7 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/meshcore-analyzer/dbschema"
|
||||
"github.com/meshcore-analyzer/geofilter"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
@@ -30,6 +31,7 @@ type DB struct {
|
||||
hasScopeName bool // transmissions.scope_name column exists (#899)
|
||||
hasDefaultScope bool // nodes.default_scope column exists (#899)
|
||||
hasMultibyteSupCols bool // nodes/inactive_nodes have multibyte_sup/multibyte_evidence (#903)
|
||||
hasLastSeen bool // transmissions.last_seen column exists (#1690)
|
||||
|
||||
// Channel list cache (60s TTL) — avoids repeated GROUP BY scans (#762)
|
||||
channelsCacheMu sync.Mutex
|
||||
@@ -107,6 +109,9 @@ func (db *DB) detectSchema() {
|
||||
if colName == "scope_name" {
|
||||
db.hasScopeName = true
|
||||
}
|
||||
if colName == "last_seen" {
|
||||
db.hasLastSeen = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,6 +256,13 @@ type Observer struct {
|
||||
ClockSkewSeconds *int64 `json:"clock_skew_seconds"`
|
||||
ClockSkewCount24h int `json:"clock_skew_count_24h"`
|
||||
ClockLastNaiveAt *string `json:"clock_last_naive_at"`
|
||||
// Issue #1290: firmware 1.16 `repeat: on|off` flag persisted by the
|
||||
// ingestor. true = relay-capable, false = listener-only, nil =
|
||||
// unknown (legacy observer that never sent the field — drives the
|
||||
// tri-state UI badge so legacy rows don't masquerade as confirmed
|
||||
// repeaters). The ingestor sets can_relay_seen=1 only when it has
|
||||
// an explicit value; the read layer returns nil when seen=0.
|
||||
CanRelay *bool `json:"can_relay,omitempty"`
|
||||
}
|
||||
|
||||
// Transmission represents a row from the transmissions table.
|
||||
@@ -479,6 +491,8 @@ type PacketQuery struct {
|
||||
type PacketResult struct {
|
||||
Packets []map[string]interface{} `json:"packets"`
|
||||
Total int `json:"total"`
|
||||
Limit int `json:"limit"`
|
||||
Offset int `json:"offset"`
|
||||
}
|
||||
|
||||
// QueryPackets returns paginated, filtered packets as transmissions (matching Node.js shape).
|
||||
@@ -1146,9 +1160,24 @@ func (db *DB) getObservationsForTransmissions(txIDs []int) map[int][]map[string]
|
||||
|
||||
// GetObservers returns active observers (not soft-deleted) sorted by last_seen DESC.
|
||||
func (db *DB) GetObservers() ([]Observer, error) {
|
||||
// Issue #1290: can_relay is read via COALESCE(can_relay, 1). The
|
||||
// column is added by internal/dbschema; older test fixtures and
|
||||
// pre-migration DBs may lack it, so we probe and fall back.
|
||||
// PR #1624 MAJOR-2: can_relay_seen is the tri-state sentinel — 1
|
||||
// means the ingestor explicitly wrote a value, 0 means "unknown"
|
||||
// and the server returns CanRelay=nil so the UI shows no badge.
|
||||
canRelayClause := "COALESCE(can_relay, 1)"
|
||||
canRelaySeenClause := "0"
|
||||
if hasCol, _ := dbschema.TableHasColumn(db.conn, "observers", "can_relay"); !hasCol {
|
||||
canRelayClause = "1"
|
||||
}
|
||||
if hasCol, _ := dbschema.TableHasColumn(db.conn, "observers", "can_relay_seen"); hasCol {
|
||||
canRelaySeenClause = "COALESCE(can_relay_seen, 0)"
|
||||
}
|
||||
rows, err := db.conn.Query(`SELECT id, name, iata, last_seen, first_seen, packet_count,
|
||||
model, firmware, client_version, radio, battery_mv, uptime_secs, noise_floor, last_packet_at,
|
||||
clock_skew_seconds, clock_skew_count_24h, clock_last_naive_at
|
||||
clock_skew_seconds, clock_skew_count_24h, clock_last_naive_at,
|
||||
` + canRelayClause + `, ` + canRelaySeenClause + `
|
||||
FROM observers WHERE inactive IS NULL OR inactive = 0 ORDER BY last_seen DESC`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -1161,11 +1190,16 @@ func (db *DB) GetObservers() ([]Observer, error) {
|
||||
var batteryMv, uptimeSecs, clockSkewSec sql.NullInt64
|
||||
var clockSkewCount sql.NullInt64
|
||||
var noiseFloor sql.NullFloat64
|
||||
var canRelay, canRelaySeen int
|
||||
if err := rows.Scan(&o.ID, &o.Name, &o.IATA, &o.LastSeen, &o.FirstSeen, &o.PacketCount,
|
||||
&o.Model, &o.Firmware, &o.ClientVersion, &o.Radio, &batteryMv, &uptimeSecs, &noiseFloor, &o.LastPacketAt,
|
||||
&clockSkewSec, &clockSkewCount, &o.ClockLastNaiveAt); err != nil {
|
||||
&clockSkewSec, &clockSkewCount, &o.ClockLastNaiveAt, &canRelay, &canRelaySeen); err != nil {
|
||||
continue
|
||||
}
|
||||
if canRelaySeen != 0 {
|
||||
b := canRelay != 0
|
||||
o.CanRelay = &b
|
||||
}
|
||||
if batteryMv.Valid {
|
||||
v := int(batteryMv.Int64)
|
||||
o.BatteryMv = &v
|
||||
@@ -1188,22 +1222,91 @@ func (db *DB) GetObservers() ([]Observer, error) {
|
||||
return observers, nil
|
||||
}
|
||||
|
||||
// GetNonRelayObserverPubkeys returns the lowercase observer.id pubkeys
|
||||
// for observers that have advertised `repeat:off` (#1290). The server's
|
||||
// path-hop disambiguator consumes this to exclude listener-only nodes
|
||||
// from the candidate set. Inactive observers are excluded for
|
||||
// consistency with GetObservers; reactivation flips can_relay only on
|
||||
// the next status message.
|
||||
func (db *DB) GetNonRelayObserverPubkeys() ([]string, error) {
|
||||
// Graceful no-op when can_relay column is absent (legacy DB / older
|
||||
// test fixture). Avoids noisy schema-degradation log spam.
|
||||
if hasCol, _ := dbschema.TableHasColumn(db.conn, "observers", "can_relay"); !hasCol {
|
||||
return nil, nil
|
||||
}
|
||||
rows, err := db.conn.Query(`SELECT LOWER(id) FROM observers
|
||||
WHERE COALESCE(can_relay, 1) = 0
|
||||
AND (inactive IS NULL OR inactive = 0)`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var out []string
|
||||
for rows.Next() {
|
||||
var pk string
|
||||
if err := rows.Scan(&pk); err == nil && pk != "" {
|
||||
out = append(out, pk)
|
||||
}
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// GetCanRelaySeenObserverPubkeys returns the lowercase observer.id
|
||||
// pubkeys for which the ingestor has explicitly written a repeat-field
|
||||
// value (can_relay_seen=1). PR #1624 MAJOR-2: the badge surface uses
|
||||
// this to render tri-state — observers NOT in this set are "unknown"
|
||||
// and the UI shows no badge.
|
||||
func (db *DB) GetCanRelaySeenObserverPubkeys() ([]string, error) {
|
||||
if hasCol, _ := dbschema.TableHasColumn(db.conn, "observers", "can_relay_seen"); !hasCol {
|
||||
return nil, nil
|
||||
}
|
||||
rows, err := db.conn.Query(`SELECT LOWER(id) FROM observers
|
||||
WHERE COALESCE(can_relay_seen, 0) = 1
|
||||
AND (inactive IS NULL OR inactive = 0)`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var out []string
|
||||
for rows.Next() {
|
||||
var pk string
|
||||
if err := rows.Scan(&pk); err == nil && pk != "" {
|
||||
out = append(out, pk)
|
||||
}
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// GetObserverByID returns a single observer.
|
||||
func (db *DB) GetObserverByID(id string) (*Observer, error) {
|
||||
var o Observer
|
||||
var batteryMv, uptimeSecs, clockSkewSec sql.NullInt64
|
||||
var clockSkewCount sql.NullInt64
|
||||
var noiseFloor sql.NullFloat64
|
||||
var canRelay, canRelaySeen int
|
||||
canRelayClause := "COALESCE(can_relay, 1)"
|
||||
canRelaySeenClause := "0"
|
||||
if hasCol, _ := dbschema.TableHasColumn(db.conn, "observers", "can_relay"); !hasCol {
|
||||
canRelayClause = "1"
|
||||
}
|
||||
if hasCol, _ := dbschema.TableHasColumn(db.conn, "observers", "can_relay_seen"); hasCol {
|
||||
canRelaySeenClause = "COALESCE(can_relay_seen, 0)"
|
||||
}
|
||||
err := db.conn.QueryRow(`SELECT id, name, iata, last_seen, first_seen, packet_count,
|
||||
model, firmware, client_version, radio, battery_mv, uptime_secs, noise_floor, last_packet_at,
|
||||
clock_skew_seconds, clock_skew_count_24h, clock_last_naive_at
|
||||
clock_skew_seconds, clock_skew_count_24h, clock_last_naive_at,
|
||||
`+canRelayClause+`, `+canRelaySeenClause+`
|
||||
FROM observers WHERE id = ?`, id).
|
||||
Scan(&o.ID, &o.Name, &o.IATA, &o.LastSeen, &o.FirstSeen, &o.PacketCount,
|
||||
&o.Model, &o.Firmware, &o.ClientVersion, &o.Radio, &batteryMv, &uptimeSecs, &noiseFloor, &o.LastPacketAt,
|
||||
&clockSkewSec, &clockSkewCount, &o.ClockLastNaiveAt)
|
||||
&clockSkewSec, &clockSkewCount, &o.ClockLastNaiveAt, &canRelay, &canRelaySeen)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if canRelaySeen != 0 {
|
||||
b := canRelay != 0
|
||||
o.CanRelay = &b
|
||||
}
|
||||
if batteryMv.Valid {
|
||||
v := int(batteryMv.Int64)
|
||||
o.BatteryMv = &v
|
||||
|
||||
+44
-1
@@ -91,6 +91,11 @@ type Payload struct {
|
||||
MAC string `json:"mac,omitempty"`
|
||||
EncryptedData string `json:"encryptedData,omitempty"`
|
||||
ExtraHash string `json:"extraHash,omitempty"`
|
||||
// Extended ACK fields per firmware 1.16.0 (issue #1610) — populated by
|
||||
// decodeAck once the server-side re-decoder is upgraded (issue #1694).
|
||||
AckLen *int `json:"ackLen,omitempty"`
|
||||
AckAttempt *int `json:"ackAttempt,omitempty"`
|
||||
AckRand *int `json:"ackRand,omitempty"`
|
||||
PubKey string `json:"pubKey,omitempty"`
|
||||
Timestamp uint32 `json:"timestamp,omitempty"`
|
||||
TimestampISO string `json:"timestampISO,omitempty"`
|
||||
@@ -124,6 +129,11 @@ type Payload struct {
|
||||
InnerType *int `json:"innerType,omitempty"`
|
||||
InnerTypeName string `json:"innerTypeName,omitempty"`
|
||||
InnerAckCrc string `json:"innerAckCrc,omitempty"`
|
||||
// Extended ACK inner fields (issue #1610 / #1694) — populated by
|
||||
// decodeMultipart once ACK parity is ported from the ingestor.
|
||||
InnerAckLen *int `json:"innerAckLen,omitempty"`
|
||||
InnerAckAttempt *int `json:"innerAckAttempt,omitempty"`
|
||||
InnerAckRand *int `json:"innerAckRand,omitempty"`
|
||||
InnerPayload string `json:"innerPayload,omitempty"`
|
||||
// CONTROL (PAYLOAD_TYPE_CONTROL=0x0B) byte0 flags, per
|
||||
// firmware/src/Mesh.cpp:69 — high-bit = zero-hop direct subset.
|
||||
@@ -241,10 +251,27 @@ func decodeAck(buf []byte) Payload {
|
||||
return Payload{Type: "ACK", Error: "too short", RawHex: hex.EncodeToString(buf)}
|
||||
}
|
||||
checksum := binary.LittleEndian.Uint32(buf[0:4])
|
||||
return Payload{
|
||||
ackLen := len(buf)
|
||||
if ackLen > 6 {
|
||||
ackLen = 6
|
||||
}
|
||||
p := Payload{
|
||||
Type: "ACK",
|
||||
ExtraHash: fmt.Sprintf("%08x", checksum),
|
||||
AckLen: &ackLen,
|
||||
}
|
||||
// Firmware 1.16.0 extended ACK (issue #1610): 5th byte is the attempt
|
||||
// counter (commit f6e6fdaa), 6th byte is a random byte added so identical
|
||||
// attempts still hash uniquely (commit a130a95a).
|
||||
if len(buf) >= 5 {
|
||||
attempt := int(buf[4])
|
||||
p.AckAttempt = &attempt
|
||||
}
|
||||
if len(buf) >= 6 {
|
||||
rnd := int(buf[5])
|
||||
p.AckRand = &rnd
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
func decodeAdvert(buf []byte, validateSignatures bool) Payload {
|
||||
@@ -378,6 +405,22 @@ func decodeMultipart(buf []byte) Payload {
|
||||
if innerType == PayloadACK && len(buf) >= 5 {
|
||||
crc := binary.LittleEndian.Uint32(buf[1:5])
|
||||
p.InnerAckCrc = fmt.Sprintf("%08x", crc)
|
||||
// Firmware 1.16.0 extended ACK (issue #1610): inner ACK blob may be
|
||||
// 5 or 6 bytes (payload_len = 1 + ack_len) instead of always 4.
|
||||
// Attempt counter added in commit f6e6fdaa, RNG byte in commit a130a95a.
|
||||
ackLen := len(buf) - 1
|
||||
if ackLen > 6 {
|
||||
ackLen = 6
|
||||
}
|
||||
p.InnerAckLen = &ackLen
|
||||
if len(buf) >= 6 {
|
||||
attempt := int(buf[5])
|
||||
p.InnerAckAttempt = &attempt
|
||||
}
|
||||
if len(buf) >= 7 {
|
||||
rnd := int(buf[6])
|
||||
p.InnerAckRand = &rnd
|
||||
}
|
||||
} else if len(buf) > 1 {
|
||||
p.InnerPayload = hex.EncodeToString(buf[1:])
|
||||
}
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
package main
|
||||
|
||||
// Tests for issue #1694 — server-side decoder parity with the ingestor's
|
||||
// firmware-1.16.0 extended ACK support (issue #1610). Wire vectors mirror
|
||||
// the ingestor's tests so both decoders agree byte-for-byte.
|
||||
//
|
||||
// - decodeAck: firmware/src/helpers/BaseChatMesh.cpp:218-234
|
||||
// - decodeMultipart: firmware/src/Mesh.cpp:287-310
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestDecodeAckExtended(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
buf []byte
|
||||
wantLen int
|
||||
wantAttPtr bool
|
||||
wantAtt int
|
||||
wantRndPtr bool
|
||||
wantRnd int
|
||||
}{
|
||||
{
|
||||
name: "legacy 4-byte ACK (CRC only)",
|
||||
buf: []byte{0xEF, 0xBE, 0xAD, 0xDE},
|
||||
wantLen: 4,
|
||||
},
|
||||
{
|
||||
name: "5-byte ACK (CRC + attempt)",
|
||||
buf: []byte{0xEF, 0xBE, 0xAD, 0xDE, 0x07},
|
||||
wantLen: 5,
|
||||
wantAttPtr: true,
|
||||
wantAtt: 7,
|
||||
},
|
||||
{
|
||||
name: "6-byte ACK (CRC + attempt + rand)",
|
||||
buf: []byte{0xEF, 0xBE, 0xAD, 0xDE, 0x07, 0x42},
|
||||
wantLen: 6,
|
||||
wantAttPtr: true,
|
||||
wantAtt: 7,
|
||||
wantRndPtr: true,
|
||||
wantRnd: 0x42,
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
p := decodeAck(tc.buf)
|
||||
if p.Type != "ACK" {
|
||||
t.Fatalf("type=%q want ACK", p.Type)
|
||||
}
|
||||
if p.AckLen == nil {
|
||||
t.Fatalf("AckLen=nil want %d", tc.wantLen)
|
||||
}
|
||||
if *p.AckLen != tc.wantLen {
|
||||
t.Errorf("AckLen=%d want %d", *p.AckLen, tc.wantLen)
|
||||
}
|
||||
if tc.wantAttPtr {
|
||||
if p.AckAttempt == nil {
|
||||
t.Errorf("AckAttempt=nil want %d", tc.wantAtt)
|
||||
} else if *p.AckAttempt != tc.wantAtt {
|
||||
t.Errorf("AckAttempt=%d want %d", *p.AckAttempt, tc.wantAtt)
|
||||
}
|
||||
} else if p.AckAttempt != nil {
|
||||
t.Errorf("AckAttempt=%d want nil", *p.AckAttempt)
|
||||
}
|
||||
if tc.wantRndPtr {
|
||||
if p.AckRand == nil {
|
||||
t.Errorf("AckRand=nil want %d", tc.wantRnd)
|
||||
} else if *p.AckRand != tc.wantRnd {
|
||||
t.Errorf("AckRand=%d want %d", *p.AckRand, tc.wantRnd)
|
||||
}
|
||||
} else if p.AckRand != nil {
|
||||
t.Errorf("AckRand=%d want nil", *p.AckRand)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeMultipartAckExtendedInner(t *testing.T) {
|
||||
// byte0 = (remaining<<4)|inner_type = (3<<4)|0x03 = 0x33
|
||||
// inner ACK = CRC(deadbeef LE) + attempt(0x07) + rand(0x42) = 6 bytes
|
||||
// total buf = 1 + 6 = 7 bytes.
|
||||
buf := []byte{0x33, 0xEF, 0xBE, 0xAD, 0xDE, 0x07, 0x42}
|
||||
p := decodeMultipart(buf)
|
||||
if p.InnerAckCrc != "deadbeef" {
|
||||
t.Fatalf("InnerAckCrc=%q want deadbeef", p.InnerAckCrc)
|
||||
}
|
||||
if p.InnerAckLen == nil || *p.InnerAckLen != 6 {
|
||||
t.Errorf("InnerAckLen=%v want 6", p.InnerAckLen)
|
||||
}
|
||||
if p.InnerAckAttempt == nil || *p.InnerAckAttempt != 7 {
|
||||
t.Errorf("InnerAckAttempt=%v want 7", p.InnerAckAttempt)
|
||||
}
|
||||
if p.InnerAckRand == nil || *p.InnerAckRand != 0x42 {
|
||||
t.Errorf("InnerAckRand=%v want 0x42", p.InnerAckRand)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,114 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/http/httptest"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// Issue #1011: distance index must NOT be built eagerly at startup.
|
||||
// It is constructed lazily on first /api/analytics/distance request,
|
||||
// the first request returns 202 + Retry-After while the build runs,
|
||||
// and concurrent requests during the build also get 202 (one build
|
||||
// only, not N parallel builds).
|
||||
//
|
||||
// These three assertions encode the acceptance criteria from the
|
||||
// triage Fix path (sync.Once-style first-request trigger, 202+Retry-After).
|
||||
|
||||
// TestDistanceIndexNotBuiltOnLoad: Load() must complete without
|
||||
// populating distHops / distPaths. Eager build is gone.
|
||||
func TestDistanceIndexNotBuiltOnLoad(t *testing.T) {
|
||||
db := setupRichTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatalf("Load(): %v", err)
|
||||
}
|
||||
store.mu.RLock()
|
||||
nHops := len(store.distHops)
|
||||
nPaths := len(store.distPaths)
|
||||
store.mu.RUnlock()
|
||||
if nHops != 0 || nPaths != 0 {
|
||||
t.Fatalf("expected distance index empty after Load() (lazy build, #1011); got %d hops, %d paths — eager build still firing in Load()", nHops, nPaths)
|
||||
}
|
||||
if store.DistanceIndexBuilt() {
|
||||
t.Fatalf("expected DistanceIndexBuilt() = false directly after Load(); got true")
|
||||
}
|
||||
}
|
||||
|
||||
// TestDistanceFirstRequestReturns202: first /api/analytics/distance call
|
||||
// must trigger async build and return 202 + Retry-After. The handler must
|
||||
// NOT block for the full build.
|
||||
func TestDistanceFirstRequestReturns202(t *testing.T) {
|
||||
db := setupRichTestDB(t)
|
||||
defer db.Close()
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
store := NewPacketStore(db, nil)
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatalf("Load(): %v", err)
|
||||
}
|
||||
srv.store = store
|
||||
r := mux.NewRouter()
|
||||
srv.RegisterRoutes(r)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/distance", nil)
|
||||
w := httptest.NewRecorder()
|
||||
t0 := time.Now()
|
||||
r.ServeHTTP(w, req)
|
||||
elapsed := time.Since(t0)
|
||||
|
||||
if w.Code != 202 {
|
||||
t.Fatalf("expected 202 Accepted on first request (lazy build, #1011); got %d (body=%s)", w.Code, w.Body.String())
|
||||
}
|
||||
if ra := w.Header().Get("Retry-After"); ra == "" {
|
||||
t.Fatalf("expected non-empty Retry-After header on 202 response; got none")
|
||||
}
|
||||
// Handler must return quickly — must not block on the full build.
|
||||
if elapsed > 500*time.Millisecond {
|
||||
t.Fatalf("first-request handler took %v — must not block on build (#1011)", elapsed)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDistanceConcurrentRequestsDuringBuildReturn202: 10 requests fired
|
||||
// in close succession while the build is in flight must all receive 202;
|
||||
// exactly one build runs.
|
||||
func TestDistanceConcurrentRequestsDuringBuildReturn202(t *testing.T) {
|
||||
db := setupRichTestDB(t)
|
||||
defer db.Close()
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
store := NewPacketStore(db, nil)
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatalf("Load(): %v", err)
|
||||
}
|
||||
srv.store = store
|
||||
r := mux.NewRouter()
|
||||
srv.RegisterRoutes(r)
|
||||
|
||||
const N = 10
|
||||
var wg sync.WaitGroup
|
||||
var got202 atomic.Int32
|
||||
wg.Add(N)
|
||||
for i := 0; i < N; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
req := httptest.NewRequest("GET", "/api/analytics/distance", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
if w.Code == 202 {
|
||||
got202.Add(1)
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
if got202.Load() != N {
|
||||
t.Fatalf("expected all %d concurrent first-window requests to get 202; only %d did", N, got202.Load())
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// TestFirstSeen_1166_HandleNodesSurface pins issue #1166: the /api/nodes
|
||||
// response carries a `first_seen` ISO timestamp per node so the frontend
|
||||
// can show a sortable "First Seen" column.
|
||||
func TestFirstSeen_1166_HandleNodesSurface(t *testing.T) {
|
||||
db := setupCapabilityTestDB(t)
|
||||
defer db.conn.Close()
|
||||
if _, err := db.conn.Exec(`ALTER TABLE nodes ADD COLUMN foreign_advert INTEGER DEFAULT 0`); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
pk := "cccc000000000000000000000000000000000000000000000000000000000000"
|
||||
first := time.Now().Add(-72 * time.Hour).UTC().Format("2006-01-02T15:04:05.000Z")
|
||||
last := time.Now().UTC().Format("2006-01-02T15:04:05.000Z")
|
||||
if _, err := db.conn.Exec(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES (?, 'rpt', 'repeater', 37.5, -122.0, ?, ?, 5)`,
|
||||
pk, last, first); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
store := NewPacketStore(db, nil)
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/nodes?limit=10", nil)
|
||||
rr := httptest.NewRecorder()
|
||||
router.ServeHTTP(rr, req)
|
||||
if rr.Code != 200 {
|
||||
t.Fatalf("/api/nodes status: want 200, got %d body=%s", rr.Code, rr.Body.String())
|
||||
}
|
||||
|
||||
var resp struct {
|
||||
Nodes []map[string]interface{} `json:"nodes"`
|
||||
}
|
||||
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode: %v body=%s", err, rr.Body.String())
|
||||
}
|
||||
var got map[string]interface{}
|
||||
for _, n := range resp.Nodes {
|
||||
if k, _ := n["public_key"].(string); k == pk {
|
||||
got = n
|
||||
break
|
||||
}
|
||||
}
|
||||
if got == nil {
|
||||
t.Fatalf("node missing from /api/nodes response")
|
||||
}
|
||||
fs, hasFS := got["first_seen"]
|
||||
if !hasFS {
|
||||
t.Fatalf("first_seen absent from /api/nodes response (issue #1166)")
|
||||
}
|
||||
s, _ := fs.(string)
|
||||
if s == "" {
|
||||
t.Errorf("first_seen empty, want ISO timestamp, got %v", fs)
|
||||
}
|
||||
if s != first {
|
||||
t.Errorf("first_seen = %q, want %q", s, first)
|
||||
}
|
||||
}
|
||||
+4
-2
@@ -36,7 +36,6 @@ require (
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/ncruces/go-strftime v0.1.9 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
golang.org/x/sync v0.10.0 // indirect
|
||||
golang.org/x/sys v0.22.0 // indirect
|
||||
modernc.org/libc v1.55.3 // indirect
|
||||
modernc.org/mathutil v1.6.0 // indirect
|
||||
@@ -47,6 +46,9 @@ require github.com/meshcore-analyzer/prunequeue v0.0.0
|
||||
|
||||
replace github.com/meshcore-analyzer/prunequeue => ../../internal/prunequeue
|
||||
|
||||
require github.com/meshcore-analyzer/mbcapqueue v0.0.0
|
||||
require (
|
||||
github.com/meshcore-analyzer/mbcapqueue v0.0.0
|
||||
golang.org/x/sync v0.10.0
|
||||
)
|
||||
|
||||
replace github.com/meshcore-analyzer/mbcapqueue => ../../internal/mbcapqueue
|
||||
|
||||
+12
-2
@@ -42,7 +42,7 @@ func (s *Server) handleHealthz(w http.ResponseWriter, r *http.Request) {
|
||||
// processed<total).
|
||||
bfTotal, bfProcessed, bfDone := fromPubkeyBackfillSnapshot()
|
||||
w.WriteHeader(http.StatusOK)
|
||||
json.NewEncoder(w).Encode(map[string]interface{}{
|
||||
resp := map[string]interface{}{
|
||||
"ready": true,
|
||||
"loadedTx": loadedTx,
|
||||
"loadedObs": loadedObs,
|
||||
@@ -51,5 +51,15 @@ func (s *Server) handleHealthz(w http.ResponseWriter, r *http.Request) {
|
||||
"processed": bfProcessed,
|
||||
"done": bfDone,
|
||||
},
|
||||
})
|
||||
}
|
||||
// PR #1609 M1: surface per-MQTT-source receipt vs write-path
|
||||
// liveness so operators can distinguish "broker alive, write
|
||||
// path stuck" (lastReceiptUnix recent, lastMessageUnix stale)
|
||||
// from "everything stalled" (both stale). Additive — older
|
||||
// ingestor builds simply produce no entry and the field is
|
||||
// omitted. Schema-compatible with prior /healthz consumers.
|
||||
if liveness := readIngestorSourceLiveness(); len(liveness) > 0 {
|
||||
resp["ingest_liveness"] = liveness
|
||||
}
|
||||
json.NewEncoder(w).Encode(resp)
|
||||
}
|
||||
|
||||
@@ -0,0 +1,193 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestHiddenNamePrefix_1181_NodeHealth asserts that /api/nodes/{pk}/health
|
||||
// returns 404 for a node whose name starts with a hidden prefix — mirroring
|
||||
// the existing blacklist guard at the top of handleNodeHealth.
|
||||
//
|
||||
// Anti-tautology: this test FAILS if the IsNameHidden guard is removed from
|
||||
// handleNodeHealth (the handler would 200 with health data instead of 404).
|
||||
func TestHiddenNamePrefix_1181_NodeHealth(t *testing.T) {
|
||||
srv, router := setupTestServer(t)
|
||||
|
||||
pk := "deadbeef00001184"
|
||||
if _, err := srv.db.conn.Exec(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES (?, ?, ?, 0, 0, '2026-06-01T00:00:00Z', '2026-06-01T00:00:00Z', 1)`,
|
||||
pk, "🚫 health me", "companion"); err != nil {
|
||||
t.Fatalf("insert: %v", err)
|
||||
}
|
||||
|
||||
get := func() *httptest.ResponseRecorder {
|
||||
req := httptest.NewRequest("GET", "/api/nodes/"+pk+"/health", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
return w
|
||||
}
|
||||
|
||||
srv.cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
w := get()
|
||||
if w.Code != http.StatusNotFound {
|
||||
t.Fatalf("hidden: expected 404 from /api/nodes/%s/health, got %d body=%s", pk, w.Code, w.Body.String())
|
||||
}
|
||||
if strings.Contains(w.Body.String(), "health me") {
|
||||
t.Fatalf("hidden: name leaked in /health 404 body: %s", w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestHiddenNamePrefix_1181_BulkHealth asserts /api/nodes/bulk-health filters
|
||||
// out nodes whose name starts with a hidden prefix — same shape as the
|
||||
// existing blacklist filter inside handleBulkHealth.
|
||||
//
|
||||
// Anti-tautology: remove the IsNameHidden branch from handleBulkHealth and
|
||||
// the hidden node leaks back into the response array; this assertion fails.
|
||||
func TestHiddenNamePrefix_1181_BulkHealth(t *testing.T) {
|
||||
srv, router := setupTestServer(t)
|
||||
|
||||
pk := "deadbeef00001185"
|
||||
if _, err := srv.db.conn.Exec(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES (?, ?, ?, 0, 0, '2026-06-01T00:00:00Z', '2026-06-01T00:00:00Z', 1)`,
|
||||
pk, "🚫 bulk me", "companion"); err != nil {
|
||||
t.Fatalf("insert: %v", err)
|
||||
}
|
||||
|
||||
srv.cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
srv.cfg.NodeBlacklist = []string{"force-filter-branch"} // force the existing blacklist branch on so results-array path is taken
|
||||
srv.cfg.SetNodeBlacklist(srv.cfg.NodeBlacklist)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/nodes/bulk-health?limit=2000", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
var arr []map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &arr); err != nil {
|
||||
t.Fatalf("unmarshal: %v body=%s", err, w.Body.String())
|
||||
}
|
||||
for _, e := range arr {
|
||||
if got, _ := e["public_key"].(string); strings.EqualFold(got, pk) {
|
||||
t.Fatalf("hidden node %s leaked through /api/nodes/bulk-health", pk)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestHiddenNamePrefix_1181_Paths asserts /api/nodes/{pk}/paths returns 404
|
||||
// for a hidden-prefix node, mirroring blacklist behaviour.
|
||||
func TestHiddenNamePrefix_1181_Paths(t *testing.T) {
|
||||
srv, router := setupTestServer(t)
|
||||
|
||||
pk := "deadbeef00001186"
|
||||
if _, err := srv.db.conn.Exec(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES (?, ?, ?, 0, 0, '2026-06-01T00:00:00Z', '2026-06-01T00:00:00Z', 1)`,
|
||||
pk, "🚫 paths me", "companion"); err != nil {
|
||||
t.Fatalf("insert: %v", err)
|
||||
}
|
||||
|
||||
srv.cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
req := httptest.NewRequest("GET", "/api/nodes/"+pk+"/paths", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusNotFound {
|
||||
t.Fatalf("hidden: expected 404 from /api/nodes/%s/paths, got %d body=%s", pk, w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestHiddenNamePrefix_1181_Analytics asserts /api/nodes/{pk}/analytics 404s
|
||||
// for hidden-prefix nodes.
|
||||
func TestHiddenNamePrefix_1181_Analytics(t *testing.T) {
|
||||
srv, router := setupTestServer(t)
|
||||
|
||||
pk := "deadbeef00001187"
|
||||
if _, err := srv.db.conn.Exec(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES (?, ?, ?, 0, 0, '2026-06-01T00:00:00Z', '2026-06-01T00:00:00Z', 1)`,
|
||||
pk, "🚫 analytics me", "companion"); err != nil {
|
||||
t.Fatalf("insert: %v", err)
|
||||
}
|
||||
|
||||
srv.cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
req := httptest.NewRequest("GET", "/api/nodes/"+pk+"/analytics", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusNotFound {
|
||||
t.Fatalf("hidden: expected 404 from /api/nodes/%s/analytics, got %d body=%s", pk, w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestHiddenNamePrefixesGeneration_Increments asserts the per-source
|
||||
// generation counter bumps on every Set call — mirrors
|
||||
// TestConfig_BlacklistGenerationIncrements behaviour. Cache wiring lives in
|
||||
// a follow-up; the counter is the prerequisite primitive.
|
||||
func TestHiddenNamePrefixesGeneration_Increments(t *testing.T) {
|
||||
cfg := &Config{}
|
||||
g0 := cfg.HiddenNamePrefixesGeneration()
|
||||
cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
g1 := cfg.HiddenNamePrefixesGeneration()
|
||||
if g1 != g0+1 {
|
||||
t.Fatalf("first SetHiddenNamePrefixes: gen %d -> %d (want +1)", g0, g1)
|
||||
}
|
||||
cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
g2 := cfg.HiddenNamePrefixesGeneration()
|
||||
if g2 != g1+1 {
|
||||
t.Fatalf("second SetHiddenNamePrefixes: gen %d -> %d (want +1)", g1, g2)
|
||||
}
|
||||
cfg.SetHiddenNamePrefixes(nil)
|
||||
g3 := cfg.HiddenNamePrefixesGeneration()
|
||||
if g3 != g2+1 {
|
||||
t.Fatalf("nil SetHiddenNamePrefixes: gen %d -> %d (want +1)", g2, g3)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHiddenNamePrefixes_ConcurrentAccess hammers Set + IsNameHidden from
|
||||
// multiple goroutines. Doesn't assert anything beyond "doesn't panic" —
|
||||
// atomic.Pointer correctness is what we're verifying, race detector is not
|
||||
// in scope for this PR's CI (see PR scope).
|
||||
func TestHiddenNamePrefixes_ConcurrentAccess(t *testing.T) {
|
||||
cfg := &Config{}
|
||||
cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
|
||||
var stop atomic.Bool
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// Writer
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; !stop.Load(); i++ {
|
||||
if i%2 == 0 {
|
||||
cfg.SetHiddenNamePrefixes([]string{"🚫", "test"})
|
||||
} else {
|
||||
cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Readers
|
||||
for r := 0; r < 4; r++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for !stop.Load() {
|
||||
_ = cfg.IsNameHidden("🚫 something")
|
||||
_ = cfg.IsNameHidden("normal name")
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
time.Sleep(250 * time.Millisecond)
|
||||
stop.Store(true)
|
||||
wg.Wait()
|
||||
}
|
||||
@@ -0,0 +1,139 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestHiddenNamePrefix_1181 verifies operator-configurable name-prefix hiding
|
||||
// for nodes (issue #1181). When the operator configures HiddenNamePrefixes,
|
||||
// nodes whose name begins with any configured prefix are omitted from API
|
||||
// responses (list, search, detail). DB rows are preserved — filtering happens
|
||||
// at the API layer only.
|
||||
func TestHiddenNamePrefix_1181_NodesList(t *testing.T) {
|
||||
srv, router := setupTestServer(t)
|
||||
|
||||
// Insert a node whose name starts with the configured 🚫 prefix.
|
||||
_, err := srv.db.conn.Exec(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES (?, ?, ?, 0, 0, '2026-06-01T00:00:00Z', '2026-06-01T00:00:00Z', 1)`,
|
||||
"deadbeef00001181", "🚫 ban me", "companion")
|
||||
if err != nil {
|
||||
t.Fatalf("insert hidden node: %v", err)
|
||||
}
|
||||
|
||||
get := func() []map[string]interface{} {
|
||||
req := httptest.NewRequest("GET", "/api/nodes?limit=2000", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp struct {
|
||||
Nodes []map[string]interface{} `json:"nodes"`
|
||||
}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("unmarshal: %v body=%s", err, w.Body.String())
|
||||
}
|
||||
return resp.Nodes
|
||||
}
|
||||
|
||||
hasName := func(nodes []map[string]interface{}, substr string) bool {
|
||||
for _, n := range nodes {
|
||||
if name, _ := n["name"].(string); strings.Contains(name, substr) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Empty prefix list: node MUST be present.
|
||||
srv.cfg.SetHiddenNamePrefixes(nil)
|
||||
if !hasName(get(), "ban me") {
|
||||
t.Fatalf("with empty HiddenNamePrefixes, node should be present in /api/nodes")
|
||||
}
|
||||
|
||||
// Configured 🚫 prefix: node MUST be omitted.
|
||||
srv.cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
if hasName(get(), "ban me") {
|
||||
t.Fatalf("with HiddenNamePrefixes=[\"🚫\"], node 🚫 ban me should be hidden from /api/nodes")
|
||||
}
|
||||
}
|
||||
|
||||
// TestHiddenNamePrefix_1181_Search ensures hidden nodes are also filtered
|
||||
// from /api/nodes/search.
|
||||
func TestHiddenNamePrefix_1181_Search(t *testing.T) {
|
||||
srv, router := setupTestServer(t)
|
||||
|
||||
if _, err := srv.db.conn.Exec(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES (?, ?, ?, 0, 0, '2026-06-01T00:00:00Z', '2026-06-01T00:00:00Z', 1)`,
|
||||
"deadbeef00001182", "🚫 search me", "companion"); err != nil {
|
||||
t.Fatalf("insert: %v", err)
|
||||
}
|
||||
|
||||
srv.cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/nodes/search?q=search", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp struct {
|
||||
Nodes []map[string]interface{} `json:"nodes"`
|
||||
}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("unmarshal: %v", err)
|
||||
}
|
||||
for _, n := range resp.Nodes {
|
||||
if name, _ := n["name"].(string); strings.Contains(name, "search me") {
|
||||
t.Fatalf("hidden node leaked through /api/nodes/search: %v", n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestHiddenNamePrefix_1181_Detail ensures /api/nodes/{pubkey} returns 404
|
||||
// for a node whose name starts with a hidden prefix — mirroring the
|
||||
// blacklist behaviour so callers learn nothing about whether the row exists.
|
||||
func TestHiddenNamePrefix_1181_Detail(t *testing.T) {
|
||||
srv, router := setupTestServer(t)
|
||||
|
||||
pk := "deadbeef00001183"
|
||||
if _, err := srv.db.conn.Exec(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES (?, ?, ?, 0, 0, '2026-06-01T00:00:00Z', '2026-06-01T00:00:00Z', 1)`,
|
||||
pk, "🚫 detail me", "companion"); err != nil {
|
||||
t.Fatalf("insert: %v", err)
|
||||
}
|
||||
|
||||
get := func() *httptest.ResponseRecorder {
|
||||
req := httptest.NewRequest("GET", "/api/nodes/"+pk, nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
return w
|
||||
}
|
||||
|
||||
// Empty prefix list: detail MUST be reachable (200 with the name).
|
||||
srv.cfg.SetHiddenNamePrefixes(nil)
|
||||
w := get()
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("baseline: expected 200, got %d body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
if !strings.Contains(w.Body.String(), "detail me") {
|
||||
t.Fatalf("baseline: response missing node name; body=%s", w.Body.String())
|
||||
}
|
||||
|
||||
// Configured 🚫 prefix: detail MUST 404 — no name, no fields, nothing.
|
||||
srv.cfg.SetHiddenNamePrefixes([]string{"🚫"})
|
||||
w = get()
|
||||
if w.Code != http.StatusNotFound {
|
||||
t.Fatalf("hidden: expected 404, got %d body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
if strings.Contains(w.Body.String(), "detail me") {
|
||||
t.Fatalf("hidden: name leaked in 404 body: %s", w.Body.String())
|
||||
}
|
||||
}
|
||||
@@ -172,6 +172,17 @@ func TestTopHopsRespectsContextAcrossAllCallSites(t *testing.T) {
|
||||
t.Fatalf("Load: %v", err)
|
||||
}
|
||||
|
||||
// #1011: distance index is now lazy — trigger it explicitly and
|
||||
// wait for build completion before inspecting distHops.
|
||||
store.TriggerDistanceIndexBuild()
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for !store.DistanceIndexBuilt() {
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatal("distance index did not finish building within 5s")
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
|
||||
// Inspect precomputed distance index.
|
||||
store.mu.RLock()
|
||||
hops := make([]distHopRecord, len(store.distHops))
|
||||
|
||||
@@ -298,8 +298,15 @@ func TestHotStartup_ChunkErrorRecovery(t *testing.T) {
|
||||
t.Fatal("loadBackgroundChunks hung after DB close")
|
||||
}
|
||||
|
||||
if !store.backgroundLoadDone.Load() {
|
||||
t.Error("backgroundLoadDone must be set even when all chunks fail")
|
||||
// #1690: backgroundLoadFailed must be true (chunk errors AND coverage
|
||||
// fell short); backgroundLoadDone stays false because the in-memory
|
||||
// store does NOT reflect the on-disk DB. Pre-#1690 the test asserted
|
||||
// Done=true on errors — that was the very lie the issue documents.
|
||||
if !store.backgroundLoadFailed.Load() {
|
||||
t.Error("backgroundLoadFailed must be true after all chunks fail (#1690)")
|
||||
}
|
||||
if store.backgroundLoadDone.Load() {
|
||||
t.Error("backgroundLoadDone must remain false when the store does not reflect the DB (#1690)")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,218 @@
|
||||
// Issue #1008: background-deferred subpath + pathHop index builds.
|
||||
//
|
||||
// Pattern mirrors the distance index (#1011) — but where distance is
|
||||
// fully lazy (built on first request), these two indexes are kicked off
|
||||
// eagerly by Load() in a background goroutine so HTTP becomes ready
|
||||
// immediately while the indexes finish populating.
|
||||
//
|
||||
// Concurrency model:
|
||||
//
|
||||
// - subpathReady / pathHopReady are atomic.Bool flags written exactly
|
||||
// once by the background builder (false → true) and never reset
|
||||
// thereafter. Handlers read them via SubpathIndexReady() /
|
||||
// PathHopIndexReady() before touching s.spIndex / s.spTxIndex /
|
||||
// s.byPathHop. While a flag is false, the handler responds 503 +
|
||||
// Retry-After: 5.
|
||||
//
|
||||
// - The builder itself acquires s.mu.Lock() and calls the existing
|
||||
// buildSubpathIndex() / buildPathHopIndex() methods. Those methods
|
||||
// replace s.spIndex / s.spTxIndex / s.byPathHop with freshly-
|
||||
// allocated maps under the write lock. Visibility of the populated
|
||||
// maps to handlers that see Ready()==true is guaranteed by Go's
|
||||
// sync/atomic acquire-release semantics (formalized in Go 1.19):
|
||||
// the atomic.Store(true) happens-after the s.mu.Unlock() that
|
||||
// completes the build, and the handler's atomic.Load()==true
|
||||
// synchronizes-with that store. The handler's subsequent s.mu.RLock
|
||||
// is not what establishes visibility — it only serializes against
|
||||
// concurrent ingest writers — so dropping the RLock would still be
|
||||
// safe for the build's "populated map" snapshot (we keep it for
|
||||
// ingest serialization).
|
||||
//
|
||||
// - Ingest-side incremental updates in StoreNewTransmissions /
|
||||
// pruning / hash-collision paths continue to write s.spIndex /
|
||||
// s.spTxIndex / s.byPathHop directly under s.mu.Lock(). Because
|
||||
// the builder also runs under s.mu.Lock() and the builder
|
||||
// overwrites whatever is there, the brief window between Load()
|
||||
// returning and the goroutine acquiring s.mu means any
|
||||
// concurrent ingest writes will be overwritten by the build —
|
||||
// this matches the prior behavior where ingest could not start
|
||||
// until Load() released s.mu, so in practice ingest does not
|
||||
// run during the build window. Documenting this rather than
|
||||
// adding a separate gate: the existing main.go boot sequence
|
||||
// does not start ingest goroutines until after store.Load()
|
||||
// and graph init complete.
|
||||
//
|
||||
// Handler scope of the ready gate (issue #1008 review M2):
|
||||
//
|
||||
// - HARD-GATED with 503 + Retry-After: 5 — analytics endpoints whose
|
||||
// entire response is the index aggregate. Empty data would be
|
||||
// visibly broken (charts, top-N tables). See routes.go:
|
||||
// /api/analytics/subpaths, /api/analytics/subpaths-bulk,
|
||||
// /api/analytics/subpath-detail, /api/nodes/{pubkey}/paths.
|
||||
//
|
||||
// - BEST-EFFORT (not gated) — endpoints where the index drives
|
||||
// enrichment fields that callers already treat as optional. During
|
||||
// the not-ready window these report zero counts / nil scores
|
||||
// rather than 503-ing the whole list. Acceptable because:
|
||||
//
|
||||
// * /api/nodes and /api/nodes/{pubkey} have many other fields
|
||||
// (last-seen, position, advert metadata) that callers depend
|
||||
// on at startup. 503-ing the SPA bootstrap to wait for an
|
||||
// index that exclusively affects "relay activity" badges
|
||||
// would be a worse UX than a 30–60s window of "—" badges.
|
||||
//
|
||||
// * GetRepeaterRelayInfoMap / GetRepeaterUsefulnessScoreMap /
|
||||
// GetBridgeScore / repeater_liveness / repeater_usefulness
|
||||
// all walk s.byPathHop. During the build window they return
|
||||
// empty maps or zero scores; the steady-state recomputer
|
||||
// (#1262) refreshes them every 5min once indexes flip ready
|
||||
// (prewarm guarded by WaitIndexesReady — see review M1).
|
||||
//
|
||||
// This is documented rather than gated so operators do not see
|
||||
// /api/nodes 503 during routine restarts on Cascadia-scale data.
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// writeIndexLoading503 writes the canned "index still building" reply (#1008):
// status 503, a Retry-After header of 5 seconds, a JSON content type, and the
// body {"error":"index loading","retryAfter":5}. Handlers call it when the
// index they depend on has not been built yet.
func writeIndexLoading503(w http.ResponseWriter) {
	const payload = `{"error":"index loading","retryAfter":5}`

	hdr := w.Header()
	hdr.Set("Retry-After", "5")
	hdr.Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusServiceUnavailable)

	// Best effort: the write result is deliberately discarded.
	_, _ = w.Write([]byte(payload))
}
|
||||
|
||||
// SubpathIndexReady reports whether the subpath index build kicked off
|
||||
// by Load() has completed (#1008). Until this returns true, callers
|
||||
// must NOT read s.spIndex / s.spTxIndex.
|
||||
func (s *PacketStore) SubpathIndexReady() bool {
|
||||
return s.subpathReady.Load()
|
||||
}
|
||||
|
||||
// PathHopIndexReady reports whether the path-hop index build kicked
|
||||
// off by Load() has completed (#1008). Until this returns true,
|
||||
// callers must NOT read s.byPathHop.
|
||||
func (s *PacketStore) PathHopIndexReady() bool {
|
||||
return s.pathHopReady.Load()
|
||||
}
|
||||
|
||||
// indexReadyCh returns the channel that is closed when BOTH indexes
|
||||
// have flipped ready. Lazily created on first access. Safe to call
|
||||
// concurrently. Used by WaitIndexesReady and any future waiters that
|
||||
// want event-driven semantics instead of polling.
|
||||
func (s *PacketStore) indexReadyCh() <-chan struct{} {
|
||||
s.indexReadyChMu.Lock()
|
||||
defer s.indexReadyChMu.Unlock()
|
||||
if s.indexReadyChan == nil {
|
||||
s.indexReadyChan = make(chan struct{})
|
||||
// If both are already ready (e.g. background chunk loader
|
||||
// flipped them synchronously before any waiter showed up),
|
||||
// close immediately so the channel is usable as a one-shot.
|
||||
if s.subpathReady.Load() && s.pathHopReady.Load() {
|
||||
close(s.indexReadyChan)
|
||||
}
|
||||
}
|
||||
return s.indexReadyChan
|
||||
}
|
||||
|
||||
// maybeCloseIndexReadyCh closes the ready channel iff both flags are
|
||||
// set. Idempotent (a sync.Once on the channel) and safe to call from
|
||||
// either builder goroutine on the green-path transitions, as well as
|
||||
// from markIndexesReadySync.
|
||||
func (s *PacketStore) maybeCloseIndexReadyCh() {
|
||||
if !(s.subpathReady.Load() && s.pathHopReady.Load()) {
|
||||
return
|
||||
}
|
||||
s.indexReadyChMu.Lock()
|
||||
defer s.indexReadyChMu.Unlock()
|
||||
if s.indexReadyChan == nil {
|
||||
// Lazily allocate AND close it in one step so any future
|
||||
// indexReadyCh() caller gets a pre-closed channel.
|
||||
s.indexReadyChan = make(chan struct{})
|
||||
close(s.indexReadyChan)
|
||||
return
|
||||
}
|
||||
select {
|
||||
case <-s.indexReadyChan:
|
||||
// Already closed.
|
||||
default:
|
||||
close(s.indexReadyChan)
|
||||
}
|
||||
}
|
||||
|
||||
// startBackgroundIndexBuilds is called from Load() after s.loaded=true
|
||||
// to populate the subpath + path-hop indexes off the critical path
|
||||
// (#1008). It returns immediately; the work runs in two background
|
||||
// goroutines (one per index — see review m7) that each acquire
|
||||
// s.mu.Lock() independently, install their map, then set the
|
||||
// corresponding atomic ready flag.
|
||||
//
|
||||
// At Cascadia scale (~5M observations) this previously blocked HTTP
|
||||
// readiness ~60s inside Load() under s.mu. Running the two builds in
|
||||
// parallel halves the pathHop-not-ready window since the two builders
|
||||
// are independent of each other.
|
||||
func (s *PacketStore) startBackgroundIndexBuilds() {
|
||||
go func() {
|
||||
t0 := time.Now()
|
||||
s.mu.Lock()
|
||||
s.buildSubpathIndex()
|
||||
s.mu.Unlock()
|
||||
// Atomic.Store happens-after s.mu.Unlock; handlers that
|
||||
// observe Ready()==true synchronize-with this store.
|
||||
s.subpathReady.Store(true)
|
||||
s.maybeCloseIndexReadyCh()
|
||||
log.Printf("[startup] index build complete: subpath (%s)",
|
||||
time.Since(t0).Round(time.Millisecond))
|
||||
}()
|
||||
go func() {
|
||||
t1 := time.Now()
|
||||
s.mu.Lock()
|
||||
s.buildPathHopIndex()
|
||||
s.mu.Unlock()
|
||||
s.pathHopReady.Store(true)
|
||||
s.maybeCloseIndexReadyCh()
|
||||
log.Printf("[startup] index build complete: pathHop (%s)",
|
||||
time.Since(t1).Round(time.Millisecond))
|
||||
}()
|
||||
}
|
||||
|
||||
// markIndexesReadySync is the synchronous-build entry point used by
|
||||
// the background chunk loader in store.go (and by tests). The chunk
|
||||
// loader rebuilds both indexes under s.mu.Lock(); after the Unlock it
|
||||
// calls this to flip the ready flags and close the broadcast channel
|
||||
// in one shot, preserving symmetry with the goroutine path above.
|
||||
func (s *PacketStore) markIndexesReadySync() {
|
||||
s.subpathReady.Store(true)
|
||||
s.pathHopReady.Store(true)
|
||||
s.maybeCloseIndexReadyCh()
|
||||
}
|
||||
|
||||
// WaitIndexesReady blocks until both background indexes built by
|
||||
// startBackgroundIndexBuilds() report ready, or the deadline expires.
|
||||
// Returns true if both flipped in time. Intended for tests that read
|
||||
// s.spIndex / s.spTxIndex / s.byPathHop directly after Load(); production
|
||||
// code paths gate via SubpathIndexReady() / PathHopIndexReady() and
|
||||
// respond 503 + Retry-After to clients instead of blocking.
|
||||
//
|
||||
// Uses the indexReadyCh broadcast channel rather than polling
|
||||
// (see review m6) so wake-up is immediate with no poll-interval jitter.
|
||||
func (s *PacketStore) WaitIndexesReady(timeout time.Duration) bool {
|
||||
if s.SubpathIndexReady() && s.PathHopIndexReady() {
|
||||
return true
|
||||
}
|
||||
ch := s.indexReadyCh()
|
||||
select {
|
||||
case <-ch:
|
||||
return true
|
||||
case <-time.After(timeout):
|
||||
return s.SubpathIndexReady() && s.PathHopIndexReady()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
// Issue #1008: subpath + pathHop index builds must move off the
|
||||
// synchronous Load() critical path into a background goroutine.
|
||||
//
|
||||
// Contract:
|
||||
// 1. Immediately after Load() returns, SubpathIndexReady() and
|
||||
// PathHopIndexReady() report false (the goroutine has not finished).
|
||||
// 2. Analytics handlers that depend on those indices respond 503 with
|
||||
// Retry-After: 5 until the corresponding ready flag flips true.
|
||||
// 3. After the background build completes (waitable via a helper),
|
||||
// both flags flip true and handlers respond 200.
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestIssue1008_SubpathIndexReadyFalseImmediatelyAfterLoad asserts the
|
||||
// subpath ready flag is false the instant Load() returns. Red commit: the
|
||||
// stub returns true → assertion fires. Green commit: the flag is owned by
|
||||
// the background goroutine, which has not yet run, so the assertion holds.
|
||||
func TestIssue1008_SubpathIndexReadyFalseImmediatelyAfterLoad(t *testing.T) {
|
||||
db := setupRichTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatalf("Load() error: %v", err)
|
||||
}
|
||||
if store.SubpathIndexReady() {
|
||||
t.Fatal("expected SubpathIndexReady()==false immediately after Load(); want background-deferred build (#1008)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestIssue1008_PathHopIndexReadyFalseImmediatelyAfterLoad: same contract
|
||||
// for the path-hop index.
|
||||
func TestIssue1008_PathHopIndexReadyFalseImmediatelyAfterLoad(t *testing.T) {
|
||||
db := setupRichTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatalf("Load() error: %v", err)
|
||||
}
|
||||
if store.PathHopIndexReady() {
|
||||
t.Fatal("expected PathHopIndexReady()==false immediately after Load(); want background-deferred build (#1008)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestIssue1008_HandlerReturns503WhileSubpathIndexLoading asserts the
|
||||
// analytics/subpaths handler returns 503 + Retry-After: 5 + a JSON body
|
||||
// matching the triage spec while the subpath index is still building.
|
||||
func TestIssue1008_HandlerReturns503WhileSubpathIndexLoading(t *testing.T) {
|
||||
db := setupRichTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatalf("Load() error: %v", err)
|
||||
}
|
||||
// Don't wait for the background build — we want to observe the
|
||||
// not-ready window.
|
||||
cfg := &Config{}
|
||||
cfg.applyListLimitsDefaults()
|
||||
srv := &Server{store: store, cfg: cfg}
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/subpaths?minLen=2&maxLen=4&limit=10", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
srv.handleAnalyticsSubpaths(rec, req)
|
||||
|
||||
if rec.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("status = %d, want 503 (subpath index loading, #1008)", rec.Code)
|
||||
}
|
||||
if got := rec.Header().Get("Retry-After"); got != "5" {
|
||||
t.Errorf("Retry-After header = %q, want %q", got, "5")
|
||||
}
|
||||
var body map[string]interface{}
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
|
||||
t.Fatalf("body not valid JSON: %v (body=%s)", err, rec.Body.String())
|
||||
}
|
||||
if body["error"] != "index loading" {
|
||||
t.Errorf(`body["error"] = %v, want "index loading"`, body["error"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestIssue1008_HandlerRecoversAfterIndexReady asserts that, once the
|
||||
// background build completes, the handler returns 200.
|
||||
func TestIssue1008_HandlerRecoversAfterIndexReady(t *testing.T) {
|
||||
db := setupRichTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatalf("Load() error: %v", err)
|
||||
}
|
||||
|
||||
// Wait up to 5s for both background builds to finish on this small
|
||||
// fixture (rich test DB has ~3 packets; build is sub-millisecond).
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if store.SubpathIndexReady() && store.PathHopIndexReady() {
|
||||
break
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
if !store.SubpathIndexReady() {
|
||||
t.Fatal("SubpathIndexReady() never flipped true within 5s")
|
||||
}
|
||||
if !store.PathHopIndexReady() {
|
||||
t.Fatal("PathHopIndexReady() never flipped true within 5s")
|
||||
}
|
||||
|
||||
cfg := &Config{}
|
||||
cfg.applyListLimitsDefaults()
|
||||
srv := &Server{store: store, cfg: cfg}
|
||||
req := httptest.NewRequest("GET", "/api/analytics/subpaths?minLen=2&maxLen=4&limit=10", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
srv.handleAnalyticsSubpaths(rec, req)
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status after ready = %d, want 200 (body=%s)", rec.Code, rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestIssue1008_m7_BothFlagsSetAfterParallelStart verifies that the
|
||||
// parallel two-goroutine version of startBackgroundIndexBuilds (review
|
||||
// m7) sets BOTH ready flags after a bounded wait, regardless of which
|
||||
// goroutine wins the race to s.mu.Lock(). Sanity check that breaking
|
||||
// the two builds apart didn't drop the pathHop flag flip.
|
||||
func TestIssue1008_m7_BothFlagsSetAfterParallelStart(t *testing.T) {
|
||||
db := setupRichTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatalf("Load: %v", err)
|
||||
}
|
||||
if !store.WaitIndexesReady(5 * time.Second) {
|
||||
t.Fatal("indexes never ready after parallel start (#1008 m7)")
|
||||
}
|
||||
if !store.SubpathIndexReady() {
|
||||
t.Error("subpath flag not set after WaitIndexesReady returned true")
|
||||
}
|
||||
if !store.PathHopIndexReady() {
|
||||
t.Error("pathHop flag not set after WaitIndexesReady returned true")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,222 @@
|
||||
package main
|
||||
|
||||
// Tests for issue #1690 — cold-load uses wrong time axis (first_seen instead
|
||||
// of effective recency). Three tests live in this file:
|
||||
//
|
||||
// Test1690_ColdLoad_TimeAxis — long-lived transmissions (first_seen 30d
|
||||
// ago) with recent observations must load
|
||||
// under a 1h hotStartupHours window.
|
||||
// Test1690_BackgroundLoadHonesty — backgroundLoadComplete must NOT flip to
|
||||
// true when coverage is below threshold.
|
||||
// Test1690_PerfStats_NewFields — typed perf response must expose
|
||||
// retentionHours, oldestLoaded,
|
||||
// loadCoverageRatio.
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// createTestDBWithLastSeen creates a SQLite database at dbPath with the
// post-fix schema (transmissions carries an INTEGER last_seen column) and
// fills it with fixture rows positioned relative to nowSec (unix seconds).
//
// It inserts numTx transmissions with ids 1..numTx and hashes "h000001",
// "h000002", ... Every transmission gets:
//   - first_seen = nowSec - firstSeenAgo, stored as an RFC 3339 string,
//   - last_seen  = nowSec - lastSeenAgo, stored as unix seconds,
//   - obsPerTx observations timestamped 1, 2, ..., obsPerTx minutes before
//     nowSec (so they fall within the last 20 minutes only while
//     obsPerTx <= 20).
//
// Any failure aborts the calling test via t.Fatal/t.Fatalf.
func createTestDBWithLastSeen(t *testing.T, dbPath string, numTx, obsPerTx int, nowSec int64, firstSeenAgo, lastSeenAgo time.Duration) {
	t.Helper()

	conn, err := sql.Open("sqlite", dbPath+"?_journal_mode=WAL")
	if err != nil {
		t.Fatal(err)
	}
	defer conn.Close()

	// Schema statements, executed in order. transmissions uses the post-fix
	// shape with a last_seen INTEGER column.
	schema := []string{
		`CREATE TABLE transmissions (
			id INTEGER PRIMARY KEY,
			raw_hex TEXT, hash TEXT, first_seen TEXT,
			route_type INTEGER, payload_type INTEGER,
			payload_version INTEGER, decoded_json TEXT,
			last_seen INTEGER NOT NULL DEFAULT 0
		)`,
		`CREATE TABLE observations (
			id INTEGER PRIMARY KEY, transmission_id INTEGER, observer_id TEXT, observer_name TEXT,
			direction TEXT, snr REAL, rssi REAL, score INTEGER,
			path_json TEXT, timestamp TEXT, raw_hex TEXT
		)`,
		`CREATE TABLE observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`,
		`CREATE TABLE nodes (pubkey TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL, last_seen TEXT, first_seen TEXT, frequency REAL)`,
		`CREATE TABLE schema_version (version INTEGER)`,
		`INSERT INTO schema_version (version) VALUES (1)`,
		`CREATE INDEX idx_tx_first_seen ON transmissions(first_seen)`,
		`CREATE INDEX idx_tx_last_seen ON transmissions(last_seen)`,
	}
	for _, stmt := range schema {
		if _, err := conn.Exec(stmt); err != nil {
			t.Fatalf("test DB exec: %v\nSQL: %s", err, stmt)
		}
	}

	now := time.Unix(nowSec, 0).UTC()
	firstSeen := now.Add(-firstSeenAgo).Format(time.RFC3339)
	lastSeen := nowSec - int64(lastSeenAgo.Seconds())

	insertTx, err := conn.Prepare("INSERT INTO transmissions (id, raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json, last_seen) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)")
	if err != nil {
		t.Fatalf("prepare tx: %v", err)
	}
	defer insertTx.Close()

	insertObs, err := conn.Prepare("INSERT INTO observations (id, transmission_id, observer_id, observer_name, direction, snr, rssi, score, path_json, timestamp) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
	if err != nil {
		t.Fatalf("prepare obs: %v", err)
	}
	defer insertObs.Close()

	nextObsID := 1
	for txID := 1; txID <= numTx; txID++ {
		hash := fmt.Sprintf("h%06d", txID)
		if _, err := insertTx.Exec(txID, "aabb", hash, firstSeen, 0, 4, 1, "{}", lastSeen); err != nil {
			t.Fatalf("insert tx %d: %v", txID, err)
		}
		// The k-th observation of a transmission is k minutes old.
		for k := 1; k <= obsPerTx; k++ {
			ts := now.Add(-time.Duration(k) * time.Minute).Format(time.RFC3339)
			if _, err := insertObs.Exec(nextObsID, txID, "obs1", "Obs1", "RX", -10.0, -80.0, 5, "[]", ts); err != nil {
				t.Fatalf("insert obs: %v", err)
			}
			nextObsID++
		}
	}
}
|
||||
|
||||
// Test1690_ColdLoad_TimeAxis seeds 1000 transmissions whose hash *first
|
||||
// appeared* 30 days ago but whose last observation was 30 minutes ago.
|
||||
// With a 1h hotStartupHours, the pre-fix code (filtering on first_seen)
|
||||
// loads zero rows; the post-fix code (filtering on last_seen) must load
|
||||
// all 1000.
|
||||
func Test1690_ColdLoad_TimeAxis(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
|
||||
nowSec := time.Now().UTC().Unix()
|
||||
createTestDBWithLastSeen(t, dbPath, 1000, 1, nowSec,
|
||||
30*24*time.Hour, // first_seen = 30d ago
|
||||
30*time.Minute) // last_seen = 30min ago
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenDB: %v", err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
store := NewPacketStore(db, &PacketStoreConfig{
|
||||
RetentionHours: 168,
|
||||
HotStartupHours: 1,
|
||||
})
|
||||
|
||||
if err := store.LoadChunked(0); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
|
||||
loaded := len(store.packets)
|
||||
if loaded < 1000 {
|
||||
t.Fatalf("Test1690_ColdLoad_TimeAxis: expected ≥1000 transmissions loaded "+
|
||||
"(all 1000 fixture rows have last_seen within 1h), got %d. "+
|
||||
"Pre-fix behavior: chunked_load.go filters t.first_seen >= now-1h "+
|
||||
"which excludes all 30d-old rows.", loaded)
|
||||
}
|
||||
}
|
||||
|
||||
// Test1690_BackgroundLoadHonesty seeds 1000 transmissions but caps the
|
||||
// store's memory budget so it can only fit a fraction. After
|
||||
// loadBackgroundChunks runs, backgroundLoadDone must be FALSE and
|
||||
// backgroundLoadFailed must be TRUE because actual coverage is < 90%.
|
||||
func Test1690_BackgroundLoadHonesty(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
|
||||
nowSec := time.Now().UTC().Unix()
|
||||
// 5000 rows; chunkSize=500 + maxMemoryMB=1 (→ maxPackets ≈ 1000) so
|
||||
// the load breaks at the end of the chunk that crosses the cap and
|
||||
// totalLoaded ≪ 5000.
|
||||
createTestDBWithLastSeen(t, dbPath, 5000, 1, nowSec,
|
||||
30*time.Minute, 30*time.Minute)
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenDB: %v", err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
store := NewPacketStore(db, &PacketStoreConfig{
|
||||
RetentionHours: 168,
|
||||
HotStartupHours: 1,
|
||||
MaxMemoryMB: 1, // forces bounded load ≪ 5000 rows
|
||||
})
|
||||
if err := store.LoadChunked(500); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
store.loadBackgroundChunks()
|
||||
|
||||
if store.backgroundLoadDone.Load() {
|
||||
t.Errorf("backgroundLoadDone=true with only %d/5000 packets loaded; "+
|
||||
"must be false until coverage ≥ 90%%", len(store.packets))
|
||||
}
|
||||
if !store.backgroundLoadFailed.Load() {
|
||||
t.Errorf("backgroundLoadFailed=false despite under-coverage "+
|
||||
"(%d/5000 packets loaded); must be true with a reason", len(store.packets))
|
||||
}
|
||||
// The error message must mention a percentage so operators can see
|
||||
// the actual ratio surface in the perf endpoint.
|
||||
errMsg := store.BackgroundLoadError()
|
||||
if !strings.Contains(errMsg, "%") {
|
||||
t.Errorf("backgroundLoadError=%q; expected human-readable ratio "+
|
||||
"(e.g. 'loaded X%% of Y rows')", errMsg)
|
||||
}
|
||||
}
|
||||
|
||||
// Test1690_PerfStats_NewFields asserts the typed perf payload exposes the
|
||||
// retention/coverage fields needed for prod observability.
|
||||
func Test1690_PerfStats_NewFields(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
|
||||
nowSec := time.Now().UTC().Unix()
|
||||
createTestDBWithLastSeen(t, dbPath, 10, 1, nowSec,
|
||||
30*time.Minute, 30*time.Minute)
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenDB: %v", err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
store := NewPacketStore(db, &PacketStoreConfig{
|
||||
RetentionHours: 168,
|
||||
HotStartupHours: 1,
|
||||
})
|
||||
if err := store.LoadChunked(0); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
|
||||
ps := store.GetPerfStoreStatsTyped()
|
||||
buf, err := json.Marshal(ps)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
var asMap map[string]interface{}
|
||||
if err := json.Unmarshal(buf, &asMap); err != nil {
|
||||
t.Fatalf("unmarshal: %v", err)
|
||||
}
|
||||
for _, key := range []string{"retentionHours", "oldestLoaded", "loadCoverageRatio"} {
|
||||
if _, ok := asMap[key]; !ok {
|
||||
t.Errorf("PerfPacketStoreStats missing %q field; payload=%s", key, string(buf))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,224 @@
|
||||
package main
|
||||
|
||||
// Known-channels catalogue cache (issue #1323).
|
||||
//
|
||||
// Fetches a community-maintained catalogue of hashtag channels (default:
|
||||
// https://raw.githubusercontent.com/marcelverdult/meshcore-channels/main/channels-by-country.json)
|
||||
// every N hours into an in-memory snapshot. Never blocks startup; never
|
||||
// blocks UI on the fetch; fail-soft to last-known. No DB, no disk cache.
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"sort"
	"strings"
	"sync/atomic"
	"time"
)
|
||||
|
||||
// Defaults and limits for the known-channels catalogue fetch.
const (
	// DefaultKnownChannelsURL is the suggested upstream catalogue. The URL
	// is pinned to a specific commit SHA, so a hostile or compromised later
	// commit on the community repo cannot be picked up silently; operators
	// are expected to bump the pin from time to time (see
	// config.example.json). Per the original notes the constant is used
	// only by tests and as documentation: the feature is opt-in, and an
	// empty cfg.KnownChannelsURL leaves the cache disabled.
	DefaultKnownChannelsURL = "https://raw.githubusercontent.com/marcelverdult/meshcore-channels/072bc25b6fc983aa2aa7e9d399a97a5f4899ea71/channels-by-country.json"

	// DefaultKnownChannelsRefresh is the refresh interval applied when none
	// is configured (24h).
	DefaultKnownChannelsRefresh = 24 * time.Hour

	// maxKnownChannelsBytes is the largest upstream response, in bytes, that
	// will be parsed (4 MiB). It bounds memory use should upstream ever
	// serve an oversized payload.
	maxKnownChannelsBytes = 4 << 20
)
|
||||
|
||||
// KnownChannelEntry is a single channel from the catalogue, stamped with the
// region (country) it was listed under.
type KnownChannelEntry struct {
	// Channel is the channel name with its "#" prefix kept, e.g. "#antwerpen".
	Channel string `json:"channel"`
	// Description is the optional free-text description from upstream.
	Description string `json:"description,omitempty"`
	// Key is an optional PSK (base64); only some entries carry one.
	Key string `json:"key,omitempty"`
	// Region is the lowercase ISO 3166-1 alpha-2 country code.
	Region string `json:"region"`
	// RegionName is the optional display name for Region.
	RegionName string `json:"regionName,omitempty"`
}
|
||||
|
||||
// KnownChannelsSnapshot is the immutable parsed catalogue surfaced over /api.
|
||||
type KnownChannelsSnapshot struct {
|
||||
GeneratedAt string `json:"generatedAt,omitempty"` // upstream generation timestamp
|
||||
License string `json:"license,omitempty"`
|
||||
FetchedAt time.Time `json:"fetchedAt"`
|
||||
Source string `json:"source"`
|
||||
Entries []KnownChannelEntry `json:"entries"`
|
||||
}
|
||||
|
||||
// upstreamPayload mirrors the channels-by-country.json shape.
|
||||
type upstreamPayload struct {
|
||||
GeneratedAt string `json:"generated_at"`
|
||||
License string `json:"license"`
|
||||
Countries map[string][]upstreamCountryChannel `json:"countries"`
|
||||
CountryNames map[string]string `json:"countryNames,omitempty"` // optional extension
|
||||
}
|
||||
|
||||
// upstreamCountryChannel is one channel record as it appears inside a
// country's list in the upstream document.
type upstreamCountryChannel struct {
	// Channel is the channel name as published upstream.
	Channel string `json:"channel"`
	// Description is the free-text description.
	Description string `json:"description"`
	// Key is the optional key string attached to the channel.
	Key string `json:"key,omitempty"`
}
|
||||
|
||||
// parseKnownChannelsJSON parses the upstream JSON into a snapshot.
|
||||
// Tolerant: missing/empty countries are skipped silently; entries with
|
||||
// empty channel strings are dropped.
|
||||
func parseKnownChannelsJSON(raw []byte, source string, now time.Time) (*KnownChannelsSnapshot, error) {
|
||||
if len(raw) == 0 {
|
||||
return nil, errors.New("empty payload")
|
||||
}
|
||||
var p upstreamPayload
|
||||
if err := json.Unmarshal(raw, &p); err != nil {
|
||||
return nil, fmt.Errorf("decode catalogue: %w", err)
|
||||
}
|
||||
out := &KnownChannelsSnapshot{
|
||||
GeneratedAt: p.GeneratedAt,
|
||||
License: p.License,
|
||||
FetchedAt: now,
|
||||
Source: source,
|
||||
Entries: make([]KnownChannelEntry, 0, 256),
|
||||
}
|
||||
for code, list := range p.Countries {
|
||||
if len(list) == 0 {
|
||||
continue
|
||||
}
|
||||
region := strings.ToLower(strings.TrimSpace(code))
|
||||
name := p.CountryNames[code]
|
||||
for _, c := range list {
|
||||
ch := strings.TrimSpace(c.Channel)
|
||||
if ch == "" {
|
||||
continue
|
||||
}
|
||||
out.Entries = append(out.Entries, KnownChannelEntry{
|
||||
Channel: ch,
|
||||
Description: c.Description,
|
||||
Key: c.Key,
|
||||
Region: region,
|
||||
RegionName: name,
|
||||
})
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// filterSnapshotByRegion returns a copy filtered to the given region
|
||||
// (case-insensitive). Empty/whitespace region returns the original snapshot
|
||||
// (entry slice shared — callers must not mutate). Unknown region returns
|
||||
// a snapshot with an empty (but non-nil) Entries slice so JSON marshals as `[]`.
|
||||
func filterSnapshotByRegion(snap *KnownChannelsSnapshot, region string) *KnownChannelsSnapshot {
|
||||
if snap == nil {
|
||||
return nil
|
||||
}
|
||||
region = strings.ToLower(strings.TrimSpace(region))
|
||||
if region == "" {
|
||||
return snap
|
||||
}
|
||||
out := &KnownChannelsSnapshot{
|
||||
GeneratedAt: snap.GeneratedAt,
|
||||
License: snap.License,
|
||||
FetchedAt: snap.FetchedAt,
|
||||
Source: snap.Source,
|
||||
Entries: []KnownChannelEntry{},
|
||||
}
|
||||
for _, e := range snap.Entries {
|
||||
if e.Region == region {
|
||||
out.Entries = append(out.Entries, e)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// knownChannelsCache holds the atomic snapshot pointer + config.
|
||||
type knownChannelsCache struct {
|
||||
ptr atomic.Pointer[KnownChannelsSnapshot]
|
||||
url string
|
||||
refresh time.Duration
|
||||
client *http.Client
|
||||
|
||||
fetchCount atomic.Int64 // # successful upstream fetches
|
||||
failCount atomic.Int64 // # failed fetches (fail-soft)
|
||||
}
|
||||
|
||||
func newKnownChannelsCache(url string, refresh time.Duration) *knownChannelsCache {
|
||||
if refresh <= 0 {
|
||||
refresh = DefaultKnownChannelsRefresh
|
||||
}
|
||||
return &knownChannelsCache{
|
||||
url: url,
|
||||
refresh: refresh,
|
||||
client: &http.Client{Timeout: 30 * time.Second},
|
||||
}
|
||||
}
|
||||
|
||||
// load returns the current snapshot or nil if never populated.
|
||||
func (c *knownChannelsCache) load() *KnownChannelsSnapshot {
|
||||
return c.ptr.Load()
|
||||
}
|
||||
|
||||
// fetchOnce performs a single upstream fetch. Updates ptr on success;
|
||||
// leaves last-known snapshot in place on failure (fail-soft).
|
||||
func (c *knownChannelsCache) fetchOnce(ctx context.Context) error {
|
||||
if c.url == "" {
|
||||
return errors.New("known channels url not configured")
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.url, nil)
|
||||
if err != nil {
|
||||
c.failCount.Add(1)
|
||||
return err
|
||||
}
|
||||
req.Header.Set("User-Agent", "CoreScope-KnownChannels/1.0 (+https://github.com/Kpa-clawbot/CoreScope)")
|
||||
resp, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
c.failCount.Add(1)
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
c.failCount.Add(1)
|
||||
return fmt.Errorf("upstream status %s", resp.Status)
|
||||
}
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, maxKnownChannelsBytes))
|
||||
if err != nil {
|
||||
c.failCount.Add(1)
|
||||
return err
|
||||
}
|
||||
snap, err := parseKnownChannelsJSON(body, c.url, time.Now())
|
||||
if err != nil {
|
||||
c.failCount.Add(1)
|
||||
return err
|
||||
}
|
||||
c.ptr.Store(snap)
|
||||
c.fetchCount.Add(1)
|
||||
return nil
|
||||
}
|
||||
|
||||
// run kicks off the background fetch loop in a new goroutine. Does an
|
||||
// initial fetch (fail-soft) and then ticks every refresh interval until
|
||||
// ctx is cancelled. Never blocks the caller — startup proceeds immediately
|
||||
// even if the upstream is slow or unreachable.
|
||||
func (c *knownChannelsCache) run(ctx context.Context) {
|
||||
if c.url == "" {
|
||||
return
|
||||
}
|
||||
go func() {
|
||||
_ = c.fetchOnce(ctx) // initial fetch, fail-soft
|
||||
t := time.NewTicker(c.refresh)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
_ = c.fetchOnce(ctx)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -0,0 +1,236 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// knownChannelsFixture is a canned payload in the shape of the upstream
// channels-by-country.json
// (https://raw.githubusercontent.com/marcelverdult/meshcore-channels/main/channels-by-country.json,
// pinned 2026-05-24). It holds two populated countries ("be" with two
// channels, "us" with one) and one empty country ("ad") that exercises the
// "skip empty countries" branch.
const knownChannelsFixture = `{
  "generated_at": "2026-05-24T22:29:02Z",
  "license": "CC0-1.0",
  "countries": {
    "be": [
      {"channel": "#antwerpen", "description": "antwerpen"},
      {"channel": "#bemesh", "description": "bemesh"}
    ],
    "us": [
      {"channel": "#bayarea", "description": "Bay Area"}
    ],
    "ad": []
  }
}`
|
||||
|
||||
// (a) Cache parses a canned JSON fixture into a snapshot.
|
||||
func TestKnownChannelsParseFixture(t *testing.T) {
|
||||
snap, err := parseKnownChannelsJSON([]byte(knownChannelsFixture), "fixture://test", time.Unix(1700000000, 0))
|
||||
if err != nil {
|
||||
t.Fatalf("parseKnownChannelsJSON: %v", err)
|
||||
}
|
||||
if snap == nil {
|
||||
t.Fatal("snapshot is nil")
|
||||
}
|
||||
if snap.GeneratedAt != "2026-05-24T22:29:02Z" {
|
||||
t.Errorf("GeneratedAt = %q, want 2026-05-24T22:29:02Z", snap.GeneratedAt)
|
||||
}
|
||||
if snap.License != "CC0-1.0" {
|
||||
t.Errorf("License = %q, want CC0-1.0", snap.License)
|
||||
}
|
||||
if snap.Source != "fixture://test" {
|
||||
t.Errorf("Source = %q, want fixture://test", snap.Source)
|
||||
}
|
||||
if got, want := len(snap.Entries), 3; got != want {
|
||||
t.Fatalf("len(Entries) = %d, want %d (empty country ad must be skipped)", got, want)
|
||||
}
|
||||
// Spot-check one entry's region stamping.
|
||||
var foundAntwerpen bool
|
||||
for _, e := range snap.Entries {
|
||||
if e.Channel == "#antwerpen" {
|
||||
foundAntwerpen = true
|
||||
if e.Region != "be" {
|
||||
t.Errorf("antwerpen Region = %q, want be", e.Region)
|
||||
}
|
||||
}
|
||||
}
|
||||
if !foundAntwerpen {
|
||||
t.Fatal("antwerpen entry missing from snapshot")
|
||||
}
|
||||
}
|
||||
|
||||
// (b) The route returns 200 + filtered list.
|
||||
func TestKnownChannelsRouteRegionFilter(t *testing.T) {
|
||||
snap, err := parseKnownChannelsJSON([]byte(knownChannelsFixture), "fixture://test", time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("parse: %v", err)
|
||||
}
|
||||
srv := &Server{
|
||||
knownChannels: &knownChannelsCache{},
|
||||
}
|
||||
srv.knownChannels.ptr.Store(snap)
|
||||
|
||||
r := mux.NewRouter()
|
||||
r.HandleFunc("/api/known-channels", srv.handleKnownChannels).Methods("GET")
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/known-channels?region=be", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200; body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp KnownChannelsSnapshot
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("unmarshal: %v; body=%s", err, w.Body.String())
|
||||
}
|
||||
if got := len(resp.Entries); got != 2 {
|
||||
t.Fatalf("filtered entries = %d, want 2 (be has 2); got body=%s", got, w.Body.String())
|
||||
}
|
||||
for _, e := range resp.Entries {
|
||||
if e.Region != "be" {
|
||||
t.Errorf("entry %q has region %q, want be", e.Channel, e.Region)
|
||||
}
|
||||
if !strings.HasPrefix(e.Channel, "#") {
|
||||
t.Errorf("entry channel %q missing # prefix", e.Channel)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// (c) Cache survives upstream 500 (fail-soft): a prior good snapshot must
|
||||
// remain available after a failed refresh.
|
||||
func TestKnownChannelsFailSoftOn500(t *testing.T) {
|
||||
// First server: returns the fixture (success).
|
||||
good := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(knownChannelsFixture))
|
||||
}))
|
||||
defer good.Close()
|
||||
|
||||
c := newKnownChannelsCache(good.URL, time.Hour)
|
||||
if err := c.fetchOnce(context.Background()); err != nil {
|
||||
t.Fatalf("initial fetchOnce: %v", err)
|
||||
}
|
||||
first := c.load()
|
||||
if first == nil || len(first.Entries) == 0 {
|
||||
t.Fatal("first snapshot must be populated")
|
||||
}
|
||||
|
||||
// Second server: always 500.
|
||||
bad := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "boom", http.StatusInternalServerError)
|
||||
}))
|
||||
defer bad.Close()
|
||||
|
||||
// Re-point the cache to the failing upstream and fetch.
|
||||
c.url = bad.URL
|
||||
err := c.fetchOnce(context.Background())
|
||||
if err == nil {
|
||||
t.Fatal("expected fetchOnce to return error on 500")
|
||||
}
|
||||
after := c.load()
|
||||
if after == nil {
|
||||
t.Fatal("snapshot wiped after failed fetch — must be fail-soft")
|
||||
}
|
||||
if len(after.Entries) != len(first.Entries) {
|
||||
t.Errorf("snapshot entry count changed after failed fetch: was %d, now %d", len(first.Entries), len(after.Entries))
|
||||
}
|
||||
if c.failCount.Load() < 1 {
|
||||
t.Errorf("failCount = %d, want >=1", c.failCount.Load())
|
||||
}
|
||||
}
|
||||
|
||||
// (d) Malformed JSON returns an error AND increments failCount via
|
||||
// fetchOnce (the parse path lives inside fetchOnce so the metric is
|
||||
// the cache-level signal operators see, not just the parser's return).
|
||||
func TestKnownChannelsParseError(t *testing.T) {
|
||||
// parser-level: garbage in, error out.
|
||||
if _, err := parseKnownChannelsJSON([]byte("{not json"), "fixture://bad", time.Now()); err == nil {
|
||||
t.Fatal("parseKnownChannelsJSON: expected error on malformed JSON")
|
||||
}
|
||||
// cache-level: a 200 with malformed body must bump failCount and
|
||||
// leave any prior snapshot in place.
|
||||
bad := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte("{not json"))
|
||||
}))
|
||||
defer bad.Close()
|
||||
c := newKnownChannelsCache(bad.URL, time.Hour)
|
||||
before := c.failCount.Load()
|
||||
if err := c.fetchOnce(context.Background()); err == nil {
|
||||
t.Fatal("fetchOnce: expected parse error to surface")
|
||||
}
|
||||
if c.failCount.Load() <= before {
|
||||
t.Errorf("failCount did not increment: before=%d after=%d", before, c.failCount.Load())
|
||||
}
|
||||
if c.fetchCount.Load() != 0 {
|
||||
t.Errorf("fetchCount = %d, want 0 (parse failed)", c.fetchCount.Load())
|
||||
}
|
||||
}
|
||||
|
||||
// (e) The handler tolerates a nil cache (the startup-window fail-soft
|
||||
// guarantee): server still serves 200 + an empty entries snapshot
|
||||
// rather than 500. Mirrors the production code path where the route
|
||||
// is registered before — or independently of — knownChannels being
|
||||
// instantiated (the OPT-IN gating leaves it nil entirely when disabled).
|
||||
func TestKnownChannelsHandlerNilCache(t *testing.T) {
|
||||
srv := &Server{} // knownChannels intentionally nil
|
||||
r := mux.NewRouter()
|
||||
r.HandleFunc("/api/known-channels", srv.handleKnownChannels).Methods("GET")
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/known-channels", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200 (nil cache must fail-soft); body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp KnownChannelsSnapshot
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("unmarshal: %v; body=%s", err, w.Body.String())
|
||||
}
|
||||
if resp.Entries == nil {
|
||||
t.Fatal("Entries is nil, want non-nil empty slice (JSON [] not null)")
|
||||
}
|
||||
if len(resp.Entries) != 0 {
|
||||
t.Errorf("Entries len = %d, want 0", len(resp.Entries))
|
||||
}
|
||||
if cc := w.Header().Get("Cache-Control"); cc == "" {
|
||||
t.Errorf("Cache-Control header missing on nil-cache response")
|
||||
}
|
||||
}
|
||||
|
||||
// (f) An empty region query param ("?region=") must pass through as if
|
||||
// no filter was supplied — i.e. the full snapshot is returned, NOT an
|
||||
// empty list. Guards against an off-by-one in the trim+filter path.
|
||||
func TestKnownChannelsRegionEmptyPassthrough(t *testing.T) {
|
||||
snap, err := parseKnownChannelsJSON([]byte(knownChannelsFixture), "fixture://test", time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("parse: %v", err)
|
||||
}
|
||||
srv := &Server{knownChannels: &knownChannelsCache{}}
|
||||
srv.knownChannels.ptr.Store(snap)
|
||||
r := mux.NewRouter()
|
||||
r.HandleFunc("/api/known-channels", srv.handleKnownChannels).Methods("GET")
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/known-channels?region=", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200; body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
var resp KnownChannelsSnapshot
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("unmarshal: %v; body=%s", err, w.Body.String())
|
||||
}
|
||||
if got, want := len(resp.Entries), len(snap.Entries); got != want {
|
||||
t.Fatalf("empty region must return unfiltered snapshot: got %d entries, want %d", got, want)
|
||||
}
|
||||
if cc := w.Header().Get("Cache-Control"); cc == "" {
|
||||
t.Errorf("Cache-Control header missing on populated response")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// handleKnownChannels — GET /api/known-channels?region=XX
|
||||
//
|
||||
// Returns the cached community catalogue of hashtag channels (issue #1323),
|
||||
// optionally filtered to one region (ISO 3166-1 alpha-2, case-insensitive).
|
||||
// Empty/missing cache returns 200 with an empty Entries list so the UI
|
||||
// degrades gracefully (fail-soft). Never blocks on the upstream fetch:
|
||||
// the response is served straight off an atomic snapshot pointer.
|
||||
func (s *Server) handleKnownChannels(w http.ResponseWriter, r *http.Request) {
|
||||
region := r.URL.Query().Get("region")
|
||||
var snap *KnownChannelsSnapshot
|
||||
if s.knownChannels != nil {
|
||||
snap = s.knownChannels.load()
|
||||
}
|
||||
if snap == nil {
|
||||
// Empty cache — return a well-formed empty snapshot. Short
|
||||
// max-age so a slow first fetch (or disabled feature) doesn't
|
||||
// freeze the UI for the whole page lifetime.
|
||||
w.Header().Set("Cache-Control", "public, max-age=30")
|
||||
writeJSON(w, &KnownChannelsSnapshot{
|
||||
FetchedAt: time.Time{},
|
||||
Source: "",
|
||||
Entries: []KnownChannelEntry{},
|
||||
})
|
||||
return
|
||||
}
|
||||
// Catalogue refreshes every 24h upstream; 5 min browser cache is
|
||||
// well under that and avoids hammering the endpoint when the UI
|
||||
// re-renders the sidebar.
|
||||
w.Header().Set("Cache-Control", "public, max-age=300")
|
||||
writeJSON(w, filterSnapshotByRegion(snap, region))
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Behavior test (#1574): /api/config/client must expose `liveMapMaxNodes`
|
||||
// so the frontend can honor the operator-configured live-map node cap
|
||||
// instead of the hardcoded 2000 in public/live.js. Default is 2000;
|
||||
// operators tune via `liveMap.maxNodes` in config.json. Server clamps to
|
||||
// [100, 20000] to defang misconfig.
|
||||
func TestConfigClientExposesLiveMapMaxNodes(t *testing.T) {
|
||||
_, router := setupTestServer(t)
|
||||
req := httptest.NewRequest("GET", "/api/config/client", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != 200 {
|
||||
t.Fatalf("expected 200, got %d", w.Code)
|
||||
}
|
||||
var body map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
|
||||
t.Fatalf("decode body: %v", err)
|
||||
}
|
||||
v, present := body["liveMapMaxNodes"]
|
||||
if !present {
|
||||
t.Fatal("expected liveMapMaxNodes in /api/config/client response")
|
||||
}
|
||||
n, ok := v.(float64)
|
||||
if !ok {
|
||||
t.Fatalf("expected liveMapMaxNodes to be a number, got %T", v)
|
||||
}
|
||||
if int(n) != 2000 {
|
||||
t.Errorf("expected default liveMapMaxNodes=2000, got %d", int(n))
|
||||
}
|
||||
}
|
||||
|
||||
// Server-side clamp: operator misconfig (negative, zero, absurdly large)
|
||||
// must be coerced to safe bounds [100, 20000]. Default (unset) is 2000.
|
||||
func TestLiveMapMaxNodesClamp(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
set int
|
||||
want int
|
||||
}{
|
||||
{"default-when-unset", 0, 2000},
|
||||
{"negative-clamps-to-default", -42, 2000},
|
||||
{"below-min-clamps-up", 50, 100},
|
||||
{"in-range-passthrough", 4300, 4300},
|
||||
{"above-max-clamps-down", 99999, 20000},
|
||||
{"exact-min", 100, 100},
|
||||
{"exact-max", 20000, 20000},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
cfg := &Config{}
|
||||
cfg.LiveMap.MaxNodes = tc.set
|
||||
got := cfg.LiveMapMaxNodes()
|
||||
if got != tc.want {
|
||||
t.Errorf("LiveMapMaxNodes() with set=%d: want %d, got %d",
|
||||
tc.set, tc.want, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,90 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// TestLoad_PanicsWhenGraphNotLoadedAndEdgesExist pins the startup-ordering
|
||||
// invariant (munger R1 #2). Graph-load-before-packet-load is the entire
|
||||
// premise of PR #1643's fix: without an in-memory neighbor graph, the
|
||||
// path_json relay-hop fallback cannot resolve hops, so relay-node analytics
|
||||
// history collapses. main.go currently does the right thing — but nothing
|
||||
// asserts the ordering, so a future refactor could silently regress.
|
||||
//
|
||||
// Load() must panic when neighbor_edges has rows but s.graph.Load() returns
|
||||
// nil. Fast-fail at startup beats silently-wrong attribution.
|
||||
func TestLoad_PanicsWhenGraphNotLoadedAndEdgesExist(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
|
||||
rw, err := sql.Open("sqlite", "file:"+dbPath+"?_journal_mode=WAL")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rw.Close()
|
||||
|
||||
exec := func(s string, args ...interface{}) {
|
||||
if _, err := rw.Exec(s, args...); err != nil {
|
||||
t.Fatalf("setup exec failed: %v\nSQL: %s", err, s)
|
||||
}
|
||||
}
|
||||
|
||||
// Minimal CoreScope schema. PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
|
||||
exec(`CREATE TABLE transmissions (
|
||||
id INTEGER PRIMARY KEY,
|
||||
raw_hex TEXT, hash TEXT, first_seen TEXT,
|
||||
route_type INTEGER, payload_type INTEGER, payload_version INTEGER,
|
||||
decoded_json TEXT
|
||||
)`)
|
||||
// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
|
||||
exec(`CREATE TABLE observations (
|
||||
id INTEGER PRIMARY KEY, transmission_id INTEGER,
|
||||
observer_id TEXT, observer_name TEXT,
|
||||
direction TEXT, snr REAL, rssi REAL, score INTEGER,
|
||||
path_json TEXT, timestamp TEXT, raw_hex TEXT, resolved_path TEXT
|
||||
)`)
|
||||
// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
|
||||
exec(`CREATE TABLE observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`)
|
||||
// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
|
||||
exec(`CREATE TABLE nodes (
|
||||
public_key TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL,
|
||||
last_seen TEXT, first_seen TEXT, advert_count INTEGER DEFAULT 0
|
||||
)`)
|
||||
// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
|
||||
exec(`CREATE TABLE schema_version (version INTEGER)`)
|
||||
exec(`INSERT INTO schema_version (version) VALUES (1)`)
|
||||
// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
|
||||
exec(`CREATE TABLE neighbor_edges (
|
||||
node_a TEXT NOT NULL,
|
||||
node_b TEXT NOT NULL,
|
||||
count INTEGER DEFAULT 1,
|
||||
last_seen TEXT,
|
||||
PRIMARY KEY (node_a, node_b)
|
||||
)`)
|
||||
now := time.Now().UTC().Format(time.RFC3339)
|
||||
exec(`INSERT INTO neighbor_edges (node_a, node_b, count, last_seen) VALUES (?, ?, ?, ?)`,
|
||||
"aaa", "bbb", 5, now)
|
||||
|
||||
d, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenDB: %v", err)
|
||||
}
|
||||
defer d.conn.Close()
|
||||
|
||||
// Deliberately DO NOT call store.graph.Store(...). s.graph.Load() returns
|
||||
// nil → the bug condition the invariant guard must catch.
|
||||
store := NewPacketStore(d, &PacketStoreConfig{RetentionHours: 72})
|
||||
|
||||
defer func() {
|
||||
r := recover()
|
||||
if r == nil {
|
||||
t.Fatalf("Load() must panic when neighbor_edges has rows but graph is nil; got no panic")
|
||||
}
|
||||
}()
|
||||
_ = store.Load()
|
||||
}
|
||||
@@ -0,0 +1,172 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// createTestDBAmbiguousPrefix writes a SQLite fixture in which TWO repeater
// nodes (relayA, relayB) share the same hop prefix, and returns its path.
//
// The single observation's path_json holds ONLY that ambiguous prefix; no
// longer prefix is present to disambiguate, and no neighbor_edges table is
// created. The loader therefore has nothing to anchor a choice on. Picking
// either repeater anyway (via an observation-count fallback or simply the
// first candidate) is the time-travel bug munger flagged: the historical
// relay is unknown, yet today's tiebreak winner gets credited.
//
// firstSeen is used both as the transmission's first_seen and as the
// observation's timestamp, so the caller decides how old the row is.
func createTestDBAmbiguousPrefix(t *testing.T, relayA, relayB, hop, firstSeen string) string {
	t.Helper()

	dbPath := filepath.Join(t.TempDir(), "test.db")
	setupConn, err := sql.Open("sqlite", dbPath+"?_journal_mode=WAL")
	if err != nil {
		t.Fatal(err)
	}
	defer setupConn.Close()

	// mustExec runs one setup statement and aborts the test on failure.
	mustExec := func(query string, params ...interface{}) {
		if _, execErr := setupConn.Exec(query, params...); execErr != nil {
			t.Fatalf("setup exec failed: %v\nSQL: %s", execErr, query)
		}
	}

	// PREFLIGHT: async=true reason="test fixture: in-memory t.TempDir SQLite, never touches a real DB."
	mustExec(`CREATE TABLE transmissions (
		id INTEGER PRIMARY KEY,
		raw_hex TEXT, hash TEXT, first_seen TEXT,
		route_type INTEGER, payload_type INTEGER, payload_version INTEGER,
		decoded_json TEXT
	)`)
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE TABLE observations (
		id INTEGER PRIMARY KEY,
		transmission_id INTEGER,
		observer_id TEXT, observer_name TEXT,
		direction TEXT, snr REAL, rssi REAL, score INTEGER,
		path_json TEXT, timestamp TEXT,
		raw_hex TEXT,
		resolved_path TEXT
	)`)
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE TABLE observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`)
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE TABLE nodes (
		public_key TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL,
		last_seen TEXT, first_seen TEXT, advert_count INTEGER DEFAULT 0
	)`)
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE TABLE schema_version (version INTEGER)`)
	mustExec(`INSERT INTO schema_version (version) VALUES (1)`)
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE INDEX idx_tx_first_seen ON transmissions(first_seen)`)

	// Two repeaters behind the same prefix. Their advert_counts differ so
	// that a count-based tiebreak, if one is applied, picks a winner
	// deterministically.
	mustExec(`INSERT INTO nodes (public_key, name, role, advert_count) VALUES (?,?,?,?)`,
		relayA, "Relay A", "repeater", 50)
	mustExec(`INSERT INTO nodes (public_key, name, role, advert_count) VALUES (?,?,?,?)`,
		relayB, "Relay B", "repeater", 10)

	// One transmission with empty decoded_json, plus one observation whose
	// path_json is a one-element array holding the ambiguous hop and whose
	// resolved_path is NULL.
	mustExec("INSERT INTO transmissions VALUES (?,?,?,?,0,4,1,?)",
		1, "aa", "hashamb_1", firstSeen, `{}`)
	pathJSON := fmt.Sprintf(`[%q]`, hop)
	mustExec("INSERT INTO observations (id, transmission_id, observer_id, observer_name, direction, snr, rssi, score, path_json, timestamp, raw_hex, resolved_path) VALUES (?,?,?,?,?,?,?,?,?,?,?,NULL)",
		1, 1, "obs1", "Obs1", "RX", -10.0, -80.0, 5, pathJSON, firstSeen, "")

	return dbPath
}
|
||||
|
||||
// TestLoadChunk_AmbiguousPrefix_SkipsAttribution pins the fix for the
|
||||
// time-travel attribution gate (munger R1 #1). When path_json carries an
|
||||
// ambiguous prefix that matches multiple repeaters, the cold-load path
|
||||
// MUST NOT pick a winner via affinity/observation-count tiebreak — today's
|
||||
// affinity winner is not necessarily the historical hop. Safer to
|
||||
// under-attribute (skip byNode for that hop) than to mis-attribute.
|
||||
func TestLoadChunk_AmbiguousPrefix_SkipsAttribution(t *testing.T) {
|
||||
relayA := "aabbccddeeff00112233445566778899aabbccddeeff00112233445566778899"
|
||||
relayB := "aa1122334455667788990011223344556677889900112233445566778899aabb"
|
||||
hop := "aa" // 2-char prefix shared by both relayA and relayB
|
||||
|
||||
aged := time.Now().UTC().Add(-48 * time.Hour).Format(time.RFC3339)
|
||||
dbPath := createTestDBAmbiguousPrefix(t, relayA, relayB, hop, aged)
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
store := NewPacketStore(db, &PacketStoreConfig{
|
||||
RetentionHours: 72,
|
||||
HotStartupHours: 1, // hot load skips the 48h-old row → goes to loadChunk
|
||||
})
|
||||
// Empty graph: no neighbor-affinity tiebreak signal. Mirrors a freshly
|
||||
// restarted server whose only relay info is the prefix map.
|
||||
store.graph.Store(NewNeighborGraph())
|
||||
|
||||
if err := store.LoadChunked(0); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
if got := len(store.byNode[relayA]) + len(store.byNode[relayB]); got != 0 {
|
||||
t.Fatalf("setup: hot load unexpectedly picked up 48h-old row "+
|
||||
"(byNode total=%d, want 0) — test would not exercise loadChunk", got)
|
||||
}
|
||||
|
||||
chunkStart := time.Now().UTC().Add(-72 * time.Hour)
|
||||
chunkEnd := time.Now().UTC().Add(-1 * time.Hour)
|
||||
if err := store.loadChunk(chunkStart, chunkEnd); err != nil {
|
||||
t.Fatalf("loadChunk: %v", err)
|
||||
}
|
||||
|
||||
// Neither repeater may be over-attributed. The hop is ambiguous → the
|
||||
// cold-load loader MUST NOT pick one as the byNode owner.
|
||||
if got := len(store.byNode[relayA]); got != 0 {
|
||||
t.Errorf("byNode[%s]: got %d transmissions, want 0 — ambiguous-prefix hop "+
|
||||
"was over-attributed to relayA (time-travel attribution bug)", relayA, got)
|
||||
}
|
||||
if got := len(store.byNode[relayB]); got != 0 {
|
||||
t.Errorf("byNode[%s]: got %d transmissions, want 0 — ambiguous-prefix hop "+
|
||||
"was over-attributed to relayB (time-travel attribution bug)", relayB, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestLoad_AmbiguousPrefix_SkipsAttribution covers the hot-window Load()
|
||||
// path. Same setup as the loadChunk test but the row falls inside the hot
|
||||
// window so it is loaded by Load() / scanAndMergeChunk.
|
||||
func TestLoad_AmbiguousPrefix_SkipsAttribution(t *testing.T) {
|
||||
relayA := "bbccddeeff00112233445566778899aabbccddeeff00112233445566778899aa"
|
||||
relayB := "bb112233445566778899001122334455667788990011223344556677889900aa"
|
||||
hop := "bb"
|
||||
|
||||
ts := time.Now().UTC().Format(time.RFC3339)
|
||||
dbPath := createTestDBAmbiguousPrefix(t, relayA, relayB, hop, ts)
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
store := NewPacketStore(db, &PacketStoreConfig{RetentionHours: 72})
|
||||
store.graph.Store(NewNeighborGraph())
|
||||
|
||||
if err := store.LoadChunked(0); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
|
||||
if got := len(store.byNode[relayA]); got != 0 {
|
||||
t.Errorf("byNode[%s]: got %d transmissions, want 0 — ambiguous-prefix hop "+
|
||||
"was over-attributed (hot Load path)", relayA, got)
|
||||
}
|
||||
if got := len(store.byNode[relayB]); got != 0 {
|
||||
t.Errorf("byNode[%s]: got %d transmissions, want 0 — ambiguous-prefix hop "+
|
||||
"was over-attributed (hot Load path)", relayB, got)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,180 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// createTestDBPathJSONNoResolvedPath writes a SQLite fixture mirroring the
// LIVE deployment state after #1287 and returns its path. Observations
// carry a path_json hop list while observations.resolved_path is NULL (the
// ingestor no longer writes it; relay data is persisted as aggregate
// neighbor_edges instead).
//
// Exactly one repeater node is inserted, with public key relayPubkey.
// Callers pass a hopPrefix that this key starts with, so the in-memory
// prefix map can resolve the hop unambiguously.
//
// The transmission's decoded_json is empty ({}), so relayPubkey is NOT an
// endpoint (pubKey/destPubKey/srcPubKey). The ONLY way it can enter
// s.byNode is via path_json → resolvePathForObs relay-hop resolution.
func createTestDBPathJSONNoResolvedPath(t *testing.T, relayPubkey, hopPrefix, firstSeen string) string {
	t.Helper()

	dbPath := filepath.Join(t.TempDir(), "test.db")
	setupConn, err := sql.Open("sqlite", dbPath+"?_journal_mode=WAL")
	if err != nil {
		t.Fatal(err)
	}
	defer setupConn.Close()

	// mustExec runs one setup statement and aborts the test on failure.
	mustExec := func(query string, params ...interface{}) {
		if _, execErr := setupConn.Exec(query, params...); execErr != nil {
			t.Fatalf("setup exec failed: %v\nSQL: %s", execErr, query)
		}
	}

	// PREFLIGHT: async=true reason="test fixture: in-memory t.TempDir SQLite, never touches a real DB. Tables are CREATE-from-empty in a one-shot OpenDB call, not a schema migration over existing data."
	mustExec(`CREATE TABLE transmissions (
		id INTEGER PRIMARY KEY,
		raw_hex TEXT, hash TEXT, first_seen TEXT,
		route_type INTEGER, payload_type INTEGER, payload_version INTEGER,
		decoded_json TEXT
	)`)
	// The resolved_path column exists (as in the live schema) but stays NULL.
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE TABLE observations (
		id INTEGER PRIMARY KEY,
		transmission_id INTEGER,
		observer_id TEXT, observer_name TEXT,
		direction TEXT, snr REAL, rssi REAL, score INTEGER,
		path_json TEXT, timestamp TEXT,
		raw_hex TEXT,
		resolved_path TEXT
	)`)
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE TABLE observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`)
	// The production nodes schema names the key column public_key (not
	// pubkey); getAllNodes / buildPrefixMap read public_key, role,
	// advert_count and first_seen.
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE TABLE nodes (
		public_key TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL,
		last_seen TEXT, first_seen TEXT, advert_count INTEGER DEFAULT 0
	)`)
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE TABLE schema_version (version INTEGER)`)
	mustExec(`INSERT INTO schema_version (version) VALUES (1)`)
	// PREFLIGHT: async=true reason="test fixture, in-memory tmpdir DB"
	mustExec(`CREATE INDEX idx_tx_first_seen ON transmissions(first_seen)`)

	// Role "repeater" so that canAppearInPath() admits the node to the
	// prefix map.
	mustExec(`INSERT INTO nodes (public_key, name, role, advert_count) VALUES (?,?,?,?)`,
		relayPubkey, "Relay One", "repeater", 10)

	mustExec("INSERT INTO transmissions VALUES (?,?,?,?,0,4,1,?)",
		1, "aa", "hashpjf_1", firstSeen, `{}`)

	// resolved_path is explicitly NULL; path_json is a one-element array
	// holding the relay hop prefix.
	pathJSON := fmt.Sprintf(`[%q]`, hopPrefix)
	mustExec("INSERT INTO observations (id, transmission_id, observer_id, observer_name, direction, snr, rssi, score, path_json, timestamp, raw_hex, resolved_path) VALUES (?,?,?,?,?,?,?,?,?,?,?,NULL)",
		1, 1, "obs1", "Obs1", "RX", -10.0, -80.0, 5, pathJSON, firstSeen, "")

	return dbPath
}
|
||||
|
||||
// TestLoadChunked_ResolvesRelayHopsFromPathJSON_WhenResolvedPathEmpty pins the
|
||||
// fix for the "relay-node analytics empty after every restart" bug.
|
||||
//
|
||||
// On live, observations.resolved_path is 100% NULL (since #1287 the ingestor
|
||||
// persists relay data as neighbor_edges, not per-observation resolved_path).
|
||||
// The cold-load paths (Load / scanAndMergeChunk) indexed relay hops ONLY from
|
||||
// resolved_path, so a relay node's path-hop attribution was never rebuilt on
|
||||
// startup — it only re-accumulated from live traffic, collapsing the activity
|
||||
// timeline to "just the hour the server restarted".
|
||||
//
|
||||
// The fix: when resolved_path is empty, fall back to resolving the hops from
|
||||
// the persisted path_json using the in-memory prefix map + neighbor graph
|
||||
// (exactly what the live ingest path already does), then index the relay hops.
|
||||
func TestLoadChunked_ResolvesRelayHopsFromPathJSON_WhenResolvedPathEmpty(t *testing.T) {
|
||||
relayPK := "aabbccddeeff00112233445566778899aabbccddeeff00112233445566778899"
|
||||
hop := "aa" // 2-hex-char path hop; unique 2-char prefix of relayPK
|
||||
|
||||
ts := time.Now().UTC().Format(time.RFC3339)
|
||||
dbPath := createTestDBPathJSONNoResolvedPath(t, relayPK, hop, ts)
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
if !db.hasResolvedPath {
|
||||
t.Fatalf("setup: fixture should expose resolved_path column; hasResolvedPath=false")
|
||||
}
|
||||
|
||||
store := NewPacketStore(db, &PacketStoreConfig{RetentionHours: 72})
|
||||
// Empty graph is sufficient: a single prefix candidate resolves without
|
||||
// neighbor-affinity disambiguation. Mirrors a freshly restarted server
|
||||
// that has loaded its neighbor_edges snapshot before the packet load.
|
||||
store.graph.Store(NewNeighborGraph())
|
||||
|
||||
if err := store.LoadChunked(0); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
|
||||
// The relay pubkey only reachable through path_json resolution must be
|
||||
// indexed in byNode for the transmission.
|
||||
if got := len(store.byNode[relayPK]); got != 1 {
|
||||
t.Errorf("byNode[%s]: got %d transmissions, want 1 — cold load did not "+
|
||||
"resolve relay hops from path_json when resolved_path was NULL "+
|
||||
"(relay history lost on restart)", relayPK, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestLoadChunk_ResolvesRelayHopsFromPathJSON_WhenResolvedPathEmpty covers the
|
||||
// background-window loader (loadBackgroundChunks → loadChunk), which on live
|
||||
// loads everything older than hotStartupHours (24h) up to retentionHours
|
||||
// (168h). Without the path_json fallback here, a relay node's analytics for
|
||||
// the older 6 days would still vanish on every restart even with the hot
|
||||
// window fixed.
|
||||
func TestLoadChunk_ResolvesRelayHopsFromPathJSON_WhenResolvedPathEmpty(t *testing.T) {
|
||||
relayPK := "ccddeeff00112233445566778899aabbccddeeff00112233445566778899aabb"
|
||||
hop := "cc"
|
||||
|
||||
// Aged 48h so it falls in the background window, not the hot window.
|
||||
aged := time.Now().UTC().Add(-48 * time.Hour).Format(time.RFC3339)
|
||||
dbPath := createTestDBPathJSONNoResolvedPath(t, relayPK, hop, aged)
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
store := NewPacketStore(db, &PacketStoreConfig{
|
||||
RetentionHours: 72,
|
||||
HotStartupHours: 1, // hot load must NOT pick up the 48h-old row
|
||||
})
|
||||
store.graph.Store(NewNeighborGraph())
|
||||
|
||||
if err := store.LoadChunked(0); err != nil {
|
||||
t.Fatalf("LoadChunked: %v", err)
|
||||
}
|
||||
if got := len(store.byNode[relayPK]); got != 0 {
|
||||
t.Fatalf("setup: hot load unexpectedly picked up 48h-old row; "+
|
||||
"byNode[relayPK]=%d (want 0) — test would not exercise loadChunk", got)
|
||||
}
|
||||
|
||||
chunkStart := time.Now().UTC().Add(-72 * time.Hour)
|
||||
chunkEnd := time.Now().UTC().Add(-1 * time.Hour)
|
||||
if err := store.loadChunk(chunkStart, chunkEnd); err != nil {
|
||||
t.Fatalf("loadChunk: %v", err)
|
||||
}
|
||||
|
||||
if got := len(store.byNode[relayPK]); got != 1 {
|
||||
t.Errorf("byNode[%s]: got %d transmissions, want 1 — background loadChunk "+
|
||||
"did not resolve relay hops from path_json when resolved_path was NULL "+
|
||||
"(relay history lost on restart for the older retention window)", relayPK, got)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,160 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// createTestDBWithResolvedPath writes a fixture SQLite file under t.TempDir
// and returns its path. It seeds numTx transmissions timestamped 48h ago
// (one second apart), each with a single observation whose resolved_path is
// the JSON array of relayPubkeys. The layout follows
// createTestDBWithAgedPackets with the extra resolved_path column, so that
// loadChunk's hasResolvedPath branch is exercised.
func createTestDBWithResolvedPath(t *testing.T, numTx int, relayPubkeys []string) string {
	t.Helper()
	dbPath := filepath.Join(t.TempDir(), "test.db")

	sqlDB, openErr := sql.Open("sqlite", dbPath+"?_journal_mode=WAL")
	if openErr != nil {
		t.Fatal(openErr)
	}
	defer sqlDB.Close()

	// exec runs one setup statement and aborts the test on the first failure.
	exec := func(stmt string, params ...any) {
		_, execErr := sqlDB.Exec(stmt, params...)
		if execErr != nil {
			t.Fatalf("setup exec failed: %v\nSQL: %s", execErr, stmt)
		}
	}

	// Schema statements, executed in order.
	schema := []string{
		`CREATE TABLE transmissions (
		id INTEGER PRIMARY KEY,
		raw_hex TEXT, hash TEXT, first_seen TEXT,
		route_type INTEGER, payload_type INTEGER, payload_version INTEGER,
		decoded_json TEXT
	)`,
		`CREATE TABLE observations (
		id INTEGER PRIMARY KEY,
		transmission_id INTEGER,
		observer_id TEXT, observer_name TEXT,
		direction TEXT, snr REAL, rssi REAL, score INTEGER,
		path_json TEXT, timestamp TEXT,
		raw_hex TEXT,
		resolved_path TEXT
	)`,
		`CREATE TABLE observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`,
		`CREATE TABLE nodes (pubkey TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL, last_seen TEXT, first_seen TEXT, frequency REAL)`,
		`CREATE TABLE schema_version (version INTEGER)`,
		`INSERT INTO schema_version (version) VALUES (1)`,
		`CREATE INDEX idx_tx_first_seen ON transmissions(first_seen)`,
	}
	for _, stmt := range schema {
		exec(stmt)
	}

	// resolved_path payload: a JSON array of quoted pubkeys, ["pk1","pk2",...].
	rpJSON := "["
	sep := ""
	for _, pk := range relayPubkeys {
		rpJSON += sep + fmt.Sprintf("%q", pk)
		sep = ","
	}
	rpJSON += "]"

	// Row ids start at 1; row k is stamped (k-1) seconds after the 48h mark.
	base := time.Now().UTC().Add(-48 * time.Hour)
	for id := 1; id <= numTx; id++ {
		seq := id - 1
		ts := base.Add(time.Duration(seq) * time.Second).Format(time.RFC3339)
		exec("INSERT INTO transmissions VALUES (?,?,?,?,0,4,1,?)",
			id, "aa", fmt.Sprintf("hash1558_%d", seq), ts, `{}`)
		exec("INSERT INTO observations (id, transmission_id, observer_id, observer_name, direction, snr, rssi, score, path_json, timestamp, raw_hex, resolved_path) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
			id, id, "obs1", "Obs1", "RX", -10.0, -80.0, 5, `[]`, ts, "", rpJSON)
	}
	return dbPath
}
|
||||
|
||||
// TestLoadChunk_IndexesResolvedPathPubkeys_Issue1558 verifies the
|
||||
// contract-violation fix from #1558:
|
||||
//
|
||||
// `Load` (cmd/server/store.go:783-799) unmarshals each observation's
|
||||
// resolved_path column and feeds every relay-hop pubkey through
|
||||
// addToByNode / addResolvedPubkeysToPathHopIndex /
|
||||
// addToResolvedPubkeyIndex. `loadChunk` (cmd/server/store.go:937-1023)
|
||||
// scans the same column into resolvedPathStr but never feeds it
|
||||
// anywhere — so background-backfilled transmissions never appear under
|
||||
// their relay pubkeys in s.byNode, even though the same exact rows do
|
||||
// when they happen to fall inside the hot startup window.
|
||||
//
|
||||
// Symptom in production: Home page per-node `packetsToday` /
|
||||
// `totalTransmissions` / observer counts collapse after a container
|
||||
// restart for any node that primarily appears as a relay (rather than
|
||||
// as the endpoint pubKey/destPubKey/srcPubKey of a packet), because the
|
||||
// background backfill path silently drops the relay-hop indexing
|
||||
// branch. See issue #1558 for the full trace + diagnosis.
|
||||
//
|
||||
// This test loads a fixture DB exclusively via loadChunk (skipping
|
||||
// Load) and asserts that for each relay pubkey present in
|
||||
// `resolved_path` of every observation, s.byNode contains the
|
||||
// transmission.
|
||||
func TestLoadChunk_IndexesResolvedPathPubkeys_Issue1558(t *testing.T) {
|
||||
// Two distinct relay pubkeys appear in every observation's resolved_path.
|
||||
// Neither is an endpoint pubkey in decoded_json — so the ONLY path
|
||||
// they can enter byNode through is the resolved_path branch.
|
||||
relayPK1 := "1111111111111111111111111111111111111111111111111111111111111111"
|
||||
relayPK2 := "2222222222222222222222222222222222222222222222222222222222222222"
|
||||
|
||||
dbPath := createTestDBWithResolvedPath(t, 3, []string{relayPK1, relayPK2})
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
if !db.hasResolvedPath {
|
||||
t.Fatalf("setup: fixture should expose resolved_path column; hasResolvedPath=false")
|
||||
}
|
||||
|
||||
store := NewPacketStore(db, &PacketStoreConfig{
|
||||
RetentionHours: 72,
|
||||
HotStartupHours: 1, // initial Load should NOT pick up 48h-old fixture rows
|
||||
})
|
||||
if err := store.Load(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// Confirm the fixture rows are outside the hot window — Load() must
|
||||
// not have already populated byNode for the relay pubkeys; otherwise
|
||||
// the test would not actually be exercising loadChunk.
|
||||
if len(store.byNode[relayPK1]) != 0 {
|
||||
t.Fatalf("setup: Load() unexpectedly picked up 48h-old rows; "+
|
||||
"byNode[relayPK1]=%d entries (expected 0)", len(store.byNode[relayPK1]))
|
||||
}
|
||||
|
||||
// Trigger background backfill of the 48h-old window via loadChunk —
|
||||
// this is the code path under test.
|
||||
chunkStart := time.Now().UTC().Add(-72 * time.Hour)
|
||||
chunkEnd := time.Now().UTC().Add(-1 * time.Hour)
|
||||
if err := store.loadChunk(chunkStart, chunkEnd); err != nil {
|
||||
t.Fatalf("loadChunk failed: %v", err)
|
||||
}
|
||||
|
||||
// Sanity: loadChunk did merge the transmissions into the slice.
|
||||
if len(store.packets) != 3 {
|
||||
t.Fatalf("loadChunk should have merged 3 transmissions; got %d", len(store.packets))
|
||||
}
|
||||
|
||||
// THE ASSERTION: every relay pubkey listed in resolved_path must be
|
||||
// indexed in byNode for every transmission, because loadChunk's
|
||||
// per-row scan should mirror Load()'s 783-799 block.
|
||||
for _, relayPK := range []string{relayPK1, relayPK2} {
|
||||
got := len(store.byNode[relayPK])
|
||||
if got != 3 {
|
||||
t.Errorf("byNode[%s]: got %d transmissions, want 3 — "+
|
||||
"loadChunk dropped the resolved_path indexing branch "+
|
||||
"(issue #1558)",
|
||||
relayPK, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
+88
-15
@@ -109,22 +109,37 @@ func main() {
|
||||
log.Printf("[security] WARNING: API key is weak or a known default — write endpoints are vulnerable")
|
||||
}
|
||||
|
||||
// Apply Go runtime soft memory limit (#836).
|
||||
// Honors GOMEMLIMIT if set; otherwise derives from packetStore.maxMemoryMB.
|
||||
// Apply Go runtime soft memory limit (#836, #1010).
|
||||
// Precedence: GOMEMLIMIT env > runtime.maxMemoryMB > derived from packetStore.maxMemoryMB.
|
||||
{
|
||||
_, envSet := os.LookupEnv("GOMEMLIMIT")
|
||||
runtimeMaxMB := 0
|
||||
if cfg.Runtime != nil {
|
||||
runtimeMaxMB = cfg.Runtime.MaxMemoryMB
|
||||
}
|
||||
maxMB := 0
|
||||
if cfg.PacketStore != nil {
|
||||
maxMB = cfg.PacketStore.MaxMemoryMB
|
||||
}
|
||||
limit, source := applyMemoryLimit(maxMB, envSet)
|
||||
// runtime.maxMemoryMB (explicit) wins over packetStore-derived (implicit).
|
||||
effectiveMB := maxMB
|
||||
usedRuntimeCfg := false
|
||||
if !envSet && runtimeMaxMB > 0 {
|
||||
effectiveMB = runtimeMaxMB
|
||||
usedRuntimeCfg = true
|
||||
}
|
||||
limit, source := applyMemoryLimit(effectiveMB, envSet)
|
||||
switch source {
|
||||
case "env":
|
||||
log.Printf("[memlimit] using GOMEMLIMIT from environment (%s)", os.Getenv("GOMEMLIMIT"))
|
||||
case "derived":
|
||||
log.Printf("[memlimit] derived from packetStore.maxMemoryMB=%d → %d MiB (1.5x headroom)", maxMB, limit/(1024*1024))
|
||||
if usedRuntimeCfg {
|
||||
log.Printf("[memlimit] runtime.maxMemoryMB=%d → %d MiB (1.5x headroom)", runtimeMaxMB, limit/(1024*1024))
|
||||
} else {
|
||||
log.Printf("[memlimit] derived from packetStore.maxMemoryMB=%d → %d MiB (1.5x headroom)", maxMB, limit/(1024*1024))
|
||||
}
|
||||
default:
|
||||
log.Printf("[memlimit] no soft memory limit set (GOMEMLIMIT unset, packetStore.maxMemoryMB=0); recommend setting one to avoid container OOM-kill")
|
||||
log.Printf("[memlimit] unset → default (no soft memory limit; recommend setting GOMEMLIMIT or runtime.maxMemoryMB to ≥1.5× working set to avoid OOM-kill)")
|
||||
}
|
||||
warnIfMemlimitUnderprovisioned(limit)
|
||||
}
|
||||
@@ -183,18 +198,56 @@ func main() {
|
||||
// In-memory packet store
|
||||
store := NewPacketStore(database, cfg.PacketStore, cfg.CacheTTL)
|
||||
store.config = cfg
|
||||
if err := store.Load(); err != nil {
|
||||
log.Fatalf("[store] failed to load: %v", err)
|
||||
|
||||
// Load the persisted neighbor graph BEFORE the packet load so the
|
||||
// chunked loader can resolve relay-hop pubkeys from path_json. Since
|
||||
// #1287 the ingestor persists relay data only as aggregate
|
||||
// neighbor_edges — observations.resolved_path is never written — so
|
||||
// without an available graph at load time a relay node's analytics
|
||||
// history would rebuild only from post-restart live traffic (the
|
||||
// "timeline empty after every restart" bug). neighbor_edges is small,
|
||||
// so this adds negligible latency before the HTTP listener binds. The
|
||||
// fresh-DB branch (no snapshot) still builds in-memory AFTER the load
|
||||
// below, because BuildFromStore needs the loaded packets.
|
||||
neighborEdgesPersisted := neighborEdgesTableExists(database.conn)
|
||||
if neighborEdgesPersisted {
|
||||
store.graph.Store(loadNeighborEdgesFromDB(database.conn))
|
||||
log.Printf("[neighbor] loaded persisted neighbor graph")
|
||||
}
|
||||
|
||||
// #1009: chunked Load with early HTTP readiness. LoadChunked runs
|
||||
// asynchronously and signals FirstChunkReady after the first chunk
|
||||
// is merged so the HTTP listener can bind without waiting for the
|
||||
// full multi-minute scan to finish. loadStatusMiddleware (wired
|
||||
// below) advertises loading|ready via X-CoreScope-Load-Status.
|
||||
chunkSize := cfg.DBLoadChunkSize()
|
||||
loadErrCh := make(chan error, 1)
|
||||
go func() {
|
||||
loadErrCh <- store.LoadChunked(chunkSize)
|
||||
}()
|
||||
select {
|
||||
case <-store.FirstChunkReady():
|
||||
log.Printf("[store] first chunk ready (chunkSize=%d) — HTTP listener may bind", chunkSize)
|
||||
case err := <-loadErrCh:
|
||||
if err != nil {
|
||||
log.Fatalf("[store] LoadChunked failed before first chunk: %v", err)
|
||||
}
|
||||
log.Printf("[store] LoadChunked completed before first-chunk signal (empty DB?)")
|
||||
}
|
||||
go func() {
|
||||
if err := <-loadErrCh; err != nil {
|
||||
log.Printf("[store] LoadChunked background error: %v", err)
|
||||
}
|
||||
}()
|
||||
if store.hotStartupHours > 0 {
|
||||
log.Printf("[store] starting background load: filling retentionHours=%gh from hotStartupHours=%gh",
|
||||
store.retentionHours, store.hotStartupHours)
|
||||
go store.loadBackgroundChunks()
|
||||
}
|
||||
|
||||
// Initialize persisted neighbor graph.
|
||||
// Per #1287, schema migrations all live in the ingestor (see
|
||||
// dbschema.Apply). The server merely loads the snapshot here and
|
||||
// Neighbor graph: the persisted snapshot (if present) was already
|
||||
// loaded above, before the packet load. Per #1287 schema migrations
|
||||
// all live in the ingestor; the server only reads the snapshot and
|
||||
// then refreshes it via the recompNeighborGraph slot every 60s.
|
||||
dbPath = database.path
|
||||
database.hasResolvedPath = true // dbschema.AssertReady above already verified observations.resolved_path exists
|
||||
@@ -202,11 +255,7 @@ func main() {
|
||||
// WaitGroup for background init steps that gate /api/healthz readiness.
|
||||
var initWg sync.WaitGroup
|
||||
|
||||
// Load or build neighbor graph
|
||||
if neighborEdgesTableExists(database.conn) {
|
||||
store.graph.Store(loadNeighborEdgesFromDB(database.conn))
|
||||
log.Printf("[neighbor] loaded persisted neighbor graph")
|
||||
} else {
|
||||
if !neighborEdgesPersisted {
|
||||
// No persisted snapshot yet (e.g. fresh DB before the ingestor
|
||||
// has run its first edge-build cycle). Build an in-memory graph
|
||||
// from the packets we already have so reads aren't empty. We
|
||||
@@ -331,6 +380,26 @@ func main() {
|
||||
defer close(stopNeighborGraphCache)
|
||||
log.Printf("[neighbor-graph-cache] background recompute enabled (interval=%s)", ngInterval)
|
||||
|
||||
// Known-channels catalogue cache (issue #1323). OPT-IN: an empty
|
||||
// cfg.KnownChannelsURL leaves srv.knownChannels nil and starts no
|
||||
// background fetch. The /api/known-channels endpoint then serves an
|
||||
// empty snapshot. Operators who want the community catalogue must
|
||||
// set knownChannelsUrl explicitly in config.json (see
|
||||
// config.example.json for the pinned-SHA recommendation).
|
||||
if cfg.KnownChannelsURL != "" {
|
||||
kcRefresh := DefaultKnownChannelsRefresh
|
||||
if cfg.KnownChannelsRefreshMs > 0 {
|
||||
kcRefresh = time.Duration(cfg.KnownChannelsRefreshMs) * time.Millisecond
|
||||
}
|
||||
srv.knownChannels = newKnownChannelsCache(cfg.KnownChannelsURL, kcRefresh)
|
||||
kcCtx, stopKnownChannels := context.WithCancel(context.Background())
|
||||
srv.knownChannels.run(kcCtx)
|
||||
defer stopKnownChannels()
|
||||
log.Printf("[known-channels] background fetch enabled (url=%s, refresh=%s)", cfg.KnownChannelsURL, kcRefresh)
|
||||
} else {
|
||||
log.Printf("[known-channels] disabled (knownChannelsUrl unset in config)")
|
||||
}
|
||||
|
||||
// Steady-state repeater-enrichment recomputer (issue #1262).
|
||||
// Prewarms the bulk caches feeding handleNodes so the very first
|
||||
// /api/nodes?limit=2000 from live.js's SPA bootstrap hits a
|
||||
@@ -380,6 +449,10 @@ func main() {
|
||||
handler = gzipMiddlewareWithConfig(cfg.Compression, router)
|
||||
log.Printf("[server] HTTP gzip compression enabled")
|
||||
}
|
||||
// #1009: stamp X-CoreScope-Load-Status on every response so probes
|
||||
// and dashboards can see when the chunked Load is still in flight.
|
||||
// Outermost wrap so the header is set regardless of gzip/etc.
|
||||
handler = loadStatusMiddleware(store, handler)
|
||||
if cfg.WSCompressionEnabled() {
|
||||
log.Printf("[server] WebSocket permessage-deflate compression enabled")
|
||||
}
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// mqttBrokerSchemes is the set of broker URL schemes (looked up in lower
// case) whose embedded `user:pass@host` credentials are redacted. URLs with
// any other scheme are left untouched.
var mqttBrokerSchemes = map[string]bool{
	"mqtt": true, "mqtts": true, "tcp": true, "ssl": true, "ws": true, "wss": true,
}

// mqttBrokerURLRe locates a broker URL embedded inside a larger free-form
// string — e.g. an error message that quotes the failing broker. It matches
// a known scheme (case-insensitively), `://`, and every following
// non-whitespace character. The match is deliberately generous: it may
// include trailing punctuation such as `)` or `,`, and it runs through any
// `@` inside the password (#1682 adversarial r1). Locating the userinfo
// boundary is left to redactBrokerURL.
var mqttBrokerURLRe = regexp.MustCompile(`(?i)(?:mqtt|mqtts|tcp|ssl|ws|wss)://[^\s]*`)

// maskBrokerURL returns s with the inline password of every broker URL it
// contains replaced by `****`.
//
//	mqtt://user:secret@host:1883 -> mqtt://user:****@host:1883
//	mqtt://user:p@ss@host        -> mqtt://user:****@host   (password with `@`)
//
// Strings without inline credentials (including the empty string) are
// returned unchanged.
//
// The whole string is first tried as a single broker URL. If that does not
// apply, s is treated as free-form text and every embedded broker URL found
// by mqttBrokerURLRe is redacted in place.
func maskBrokerURL(s string) string {
	if s == "" {
		return s
	}
	// Fast path: the whole string is the broker URL.
	if masked, ok := redactBrokerURL(s); ok {
		return masked
	}
	// Fallback: free-form string (e.g. an error message) containing URLs.
	// redactBrokerURL returns its input unchanged when nothing is redacted.
	return mqttBrokerURLRe.ReplaceAllStringFunc(s, func(m string) string {
		out, _ := redactBrokerURL(m)
		return out
	})
}

// redactBrokerURL masks the password of a single broker URL. When s starts
// with an mqtt-family scheme and its authority carries `user:password@`
// userinfo, it returns s with the password replaced by `****` and ok=true.
// Otherwise it returns s unchanged and ok=false.
//
// The userinfo boundary is the LAST `@` inside the authority, i.e. the text
// between `://` and the first `/`, `?` or `#`. This is the same split
// url.Parse applies. Restricting the search to the authority keeps an `@`
// in the path or query from being mistaken for the boundary, which would
// discard the real host.
//
// When url.Parse accepts s, the (decoded) username it reports is emitted.
// When url.Parse rejects s — e.g. trailing punctuation glued to the port in
// a quoted error message, or a password character Go does not allow in
// userinfo — the password is still masked from the lexical split, so a
// parse failure never leaves the credential in the output. The one
// exception is a rejected string containing whitespace: that is free-form
// text, and ok=false is returned so the caller can redact the embedded URLs
// token by token.
func redactBrokerURL(s string) (string, bool) {
	// Keep the scheme exactly as written (url.Parse would lowercase it).
	scheme, rest, found := strings.Cut(s, "://")
	if !found || !mqttBrokerSchemes[strings.ToLower(scheme)] {
		return s, false
	}

	authority := rest
	if end := strings.IndexAny(rest, "/?#"); end >= 0 {
		authority = rest[:end]
	}
	at := strings.LastIndex(authority, "@")
	if at < 0 {
		return s, false // no userinfo at all
	}
	username, _, hasPass := strings.Cut(authority[:at], ":")
	if !hasPass {
		return s, false // username only, nothing to hide
	}

	u, err := url.Parse(s)
	switch {
	case err == nil && u.User != nil:
		username = u.User.Username()
	case strings.ContainsAny(s, " \t\n\f\r"):
		// Same whitespace set as `\s` in mqttBrokerURLRe.
		return s, false
	}

	// Re-assemble manually rather than via url.UserPassword + u.String()
	// because the latter percent-encodes the `*` mask token into `%2A`,
	// defeating the user-visible redaction marker. Everything after the
	// userinfo boundary is copied through untouched.
	return scheme + "://" + username + ":****@" + rest[at+1:], true
}
|
||||
|
||||
// MqttSourceStatus is one row of /api/mqtt/status: the status of a single
// MQTT source. Its shape follows what the ingestor publishes on disk
// (cmd/ingestor SourceStatusSnapshot); the server redacts broker URL
// credentials before serving it, because the API response must never expose
// the broker password to operators (#1043 acceptance criterion).
//
// NOTE(review): the *Unix fields are presumably Unix timestamps in seconds —
// confirm against the ingestor.
type MqttSourceStatus struct {
	Name               string `json:"name"`
	Broker             string `json:"broker"`
	Connected          bool   `json:"connected"`
	LastConnectUnix    int64  `json:"lastConnectUnix"`
	LastDisconnectUnix int64  `json:"lastDisconnectUnix"`
	LastPacketUnix     int64  `json:"lastPacketUnix"`
	ConnectCount       int64  `json:"connectCount"`
	DisconnectCount    int64  `json:"disconnectCount"`
	PacketsTotal       int64  `json:"packetsTotal"`
	PacketsLast5m      int64  `json:"packetsLast5m"`
	// LastError is omitted from the JSON when empty.
	LastError string `json:"lastError,omitempty"`
}
|
||||
|
||||
// MqttStatusResponse is the JSON envelope returned by /api/mqtt/status.
|
||||
type MqttStatusResponse struct {
|
||||
Sources []MqttSourceStatus `json:"sources"`
|
||||
SampleAt string `json:"sampleAt"`
|
||||
}
|
||||
|
||||
// ingestorMqttStatusEnvelope is the partial shape the server decodes from
|
||||
// the ingestor stats file (additive — older ingestors omit the field).
|
||||
type ingestorMqttStatusEnvelope struct {
|
||||
SampledAt string `json:"sampledAt"`
|
||||
SourceStatuses []MqttSourceStatus `json:"source_statuses"`
|
||||
}
|
||||
|
||||
// handleMqttStatus serves GET /api/mqtt/status. Reads the ingestor stats
|
||||
// file, masks broker-URL passwords, and returns the per-source status
|
||||
// list. Returns an empty list (200 OK) when the stats file is missing
|
||||
// or unparseable — the UI panel renders a "no data yet" state.
|
||||
func (s *Server) handleMqttStatus(w http.ResponseWriter, r *http.Request) {
|
||||
resp := MqttStatusResponse{Sources: []MqttSourceStatus{}, SampleAt: ""}
|
||||
data, err := os.ReadFile(IngestorStatsPath())
|
||||
if err != nil {
|
||||
writeJSON(w, resp)
|
||||
return
|
||||
}
|
||||
var env ingestorMqttStatusEnvelope
|
||||
if err := json.Unmarshal(data, &env); err != nil {
|
||||
writeJSON(w, resp)
|
||||
return
|
||||
}
|
||||
resp.SampleAt = env.SampledAt
|
||||
for _, src := range env.SourceStatuses {
|
||||
src.Broker = maskBrokerURL(src.Broker)
|
||||
// Broker libraries occasionally quote the failing URL in the
|
||||
// error string — redact there too as defense-in-depth.
|
||||
src.LastError = maskBrokerURL(src.LastError)
|
||||
resp.Sources = append(resp.Sources, src)
|
||||
}
|
||||
writeJSON(w, resp)
|
||||
}
|
||||
@@ -0,0 +1,142 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestMqttStatus_MasksBrokerPassword (#1043) asserts the /api/mqtt/status
|
||||
// handler never leaks the broker password embedded in a mqtt:// URL.
|
||||
// Operators viewing the API response (or the Observers panel that
|
||||
// consumes it) must see `****` in place of the inline credential.
|
||||
//
|
||||
// Test shape: write a stub ingestor stats file with one source whose
|
||||
// broker URL contains a plaintext password, invoke the handler, assert
|
||||
// the JSON response (a) contains the username + host, (b) does NOT
|
||||
// contain the password substring.
|
||||
func TestMqttStatus_MasksBrokerPassword(t *testing.T) {
|
||||
const password = "hunter2supersecret"
|
||||
const rawBroker = "mqtt://obsuser:" + password + "@broker.example.com:1883"
|
||||
|
||||
tmp := t.TempDir()
|
||||
statsPath := filepath.Join(tmp, "ingestor-stats.json")
|
||||
t.Setenv("CORESCOPE_INGESTOR_STATS", statsPath)
|
||||
|
||||
// Stub stats file: one MQTT source with a credentialed broker URL.
|
||||
stub := map[string]any{
|
||||
"sampledAt": "2026-06-12T12:30:00Z",
|
||||
"source_statuses": []map[string]any{{
|
||||
"name": "local",
|
||||
"broker": rawBroker,
|
||||
"connected": true,
|
||||
"lastPacketUnix": 1717977000,
|
||||
"connectCount": 1,
|
||||
"disconnectCount": 0,
|
||||
"packetsTotal": 42,
|
||||
"packetsLast5m": 7,
|
||||
}},
|
||||
}
|
||||
data, err := json.Marshal(stub)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal stub: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(statsPath, data, 0o600); err != nil {
|
||||
t.Fatalf("write stub: %v", err)
|
||||
}
|
||||
|
||||
srv := &Server{}
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/mqtt/status", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
srv.handleMqttStatus(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200; body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
t.Logf("response body: %s", body)
|
||||
|
||||
if strings.Contains(body, password) {
|
||||
t.Errorf("response leaks broker password %q in body: %s", password, body)
|
||||
}
|
||||
// Sanity: the response still identifies the source by name + host.
|
||||
if !strings.Contains(body, "broker.example.com") {
|
||||
t.Errorf("response missing broker host: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "obsuser") {
|
||||
t.Errorf("response missing broker username: %s", body)
|
||||
}
|
||||
// Mask token must be present so operators can tell credentials were
|
||||
// redacted vs the broker URL never having a password to begin with.
|
||||
if !strings.Contains(body, "****") {
|
||||
t.Errorf("response missing redaction marker '****': %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMqttStatus_EmptyWhenNoStatsFile asserts the handler returns an empty
|
||||
// list (200 OK) when the ingestor stats file is missing — the UI panel
|
||||
// renders a "no data yet" state in that case.
|
||||
func TestMqttStatus_EmptyWhenNoStatsFile(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
t.Setenv("CORESCOPE_INGESTOR_STATS", filepath.Join(tmp, "does-not-exist.json"))
|
||||
|
||||
srv := &Server{}
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/mqtt/status", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
srv.handleMqttStatus(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status = %d, want 200", rec.Code)
|
||||
}
|
||||
var resp MqttStatusResponse
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("unmarshal: %v; body=%s", err, rec.Body.String())
|
||||
}
|
||||
if len(resp.Sources) != 0 {
|
||||
t.Errorf("Sources len = %d, want 0", len(resp.Sources))
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaskBrokerURL_Patterns is a unit table-driven test for the masking
|
||||
// helper. Kept separate from the handler test so a regression in the
|
||||
// regex localizes immediately.
|
||||
func TestMaskBrokerURL_Patterns(t *testing.T) {
|
||||
cases := []struct {
|
||||
name, in, want string
|
||||
}{
|
||||
{"plain mqtt no creds", "mqtt://broker.example.com:1883", "mqtt://broker.example.com:1883"},
|
||||
{"mqtt with creds", "mqtt://u:secret@broker.example.com:1883", "mqtt://u:****@broker.example.com:1883"},
|
||||
{"mqtts with creds", "mqtts://u:secret@broker.example.com:8883", "mqtts://u:****@broker.example.com:8883"},
|
||||
{"tcp with creds", "tcp://u:p@host:1883", "tcp://u:****@host:1883"},
|
||||
{"ssl with creds", "ssl://u:p@host:8883", "ssl://u:****@host:8883"},
|
||||
{"ws with creds", "ws://u:p@host:8080/mqtt", "ws://u:****@host:8080/mqtt"},
|
||||
{"wss with creds", "wss://u:p@host:443/mqtt", "wss://u:****@host:443/mqtt"},
|
||||
{"uppercase scheme", "MQTT://u:p@host:1883", "MQTT://u:****@host:1883"},
|
||||
{"empty", "", ""},
|
||||
{"long password", "mqtt://obsuser:hunter2supersecretXYZ123@host:1883", "mqtt://obsuser:****@host:1883"},
|
||||
{"no scheme bare host", "host:1883", "host:1883"},
|
||||
// Adversarial r1 review (#1682): password contains @. The previous
|
||||
// regex-only impl matched only up to the FIRST @, exposing "ss" as
|
||||
// part of the path: "mqtt://user:****@ss@host". url.Parse handles
|
||||
// this correctly because Go interprets the LAST @ as the userinfo
|
||||
// boundary.
|
||||
{"password with single @", "mqtt://user:p@ss@host:1883", "mqtt://user:****@host:1883"},
|
||||
{"password with multiple @", "mqtt://user:p@ss@wo@host:1883", "mqtt://user:****@host:1883"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
got := maskBrokerURL(c.in)
|
||||
if got != c.want {
|
||||
t.Errorf("maskBrokerURL(%q) = %q, want %q", c.in, got, c.want)
|
||||
}
|
||||
// Inline secret must never survive.
|
||||
if c.in != c.want && strings.Contains(got, "secret") {
|
||||
t.Errorf("output still contains 'secret': %q", got)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -26,6 +26,10 @@ type NeighborEntry struct {
|
||||
Name *string `json:"name"`
|
||||
Role *string `json:"role"`
|
||||
Count int `json:"count"`
|
||||
// CountsByMode breaks Count down by observation hash-prefix mode in bytes
|
||||
// (1, 2, 4, 6). Lets the frontend weight confidence by ambiguity rather
|
||||
// than treating every sighting as equal evidence. Issue #1638.
|
||||
CountsByMode map[int]int `json:"counts_by_mode,omitempty"`
|
||||
Score float64 `json:"score"`
|
||||
FirstSeen string `json:"first_seen"`
|
||||
LastSeen string `json:"last_seen"`
|
||||
@@ -104,6 +108,10 @@ func (s *Server) handleNodeNeighbors(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, 404, "Not found")
|
||||
return
|
||||
}
|
||||
if s.isPubkeyHidden(pubkey) {
|
||||
writeError(w, 404, "Not found")
|
||||
return
|
||||
}
|
||||
|
||||
minCount := 1
|
||||
if v := r.URL.Query().Get("min_count"); v != "" {
|
||||
@@ -156,13 +164,14 @@ func (s *Server) handleNodeNeighbors(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
entry := NeighborEntry{
|
||||
Prefix: e.Prefix,
|
||||
Count: e.Count,
|
||||
Score: score,
|
||||
FirstSeen: e.FirstSeen.UTC().Format(time.RFC3339),
|
||||
LastSeen: e.LastSeen.UTC().Format(time.RFC3339),
|
||||
Ambiguous: e.Ambiguous,
|
||||
Observers: observerList(e.Observers),
|
||||
Prefix: e.Prefix,
|
||||
Count: e.Count,
|
||||
CountsByMode: copyCountsByMode(e.CountsByMode),
|
||||
Score: score,
|
||||
FirstSeen: e.FirstSeen.UTC().Format(time.RFC3339),
|
||||
LastSeen: e.LastSeen.UTC().Format(time.RFC3339),
|
||||
Ambiguous: e.Ambiguous,
|
||||
Observers: observerList(e.Observers),
|
||||
}
|
||||
|
||||
if e.SNRCount > 0 {
|
||||
@@ -334,6 +343,10 @@ func (s *Server) computeNeighborGraphResponse(minCount int, minScore float64, re
|
||||
if s.cfg != nil && (s.cfg.IsBlacklisted(e.NodeA) || s.cfg.IsBlacklisted(e.NodeB)) {
|
||||
continue
|
||||
}
|
||||
// #1181: also drop edges touching a hidden-prefix node.
|
||||
if s.isPubkeyHidden(e.NodeA) || s.isPubkeyHidden(e.NodeB) {
|
||||
continue
|
||||
}
|
||||
|
||||
ge := GraphEdge{
|
||||
Source: e.NodeA,
|
||||
@@ -412,6 +425,20 @@ func (s *Server) computeNeighborGraphResponse(minCount int, minScore float64, re
|
||||
|
||||
// ─── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
// copyCountsByMode duplicates the per-mode count map so the API response does
// not alias the map held by the live in-memory edge. A nil or empty input
// yields nil, which lets `omitempty` drop the field from legacy payloads.
func copyCountsByMode(m map[int]int) map[int]int {
	size := len(m)
	if size == 0 {
		return nil
	}
	dup := make(map[int]int, size)
	for mode := range m {
		dup[mode] = m[mode]
	}
	return dup
}
|
||||
|
||||
func observerList(m map[string]bool) []string {
|
||||
if len(m) == 0 {
|
||||
return []string{}
|
||||
@@ -429,6 +456,9 @@ func (s *Server) buildNodeInfoMap() map[string]nodeInfo {
|
||||
if s.store == nil {
|
||||
return nil
|
||||
}
|
||||
// FirstSeen is folded into getAllNodes (and therefore into the 30s
|
||||
// node cache) so callers like /api/nodes/{pk}/reach get the field
|
||||
// without a per-request SELECT — fixes #1627 r3 regression.
|
||||
nodes, _ := s.store.getCachedNodesAndPM()
|
||||
m := make(map[string]nodeInfo, len(nodes))
|
||||
for _, n := range nodes {
|
||||
@@ -497,6 +527,14 @@ func dedupPrefixEntries(entries []NeighborEntry) []NeighborEntry {
|
||||
|
||||
// Merge counts from unresolved into resolved.
|
||||
entries[j].Count += entries[i].Count
|
||||
if entries[i].CountsByMode != nil {
|
||||
if entries[j].CountsByMode == nil {
|
||||
entries[j].CountsByMode = make(map[int]int)
|
||||
}
|
||||
for m, c := range entries[i].CountsByMode {
|
||||
entries[j].CountsByMode[m] += c
|
||||
}
|
||||
}
|
||||
|
||||
// Preserve higher LastSeen.
|
||||
if entries[i].LastSeen > entries[j].LastSeen {
|
||||
|
||||
@@ -525,3 +525,123 @@ func TestBuildNodeInfoMap_ObserverEnrichment(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildNodeInfoMap_FirstSeenIsCached asserts the regression introduced by
|
||||
// #1627 r3 stays fixed: the per-pubkey first_seen field MUST come from the
|
||||
// already-30s-cached getCachedNodesAndPM path, not from a fresh uncached
|
||||
// `SELECT … FROM nodes` scan on every call.
|
||||
//
|
||||
// Method (no DB-driver wrapper needed): mutate the underlying SQLite file's
|
||||
// first_seen via a separate rw connection between two consecutive calls to
|
||||
// buildNodeInfoMap(). If first_seen is read fresh on every call (the
|
||||
// regression), the second call sees the new value. If folded into the
|
||||
// existing 30s node cache, both calls return the original value — same as
|
||||
// every other nodeInfo field that comes from getAllNodes().
|
||||
func TestBuildNodeInfoMap_FirstSeenIsCached(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
dbPath := tmpDir + "/test.db"
|
||||
|
||||
// Seed via rw connection.
|
||||
rw, err := sql.Open("sqlite", dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rw.Close()
|
||||
for _, stmt := range []string{
|
||||
"CREATE TABLE nodes (public_key TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL, last_seen TEXT, first_seen TEXT, advert_count INTEGER)",
|
||||
"CREATE TABLE observers (id TEXT, name TEXT, iata TEXT)",
|
||||
"INSERT INTO nodes VALUES ('AAAA1111', 'Repeater-1', 'repeater', 0, 0, '', '2024-01-01T00:00:00Z', 0)",
|
||||
} {
|
||||
if _, err := rw.Exec(stmt); err != nil {
|
||||
t.Fatalf("seed exec %q: %v", stmt, err)
|
||||
}
|
||||
}
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
store := NewPacketStore(db, nil)
|
||||
store.Load()
|
||||
|
||||
srv := &Server{
|
||||
db: db,
|
||||
store: store,
|
||||
perfStats: NewPerfStats(),
|
||||
}
|
||||
|
||||
// Call 1: warm cache and record observed first_seen.
|
||||
m1 := srv.buildNodeInfoMap()
|
||||
first1 := m1["aaaa1111"].FirstSeen
|
||||
if first1 != "2024-01-01T00:00:00Z" {
|
||||
t.Fatalf("setup: expected first_seen=2024-01-01T00:00:00Z, got %q", first1)
|
||||
}
|
||||
|
||||
// Mutate first_seen out-of-band via the rw connection. Any code path
|
||||
// that re-reads first_seen from disk (uncached) will see this new
|
||||
// value; a path that folds first_seen into the 30s node cache will
|
||||
// not, because the cache is well under 30s old.
|
||||
if _, err := rw.Exec("UPDATE nodes SET first_seen='2099-12-31T23:59:59Z' WHERE public_key='AAAA1111'"); err != nil {
|
||||
t.Fatalf("mutate: %v", err)
|
||||
}
|
||||
|
||||
// Call 2: should match call 1 if first_seen is cached.
|
||||
m2 := srv.buildNodeInfoMap()
|
||||
first2 := m2["aaaa1111"].FirstSeen
|
||||
if first2 != first1 {
|
||||
t.Errorf("buildNodeInfoMap re-scanned nodes.first_seen uncached (#1627 r3 regression): "+
|
||||
"call 1 saw %q, call 2 saw %q after out-of-band UPDATE; expected both calls to return "+
|
||||
"the cached value because getCachedNodesAndPM has a 30s TTL",
|
||||
first1, first2)
|
||||
}
|
||||
}
|
||||
|
||||
// TestGetAllNodes_FirstSeenSchemaFallback exercises the schema-probe rung that
|
||||
// fires when nodes.first_seen is missing. The richest SELECT errors out, the
|
||||
// loop falls through to the next-richest query, and the resulting nodeInfo
|
||||
// values must have empty FirstSeen with no panic. Regression coverage for the
|
||||
// existing fallback branch (#1632 review loop 1).
|
||||
func TestGetAllNodes_FirstSeenSchemaFallback(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
dbPath := tmpDir + "/test.db"
|
||||
|
||||
// Seed a nodes table WITHOUT first_seen (advert_count + last_seen present).
|
||||
rw, err := sql.Open("sqlite", dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rw.Close()
|
||||
for _, stmt := range []string{
|
||||
"CREATE TABLE nodes (public_key TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL, last_seen TEXT, advert_count INTEGER)",
|
||||
"CREATE TABLE observers (id TEXT, name TEXT, iata TEXT)",
|
||||
"INSERT INTO nodes VALUES ('BBBB2222', 'Repeater-2', 'repeater', 0, 0, '2024-02-02T00:00:00Z', 3)",
|
||||
} {
|
||||
if _, err := rw.Exec(stmt); err != nil {
|
||||
t.Fatalf("seed exec %q: %v", stmt, err)
|
||||
}
|
||||
}
|
||||
|
||||
db, err := OpenDB(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.conn.Close()
|
||||
|
||||
store := NewPacketStore(db, nil)
|
||||
nodes := store.getAllNodes()
|
||||
if len(nodes) != 1 {
|
||||
t.Fatalf("expected 1 row from fallback rung, got %d", len(nodes))
|
||||
}
|
||||
n := nodes[0]
|
||||
if n.PublicKey != "BBBB2222" {
|
||||
t.Errorf("PublicKey mismatch: got %q", n.PublicKey)
|
||||
}
|
||||
if n.FirstSeen != "" {
|
||||
t.Errorf("FirstSeen should be empty when nodes.first_seen column is missing, got %q", n.FirstSeen)
|
||||
}
|
||||
if n.ObservationCount != 3 {
|
||||
t.Errorf("ObservationCount should still populate from advert_count fallback, got %d", n.ObservationCount)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,6 +62,16 @@ type NeighborEdge struct {
|
||||
Ambiguous bool // multiple candidates or zero candidates
|
||||
Candidates []string // candidate pubkeys when ambiguous
|
||||
Resolved bool // true if auto-resolved via Jaccard
|
||||
// CountsByMode tallies sightings broken down by hash-prefix mode in bytes
|
||||
// (1, 2, or 3). Firmware path-byte encoding (Packet.cpp:13-18) sets
|
||||
// hash_size = (pathByte>>6)+1 with values 1/2/3 valid and 4 reserved.
|
||||
// 1-byte prefixes collide ~8-way across a typical mesh; 3-byte are
|
||||
// effectively unambiguous. Bucket 0 is the legacy/unknown bucket used
|
||||
// for edges loaded from the persisted neighbor_edges snapshot (which
|
||||
// stores only the flat Count). Sum of values == Count by construction.
|
||||
// Issue #1638 — lets the frontend weight confidence by ambiguity rather
|
||||
// than treating every observation as equal evidence.
|
||||
CountsByMode map[int]int
|
||||
}
|
||||
|
||||
// Score computes the affinity score at query time with time decay.
|
||||
@@ -106,6 +116,26 @@ func (e *NeighborEdge) AvgSNR() float64 {
|
||||
return e.SNRSum / float64(e.SNRCount)
|
||||
}
|
||||
|
||||
// incCountsByMode bumps the per-hash-mode tally on the edge based on the
|
||||
// observed prefix length (hex chars / 2 = bytes). Per firmware
|
||||
// firmware/src/Packet.cpp:13-18 (hash_size = (pathByte>>6)+1), valid wire
|
||||
// modes are 1, 2 or 3 bytes; hash_size==4 is reserved. Anything outside
|
||||
// 1/2/3 falls into the legacy/unknown bucket (0) so we don't lose the
|
||||
// observation entirely. Issue #1638.
|
||||
func incCountsByMode(e *NeighborEdge, prefix string) {
|
||||
if e.CountsByMode == nil {
|
||||
e.CountsByMode = make(map[int]int)
|
||||
}
|
||||
bytes := len(prefix) / 2
|
||||
switch bytes {
|
||||
case 1, 2, 3:
|
||||
// known firmware hash mode
|
||||
default:
|
||||
bytes = 0
|
||||
}
|
||||
e.CountsByMode[bytes]++
|
||||
}
|
||||
|
||||
// ─── NeighborGraph ─────────────────────────────────────────────────────────────
|
||||
|
||||
// NeighborGraph is a cached, in-memory first-hop neighbor affinity graph.
|
||||
@@ -358,12 +388,13 @@ func (g *NeighborGraph) upsertEdge(pubkeyA, pubkeyB, prefix, observer string, sn
|
||||
e, exists := g.edges[key]
|
||||
if !exists {
|
||||
e = &NeighborEdge{
|
||||
NodeA: key.A,
|
||||
NodeB: key.B,
|
||||
Prefix: prefix,
|
||||
Observers: make(map[string]bool),
|
||||
FirstSeen: ts,
|
||||
LastSeen: ts,
|
||||
NodeA: key.A,
|
||||
NodeB: key.B,
|
||||
Prefix: prefix,
|
||||
Observers: make(map[string]bool),
|
||||
FirstSeen: ts,
|
||||
LastSeen: ts,
|
||||
CountsByMode: make(map[int]int),
|
||||
}
|
||||
g.edges[key] = e
|
||||
g.byNode[key.A] = append(g.byNode[key.A], e)
|
||||
@@ -371,6 +402,7 @@ func (g *NeighborGraph) upsertEdge(pubkeyA, pubkeyB, prefix, observer string, sn
|
||||
}
|
||||
|
||||
e.Count++
|
||||
incCountsByMode(e, prefix)
|
||||
if ts.After(e.LastSeen) {
|
||||
e.LastSeen = ts
|
||||
}
|
||||
@@ -421,20 +453,22 @@ func (g *NeighborGraph) upsertEdgeWithCandidates(knownPK, prefix string, candida
|
||||
e, exists := g.edges[key]
|
||||
if !exists {
|
||||
e = &NeighborEdge{
|
||||
NodeA: key.A,
|
||||
NodeB: "",
|
||||
Prefix: prefix,
|
||||
Observers: make(map[string]bool),
|
||||
Ambiguous: true,
|
||||
Candidates: filtered,
|
||||
FirstSeen: ts,
|
||||
LastSeen: ts,
|
||||
NodeA: key.A,
|
||||
NodeB: "",
|
||||
Prefix: prefix,
|
||||
Observers: make(map[string]bool),
|
||||
Ambiguous: true,
|
||||
Candidates: filtered,
|
||||
FirstSeen: ts,
|
||||
LastSeen: ts,
|
||||
CountsByMode: make(map[int]int),
|
||||
}
|
||||
g.edges[key] = e
|
||||
g.byNode[knownPK] = append(g.byNode[knownPK], e)
|
||||
}
|
||||
|
||||
e.Count++
|
||||
incCountsByMode(e, prefix)
|
||||
if ts.After(e.LastSeen) {
|
||||
e.LastSeen = ts
|
||||
}
|
||||
@@ -653,6 +687,12 @@ func (g *NeighborGraph) resolveEdge(oldKey edgeKey, e *NeighborEdge, knownNode,
|
||||
for obs := range e.Observers {
|
||||
existing.Observers[obs] = true
|
||||
}
|
||||
if existing.CountsByMode == nil {
|
||||
existing.CountsByMode = make(map[int]int)
|
||||
}
|
||||
for m, c := range e.CountsByMode {
|
||||
existing.CountsByMode[m] += c
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -834,3 +834,63 @@ func BenchmarkBuildFromStore(b *testing.B) {
|
||||
BuildFromStore(store)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildNeighborGraph_CountsByMode (issue #1638): verify per-hash-mode
|
||||
// edge counts are tracked separately from the flat Count, so the frontend
|
||||
// confidence indicator can weight 3-byte (effectively unambiguous) sightings
|
||||
// higher than 1-byte (high-collision) sightings. Modes track firmware-valid
|
||||
// hash sizes 1/2/3 per Packet.cpp:13-18.
|
||||
func TestBuildNeighborGraph_CountsByMode(t *testing.T) {
|
||||
// Use a unique-bbbb-prefix R1 so 1/2/3-byte prefixes all resolve to it.
|
||||
nodes := []nodeInfo{
|
||||
{Role: "repeater", PublicKey: "aaaa1111", Name: "NodeX"},
|
||||
{Role: "repeater", PublicKey: "bbbb2222", Name: "NodeR1"},
|
||||
{Role: "repeater", PublicKey: "cccc3333", Name: "Obs"},
|
||||
}
|
||||
// Three ADVERTs from X observed at varying hash modes hitting R1.
|
||||
txs := []*StoreTx{
|
||||
ngMakeTx(1, 4, ngFromNodeJSON("aaaa1111"), []*StoreObs{
|
||||
ngMakeObs("cccc3333", `["bb"]`, nowStr, nil), // 1-byte
|
||||
}),
|
||||
ngMakeTx(2, 4, ngFromNodeJSON("aaaa1111"), []*StoreObs{
|
||||
ngMakeObs("cccc3333", `["bbbb"]`, nowStr, nil), // 2-byte
|
||||
}),
|
||||
ngMakeTx(3, 4, ngFromNodeJSON("aaaa1111"), []*StoreObs{
|
||||
ngMakeObs("cccc3333", `["bbbb22"]`, nowStr, nil), // 3-byte
|
||||
}),
|
||||
}
|
||||
store := ngTestStore(nodes, txs)
|
||||
g := BuildFromStore(store)
|
||||
|
||||
edges := g.Neighbors("aaaa1111")
|
||||
var xr1 *NeighborEdge
|
||||
for _, e := range edges {
|
||||
other := e.NodeB
|
||||
if e.NodeA != "aaaa1111" {
|
||||
other = e.NodeA
|
||||
}
|
||||
if other == "bbbb2222" {
|
||||
xr1 = e
|
||||
break
|
||||
}
|
||||
}
|
||||
if xr1 == nil {
|
||||
t.Fatalf("expected X↔R1 edge, got %d edges", len(edges))
|
||||
}
|
||||
// Back-compat: flat Count == 3.
|
||||
if xr1.Count != 3 {
|
||||
t.Errorf("expected Count=3, got %d", xr1.Count)
|
||||
}
|
||||
if xr1.CountsByMode == nil {
|
||||
t.Fatalf("expected CountsByMode populated, got nil")
|
||||
}
|
||||
if got := xr1.CountsByMode[1]; got != 1 {
|
||||
t.Errorf("CountsByMode[1] = %d, want 1", got)
|
||||
}
|
||||
if got := xr1.CountsByMode[2]; got != 1 {
|
||||
t.Errorf("CountsByMode[2] = %d, want 1", got)
|
||||
}
|
||||
if got := xr1.CountsByMode[3]; got != 1 {
|
||||
t.Errorf("CountsByMode[3] = %d, want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,19 +54,35 @@ func loadNeighborEdgesFromDB(conn *sql.DB) *NeighborGraph {
|
||||
g.mu.Lock()
|
||||
e, exists := g.edges[key]
|
||||
if !exists {
|
||||
// Persisted snapshot stores only the flat Count — no per-mode
|
||||
// breakdown. Synthesize CountsByMode by attributing all Count
|
||||
// to the legacy/unknown bucket (0) so the invariant
|
||||
// sum(CountsByMode) == Count holds for downstream consumers.
|
||||
// Issue #1638 adv-#1: legacy-edge invariant.
|
||||
cbm := make(map[int]int)
|
||||
if cnt > 0 {
|
||||
cbm[0] = cnt
|
||||
}
|
||||
e = &NeighborEdge{
|
||||
NodeA: key.A,
|
||||
NodeB: key.B,
|
||||
Observers: make(map[string]bool),
|
||||
FirstSeen: ts,
|
||||
LastSeen: ts,
|
||||
Count: cnt,
|
||||
NodeA: key.A,
|
||||
NodeB: key.B,
|
||||
Observers: make(map[string]bool),
|
||||
FirstSeen: ts,
|
||||
LastSeen: ts,
|
||||
Count: cnt,
|
||||
CountsByMode: cbm,
|
||||
}
|
||||
g.edges[key] = e
|
||||
g.byNode[key.A] = append(g.byNode[key.A], e)
|
||||
g.byNode[key.B] = append(g.byNode[key.B], e)
|
||||
} else {
|
||||
e.Count += cnt
|
||||
if e.CountsByMode == nil {
|
||||
e.CountsByMode = make(map[int]int)
|
||||
}
|
||||
if cnt > 0 {
|
||||
e.CountsByMode[0] += cnt
|
||||
}
|
||||
if ts.After(e.LastSeen) {
|
||||
e.LastSeen = ts
|
||||
}
|
||||
@@ -131,6 +147,63 @@ func resolvePathForObs(pathJSON, observerID string, tx *StoreTx, pm *prefixMap,
|
||||
return resolved
|
||||
}
|
||||
|
||||
// resolvePathForObsColdLoad is the cold-load (Load / loadChunk / scanAndMergeChunk)
|
||||
// variant of resolvePathForObs that gates hop resolution on `unique_prefix`
|
||||
// only. Live ingest uses the affinity/observation-count tiebreak via
|
||||
// resolvePathForObs because it has roughly-current state. Cold load runs
|
||||
// against observations up to retentionHours (168h) old, where today's
|
||||
// affinity winner ≠ historical affinity winner for that prefix — silently
|
||||
// mis-attributing the relay (PR #1643 R1 munger #1, "time-travel attribution
|
||||
// gate").
|
||||
//
|
||||
// Behavior: hops whose prefix maps to exactly one repeater resolve as
|
||||
// usual; hops whose prefix maps to multiple candidates return nil and
|
||||
// increment skipped (caller-owned counter for observability — a single
|
||||
// summary log line at the end of Load surfaces the total).
|
||||
//
|
||||
// Under-attribute > mis-attribute (reviewer consensus on PR #1643).
|
||||
func resolvePathForObsColdLoad(pathJSON, observerID string, tx *StoreTx, pm *prefixMap, skipped *int) []*string {
|
||||
hops := parsePathJSON(pathJSON)
|
||||
if len(hops) == 0 {
|
||||
return nil
|
||||
}
|
||||
resolved := make([]*string, len(hops))
|
||||
for i, hop := range hops {
|
||||
// unique_prefix iff the prefix maps to exactly one candidate
|
||||
// after the observer-known nonRelay filter. Mirrors the
|
||||
// `len(candidates) == 1 → "unique_prefix"` arm of
|
||||
// resolveWithContext (store.go ~6380). Calling resolveWithContext
|
||||
// with a nil graph and empty context skips the affinity/
|
||||
// observation-count tiers entirely — but tier-4
|
||||
// observation_count_fallback would still pick a winner for
|
||||
// ambiguous prefixes, which is exactly what we must NOT do.
|
||||
// Hence the explicit candidate-count check here.
|
||||
h := strings.ToLower(hop)
|
||||
candidates := pm.m[h]
|
||||
if len(pm.nonRelay) > 0 && len(candidates) > 0 {
|
||||
filtered := candidates[:0:0]
|
||||
for j := range candidates {
|
||||
if _, isListener := pm.nonRelay[strings.ToLower(candidates[j].PublicKey)]; isListener {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, candidates[j])
|
||||
}
|
||||
candidates = filtered
|
||||
}
|
||||
if len(candidates) == 1 {
|
||||
pk := strings.ToLower(candidates[0].PublicKey)
|
||||
resolved[i] = &pk
|
||||
continue
|
||||
}
|
||||
// Ambiguous (len > 1) or no_match (len == 0). Under-attribute.
|
||||
if len(candidates) > 1 && skipped != nil {
|
||||
*skipped++
|
||||
}
|
||||
// resolved[i] stays nil; extractResolvedPubkeys filters it out.
|
||||
}
|
||||
return resolved
|
||||
}
|
||||
|
||||
// marshalResolvedPath converts []*string to JSON for in-memory caching.
|
||||
func marshalResolvedPath(rp []*string) string {
|
||||
if len(rp) == 0 {
|
||||
|
||||
@@ -0,0 +1,125 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// TestNeighborPersist_LegacyEdgeInvariant (#1638 adv-#1): edges loaded from
|
||||
// the persisted neighbor_edges snapshot have no per-hash-mode breakdown
|
||||
// (the table stores only the flat Count). Loader MUST synthesize
|
||||
// CountsByMode so the invariant sum(CountsByMode) == Count holds — all
|
||||
// pre-existing observations land in bucket 0 (legacy/unknown, conservative
|
||||
// weight in the JS confidence indicator).
|
||||
func TestNeighborPersist_LegacyEdgeInvariant(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "neighbor_legacy.db")
|
||||
rw, err := sql.Open("sqlite", "file:"+dbPath+"?_journal_mode=WAL")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rw.Close()
|
||||
if _, err := rw.Exec(`CREATE TABLE neighbor_edges (
|
||||
node_a TEXT NOT NULL,
|
||||
node_b TEXT NOT NULL,
|
||||
count INTEGER DEFAULT 1,
|
||||
last_seen TEXT,
|
||||
PRIMARY KEY (node_a, node_b)
|
||||
)`); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
now := time.Now().UTC().Format(time.RFC3339)
|
||||
if _, err := rw.Exec(
|
||||
`INSERT INTO neighbor_edges (node_a, node_b, count, last_seen) VALUES (?, ?, ?, ?)`,
|
||||
"aaaa", "bbbb", 7, now,
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
g := loadNeighborEdgesFromDB(rw)
|
||||
edges := g.AllEdges()
|
||||
if len(edges) != 1 {
|
||||
t.Fatalf("expected 1 edge, got %d", len(edges))
|
||||
}
|
||||
e := edges[0]
|
||||
if e.Count != 7 {
|
||||
t.Fatalf("expected Count=7, got %d", e.Count)
|
||||
}
|
||||
if e.CountsByMode == nil {
|
||||
t.Fatalf("expected CountsByMode synthesized for legacy edge, got nil")
|
||||
}
|
||||
// All flat-count observations must land in bucket 0 (legacy/unknown).
|
||||
if got := e.CountsByMode[0]; got != 7 {
|
||||
t.Errorf("CountsByMode[0] = %d, want 7 (all legacy count in bucket 0)", got)
|
||||
}
|
||||
// Buckets 1/2/3 must be empty — no real wire-mode evidence on a
|
||||
// snapshot-only edge.
|
||||
for _, m := range []int{1, 2, 3} {
|
||||
if got := e.CountsByMode[m]; got != 0 {
|
||||
t.Errorf("CountsByMode[%d] = %d, want 0", m, got)
|
||||
}
|
||||
}
|
||||
// Invariant: sum(CountsByMode) == Count.
|
||||
sum := 0
|
||||
for _, c := range e.CountsByMode {
|
||||
sum += c
|
||||
}
|
||||
if sum != e.Count {
|
||||
t.Errorf("invariant violated: sum(CountsByMode)=%d, Count=%d", sum, e.Count)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNeighborPersist_LegacyEdgeMergeOnReload covers the "row appears twice
|
||||
// in the snapshot" path (loader's else-branch): subsequent counts must
|
||||
// accumulate into bucket 0 too, preserving the invariant.
|
||||
func TestNeighborPersist_LegacyEdgeMergeOnReload(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "neighbor_legacy_merge.db")
|
||||
rw, err := sql.Open("sqlite", "file:"+dbPath+"?_journal_mode=WAL")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer rw.Close()
|
||||
// No PRIMARY KEY so we can insert two rows for the same (a,b) pair to
|
||||
// exercise the loader's else-branch.
|
||||
if _, err := rw.Exec(`CREATE TABLE neighbor_edges (
|
||||
node_a TEXT NOT NULL,
|
||||
node_b TEXT NOT NULL,
|
||||
count INTEGER DEFAULT 1,
|
||||
last_seen TEXT
|
||||
)`); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
now := time.Now().UTC().Format(time.RFC3339)
|
||||
for _, cnt := range []int{3, 4} {
|
||||
if _, err := rw.Exec(
|
||||
`INSERT INTO neighbor_edges (node_a, node_b, count, last_seen) VALUES (?, ?, ?, ?)`,
|
||||
"aaaa", "bbbb", cnt, now,
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
g := loadNeighborEdgesFromDB(rw)
|
||||
edges := g.AllEdges()
|
||||
if len(edges) != 1 {
|
||||
t.Fatalf("expected 1 merged edge, got %d", len(edges))
|
||||
}
|
||||
e := edges[0]
|
||||
if e.Count != 7 {
|
||||
t.Fatalf("expected merged Count=7, got %d", e.Count)
|
||||
}
|
||||
if got := e.CountsByMode[0]; got != 7 {
|
||||
t.Errorf("CountsByMode[0] = %d, want 7 after merge", got)
|
||||
}
|
||||
sum := 0
|
||||
for _, c := range e.CountsByMode {
|
||||
sum += c
|
||||
}
|
||||
if sum != e.Count {
|
||||
t.Errorf("invariant violated after merge: sum(CountsByMode)=%d, Count=%d", sum, e.Count)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,93 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Issue #1290 (MAJOR-1, adversarial review of PR #1624) — regression guard.
|
||||
// GetNonRelayObserverPubkeys() returns LOWER(id); the disambiguator
|
||||
// (pm.nonRelay) also uses lowercase. GetNodeHealth previously used
|
||||
// UPPERCASE for both insert and lookup which happens to work by symmetry,
|
||||
// but any refactor that changes how pkt.ObserverID is normalized would
|
||||
// silently break the badge. This test pins lowercase as the convention by
|
||||
// seeding an observer.id with mixed-case packet ObserverID and asserting
|
||||
// the listener badge is rendered for the matching observer in HeardBy.
|
||||
func TestNodeHealth_CanRelayCaseInsensitive_Issue1290(t *testing.T) {
|
||||
srv, router := setupTestServer(t)
|
||||
|
||||
// DB row: observer id is the canonical LOWERCASE pubkey with can_relay=0.
|
||||
const obsIDLower = "deadbeefcafe1290"
|
||||
const obsIDMixed = "DeadBeefCafe1290" // packet observer-id w/ mixed case
|
||||
const nodePubkey = "aabbccdd11223344" // seeded by seedTestData
|
||||
now := time.Now().UTC().Format(time.RFC3339)
|
||||
// The test fixture's observers table predates the can_relay migration;
|
||||
// add both columns (matches dbschema migrations).
|
||||
for _, ddl := range []string{
|
||||
`ALTER TABLE observers ADD COLUMN can_relay INTEGER DEFAULT 1`,
|
||||
`ALTER TABLE observers ADD COLUMN can_relay_seen INTEGER DEFAULT 0`,
|
||||
} {
|
||||
if _, err := srv.store.db.conn.Exec(ddl); err != nil {
|
||||
t.Fatalf("alter: %v", err)
|
||||
}
|
||||
}
|
||||
if _, err := srv.store.db.conn.Exec(
|
||||
`INSERT INTO observers (id, name, iata, last_seen, first_seen, packet_count, can_relay, can_relay_seen)
|
||||
VALUES (?, 'ListenerOnly', 'SJC', ?, '2026-01-01T00:00:00Z', 1, 0, 1)`,
|
||||
obsIDLower, now); err != nil {
|
||||
t.Fatalf("seed observer: %v", err)
|
||||
}
|
||||
|
||||
// In-memory packet with the MIXED-case observer id so the badge resolver
|
||||
// must lower-case both sides to match against the lower-cased pubkey set.
|
||||
snr := 7.0
|
||||
srv.store.mu.Lock()
|
||||
if srv.store.byNode == nil {
|
||||
srv.store.byNode = make(map[string][]*StoreTx)
|
||||
}
|
||||
srv.store.byNode[nodePubkey] = append(srv.store.byNode[nodePubkey], &StoreTx{
|
||||
Hash: "1290casebadge00",
|
||||
FirstSeen: now,
|
||||
SNR: &snr,
|
||||
ObservationCount: 1,
|
||||
ObserverID: obsIDMixed,
|
||||
ObserverName: "ListenerOnly",
|
||||
})
|
||||
srv.store.mu.Unlock()
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/nodes/"+nodePubkey+"/health", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d (body: %s)", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
var body map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
|
||||
t.Fatalf("json: %v", err)
|
||||
}
|
||||
obs, ok := body["observers"].([]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("expected observers array, got %T", body["observers"])
|
||||
}
|
||||
var found bool
|
||||
for _, raw := range obs {
|
||||
row, ok := raw.(map[string]interface{})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if row["observer_id"] != obsIDMixed {
|
||||
continue
|
||||
}
|
||||
found = true
|
||||
if row["can_relay"] != false {
|
||||
t.Errorf("listener observer with can_relay=0 + mixed-case ObserverID: expected can_relay=false, got %v", row["can_relay"])
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("did not find observer %q in HeardBy rows; got %v", obsIDMixed, obs)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,738 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
"golang.org/x/sync/singleflight"
|
||||
)
|
||||
|
||||
// reachScanRowLimit hard-caps the windowed observation scan so a hot relay node
// with weeks of traffic can't pull an unbounded result set into memory. A node
// with >200k matching observations in the window is far past dashboard scale;
// beyond the cap the counts are a (still representative) truncation.
//
// The LIKE filter is unavoidably a text scan of path_json over the
// timestamp-narrowed window — an indexed path-token column would need an
// ingestor-side schema migration (the server is read-only by invariant), so
// it's a follow-up.
//
// Declared as a var (not a const) so tests can lower the cap to exercise the
// truncation path without inserting 200k rows.
var reachScanRowLimit = 200000
|
||||
|
||||
// pathRow is one observation fed to attributeDirections. path tokens are
// uppercase hex hop prefixes (as stored in observations.path_json). SNR is a
// value + validity flag (not *float64) to avoid a heap escape per row.
type pathRow struct {
	observerPK  string // lowercase pubkey of the observer (may be "")
	fromPubkey  string // lowercase originator pubkey (may be "")
	payloadType int    // packet payload type; attributeDirections compares it against PayloadADVERT
	path        []string // hop tokens in path order; rows with an empty path are skipped
	snr         float64  // SNR reading; only meaningful when snrValid is true
	snrValid    bool     // true when snr holds a real reading
}
|
||||
|
||||
// obsAgg is the per-observer aggregate that attributeDirections keeps in
// dirCounts.obs for observers credited with hearing the target as last hop.
type obsAgg struct {
	count  int     // number of rows credited to this observer
	snrSum float64 // sum of SNR over the credited rows that had a valid SNR
	snrN   int     // how many credited rows contributed to snrSum
}
|
||||
|
||||
// dirCounts is the result of attributeDirections: directional evidence for
// one target node, keyed by the peer's pubkey.
type dirCounts struct {
	we    map[string]int    // peers the target heard (predecessor hop, or ADVERT originator when the target is first hop)
	they  map[string]int    // peers that heard the target (successor hop, or the observer when the target is last hop)
	obs   map[string]obsAgg // value map — no per-observer heap alloc
	relay int               // rows whose path contained at least one of the target's tokens
}
|
||||
|
||||
// attributeDirections walks each path and attributes directional evidence for
|
||||
// the target node (identified by any token in ourTokens). resolve maps a hop
|
||||
// token → a unique relay pubkey ("" when ambiguous/unknown → skipped). ourPK is
|
||||
// the target's own pubkey (lowercase) so self-edges are ignored.
|
||||
func attributeDirections(rows []pathRow, ourTokens map[string]bool, ourPK string, resolve func(string) string) dirCounts {
|
||||
// Size hint: a small constant covers typical neighbour fan-out (dozens)
|
||||
// without over-allocating ~12.5k buckets on a 100k-row scan. Independent
|
||||
// r2 #4: the old `len(rows)/8+1` was ~250× too large for relays with
|
||||
// modest fan-out.
|
||||
const hint = 64
|
||||
d := dirCounts{
|
||||
we: make(map[string]int, hint),
|
||||
they: make(map[string]int, hint),
|
||||
obs: make(map[string]obsAgg, hint),
|
||||
}
|
||||
for _, r := range rows {
|
||||
n := len(r.path)
|
||||
if n == 0 {
|
||||
continue
|
||||
}
|
||||
hit := false
|
||||
for i, tok := range r.path {
|
||||
if !ourTokens[tok] {
|
||||
continue
|
||||
}
|
||||
hit = true
|
||||
// predecessor → we heard it
|
||||
if i > 0 {
|
||||
if pk := resolve(r.path[i-1]); pk != "" && pk != ourPK {
|
||||
d.we[pk]++
|
||||
}
|
||||
} else if r.payloadType == PayloadADVERT && r.fromPubkey != "" && r.fromPubkey != ourPK {
|
||||
d.we[r.fromPubkey]++
|
||||
}
|
||||
// successor → it heard us; or if we're the last hop, the observer did
|
||||
if i < n-1 {
|
||||
if pk := resolve(r.path[i+1]); pk != "" && pk != ourPK {
|
||||
d.they[pk]++
|
||||
}
|
||||
} else if r.observerPK != "" && r.observerPK != ourPK {
|
||||
d.they[r.observerPK]++
|
||||
a := d.obs[r.observerPK] // value copy; read-modify-write
|
||||
a.count++
|
||||
if r.snrValid {
|
||||
a.snrSum += r.snr
|
||||
a.snrN++
|
||||
}
|
||||
d.obs[r.observerPK] = a
|
||||
}
|
||||
}
|
||||
if hit {
|
||||
d.relay++
|
||||
}
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// reliableTokens returns the uppercase hex prefixes (1, 2, 3 byte) of pubkey
|
||||
// that are UNIQUE among relay-capable nodes in pm AND resolve to pubkey itself.
|
||||
// 1-byte prefixes almost always collide and are excluded. The self-check matters
|
||||
// for non-relay targets (companion/sensor): pm only holds path-capable roles, so
|
||||
// a companion's prefix could otherwise be "unique" while pointing at an unrelated
|
||||
// relay — which would then credit that relay's traffic to the companion.
|
||||
func reliableTokens(pubkey string, pm *prefixMap) map[string]bool {
|
||||
out := map[string]bool{}
|
||||
lpk := strings.ToLower(pubkey)
|
||||
for _, l := range []int{2, 4, 6} { // hex chars = 1,2,3 bytes
|
||||
if len(lpk) < l {
|
||||
continue
|
||||
}
|
||||
p := lpk[:l]
|
||||
if pm != nil && len(pm.m[p]) == 1 && strings.EqualFold(pm.m[p][0].PublicKey, pubkey) {
|
||||
out[strings.ToUpper(p)] = true
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// uniqueResolve returns the single relay pubkey (lowercase) for a hop token, or
|
||||
// "" when the token resolves to zero or multiple candidates (conservative).
|
||||
// Callers should memoize across a request (see newResolver) so the per-hop
|
||||
// ToLower + map lookup runs once per distinct token, not once per row.
|
||||
func uniqueResolve(pm *prefixMap, token string) string {
|
||||
if pm == nil {
|
||||
return ""
|
||||
}
|
||||
cands := pm.m[strings.ToLower(token)]
|
||||
if len(cands) == 1 {
|
||||
return strings.ToLower(cands[0].PublicKey)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// parsePathTokens extracts the double-quoted hop tokens from a path_json
// array such as `["AA","01FA","BB"]`, uppercasing each one.
//
// It is a hand-rolled scanner rather than json.Unmarshal so the hot scan path
// avoids reflection and per-row interface allocations. It assumes the tokens
// are plain hex strings and therefore does not handle JSON escape sequences.
// A token that is already uppercase is a substring of pj (strings.ToUpper
// returns its argument unchanged in that case); only a lowercase hop is
// copied.
//
// The result is sized exactly from the number of quote pairs, so no oversized
// backing array is retained per row. Returns nil when pj contains no complete
// quoted token (empty array, empty string, or a lone quote). An unterminated
// trailing token is ignored.
func parsePathTokens(pj string) []string {
	pairs := strings.Count(pj, `"`) / 2
	if pairs == 0 {
		return nil
	}
	out := make([]string, 0, pairs)
	rest := pj
	for {
		open := strings.IndexByte(rest, '"')
		if open < 0 {
			break
		}
		rest = rest[open+1:]
		end := strings.IndexByte(rest, '"')
		if end < 0 {
			break // unterminated token
		}
		out = append(out, strings.ToUpper(rest[:end]))
		rest = rest[end+1:]
	}
	return out
}
|
||||
|
||||
// newResolver returns a memoized hop-token → pubkey resolver. Paths reuse the
|
||||
// same hop tokens across thousands of rows, so caching collapses the repeated
|
||||
// ToLower + prefix-map lookups to once per distinct token.
|
||||
func newResolver(pm *prefixMap) func(string) string {
|
||||
cache := make(map[string]string)
|
||||
return func(tok string) string {
|
||||
if pk, ok := cache[tok]; ok {
|
||||
return pk
|
||||
}
|
||||
pk := uniqueResolve(pm, tok)
|
||||
cache[tok] = pk
|
||||
return pk
|
||||
}
|
||||
}
|
||||
|
||||
// JSON shapes of the node-reach response. Pointer-typed fields marshal as
// null when unset. Field order is the order of keys in the marshalled output.
type (
	// NodeReachInfo identifies the node the response is about.
	NodeReachInfo struct {
		Pubkey    string   `json:"pubkey"`
		Name      string   `json:"name"`
		Role      string   `json:"role"`
		Lat       *float64 `json:"lat"`
		Lon       *float64 `json:"lon"`
		FirstSeen string   `json:"first_seen"`
	}

	// NodeReachWindow describes the lookback window that was applied.
	NodeReachWindow struct {
		Days  int    `json:"days"`
		Since string `json:"since"`
	}

	// NodeReachImportance carries the summary counters for the node.
	NodeReachImportance struct {
		NeighborDegree     int `json:"neighbor_degree"`
		DegreeRank         int `json:"degree_rank"`
		NodesWithEdges     int `json:"nodes_with_edges"`
		RelayObservations  int `json:"relay_observations"`
		BidirectionalLinks int `json:"bidirectional_links"`
		DirectObservers    int `json:"direct_observers"`
	}

	// NodeReachObserver is one observer that heard the node directly.
	NodeReachObserver struct {
		Pubkey     string   `json:"pubkey"`
		Name       string   `json:"name"`
		Count      int      `json:"count"`
		AvgSNR     *float64 `json:"avg_snr"`
		Lat        *float64 `json:"lat"`
		Lon        *float64 `json:"lon"`
		DistanceKm *float64 `json:"distance_km"`
	}

	// NodeReachLink is one neighbour with per-direction hearing counts.
	NodeReachLink struct {
		Pubkey     string   `json:"pubkey"`
		Name       string   `json:"name"`
		Role       string   `json:"role"`
		Lat        *float64 `json:"lat"`
		Lon        *float64 `json:"lon"`
		WeHear     int      `json:"we_hear"`
		TheyHear   int      `json:"they_hear"`
		Bottleneck int      `json:"bottleneck"`
		Bidir      bool     `json:"bidir"`
		DistanceKm *float64 `json:"distance_km"`
	}

	// NodeReachResponse is the top-level payload of the reach endpoint.
	NodeReachResponse struct {
		Node            NodeReachInfo       `json:"node"`
		Window          NodeReachWindow     `json:"window"`
		ReliableTokens  []string            `json:"reliable_tokens"`
		Importance      NodeReachImportance `json:"importance"`
		DirectObservers []NodeReachObserver `json:"direct_observers"`
		Links           []NodeReachLink     `json:"links"`
	}
)
|
||||
|
||||
// fptr returns a pointer to a fresh copy of v; every call yields a distinct
// pointer.
func fptr(v float64) *float64 {
	p := new(float64)
	*p = v
	return p
}
|
||||
|
||||
// gpsPtrs returns (lat,lon) pointers, nil when the node has no GPS.
|
||||
func gpsPtrs(info nodeInfo) (*float64, *float64) {
|
||||
if !info.HasGPS {
|
||||
return nil, nil
|
||||
}
|
||||
return fptr(info.Lat), fptr(info.Lon)
|
||||
}
|
||||
|
||||
// clampDays restricts the lookback window to the inclusive range [1, 30].
// Values below 1 become 1 and values above 30 become 30.
func clampDays(d int) int {
	switch {
	case d < 1:
		return 1
	case d > 30:
		return 30
	default:
		return d
	}
}
|
||||
|
||||
// --- bounded TTL cache ---
//
// The time window is what keeps a reach computation affordable; this cache
// only exists so dashboard polling does not trigger a recompute each time.
// Keys have the form "<pubkey>|<days>|g<blacklist generation>".
const (
	// reachCacheTTL is how long a cached response remains servable.
	reachCacheTTL = 5 * time.Minute

	// reachCacheMax caps the number of cached entries. An entry cap was
	// chosen over a byte budget to keep the bookkeeping trivial; the author's
	// estimate of roughly 2KB of marshalled JSON per entry puts the worst
	// case well under 1MB.
	reachCacheMax = 256
)
|
||||
|
||||
// reachCacheEntry is one cached reach response.
type reachCacheEntry struct {
	// at is the insertion time; expiry and eviction age are measured from it.
	at time.Time
	// raw is the marshalled JSON body.
	raw []byte
}
|
||||
|
||||
// reachState bundles per-server reach caches. Was a set of package-level
|
||||
// globals — moved onto *Server so two Server instances (tests, future
|
||||
// per-listener) don't share observable state (Independent r2 #2).
|
||||
type reachState struct {
|
||||
cacheMu sync.RWMutex
|
||||
cache map[string]reachCacheEntry
|
||||
// sf dedups concurrent cold-cache requests for the same key so N
|
||||
// simultaneous callers run the scan + attribution once, not N times.
|
||||
sf singleflight.Group
|
||||
|
||||
// lastSeenBlacklistGen is the BlacklistGeneration() value that the cache
|
||||
// was last reconciled with. When the live generation moves past this
|
||||
// value, the cache is purged wholesale on the next request to prevent
|
||||
// prior-gen entries from accumulating until their TTL expires (#1629
|
||||
// round-2, adversarial #5).
|
||||
lastSeenBlacklistGen atomic.Uint64
|
||||
|
||||
degreeMu sync.Mutex
|
||||
degreeSnap *degreeSnapshot
|
||||
}
|
||||
|
||||
// reachCacheGet returns the cached marshalled JSON for key. The returned slice
|
||||
// is shared (not copied): it is treated as immutable — only ever handed to
|
||||
// w.Write — so callers MUST NOT mutate it.
|
||||
func (s *Server) reachCacheGet(key string) ([]byte, bool) {
|
||||
s.reach.cacheMu.RLock()
|
||||
defer s.reach.cacheMu.RUnlock()
|
||||
e, ok := s.reach.cache[key]
|
||||
if !ok || time.Since(e.at) > reachCacheTTL {
|
||||
return nil, false
|
||||
}
|
||||
return e.raw, true
|
||||
}
|
||||
|
||||
// reachCacheLen returns the current entry count in the reach response cache.
|
||||
// Test helper — exposes the size without leaking the internal mutex/map.
|
||||
func (s *Server) reachCacheLen() int {
|
||||
s.reach.cacheMu.RLock()
|
||||
defer s.reach.cacheMu.RUnlock()
|
||||
return len(s.reach.cache)
|
||||
}
|
||||
|
||||
// reachPurgeIfBlacklistGenChanged drops every cached entry when the live
|
||||
// blacklist generation has advanced past the cache's last-seen value. CAS
|
||||
// gates the purge so concurrent callers only do the work once per gen bump
|
||||
// (#1629 round-2, adversarial #5).
|
||||
func (s *Server) reachPurgeIfBlacklistGenChanged(gen uint64) {
|
||||
seen := s.reach.lastSeenBlacklistGen.Load()
|
||||
if gen == seen {
|
||||
return
|
||||
}
|
||||
// CAS gates the actual purge to a single winner on a given gen bump.
|
||||
if !s.reach.lastSeenBlacklistGen.CompareAndSwap(seen, gen) {
|
||||
// Another goroutine already advanced (and purged). Done.
|
||||
return
|
||||
}
|
||||
s.reach.cacheMu.Lock()
|
||||
s.reach.cache = nil
|
||||
s.reach.cacheMu.Unlock()
|
||||
}
|
||||
|
||||
// isHexPubkey reports whether s is exactly 64 characters drawn from
// [0-9a-f]. Uppercase hex digits are rejected on purpose: the handler
// lowercases its input before calling this.
func isHexPubkey(s string) bool {
	const wantLen = 64
	if len(s) != wantLen {
		return false
	}
	for _, c := range []byte(s) {
		switch {
		case '0' <= c && c <= '9':
		case 'a' <= c && c <= 'f':
		default:
			return false
		}
	}
	return true
}
|
||||
|
||||
func (s *Server) reachCachePut(key string, raw []byte) {
|
||||
s.reach.cacheMu.Lock()
|
||||
defer s.reach.cacheMu.Unlock()
|
||||
if s.reach.cache == nil {
|
||||
s.reach.cache = map[string]reachCacheEntry{}
|
||||
}
|
||||
if _, exists := s.reach.cache[key]; !exists && len(s.reach.cache) >= reachCacheMax {
|
||||
s.evictReachLocked()
|
||||
}
|
||||
s.reach.cache[key] = reachCacheEntry{at: time.Now(), raw: raw}
|
||||
}
|
||||
|
||||
// evictReachLocked drops expired entries first; if still at the cap it evicts
|
||||
// the single oldest entry. Avoids the full-map wipe that thrashed every cached
|
||||
// key once the cap was reached. Caller holds s.reach.cacheMu (write).
|
||||
func (s *Server) evictReachLocked() {
|
||||
now := time.Now()
|
||||
for k, e := range s.reach.cache {
|
||||
if now.Sub(e.at) > reachCacheTTL {
|
||||
delete(s.reach.cache, k)
|
||||
}
|
||||
}
|
||||
if len(s.reach.cache) < reachCacheMax {
|
||||
return
|
||||
}
|
||||
var oldestKey string
|
||||
var oldestAt time.Time
|
||||
first := true
|
||||
for k, e := range s.reach.cache {
|
||||
if first || e.at.Before(oldestAt) {
|
||||
oldestKey, oldestAt, first = k, e.at, false
|
||||
}
|
||||
}
|
||||
if !first {
|
||||
delete(s.reach.cache, oldestKey)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) handleNodeReach(w http.ResponseWriter, r *http.Request) {
|
||||
pubkey := strings.ToLower(mux.Vars(r)["pubkey"])
|
||||
// Reject malformed pubkeys up front (cheap defense against cache-key
|
||||
// pollution + wasted work on bogus IDs).
|
||||
if !isHexPubkey(pubkey) {
|
||||
writeError(w, 400, "invalid pubkey: expected 64 hex chars")
|
||||
return
|
||||
}
|
||||
if s.cfg != nil && s.cfg.IsBlacklisted(pubkey) {
|
||||
writeError(w, 404, "Not found")
|
||||
return
|
||||
}
|
||||
if s.isPubkeyHidden(pubkey) {
|
||||
writeError(w, 404, "Not found")
|
||||
return
|
||||
}
|
||||
days := 7
|
||||
if v := r.URL.Query().Get("days"); v != "" {
|
||||
if n, err := strconv.Atoi(v); err == nil {
|
||||
days = n
|
||||
}
|
||||
}
|
||||
days = clampDays(days)
|
||||
|
||||
// cacheKey includes the blacklist generation so any mutation via
|
||||
// SetNodeBlacklist invalidates all prior reach cache entries on the
|
||||
// next request (#1629). Without the generation suffix a node added
|
||||
// to the blacklist post-warm would keep being served the cached
|
||||
// non-blacklisted response until the TTL expires.
|
||||
var gen uint64
|
||||
if s.cfg != nil {
|
||||
gen = s.cfg.BlacklistGeneration()
|
||||
}
|
||||
// Purge prior-gen entries wholesale when the generation advances so a
|
||||
// steady stream of operator blacklist edits cannot leak cache entries
|
||||
// up to the TTL. Cheap: one map reset under the cache mutex, only when
|
||||
// the gen actually moved (#1629 round-2, adversarial #5).
|
||||
s.reachPurgeIfBlacklistGenChanged(gen)
|
||||
cacheKey := pubkey + "|" + strconv.Itoa(days) + "|g" + strconv.FormatUint(gen, 10)
|
||||
if raw, ok := s.reachCacheGet(cacheKey); ok {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.Write(raw)
|
||||
return
|
||||
}
|
||||
|
||||
// singleflight: collapse a thundering herd on a cold key to one scan. The
|
||||
// shared computation uses the triggering request's context; a disconnect
|
||||
// there can cancel the in-flight scan for all waiters (acceptable — the
|
||||
// next request recomputes).
|
||||
v, err, _ := s.reach.sf.Do(cacheKey, func() (interface{}, error) {
|
||||
if raw, ok := s.reachCacheGet(cacheKey); ok {
|
||||
return raw, nil
|
||||
}
|
||||
resp, ok, cErr := s.computeNodeReach(r.Context(), pubkey, days)
|
||||
if cErr != nil {
|
||||
// Real backend failure (e.g. DB scan exploded) — propagate so the
|
||||
// caller renders 500 instead of the misleading empty-reach
|
||||
// response. Do NOT cache. (#1631)
|
||||
return nil, cErr
|
||||
}
|
||||
if !ok {
|
||||
return []byte(nil), nil
|
||||
}
|
||||
raw, mErr := json.Marshal(resp)
|
||||
if mErr != nil {
|
||||
log.Printf("[reach] marshal failed for %s: %v", cacheKey, mErr)
|
||||
return nil, mErr
|
||||
}
|
||||
s.reachCachePut(cacheKey, raw)
|
||||
return raw, nil
|
||||
})
|
||||
if err != nil {
|
||||
writeError(w, 500, "reach computation failed")
|
||||
return
|
||||
}
|
||||
raw, _ := v.([]byte)
|
||||
if len(raw) == 0 {
|
||||
writeError(w, 404, "Not found")
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.Write(raw)
|
||||
}
|
||||
|
||||
// computeNodeReach does the read-only scan + assembly. ok=false → 404
|
||||
// (target node not present / inputs unavailable). A non-nil error signals a
|
||||
// real backend failure (e.g. DB scan exploded) — caller should render 500,
|
||||
// not 404 (issue #1631).
|
||||
func (s *Server) computeNodeReach(ctx context.Context, pubkey string, days int) (NodeReachResponse, bool, error) {
|
||||
if s.store == nil || s.db == nil || s.db.conn == nil {
|
||||
return NodeReachResponse{}, false, nil
|
||||
}
|
||||
nodeMap := s.buildNodeInfoMap()
|
||||
self, found := nodeMap[pubkey]
|
||||
if !found {
|
||||
return NodeReachResponse{}, false, nil
|
||||
}
|
||||
_, pm := s.store.getCachedNodesAndPM()
|
||||
tokens := reliableTokens(pubkey, pm)
|
||||
|
||||
since := time.Now().UTC().Add(-time.Duration(days) * 24 * time.Hour)
|
||||
sinceEpoch := since.Unix()
|
||||
|
||||
var d dirCounts
|
||||
if len(tokens) > 0 {
|
||||
rows, err := s.scanReachRows(ctx, tokens, sinceEpoch)
|
||||
if err != nil {
|
||||
return NodeReachResponse{}, false, err
|
||||
}
|
||||
d = attributeDirections(rows, tokens, pubkey, newResolver(pm))
|
||||
} else {
|
||||
d = dirCounts{we: map[string]int{}, they: map[string]int{}, obs: map[string]obsAgg{}}
|
||||
}
|
||||
|
||||
// importance: neighbor_edges degree + rank (all-time). Served from a
|
||||
// coarse-TTL snapshot so the full UNION+GROUP-BY aggregate runs at most
|
||||
// once per snapshotTTL, not on every cache miss.
|
||||
degree, rank, nodesWithEdges := s.reachDegreeRank(ctx, pubkey)
|
||||
|
||||
// node first_seen comes from nodeInfo (buildNodeInfoMap folds it in via a
|
||||
// single bulk SELECT). Missing → empty string (the node may be
|
||||
// observer-only or pre-first_seen-schema).
|
||||
firstSeen := self.FirstSeen
|
||||
|
||||
// assemble links
|
||||
links := make([]NodeReachLink, 0, len(d.we)+len(d.they))
|
||||
bidir := 0
|
||||
seen := make(map[string]bool, len(d.we)+len(d.they))
|
||||
for pk := range d.we {
|
||||
seen[pk] = true
|
||||
}
|
||||
for pk := range d.they {
|
||||
seen[pk] = true
|
||||
}
|
||||
for pk := range seen {
|
||||
we, they := d.we[pk], d.they[pk]
|
||||
info := nodeMap[pk]
|
||||
lat, lon := gpsPtrs(info)
|
||||
var dist *float64
|
||||
if self.HasGPS && info.HasGPS {
|
||||
dist = fptr(haversineKm(self.Lat, self.Lon, info.Lat, info.Lon))
|
||||
}
|
||||
b := we > 0 && they > 0
|
||||
if b {
|
||||
bidir++
|
||||
}
|
||||
links = append(links, NodeReachLink{
|
||||
Pubkey: pk, Name: info.Name, Role: info.Role, Lat: lat, Lon: lon,
|
||||
WeHear: we, TheyHear: they, Bottleneck: min(we, they), Bidir: b, DistanceKm: dist,
|
||||
})
|
||||
}
|
||||
sort.Slice(links, func(i, j int) bool {
|
||||
if links[i].Bidir != links[j].Bidir {
|
||||
return links[i].Bidir
|
||||
}
|
||||
if links[i].Bottleneck != links[j].Bottleneck {
|
||||
return links[i].Bottleneck > links[j].Bottleneck
|
||||
}
|
||||
return links[i].WeHear+links[i].TheyHear > links[j].WeHear+links[j].TheyHear
|
||||
})
|
||||
|
||||
// direct observers
|
||||
directObs := make([]NodeReachObserver, 0, len(d.obs))
|
||||
for pk, a := range d.obs {
|
||||
info := nodeMap[pk]
|
||||
lat, lon := gpsPtrs(info)
|
||||
var avg, dist *float64
|
||||
if a.snrN > 0 {
|
||||
avg = fptr(a.snrSum / float64(a.snrN))
|
||||
}
|
||||
if self.HasGPS && info.HasGPS {
|
||||
dist = fptr(haversineKm(self.Lat, self.Lon, info.Lat, info.Lon))
|
||||
}
|
||||
directObs = append(directObs, NodeReachObserver{
|
||||
Pubkey: pk, Name: info.Name, Count: a.count, AvgSNR: avg, Lat: lat, Lon: lon, DistanceKm: dist,
|
||||
})
|
||||
}
|
||||
sort.Slice(directObs, func(i, j int) bool { return directObs[i].Count > directObs[j].Count })
|
||||
|
||||
toks := make([]string, 0, len(tokens))
|
||||
for t := range tokens {
|
||||
toks = append(toks, t)
|
||||
}
|
||||
sort.Strings(toks)
|
||||
|
||||
selfLat, selfLon := gpsPtrs(self)
|
||||
return NodeReachResponse{
|
||||
Node: NodeReachInfo{Pubkey: pubkey, Name: self.Name, Role: self.Role,
|
||||
Lat: selfLat, Lon: selfLon, FirstSeen: firstSeen},
|
||||
Window: NodeReachWindow{Days: days, Since: since.Format(time.RFC3339)},
|
||||
ReliableTokens: toks,
|
||||
Importance: NodeReachImportance{
|
||||
NeighborDegree: degree, DegreeRank: rank, NodesWithEdges: nodesWithEdges,
|
||||
RelayObservations: d.relay, BidirectionalLinks: bidir, DirectObservers: len(directObs),
|
||||
},
|
||||
DirectObservers: directObs,
|
||||
Links: links,
|
||||
}, true, nil
|
||||
}
|
||||
|
||||
// --- neighbor-degree snapshot ---
//
// Apart from the final per-pubkey lookup, the degree/rank figures are the
// same for every reach request. The neighbor_edges aggregate is therefore
// computed once and shared, and rank is found by binary search over the
// descending degree list.

// reachDegreeTTL is how long a degree snapshot is served before a rebuild is
// attempted.
const reachDegreeTTL = 60 * time.Second
|
||||
|
||||
// degreeSnapshot is one point-in-time aggregate of neighbour degrees.
type degreeSnapshot struct {
	// at is when the snapshot was built.
	at time.Time
	// total is the number of nodes that have any edge.
	total int
	// deg maps a lowercase pubkey to its neighbour count.
	deg map[string]int
	// sortedDesc lists the degrees in descending order, for rank lookups.
	sortedDesc []int
}
|
||||
|
||||
func (s *Server) reachDegreeRank(ctx context.Context, pubkey string) (degree, rank, total int) {
|
||||
snap := s.getDegreeSnapshot(ctx)
|
||||
if snap == nil {
|
||||
return 0, 0, 0
|
||||
}
|
||||
degree = snap.deg[pubkey]
|
||||
if degree == 0 {
|
||||
// No edges → not ranked. rank=0 is the documented "off-the-list" value;
|
||||
// avoids the nonsensical "#N+1 / N" the binary search would produce.
|
||||
return 0, 0, snap.total
|
||||
}
|
||||
// rank = 1 + (number of nodes with strictly higher degree). sortedDesc is
|
||||
// descending, so the count of entries > degree is the first index whose
|
||||
// value is <= degree.
|
||||
rank = 1 + sort.Search(len(snap.sortedDesc), func(i int) bool { return snap.sortedDesc[i] <= degree })
|
||||
return degree, rank, snap.total
|
||||
}
|
||||
|
||||
func (s *Server) getDegreeSnapshot(ctx context.Context) *degreeSnapshot {
|
||||
// Fast path: serve a fresh snapshot under a short lock.
|
||||
s.reach.degreeMu.Lock()
|
||||
if s.reach.degreeSnap != nil && time.Since(s.reach.degreeSnap.at) < reachDegreeTTL {
|
||||
snap := s.reach.degreeSnap
|
||||
s.reach.degreeMu.Unlock()
|
||||
return snap
|
||||
}
|
||||
stale := s.reach.degreeSnap
|
||||
s.reach.degreeMu.Unlock()
|
||||
|
||||
// Rebuild WITHOUT holding the lock so concurrent reach requests aren't
|
||||
// serialized behind the aggregate query. A brief cold-start herd may run a
|
||||
// few redundant queries; the last writer wins.
|
||||
rows, err := s.db.conn.QueryContext(ctx, `
|
||||
SELECT pk, COUNT(*) neigh FROM (
|
||||
SELECT node_a pk FROM neighbor_edges
|
||||
UNION ALL SELECT node_b FROM neighbor_edges
|
||||
) GROUP BY pk`)
|
||||
if err != nil {
|
||||
log.Printf("[reach] degree snapshot query failed: %v (serving stale)", err)
|
||||
return stale // serve stale on error rather than zeroing
|
||||
}
|
||||
defer rows.Close()
|
||||
deg := make(map[string]int)
|
||||
var sortedDesc []int
|
||||
for rows.Next() {
|
||||
var pk string
|
||||
var neigh int
|
||||
if rows.Scan(&pk, &neigh) != nil {
|
||||
continue
|
||||
}
|
||||
deg[strings.ToLower(pk)] = neigh
|
||||
sortedDesc = append(sortedDesc, neigh)
|
||||
}
|
||||
sort.Sort(sort.Reverse(sort.IntSlice(sortedDesc)))
|
||||
snap := °reeSnapshot{at: time.Now(), total: len(deg), deg: deg, sortedDesc: sortedDesc}
|
||||
s.reach.degreeMu.Lock()
|
||||
s.reach.degreeSnap = snap
|
||||
s.reach.degreeMu.Unlock()
|
||||
return snap
|
||||
}
|
||||
|
||||
// scanReachRows reads windowed observations whose path contains any reliable
|
||||
// token, with the originator + observer + snr needed for attribution. Observer
|
||||
// id and originator pubkey are lowercased in SQL (not per row), the path slice
|
||||
// is uppercased in place (no second allocation), and the result is hard-capped
|
||||
// at reachScanRowLimit.
|
||||
//
|
||||
// Returns a non-nil error if the underlying QueryContext or rows.Err() fails;
|
||||
// callers MUST treat that as a 500 (issue #1631 — previously the error was
|
||||
// swallowed, surfacing a transient DB failure as a misleading 404 / empty
|
||||
// reach to operators).
|
||||
func (s *Server) scanReachRows(ctx context.Context, tokens map[string]bool, sinceEpoch int64) ([]pathRow, error) {
|
||||
if len(tokens) == 0 {
|
||||
return nil, nil // defensive: an empty LIKE chain would render `AND ()` (SQL error)
|
||||
}
|
||||
likes := make([]string, 0, len(tokens))
|
||||
args := []interface{}{sinceEpoch}
|
||||
// Sort tokens so the generated SQL text is byte-stable across requests
|
||||
// with the same token set — preserves the driver's prepared-statement
|
||||
// cache and keeps query plans reproducible (Independent r2 #3).
|
||||
toks := make([]string, 0, len(tokens))
|
||||
for tok := range tokens {
|
||||
toks = append(toks, tok)
|
||||
}
|
||||
sort.Strings(toks)
|
||||
for _, tok := range toks {
|
||||
likes = append(likes, "o.path_json LIKE ?")
|
||||
args = append(args, "%\""+tok+"\"%")
|
||||
}
|
||||
q := `SELECT LOWER(COALESCE(obs.id,'')), LOWER(COALESCE(t.from_pubkey,'')), COALESCE(t.payload_type,0), o.path_json, o.snr
|
||||
FROM observations o
|
||||
JOIN transmissions t ON t.id = o.transmission_id
|
||||
LEFT JOIN observers obs ON obs.rowid = o.observer_idx
|
||||
WHERE o.timestamp >= ? AND (` + strings.Join(likes, " OR ") + `)
|
||||
LIMIT ?`
|
||||
args = append(args, reachScanRowLimit)
|
||||
rows, err := s.db.conn.QueryContext(ctx, q, args...)
|
||||
if err != nil {
|
||||
log.Printf("[reach] scan query failed: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
// Modest preallocation: most nodes return far fewer than the cap, so seed a
|
||||
// reasonable capacity rather than reserving reachScanRowLimit up front.
|
||||
out := make([]pathRow, 0, 2048)
|
||||
var skipped int // malformed/empty rows discarded — surfaced below so ingest bugs aren't silent
|
||||
for rows.Next() {
|
||||
var oid, fpk, pj string
|
||||
var pt int
|
||||
var snr sql.NullFloat64
|
||||
if err := rows.Scan(&oid, &fpk, &pt, &pj, &snr); err != nil {
|
||||
skipped++
|
||||
continue
|
||||
}
|
||||
path := parsePathTokens(pj)
|
||||
if len(path) == 0 {
|
||||
skipped++
|
||||
continue
|
||||
}
|
||||
pr := pathRow{observerPK: oid, fromPubkey: fpk, payloadType: pt, path: path}
|
||||
if snr.Valid {
|
||||
pr.snr = snr.Float64
|
||||
pr.snrValid = true
|
||||
}
|
||||
out = append(out, pr)
|
||||
}
|
||||
if skipped > 0 {
|
||||
log.Printf("[reach] scan discarded %d malformed/empty rows (kept %d)", skipped, len(out))
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
log.Printf("[reach] scan rows iteration failed: %v", err)
|
||||
return nil, err
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
@@ -0,0 +1,175 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// benchReachDB builds an in-memory DB with nObs observations. matchEvery
|
||||
// controls payload mix: 1 = every row contains the "01FA" token (worst case),
|
||||
// 2 = every other row matches (the rest carry an unrelated path), etc. This
|
||||
// lets benches measure the scan over a realistic mix, not just all-matching.
|
||||
func benchReachDB(b *testing.B, nObs, matchEvery int, lowerHops bool) *DB {
|
||||
b.Helper()
|
||||
if matchEvery < 1 {
|
||||
matchEvery = 1
|
||||
}
|
||||
matchPath, fillerPath := `["AA","01FA","BB"]`, `["AA","CC","BB"]`
|
||||
if lowerHops {
|
||||
// Lowercase hops force parsePathTokens' ToUpper to allocate (production
|
||||
// path_json is uppercase; this measures the worst case Carmack flagged).
|
||||
matchPath, fillerPath = `["aa","01fa","bb"]`, `["aa","cc","bb"]`
|
||||
}
|
||||
conn, err := sql.Open("sqlite", ":memory:")
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
schema := []string{
|
||||
`CREATE TABLE transmissions (id INTEGER PRIMARY KEY, hash TEXT, first_seen TEXT, payload_type INTEGER, from_pubkey TEXT)`,
|
||||
`CREATE TABLE observers (id TEXT PRIMARY KEY, name TEXT)`,
|
||||
`CREATE TABLE observations (id INTEGER PRIMARY KEY, transmission_id INTEGER, observer_idx INTEGER, snr REAL, path_json TEXT, timestamp INTEGER)`,
|
||||
`CREATE INDEX idx_obs_ts ON observations(timestamp)`,
|
||||
}
|
||||
for _, s := range schema {
|
||||
if _, err := conn.Exec(s); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
tx, err := conn.Begin()
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
if _, err := tx.Exec(`INSERT INTO observers (id, name) VALUES ('OBS', 'o')`); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
for i := 0; i < nObs; i++ {
|
||||
if _, err := tx.Exec(`INSERT INTO transmissions (id, hash, first_seen, payload_type, from_pubkey) VALUES (?,?,?,5,'')`,
|
||||
i, fmt.Sprintf("h%d", i), "2026-06-07T00:00:00Z"); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
path := fillerPath // non-matching filler
|
||||
if i%matchEvery == 0 {
|
||||
path = matchPath
|
||||
}
|
||||
if _, err := tx.Exec(`INSERT INTO observations (id, transmission_id, observer_idx, snr, path_json, timestamp) VALUES (?,?,1,-7.0,?,?)`,
|
||||
i, i, path, 1000); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
return &DB{conn: conn}
|
||||
}
|
||||
|
||||
// BenchmarkNodeReachScan measures the windowed scan + path-decode at increasing
|
||||
// scale, all-matching (worst case for memory/allocs).
|
||||
func BenchmarkNodeReachScan(b *testing.B) {
|
||||
tokens := map[string]bool{"01FA": true}
|
||||
for _, n := range []int{1000, 10000, 100000} {
|
||||
b.Run(fmt.Sprintf("rows=%d", n), func(b *testing.B) {
|
||||
db := benchReachDB(b, n, 1, false)
|
||||
srv := &Server{db: db}
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
rows, _ := srv.scanReachRows(context.Background(), tokens, 0)
|
||||
if len(rows) == 0 {
|
||||
b.Fatal("expected rows")
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkNodeReachScanMixed measures the scan when only half the windowed
|
||||
// rows actually contain the token — closer to production path mixes.
|
||||
func BenchmarkNodeReachScanMixed(b *testing.B) {
|
||||
tokens := map[string]bool{"01FA": true}
|
||||
db := benchReachDB(b, 100000, 2, false)
|
||||
srv := &Server{db: db}
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
rows, _ := srv.scanReachRows(context.Background(), tokens, 0)
|
||||
if len(rows) == 0 {
|
||||
b.Fatal("expected rows")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkNodeReachScanLowerCase measures the worst case for path decoding:
|
||||
// lowercase hops force parsePathTokens' ToUpper to allocate a new string per
|
||||
// hop (production path_json is uppercase, where ToUpper is a no-op). Publishing
|
||||
// this alongside the all-uppercase numbers keeps the perf claims honest.
|
||||
func BenchmarkNodeReachScanLowerCase(b *testing.B) {
|
||||
tokens := map[string]bool{"01FA": true}
|
||||
db := benchReachDB(b, 100000, 1, true)
|
||||
srv := &Server{db: db}
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
rows, _ := srv.scanReachRows(context.Background(), tokens, 0)
|
||||
if len(rows) == 0 {
|
||||
b.Fatal("expected rows")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkNodeReachAttribute measures the directional attribution pass over an
|
||||
// already-scanned row set (the in-memory hot loop + map building), isolated
|
||||
// from DB I/O.
|
||||
func BenchmarkNodeReachAttribute(b *testing.B) {
|
||||
tokens := map[string]bool{"01FA": true}
|
||||
db := benchReachDB(b, 100000, 1, false)
|
||||
srv := &Server{db: db}
|
||||
rows, _ := srv.scanReachRows(context.Background(), tokens, 0)
|
||||
if len(rows) == 0 {
|
||||
b.Fatal("expected rows")
|
||||
}
|
||||
resolve := func(tok string) string {
|
||||
switch tok {
|
||||
case "AA":
|
||||
return "aa00000000000000"
|
||||
case "BB":
|
||||
return "bb00000000000000"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
d := attributeDirections(rows, tokens, "01fa326b", resolve)
|
||||
if d.relay == 0 {
|
||||
b.Fatal("expected relay hits")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestScanReachRows_ErrorReturn anchors the new ([]pathRow, error) signature
|
||||
// at the unit-level (issue #1631). Passing a Server whose db.conn is closed
|
||||
// must surface an error, not a swallowed nil. Lives in this file because
|
||||
// the bench callers in the same file rely on the same signature.
|
||||
func TestScanReachRows_ErrorReturn(t *testing.T) {
|
||||
conn, err := sql.Open("sqlite", ":memory:")
|
||||
if err != nil {
|
||||
t.Fatalf("open: %v", err)
|
||||
}
|
||||
// PREFLIGHT: async=true reason="test-only in-memory scratch schema, immediately closed"
|
||||
if _, err := conn.Exec(`CREATE TABLE observations (id INTEGER); CREATE TABLE transmissions (id INTEGER); CREATE TABLE observers (rowid INTEGER, id TEXT)`); err != nil {
|
||||
t.Fatalf("schema: %v", err)
|
||||
}
|
||||
conn.Close() // force QueryContext to fail
|
||||
srv := &Server{db: &DB{conn: conn}}
|
||||
rows, err := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error from closed DB, got nil (rows=%d)", len(rows))
|
||||
}
|
||||
if rows != nil {
|
||||
t.Fatalf("expected nil rows on error, got %d", len(rows))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestNodeReach_BlacklistMutationBustsCache reproduces #1629.
|
||||
//
|
||||
// Scenario:
|
||||
// 1. Warm the reach response cache with a non-blacklisted pubkey (200 OK).
|
||||
// 2. Operator blacklists that pubkey via SetNodeBlacklist (the legitimate
|
||||
// mutation entry point — config reload, admin call, etc.).
|
||||
// 3. The very next /reach request for that pubkey MUST return 404 (the
|
||||
// blacklist response), not the cached 200 payload.
|
||||
//
|
||||
// Pre-fix the blacklist set is locked in by sync.Once at first read, so
|
||||
// IsBlacklisted keeps returning false after the mutation; the cache then
|
||||
// re-serves the prior reach body and the assertion fails.
|
||||
func TestNodeReach_BlacklistMutationBustsCache(t *testing.T) {
|
||||
resetReachState(t)
|
||||
db, n := newReachIntegrationDB(t, `["AABB","01FA","CCDD"]`)
|
||||
defer db.conn.Close()
|
||||
|
||||
// Start with a non-empty blacklist (some unrelated decoy pubkey) so the
|
||||
// blacklist set is materialised on the first IsBlacklisted call below.
|
||||
// This is the realistic state: a deployment running with a populated
|
||||
// blacklist where the operator later ADDS a new entry.
|
||||
decoy := pk64("dec0")
|
||||
cfg := &Config{NodeBlacklist: []string{decoy}}
|
||||
srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()}
|
||||
|
||||
// 1. Warm cache (must 200 and populate cache).
|
||||
rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30")
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("warm-up: status=%d want 200 (body=%s)", rr.Code, rr.Body.String())
|
||||
}
|
||||
if srv.reachCacheLen() == 0 {
|
||||
t.Fatalf("warm-up did not populate reach cache")
|
||||
}
|
||||
|
||||
// 2. Operator adds the target node to the blacklist via the public setter.
|
||||
cfg.SetNodeBlacklist([]string{decoy, n})
|
||||
|
||||
// 3. Next request MUST return 404. With the bug, the sync.Once-cached
|
||||
// empty blacklist set makes IsBlacklisted return false, the response
|
||||
// cache hits, and the prior 200 body is re-served.
|
||||
rr2 := serveReach(srv, "/api/nodes/"+n+"/reach?days=30")
|
||||
if rr2.Code != http.StatusNotFound {
|
||||
t.Fatalf("post-blacklist mutation: status=%d want 404 (cached payload was served — #1629)", rr2.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestConfig_BlacklistGenerationIncrements asserts that every SetNodeBlacklist
|
||||
// call bumps the generation counter by exactly 1, regardless of whether the
|
||||
// content changed. The /reach cache key embeds this generation, so the
|
||||
// monotonic-bump contract is part of the public API of the package
|
||||
// (adversarial #4 from round-1 polish).
|
||||
func TestConfig_BlacklistGenerationIncrements(t *testing.T) {
|
||||
cfg := &Config{}
|
||||
g0 := cfg.BlacklistGeneration()
|
||||
cfg.SetNodeBlacklist([]string{"aa"})
|
||||
g1 := cfg.BlacklistGeneration()
|
||||
if g1 != g0+1 {
|
||||
t.Fatalf("first SetNodeBlacklist: gen %d -> %d (want +1)", g0, g1)
|
||||
}
|
||||
// Identical content — generation MUST still bump. Callers rely on
|
||||
// "any call invalidates" rather than "content-diff invalidates."
|
||||
cfg.SetNodeBlacklist([]string{"aa"})
|
||||
g2 := cfg.BlacklistGeneration()
|
||||
if g2 != g1+1 {
|
||||
t.Fatalf("second SetNodeBlacklist (same content): gen %d -> %d (want +1)", g1, g2)
|
||||
}
|
||||
// Empty mutation also bumps.
|
||||
cfg.SetNodeBlacklist(nil)
|
||||
g3 := cfg.BlacklistGeneration()
|
||||
if g3 != g2+1 {
|
||||
t.Fatalf("nil SetNodeBlacklist: gen %d -> %d (want +1)", g2, g3)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNodeReach_BlacklistMutationPurgesCache asserts that a blacklist
|
||||
// mutation evicts ALL prior reach cache entries (not just the affected
|
||||
// pubkey) on the next /reach request. Per adversarial #5, the previous
|
||||
// gen-suffix-only design left every prior cached entry stranded until TTL,
|
||||
// growing the cache by N entries per operator edit. The current design
|
||||
// purges on generation bump (detected on the next handler invocation) so a
|
||||
// steady stream of edits cannot leak entries unboundedly.
|
||||
func TestNodeReach_BlacklistMutationPurgesCache(t *testing.T) {
|
||||
resetReachState(t)
|
||||
db, n := newReachIntegrationDB(t, `["AABB","01FA","CCDD"]`)
|
||||
defer db.conn.Close()
|
||||
|
||||
cfg := &Config{}
|
||||
srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()}
|
||||
|
||||
// Warm cache with two distinct keys (different days param).
|
||||
for _, days := range []string{"30", "7"} {
|
||||
rr := serveReach(srv, "/api/nodes/"+n+"/reach?days="+days)
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("warm-up days=%s: status=%d want 200", days, rr.Code)
|
||||
}
|
||||
}
|
||||
before := srv.reachCacheLen()
|
||||
if before < 2 {
|
||||
t.Fatalf("warm-up populated %d entries, want >=2", before)
|
||||
}
|
||||
|
||||
// Unrelated blacklist mutation. The cached pubkey is not in the
|
||||
// blacklist, but prior entries are now keyed under a stale generation
|
||||
// and would otherwise sit until TTL.
|
||||
cfg.SetNodeBlacklist([]string{pk64("dead")})
|
||||
|
||||
// Next /reach request triggers the purge inside the reach path.
|
||||
rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30")
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("post-mutation request: status=%d want 200", rr.Code)
|
||||
}
|
||||
// After the purge + this single re-populate we expect exactly 1 entry,
|
||||
// not the 2 stale + 1 new = 3 that the leaky design would leave behind.
|
||||
if got := srv.reachCacheLen(); got != 1 {
|
||||
t.Fatalf("post-mutation cache len = %d, want 1 (prior entries leaked — adv #5)", got)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,312 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
func serveReach(srv *Server, path string) *httptest.ResponseRecorder {
|
||||
router := mux.NewRouter()
|
||||
router.HandleFunc("/api/nodes/{pubkey}/reach", srv.handleNodeReach).Methods("GET")
|
||||
req := httptest.NewRequest("GET", path, nil)
|
||||
rr := httptest.NewRecorder()
|
||||
router.ServeHTTP(rr, req)
|
||||
return rr
|
||||
}
|
||||
|
||||
// pk64 right-pads stem with '0' characters up to the 64-character length of a
// full pubkey. A stem that is already 64 characters or longer is returned
// unchanged (strings.Repeat panics on a negative count). No case folding is
// applied, so pass a lowercase stem to obtain a lowercase pubkey.
func pk64(stem string) string {
	pad := 64 - len(stem)
	if pad <= 0 {
		return stem
	}
	return stem + strings.Repeat("0", pad)
}
|
||||
|
||||
// resetReachState clears the per-server reach caches so test order cannot
|
||||
// leak observable state between handler tests (and restores after the test).
|
||||
// Now operates on *Server (was package globals — Independent r2 #2); accepts
|
||||
// a variadic *Server so existing call sites that didn't pass one still
|
||||
// compile but the reset is a no-op (used by tests that build the Server
|
||||
// fresh and don't need state cleared).
|
||||
func resetReachState(t *testing.T, servers ...*Server) {
|
||||
t.Helper()
|
||||
clear := func() {
|
||||
for _, s := range servers {
|
||||
if s == nil {
|
||||
continue
|
||||
}
|
||||
s.reach.cacheMu.Lock()
|
||||
s.reach.cache = map[string]reachCacheEntry{}
|
||||
s.reach.cacheMu.Unlock()
|
||||
s.reach.degreeMu.Lock()
|
||||
s.reach.degreeSnap = nil
|
||||
s.reach.degreeMu.Unlock()
|
||||
}
|
||||
}
|
||||
clear()
|
||||
t.Cleanup(clear)
|
||||
}
|
||||
|
||||
// newReachIntegrationDB builds a complete observer_idx-schema DB with a target
|
||||
// node N, two neighbours A/B, and one observation on obsPath so the HTTP handler
|
||||
// exercises real directional attribution. Pass a path that omits N's token to
|
||||
// build the zero-reach case (identifiable node, no matching observations).
|
||||
func newReachIntegrationDB(t *testing.T, obsPath string) (*DB, string) {
|
||||
t.Helper()
|
||||
conn, err := sql.Open("sqlite", ":memory:")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
n := pk64("01fa") // target — unique 2-byte token "01fa"
|
||||
a := pk64("aabb") // predecessor → we hear A
|
||||
b := pk64("ccdd") // successor → B hears us
|
||||
now := time.Now().Unix()
|
||||
stmts := []string{
|
||||
`CREATE TABLE nodes (public_key TEXT, name TEXT, role TEXT, lat REAL, lon REAL, last_seen TEXT, first_seen TEXT, advert_count INTEGER)`,
|
||||
`CREATE TABLE transmissions (id INTEGER PRIMARY KEY, from_pubkey TEXT, payload_type INTEGER)`,
|
||||
`CREATE TABLE observers (id TEXT)`,
|
||||
`CREATE TABLE observations (id INTEGER PRIMARY KEY, transmission_id INTEGER, observer_idx INTEGER, snr REAL, path_json TEXT, timestamp INTEGER)`,
|
||||
`CREATE TABLE neighbor_edges (node_a TEXT, node_b TEXT, count INTEGER)`,
|
||||
}
|
||||
for _, s := range stmts {
|
||||
if _, err := conn.Exec(s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
ins := []struct {
|
||||
q string
|
||||
args []interface{}
|
||||
}{
|
||||
{`INSERT INTO nodes VALUES (?, 'N', 'repeater', 50.9, 5.4, ?, '2026-06-01T00:00:00Z', 3)`, []interface{}{n, "2026-06-07T00:00:00Z"}},
|
||||
{`INSERT INTO nodes VALUES (?, 'A', 'repeater', 51.0, 5.5, ?, '2026-06-01T00:00:00Z', 1)`, []interface{}{a, "2026-06-07T00:00:00Z"}},
|
||||
{`INSERT INTO nodes VALUES (?, 'B', 'repeater', 51.1, 5.6, ?, '2026-06-01T00:00:00Z', 1)`, []interface{}{b, "2026-06-07T00:00:00Z"}},
|
||||
{`INSERT INTO observers (id) VALUES ('OBS1')`, nil},
|
||||
{`INSERT INTO transmissions (id, from_pubkey, payload_type) VALUES (1, '', 5)`, nil},
|
||||
{`INSERT INTO observations (id, transmission_id, observer_idx, snr, path_json, timestamp) VALUES (1,1,1,-7.0,?,?)`, []interface{}{obsPath, now}},
|
||||
}
|
||||
for _, in := range ins {
|
||||
if _, err := conn.Exec(in.q, in.args...); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
return &DB{conn: conn, isV3: true}, n
|
||||
}
|
||||
|
||||
func TestClampDays(t *testing.T) {
|
||||
cases := []struct{ in, want int }{{0, 1}, {-5, 1}, {1, 1}, {7, 7}, {30, 30}, {31, 30}, {999, 30}}
|
||||
for _, c := range cases {
|
||||
if got := clampDays(c.in); got != c.want {
|
||||
t.Errorf("clampDays(%d)=%d want %d", c.in, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeReach_UnknownNode(t *testing.T) {
|
||||
srv := makeTestServer(makeTestGraph()) // no store/db wired → 404
|
||||
rr := serveReach(srv, "/api/nodes/"+pk64("deadbeef")+"/reach")
|
||||
if rr.Code != http.StatusNotFound {
|
||||
t.Fatalf("status=%d want 404", rr.Code)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeReach_InvalidPubkey(t *testing.T) {
|
||||
srv := makeTestServer(makeTestGraph())
|
||||
for _, bad := range []string{"deadbeef", "xyz", pk64("01") + "zz"} {
|
||||
rr := serveReach(srv, "/api/nodes/"+bad+"/reach")
|
||||
if rr.Code != http.StatusBadRequest {
|
||||
t.Errorf("pubkey %q: status=%d want 400", bad, rr.Code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeReach_ValidPubkeyNotInNodes(t *testing.T) {
|
||||
resetReachState(t)
|
||||
db := setupTestDBv2(t)
|
||||
cfg := &Config{}
|
||||
srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()}
|
||||
// Syntactically valid pubkey that was never inserted → real 404 path.
|
||||
rr := serveReach(srv, "/api/nodes/"+pk64("beef")+"/reach")
|
||||
if rr.Code != http.StatusNotFound {
|
||||
t.Fatalf("status=%d want 404 (body=%s)", rr.Code, rr.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeReach_BlacklistedReturns404(t *testing.T) {
|
||||
pk := pk64("01fa")
|
||||
cfg := &Config{NodeBlacklist: []string{pk}}
|
||||
srv := &Server{cfg: cfg}
|
||||
rr := serveReach(srv, "/api/nodes/"+pk+"/reach")
|
||||
if rr.Code != http.StatusNotFound {
|
||||
t.Fatalf("blacklisted pubkey: status=%d want 404", rr.Code)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeReach_AttributionAndCacheHit(t *testing.T) {
|
||||
resetReachState(t)
|
||||
db, n := newReachIntegrationDB(t, `["AABB","01FA","CCDD"]`)
|
||||
defer db.conn.Close()
|
||||
cfg := &Config{}
|
||||
srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()}
|
||||
|
||||
rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30")
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d want 200 (body=%s)", rr.Code, rr.Body.String())
|
||||
}
|
||||
var resp NodeReachResponse
|
||||
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("bad json: %v", err)
|
||||
}
|
||||
if resp.Importance.RelayObservations < 1 {
|
||||
t.Fatalf("expected ≥1 relay observation, got %d", resp.Importance.RelayObservations)
|
||||
}
|
||||
var weHearA, theyHearB bool
|
||||
for _, l := range resp.Links {
|
||||
if l.Name == "A" && l.WeHear >= 1 {
|
||||
weHearA = true
|
||||
}
|
||||
if l.Name == "B" && l.TheyHear >= 1 {
|
||||
theyHearB = true
|
||||
}
|
||||
}
|
||||
if !weHearA {
|
||||
t.Errorf("expected we_hear≥1 for neighbour A, links=%+v", resp.Links)
|
||||
}
|
||||
if !theyHearB {
|
||||
t.Errorf("expected they_hear≥1 for neighbour B, links=%+v", resp.Links)
|
||||
}
|
||||
|
||||
// Cache hit: the key (now generation-suffixed, #1629) must be populated
|
||||
// and a second request must 200.
|
||||
wantKey := n + "|30|g" + strconv.FormatUint(srv.cfg.BlacklistGeneration(), 10)
|
||||
if _, ok := srv.reachCacheGet(wantKey); !ok {
|
||||
t.Fatalf("expected reach response to be cached under %q", wantKey)
|
||||
}
|
||||
rr2 := serveReach(srv, "/api/nodes/"+n+"/reach?days=30")
|
||||
if rr2.Code != http.StatusOK || rr2.Body.String() != rr.Body.String() {
|
||||
t.Fatalf("cache-hit response differs: code=%d", rr2.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// Zero-reach happy path: a node that IS identifiable (has reliable tokens) but
|
||||
// whose observations contain none of its tokens must return 200 with empty
|
||||
// arrays — NOT 404. A wrong implementation that 404s here passes every other
|
||||
// test. (docs/api-spec.md contract.)
|
||||
func TestNodeReach_ZeroReach(t *testing.T) {
|
||||
resetReachState(t)
|
||||
db, n := newReachIntegrationDB(t, `["AABB","CCDD"]`) // path omits N's "01FA" token
|
||||
defer db.conn.Close()
|
||||
cfg := &Config{}
|
||||
srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()}
|
||||
|
||||
rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30")
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("zero-reach must be 200 not 404, got %d (body=%s)", rr.Code, rr.Body.String())
|
||||
}
|
||||
var resp NodeReachResponse
|
||||
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("bad json: %v", err)
|
||||
}
|
||||
if len(resp.ReliableTokens) == 0 {
|
||||
t.Fatalf("node should still be identifiable (reliable tokens present)")
|
||||
}
|
||||
if len(resp.Links) != 0 || len(resp.DirectObservers) != 0 || resp.Importance.RelayObservations != 0 {
|
||||
t.Fatalf("expected empty reach, got links=%d obs=%d relay=%d",
|
||||
len(resp.Links), len(resp.DirectObservers), resp.Importance.RelayObservations)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeReach_ShapeAndClamp(t *testing.T) {
|
||||
resetReachState(t)
|
||||
db := setupTestDBv2(t)
|
||||
const pk = "01fa326b475800a31105abcb9e4cac000b3e5d9e2b5ba0739981ce8d5f3a6754"
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES ('`+pk+`', 'BE-Test', 'repeater', 50.9, 5.4, '2026-06-07T00:00:00Z', '2026-06-01T00:00:00Z', 3)`)
|
||||
// scanReachRows joins observations on observer_idx; the v2 schema's
|
||||
// observations table lacks that column. Previously the scan error was
|
||||
// swallowed (issue #1631) and the test still saw empty arrays. With the
|
||||
// fix that returns 500, we rebuild observations to the observer_idx
|
||||
// shape (empty — no rows needed for shape/clamp assertions).
|
||||
mustExecDB(t, db, `DROP TABLE observations`)
|
||||
// PREFLIGHT: async=true reason="test-only in-memory schema rebuild; not a production migration"
|
||||
mustExecDB(t, db, `CREATE TABLE observations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
transmission_id INTEGER,
|
||||
observer_idx INTEGER,
|
||||
snr REAL,
|
||||
path_json TEXT,
|
||||
timestamp INTEGER
|
||||
)`)
|
||||
|
||||
cfg := &Config{}
|
||||
srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()}
|
||||
|
||||
rr := serveReach(srv, "/api/nodes/"+pk+"/reach?days=999")
|
||||
if rr.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d want 200 (body=%s)", rr.Code, rr.Body.String())
|
||||
}
|
||||
var resp NodeReachResponse
|
||||
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("bad json: %v", err)
|
||||
}
|
||||
if resp.Window.Days != 30 {
|
||||
t.Fatalf("days not clamped to 30: %d", resp.Window.Days)
|
||||
}
|
||||
if resp.Links == nil || resp.DirectObservers == nil || resp.ReliableTokens == nil {
|
||||
t.Fatalf("array fields must be non-nil (never null)")
|
||||
}
|
||||
if !contains(resp.ReliableTokens, "01FA") {
|
||||
t.Fatalf("expected 01FA reliable token, got %v", resp.ReliableTokens)
|
||||
}
|
||||
if resp.Node.FirstSeen != "2026-06-01T00:00:00Z" {
|
||||
t.Fatalf("first_seen not sourced from nodes table: %q", resp.Node.FirstSeen)
|
||||
}
|
||||
}
|
||||
|
||||
// Issue #1631: a DB failure inside scanReachRows must surface as 500, not
|
||||
// as a misleading "no reach" 200 or 404. We warm the integration DB, drop
|
||||
// the observations table so the next reach scan query fails inside
|
||||
// QueryContext, then assert the handler returns 500 (not 200 with empty
|
||||
// arrays, which is the buggy current behavior — scanReachRows swallows the
|
||||
// error and returns nil).
|
||||
func TestNodeReach_ScanDBErrorReturns500(t *testing.T) {
|
||||
resetReachState(t)
|
||||
db, n := newReachIntegrationDB(t, `["AABB","01FA","CCDD"]`)
|
||||
defer db.conn.Close()
|
||||
cfg := &Config{}
|
||||
srv := &Server{store: newTestStoreWithDB(t, db, cfg), db: db, cfg: cfg, perfStats: NewPerfStats()}
|
||||
|
||||
// Warm the store's node cache (so buildNodeInfoMap on the failing call
|
||||
// still finds the target node). One healthy call also primes the
|
||||
// reach response cache — clear it below so the next call recomputes.
|
||||
if rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30"); rr.Code != http.StatusOK {
|
||||
t.Fatalf("warm-up call: status=%d want 200 (body=%s)", rr.Code, rr.Body.String())
|
||||
}
|
||||
srv.reach.cacheMu.Lock()
|
||||
srv.reach.cache = map[string]reachCacheEntry{}
|
||||
srv.reach.cacheMu.Unlock()
|
||||
|
||||
// Break the table that scanReachRows reads from. nodes / observers /
|
||||
// neighbor_edges remain intact so the failure is isolated to the
|
||||
// scanReachRows QueryContext path.
|
||||
if _, err := db.conn.Exec("DROP TABLE observations"); err != nil {
|
||||
t.Fatalf("drop observations: %v", err)
|
||||
}
|
||||
|
||||
rr := serveReach(srv, "/api/nodes/"+n+"/reach?days=30")
|
||||
if rr.Code != http.StatusInternalServerError {
|
||||
t.Fatalf("expected 500 on DB error inside scanReachRows, got %d (body=%s)", rr.Code, rr.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// contains reports whether v equals any element of s. A nil or empty slice
// never contains anything.
func contains(s []string, v string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] == v {
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,291 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// newReachScanTestDB builds a minimal observer_idx-schema DB with two rows whose
|
||||
// path contains "01FA" and one that does not, for scanReachRows coverage.
|
||||
func newReachScanTestDB(t *testing.T) *DB {
|
||||
t.Helper()
|
||||
conn, err := sql.Open("sqlite", ":memory:")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
stmts := []string{
|
||||
`CREATE TABLE transmissions (id INTEGER PRIMARY KEY, from_pubkey TEXT, payload_type INTEGER)`,
|
||||
`CREATE TABLE observers (id TEXT)`,
|
||||
`CREATE TABLE observations (id INTEGER PRIMARY KEY, transmission_id INTEGER, observer_idx INTEGER, snr REAL, path_json TEXT, timestamp INTEGER)`,
|
||||
`INSERT INTO observers (id) VALUES ('OBS1')`, // rowid 1
|
||||
`INSERT INTO transmissions (id, from_pubkey, payload_type) VALUES (1,'FF00',4),(2,'',5),(3,'',5)`,
|
||||
`INSERT INTO observations (id, transmission_id, observer_idx, snr, path_json, timestamp) VALUES
|
||||
(1,1,1,-7.0,'["AA","01FA","BB"]',1000),
|
||||
(2,2,1,NULL,'["01FA","CC"]',1000),
|
||||
(3,3,1,-5.0,'["AA","CC"]',1000)`, // no 01FA → excluded
|
||||
}
|
||||
for _, s := range stmts {
|
||||
if _, err := conn.Exec(s); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
return &DB{conn: conn}
|
||||
}
|
||||
|
||||
// testResolver returns a resolver that maps a token to its pubkey only when
// the token is a key of unique. Every other token (ambiguous / unknown)
// resolves to "", relying on the map's zero value for missing keys.
func testResolver(unique map[string]string) func(string) string {
	return func(tok string) string {
		return unique[tok] // "" for ambiguous / unknown → skip
	}
}
|
||||
|
||||
func TestParsePathTokens(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
want []string
|
||||
}{
|
||||
{`["AA","01FA","BB"]`, []string{"AA", "01FA", "BB"}},
|
||||
{`["aa","01fa"]`, []string{"AA", "01FA"}}, // uppercased
|
||||
{`["EFEF"]`, []string{"EFEF"}},
|
||||
{`[]`, nil},
|
||||
{``, nil},
|
||||
{`null`, nil},
|
||||
{`["49A985"]`, []string{"49A985"}}, // 3-byte hop preserved
|
||||
}
|
||||
for _, c := range cases {
|
||||
got := parsePathTokens(c.in)
|
||||
if len(got) != len(c.want) {
|
||||
t.Fatalf("parsePathTokens(%q) = %v, want %v", c.in, got, c.want)
|
||||
}
|
||||
for i := range got {
|
||||
if got[i] != c.want[i] {
|
||||
t.Errorf("parsePathTokens(%q)[%d] = %q, want %q", c.in, i, got[i], c.want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttributeDirections_PredecessorAndSuccessor(t *testing.T) {
|
||||
// path A(aa) -> N(01fa) -> B(bb): we hear A, B hears us.
|
||||
unique := map[string]string{"AA": "aa00", "BB": "bb00"}
|
||||
rows := []pathRow{{
|
||||
observerPK: "obs1", payloadType: 5,
|
||||
path: []string{"AA", "01FA", "BB"},
|
||||
}}
|
||||
d := attributeDirections(rows, map[string]bool{"01FA": true}, "01fa326b", testResolver(unique))
|
||||
if d.we["aa00"] != 1 {
|
||||
t.Fatalf("we_hear[aa00]=%d want 1", d.we["aa00"])
|
||||
}
|
||||
if d.they["bb00"] != 1 {
|
||||
t.Fatalf("they_hear[bb00]=%d want 1", d.they["bb00"])
|
||||
}
|
||||
if d.relay != 1 {
|
||||
t.Fatalf("relay=%d want 1", d.relay)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttributeDirections_LastHopObserverAndAdvertFirstHop(t *testing.T) {
|
||||
rows := []pathRow{
|
||||
// N is last hop → observer heard us directly (+snr).
|
||||
{observerPK: "obsx", payloadType: 5, path: []string{"AA", "01FA"}, snr: 4.0, snrValid: true},
|
||||
// N is first hop of an ADVERT (type 4) → we heard the originator.
|
||||
{observerPK: "obsy", payloadType: 4, fromPubkey: "origin1", path: []string{"01FA", "CC"}},
|
||||
}
|
||||
d := attributeDirections(rows, map[string]bool{"01FA": true}, "01fa326b",
|
||||
testResolver(map[string]string{"CC": "cc00"}))
|
||||
if a, ok := d.obs["obsx"]; !ok || a.count != 1 {
|
||||
t.Fatalf("observer obsx not counted")
|
||||
}
|
||||
if a := d.obs["obsx"]; a.snrN != 1 || a.snrSum != 4.0 {
|
||||
t.Fatalf("observer snr not aggregated")
|
||||
}
|
||||
if d.they["obsx"] != 1 {
|
||||
t.Fatalf("they_hear[obsx]=%d want 1", d.they["obsx"])
|
||||
}
|
||||
if d.we["origin1"] != 1 {
|
||||
t.Fatalf("we_hear[origin1]=%d want 1 (advert first-hop)", d.we["origin1"])
|
||||
}
|
||||
if d.they["cc00"] != 1 {
|
||||
t.Fatalf("they_hear[cc00]=%d want 1 (successor)", d.they["cc00"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttributeDirections_AmbiguousSkippedAndSelfIgnored(t *testing.T) {
|
||||
// No observer, so the last-hop observer branch can't fire — this isolates
|
||||
// the resolve logic. ZZ is unresolved (ambiguous → skipped); the trailing
|
||||
// 01FA resolves to self (ourPK) and must be ignored as a successor.
|
||||
rows := []pathRow{{observerPK: "", payloadType: 5, path: []string{"ZZ", "01FA", "01FA"}}}
|
||||
d := attributeDirections(rows, map[string]bool{"01FA": true}, "01fa326b",
|
||||
testResolver(map[string]string{"01FA": "01fa326b"}))
|
||||
if len(d.we) != 0 || len(d.they) != 0 {
|
||||
t.Fatalf("ambiguous/self should yield no edges, got we=%v they=%v", d.we, d.they)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttributeDirections_LastHopWithObserverCountsObserver(t *testing.T) {
|
||||
// Guards the case the previous test deliberately excludes: when our token is
|
||||
// the last hop AND an observer is present, that observer heard us directly.
|
||||
rows := []pathRow{{observerPK: "obs1", payloadType: 5, path: []string{"ZZ", "01FA"}}}
|
||||
d := attributeDirections(rows, map[string]bool{"01FA": true}, "01fa326b",
|
||||
testResolver(map[string]string{}))
|
||||
if a, ok := d.obs["obs1"]; d.they["obs1"] != 1 || !ok || a.count != 1 {
|
||||
t.Fatalf("last-hop observer should be counted, got they=%v", d.they)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReliableTokens(t *testing.T) {
|
||||
// pm where "01fa" is unique but "01" is shared (collision).
|
||||
nodes := []nodeInfo{
|
||||
{PublicKey: "01fa326b0000", Role: "repeater"},
|
||||
{PublicKey: "0188aaaa0000", Role: "repeater"},
|
||||
}
|
||||
pm := buildPrefixMap(nodes)
|
||||
toks := reliableTokens("01fa326b0000", pm)
|
||||
if !toks["01FA"] {
|
||||
t.Fatalf("expected 01FA reliable, got %v", toks)
|
||||
}
|
||||
if toks["01"] {
|
||||
t.Fatalf("1-byte 01 must be excluded (collision), got %v", toks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReliableTokens_CompanionNotMisattributed(t *testing.T) {
|
||||
// pm holds only path-capable relays. A companion target (not in pm) whose
|
||||
// prefix uniquely matches an UNRELATED relay must yield NO reliable tokens —
|
||||
// otherwise that relay's traffic would be credited to the companion.
|
||||
relay := nodeInfo{PublicKey: "aa11000000000000", Role: "repeater"}
|
||||
pm := buildPrefixMap([]nodeInfo{relay})
|
||||
companion := "aa11ffff00000000" // shares 2-byte "aa11" with the relay, differs at byte 3
|
||||
toks := reliableTokens(companion, pm)
|
||||
if len(toks) != 0 {
|
||||
t.Fatalf("companion must get no reliable tokens (prefix points at a relay), got %v", toks)
|
||||
}
|
||||
// Sanity: the relay itself still resolves to its own prefix.
|
||||
if !reliableTokens(relay.PublicKey, pm)["AA11"] {
|
||||
t.Fatalf("relay should keep its own AA11 token")
|
||||
}
|
||||
}
|
||||
|
||||
func TestScanReachRows_CapTruncates(t *testing.T) {
|
||||
defer func(orig int) { reachScanRowLimit = orig }(reachScanRowLimit)
|
||||
reachScanRowLimit = 1 // newReachScanTestDB has 2 matching rows
|
||||
db := newReachScanTestDB(t)
|
||||
defer db.conn.Close()
|
||||
srv := &Server{db: db}
|
||||
rows, _ := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0)
|
||||
if len(rows) != 1 {
|
||||
t.Fatalf("scan must hard-cap at reachScanRowLimit (1), got %d rows", len(rows))
|
||||
}
|
||||
}
|
||||
|
||||
func TestReachCacheEviction_BoundedNotWiped(t *testing.T) {
|
||||
srv := &Server{}
|
||||
resetReachState(t, srv)
|
||||
for i := 0; i < reachCacheMax+50; i++ {
|
||||
srv.reachCachePut("k"+strconv.Itoa(i), []byte("x"))
|
||||
}
|
||||
srv.reach.cacheMu.RLock()
|
||||
n := len(srv.reach.cache)
|
||||
srv.reach.cacheMu.RUnlock()
|
||||
// Bounded at the cap and NOT a full wipe (the old crude reset would leave 1).
|
||||
if n != reachCacheMax {
|
||||
t.Fatalf("cache size after overflow = %d, want %d (bounded, evict-oldest not full-wipe)", n, reachCacheMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReliableTokens_ThreeByteBranch(t *testing.T) {
|
||||
// Two nodes share the 2-byte prefix "01fa" but diverge at byte 3, so the
|
||||
// 3-byte (6-hex) prefix is the shortest unique token. Exercises the l=6
|
||||
// branch that the 1-/2-byte test does not.
|
||||
nodes := []nodeInfo{
|
||||
{PublicKey: "01fa32000000", Role: "repeater"},
|
||||
{PublicKey: "01fa99000000", Role: "repeater"},
|
||||
}
|
||||
pm := buildPrefixMap(nodes)
|
||||
toks := reliableTokens("01fa32000000", pm)
|
||||
if toks["01FA"] {
|
||||
t.Fatalf("2-byte 01FA collides here and must be excluded, got %v", toks)
|
||||
}
|
||||
if !toks["01FA32"] {
|
||||
t.Fatalf("expected 3-byte 01FA32 reliable token, got %v", toks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttributeDirections_NonAdvertFirstHopNotCredited(t *testing.T) {
|
||||
// Our token is the FIRST hop but payloadType is NOT an advert. The
|
||||
// fromPubkey must NOT be credited as we_hear (only adverts carry a
|
||||
// trustworthy originator → first-hop relationship). Guards the
|
||||
// `payloadType == PayloadADVERT` condition on the first-hop branch.
|
||||
rows := []pathRow{{
|
||||
observerPK: "obs1", payloadType: 5, fromPubkey: "origin1",
|
||||
path: []string{"01FA", "BB"},
|
||||
}}
|
||||
d := attributeDirections(rows, map[string]bool{"01FA": true}, "01fa326b",
|
||||
testResolver(map[string]string{"BB": "bb00"}))
|
||||
if d.we["origin1"] != 0 {
|
||||
t.Fatalf("non-advert first hop must not credit we_hear[origin1], got %d", d.we["origin1"])
|
||||
}
|
||||
if len(d.we) != 0 {
|
||||
t.Fatalf("expected no we_hear edges, got %v", d.we)
|
||||
}
|
||||
if d.they["bb00"] != 1 { // successor still counts
|
||||
t.Fatalf("they_hear[bb00]=%d want 1", d.they["bb00"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestAttributeDirections_ObserverAggregatesAcrossRows(t *testing.T) {
|
||||
// Same observer on the last hop across multiple rows: count and SNR must
|
||||
// accumulate, not overwrite.
|
||||
rows := []pathRow{
|
||||
{observerPK: "obs1", payloadType: 5, path: []string{"AA", "01FA"}, snr: 2.0, snrValid: true},
|
||||
{observerPK: "obs1", payloadType: 5, path: []string{"BB", "01FA"}, snr: 6.0, snrValid: true},
|
||||
}
|
||||
d := attributeDirections(rows, map[string]bool{"01FA": true}, "01fa326b", testResolver(nil))
|
||||
a, ok := d.obs["obs1"]
|
||||
if !ok || a.count != 2 {
|
||||
t.Fatalf("observer count should aggregate to 2, got %+v", a)
|
||||
}
|
||||
if a.snrN != 2 || a.snrSum != 8.0 {
|
||||
t.Fatalf("snr should aggregate (n=2,sum=8), got n=%d sum=%v", a.snrN, a.snrSum)
|
||||
}
|
||||
if d.they["obs1"] != 2 {
|
||||
t.Fatalf("they_hear[obs1]=%d want 2", d.they["obs1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestScanReachRows_DecodesRows(t *testing.T) {
|
||||
db := newReachScanTestDB(t)
|
||||
defer db.conn.Close()
|
||||
srv := &Server{db: db}
|
||||
rows, _ := srv.scanReachRows(context.Background(), map[string]bool{"01FA": true}, 0)
|
||||
if len(rows) != 2 {
|
||||
t.Fatalf("expected 2 matching rows (non-matching path excluded), got %d", len(rows))
|
||||
}
|
||||
// Find the advert row (order is not guaranteed without ORDER BY).
|
||||
var got *pathRow
|
||||
for i := range rows {
|
||||
if rows[i].payloadType == 4 {
|
||||
got = &rows[i]
|
||||
}
|
||||
}
|
||||
if got == nil {
|
||||
t.Fatalf("advert row not returned: %+v", rows)
|
||||
}
|
||||
// Fields are decoded + normalized: lowercase observer/from, uppercase path.
|
||||
if got.observerPK != "obs1" || got.fromPubkey != "ff00" {
|
||||
t.Fatalf("decoded fields wrong: %+v", *got)
|
||||
}
|
||||
if len(got.path) != 3 || got.path[1] != "01FA" {
|
||||
t.Fatalf("path not parsed/uppercased: %v", got.path)
|
||||
}
|
||||
if !got.snrValid || got.snr != -7.0 {
|
||||
t.Fatalf("snr not decoded: valid=%v val=%v", got.snrValid, got.snr)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,114 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Issue #1290 (MAJOR-2, adversarial review of PR #1624) — tri-state badge.
|
||||
//
|
||||
// The badge surface needs to distinguish three states:
|
||||
// 1. legacy observer (never sent `repeat` field) → unknown → no badge
|
||||
// 2. firmware confirmed `repeat:on` → "Repeater"
|
||||
// 3. firmware confirmed `repeat:off` → "Listener"
|
||||
//
|
||||
// Previously `CanRelay bool` defaulted to false in Go even when the row
|
||||
// was the legacy DEFAULT 1, conflating "confirmed repeater" with
|
||||
// "unknown". This pins the API surface to *bool + JSON omitempty so the
|
||||
// frontend tri-state render works.
|
||||
func TestObservers_CanRelayTriState_Issue1290(t *testing.T) {
|
||||
srv, router := setupTestServer(t)
|
||||
|
||||
// Add the can_relay column (matches dbschema migration) PLUS the
|
||||
// can_relay_seen tracking column so the read layer can distinguish
|
||||
// "ingestor explicitly wrote a value" from "default sentinel".
|
||||
for _, ddl := range []string{
|
||||
`ALTER TABLE observers ADD COLUMN can_relay INTEGER DEFAULT 1`,
|
||||
`ALTER TABLE observers ADD COLUMN can_relay_seen INTEGER DEFAULT 0`,
|
||||
} {
|
||||
if _, err := srv.store.db.conn.Exec(ddl); err != nil {
|
||||
t.Fatalf("alter: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
now := time.Now().UTC().Format(time.RFC3339)
|
||||
// Legacy: never received repeat field. can_relay=DEFAULT 1, seen=0.
|
||||
if _, err := srv.store.db.conn.Exec(
|
||||
`INSERT INTO observers (id, name, iata, last_seen, first_seen, packet_count)
|
||||
VALUES ('legacy-obs', 'Legacy', 'SJC', ?, '2026-01-01T00:00:00Z', 1)`, now); err != nil {
|
||||
t.Fatalf("seed legacy: %v", err)
|
||||
}
|
||||
// Repeater: ingestor wrote can_relay=1, seen=1.
|
||||
if _, err := srv.store.db.conn.Exec(
|
||||
`INSERT INTO observers (id, name, iata, last_seen, first_seen, packet_count, can_relay, can_relay_seen)
|
||||
VALUES ('rep-obs', 'Repeater', 'SFO', ?, '2026-01-01T00:00:00Z', 1, 1, 1)`, now); err != nil {
|
||||
t.Fatalf("seed repeater: %v", err)
|
||||
}
|
||||
// Listener: ingestor wrote can_relay=0, seen=1.
|
||||
if _, err := srv.store.db.conn.Exec(
|
||||
`INSERT INTO observers (id, name, iata, last_seen, first_seen, packet_count, can_relay, can_relay_seen)
|
||||
VALUES ('lst-obs', 'Listener', 'OAK', ?, '2026-01-01T00:00:00Z', 1, 0, 1)`, now); err != nil {
|
||||
t.Fatalf("seed listener: %v", err)
|
||||
}
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/observers?nocache=1", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d (body: %s)", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Observers []map[string]interface{} `json:"observers"`
|
||||
}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil {
|
||||
t.Fatalf("json: %v", err)
|
||||
}
|
||||
|
||||
rows := map[string]map[string]interface{}{}
|
||||
for _, o := range body.Observers {
|
||||
if id, _ := o["id"].(string); id != "" {
|
||||
rows[id] = o
|
||||
}
|
||||
}
|
||||
|
||||
// Legacy: can_relay key must be absent (JSON omitempty for nil *bool).
|
||||
legacy, ok := rows["legacy-obs"]
|
||||
if !ok {
|
||||
ids := make([]string, 0, len(rows))
|
||||
for k := range rows {
|
||||
ids = append(ids, k)
|
||||
}
|
||||
t.Fatalf("legacy-obs missing from response; got ids: %v", ids)
|
||||
}
|
||||
if _, has := legacy["can_relay"]; has {
|
||||
t.Errorf("legacy observer (never sent repeat) should have can_relay omitted (unknown); got can_relay=%v", legacy["can_relay"])
|
||||
}
|
||||
|
||||
// Repeater: can_relay must be true.
|
||||
if v := rows["rep-obs"]["can_relay"]; v != true {
|
||||
t.Errorf("repeater observer: expected can_relay=true, got %v", v)
|
||||
}
|
||||
// Listener: can_relay must be false.
|
||||
if v, has := rows["lst-obs"]["can_relay"]; !has || v != false {
|
||||
t.Errorf("listener observer: expected can_relay=false, got %v (present=%v)", v, has)
|
||||
}
|
||||
|
||||
// And the raw JSON must not contain the legacy observer's can_relay key
|
||||
// (defense against a future ObserverResp change that hardcodes false).
|
||||
raw := w.Body.String()
|
||||
if idx := strings.Index(raw, `"id":"legacy-obs"`); idx >= 0 {
|
||||
// scan its row only — observers are JSON-array-ordered objects.
|
||||
end := strings.Index(raw[idx:], "}")
|
||||
if end > 0 {
|
||||
rowStr := raw[idx : idx+end]
|
||||
if strings.Contains(rowStr, `"can_relay"`) {
|
||||
t.Errorf("legacy observer raw JSON unexpectedly contains can_relay key: %s", rowStr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -42,6 +42,7 @@ func routeDescriptions() map[string]routeMeta {
|
||||
"GET /api/health": {Summary: "Health check", Description: "Returns server health, uptime, and memory stats.", Tag: "admin"},
|
||||
"GET /api/stats": {Summary: "Network statistics", Description: "Returns aggregate stats (node counts, packet counts, observer counts). Cached for 10s.", Tag: "admin"},
|
||||
"GET /api/perf": {Summary: "Performance statistics", Description: "Returns per-endpoint request timing and slow query log.", Tag: "admin"},
|
||||
"GET /api/mqtt/status": {Summary: "MQTT source status", Description: "Returns per-MQTT-source connection state and counters (lastConnectUnix, lastPacketUnix, packetsTotal, etc.). Broker URL passwords are masked. Sourced from the ingestor stats file; empty list when unavailable. (#1043)", Tag: "admin"},
|
||||
"POST /api/perf/reset": {Summary: "Reset performance stats", Tag: "admin", Auth: true},
|
||||
// "POST /api/admin/prune" removed in #1283 (ingestor owns prune).
|
||||
"GET /api/debug/affinity": {Summary: "Debug neighbor affinity scores", Tag: "admin", Auth: true},
|
||||
|
||||
@@ -0,0 +1,208 @@
|
||||
// Package main: openapi completeness gate.
|
||||
//
|
||||
// Phase 1 of issue #1670: enforce that every `/api/*` route registered via
|
||||
// `*.HandleFunc("/api/...", ...)` or `*.Handle("/api/...", ...)` in cmd/server/*.go (non-_test) has a
|
||||
// corresponding entry in the OpenAPI spec map declared in
|
||||
// cmd/server/openapi.go (the `routeDescriptions` map literal).
|
||||
//
|
||||
// Ratchet pattern:
|
||||
// - On first land, the spec covers only a subset of handlers. The full
|
||||
// missing list is "frozen" into cmd/server/openapi_known_gaps.json.
|
||||
// - The test FAILS when a NEW HandleFunc("/api/...") is added without
|
||||
// either (a) adding the route to openapi.go, or (b) appending it to
|
||||
// openapi_known_gaps.json.
|
||||
// - It also FAILS if any entry in openapi_known_gaps.json is now covered
|
||||
// by openapi.go (the allowlist must shrink as Phase 2 backfills land).
|
||||
//
|
||||
// Phase 2 (the actual backfill of ~18 routes into openapi.go) is tracked
|
||||
// in a separate issue per the triage on #1670. This file is the gate
|
||||
// that ensures the gap does not GROW while Phase 2 is in progress.
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"go/ast"
|
||||
"go/parser"
|
||||
"go/token"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
const knownGapsFile = "openapi_known_gaps.json"
|
||||
|
||||
// collectHandlerRoutes parses every non-_test .go file in the current
// directory and gathers the string-literal first argument of each
// `*.HandleFunc(...)` or `*.Handle(...)` call whose value begins with "/api/".
//
// Both selector names matter: bare handlers are registered with
// `r.HandleFunc("/api/...", fn)`, while handlers wrapped in auth middleware
// use `r.Handle("/api/...", wrapped).Methods("...")`. Matching only one form
// would let the chained registrations slip past the completeness ratchet.
//
// The result maps each route to the source position of its first sighting.
// Only literal arguments are recognized; routes built from expressions are
// not collected. A directory read or parse failure aborts the test.
func collectHandlerRoutes(t *testing.T) map[string]string {
	t.Helper()

	dirEntries, err := os.ReadDir(".")
	if err != nil {
		t.Fatalf("read cmd/server dir: %v", err)
	}

	routes := make(map[string]string) // route -> position of first registration
	fset := token.NewFileSet()

	// record inspects one AST node and, when it is a qualifying registration
	// call, stores its route. It always returns true so the walk keeps
	// descending (chained calls nest the registration inside another call).
	record := func(node ast.Node) bool {
		call, isCall := node.(*ast.CallExpr)
		if !isCall || len(call.Args) == 0 {
			return true
		}
		sel, isSel := call.Fun.(*ast.SelectorExpr)
		if !isSel || sel.Sel == nil {
			return true
		}
		switch sel.Sel.Name {
		case "HandleFunc", "Handle":
		default:
			return true
		}
		lit, isLit := call.Args[0].(*ast.BasicLit)
		if !isLit || lit.Kind != token.STRING {
			return true
		}
		route, unquoteErr := strconv.Unquote(lit.Value)
		if unquoteErr != nil || !strings.HasPrefix(route, "/api/") {
			return true
		}
		if _, seen := routes[route]; !seen {
			routes[route] = fset.Position(lit.Pos()).String()
		}
		return true
	}

	for _, entry := range dirEntries {
		name := entry.Name()
		if entry.IsDir() || !strings.HasSuffix(name, ".go") || strings.HasSuffix(name, "_test.go") {
			continue
		}
		file, parseErr := parser.ParseFile(fset, name, nil, parser.AllErrors)
		if parseErr != nil {
			t.Fatalf("parse %s: %v", name, parseErr)
		}
		ast.Inspect(file, record)
	}
	return routes
}
|
||||
|
||||
// strconvUnquote returns the value denoted by the source text of a Go string
// literal.
//
// Input wrapped in double quotes or in backquotes is decoded with
// strconv.Unquote, so escape sequences in interpreted literals (\n, \t, \",
// \u00e9, ...) are resolved rather than merely having the outer quotes
// sliced off. A malformed literal yields the strconv error and an empty
// string. Anything that is not wrapped in a matching pair of `"` or
// backquotes (including inputs shorter than two bytes) is returned unchanged
// with a nil error.
func strconvUnquote(s string) (string, error) {
	if len(s) < 2 {
		return s, nil
	}
	open, closing := s[0], s[len(s)-1]
	if open != closing || (open != '"' && open != '`') {
		// Not a string literal: pass through untouched.
		return s, nil
	}
	value, err := strconv.Unquote(s)
	if err != nil {
		return "", err
	}
	return value, nil
}
|
||||
|
||||
// collectSpecRoutes returns the set of "/api/..." paths declared in the
|
||||
// routeDescriptions() map in openapi.go. Keys are "METHOD /path"; we strip
|
||||
// the method and take just the path.
|
||||
func collectSpecRoutes(t *testing.T) map[string]bool {
|
||||
t.Helper()
|
||||
out := map[string]bool{}
|
||||
for k := range routeDescriptions() {
|
||||
// key shape: "GET /api/foo" — split once on space.
|
||||
idx := strings.IndexByte(k, ' ')
|
||||
if idx < 0 {
|
||||
continue
|
||||
}
|
||||
path := k[idx+1:]
|
||||
if strings.HasPrefix(path, "/api/") {
|
||||
out[path] = true
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// loadKnownGaps returns the allowlist of currently-known-missing routes.
|
||||
// Missing file is treated as an empty allowlist (the initial RED state).
|
||||
func loadKnownGaps(t *testing.T) map[string]bool {
|
||||
t.Helper()
|
||||
out := map[string]bool{}
|
||||
b, err := os.ReadFile(knownGapsFile)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return out
|
||||
}
|
||||
t.Fatalf("read %s: %v", knownGapsFile, err)
|
||||
}
|
||||
var payload struct {
|
||||
Routes []string `json:"routes"`
|
||||
}
|
||||
if err := json.Unmarshal(b, &payload); err != nil {
|
||||
t.Fatalf("parse %s: %v", knownGapsFile, err)
|
||||
}
|
||||
for _, r := range payload.Routes {
|
||||
out[r] = true
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// TestOpenAPICompleteness is the ratchet gate for issue #1670.
|
||||
func TestOpenAPICompleteness(t *testing.T) {
|
||||
handlers := collectHandlerRoutes(t)
|
||||
spec := collectSpecRoutes(t)
|
||||
gaps := loadKnownGaps(t)
|
||||
|
||||
// 1. Find routes registered via HandleFunc but missing from spec AND
|
||||
// not in the allowlist — these are new regressions.
|
||||
var newMissing []string
|
||||
for route := range handlers {
|
||||
if spec[route] {
|
||||
continue
|
||||
}
|
||||
if gaps[route] {
|
||||
continue
|
||||
}
|
||||
newMissing = append(newMissing, route)
|
||||
}
|
||||
sort.Strings(newMissing)
|
||||
|
||||
// 2. Find allowlist entries that are now covered by the spec — the
|
||||
// allowlist must shrink, not stay stale.
|
||||
var stale []string
|
||||
for route := range gaps {
|
||||
if spec[route] {
|
||||
stale = append(stale, route)
|
||||
}
|
||||
}
|
||||
sort.Strings(stale)
|
||||
|
||||
// 3. (Diagnostic only) Total current gap count, for visibility.
|
||||
var currentGaps []string
|
||||
for route := range handlers {
|
||||
if !spec[route] {
|
||||
currentGaps = append(currentGaps, route)
|
||||
}
|
||||
}
|
||||
sort.Strings(currentGaps)
|
||||
t.Logf("openapi spec covers %d/%d /api/ handler routes; %d in allowlist; %d total gaps remain",
|
||||
len(handlers)-len(currentGaps), len(handlers), len(gaps), len(currentGaps))
|
||||
|
||||
if len(newMissing) > 0 {
|
||||
t.Errorf("\n%d /api/ route(s) registered in cmd/server but NOT in openapi.go spec AND NOT in %s:\n - %s\n\nFix one of:\n a) Add the route to routeDescriptions() in cmd/server/openapi.go (preferred — Phase 2 of #1670)\n b) Append the route to cmd/server/%s (ratchet — only if Phase 2 backfill is genuinely deferred)\n",
|
||||
len(newMissing), knownGapsFile, strings.Join(newMissing, "\n - "), knownGapsFile)
|
||||
}
|
||||
|
||||
if len(stale) > 0 {
|
||||
t.Errorf("\n%d route(s) in %s are now covered by openapi.go and must be REMOVED from the allowlist (ratchet must shrink):\n - %s\n",
|
||||
len(stale), knownGapsFile, strings.Join(stale, "\n - "))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"_comment": "Allowlist of /api/ routes registered via HandleFunc in cmd/server/ that are NOT yet documented in cmd/server/openapi.go. This is the 'ratchet' baseline for issue #1670 Phase 1: the TestOpenAPICompleteness gate fails when a NEW handler is added without either documenting it in openapi.go OR appending it here. Phase 2 (the actual backfill of these routes into openapi.go) is tracked in a separate issue per the #1670 triage. Entries should be REMOVED as Phase 2 lands docs for each route — the gate also fails if an entry here is already covered by openapi.go (stale allowlist).",
|
||||
"_issue": "https://github.com/Kpa-clawbot/CoreScope/issues/1670",
|
||||
"routes": [
|
||||
"/api/admin/prune-geo-filter",
|
||||
"/api/admin/prune-geo-filter/status",
|
||||
"/api/analytics/relay-airtime-share",
|
||||
"/api/analytics/roles",
|
||||
"/api/config/areas",
|
||||
"/api/config/areas/polygons",
|
||||
"/api/docs",
|
||||
"/api/dropped-packets",
|
||||
"/api/healthz",
|
||||
"/api/known-channels",
|
||||
"/api/nodes/clock-skew",
|
||||
"/api/nodes/{pubkey}/battery",
|
||||
"/api/nodes/{pubkey}/clock-skew",
|
||||
"/api/nodes/{pubkey}/reach",
|
||||
"/api/observers/clock-skew",
|
||||
"/api/paths/inspect",
|
||||
"/api/perf/io",
|
||||
"/api/perf/sqlite",
|
||||
"/api/perf/write-sources",
|
||||
"/api/scope-stats",
|
||||
"/api/spec"
|
||||
]
|
||||
}
|
||||
@@ -146,7 +146,17 @@ type parityEndpoint struct {
|
||||
|
||||
func TestParityShapes(t *testing.T) {
|
||||
shapes := loadShapes(t)
|
||||
_, router := setupTestServer(t)
|
||||
srv, router := setupTestServer(t)
|
||||
// #1011: lazy distance index — pre-warm before parity shape
|
||||
// validation expects 200.
|
||||
srv.store.TriggerDistanceIndexBuild()
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for !srv.store.DistanceIndexBuilt() {
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatal("distance index did not finish building within 5s")
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
|
||||
endpoints := []parityEndpoint{
|
||||
{"stats", "/api/stats"},
|
||||
|
||||
@@ -297,6 +297,41 @@ type IngestorStats struct {
|
||||
// ProcIO is the ingestor's own /proc/self/io rates (since its previous
|
||||
// sample). Optional — older ingestor builds don't publish this. See #1120.
|
||||
ProcIO *PerfIOSample `json:"procIO,omitempty"`
|
||||
// WriterPerf is the per-component SQLite writer-lock latency
|
||||
// snapshot (#1340). Optional — older ingestor builds don't
|
||||
// publish this. Surfaced under .writer_perf by
|
||||
// handlePerfWriteSources.
|
||||
WriterPerf map[string]WriterStatsSnapshot `json:"writer_perf,omitempty"`
|
||||
// SourceLiveness (PR #1609 M1) is the per-MQTT-source two-clock
|
||||
// snapshot: lastReceiptUnix (broker liveness, stamped at receipt)
|
||||
// vs lastMessageUnix (write-path liveness, stamped post-write).
|
||||
// Surfaced by /api/healthz under .ingest_liveness so operators can
|
||||
// distinguish "broker alive, write path stuck" from "everything
|
||||
// stalled". Optional — older ingestor builds don't publish this.
|
||||
SourceLiveness map[string]SourceLivenessSnapshot `json:"source_liveness,omitempty"`
|
||||
}
|
||||
|
||||
// SourceLivenessSnapshot is the server-side mirror of the liveness pair the
// ingestor publishes for each MQTT source (PR #1609 M1). Both fields hold
// unix seconds, with 0 meaning "never".
type SourceLivenessSnapshot struct {
	// LastReceiptUnix is the broker-liveness clock, stamped at receipt.
	LastReceiptUnix int64 `json:"lastReceiptUnix"`
	// LastMessageUnix is the write-path-liveness clock, stamped post-write.
	LastMessageUnix int64 `json:"lastMessageUnix"`
}
|
||||
|
||||
// WriterStatsSnapshot is the server-side mirror of the per-component SQLite
// writer-lock latency snapshot the ingestor publishes (#1340). Decoding into
// this type keeps the JSON contract stable across the two processes. All
// time values are milliseconds.
type WriterStatsSnapshot struct {
	// Count and ContentionTotal are counters published by the ingestor.
	// NOTE(review): presumably lock acquisitions and contended acquisitions
	// respectively — confirm against the ingestor side.
	Count           int64 `json:"count"`
	ContentionTotal int64 `json:"contention_total"`

	// Wait-time figures (p50 / p95 / p99 / max), in milliseconds.
	WaitMsP50 float64 `json:"wait_ms_p50"`
	WaitMsP95 float64 `json:"wait_ms_p95"`
	WaitMsP99 float64 `json:"wait_ms_p99"`
	WaitMsMax float64 `json:"wait_ms_max"`

	// Hold-time figures (p50 / p95 / p99 / max), in milliseconds.
	HoldMsP50 float64 `json:"hold_ms_p50"`
	HoldMsP95 float64 `json:"hold_ms_p95"`
	HoldMsP99 float64 `json:"hold_ms_p99"`
	HoldMsMax float64 `json:"hold_ms_max"`
}
|
||||
|
||||
// IngestorStatsPath is the well-known location where the ingestor writes its
|
||||
@@ -308,6 +343,111 @@ func IngestorStatsPath() string {
|
||||
return "/tmp/corescope-ingestor-stats.json"
|
||||
}
|
||||
|
||||
// readIngestorSourceLiveness returns the per-source receipt/write-path
|
||||
// liveness map from the ingestor stats file, or nil on any error / older
|
||||
// ingestor that doesn't publish the field. PR #1609 M1 — surfaced by
|
||||
// /api/healthz under .ingest_liveness so operators can spot "broker
|
||||
// alive, write path stuck".
|
||||
//
|
||||
// /healthz is a hot path (LB / k8s / uptime monitors), so the result
|
||||
// is memoized with a short TTL (sourceLivenessCacheTTL) and refreshed
|
||||
// whenever the underlying file mtime changes (PR #1623 round-1
|
||||
// finding 4). The lock is held briefly; the costly Unmarshal happens
|
||||
// at most once per refresh window.
|
||||
func readIngestorSourceLiveness() map[string]SourceLivenessSnapshot {
|
||||
path := IngestorStatsPath()
|
||||
now := time.Now()
|
||||
|
||||
sourceLivenessCache.mu.RLock()
|
||||
if sourceLivenessCache.path == path &&
|
||||
now.Sub(sourceLivenessCache.cachedAt) < sourceLivenessCacheTTL {
|
||||
// Cheap mtime probe: if the file moved since we cached, fall
|
||||
// through to the refresh path. Stat is cheap relative to
|
||||
// ReadFile+Unmarshal.
|
||||
info, err := os.Stat(path)
|
||||
fresh := err == nil && info.ModTime().Equal(sourceLivenessCache.mtime)
|
||||
if fresh || (err != nil && sourceLivenessCache.mtime.IsZero()) {
|
||||
out := sourceLivenessCache.value
|
||||
sourceLivenessCache.mu.RUnlock()
|
||||
return out
|
||||
}
|
||||
}
|
||||
sourceLivenessCache.mu.RUnlock()
|
||||
|
||||
sourceLivenessCache.mu.Lock()
|
||||
defer sourceLivenessCache.mu.Unlock()
|
||||
// Re-check under the write lock — another goroutine may have just
|
||||
// refreshed.
|
||||
if sourceLivenessCache.path == path &&
|
||||
time.Since(sourceLivenessCache.cachedAt) < sourceLivenessCacheTTL {
|
||||
info, err := os.Stat(path)
|
||||
fresh := err == nil && info.ModTime().Equal(sourceLivenessCache.mtime)
|
||||
if fresh || (err != nil && sourceLivenessCache.mtime.IsZero()) {
|
||||
return sourceLivenessCache.value
|
||||
}
|
||||
}
|
||||
|
||||
data, err := sourceLivenessReadFile(path)
|
||||
if err != nil {
|
||||
// Cache the negative result too, so a missing file doesn't
|
||||
// hammer the disk under /healthz pressure.
|
||||
sourceLivenessCache.path = path
|
||||
sourceLivenessCache.value = nil
|
||||
sourceLivenessCache.cachedAt = now
|
||||
sourceLivenessCache.mtime = time.Time{}
|
||||
return nil
|
||||
}
|
||||
var st IngestorStats
|
||||
if err := json.Unmarshal(data, &st); err != nil {
|
||||
sourceLivenessCache.path = path
|
||||
sourceLivenessCache.value = nil
|
||||
sourceLivenessCache.cachedAt = now
|
||||
sourceLivenessCache.mtime = time.Time{}
|
||||
return nil
|
||||
}
|
||||
sourceLivenessCache.path = path
|
||||
sourceLivenessCache.value = st.SourceLiveness
|
||||
sourceLivenessCache.cachedAt = now
|
||||
if info, err := os.Stat(path); err == nil {
|
||||
sourceLivenessCache.mtime = info.ModTime()
|
||||
} else {
|
||||
sourceLivenessCache.mtime = time.Time{}
|
||||
}
|
||||
return st.SourceLiveness
|
||||
}
|
||||
|
||||
// sourceLivenessReadFile is the file-reader used by
|
||||
// readIngestorSourceLiveness. Swappable for tests so call counts can
|
||||
// be asserted (PR #1623 round-1 finding 4 TTL cache test).
|
||||
var sourceLivenessReadFile = os.ReadFile
|
||||
|
||||
// sourceLivenessCacheTTL caps how long a parsed liveness map is reused
|
||||
// across /healthz probes. 1s is short enough that operators see stale
|
||||
// data only briefly during incidents, but long enough to coalesce
|
||||
// hundreds of probes/sec from LBs.
|
||||
var sourceLivenessCacheTTL = time.Second
|
||||
|
||||
// sourceLivenessCache memoizes the parsed liveness map keyed by file
|
||||
// path + mtime. See readIngestorSourceLiveness.
|
||||
var sourceLivenessCache struct {
|
||||
mu sync.RWMutex
|
||||
path string
|
||||
value map[string]SourceLivenessSnapshot
|
||||
cachedAt time.Time
|
||||
mtime time.Time
|
||||
}
|
||||
|
||||
// resetSourceLivenessCache clears the memo. Test-only helper; callable
|
||||
// from production code is harmless (next call just re-reads).
|
||||
func resetSourceLivenessCache() {
|
||||
sourceLivenessCache.mu.Lock()
|
||||
defer sourceLivenessCache.mu.Unlock()
|
||||
sourceLivenessCache.path = ""
|
||||
sourceLivenessCache.value = nil
|
||||
sourceLivenessCache.cachedAt = time.Time{}
|
||||
sourceLivenessCache.mtime = time.Time{}
|
||||
}
|
||||
|
||||
// handlePerfWriteSources reads the ingestor's stats file and returns a flat
|
||||
// map of source-name -> counter, plus the sample timestamp.
|
||||
func (s *Server) handlePerfWriteSources(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -342,5 +482,14 @@ func (s *Server) handlePerfWriteSources(w http.ResponseWriter, r *http.Request)
|
||||
}
|
||||
out["sources"] = sources
|
||||
out["sampleAt"] = st.SampledAt
|
||||
// Surface per-component SQLite writer-lock latency histograms
|
||||
// (#1340) under .writer_perf so operators can see when a
|
||||
// component (e.g. neighbor_builder) is starving the writer.
|
||||
// Empty map when the ingestor is too old to publish this field.
|
||||
if len(st.WriterPerf) > 0 {
|
||||
out["writer_perf"] = st.WriterPerf
|
||||
} else {
|
||||
out["writer_perf"] = map[string]WriterStatsSnapshot{}
|
||||
}
|
||||
writeJSON(w, out)
|
||||
}
|
||||
|
||||
@@ -0,0 +1,93 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestReadIngestorSourceLiveness_CachesWithinTTL guards the /healthz
|
||||
// hot-path TTL cache (PR #1623 round-1 finding 4): readIngestorSourceLiveness
|
||||
// is called per /healthz probe (LB / k8s / uptime monitors), and every
|
||||
// call re-reads + re-unmarshals the entire IngestorStats JSON. Within
|
||||
// the TTL window the function MUST hit a cached parse and avoid the
|
||||
// re-read.
|
||||
func TestReadIngestorSourceLiveness_CachesWithinTTL(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
statsPath := filepath.Join(dir, "ingestor-stats.json")
|
||||
stub := `{
|
||||
"sampledAt": "2026-06-07T00:00:00Z",
|
||||
"source_liveness": {
|
||||
"mqtt-broker-a": {"lastReceiptUnix": 1717000000, "lastMessageUnix": 1716999990}
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(statsPath, []byte(stub), 0o600); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Setenv("CORESCOPE_INGESTOR_STATS", statsPath)
|
||||
|
||||
// Swap the read function to a counting wrapper.
|
||||
var calls atomic.Int64
|
||||
prev := sourceLivenessReadFile
|
||||
sourceLivenessReadFile = func(p string) ([]byte, error) {
|
||||
calls.Add(1)
|
||||
return os.ReadFile(p)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
sourceLivenessReadFile = prev
|
||||
resetSourceLivenessCache()
|
||||
})
|
||||
resetSourceLivenessCache()
|
||||
|
||||
// 5 sequential calls within <1s — the cache TTL window.
|
||||
start := time.Now()
|
||||
for i := 0; i < 5; i++ {
|
||||
got := readIngestorSourceLiveness()
|
||||
if _, ok := got["mqtt-broker-a"]; !ok {
|
||||
t.Fatalf("call %d: expected mqtt-broker-a in liveness map, got %+v", i, got)
|
||||
}
|
||||
}
|
||||
elapsed := time.Since(start)
|
||||
if elapsed > 800*time.Millisecond {
|
||||
t.Fatalf("loop took %s — too slow for a TTL-cache assertion (should be sub-second)", elapsed)
|
||||
}
|
||||
if got := calls.Load(); got != 1 {
|
||||
t.Fatalf("expected 1 os.ReadFile call across 5 readIngestorSourceLiveness() calls within TTL, got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReadIngestorSourceLiveness_InvalidatesOnMTimeChange guards the
|
||||
// other half of the cache contract: when the underlying stats file
|
||||
// changes (mtime moves), the cache MUST refresh on the next call.
|
||||
func TestReadIngestorSourceLiveness_InvalidatesOnMTimeChange(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
statsPath := filepath.Join(dir, "ingestor-stats.json")
|
||||
stubA := `{"source_liveness": {"a": {"lastReceiptUnix": 1, "lastMessageUnix": 1}}}`
|
||||
stubB := `{"source_liveness": {"b": {"lastReceiptUnix": 2, "lastMessageUnix": 2}}}`
|
||||
if err := os.WriteFile(statsPath, []byte(stubA), 0o600); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Setenv("CORESCOPE_INGESTOR_STATS", statsPath)
|
||||
|
||||
t.Cleanup(resetSourceLivenessCache)
|
||||
resetSourceLivenessCache()
|
||||
|
||||
got := readIngestorSourceLiveness()
|
||||
if _, ok := got["a"]; !ok {
|
||||
t.Fatalf("first call: expected key 'a', got %+v", got)
|
||||
}
|
||||
// Bump mtime forward to guarantee the cache notices.
|
||||
future := time.Now().Add(2 * time.Second)
|
||||
if err := os.WriteFile(statsPath, []byte(stubB), 0o600); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.Chtimes(statsPath, future, future); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
got = readIngestorSourceLiveness()
|
||||
if _, ok := got["b"]; !ok {
|
||||
t.Fatalf("after mtime change: expected key 'b', got %+v", got)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
package main
|
||||
|
||||
// Regression tests for the three MAJOR findings on PR #1589.
|
||||
// These tests gate three semantic regressions that the rest of the PR's tests
|
||||
// did not catch:
|
||||
//
|
||||
// MAJOR-1: handleAnalyticsSubpaths default limit was silently halved 100→50
|
||||
// when migrated to queryLimit(r, 50, ...AnalyticsMax).
|
||||
// MAJOR-2: handleChannelMessages default limit was silently halved 100→50
|
||||
// when migrated to queryLimit(r, 50, ...ChannelMessagesMax).
|
||||
// MAJOR-3: handleBulkHealth was bundled into NodesMax (default 2000),
|
||||
// 10× its previous ceiling of 200, despite being per-row heavier.
|
||||
//
|
||||
// For MAJOR-1/2 we assert on the literal call-site `def` value via source
|
||||
// inspection because the rendered response does not expose the applied limit.
|
||||
// For MAJOR-3 we assert both the config-defaults plumbing AND the runtime
|
||||
// behavior: BulkHealthMax must exist as its own field with default 200, and
|
||||
// handleBulkHealth must clamp through it (not NodesMax).
|
||||
|
||||
import (
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestPR1589_AnalyticsSubpathsDefaultIs100 is the MAJOR-1 regression guard.
// It reads routes.go from the working directory and requires the literal
// call queryLimit(r, 100, s.cfg.ListLimits.AnalyticsMax) to appear in it;
// the applied limit is not visible in the rendered response, hence the
// source inspection.
func TestPR1589_AnalyticsSubpathsDefaultIs100(t *testing.T) {
	const wantCall = "queryLimit(r, 100, s.cfg.ListLimits.AnalyticsMax)"

	routesSrc, err := os.ReadFile("routes.go")
	if err != nil {
		t.Fatalf("read routes.go: %v", err)
	}
	if strings.Contains(string(routesSrc), wantCall) {
		return
	}
	t.Error("handleAnalyticsSubpaths must use def=100 in queryLimit; " +
		"PR #1589 inadvertently halved the default to 50 (MAJOR-1)")
}
|
||||
|
||||
// TestPR1589_ChannelMessagesDefaultIs100 is the MAJOR-2 regression guard.
// It reads routes.go from the working directory and requires the literal
// call queryLimit(r, 100, s.cfg.ListLimits.ChannelMessagesMax) to appear in
// it; the applied limit is not visible in the rendered response, hence the
// source inspection.
func TestPR1589_ChannelMessagesDefaultIs100(t *testing.T) {
	const wantCall = "queryLimit(r, 100, s.cfg.ListLimits.ChannelMessagesMax)"

	routesSrc, err := os.ReadFile("routes.go")
	if err != nil {
		t.Fatalf("read routes.go: %v", err)
	}
	if strings.Contains(string(routesSrc), wantCall) {
		return
	}
	t.Error("handleChannelMessages must use def=100 in queryLimit; " +
		"PR #1589 inadvertently halved the default to 50 (MAJOR-2)")
}
|
||||
|
||||
func TestPR1589_BulkHealthMaxDefaultsTo200(t *testing.T) {
|
||||
// MAJOR-3 (config plumbing): a dedicated BulkHealthMax must exist with
|
||||
// default 200 — bulk-health is per-row much heavier than /api/nodes,
|
||||
// so it cannot inherit NodesMax (default 2000).
|
||||
dir := t.TempDir()
|
||||
os.WriteFile(dir+"/config.json", []byte(`{"port":3000}`), 0644)
|
||||
cfg, err := LoadConfig(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadConfig: %v", err)
|
||||
}
|
||||
if cfg.ListLimits.BulkHealthMax != 200 {
|
||||
t.Errorf("expected BulkHealthMax default 200, got %d", cfg.ListLimits.BulkHealthMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPR1589_BulkHealthClampsViaBulkHealthMax(t *testing.T) {
|
||||
// MAJOR-3 (runtime wiring): /api/nodes/bulk-health must clamp the limit
|
||||
// through BulkHealthMax — not NodesMax. We set BulkHealthMax=1 and
|
||||
// NodesMax=9999; if the handler still uses NodesMax the seed data (3
|
||||
// nodes) will all come back. If wired correctly it must clamp to 1.
|
||||
srv, router := setupTestServer(t)
|
||||
srv.cfg.ListLimits = &ListLimitsConfig{
|
||||
PacketsMax: 10000,
|
||||
NodesMax: 9999,
|
||||
AnalyticsMax: 200,
|
||||
ChannelMessagesMax: 500,
|
||||
BulkHealthMax: 1,
|
||||
}
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/nodes/bulk-health?limit=500", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != 200 {
|
||||
t.Fatalf("expected 200, got %d body=%s", w.Code, w.Body.String())
|
||||
}
|
||||
// Response is a top-level JSON array (filtered or unfiltered).
|
||||
body := strings.TrimSpace(w.Body.String())
|
||||
if !strings.HasPrefix(body, "[") {
|
||||
t.Fatalf("expected JSON array response, got: %s", body)
|
||||
}
|
||||
// Count top-level objects via "public_key" occurrences (each row has one).
|
||||
rowCount := strings.Count(body, `"public_key"`)
|
||||
if rowCount > 1 {
|
||||
t.Errorf("BulkHealthMax=1 should clamp to 1 row, got %d rows; "+
|
||||
"handler is likely still using NodesMax (MAJOR-3): %s", rowCount, body)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,187 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"time"
|
||||
)
|
||||
|
||||
// relay_airtime_share.go — issue #1359
|
||||
//
|
||||
// Implements the "Relay Airtime Share" analytics metric:
|
||||
// score(packet) = payload_bytes × COUNT(DISTINCT repeater_pubkey
|
||||
// across all observations of that packet)
|
||||
//
|
||||
// Aggregated by payload_type. Originator TX is deliberately excluded — a
|
||||
// never-relayed direct message scores 0, which is the correct framing for a
|
||||
// "relay amplification" metric.
|
||||
//
|
||||
// In-memory only; no SQL, no new index, no schema change. The resolved-pubkey
|
||||
// reverse index (populated under s.mu via addToResolvedPubkeyIndex from every
|
||||
// observation's resolved_path) is the source of distinct relays per
|
||||
// transmission — len(resolvedPubkeyReverse[tx.ID]) IS the union of distinct
|
||||
// repeater pubkeys, deduplicated cross-observation. Critical: this is NOT the
|
||||
// length of any single observation's resolved_path (the bug-trap from
|
||||
// #1358's follow-up SQL hint).
|
||||
|
||||
// distinctRelayCount returns the number of distinct repeater pubkeys that
|
||||
// forwarded `tx`, unioned across ALL observations of that transmission_id.
|
||||
//
|
||||
// Source: the resolved-pubkey reverse index — populated by
|
||||
// indexResolvedPathHops / addToResolvedPubkeyIndex from every observation's
|
||||
// resolved_path. Each entry is one distinct pubkey hash for THIS tx (the
|
||||
// indexer dedups (hash, txID) pairs before appending).
|
||||
//
|
||||
// Caller MUST hold s.mu at least RLock.
|
||||
func (s *PacketStore) distinctRelayCount(tx *StoreTx) int {
|
||||
if tx == nil || !s.useResolvedPathIndex {
|
||||
return 0
|
||||
}
|
||||
return len(s.resolvedPubkeyReverse[tx.ID])
|
||||
}
|
||||
|
||||
// computeRelayAirtimeShare aggregates relay-airtime-share per payload_type.
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// {
|
||||
// "rows": [{payload_type, type, count, count_pct, score, airtime_pct}, ...] sorted by airtime_pct desc,
|
||||
// "total_count": int,
|
||||
// "total_score": int,
|
||||
// "window": window label,
|
||||
// "cached": false (overwritten by cached wrapper),
|
||||
// }
|
||||
func (s *PacketStore) computeRelayAirtimeShare(window TimeWindow) map[string]interface{} {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
|
||||
ptNames := payloadTypeNames
|
||||
|
||||
type bucket struct {
|
||||
count int
|
||||
score int
|
||||
}
|
||||
buckets := make(map[int]*bucket)
|
||||
seenHash := make(map[string]bool, len(s.packets))
|
||||
totalCount := 0
|
||||
totalScore := 0
|
||||
|
||||
for _, tx := range s.packets {
|
||||
if tx == nil || tx.PayloadType == nil {
|
||||
continue
|
||||
}
|
||||
if !window.Includes(tx.FirstSeen) {
|
||||
continue
|
||||
}
|
||||
// Dedup per-hash: each distinct packet counted once. ACKs in the
|
||||
// test fixture have unique hashes so this only collapses true
|
||||
// re-observations of the same packet.
|
||||
if tx.Hash != "" {
|
||||
if seenHash[tx.Hash] {
|
||||
continue
|
||||
}
|
||||
seenHash[tx.Hash] = true
|
||||
}
|
||||
pt := *tx.PayloadType
|
||||
b := buckets[pt]
|
||||
if b == nil {
|
||||
b = &bucket{}
|
||||
buckets[pt] = b
|
||||
}
|
||||
b.count++
|
||||
totalCount++
|
||||
|
||||
// payload bytes from RawHex (2 hex chars per byte).
|
||||
payloadBytes := len(tx.RawHex) / 2
|
||||
relays := s.distinctRelayCount(tx)
|
||||
score := payloadBytes * relays
|
||||
b.score += score
|
||||
totalScore += score
|
||||
}
|
||||
|
||||
rows := make([]map[string]interface{}, 0, len(buckets))
|
||||
for pt, b := range buckets {
|
||||
name := ptNames[pt]
|
||||
if name == "" {
|
||||
name = "UNK"
|
||||
}
|
||||
var countPct, airtimePct float64
|
||||
if totalCount > 0 {
|
||||
countPct = float64(b.count) / float64(totalCount) * 100.0
|
||||
}
|
||||
if totalScore > 0 {
|
||||
airtimePct = float64(b.score) / float64(totalScore) * 100.0
|
||||
}
|
||||
rows = append(rows, map[string]interface{}{
|
||||
"payload_type": name,
|
||||
"type": pt,
|
||||
"count": b.count,
|
||||
"count_pct": countPct,
|
||||
"score": b.score,
|
||||
"airtime_pct": airtimePct,
|
||||
})
|
||||
}
|
||||
|
||||
// Sort descending by airtime_pct; tiebreak count desc, then name asc
|
||||
// for deterministic ordering.
|
||||
sort.SliceStable(rows, func(i, j int) bool {
|
||||
ai, _ := rows[i]["airtime_pct"].(float64)
|
||||
aj, _ := rows[j]["airtime_pct"].(float64)
|
||||
if ai != aj {
|
||||
return ai > aj
|
||||
}
|
||||
ci, _ := rows[i]["count"].(int)
|
||||
cj, _ := rows[j]["count"].(int)
|
||||
if ci != cj {
|
||||
return ci > cj
|
||||
}
|
||||
ni, _ := rows[i]["payload_type"].(string)
|
||||
nj, _ := rows[j]["payload_type"].(string)
|
||||
return ni < nj
|
||||
})
|
||||
|
||||
label := ""
|
||||
if !window.IsZero() {
|
||||
label = window.Label
|
||||
}
|
||||
return map[string]interface{}{
|
||||
"rows": rows,
|
||||
"total_count": totalCount,
|
||||
"total_score": totalScore,
|
||||
"window": label,
|
||||
"cached": false,
|
||||
}
|
||||
}
|
||||
|
||||
// GetRelayAirtimeShareWithWindow is the cached wrapper around
|
||||
// computeRelayAirtimeShare. Reuses the existing rfCache + rfCacheTTL pool
|
||||
// (shared with RF / topology / distance analytics — no new cache layer per
|
||||
// #1359 spec).
|
||||
func (s *PacketStore) GetRelayAirtimeShareWithWindow(window TimeWindow) map[string]interface{} {
|
||||
cacheKey := "relay-airtime-share|"
|
||||
if !window.IsZero() {
|
||||
cacheKey += window.CacheKey()
|
||||
}
|
||||
s.cacheMu.Lock()
|
||||
if cached, ok := s.rfCache[cacheKey]; ok && time.Now().Before(cached.expiresAt) {
|
||||
s.cacheHits++
|
||||
s.cacheMu.Unlock()
|
||||
// Shallow copy with cached=true so the JSON client can tell.
|
||||
m := cached.data
|
||||
out := make(map[string]interface{}, len(m)+1)
|
||||
for k, v := range m {
|
||||
out[k] = v
|
||||
}
|
||||
out["cached"] = true
|
||||
return out
|
||||
}
|
||||
s.cacheMisses++
|
||||
s.cacheMu.Unlock()
|
||||
|
||||
result := s.computeRelayAirtimeShare(window)
|
||||
|
||||
s.cacheMu.Lock()
|
||||
s.rfCache[cacheKey] = &cachedResult{data: result, expiresAt: time.Now().Add(s.rfCacheTTL)}
|
||||
s.cacheMu.Unlock()
|
||||
|
||||
return result
|
||||
}
|
||||
@@ -0,0 +1,185 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// newRelayAirtimeShareTestStore builds a minimal PacketStore for testing
|
||||
// computeRelayAirtimeShare without any DB or background workers.
|
||||
func newRelayAirtimeShareTestStore(packets []*StoreTx) *PacketStore {
|
||||
ps := &PacketStore{
|
||||
packets: packets,
|
||||
byHash: make(map[string]*StoreTx),
|
||||
byTxID: make(map[int]*StoreTx),
|
||||
byObsID: make(map[int]*StoreObs),
|
||||
byObserver: make(map[string][]*StoreObs),
|
||||
byNode: make(map[string][]*StoreTx),
|
||||
byPathHop: make(map[string][]*StoreTx),
|
||||
nodeHashes: make(map[string]map[string]bool),
|
||||
byPayloadType: make(map[int][]*StoreTx),
|
||||
rfCache: make(map[string]*cachedResult),
|
||||
topoCache: make(map[string]*cachedResult),
|
||||
hashCache: make(map[string]*cachedResult),
|
||||
collisionCache: make(map[string]*cachedResult),
|
||||
chanCache: make(map[string]*cachedResult),
|
||||
distCache: make(map[string]*cachedResult),
|
||||
subpathCache: make(map[string]*cachedResult),
|
||||
spIndex: make(map[string]int),
|
||||
spTxIndex: make(map[string][]*StoreTx),
|
||||
advertPubkeys: make(map[string]int),
|
||||
}
|
||||
ps.useResolvedPathIndex = true
|
||||
ps.initResolvedPathIndex()
|
||||
for _, tx := range packets {
|
||||
ps.byTxID[tx.ID] = tx
|
||||
if tx.Hash != "" {
|
||||
ps.byHash[tx.Hash] = tx
|
||||
}
|
||||
if tx.PayloadType != nil {
|
||||
pt := *tx.PayloadType
|
||||
ps.byPayloadType[pt] = append(ps.byPayloadType[pt], tx)
|
||||
}
|
||||
}
|
||||
return ps
|
||||
}
|
||||
|
||||
// makeRelayAirtimeTx builds a synthetic transmission with rawHex sized for the
|
||||
// given byte count and registers `distinctRelays` synthetic resolved-path
|
||||
// pubkeys via the resolved-pubkey reverse index — same source that
|
||||
// distinctRelayCount must read from.
|
||||
func makeRelayAirtimeTx(id int, payloadType int, payloadBytes int, distinctRelays int, hashPrefix string) *StoreTx {
|
||||
pt := payloadType
|
||||
tx := &StoreTx{
|
||||
ID: id,
|
||||
Hash: hashPrefix,
|
||||
FirstSeen: "2026-01-01T00:00:00Z",
|
||||
PayloadType: &pt,
|
||||
RawHex: strings.Repeat("ab", payloadBytes), // 2 hex chars per byte
|
||||
}
|
||||
return tx
|
||||
}
|
||||
|
||||
// TestRelayAirtimeShare_ADVERTvsACKDivergence is the locked acceptance test
|
||||
// from issue #1359:
|
||||
// - 1 ADVERT, 200 B, 8 distinct relays → score = 200 * 8 = 1600
|
||||
// - 1000 ACKs, 10 B each, 0 relays → score = 0
|
||||
//
|
||||
// Count distribution: ACK 1000/1001 = 99.90%, ADVERT 0.10%.
|
||||
// Airtime distribution: ADVERT 1600/1600 = 100%, ACK 0%.
|
||||
//
|
||||
// This is the headline divergence the dumbbell chart must visualize.
|
||||
func TestRelayAirtimeShare_ADVERTvsACKDivergence(t *testing.T) {
|
||||
packets := make([]*StoreTx, 0, 1001)
|
||||
|
||||
// 1 ADVERT with 200 bytes payload + 8 distinct relays
|
||||
advert := makeRelayAirtimeTx(1, PayloadADVERT, 200, 8, "ad000001")
|
||||
packets = append(packets, advert)
|
||||
|
||||
// 1000 ACKs with 10 bytes payload + 0 relays
|
||||
for i := 0; i < 1000; i++ {
|
||||
ack := makeRelayAirtimeTx(100+i, PayloadACK, 10, 0, "")
|
||||
// Give each a unique hash so dedup doesn't collapse them.
|
||||
ack.Hash = "ac" + zeroPad(i, 6)
|
||||
packets = append(packets, ack)
|
||||
}
|
||||
|
||||
store := newRelayAirtimeShareTestStore(packets)
|
||||
|
||||
// Wire up the 8 distinct relay pubkeys for the ADVERT through the
|
||||
// resolved-pubkey reverse index — the helper distinctRelayCount must
|
||||
// read from this source (union across all observations of tx.ID).
|
||||
relayPks := []string{
|
||||
"relay01", "relay02", "relay03", "relay04",
|
||||
"relay05", "relay06", "relay07", "relay08",
|
||||
}
|
||||
store.addToResolvedPubkeyIndex(advert.ID, relayPks)
|
||||
|
||||
// Sanity check the helper directly.
|
||||
if got := store.distinctRelayCount(advert); got != 8 {
|
||||
t.Fatalf("distinctRelayCount(ADVERT) = %d, want 8", got)
|
||||
}
|
||||
if got := store.distinctRelayCount(packets[1]); got != 0 {
|
||||
t.Fatalf("distinctRelayCount(ACK) = %d, want 0", got)
|
||||
}
|
||||
|
||||
result := store.computeRelayAirtimeShare(TimeWindow{})
|
||||
rows, ok := result["rows"].([]map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("result['rows'] missing or wrong type: %T", result["rows"])
|
||||
}
|
||||
if len(rows) < 2 {
|
||||
t.Fatalf("expected at least 2 rows (ADVERT, ACK), got %d: %+v", len(rows), rows)
|
||||
}
|
||||
|
||||
// Index by payload_type name.
|
||||
byType := make(map[string]map[string]interface{})
|
||||
for _, r := range rows {
|
||||
name, _ := r["payload_type"].(string)
|
||||
byType[name] = r
|
||||
}
|
||||
|
||||
advertRow, hasAdvert := byType["ADVERT"]
|
||||
ackRow, hasACK := byType["ACK"]
|
||||
if !hasAdvert {
|
||||
t.Fatalf("rows missing ADVERT bucket: %+v", rows)
|
||||
}
|
||||
if !hasACK {
|
||||
t.Fatalf("rows missing ACK bucket: %+v", rows)
|
||||
}
|
||||
|
||||
// Count percentages: ACK should be ~99.9%, ADVERT ~0.1%.
|
||||
ackCountPct, _ := ackRow["count_pct"].(float64)
|
||||
advertCountPct, _ := advertRow["count_pct"].(float64)
|
||||
if !(ackCountPct > 99.0 && ackCountPct < 100.0) {
|
||||
t.Errorf("ACK count_pct = %.4f, want ~99.9", ackCountPct)
|
||||
}
|
||||
if !(advertCountPct < 1.0 && advertCountPct > 0.0) {
|
||||
t.Errorf("ADVERT count_pct = %.4f, want ~0.1", advertCountPct)
|
||||
}
|
||||
|
||||
// Airtime percentages: ADVERT should be 100%, ACK 0%.
|
||||
advertAirtimePct, _ := advertRow["airtime_pct"].(float64)
|
||||
ackAirtimePct, _ := ackRow["airtime_pct"].(float64)
|
||||
if advertAirtimePct < 99.5 || advertAirtimePct > 100.001 {
|
||||
t.Errorf("ADVERT airtime_pct = %.4f, want 100.0", advertAirtimePct)
|
||||
}
|
||||
if ackAirtimePct != 0.0 {
|
||||
t.Errorf("ACK airtime_pct = %.4f, want 0.0", ackAirtimePct)
|
||||
}
|
||||
|
||||
// Raw score check: ADVERT = 200 * 8 = 1600.
|
||||
advertScore, _ := advertRow["score"].(int)
|
||||
if advertScore != 1600 {
|
||||
t.Errorf("ADVERT score = %d, want 1600 (200B × 8 relays)", advertScore)
|
||||
}
|
||||
ackScore, _ := ackRow["score"].(int)
|
||||
if ackScore != 0 {
|
||||
t.Errorf("ACK score = %d, want 0 (no relays)", ackScore)
|
||||
}
|
||||
|
||||
// Count integer check.
|
||||
advertCount, _ := advertRow["count"].(int)
|
||||
if advertCount != 1 {
|
||||
t.Errorf("ADVERT count = %d, want 1", advertCount)
|
||||
}
|
||||
ackCount, _ := ackRow["count"].(int)
|
||||
if ackCount != 1000 {
|
||||
t.Errorf("ACK count = %d, want 1000", ackCount)
|
||||
}
|
||||
|
||||
// The divergence: ADVERT should rank #1 by airtime even though its
|
||||
// count share is the smallest. This is the whole point of the chart.
|
||||
if rows[0]["payload_type"] != "ADVERT" {
|
||||
t.Errorf("rows must be sorted by airtime_pct desc; rows[0] payload_type = %v, want ADVERT", rows[0]["payload_type"])
|
||||
}
|
||||
}
|
||||
|
||||
func zeroPad(n, width int) string {
|
||||
s := ""
|
||||
for i := 0; i < width; i++ {
|
||||
s = string(rune('0'+(n%10))) + s
|
||||
n /= 10
|
||||
}
|
||||
return s
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
// Tests for issue #1677: release fast-path workflow.
|
||||
//
|
||||
// These tests gate the workflow config (not Go code) by parsing the YAML
|
||||
// files as text and asserting structural invariants. They follow the same
|
||||
// "config gate" pattern as openapi_completeness_test.go.
|
||||
//
|
||||
// 1. .github/workflows/release-fast-path.yml MUST exist and own the
|
||||
// push.tags trigger for v-tags, with the two execution branches
|
||||
// (re-tag-via-crane on SHA match, fallback to deploy.yml otherwise).
|
||||
// 2. .github/workflows/deploy.yml MUST NOT trigger on push.tags any
|
||||
// more — the fast-path workflow owns tag pushes to avoid double-fire.
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Workflow files inspected by the tests in this file. Both are relative
// paths, so os.ReadFile resolves them against the test's working directory.
const (
|
||||
fastPathWorkflowRel = "../../.github/workflows/release-fast-path.yml"
|
||||
deployWorkflowRel = "../../.github/workflows/deploy.yml"
|
||||
)
|
||||
|
||||
func TestReleaseFastPathWorkflowExists(t *testing.T) {
|
||||
abs, _ := filepath.Abs(fastPathWorkflowRel)
|
||||
raw, err := os.ReadFile(fastPathWorkflowRel)
|
||||
if err != nil {
|
||||
t.Fatalf("issue #1677: release-fast-path.yml missing at %s: %v", abs, err)
|
||||
}
|
||||
src := string(raw)
|
||||
|
||||
// Trigger: push.tags matching semver v-tags.
|
||||
triggerRe := regexp.MustCompile(`(?m)^\s*tags:\s*\[\s*['"]v\[0-9\]\+\.\[0-9\]\+\.\[0-9\]\+['"]\s*\]`)
|
||||
if !triggerRe.MatchString(src) {
|
||||
t.Errorf("release-fast-path.yml: missing required push.tags trigger 'v[0-9]+.[0-9]+.[0-9]+'")
|
||||
}
|
||||
|
||||
// Permissions: needs packages:write to re-tag in GHCR, contents:read for checkout.
|
||||
for _, perm := range []string{"packages: write", "contents: read"} {
|
||||
if !strings.Contains(src, perm) {
|
||||
t.Errorf("release-fast-path.yml: missing required permission %q", perm)
|
||||
}
|
||||
}
|
||||
|
||||
// Required markers covering both execution branches:
|
||||
// - re-tag path: install crane, read :edge revision label, apply new tags
|
||||
// - fallback path: dispatch the existing deploy.yml pipeline
|
||||
required := []string{
|
||||
"imjasonh/setup-crane", // crane install action
|
||||
"org.opencontainers.image.revision", // label inspected on :edge
|
||||
"ghcr.io/kpa-clawbot/corescope", // image ref
|
||||
":edge", // source tag we copy from
|
||||
"crane tag", // metadata-only retag
|
||||
"workflow run deploy.yml", // fallback dispatch
|
||||
}
|
||||
for _, need := range required {
|
||||
if !strings.Contains(src, need) {
|
||||
t.Errorf("release-fast-path.yml: missing required marker %q (issue #1677 fix-path)", need)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployWorkflowNoLongerTriggersOnTags(t *testing.T) {
|
||||
raw, err := os.ReadFile(deployWorkflowRel)
|
||||
if err != nil {
|
||||
t.Fatalf("deploy.yml: %v", err)
|
||||
}
|
||||
// Extract the top-level `on:` block: from `^on:` up to the next
|
||||
// top-level YAML key (line that starts in column 0 with a letter).
|
||||
blockRe := regexp.MustCompile(`(?ms)^on:\s*\n(.*?)\n([a-zA-Z][a-zA-Z0-9_-]*:)`)
|
||||
m := blockRe.FindStringSubmatch(string(raw))
|
||||
if m == nil {
|
||||
t.Fatalf("deploy.yml: could not locate top-level on: block")
|
||||
}
|
||||
onBlock := m[1]
|
||||
if regexp.MustCompile(`(?m)^\s*tags:\s*\[`).MatchString(onBlock) {
|
||||
t.Errorf("deploy.yml: on: block still triggers on push.tags; the fast-path workflow (release-fast-path.yml) must own tag pushes to avoid double-fire (issue #1677).\non-block was:\n%s", onBlock)
|
||||
}
|
||||
}
|
||||
@@ -15,6 +15,20 @@ import (
|
||||
// plenty fresh for an at-a-glance status column.
|
||||
const repeaterEnrichmentRecomputerDefaultInterval = 5 * time.Minute
|
||||
|
||||
// repeaterEnrichmentPrewarmWait is the upper bound on how long the
|
||||
// synchronous prewarm in StartRepeaterEnrichmentRecomputer will wait
|
||||
// for the background subpath+pathHop index builds to flip ready before
|
||||
// skipping the prewarm. Override in tests via the package-level var.
|
||||
//
|
||||
// Background (issue #1008 review M1): the prewarm computes against
|
||||
// s.byPathHop. If the background index builds haven't finished, the
|
||||
// snapshot is built against an empty map and locked into
|
||||
// s.repeaterRelayCache for `interval` (default 5min) — every
|
||||
// /api/nodes during that window would report relay_count_24h=0. We
|
||||
// wait up to this deadline and, on timeout, skip the prewarm entirely
|
||||
// so the next ticker fire (which will see ready=true) does the work.
|
||||
var repeaterEnrichmentPrewarmWait = 60 * time.Second
|
||||
|
||||
// StartRepeaterEnrichmentRecomputer is the steady-state background
|
||||
// recompute loop for the repeater enrichment bulk caches consumed by
|
||||
// handleNodes (GetRepeaterRelayInfoMap + GetRepeaterUsefulnessScoreMap).
|
||||
@@ -55,7 +69,15 @@ func (s *PacketStore) StartRepeaterEnrichmentRecomputer(windowHours float64, int
|
||||
// is to make sure the very first /api/nodes?limit=2000 from
|
||||
// live.js's SPA bootstrap (issue #1262) hits a populated cache
|
||||
// instead of paying the on-thread rebuild cost.
|
||||
recomputeRepeaterEnrichmentSafe(s, windowHours)
|
||||
//
|
||||
// Issue #1008 review M1: skip the prewarm if the background
|
||||
// subpath+pathHop index builds haven't finished — otherwise we'd
|
||||
// snapshot against an empty s.byPathHop and serve relay_count_24h=0
|
||||
// for the entire `interval` window. The next ticker fire will pick
|
||||
// up the populated index.
|
||||
if s.WaitIndexesReady(repeaterEnrichmentPrewarmWait) {
|
||||
recomputeRepeaterEnrichmentSafe(s, windowHours)
|
||||
}
|
||||
|
||||
var stopOnce sync.Once
|
||||
go func() {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user