mirror of
https://github.com/Kpa-clawbot/meshcore-analyzer.git
synced 2026-07-02 04:31:40 +00:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e4de3fe0d | |||
| 86ca793b60 | |||
| 4291b387f5 | |||
| 3cd7186563 | |||
| 86a4403136 | |||
| c46a60f78a | |||
| d4b1aa40d0 | |||
| d617a55155 | |||
| 2106cc0b8b | |||
| 0acbac6fde | |||
| 2c675f5ab2 | |||
| 545df2788d | |||
| f872fd90bf |
@@ -1 +1 @@
|
||||
{"schemaVersion":1,"label":"e2e tests","message":"821 passed","color":"brightgreen"}
|
||||
{"schemaVersion":1,"label":"e2e tests","message":"83 passed","color":"brightgreen"}
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"schemaVersion":1,"label":"frontend coverage","message":"36.64%","color":"red"}
|
||||
{"schemaVersion":1,"label":"frontend coverage","message":"37.74%","color":"red"}
|
||||
|
||||
-287
@@ -1,287 +0,0 @@
|
||||
{
|
||||
"parserOptions": {
|
||||
"ecmaVersion": 2022,
|
||||
"sourceType": "script"
|
||||
},
|
||||
"env": {
|
||||
"browser": true,
|
||||
"es2022": true
|
||||
},
|
||||
"globals": {
|
||||
"AreaFilter": "readonly",
|
||||
"CACHE_INVALIDATE_MS": "readonly",
|
||||
"CLIENT_CONFIG": "readonly",
|
||||
"CLIENT_TTL": "readonly",
|
||||
"ChannelColorPicker": "readonly",
|
||||
"ChannelColors": "readonly",
|
||||
"ChannelDecrypt": "readonly",
|
||||
"ChannelQR": "readonly",
|
||||
"Chart": "readonly",
|
||||
"DIST_THRESHOLDS": "readonly",
|
||||
"DragManager": "readonly",
|
||||
"EXTERNAL_URLS": "readonly",
|
||||
"FAV_KEY": "readonly",
|
||||
"FilterUX": "readonly",
|
||||
"GestureHints": "readonly",
|
||||
"HEALTH_THRESHOLDS": "readonly",
|
||||
"HashColor": "readonly",
|
||||
"HopDisplay": "readonly",
|
||||
"HopResolver": "readonly",
|
||||
"IATA_CITIES": "readonly",
|
||||
"IATA_COORDS_GEO": "readonly",
|
||||
"L": "readonly",
|
||||
"LIMITS": "readonly",
|
||||
"Logo": "readonly",
|
||||
"MAX_HOP_DIST": "readonly",
|
||||
"MeshAudio": "readonly",
|
||||
"MeshConfigReady": "readonly",
|
||||
"PAYLOAD_COLORS": "readonly",
|
||||
"PAYLOAD_TYPES": "readonly",
|
||||
"PERF_SLOW_MS": "readonly",
|
||||
"PROPAGATION_BUFFER_MS": "readonly",
|
||||
"PULL_THRESHOLD_PX": "readonly",
|
||||
"PacketFilter": "readonly",
|
||||
"PathInspector": "readonly",
|
||||
"PrefixReserved": "readonly",
|
||||
"QRCode": "readonly",
|
||||
"ROLE_COLORS": "readonly",
|
||||
"ROLE_EMOJI": "readonly",
|
||||
"ROLE_LABELS": "readonly",
|
||||
"ROLE_SHAPES": "readonly",
|
||||
"ROLE_SORT": "readonly",
|
||||
"ROLE_STYLE": "readonly",
|
||||
"ROUTE_TYPES": "readonly",
|
||||
"RegionFilter": "readonly",
|
||||
"RegionShowAll": "readonly",
|
||||
"SITE_CONFIG": "readonly",
|
||||
"SKEW_SEVERITY_COLORS": "readonly",
|
||||
"SKEW_SEVERITY_LABELS": "readonly",
|
||||
"SKEW_SEVERITY_ORDER": "readonly",
|
||||
"SNR_THRESHOLDS": "readonly",
|
||||
"SlideOver": "readonly",
|
||||
"TILE_DARK": "readonly",
|
||||
"TILE_LIGHT": "readonly",
|
||||
"MC_TILE_PROVIDERS": "readonly",
|
||||
"MC_setDarkTileProvider": "readonly",
|
||||
"MC_getDarkTileProvider": "readonly",
|
||||
"MC_setServerDefaultTileProvider": "readonly",
|
||||
"MC_applyTileFilter": "readonly",
|
||||
"MC_DARK_TILE_DEFAULT": "readonly",
|
||||
"TYPE_COLORS": "readonly",
|
||||
"TableResponsive": "readonly",
|
||||
"TableSort": "readonly",
|
||||
"TouchGestures": "readonly",
|
||||
"TracesHelpers": "readonly",
|
||||
"URLState": "readonly",
|
||||
"WS_RECONNECT_MS": "readonly",
|
||||
"_SITE_CONFIG_ORIGINAL_HOME": "readonly",
|
||||
"__PERF_LOG_RENDER": "readonly",
|
||||
"__bottomNavInitDone": "readonly",
|
||||
"__corescopeLogo": "readonly",
|
||||
"__dirname": "readonly",
|
||||
"__filename": "readonly",
|
||||
"__gestureHints1065Init": "readonly",
|
||||
"__liveMQLBindCount": "readonly",
|
||||
"__meshcoreMapInternals": "readonly",
|
||||
"__navDrawer": "readonly",
|
||||
"__navDrawerPointerBindCount": "readonly",
|
||||
"__pathOverflowWired": "readonly",
|
||||
"__scrollLock": "readonly",
|
||||
"__touchGestures1062InitCount": "readonly",
|
||||
"_analyticsChannelTbodyHtml": "readonly",
|
||||
"_analyticsChannelTheadHtml": "readonly",
|
||||
"_analyticsDecorateChannels": "readonly",
|
||||
"_analyticsHashStatCardsHtml": "readonly",
|
||||
"_analyticsLoadChannelSort": "readonly",
|
||||
"_analyticsRenderCollisionsFromServer": "readonly",
|
||||
"_analyticsRenderMultiByteAdopters": "readonly",
|
||||
"_analyticsRenderMultiByteCapability": "readonly",
|
||||
"_analyticsRfNFColumnChart": "readonly",
|
||||
"_analyticsSaveChannelSort": "readonly",
|
||||
"_analyticsSortChannels": "readonly",
|
||||
"_apiCache": "readonly",
|
||||
"_apiPerf": "readonly",
|
||||
"_channelsBeginMessageRequestForTest": "readonly",
|
||||
"_channelsGetStateForTest": "readonly",
|
||||
"_channelsHandleWSBatchForTest": "readonly",
|
||||
"_channelsIsStaleMessageRequestForTest": "readonly",
|
||||
"_channelsLoadChannelsForTest": "readonly",
|
||||
"_channelsProcessWSBatchForTest": "readonly",
|
||||
"_channelsReconcileSelectionForTest": "readonly",
|
||||
"_channelsRefreshMessagesForTest": "readonly",
|
||||
"_channelsSelectChannelForTest": "readonly",
|
||||
"_channelsSetObserverRegionsForTest": "readonly",
|
||||
"_channelsSetStateForTest": "readonly",
|
||||
"_channelsShouldProcessWSMessageForRegion": "readonly",
|
||||
"_customizerV2": "readonly",
|
||||
"_ensurePullIndicator": "readonly",
|
||||
"_inflight": "readonly",
|
||||
"_isTouchDevice": "readonly",
|
||||
"_liveAddFeedItem": "readonly",
|
||||
"_liveBufferPacket": "readonly",
|
||||
"_liveBuildClickablePathPopupHtml": "readonly",
|
||||
"_liveBuildObserverIataMap": "readonly",
|
||||
"_liveClickablePaths": "readonly",
|
||||
"_liveDbPacketToLive": "readonly",
|
||||
"_liveExpandToBufferEntries": "readonly",
|
||||
"_liveExpandToBufferEntriesAsync": "readonly",
|
||||
"_liveFormatLiveTimestampHtml": "readonly",
|
||||
"_liveGetFavoritePubkeys": "readonly",
|
||||
"_liveGetNodeFilterKeys": "readonly",
|
||||
"_liveGetObserverIataMap": "readonly",
|
||||
"_liveIsNodeFavorited": "readonly",
|
||||
"_liveNodeActivity": "readonly",
|
||||
"_liveNodeData": "readonly",
|
||||
"_liveNodeMarkers": "readonly",
|
||||
"_livePacketInvolvesFavorite": "readonly",
|
||||
"_livePacketInvolvesFilterNode": "readonly",
|
||||
"_livePacketMatchesRegion": "readonly",
|
||||
"_livePruneClickablePaths": "readonly",
|
||||
"_livePruneStaleNodes": "readonly",
|
||||
"_liveRebuildFeedList": "readonly",
|
||||
"_liveResolveHopPositions": "readonly",
|
||||
"_liveSEG_MAP": "readonly",
|
||||
"_liveSetMarkerColor": "readonly",
|
||||
"_liveSetMarkerSize": "readonly",
|
||||
"_liveSetNodeFilter": "readonly",
|
||||
"_liveSetObserverIataMap": "readonly",
|
||||
"_liveSpeedLabel": "readonly",
|
||||
"_liveVCR": "readonly",
|
||||
"_liveVcrPause": "readonly",
|
||||
"_liveVcrResumeLive": "readonly",
|
||||
"_liveVcrSetMode": "readonly",
|
||||
"_liveVcrSpeedCycle": "readonly",
|
||||
"_live_packetTimestamp": "readonly",
|
||||
"_mapGetNeighborPubkeys": "readonly",
|
||||
"_mapSelectRefNode": "readonly",
|
||||
"_meshAudioVoices": "readonly",
|
||||
"_meshcoreHeatLayer": "readonly",
|
||||
"_meshcoreLiveHeatLayer": "readonly",
|
||||
"_nodesGetAllNodes": "readonly",
|
||||
"_nodesGetSortState": "readonly",
|
||||
"_nodesGetStatusInfo": "readonly",
|
||||
"_nodesGetStatusTooltip": "readonly",
|
||||
"_nodesIsAdvertMessage": "readonly",
|
||||
"_nodesMatchesSearch": "readonly",
|
||||
"_nodesRenderNodeTimestampHtml": "readonly",
|
||||
"_nodesRenderNodeTimestampText": "readonly",
|
||||
"_nodesSetAllNodes": "readonly",
|
||||
"_nodesSetSortState": "readonly",
|
||||
"_nodesSortArrow": "readonly",
|
||||
"_nodesSortNodes": "readonly",
|
||||
"_nodesSyncClaimedToFavorites": "readonly",
|
||||
"_nodesToggleSort": "readonly",
|
||||
"_packetsTestAPI": "readonly",
|
||||
"_panelCorner": "readonly",
|
||||
"_pendingPathInspectorRoute": "readonly",
|
||||
"_perfWriteSourcesPrev": "readonly",
|
||||
"_pullIndicator": "readonly",
|
||||
"_pullToast": "readonly",
|
||||
"_pullToastTimer": "readonly",
|
||||
"_reducedMotionMQL": "readonly",
|
||||
"_showPullToast": "readonly",
|
||||
"_themeRefreshTimer": "readonly",
|
||||
"_vcrFormatTime": "readonly",
|
||||
"addEventListener": "readonly",
|
||||
"api": "readonly",
|
||||
"apiPerf": "readonly",
|
||||
"bindFavStars": "readonly",
|
||||
"buildHexLegend": "readonly",
|
||||
"buildNodesQuery": "readonly",
|
||||
"buildPacketsQuery": "readonly",
|
||||
"clearParsedCache": "readonly",
|
||||
"closeMoreMenu": "readonly",
|
||||
"closeNav": "readonly",
|
||||
"comparePacketSets": "readonly",
|
||||
"computeBreakdownRanges": "readonly",
|
||||
"computeOverlapStats": "readonly",
|
||||
"connectWS": "readonly",
|
||||
"copyToClipboard": "readonly",
|
||||
"createColoredHexDump": "readonly",
|
||||
"currentPage": "readonly",
|
||||
"currentSkewValue": "readonly",
|
||||
"debounce": "readonly",
|
||||
"debouncedOnWS": "readonly",
|
||||
"destroy": "readonly",
|
||||
"devicePixelRatio": "readonly",
|
||||
"dispatchEvent": "readonly",
|
||||
"drawPacketRoute": "readonly",
|
||||
"escapeHtml": "readonly",
|
||||
"exports": "readonly",
|
||||
"favStar": "readonly",
|
||||
"fetchAllNodes": "readonly",
|
||||
"filterPacketsByRoute": "readonly",
|
||||
"formatAbsoluteTimestamp": "readonly",
|
||||
"formatChartAxisLabel": "readonly",
|
||||
"formatDistance": "readonly",
|
||||
"formatDistanceRound": "readonly",
|
||||
"formatDrift": "readonly",
|
||||
"formatHex": "readonly",
|
||||
"formatIsoLike": "readonly",
|
||||
"formatSkew": "readonly",
|
||||
"formatTimestamp": "readonly",
|
||||
"formatTimestampCustom": "readonly",
|
||||
"formatTimestampWithTooltip": "readonly",
|
||||
"getDistanceUnit": "readonly",
|
||||
"getFavorites": "readonly",
|
||||
"getHashParams": "readonly",
|
||||
"getHealthThresholds": "readonly",
|
||||
"getNodeStatus": "readonly",
|
||||
"getParsedDecoded": "readonly",
|
||||
"getParsedPath": "readonly",
|
||||
"getPathLenOffset": "readonly",
|
||||
"getResolvedPath": "readonly",
|
||||
"getTileUrl": "readonly",
|
||||
"getTimestampCustomFormat": "readonly",
|
||||
"getTimestampFormatPreset": "readonly",
|
||||
"getTimestampMode": "readonly",
|
||||
"getTimestampTimezone": "readonly",
|
||||
"global": "readonly",
|
||||
"initGeoFilterOverlay": "readonly",
|
||||
"initTabBar": "readonly",
|
||||
"invalidateApiCache": "readonly",
|
||||
"isFavorite": "readonly",
|
||||
"isTransportRoute": "readonly",
|
||||
"makeColumnsResizable": "readonly",
|
||||
"makeRoleMarkerSVG": "readonly",
|
||||
"miniMarkdown": "readonly",
|
||||
"module": "readonly",
|
||||
"navigate": "readonly",
|
||||
"observerSkewSeverity": "readonly",
|
||||
"offWS": "readonly",
|
||||
"onWS": "readonly",
|
||||
"pad2": "readonly",
|
||||
"pad3": "readonly",
|
||||
"pages": "readonly",
|
||||
"payloadTypeColor": "readonly",
|
||||
"payloadTypeName": "readonly",
|
||||
"process": "readonly",
|
||||
"pullReconnect": "readonly",
|
||||
"qrcode": "readonly",
|
||||
"registerPage": "readonly",
|
||||
"renderVersionCard": "readonly",
|
||||
"renderSkewBadge": "readonly",
|
||||
"renderSkewSparkline": "readonly",
|
||||
"require": "readonly",
|
||||
"routeLayer": "readonly",
|
||||
"routeTypeName": "readonly",
|
||||
"setupPullToReconnect": "readonly",
|
||||
"syncBadgeColors": "readonly",
|
||||
"timeAgo": "readonly",
|
||||
"toggleFavorite": "readonly",
|
||||
"transportBadge": "readonly",
|
||||
"truncate": "readonly",
|
||||
"ws": "readonly",
|
||||
"wsListeners": "readonly"
|
||||
},
|
||||
"rules": {
|
||||
"no-undef": "error",
|
||||
"no-unused-vars": [
|
||||
"warn",
|
||||
{
|
||||
"argsIgnorePattern": "^_",
|
||||
"varsIgnorePattern": "^_"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -3,6 +3,7 @@ name: CI/CD Pipeline
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
tags: ['v*']
|
||||
pull_request:
|
||||
branches: [master]
|
||||
workflow_dispatch:
|
||||
@@ -13,7 +14,7 @@ permissions:
|
||||
|
||||
concurrency:
|
||||
group: ci-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
|
||||
@@ -54,9 +55,7 @@ jobs:
|
||||
set -e -o pipefail
|
||||
cd cmd/server
|
||||
go build .
|
||||
# -race gates PR #1208's atomic.Pointer migration: the race-detector
|
||||
# is what makes path_inspect_atomic_race_test.go actually assert.
|
||||
go test -timeout 15m -race -coverprofile=server-coverage.out ./... 2>&1 | tee server-test.log
|
||||
go test -coverprofile=server-coverage.out ./... 2>&1 | tee server-test.log
|
||||
echo "--- Go Server Coverage ---"
|
||||
go tool cover -func=server-coverage.out | tail -1
|
||||
|
||||
@@ -65,7 +64,7 @@ jobs:
|
||||
set -e -o pipefail
|
||||
cd cmd/ingestor
|
||||
go build .
|
||||
go test -timeout 15m -coverprofile=ingestor-coverage.out ./... 2>&1 | tee ingestor-test.log
|
||||
go test -coverprofile=ingestor-coverage.out ./... 2>&1 | tee ingestor-test.log
|
||||
echo "--- Go Ingestor Coverage ---"
|
||||
go tool cover -func=ingestor-coverage.out | tail -1
|
||||
|
||||
@@ -80,105 +79,6 @@ jobs:
|
||||
go test ./...
|
||||
echo "--- Decrypt CLI tests passed ---"
|
||||
|
||||
- name: Verify Dockerfile COPY invariants (issue #1316)
|
||||
run: bash scripts/check-dockerfile-internal-pkgs.sh
|
||||
|
||||
- name: Staging disk-monitor unit tests (issue #1684)
|
||||
run: bash scripts/staging/test-disk-monitor.sh
|
||||
|
||||
- name: Lint CSS variables (issue #1128)
|
||||
run: |
|
||||
set -e
|
||||
node scripts/check-css-vars.js
|
||||
node scripts/test-check-css-vars.js
|
||||
|
||||
- name: Run JS unit tests (packet-filter)
|
||||
run: |
|
||||
set -e
|
||||
node test-packet-filter.js
|
||||
node test-packet-filter-time.js
|
||||
node test-confidence-indicator.js
|
||||
node test-1659-analytics-warmup.js
|
||||
node test-channels-merge-1498-unit.js
|
||||
node test-issue-1518-home-url.js
|
||||
node test-channel-decrypt-insecure-context.js
|
||||
node test-live-region-filter.js
|
||||
node test-issue-1136-observer-iata-map.js
|
||||
node test-channel-qr.js
|
||||
node test-channel-qr-wiring.js
|
||||
node test-channel-modal-ux.js
|
||||
node test-channel-issue-1087.js
|
||||
node test-issue-1409-no-encrypted-flood.js
|
||||
node test-channel-issue-1101.js
|
||||
node test-observer-iata-1188.js
|
||||
node test-pull-to-reconnect-1091.js
|
||||
node test-channel-fluid-layout.js
|
||||
node test-issue-1279-p2-code-filter.js
|
||||
node test-area-filter.js
|
||||
node test-issue-1293-marker-shapes.js
|
||||
node test-issue-1356-map-a11y.js
|
||||
node test-issue-1360-pill-letter-count.js
|
||||
node test-issue-1364-pill-no-clamp.js
|
||||
node test-issue-1375-scope-stats-fetch.js
|
||||
node test-issue-1361-cb-presets.js
|
||||
node test-issue-1380-cb-sim-overlay.js
|
||||
node test-issue-1380-cb-reset-button.js
|
||||
node test-issue-1407-cb-preset-propagation.js
|
||||
node test-issue-1412-customizer-no-override.js
|
||||
node test-issue-1418-raw-hex-extraction.js
|
||||
node test-issue-1418-edge-weights.js
|
||||
node test-issue-1418-cb-preset-ramp.js
|
||||
node test-issue-1418-spider-fan.js
|
||||
node test-issue-1418-deeplink-hops-channels.js
|
||||
node test-issue-1418-polish-review.js
|
||||
node test-issue-1420-tile-providers.js
|
||||
node test-issue-1614-tile-url-function.js
|
||||
node test-issue-1438-marker-css-vars.js
|
||||
node test-issue-1562-observers-summary.js
|
||||
node test-issue-1509-nav-active-bg.js
|
||||
node test-issue-1509-detect-preset.js
|
||||
node test-live.js
|
||||
node test-issue-1107-live-layout.js
|
||||
node test-issue-1532-live-fullscreen.js
|
||||
node test-issue-1619-feed-detail-card-draggable.js
|
||||
node test-xss-escape-sinks.js
|
||||
node test-preflight-xss-gate.js
|
||||
node test-traces.js
|
||||
node test-issue-1648-m4-emoji-scan.js
|
||||
node test-issue-1668-m3-typography.js
|
||||
node test-mqtt-status-panel.js
|
||||
node test-issue-1697-mqtt-mobile-e2e.js
|
||||
node test-warmup-banner.js
|
||||
node test-issue-1633-hide-1byte-hops.js
|
||||
node test-issue-1668-m4-per-route.js
|
||||
node test-a11y-axe-1668-selftest.js
|
||||
|
||||
- name: 🛡️ Preflight XSS gate — actual --diff check (PR only)
|
||||
# The fixture self-test above (test-preflight-xss-gate.js) only
|
||||
# asserts the script's behavior against fixtures. It does NOT scan
|
||||
# the PR's own changes. This step closes that gap by running the
|
||||
# gate against added lines in public/**/*.{js,html} on the PR.
|
||||
# Gate is PR-scoped only (per djb finding: merge commits would
|
||||
# slip an opt-out otherwise). Master pushes skip this step.
|
||||
if: github.event_name == 'pull_request'
|
||||
env:
|
||||
PR_BODY: ${{ github.event.pull_request.body }}
|
||||
PREFLIGHT_PR_LABELS: ${{ join(github.event.pull_request.labels.*.name, ' ') }}
|
||||
run: |
|
||||
set -e
|
||||
git fetch origin master --depth=50 2>&1 | tail -3 || true
|
||||
# Materialize PR body to a file for the opt-out parser.
|
||||
printf '%s' "$PR_BODY" > /tmp/pr-body.md
|
||||
PREFLIGHT_PR_BODY=/tmp/pr-body.md bash scripts/check-xss-sinks.sh --diff origin/master
|
||||
|
||||
- name: 🧹 Frontend lint (eslint no-undef) — issue #1342
|
||||
run: |
|
||||
set -e
|
||||
# Use eslint@8 (legacy .eslintrc.json). Don't migrate to flat-config / eslint@9.
|
||||
# --no-save: avoid touching package.json / no committed node_modules.
|
||||
npm install --no-save --no-audit --no-fund eslint@8
|
||||
npx eslint public/*.js
|
||||
|
||||
- name: Verify proto syntax
|
||||
run: |
|
||||
set -e
|
||||
@@ -265,12 +165,6 @@ jobs:
|
||||
go build -o ../../corescope-server .
|
||||
echo "Go server built successfully"
|
||||
|
||||
- name: Build Go migrate tool
|
||||
run: |
|
||||
cd cmd/migrate
|
||||
go build -o ../../corescope-migrate .
|
||||
echo "Go migrate tool built successfully"
|
||||
|
||||
- name: Install npm dependencies
|
||||
run: npm ci --production=false
|
||||
|
||||
@@ -282,66 +176,6 @@ jobs:
|
||||
- name: Instrument frontend JS for coverage
|
||||
run: sh scripts/instrument-frontend.sh
|
||||
|
||||
- name: Freshen fixture timestamps
|
||||
run: bash tools/freshen-fixture.sh test-fixtures/e2e-fixture.db
|
||||
|
||||
- name: Seed grouped-packet row for #1486 collapse test
|
||||
# The committed fixture has 499 packets, each with exactly ONE
|
||||
# observation, so the packets-page renders only flat
|
||||
# (select-hash) rows. The #1486 repro needs at least one grouped
|
||||
# (toggle-select) row. Insert a NEW transmission with 3
|
||||
# observations.
|
||||
#
|
||||
# The server's async hash-migrate (cmd/server/hash_migrate.go)
|
||||
# recomputes `transmissions.hash` from `raw_hex` via
|
||||
# ComputeContentHash(), so the inserted hash MUST equal that
|
||||
# function's output for the chosen raw_hex — otherwise the row
|
||||
# gets relabelled and the E2E can't find it.
|
||||
#
|
||||
# raw_hex 15000102030405060708090a0b0c0d0e0f
|
||||
# → header=0x15 (route_type=1, payload_type=5)
|
||||
# → ComputeContentHash(...) = fae0c9e6d357a814
|
||||
#
|
||||
# The first_seen / observation timestamps are pinned to a date
|
||||
# within retentionHours but outside the default 15-min UI
|
||||
# window so the row is hidden in the default view (keeping
|
||||
# test-e2e-playwright's first-10-rows hex-pane test
|
||||
# unaffected) and reachable via the explicit ?timeWindow=0
|
||||
# deep-link the #1486 test uses.
|
||||
run: |
|
||||
sqlite3 test-fixtures/e2e-fixture.db <<'SQL'
|
||||
-- Sort the seeded row LAST in BOTH default packets views:
|
||||
-- • flat view sorts by transmissions.id DESC → id=0 puts it last
|
||||
-- • grouped view (#default for the packets page) sorts by
|
||||
-- MAX(observations.timestamp) DESC → we must keep our obs
|
||||
-- timestamps OLDER than every other fixture observation.
|
||||
-- Fixture (after freshen) has obs timestamps spanning
|
||||
-- 2026-05-17 16:01:39Z .. 2026-05-28 00:00:00Z (max).
|
||||
-- Note: freshen only shifts transmissions.first_seen forward
|
||||
-- to ~now; observation.timestamp is left alone except for
|
||||
-- the timestamp=0 case.
|
||||
-- Use 2026-05-15 (~2 days older than the oldest fixture obs)
|
||||
-- so our row sorts LAST in the grouped view too, keeping
|
||||
-- test-e2e-playwright's first-10-rows hex-pane test
|
||||
-- unaffected. The #1486 test still reaches the row via the
|
||||
-- explicit hash + ?timeWindow=0 deep-link.
|
||||
INSERT INTO transmissions(id,raw_hex,hash,first_seen,route_type,payload_type,payload_version,decoded_json,channel_hash,from_pubkey)
|
||||
VALUES (0,'15000102030405060708090a0b0c0d0e0f','fae0c9e6d357a814','2026-05-15T00:00:00Z',1,5,0,'{"type":"CHAN","channel":"#test","text":"#1486 fixture"}',NULL,NULL);
|
||||
INSERT INTO observations(transmission_id,observer_idx,direction,snr,rssi,score,path_json,timestamp,resolved_path) VALUES
|
||||
(0,1,'rx',5.0,-95,0,'["AA"]',CAST(strftime('%s','2026-05-15T00:00:00Z') AS INTEGER),'["aa00000000000000000000000000000000000000000000000000000000000000"]'),
|
||||
(0,2,'rx',5.5,-92,0,'["BB"]',CAST(strftime('%s','2026-05-15T00:00:00Z') AS INTEGER),'["bb00000000000000000000000000000000000000000000000000000000000000"]'),
|
||||
(0,3,'rx',6.0,-90,0,'["CC"]',CAST(strftime('%s','2026-05-15T00:00:00Z') AS INTEGER),'["cc00000000000000000000000000000000000000000000000000000000000000"]');
|
||||
SQL
|
||||
|
||||
- name: Migrate fixture DB to current schema (#1287)
|
||||
# Server now ASSERTs schema is migrated and refuses to start
|
||||
# otherwise (cmd/server/main.go: dbschema.AssertReady). In prod
|
||||
# the ingestor owns dbschema.Apply, but CI starts only the
|
||||
# server against the committed e2e fixture — so we run the
|
||||
# standalone migrate tool here to bring the fixture up to the
|
||||
# required shape before the server boots.
|
||||
run: ./corescope-migrate -db test-fixtures/e2e-fixture.db
|
||||
|
||||
- name: Start Go server with fixture DB
|
||||
run: |
|
||||
fuser -k 13581/tcp 2>/dev/null || true
|
||||
@@ -349,7 +183,7 @@ jobs:
|
||||
./corescope-server -port 13581 -db test-fixtures/e2e-fixture.db -public public-instrumented &
|
||||
echo $! > .server.pid
|
||||
for i in $(seq 1 30); do
|
||||
if curl -sf http://localhost:13581/api/healthz > /dev/null 2>&1; then
|
||||
if curl -sf http://localhost:13581/api/stats > /dev/null 2>&1; then
|
||||
echo "Server ready after ${i}s"
|
||||
break
|
||||
fi
|
||||
@@ -363,118 +197,6 @@ jobs:
|
||||
- name: Run Playwright E2E tests (fail-fast)
|
||||
run: |
|
||||
BASE_URL=http://localhost:13581 node test-e2e-playwright.js 2>&1 | tee e2e-output.txt
|
||||
# M5 of #1668 — axe-core CI gate (color-contrast AA).
|
||||
# Real browser run; fails on any net violation (raw − allowlist).
|
||||
# Allowlist: tests/a11y-allowlist.yaml (0 entries at M5 baseline).
|
||||
BASE_URL=http://localhost:13581 AXE_SCREENSHOT_DIR=/tmp/axe-1668 \
|
||||
node test-a11y-axe-1668.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-filter-ux-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-channel-issue-1087-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-channel-issue-1111-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-map-modal-fluid-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-map-nodes-pagination-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-observer-iata-1188-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1639-observers-sort-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-nav-fluid-1055-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-nav-priority-1102-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-nav-priority-1311-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-nav-priority-1391-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1413-nav-overlap-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1400-nav-vertical-clip.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-nav-more-floor-1139-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-bottom-nav-1061-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-gestures-1062-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-gestures-1185-scroll-discriminator-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-gesture-hints-1065-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-touch-gestures-coverage-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-channel-fluid-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-table-fluid-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-charts-fluid-1058-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-slideover-1056-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1692-packets-init-parallel-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-slideover-1168-munger-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-logo-pulse-1173-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1122-packets-filter-ux-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1128-packets-layout-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1128-multi-viewport-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1136-live-region-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1150-404-state-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1146-path-link-contrast-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1147-section-order-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1151-orphan-separators-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1486-collapse-reopens-detail-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-logo-rebrand-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-logo-theme-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-logo-default-sage-teal-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1109-hamburger-dropdown-visible-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-live-layout-1178-1179-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1205-live-controls-anchor-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-live-mql-leak-1180-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1204-live-panel-structure-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1234-live-chrome-pass2-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1206-vcr-overlap-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1244-live-vcr-row-hints-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1510-live-nav-pin-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-live-fullscreen-1572-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1599-replay-freeze-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1648-m1-icons-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1648-m2-icons-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1648-m3-icons-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1648-m4-icons-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1657-analytics-channels-group-sprites-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1224-channels-mobile-ux-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1367-channels-chat-app-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1236-map-mobile-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1329-map-controls-accordion-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1273-qr-overlay-height-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1281-location-row-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-issue-1279-legend-p2-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-home-coverage-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-path-inspector-coverage-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1206-resize-observer-leak-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-nav-drawer-1064-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-audio-live-1297-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-audio-lab-1297-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-channel-decrypt-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-channel-qr-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-channel-color-picker-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-customize-theme-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-customize-branding-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-customize-display-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
BASE_URL=http://localhost:13581 node test-customize-export-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-drag-manager-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1567-corner-clears-drag-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1306-collisions-terminology-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1374-route-map-a11y-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-channels-list-render-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-channels-selection-flow-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-channels-add-modal-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-channels-share-color-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-channels-ws-batch-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-channels-ws-race-1498-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1487-byop-modal-layout-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1630-reach-mobile-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
CHROMIUM_REQUIRE=1 BASE_URL=http://localhost:13581 node test-issue-1640-compare-discovery-e2e.js 2>&1 | tee -a e2e-output.txt
|
||||
|
||||
# #1616: slide-over focus-restore flake-gate. Runs the slide-over
|
||||
# E2E 20 consecutive times against the SAME backend instance so
|
||||
# the Chromium-headless focus race documented in #1172/#1616 has
|
||||
# a 20× shot at firing. Any single non-zero exit aborts. This is
|
||||
# the architectural-fix gate — if it ever turns red post-merge,
|
||||
# the focused-but-hidden state has crept back in.
|
||||
#
|
||||
# PERMANENT step. Adds ~3-4 min to the e2e-test job in exchange
|
||||
# for closing out a flake family that was blocking ~8 unrelated
|
||||
# PRs at a time. If profiling pressures the budget later, drop
|
||||
# repeat count first; do not delete.
|
||||
- name: Slide-over E2E flake-gate (#1616, --repeat-each=3)
|
||||
run: |
|
||||
set -e
|
||||
for i in $(seq 1 3); do
|
||||
echo "--- slide-over E2E run $i/20 ---"
|
||||
BASE_URL=http://localhost:13581 node test-slideover-1056-e2e.js 2>&1 | tee -a slideover-repeat-output.txt
|
||||
done
|
||||
echo "3 passed"
|
||||
|
||||
- name: Collect frontend coverage (parallel)
|
||||
if: success() && github.event_name == 'push'
|
||||
@@ -484,13 +206,7 @@ jobs:
|
||||
- name: Generate frontend coverage badges
|
||||
if: success()
|
||||
run: |
|
||||
# Aggregate per-suite PASS/FAIL across every test-*-e2e.js summary.
|
||||
# The previous regex (grep -oP '[0-9]+(?=/)' | tail -1) caught a
|
||||
# stray digits-before-slash like the '2' in '2/3 tests passed' from
|
||||
# some sub-output and stamped the badge as '2 passed'. See #1296.
|
||||
eval "$(bash scripts/aggregate-e2e-pass.sh e2e-output.txt)"
|
||||
E2E_PASS=${PASS:-0}
|
||||
E2E_FAIL=${FAIL:-0}
|
||||
E2E_PASS=$(grep -oP '[0-9]+(?=/)' e2e-output.txt | tail -1 || echo "0")
|
||||
|
||||
mkdir -p .badges
|
||||
if [ -f .nyc_output/frontend-coverage.json ] || [ -f .nyc_output/e2e-coverage.json ]; then
|
||||
@@ -503,14 +219,7 @@ jobs:
|
||||
echo "{\"schemaVersion\":1,\"label\":\"frontend coverage\",\"message\":\"${FE_COVERAGE}%\",\"color\":\"${FE_COLOR}\"}" > .badges/frontend-coverage.json
|
||||
echo "## Frontend: ${FE_COVERAGE}% coverage" >> $GITHUB_STEP_SUMMARY
|
||||
fi
|
||||
if [ "${E2E_FAIL:-0}" -gt 0 ]; then
|
||||
E2E_MSG="${E2E_PASS:-0} passed, ${E2E_FAIL} failed"
|
||||
E2E_COLOR="red"
|
||||
else
|
||||
E2E_MSG="${E2E_PASS:-0} passed"
|
||||
E2E_COLOR="brightgreen"
|
||||
fi
|
||||
echo "{\"schemaVersion\":1,\"label\":\"e2e tests\",\"message\":\"${E2E_MSG}\",\"color\":\"${E2E_COLOR}\"}" > .badges/e2e-tests.json
|
||||
echo "{\"schemaVersion\":1,\"label\":\"e2e tests\",\"message\":\"${E2E_PASS:-0} passed\",\"color\":\"brightgreen\"}" > .badges/e2e-tests.json
|
||||
|
||||
- name: Stop test server
|
||||
if: always()
|
||||
@@ -650,9 +359,7 @@ jobs:
|
||||
# ───────────────────────────────────────────────────────────────
|
||||
deploy:
|
||||
name: "🚀 Deploy Staging"
|
||||
if: |
|
||||
(github.event_name == 'push' || github.event_name == 'workflow_dispatch')
|
||||
&& github.ref == 'refs/heads/master'
|
||||
if: github.event_name == 'push'
|
||||
needs: [build-and-publish]
|
||||
runs-on: [self-hosted, meshcore-runner-2]
|
||||
steps:
|
||||
|
||||
@@ -1,111 +0,0 @@
|
||||
name: Release Fast-Path
|
||||
|
||||
# Issue #1677: re-tag :edge as :vX.Y.Z when the tag SHA matches :edge's
|
||||
# org.opencontainers.image.revision label. Skips ~30 min of Go test +
|
||||
# Playwright + Docker rebuild because the bytes are identical — only the
|
||||
# manifest name changes. Falls back to deploy.yml when SHAs differ so
|
||||
# tags on older commits still go through full validation.
|
||||
#
|
||||
# This workflow is the SOLE consumer of push.tags. deploy.yml's tag
|
||||
# trigger has been removed to prevent double-fire.
|
||||
|
||||
on:
|
||||
push:
|
||||
tags: ['v[0-9]+.[0-9]+.[0-9]+']
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
concurrency:
|
||||
group: release-fast-path-${{ github.ref }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
retag-or-fallback:
|
||||
name: "🏷️ Re-tag :edge → :vX.Y.Z (fast) or dispatch deploy.yml (fallback)"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Log in to GHCR
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Install crane
|
||||
uses: imjasonh/setup-crane@v0.4
|
||||
|
||||
- name: Parse semver from tag
|
||||
id: semver
|
||||
run: |
|
||||
set -euo pipefail
|
||||
TAG="${GITHUB_REF#refs/tags/}"
|
||||
# Expect vMAJOR.MINOR.PATCH (workflow trigger already enforces this).
|
||||
if [[ ! "$TAG" =~ ^v([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then
|
||||
echo "Tag $TAG does not match vMAJOR.MINOR.PATCH" >&2
|
||||
exit 1
|
||||
fi
|
||||
MAJOR="${BASH_REMATCH[1]}"
|
||||
MINOR="${BASH_REMATCH[2]}"
|
||||
{
|
||||
echo "tag=$TAG"
|
||||
echo "vMajor=v$MAJOR"
|
||||
echo "vMajorMinor=v$MAJOR.$MINOR"
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
echo "Parsed: $TAG → v$MAJOR / v$MAJOR.$MINOR / $TAG"
|
||||
|
||||
- name: Inspect :edge revision label
|
||||
id: edge
|
||||
run: |
|
||||
set -euo pipefail
|
||||
IMAGE="ghcr.io/kpa-clawbot/corescope"
|
||||
EDGE_REF="${IMAGE}:edge"
|
||||
# crane config returns the OCI image config JSON; the revision label
|
||||
# is set by docker/metadata-action on the master-edge build.
|
||||
# If :edge doesn't exist yet (first run on a fresh registry), fall
|
||||
# through to the slow path.
|
||||
if ! CONFIG="$(crane config "$EDGE_REF" 2>/dev/null)"; then
|
||||
echo "edge_revision=" >> "$GITHUB_OUTPUT"
|
||||
echo "no_edge=true" >> "$GITHUB_OUTPUT"
|
||||
echo ":edge not found in registry — will use fallback path"
|
||||
exit 0
|
||||
fi
|
||||
REV="$(echo "$CONFIG" | jq -r '.config.Labels["org.opencontainers.image.revision"] // ""')"
|
||||
echo "edge_revision=$REV" >> "$GITHUB_OUTPUT"
|
||||
echo "no_edge=false" >> "$GITHUB_OUTPUT"
|
||||
echo ":edge org.opencontainers.image.revision = $REV"
|
||||
echo "tag SHA (github.sha) = ${{ github.sha }}"
|
||||
|
||||
# ─────────── FAST PATH: SHAs match, metadata-only retag ───────────
|
||||
- name: Re-tag :edge → :vX.Y.Z + :vX.Y + :vX + :latest (fast path)
|
||||
if: steps.edge.outputs.no_edge == 'false' && steps.edge.outputs.edge_revision == github.sha
|
||||
run: |
|
||||
set -euo pipefail
|
||||
IMAGE="ghcr.io/kpa-clawbot/corescope"
|
||||
SRC="${IMAGE}:edge"
|
||||
echo "SHA match — fast-path re-tag from $SRC"
|
||||
for NEW_TAG in \
|
||||
"${{ steps.semver.outputs.tag }}" \
|
||||
"${{ steps.semver.outputs.vMajorMinor }}" \
|
||||
"${{ steps.semver.outputs.vMajor }}" \
|
||||
"latest"; do
|
||||
echo " crane tag $SRC $NEW_TAG"
|
||||
crane tag "$SRC" "$NEW_TAG"
|
||||
done
|
||||
echo "Fast-path complete — all tags point at the :edge manifest digest."
|
||||
|
||||
# ─────────── FALLBACK: SHAs differ, run the full pipeline ───────────
|
||||
- name: Dispatch full deploy.yml pipeline (fallback)
|
||||
if: steps.edge.outputs.no_edge == 'true' || steps.edge.outputs.edge_revision != github.sha
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
echo "SHA mismatch (or no :edge) — falling back to full pipeline"
|
||||
echo " :edge revision = '${{ steps.edge.outputs.edge_revision }}'"
|
||||
echo " tag SHA = '${{ github.sha }}'"
|
||||
gh workflow run deploy.yml \
|
||||
--repo "${{ github.repository }}" \
|
||||
--ref "${{ github.ref }}"
|
||||
echo "Dispatched deploy.yml against ${{ github.ref }}"
|
||||
@@ -31,5 +31,3 @@ cmd/ingestor/ingestor.exe
|
||||
!test-fixtures/e2e-fixture.db
|
||||
corescope-server
|
||||
cmd/server/server
|
||||
# Local-only planning and design files
|
||||
docs/superpowers/
|
||||
|
||||
@@ -43,17 +43,6 @@ scripts/ — Tooling (coverage collector, fixture capture, frontend in
|
||||
2. Go server (`cmd/server/`) polls SQLite for new packets, broadcasts via WebSocket
|
||||
3. Frontend fetches via REST API (`/api/*`), filters/sorts client-side
|
||||
|
||||
### Read/Write Separation Invariant (#1283)
|
||||
- **All DB writes live in `cmd/ingestor/`.** INSERT / UPDATE / DELETE / VACUUM /
|
||||
schema migrations / retention all run in the ingestor process.
|
||||
- **`cmd/server/` is read-only.** It opens SQLite with `mode=ro` and must not
|
||||
acquire a write lock. Adding a write-side helper (e.g. a `cachedRW`-style
|
||||
RW connection) regresses this invariant and races the ingestor → SQLITE_BUSY.
|
||||
- Enforcement: `cmd/server/readonly_invariant_test.go` reflect-asserts that
|
||||
`PruneOldPackets`, `PruneOldMetrics`, and `RemoveStaleObservers` are NOT
|
||||
methods on the server's `*DB`. If you need a new write, add it to
|
||||
`cmd/ingestor/`.
|
||||
|
||||
### What's Deprecated (DO NOT TOUCH)
|
||||
The following were part of the old Node.js backend and have been removed:
|
||||
- `server.js`, `db.js`, `decoder.js`, `server-helpers.js`, `packet-store.js`, `iata-coords.js`
|
||||
@@ -381,7 +370,6 @@ Existing patterns: `#/nodes/{pubkey}?section=node-neighbors`, `#/analytics?tab=c
|
||||
|
||||
## What NOT to Do
|
||||
- **Don't check in private information** — no names, API keys, tokens, passwords, IP addresses, personal data, or any identifying information. This is a PUBLIC repo.
|
||||
- **Don't introduce new `map[string]interface{}` in API response builders, handler returns, or internal data structures that cross domain boundaries.** Use a named Go struct with explicit JSON tags. CoreScope already carries 694 occurrences (see #1383); the count must monotonically decrease. If your change adds even one new occurrence in a touched file, the PR is wrong-shaped — fix the design, don't paper over with `interface{}`. Exempt: third-party library boundaries that genuinely return `interface{}`, and ad-hoc test fixture assertions.
|
||||
- Don't add npm dependencies without asking
|
||||
- Don't create a build step
|
||||
- Don't add framework abstractions (React, Vue, etc.)
|
||||
|
||||
@@ -1,42 +1,5 @@
|
||||
# Changelog
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
## [3.9.1] — 2026-06-12
|
||||
|
||||
Patch release on top of v3.9.0 — v3.9.0's container image never published (Playwright flake gated Docker build). See [docs/release-notes/v3.9.1.md](docs/release-notes/v3.9.1.md).
|
||||
|
||||
### 🎨 Accessibility
|
||||
- **WCAG AA contrast pass** (#1676, f0addfda) — two-tier CSS palette; muted-text ≥4.5:1 in both themes; unknown-repeater chip fixed (2.75:1 → 4.95:1). Closes #1671. Partial fix for #1668.
|
||||
|
||||
### 🧪 Test stability
|
||||
- **Slideover E2E flake fix** (#1663+followups, f06359d7) — tightened selectors, bumped data-row wait. Fixes #1662.
|
||||
|
||||
## [3.9.0] — 2026-06-12
|
||||
|
||||
See [docs/release-notes/v3.9.0.md](docs/release-notes/v3.9.0.md) for the full notes. 257 commits since v3.8.3 (72 substantive + 185 coverage bumps).
|
||||
|
||||
### ✨ Highlights
|
||||
- **Relay timelines survive an ingestor restart** (#1643) — relay-hop attribution is rebuilt from `path_json` on cold load.
|
||||
- **Observer Compare is first-class** (#1642, #1645, #1647) — three new entry points + Tufte-grade compare page with state-preserving multi-select.
|
||||
- **Emoji → Phosphor icon migration** (#1648, #1649–#1654) — every UI emoji replaced with theme-tinted Phosphor sprites, lint-gated.
|
||||
- **Per-node Reach page + API** (#1627) — `GET /api/nodes/{pubkey}/reach` with cache invalidation on blacklist changes (#1636).
|
||||
- **Hashtag channels catalogue integration** (#1656) — public hashtag channels appear without manual config.
|
||||
- **Operator-customizable name-prefix hiding** (#1655) — new `hiddenNamePrefixes` config (default `["🚫"]`).
|
||||
|
||||
### ⚙️ Config
|
||||
- New: `hiddenNamePrefixes`, `liveMap.maxNodes`, `runtime.maxMemoryMB`, configurable observer-health thresholds, `branding.homeUrl`, customizer disabled-tabs.
|
||||
|
||||
### 📝 Documentation Corrections (carried from prior [Unreleased])
|
||||
- **PR #1324 historical record correction** (#1387) — the merged PR #1324 body referenced four tests that do NOT exist in master: `TestMultibyteCapPersistRoundTrip`, `TestMultibyteCapPersistSkipsUnknown`, `TestMaybePersistCoalesces`, and a `TryLock` coalescing test. The actual tests that landed are `TestRunMultibyteCapPersist_AppliesSnapshot` and `TestRunMultibyteCapPersist_NoSnapshot_NoOp`. See issue #1386 for the corrective test additions (round-trip, unknown-key skip, coalescing).
|
||||
|
||||
## [3.7.2] — 2026-05-06
|
||||
|
||||
Hotfix release branched from `v3.7.1`. Cherry-picks PR #1121 only — no other changes.
|
||||
|
||||
### 🐛 Bug Fixes
|
||||
- **Ingestor: backfill infinite loop on `path_json='[]'` rows** (#1119, #1121) — `BackfillPathJSONAsync` re-selected observations whose `path_json` was already `'[]'`, rewrote them to `'[]'`, and looped forever. The migration marker was never recorded and the ingestor sustained 2–3 MB/s WAL writes at idle (~76% CPU in `sqlite.Exec`). Fix: drop `'[]'` from the WHERE clause so the loop terminates after one full pass and the `backfill_path_json_from_raw_hex_v1` marker is written.
|
||||
|
||||
## [2.5.0] "Digital Rain" — 2026-03-22
|
||||
|
||||
### ✨ Matrix Mode — Full Cyberpunk Map Theme
|
||||
|
||||
@@ -129,98 +129,3 @@ docker compose pull && docker compose up -d
|
||||
| `./manage.sh setup` | Copy `docker-compose.example.yml`, edit env vars |
|
||||
|
||||
`manage.sh` remains available for advanced use cases (building from source, custom patches, development). Pre-built images are recommended for most production deployments.
|
||||
|
||||
## Staging VM — disk-usage monitor & cleanup (#1684)
|
||||
|
||||
The staging VM ran out of disk during a hot-patch (#1684). To prevent
|
||||
repeats, two scripts live in `scripts/staging/`:
|
||||
|
||||
- `disk-monitor.sh <mount>` — reads `df -P`, classifies usage against
|
||||
`<80 ok / >=80 warn / >=90 error / >=95 alert`, emits to stderr +
|
||||
journald (via `logger`). Returns non-zero on `error|alert` so
|
||||
systemd surfaces the unit as failed.
|
||||
- `disk-cleanup.sh` — removes `/tmp` snapshot files (`*.db`,
|
||||
`staging-snap.*`, `cs-*`, `node-compile-cache`) older than 7 days
|
||||
and runs `docker builder prune` + `docker image prune` with
|
||||
`--filter "until=72h" --filter "label!=keep"`. Set
|
||||
`CORESCOPE_CLEANUP_DRY_RUN=1` to log without deleting.
|
||||
|
||||
### Install on the staging host
|
||||
|
||||
SSH to `<STAGING_HOST>` as the staging operator user and:
|
||||
|
||||
```bash
|
||||
sudo install -m 0755 scripts/staging/disk-monitor.sh /usr/local/bin/corescope-disk-monitor
|
||||
sudo install -m 0755 scripts/staging/disk-cleanup.sh /usr/local/bin/corescope-disk-cleanup
|
||||
|
||||
# 15-minute monitor
|
||||
sudo tee /etc/systemd/system/corescope-disk-monitor.service >/dev/null <<'UNIT'
|
||||
[Unit]
|
||||
Description=CoreScope staging disk-usage monitor (issue #1684)
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/corescope-disk-monitor /
|
||||
UNIT
|
||||
|
||||
sudo tee /etc/systemd/system/corescope-disk-monitor.timer >/dev/null <<'UNIT'
|
||||
[Unit]
|
||||
Description=Run CoreScope disk-usage monitor every 15 minutes
|
||||
[Timer]
|
||||
OnBootSec=5min
|
||||
OnUnitActiveSec=15min
|
||||
Unit=corescope-disk-monitor.service
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
UNIT
|
||||
|
||||
# Daily cleanup at 03:30 local
|
||||
sudo tee /etc/systemd/system/corescope-disk-cleanup.service >/dev/null <<'UNIT'
|
||||
[Unit]
|
||||
Description=CoreScope staging disk cleanup (issue #1684)
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/corescope-disk-cleanup
|
||||
UNIT
|
||||
|
||||
sudo tee /etc/systemd/system/corescope-disk-cleanup.timer >/dev/null <<'UNIT'
|
||||
[Unit]
|
||||
Description=Run CoreScope disk cleanup daily at off-peak
|
||||
[Timer]
|
||||
OnCalendar=*-*-* 03:30:00
|
||||
Persistent=true
|
||||
Unit=corescope-disk-cleanup.service
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
UNIT
|
||||
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now corescope-disk-monitor.timer corescope-disk-cleanup.timer
|
||||
```
|
||||
|
||||
`<STAGING_HOST>` is the staging VM hostname/IP — operator supplies it,
|
||||
not committed to the repo.
|
||||
|
||||
### Inspecting alerts
|
||||
|
||||
```bash
|
||||
journalctl -t corescope-disk-monitor --since '-1d'
|
||||
journalctl -t corescope-disk-cleanup --since '-7d'
|
||||
systemctl list-timers | grep corescope-disk
|
||||
```
|
||||
|
||||
`logger` priorities map: `ok→info`, `warn→warning`, `error→err`,
|
||||
`alert→alert` (syslog severity 1, second only to `emerg`). Wire
|
||||
`journalctl -p alert ...` to whatever ops channel the operator
|
||||
prefers; use `-p err` to also catch the `error` tier.
|
||||
|
||||
### Notes on `staging-snap.db` root cause (#1684 phase 3)
|
||||
|
||||
`grep -rn staging-snap.db cmd/ public/ scripts/` returns **zero**
|
||||
hits in the repo. The 4.4 GB orphan was a manual debugging artifact,
|
||||
not produced by any committed code. The `disk-cleanup.sh` retention
|
||||
rule (anything matching `staging-snap.*` in `/tmp` older than 7 days)
|
||||
prevents recurrence without needing source-side TTL changes.
|
||||
|
||||
If a future feature legitimately needs persistent snapshot DBs, put
|
||||
them under `/var/lib/corescope/snapshots/` with explicit rotation —
|
||||
not in `/tmp`, which is ephemeral by definition.
|
||||
|
||||
-13
@@ -1,8 +1,5 @@
|
||||
# Build stage always runs natively on the builder's arch ($BUILDPLATFORM)
|
||||
# and cross-compiles to $TARGETOS/$TARGETARCH via Go toolchain. No QEMU.
|
||||
# BUILDPLATFORM is auto-set by buildx; default to linux/amd64 so plain
|
||||
# `docker build` (without buildx) doesn't fail on an empty platform string.
|
||||
ARG BUILDPLATFORM=linux/amd64
|
||||
FROM --platform=$BUILDPLATFORM golang:1.22-alpine AS builder
|
||||
|
||||
ARG APP_VERSION=unknown
|
||||
@@ -18,11 +15,6 @@ COPY cmd/server/go.mod cmd/server/go.sum ./
|
||||
COPY internal/geofilter/ ../../internal/geofilter/
|
||||
COPY internal/sigvalidate/ ../../internal/sigvalidate/
|
||||
COPY internal/packetpath/ ../../internal/packetpath/
|
||||
COPY internal/dbconfig/ ../../internal/dbconfig/
|
||||
COPY internal/dbschema/ ../../internal/dbschema/
|
||||
COPY internal/prunequeue/ ../../internal/prunequeue/
|
||||
COPY internal/perfio/ ../../internal/perfio/
|
||||
COPY internal/mbcapqueue/ ../../internal/mbcapqueue/
|
||||
RUN go mod download
|
||||
COPY cmd/server/ ./
|
||||
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} \
|
||||
@@ -34,11 +26,6 @@ COPY cmd/ingestor/go.mod cmd/ingestor/go.sum ./
|
||||
COPY internal/geofilter/ ../../internal/geofilter/
|
||||
COPY internal/sigvalidate/ ../../internal/sigvalidate/
|
||||
COPY internal/packetpath/ ../../internal/packetpath/
|
||||
COPY internal/dbconfig/ ../../internal/dbconfig/
|
||||
COPY internal/dbschema/ ../../internal/dbschema/
|
||||
COPY internal/prunequeue/ ../../internal/prunequeue/
|
||||
COPY internal/perfio/ ../../internal/perfio/
|
||||
COPY internal/mbcapqueue/ ../../internal/mbcapqueue/
|
||||
RUN go mod download
|
||||
COPY cmd/ingestor/ ./
|
||||
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} \
|
||||
|
||||
-142
@@ -1,142 +0,0 @@
|
||||
# MIGRATIONS — async vs sync policy
|
||||
|
||||
CoreScope's ingestor applies schema/data migrations inline at boot in
|
||||
`cmd/ingestor/db.go`. Every migration that runs synchronously blocks the
|
||||
ingestor from accepting packets until it returns. On a dev DB that's
|
||||
milliseconds; at prod scale (1.9M+ observations, 80K+ adverts, 2600+ nodes
|
||||
on Cascadia) it can pin the boot for minutes and trigger restart loops —
|
||||
the "upgrade broke prod" failure class (#791, #1483, and others).
|
||||
|
||||
## The rule
|
||||
|
||||
**Any new `CREATE INDEX`, `ALTER TABLE`, or data-rewriting `UPDATE`/`DELETE`
|
||||
in a migration file MUST do ONE of the following:**
|
||||
|
||||
### Option 1 — Run via `Store.RunAsyncMigration` (preferred for backfills)
|
||||
|
||||
```go
|
||||
// Scheduled in OpenStore() AFTER the *Store is constructed.
|
||||
if err := s.RunAsyncMigration(ctx, "my_migration_v1",
|
||||
func(ctx context.Context, db *sql.DB) error {
|
||||
_, err := db.ExecContext(ctx, `CREATE INDEX IF NOT EXISTS ...`)
|
||||
return err
|
||||
}); err != nil {
|
||||
log.Printf("[migration/async] scheduling failed: %v", err)
|
||||
}
|
||||
```
|
||||
|
||||
- The migration is recorded as `pending_async` in the `_async_migrations`
|
||||
table **immediately** — the ingestor boots and starts ingesting.
|
||||
- `fn` runs in a goroutine; the WaitGroup is shared with the rest of the
|
||||
ingestor (`Store.WaitForAsyncMigrations()` waits for everything).
|
||||
- On success the row flips to `done`; on error/panic to `failed` with the
|
||||
error message captured.
|
||||
- Idempotent: rows in `done` state short-circuit; `failed`/`pending_async`
|
||||
rows are retried on the next boot.
|
||||
|
||||
Reference implementations: `Store.BackfillPathJSONAsync` (path_json
|
||||
backfill) and the converted `obs_observer_ts_idx_v1` index build in
|
||||
`OpenStore`.
|
||||
|
||||
### Option 2 — Annotate as preflight-cheap
|
||||
|
||||
Some migrations are genuinely cheap at any scale (e.g. `ALTER TABLE ADD
|
||||
COLUMN`, `CREATE INDEX` on a table you know is bounded to a few thousand
|
||||
rows). Annotate the migration block with a comment **on the line
|
||||
immediately above the migration block** so the preflight gate recognises
|
||||
the opt-out:
|
||||
|
||||
```go
|
||||
// PREFLIGHT: async=true reason="ALTER ADD COLUMN — O(1) sqlite operation"
|
||||
if r := db.QueryRow("SELECT 1 FROM _migrations WHERE name = 'foo_v1'"); ...
|
||||
```
|
||||
|
||||
The reason MUST be a real one-line justification you can defend in
|
||||
review. "It's fine" is not a reason.
|
||||
|
||||
### Option 3 — Opt out per PR
|
||||
|
||||
If the migration is genuinely safe and you don't want to add an inline
|
||||
annotation, put a single line in the PR body:
|
||||
|
||||
```
|
||||
PREFLIGHT-MIGRATION-SCALE: <30s N=80K verified on Cascadia staging snapshot
|
||||
```
|
||||
|
||||
This must include both `<30s` and `N=<some scale>` so a reviewer can
|
||||
challenge the measurement.
|
||||
|
||||
## The gate
|
||||
|
||||
`~/.openclaw/skills/pr-preflight/scripts/check-async-migrations.sh` runs
|
||||
on every PR via the preflight orchestrator. It greps the diff for new or
|
||||
modified migration blocks (files matching `cmd/ingestor/db.go`,
|
||||
`cmd/ingestor/maintenance.go`, `internal/dbschema/**`, `**/migrations/**`,
|
||||
`**/*.sql`, plus any Go file touching `CREATE INDEX` / `ALTER TABLE` /
|
||||
`CREATE UNIQUE INDEX`). For each hit it requires one of the three
|
||||
opt-outs above. Hard-fail (exit 1) — no warning-only mode.
|
||||
|
||||
## Concurrency model
|
||||
|
||||
CoreScope runs **one ingestor process** per deployment (`cmd/ingestor/`,
|
||||
single binary, single `*Store`). There is no cluster mode, no leader
|
||||
election, no second writer. SQLite is opened with `SetMaxOpenConns(1)`
|
||||
and a 5s `busy_timeout`; all writes (live MQTT ingest + async migration
|
||||
goroutines + maintenance backfills) serialize through the one connection
|
||||
in a single process.
|
||||
|
||||
What this means for async migrations:
|
||||
|
||||
- **No cross-process race** to worry about. Two ingestor instances
|
||||
running against the same DB is not a supported deployment shape.
|
||||
- **Within a single process**, concurrent `RunAsyncMigration(name=X)`
|
||||
callers race the initial `SELECT status` → `UPDATE/INSERT` step. The
|
||||
current implementation re-schedules `fn` on a pending/failed row so a
|
||||
duplicate caller may legitimately re-run it; once status is `done` all
|
||||
further calls short-circuit. See
|
||||
`TestRunAsyncMigration_ConcurrentSameNameSerialized` for the contract.
|
||||
- **`fn` runs concurrently with live ingest writers.** Because
|
||||
`MaxOpenConns=1`, a long `CREATE INDEX` will serialize behind / ahead
|
||||
of insert batches via SQLite's busy-timeout. This is acceptable for
|
||||
index builds (the boot path is unblocked, which was the whole point),
|
||||
but it means long migrations DO add latency to live writes. Document
|
||||
expected runtime in the `reason=` annotation and prefer batched/chunked
|
||||
fn implementations for multi-minute work (see `BackfillPathJSONAsync`
|
||||
for the canonical batched pattern with inter-batch `time.Sleep`).
|
||||
|
||||
## Scale budgets
|
||||
|
||||
Per-migration target: **<30s** at current prod scale (Cascadia: ~2,600
|
||||
nodes, ~80K observations; previous prod snapshot: ~1.9M observations).
|
||||
|
||||
Worked example (#1483, `obs_observer_ts_idx_v1`): composite index build
|
||||
on `observations(observer_idx, timestamp)`. At ~1.9M rows the sync build
|
||||
pinned ingestor boot for several minutes → restart loop. Converted to
|
||||
async via `RunAsyncMigration` in `OpenStore` so boot returns immediately
|
||||
and the index materializes in the background; the existing `_migrations`
|
||||
short-circuit at the top of the migration block ensures DBs that already
|
||||
completed the sync v3.8.3 build do NOT re-run it through the goroutine
|
||||
path on subsequent boots.
|
||||
|
||||
If you cannot meet the <30s budget, document the expected upper bound
|
||||
and operator runbook expectation (e.g. "index build expected ~10 min on
|
||||
a 5M-row table; ingestor remains responsive; monitor via
|
||||
`SELECT status, error FROM _async_migrations WHERE name = ...`").
|
||||
|
||||
## Why this exists
|
||||
|
||||
Pattern that keeps repeating:
|
||||
|
||||
1. Author writes `CREATE INDEX foo ON observations(...)` in a migration.
|
||||
2. Local dev DB has ~100 rows. Migration returns in 1ms. CI is green.
|
||||
3. Reviewer focuses on plan correctness, not scale.
|
||||
4. Ship.
|
||||
5. Prod boots, sqlite scans 1.9M rows, the ingestor sits at `[migration]
|
||||
Adding index...` for 8 minutes, healthcheck times out, container
|
||||
restarts, loops.
|
||||
6. Operator pages. Hotfix. Apology.
|
||||
|
||||
The gate doesn't try to detect table size (undecidable from a diff). It
|
||||
enforces **annotation discipline**: every author who adds a migration
|
||||
must consciously decide which bucket it falls into and write that down.
|
||||
That is the cheapest possible intervention that breaks the cycle.
|
||||
@@ -21,7 +21,6 @@ The Go backend serves all 40+ API endpoints from an in-memory packet store with
|
||||
| Memory (56K packets) | **~300 MB** (vs 1.3 GB on Node.js) |
|
||||
| WebSocket broadcast | **Real-time** to all connected browsers |
|
||||
| Channel decryption | **AES-128-ECB** with rainbow table |
|
||||
| GOMEMLIMIT (memory-constrained hosts) | **set to ≥1.5× working set** (e.g. 1536 MiB on a 2 GB Pi for a ~1 GB store). Lower values trigger a GC death-spiral. Configure via the `GOMEMLIMIT` env var or `runtime.maxMemoryMB` in `config.json`; env wins. Applies to both server and ingestor. See [#1010](https://github.com/Kpa-clawbot/CoreScope/issues/1010). |
|
||||
|
||||
See [PERFORMANCE.md](PERFORMANCE.md) for full benchmarks.
|
||||
|
||||
|
||||
@@ -1,207 +0,0 @@
|
||||
# v3.6.0 - The Forensics
|
||||
|
||||
CoreScope just got eyes everywhere. This release drops **path inspection**, **color-by-hash markers**, **clock skew detection**, **full channel encryption**, an **observer graph**, and a pile of robustness fixes that make your mesh network feel like it's being watched by someone who actually cares.
|
||||
|
||||
134 commits, 105 PRs merged, 18K+ lines added. Here's what shipped.
|
||||
|
||||
---
|
||||
|
||||
## 🚀 New Features
|
||||
|
||||
### Path-Prefix Candidate Inspector (#944, #945)
|
||||
The marquee feature. Click any path segment and CoreScope opens an interactive inspector showing every candidate node that could match that hop prefix - plotted on a map with scoring by neighbor-graph affinity and geographic centroid. Ambiguous hops? Now you can see *why* they're ambiguous and pick the right one.
|
||||
|
||||
**Why you'll love it:** No more guessing which `0xA3` is the real repeater. The inspector lays out every candidate, scores them, and lets you drill in visually.
|
||||
|
||||
### Color-by-Hash Packet Markers (#948, #951)
|
||||
Every packet type gets a vivid, hash-derived color - on the live feed, map polylines, and flying-packet animations. Bright fill with dark outline for contrast. No more monochrome blobs - you can visually track packet flows by color at a glance.
|
||||
|
||||
### Node Filter on Live Page (#924, #771)
|
||||
Filter the live packet stream to show only traffic flowing through a specific node. Pick a repeater, see exactly what it's carrying. That simple.
|
||||
|
||||
### Clock Skew Detection (#746, #752, #828, #850)
|
||||
Full pipeline: backend computes drift using Theil-Sen regression with outlier rejection (#828), the UI shows per-node badges, detail sparklines, and fleet-wide analytics (#752). Bimodal clock severity (#850) surfaces flaky-RTC nodes that toggle between accurate and drifted - instead of hiding them as "No Clock."
|
||||
|
||||
**Why you'll love it:** Nodes with bad clocks silently corrupt your timeline. Now they glow red before they ruin your analysis.
|
||||
|
||||
### Observer Graph (M1+M2) (#774)
|
||||
Observers are now first-class graph citizens. CoreScope builds a neighbor graph from observation overlaps, scores hop-resolver candidates by graph edges (#876), and uses geographic centroid for tiebreaking. The observer topology is visible and queryable.
|
||||
|
||||
### Channel Encryption - Full Stack (#726, #733, #750, #760)
|
||||
Three milestones landed as one: DB-backed channel message history (#726), client-side PSK decryption in the browser (#733), and PSK channel management with add/remove UX and message caching (#750). Add a channel key in the UI, and CoreScope decrypts messages client-side - no server-side key storage. The add-channel button (#760) makes it dead simple.
|
||||
|
||||
**Why you'll love it:** Encrypted channels are no longer black boxes. Add your PSK, see the messages, search history - all without exposing keys to the server.
|
||||
|
||||
### Hash Collision Inspector (#758)
|
||||
The Hash Usage Matrix now shows collision details for all hash sizes. When two nodes share a prefix, you see exactly who collides and at what size.
|
||||
|
||||
### Geofilter Builder - In-App (#735, #900)
|
||||
The geofilter polygon builder is now served directly from CoreScope with a full docs page (#900). No more hunting for external tools. Link from the customizer, draw your polygon, done.
|
||||
|
||||
### Node Blacklist (#742)
|
||||
`nodeBlacklist` in config hides abusive or troll nodes from all views. They're gone.
|
||||
|
||||
### Observer Retention (#764)
|
||||
Stale observers are automatically pruned after a configurable number of days. Your observer list stays clean without manual intervention.
|
||||
|
||||
### Advert Signature Validation (#794)
|
||||
Corrupt packets with invalid advert signatures are now rejected at ingest. Bad data never hits your store.
|
||||
|
||||
### Bounded Cold Load (#790)
|
||||
`Load()` now respects a memory budget - no more OOM on cold start with a fat database. Combined with retention-hours cutoff (#917), cold start is safe on constrained hardware.
|
||||
|
||||
### Multi-Arch Docker Images (#869)
|
||||
Official images now publish `amd64` + `arm64` in a single multi-arch manifest. Raspberry Pi operators: pull and run. No special tags needed.
|
||||
|
||||
### /nodes Detail Panel + Search (#868)
|
||||
The nodes detail panel ships with search improvements (#862) - find nodes fast, see their full detail in a slide-out panel.
|
||||
|
||||
### Deduplicated Top Longest Hops (#848)
|
||||
Longest hops are now deduplicated by pair with observation count and SNR cues. No more seeing the same link 47 times.
|
||||
|
||||
---
|
||||
|
||||
## 🔥 Performance Wins
|
||||
|
||||
### StoreTx ResolvedPath Elimination (#806)
|
||||
The per-transaction `ResolvedPath` computation is gone - replaced by a membership index with on-demand decode. This was one of the hottest paths in the ingestor.
|
||||
|
||||
### Node Packet Queries (#803)
|
||||
Raw JSON text search for node packets replaced with a proper `byNode` index (#673). Night and day.
|
||||
|
||||
### Channel Query Performance (#762, #763)
|
||||
New `channel_hash` column enables SQL-level channel filtering. No more full-table scan to find messages in a channel.
|
||||
|
||||
### SQLite Auto-Vacuum (#919, #920)
|
||||
Incremental auto-vacuum enabled - the database file actually shrinks after retention pruning. No more 2GB database holding 200MB of live data.
|
||||
|
||||
### Retention-Hours Cutoff on Load (#917)
|
||||
`Load()` now applies `retentionHours` at read time, preventing OOM when the DB has more history than memory allows.
|
||||
|
||||
---
|
||||
|
||||
## 🛡️ Security & Robustness
|
||||
|
||||
### MQTT Reconnect with Bounded Backoff (#947, #949)
|
||||
The ingestor now reconnects to MQTT brokers with exponential backoff, observability logging, and bounded retry. No more silent disconnects that kill your data stream.
|
||||
|
||||
---
|
||||
|
||||
## 🐛 Bugs Squashed
|
||||
|
||||
This release exterminates **40+ bugs** — from protocol-level hash mismatches to pixel-level CSS breakage. Operators told us what hurt; we listened.
|
||||
|
||||
- **Path inspector "Show on Map" missed origin and first hop** (#950) - map view now includes all hops
|
||||
- **Content hash used full header byte** (#787) - content hashing now uses payload type bits only, fixing hash collisions between packets that differ only in header flags
|
||||
- **Encrypted channel deep links showed broken UI** (#825, #826, #815) - deep links to encrypted channels now show a lock message instead of broken UI when you don't have the key
|
||||
- **Geofilter longitude wrapping** (#925) - geofilter builder wraps longitude to [-180, 180]; southern hemisphere polygons no longer invert
|
||||
- **Hash filter bypasses saved region filter** (#939) - hash lookups now skip the geo filter as intended
|
||||
- **Companion-as-repeater excluded from path hops** (#935, #936) - non-repeater nodes no longer pollute hop resolution
|
||||
- **Customize panel re-renders while typing** (#927) - text fields keep focus during config changes
|
||||
- **Per-observation raw_hex** (#881, #882) - each observer's hex dump now shows what *that observer* actually received
|
||||
- **Per-observation children in packet groups** (#866, #880) - expanded groups show per-obs data, not cross-observer aggregates
|
||||
- **Full-page obs-switch** (#866, #870) - switching observers updates hex, path, and direction correctly
|
||||
- **Packet detail shows wrong observation** (#849, #851) - clicking a specific observation opens *that* observation
|
||||
- **Byte breakdown hop count** (#844, #846) - derived from `path_len`, not aggregated `_parsedPath`
|
||||
- **Transport-route path_len offset** (#852, #853) - correct offset calculation + CSS variable fix
|
||||
- **Packets/hour chart bars + x-axis** (#858, #865) - bars render correctly, x-axis labels properly decimated
|
||||
- **Channel timeline capped to top 8** (#860, #864) - no more 47-channel chart spaghetti
|
||||
- **Reachability row opacity removed** (#859, #863) - clean rows without misleading gradient
|
||||
- **Sticky table headers on mobile** (#861, #867) - restored after regression
|
||||
- **Map popup 'Show Neighbors' on iOS Safari** (#840, #841) - link actually works now
|
||||
- **Node detail Recent Packets invisible text** (#829, #830) - CSS fix
|
||||
- **/api/packets/{hash} falls back to DB** (#827, #831) - when in-memory store misses, DB catches it
|
||||
- **IATA filter bypass for status messages** (#694, #802) - status packets no longer filtered out by airport codes
|
||||
- **Desktop node click URL hash** (#676, #739) - clicking a node updates the URL for deep linking
|
||||
- **Filter params in URL hash** (#682, #740) - all filter state serialized for shareable links
|
||||
- **Hide undecryptable channel messages** (#727, #728) - clean default view
|
||||
- **TRACE path_json uses path_sz** (#732) - correct field from flags byte, not header hash_size
|
||||
- **Multi-byte adopters** (#754, #767) - all node types, role column, advert precedence
|
||||
- **Channel key case sensitivity** (#761) - Public decode works correctly
|
||||
- **Transport route field offsets** (#766) - correct offsets in field table
|
||||
- **Clock skew sanity checks** (#769) - filter epoch-0, cap drift, require minimum samples
|
||||
- **Neighbor graph slider persistence** (#776) - default 0.7, persisted to localStorage
|
||||
- **Node detail panel navigation** (#779, #785) - Details/Analytics links actually navigate
|
||||
- **Channel key removal** (#898) - user-added keys for server-known channels can be removed
|
||||
- **Side-panel Details on desktop** (#892) - opens full-screen correctly
|
||||
- **Hex-dump byte ranges client-side** (#891) - computed from per-obs raw_hex
|
||||
- **path_json derived from raw_hex at ingest** (#886, #887) - single source of truth
|
||||
- **Path pill and byte breakdown hop agreement** (#885) - they match now
|
||||
- **Mobile close button + toolbar scroll** (#797, #805) - accessible and scrollable
|
||||
- **/health.recentPackets resolved_path fallback** (#810, #821) - falls back to longest sibling observation
|
||||
- **Channel filter on Packets page** (#812, #816) - UI and API both fixed
|
||||
- **Clock-skew section in side panel** (#813, #814) - renders correctly
|
||||
- **Real RSS in /api/stats** (#832, #835) - surface actual RSS alongside tracked store bytes
|
||||
- **Hash size detection for transport routes + zero-hop adverts** (#747) - correct detection
|
||||
- **Repeater+observer merged map marker** (#745) - single marker, not two overlapping
|
||||
|
||||
---
|
||||
|
||||
## 🎨 UI Polish
|
||||
|
||||
- QA findings applied across the board (#832, #833, #836, #837, #838) - dozens of small UX fixes from systematic QA pass
|
||||
|
||||
---
|
||||
|
||||
## 📦 Upgrading
|
||||
|
||||
```bash
|
||||
git pull
|
||||
docker compose down
|
||||
docker compose build prod
|
||||
docker compose up -d prod
|
||||
```
|
||||
|
||||
Your existing `config.json` works as-is. New optional config keys:
|
||||
- `nodeBlacklist` - array of node hashes to hide
|
||||
- `observerRetentionDays` - days before stale observers are pruned
|
||||
- `memoryBudgetMB` - cap on in-memory packet store
|
||||
|
||||
### Verify
|
||||
|
||||
```bash
|
||||
curl -s http://localhost/api/health | jq .version
|
||||
# "3.6.0"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🙏 External Contributors
|
||||
|
||||
- **#735** ([@efiten](https://github.com/efiten)) - Serve geofilter builder from app, link from customizer
|
||||
- **#739** ([@efiten](https://github.com/efiten)) - Desktop node click updates URL hash for deep linking
|
||||
- **#740** ([@efiten](https://github.com/efiten)) - Serialize filter params in URL hash for shareable links
|
||||
- **#742** ([@Joel-Claw](https://github.com/Joel-Claw)) - Add nodeBlacklist config to hide abusive/troll nodes
|
||||
- **#761** ([@copelaje](https://github.com/copelaje)) - Fix channel key case sensitivity for Public decode
|
||||
- **#764** ([@Joel-Claw](https://github.com/Joel-Claw)) - Add observer retention - prune stale observers after configurable days
|
||||
- **#802** ([@efiten](https://github.com/efiten)) - Bypass IATA filter for status messages, fill SNR on duplicate observations
|
||||
- **#803** ([@efiten](https://github.com/efiten)) - Replace raw JSON text search with byNode index for node packet queries
|
||||
- **#805** ([@efiten](https://github.com/efiten)) - Mobile close button accessible + toolbar scrollable
|
||||
- **#900** ([@efiten](https://github.com/efiten)) - App-served geofilter docs page
|
||||
- **#917** ([@efiten](https://github.com/efiten)) - Apply retentionHours cutoff in Load() to prevent OOM on cold start
|
||||
- **#924** ([@efiten](https://github.com/efiten)) - Node filter on live page - show only traffic through a specific node
|
||||
- **#925** ([@efiten](https://github.com/efiten)) - Fix geobuilder longitude wrapping for southern hemisphere polygons
|
||||
- **#927** ([@efiten](https://github.com/efiten)) - Skip customize panel re-render while text field has focus
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ Breaking Changes
|
||||
|
||||
**None.** All API endpoints remain backwards-compatible. New fields are additive only.
|
||||
|
||||
---
|
||||
|
||||
## 📊 By the Numbers
|
||||
|
||||
| Stat | Count |
|
||||
|------|-------|
|
||||
| Commits | 134 |
|
||||
| PRs merged | 105 |
|
||||
| Lines added | 18,480 |
|
||||
| Lines removed | 1,632 |
|
||||
| Files changed | 110 |
|
||||
| Contributors | 4 |
|
||||
|
||||
---
|
||||
|
||||
*Previous release: [v3.5.2](https://github.com/Kpa-clawbot/CoreScope/releases/tag/v3.5.2)*
|
||||
@@ -294,6 +294,5 @@
|
||||
"#colombia": "bea223a8c1d13ed9638ee000ea3a6aca",
|
||||
"#bogota": "6d0864985b64350ce4cbfebf4979e970",
|
||||
"#peru": "7e6fc347bf29a4c128ac3156865bd521",
|
||||
"#lima": "5f167ce354eca08ab742463df10ef255",
|
||||
"Public": "8b3387e9c5cdea6ac9e5edbaa115cd72"
|
||||
}
|
||||
"#lima": "5f167ce354eca08ab742463df10ef255"
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
ingestor
|
||||
@@ -47,24 +47,6 @@ The config file uses the same format as the Node.js `config.json`. The ingestor
|
||||
| `DB_PATH` | SQLite database path | `data/meshcore.db` |
|
||||
| `MQTT_BROKER` | Single MQTT broker URL (overrides config) | — |
|
||||
| `MQTT_TOPIC` | MQTT topic (used with `MQTT_BROKER`) | `meshcore/#` |
|
||||
| `CORESCOPE_INGESTOR_STATS` | Path to the per-second stats JSON file consumed by the server's `/api/perf/io` and `/api/perf/write-sources` endpoints (#1120) | `/tmp/corescope-ingestor-stats.json` |
|
||||
|
||||
### Stats file (`CORESCOPE_INGESTOR_STATS`)
|
||||
|
||||
Every second the ingestor publishes a JSON snapshot of its counters
|
||||
(`tx_inserted`, `obs_inserted`, `walCommits`, `backfillUpdates.*`, etc.) plus
|
||||
a `procIO` block sampled from `/proc/self/io` (read/write/cancelled bytes per
|
||||
second + syscall counts). The server reads this file and surfaces the data on
|
||||
the Perf page so operators can self-diagnose write-volume anomalies.
|
||||
|
||||
The writer opens the file with `O_NOFOLLOW | O_CREAT | O_TRUNC` and mode `0o600`, so a
|
||||
pre-planted symlink at the path cannot be used to clobber an arbitrary file.
|
||||
|
||||
**Security note:** the default lives in `/tmp`, which is world-writable on
|
||||
most hosts (sticky bit only protects deletion, not creation). On
|
||||
shared/multi-tenant hosts, override `CORESCOPE_INGESTOR_STATS` to point at a
|
||||
private directory (e.g. `/var/lib/corescope/ingestor-stats.json`) that only
|
||||
the corescope user can write to.
|
||||
|
||||
### Minimal Config
|
||||
|
||||
|
||||
@@ -1,148 +0,0 @@
|
||||
// Async migration helper — runs schema/backfill work that may take minutes on
|
||||
// large prod tables WITHOUT blocking ingestor startup.
|
||||
//
|
||||
// MIGRATION ANNOTATION CONVENTION (read this before touching migrations):
|
||||
//
|
||||
// Sync schema/data migrations (CREATE INDEX, ALTER TABLE, UPDATE ... WHERE)
|
||||
// that run inline during OpenStore() block the ingestor from accepting
|
||||
// packets until they finish. On an empty dev DB they return in milliseconds;
|
||||
// at prod scale (1.9M+ observations, 80K+ adverts) they can pin the boot
|
||||
// for minutes and trigger restart loops. This regression class has bitten us
|
||||
// repeatedly (#791 resolved_path backfill, #1483 obs_observer_ts_idx_v1).
|
||||
//
|
||||
// ANY new CREATE INDEX / ALTER TABLE / data-rewrite migration MUST EITHER:
|
||||
// 1. Run via Store.RunAsyncMigration(...) below (preferred for backfills
|
||||
// and any work that may touch >1K rows). The migration is recorded as
|
||||
// `pending_async` immediately, returns to the caller (boot proceeds),
|
||||
// and completes in a goroutine. Status flips to `done` (or `failed`
|
||||
// with an error message) when fn returns.
|
||||
// 2. Carry the preflight annotation comment immediately above the
|
||||
// migration block, e.g.
|
||||
// // PREFLIGHT: async=true reason="<one-line justification>"
|
||||
// Use this for migrations that are genuinely cheap at any scale
|
||||
// (e.g. ALTER TABLE ADD COLUMN, CREATE INDEX on a known-bounded
|
||||
// table). The annotation is grepped by
|
||||
// ~/.openclaw/skills/pr-preflight/scripts/check-async-migrations.sh
|
||||
// — its absence on a touched migration block is a hard-fail gate.
|
||||
//
|
||||
// See MIGRATIONS.md in the repo root for the full policy and examples.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"log"
|
||||
)
|
||||
|
||||
// ensureAsyncMigrationsTable creates the bookkeeping table used by
|
||||
// RunAsyncMigration / AsyncMigrationStatus. Idempotent.
|
||||
func ensureAsyncMigrationsTable(db *sql.DB) error {
|
||||
_, err := db.Exec(`
|
||||
CREATE TABLE IF NOT EXISTS _async_migrations (
|
||||
name TEXT PRIMARY KEY,
|
||||
status TEXT NOT NULL, -- pending_async | done | failed
|
||||
started_at TEXT NOT NULL DEFAULT (datetime('now')),
|
||||
ended_at TEXT,
|
||||
error TEXT
|
||||
)
|
||||
`)
|
||||
return err
|
||||
}
|
||||
|
||||
// RunAsyncMigration registers `name` as a pending async migration and
|
||||
// schedules `fn` to run in a background goroutine. It returns to the caller
|
||||
// immediately so the ingestor can keep booting.
|
||||
//
|
||||
// Contract (pinned by async_migration_test.go):
|
||||
// - status is `pending_async` IMMEDIATELY after this returns (unless the row was already `done`, which is left untouched).
|
||||
// - fn runs in a goroutine; on success status becomes `done`, on error or
|
||||
// panic status becomes `failed` and the error is recorded.
|
||||
// - Idempotent: if a row with the same name already exists in `done`
|
||||
// state, fn is NOT re-run. If in `failed` or `pending_async` state,
|
||||
// fn IS re-scheduled (a previous run may have crashed mid-flight).
|
||||
// - The Store's own WaitGroup (s.backfillWg) tracks the goroutine so tests/shutdown can
|
||||
// wait via Store.WaitForAsyncMigrations().
|
||||
func (s *Store) RunAsyncMigration(ctx context.Context, name string, fn func(context.Context, *sql.DB) error) error {
|
||||
if err := ensureAsyncMigrationsTable(s.db); err != nil {
|
||||
return fmt.Errorf("ensure _async_migrations: %w", err)
|
||||
}
|
||||
|
||||
var existing string
|
||||
row := s.db.QueryRow(`SELECT status FROM _async_migrations WHERE name = ?`, name)
|
||||
switch err := row.Scan(&existing); err {
|
||||
case nil:
|
||||
if existing == "done" {
|
||||
return nil // already complete, nothing to do
|
||||
}
|
||||
// pending_async or failed → reset and retry.
|
||||
if _, err := s.db.Exec(`
|
||||
UPDATE _async_migrations
|
||||
SET status = 'pending_async', started_at = datetime('now'), ended_at = NULL, error = NULL
|
||||
WHERE name = ?`, name); err != nil {
|
||||
return fmt.Errorf("reset async migration %q: %w", name, err)
|
||||
}
|
||||
case sql.ErrNoRows:
|
||||
if _, err := s.db.Exec(`
|
||||
INSERT INTO _async_migrations (name, status) VALUES (?, 'pending_async')`,
|
||||
name); err != nil {
|
||||
return fmt.Errorf("register async migration %q: %w", name, err)
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("lookup async migration %q: %w", name, err)
|
||||
}
|
||||
|
||||
s.backfillWg.Add(1)
|
||||
go func() {
|
||||
defer s.backfillWg.Done()
|
||||
var runErr error
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
runErr = fmt.Errorf("panic: %v", r)
|
||||
log.Printf("[async-migration] %q panic recovered: %v", name, r)
|
||||
}
|
||||
if runErr != nil {
|
||||
if _, err := s.db.Exec(`
|
||||
UPDATE _async_migrations
|
||||
SET status = 'failed', ended_at = datetime('now'), error = ?
|
||||
WHERE name = ?`, runErr.Error(), name); err != nil {
|
||||
log.Printf("[async-migration] failed to record failure for %q: %v", name, err)
|
||||
}
|
||||
log.Printf("[async-migration] %q FAILED: %v", name, runErr)
|
||||
return
|
||||
}
|
||||
if _, err := s.db.Exec(`
|
||||
UPDATE _async_migrations
|
||||
SET status = 'done', ended_at = datetime('now'), error = NULL
|
||||
WHERE name = ?`, name); err != nil {
|
||||
log.Printf("[async-migration] failed to mark %q done: %v", name, err)
|
||||
return
|
||||
}
|
||||
log.Printf("[async-migration] %q done", name)
|
||||
}()
|
||||
log.Printf("[async-migration] %q starting (boot continues)", name)
|
||||
runErr = fn(ctx, s.db)
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// AsyncMigrationStatus returns the current status of an async migration
|
||||
// (one of "pending_async", "done", "failed") or sql.ErrNoRows if no such
|
||||
// migration has been registered.
|
||||
func (s *Store) AsyncMigrationStatus(name string) (string, error) {
|
||||
if err := ensureAsyncMigrationsTable(s.db); err != nil {
|
||||
return "", err
|
||||
}
|
||||
var status string
|
||||
err := s.db.QueryRow(`SELECT status FROM _async_migrations WHERE name = ?`, name).Scan(&status)
|
||||
return status, err
|
||||
}
|
||||
|
||||
// WaitForAsyncMigrations blocks until all currently-scheduled async migrations
|
||||
// finish. Intended for tests + graceful shutdown; production boot path does NOT
|
||||
// call this (that's the whole point).
|
||||
func (s *Store) WaitForAsyncMigrations() {
|
||||
s.backfillWg.Wait()
|
||||
}
|
||||
@@ -1,299 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// waitForStatus polls AsyncMigrationStatus every 10ms until the status equals `want`; if `timeout` elapses first it fails the test via t.Fatalf.
|
||||
func waitForStatus(t *testing.T, s *Store, name, want string, timeout time.Duration) string {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(timeout)
|
||||
var status string
|
||||
var err error
|
||||
for time.Now().Before(deadline) {
|
||||
status, err = s.AsyncMigrationStatus(name)
|
||||
if err == nil && status == want {
|
||||
return status
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
t.Fatalf("status never reached %q within %s: got %q (err=%v)", want, timeout, status, err)
|
||||
return status
|
||||
}
|
||||
|
||||
// TestRunAsyncMigration_PendingThenDone pins the contract for RunAsyncMigration:
|
||||
//
|
||||
// 1. After calling, the migration name MUST be queryable in the migrations
|
||||
// table with status `pending_async` IMMEDIATELY (no waiting for fn).
|
||||
// 2. After fn returns, the status MUST transition to `done`.
|
||||
// 3. RunAsyncMigration MUST return without blocking on fn.
|
||||
//
|
||||
// This is the regression test for the recurring "sync migration on large
|
||||
// table blocks ingestor startup" class (#791, #1483, ...). If this test
|
||||
// fails the contract is broken — do not relax it; fix the runner.
|
||||
func TestRunAsyncMigration_PendingThenDone(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
ctx := context.Background()
|
||||
|
||||
started := make(chan struct{})
|
||||
release := make(chan struct{})
|
||||
|
||||
const name = "test_async_migration_v1"
|
||||
if err := s.RunAsyncMigration(ctx, name, func(ctx context.Context, db *sql.DB) error {
|
||||
close(started)
|
||||
<-release
|
||||
return nil
|
||||
}); err != nil {
|
||||
t.Fatalf("RunAsyncMigration returned error: %v", err)
|
||||
}
|
||||
|
||||
// Wait for the goroutine to actually start before checking status; this
|
||||
// proves RunAsyncMigration did not block on fn and that fn is running
|
||||
// concurrently.
|
||||
select {
|
||||
case <-started:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("async migration fn did not start within 2s — RunAsyncMigration may have blocked or never scheduled")
|
||||
}
|
||||
|
||||
status, err := s.AsyncMigrationStatus(name)
|
||||
if err != nil {
|
||||
t.Fatalf("AsyncMigrationStatus while running: %v", err)
|
||||
}
|
||||
if status != "pending_async" {
|
||||
t.Fatalf("status while fn running: got %q, want %q", status, "pending_async")
|
||||
}
|
||||
|
||||
close(release)
|
||||
|
||||
// Poll for transition to done.
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
status, err = s.AsyncMigrationStatus(name)
|
||||
if err == nil && status == "done" {
|
||||
return
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
t.Fatalf("status never transitioned to done within 2s: got %q (err=%v)", status, err)
|
||||
}
|
||||
|
||||
// TestRunAsyncMigration_PanicCapture proves that a panic inside fn does NOT
|
||||
// leak past the recover, AND that the migration row transitions to
|
||||
// "failed" with the panic message captured — NOT silently to "done".
|
||||
// Operator visibility into mid-migration crashes is the whole point.
|
||||
func TestRunAsyncMigration_PanicCapture(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
const name = "test_panic_capture_v1"
|
||||
|
||||
if err := s.RunAsyncMigration(context.Background(), name,
|
||||
func(ctx context.Context, db *sql.DB) error {
|
||||
panic("synthetic boom")
|
||||
}); err != nil {
|
||||
t.Fatalf("RunAsyncMigration returned error: %v", err)
|
||||
}
|
||||
|
||||
s.WaitForAsyncMigrations()
|
||||
|
||||
status, err := s.AsyncMigrationStatus(name)
|
||||
if err != nil {
|
||||
t.Fatalf("status lookup: %v", err)
|
||||
}
|
||||
if status != "failed" {
|
||||
t.Fatalf("status after panic: got %q, want %q (silent-done would be catastrophic)", status, "failed")
|
||||
}
|
||||
|
||||
var errMsg sql.NullString
|
||||
if err := s.db.QueryRow(`SELECT error FROM _async_migrations WHERE name = ?`, name).Scan(&errMsg); err != nil {
|
||||
t.Fatalf("error column lookup: %v", err)
|
||||
}
|
||||
if !errMsg.Valid || errMsg.String == "" {
|
||||
t.Fatalf("error column empty after panic — operator has no clue what failed")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunAsyncMigration_IdempotentSecondCallNoOps verifies that calling
|
||||
// RunAsyncMigration a second time with the same name AFTER it has reached
|
||||
// "done" status does NOT re-run fn. This protects the prod path: ingestor
|
||||
// restarts must not rebuild already-built indexes.
|
||||
func TestRunAsyncMigration_IdempotentSecondCallNoOps(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
const name = "test_idempotent_v1"
|
||||
|
||||
var calls int32
|
||||
fn := func(ctx context.Context, db *sql.DB) error {
|
||||
atomic.AddInt32(&calls, 1)
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := s.RunAsyncMigration(context.Background(), name, fn); err != nil {
|
||||
t.Fatalf("first call: %v", err)
|
||||
}
|
||||
s.WaitForAsyncMigrations()
|
||||
waitForStatus(t, s, name, "done", 2*time.Second)
|
||||
|
||||
// Second call must short-circuit; fn must not be invoked again.
|
||||
if err := s.RunAsyncMigration(context.Background(), name, fn); err != nil {
|
||||
t.Fatalf("second call: %v", err)
|
||||
}
|
||||
s.WaitForAsyncMigrations()
|
||||
|
||||
if got := atomic.LoadInt32(&calls); got != 1 {
|
||||
t.Fatalf("fn invoked %d times, want 1 (done-state row must short-circuit)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunAsyncMigration_RestartSafetyFailedIsRetried simulates a crashed
|
||||
// previous run: a row exists in `failed` state from a prior boot. The next
|
||||
// RunAsyncMigration call MUST re-schedule fn (reset to pending_async, then
|
||||
// run it), not leave the migration stuck in `failed` forever.
|
||||
func TestRunAsyncMigration_RestartSafetyFailedIsRetried(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
const name = "test_restart_failed_v1"
|
||||
|
||||
if err := ensureAsyncMigrationsTable(s.db); err != nil {
|
||||
t.Fatalf("ensure table: %v", err)
|
||||
}
|
||||
if _, err := s.db.Exec(`INSERT INTO _async_migrations (name, status, error) VALUES (?, 'failed', 'simulated prior crash')`, name); err != nil {
|
||||
t.Fatalf("seed failed row: %v", err)
|
||||
}
|
||||
|
||||
var calls int32
|
||||
if err := s.RunAsyncMigration(context.Background(), name,
|
||||
func(ctx context.Context, db *sql.DB) error {
|
||||
atomic.AddInt32(&calls, 1)
|
||||
return nil
|
||||
}); err != nil {
|
||||
t.Fatalf("RunAsyncMigration on failed row: %v", err)
|
||||
}
|
||||
s.WaitForAsyncMigrations()
|
||||
waitForStatus(t, s, name, "done", 2*time.Second)
|
||||
|
||||
if got := atomic.LoadInt32(&calls); got != 1 {
|
||||
t.Fatalf("fn invoked %d times, want 1 (failed-state row must be retried)", got)
|
||||
}
|
||||
|
||||
// And the error column must be cleared on success.
|
||||
var errCol sql.NullString
|
||||
if err := s.db.QueryRow(`SELECT error FROM _async_migrations WHERE name = ?`, name).Scan(&errCol); err != nil {
|
||||
t.Fatalf("error col: %v", err)
|
||||
}
|
||||
if errCol.Valid && errCol.String != "" {
|
||||
t.Fatalf("error column not cleared on retry success: %q", errCol.String)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunAsyncMigration_RestartSafetyPendingIsRetried simulates the
|
||||
// ingestor crashing while a migration was still in `pending_async` (the
|
||||
// goroutine never finished). On next boot the migration MUST be re-picked-up
|
||||
// — leaving it stuck in pending forever would be a silent prod outage.
|
||||
func TestRunAsyncMigration_RestartSafetyPendingIsRetried(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
const name = "test_restart_pending_v1"
|
||||
|
||||
if err := ensureAsyncMigrationsTable(s.db); err != nil {
|
||||
t.Fatalf("ensure table: %v", err)
|
||||
}
|
||||
if _, err := s.db.Exec(`INSERT INTO _async_migrations (name, status) VALUES (?, 'pending_async')`, name); err != nil {
|
||||
t.Fatalf("seed pending row: %v", err)
|
||||
}
|
||||
|
||||
var calls int32
|
||||
if err := s.RunAsyncMigration(context.Background(), name,
|
||||
func(ctx context.Context, db *sql.DB) error {
|
||||
atomic.AddInt32(&calls, 1)
|
||||
return nil
|
||||
}); err != nil {
|
||||
t.Fatalf("RunAsyncMigration on pending row: %v", err)
|
||||
}
|
||||
s.WaitForAsyncMigrations()
|
||||
waitForStatus(t, s, name, "done", 2*time.Second)
|
||||
|
||||
if got := atomic.LoadInt32(&calls); got != 1 {
|
||||
t.Fatalf("fn invoked %d times, want 1 (pending row must be retried after crash)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunAsyncMigration_FnErrorRecorded covers the non-panic failure path:
|
||||
// fn returns an error → status MUST be "failed" with the error captured.
|
||||
func TestRunAsyncMigration_FnErrorRecorded(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
const name = "test_fn_error_v1"
|
||||
|
||||
if err := s.RunAsyncMigration(context.Background(), name,
|
||||
func(ctx context.Context, db *sql.DB) error {
|
||||
return fmt.Errorf("simulated migration error")
|
||||
}); err != nil {
|
||||
t.Fatalf("RunAsyncMigration: %v", err)
|
||||
}
|
||||
s.WaitForAsyncMigrations()
|
||||
|
||||
status, err := s.AsyncMigrationStatus(name)
|
||||
if err != nil {
|
||||
t.Fatalf("status: %v", err)
|
||||
}
|
||||
if status != "failed" {
|
||||
t.Fatalf("status: got %q, want failed", status)
|
||||
}
|
||||
|
||||
var errCol sql.NullString
|
||||
if err := s.db.QueryRow(`SELECT error FROM _async_migrations WHERE name = ?`, name).Scan(&errCol); err != nil {
|
||||
t.Fatalf("error col: %v", err)
|
||||
}
|
||||
if !errCol.Valid || errCol.String == "" {
|
||||
t.Fatalf("error column empty after fn error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunAsyncMigration_ConcurrentSameNameSerialized validates the
|
||||
// single-process-instance assumption: ingestor has only one *Store, and
|
||||
// concurrent RunAsyncMigration(name=X) calls on the SAME *Store must not
|
||||
// execute fn more than once for a given name. (CoreScope does not support
|
||||
// multi-ingestor / cluster mode — see MIGRATIONS.md "Concurrency" note —
|
||||
// so cross-process races are out of scope.)
|
||||
func TestRunAsyncMigration_ConcurrentSameNameSerialized(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
const name = "test_concurrent_serialize_v1"
|
||||
|
||||
var calls int32
|
||||
fn := func(ctx context.Context, db *sql.DB) error {
|
||||
atomic.AddInt32(&calls, 1)
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
return nil
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < 5; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
// All concurrent callers use the SAME name. Each is allowed
|
||||
// to either no-op (status==done short-circuit) or schedule
|
||||
// a re-run; the invariant is "fn never runs more than once
|
||||
// concurrently and on second-call-after-done it does not
|
||||
// re-execute."
|
||||
_ = s.RunAsyncMigration(context.Background(), name, fn)
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
s.WaitForAsyncMigrations()
|
||||
waitForStatus(t, s, name, "done", 2*time.Second)
|
||||
|
||||
// The contract per the helper's docstring + Idempotent test is: once
|
||||
// status is `done`, subsequent calls short-circuit. Concurrent calls
|
||||
// that lose the race to set up the pending_async row may legitimately
|
||||
// re-schedule fn (the comment "previous run may have crashed
|
||||
// mid-flight" justifies retry on pending_async). The hard bound is
|
||||
// "fn runs at most ONCE PER pending->done transition" — for this
|
||||
// test we assert fn ran at least once and at most a small bounded
|
||||
// number (5 callers, each may have scheduled before any reached done).
|
||||
if got := atomic.LoadInt32(&calls); got < 1 || got > 5 {
|
||||
t.Fatalf("fn invoked %d times, want 1..5 inclusive (bounded by caller count)", got)
|
||||
}
|
||||
}
|
||||
+9
-154
@@ -7,9 +7,7 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/meshcore-analyzer/dbconfig"
|
||||
"github.com/meshcore-analyzer/geofilter"
|
||||
)
|
||||
|
||||
@@ -22,17 +20,6 @@ type MQTTSource struct {
|
||||
RejectUnauthorized *bool `json:"rejectUnauthorized,omitempty"`
|
||||
Topics []string `json:"topics"`
|
||||
IATAFilter []string `json:"iataFilter,omitempty"`
|
||||
ConnectTimeoutSec int `json:"connectTimeoutSec,omitempty"`
|
||||
Region string `json:"region,omitempty"`
|
||||
}
|
||||
|
||||
// ConnectTimeoutOrDefault returns the per-source connect timeout in seconds,
|
||||
// or 30 if not set (matching the WaitTimeout default from #926).
|
||||
func (s MQTTSource) ConnectTimeoutOrDefault() int {
|
||||
if s.ConnectTimeoutSec > 0 {
|
||||
return s.ConnectTimeoutSec
|
||||
}
|
||||
return 30
|
||||
}
|
||||
|
||||
// MQTTLegacy is the old single-broker config format.
|
||||
@@ -50,101 +37,21 @@ type Config struct {
|
||||
ChannelKeysPath string `json:"channelKeysPath,omitempty"`
|
||||
ChannelKeys map[string]string `json:"channelKeys,omitempty"`
|
||||
HashChannels []string `json:"hashChannels,omitempty"`
|
||||
HashRegions []string `json:"hashRegions,omitempty"`
|
||||
Retention *RetentionConfig `json:"retention,omitempty"`
|
||||
Metrics *MetricsConfig `json:"metrics,omitempty"`
|
||||
Runtime *RuntimeConfig `json:"runtime,omitempty"`
|
||||
GeoFilter *GeoFilterConfig `json:"geo_filter,omitempty"`
|
||||
ForeignAdverts *ForeignAdvertConfig `json:"foreignAdverts,omitempty"`
|
||||
GeoFilter *GeoFilterConfig `json:"geo_filter,omitempty"`
|
||||
ValidateSignatures *bool `json:"validateSignatures,omitempty"`
|
||||
DB *DBConfig `json:"db,omitempty"`
|
||||
|
||||
// ObserverIATAWhitelist restricts which observer IATA regions are processed.
|
||||
// When non-empty, only observers whose IATA code (from the MQTT topic) matches
|
||||
// one of these entries are accepted. Case-insensitive. An empty list means all
|
||||
// IATA codes are allowed. This applies globally, unlike the per-source iataFilter.
|
||||
ObserverIATAWhitelist []string `json:"observerIATAWhitelist,omitempty"`
|
||||
|
||||
// obsIATAWhitelistCached is the lazily-built uppercase set for O(1) lookups.
|
||||
obsIATAWhitelistCached map[string]bool
|
||||
obsIATAWhitelistOnce sync.Once
|
||||
|
||||
// ObserverBlacklist is a list of observer public keys to drop at ingest.
|
||||
// Messages from blacklisted observers are silently discarded — no DB writes,
|
||||
// no UpsertObserver, no observations, no metrics.
|
||||
ObserverBlacklist []string `json:"observerBlacklist,omitempty"`
|
||||
|
||||
// obsBlacklistSetCached is the lazily-built lowercase set for O(1) lookups.
|
||||
obsBlacklistSetCached map[string]bool
|
||||
obsBlacklistOnce sync.Once
|
||||
|
||||
// NeighborEdgesMaxAgeDays controls neighbor_edges row retention
|
||||
// (#1287 — moved from cmd/server). 0 = default 5.
|
||||
NeighborEdgesMaxAgeDays int `json:"neighborEdgesMaxAgeDays,omitempty"`
|
||||
|
||||
// IngestBufferSize caps the in-memory queue (number of MQTT messages) held
|
||||
// while the single SQLite writer is blocked by startup migrations/prunes
|
||||
// (#1608). Received messages are drained once the write path is ready.
|
||||
// 0 / unset => default. Bounded memory.
|
||||
IngestBufferSize int `json:"ingestBufferSize,omitempty"`
|
||||
}
|
||||
|
||||
// NeighborEdgesDaysOrDefault returns the configured pruning window or 5.
|
||||
func (c *Config) NeighborEdgesDaysOrDefault() int {
|
||||
if c == nil || c.NeighborEdgesMaxAgeDays <= 0 {
|
||||
return 5
|
||||
}
|
||||
return c.NeighborEdgesMaxAgeDays
|
||||
}
|
||||
|
||||
// IngestBufferSizeOrDefault returns the ingest buffer capacity. Default 50000:
|
||||
// at typical mesh rates (~1-2 msg/s) that is many minutes of headroom while a
|
||||
// startup migration holds the writer; each queued item is a small closure, so
|
||||
// worst-case memory stays in the tens of MB.
|
||||
func (c *Config) IngestBufferSizeOrDefault() int {
|
||||
if c.IngestBufferSize > 0 {
|
||||
return c.IngestBufferSize
|
||||
}
|
||||
return 50000
|
||||
}
|
||||
|
||||
// GeoFilterConfig is an alias for the shared geofilter.Config type.
|
||||
type GeoFilterConfig = geofilter.Config
|
||||
|
||||
// ForeignAdvertConfig selects what the ingestor does with ADVERTs whose GPS
// position falls outside the configured geofilter polygon (#730).
//
// Supported modes:
//   - "flag" (default): store the advert/node and tag it foreign for visibility.
//   - "drop": silently discard the advert (legacy behavior).
type ForeignAdvertConfig struct {
	// Mode is the configured handling mode; see the type comment.
	Mode string `json:"mode,omitempty"`
}

// IsDropMode reports whether Mode is "drop", ignoring case and surrounding
// whitespace. A nil receiver or any other value (including an empty Mode)
// reports false, i.e. the default "flag" behavior.
func (f *ForeignAdvertConfig) IsDropMode() bool {
	return f != nil && strings.EqualFold(strings.TrimSpace(f.Mode), "drop")
}
|
||||
|
||||
// RetentionConfig controls how long stale nodes are kept before being moved to inactive_nodes.
|
||||
type RetentionConfig struct {
|
||||
NodeDays int `json:"nodeDays"`
|
||||
ObserverDays int `json:"observerDays"`
|
||||
MetricsDays int `json:"metricsDays"`
|
||||
// PacketDays is the retention window for transmissions (#1283).
|
||||
// Ownership moved from cmd/server to cmd/ingestor; 0 disables.
|
||||
PacketDays int `json:"packetDays"`
|
||||
}
|
||||
|
||||
// PacketDaysOrZero returns the configured retention.packetDays or 0
|
||||
// (disabled) if not set.
|
||||
func (c *Config) PacketDaysOrZero() int {
|
||||
if c.Retention != nil && c.Retention.PacketDays > 0 {
|
||||
return c.Retention.PacketDays
|
||||
}
|
||||
return 0
|
||||
NodeDays int `json:"nodeDays"`
|
||||
ObserverDays int `json:"observerDays"`
|
||||
MetricsDays int `json:"metricsDays"`
|
||||
}
|
||||
|
||||
// MetricsConfig controls observer metrics collection.
|
||||
@@ -152,18 +59,12 @@ type MetricsConfig struct {
|
||||
SampleIntervalSec int `json:"sampleIntervalSec"`
|
||||
}
|
||||
|
||||
// RuntimeConfig holds Go runtime tuning knobs (#1010).
|
||||
type RuntimeConfig struct {
|
||||
// MaxMemoryMB is the soft memory limit (GOMEMLIMIT) in MiB applied via
|
||||
// runtime/debug.SetMemoryLimit at startup. The GOMEMLIMIT environment
|
||||
// variable, when set, takes precedence over this value. 0/unset means
|
||||
// no limit is applied and default Go runtime behavior is preserved.
|
||||
MaxMemoryMB int `json:"maxMemoryMB"`
|
||||
// DBConfig controls SQLite vacuum and maintenance behavior (#919).
|
||||
type DBConfig struct {
|
||||
VacuumOnStartup bool `json:"vacuumOnStartup"` // one-time full VACUUM on startup if auto_vacuum is not INCREMENTAL
|
||||
IncrementalVacuumPages int `json:"incrementalVacuumPages"` // pages returned to OS per reaper cycle (default 1024)
|
||||
}
|
||||
|
||||
// DBConfig is the shared SQLite vacuum/maintenance config (#919, #921).
|
||||
type DBConfig = dbconfig.DBConfig
|
||||
|
||||
// IncrementalVacuumPages returns the configured pages per vacuum or 1024 default.
|
||||
func (c *Config) IncrementalVacuumPages() int {
|
||||
if c.DB != nil && c.DB.IncrementalVacuumPages > 0 {
|
||||
@@ -213,43 +114,6 @@ func (c *Config) ObserverDaysOrDefault() int {
|
||||
return 14
|
||||
}
|
||||
|
||||
// IsObserverBlacklisted returns true if the given observer ID is in the observerBlacklist.
|
||||
func (c *Config) IsObserverBlacklisted(id string) bool {
|
||||
if c == nil || len(c.ObserverBlacklist) == 0 {
|
||||
return false
|
||||
}
|
||||
c.obsBlacklistOnce.Do(func() {
|
||||
m := make(map[string]bool, len(c.ObserverBlacklist))
|
||||
for _, pk := range c.ObserverBlacklist {
|
||||
trimmed := strings.ToLower(strings.TrimSpace(pk))
|
||||
if trimmed != "" {
|
||||
m[trimmed] = true
|
||||
}
|
||||
}
|
||||
c.obsBlacklistSetCached = m
|
||||
})
|
||||
return c.obsBlacklistSetCached[strings.ToLower(strings.TrimSpace(id))]
|
||||
}
|
||||
|
||||
// IsObserverIATAAllowed returns true if the given IATA code is permitted.
|
||||
// When ObserverIATAWhitelist is empty, all codes are allowed.
|
||||
func (c *Config) IsObserverIATAAllowed(iata string) bool {
|
||||
if c == nil || len(c.ObserverIATAWhitelist) == 0 {
|
||||
return true
|
||||
}
|
||||
c.obsIATAWhitelistOnce.Do(func() {
|
||||
m := make(map[string]bool, len(c.ObserverIATAWhitelist))
|
||||
for _, code := range c.ObserverIATAWhitelist {
|
||||
trimmed := strings.ToUpper(strings.TrimSpace(code))
|
||||
if trimmed != "" {
|
||||
m[trimmed] = true
|
||||
}
|
||||
}
|
||||
c.obsIATAWhitelistCached = m
|
||||
})
|
||||
return c.obsIATAWhitelistCached[strings.ToUpper(strings.TrimSpace(iata))]
|
||||
}
|
||||
|
||||
// LoadConfig reads configuration from a JSON file, with env var overrides.
|
||||
// If the config file does not exist, sensible defaults are used (zero-config startup).
|
||||
func LoadConfig(path string) (*Config, error) {
|
||||
@@ -313,24 +177,15 @@ func LoadConfig(path string) (*Config, error) {
|
||||
}
|
||||
|
||||
// ResolvedSources returns the final list of MQTT sources to connect to.
|
||||
//
|
||||
// Scheme mapping:
|
||||
//
|
||||
// mqtt:// → tcp:// (paho plain TCP)
|
||||
// mqtts:// → ssl:// (paho TLS over TCP)
|
||||
// ws:// (paho WebSocket — passed through, no mapping needed)
|
||||
// wss:// (paho WebSocket TLS — passed through, no mapping needed)
|
||||
func (c *Config) ResolvedSources() []MQTTSource {
|
||||
for i := range c.MQTTSources {
|
||||
// paho uses tcp:// and ssl:// for plain MQTT; ws:// and wss:// are accepted natively.
|
||||
// paho uses tcp:// and ssl:// not mqtt:// and mqtts://
|
||||
b := c.MQTTSources[i].Broker
|
||||
if strings.HasPrefix(b, "mqtt://") {
|
||||
c.MQTTSources[i].Broker = "tcp://" + b[7:]
|
||||
} else if strings.HasPrefix(b, "mqtts://") {
|
||||
c.MQTTSources[i].Broker = "ssl://" + b[8:]
|
||||
}
|
||||
// ws:// and wss:// pass through unchanged — paho handles WebSocket
|
||||
// connections natively via gorilla/websocket.
|
||||
}
|
||||
return c.MQTTSources
|
||||
}
|
||||
|
||||
@@ -284,215 +284,3 @@ func TestLoadConfigWithAllFields(t *testing.T) {
|
||||
t.Errorf("iataFilter=%v", src.IATAFilter)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectTimeoutOrDefault(t *testing.T) {
|
||||
// Default when unset
|
||||
s := MQTTSource{}
|
||||
if got := s.ConnectTimeoutOrDefault(); got != 30 {
|
||||
t.Errorf("default: got %d, want 30", got)
|
||||
}
|
||||
|
||||
// Custom value
|
||||
s.ConnectTimeoutSec = 5
|
||||
if got := s.ConnectTimeoutOrDefault(); got != 5 {
|
||||
t.Errorf("custom: got %d, want 5", got)
|
||||
}
|
||||
|
||||
// Zero treated as unset
|
||||
s.ConnectTimeoutSec = 0
|
||||
if got := s.ConnectTimeoutOrDefault(); got != 30 {
|
||||
t.Errorf("zero: got %d, want 30", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectTimeoutFromJSON(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
cfgPath := dir + "/config.json"
|
||||
os.WriteFile(cfgPath, []byte(`{"mqttSources":[{"name":"s1","broker":"tcp://b:1883","topics":["#"],"connectTimeoutSec":5}]}`), 0644)
|
||||
cfg, err := LoadConfig(cfgPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if got := cfg.MQTTSources[0].ConnectTimeoutOrDefault(); got != 5 {
|
||||
t.Errorf("from JSON: got %d, want 5", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserverIATAWhitelist(t *testing.T) {
|
||||
// Config with whitelist set
|
||||
cfg := Config{
|
||||
ObserverIATAWhitelist: []string{"ARN", "got"},
|
||||
}
|
||||
|
||||
// Matching (case-insensitive)
|
||||
if !cfg.IsObserverIATAAllowed("ARN") {
|
||||
t.Error("ARN should be allowed")
|
||||
}
|
||||
if !cfg.IsObserverIATAAllowed("arn") {
|
||||
t.Error("arn (lowercase) should be allowed")
|
||||
}
|
||||
if !cfg.IsObserverIATAAllowed("GOT") {
|
||||
t.Error("GOT should be allowed")
|
||||
}
|
||||
|
||||
// Non-matching
|
||||
if cfg.IsObserverIATAAllowed("SJC") {
|
||||
t.Error("SJC should NOT be allowed")
|
||||
}
|
||||
|
||||
// Empty string not allowed
|
||||
if cfg.IsObserverIATAAllowed("") {
|
||||
t.Error("empty IATA should NOT be allowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserverIATAWhitelistEmpty(t *testing.T) {
|
||||
// No whitelist = allow all
|
||||
cfg := Config{}
|
||||
if !cfg.IsObserverIATAAllowed("SJC") {
|
||||
t.Error("with no whitelist, all IATAs should be allowed")
|
||||
}
|
||||
if !cfg.IsObserverIATAAllowed("") {
|
||||
t.Error("with no whitelist, even empty IATA should be allowed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserverIATAWhitelistJSON(t *testing.T) {
|
||||
json := `{
|
||||
"dbPath": "test.db",
|
||||
"observerIATAWhitelist": ["ARN", "GOT"]
|
||||
}`
|
||||
tmp := t.TempDir() + "/config.json"
|
||||
os.WriteFile(tmp, []byte(json), 0644)
|
||||
cfg, err := LoadConfig(tmp)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(cfg.ObserverIATAWhitelist) != 2 {
|
||||
t.Fatalf("expected 2 entries, got %d", len(cfg.ObserverIATAWhitelist))
|
||||
}
|
||||
if !cfg.IsObserverIATAAllowed("ARN") {
|
||||
t.Error("ARN should be allowed after loading from JSON")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMQTTSourceRegionField(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
cfgPath := filepath.Join(dir, "config.json")
|
||||
os.WriteFile(cfgPath, []byte(`{
|
||||
"dbPath": "/tmp/test.db",
|
||||
"mqttSources": [
|
||||
{"name": "cascadia", "broker": "tcp://localhost:1883", "topics": ["meshcore/#"], "region": "PDX"}
|
||||
]
|
||||
}`), 0o644)
|
||||
|
||||
cfg, err := LoadConfig(cfgPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if cfg.MQTTSources[0].Region != "PDX" {
|
||||
t.Fatalf("expected region PDX, got %q", cfg.MQTTSources[0].Region)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvedSourcesSchemeMapping verifies that mqtt:// and mqtts:// are translated
|
||||
// to the paho-native tcp:// and ssl:// schemes, while ws:// and wss:// pass through
|
||||
// unchanged (paho handles WebSocket connections natively).
|
||||
func TestResolvedSourcesSchemeMapping(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{"mqtt://host:1883", "tcp://host:1883"},
|
||||
{"mqtts://host:8883", "ssl://host:8883"},
|
||||
{"tcp://host:1883", "tcp://host:1883"},
|
||||
{"ssl://host:8883", "ssl://host:8883"},
|
||||
{"ws://host:9001", "ws://host:9001"},
|
||||
{"wss://host:9001", "wss://host:9001"},
|
||||
{"ws://host:9001/mqtt", "ws://host:9001/mqtt"},
|
||||
{"wss://host:9001/mqtt", "wss://host:9001/mqtt"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
cfg := &Config{
|
||||
MQTTSources: []MQTTSource{
|
||||
{Name: "test", Broker: tt.input, Topics: []string{"meshcore/#"}},
|
||||
},
|
||||
}
|
||||
sources := cfg.ResolvedSources()
|
||||
if got := sources[0].Broker; got != tt.want {
|
||||
t.Errorf("ResolvedSources(%q) = %q, want %q", tt.input, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestLoadConfigWSSource verifies that a WebSocket MQTT source round-trips through
|
||||
// LoadConfig correctly — username/password preserved, scheme unchanged.
|
||||
func TestLoadConfigWSSource(t *testing.T) {
|
||||
t.Setenv("DB_PATH", "")
|
||||
t.Setenv("MQTT_BROKER", "")
|
||||
|
||||
dir := t.TempDir()
|
||||
cfgPath := filepath.Join(dir, "config.json")
|
||||
os.WriteFile(cfgPath, []byte(`{
|
||||
"dbPath": "test.db",
|
||||
"mqttSources": [
|
||||
{
|
||||
"name": "local-tcp",
|
||||
"broker": "mqtt://localhost:1883",
|
||||
"topics": ["meshcore/#"]
|
||||
},
|
||||
{
|
||||
"name": "wsmqtt-ws",
|
||||
"broker": "wss://wsmqtt.example.com/mqtt",
|
||||
"username": "corescope",
|
||||
"password": "s3cr3t",
|
||||
"topics": ["meshcore/#"]
|
||||
}
|
||||
]
|
||||
}`), 0o644)
|
||||
|
||||
cfg, err := LoadConfig(cfgPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(cfg.MQTTSources) != 2 {
|
||||
t.Fatalf("mqttSources len=%d, want 2", len(cfg.MQTTSources))
|
||||
}
|
||||
|
||||
tcp := cfg.MQTTSources[0]
|
||||
if tcp.Name != "local-tcp" {
|
||||
t.Errorf("name=%s, want local-tcp", tcp.Name)
|
||||
}
|
||||
|
||||
ws := cfg.MQTTSources[1]
|
||||
if ws.Name != "wsmqtt-ws" {
|
||||
t.Errorf("name=%s, want wsmqtt-ws", ws.Name)
|
||||
}
|
||||
if ws.Broker != "wss://wsmqtt.example.com/mqtt" {
|
||||
t.Errorf("broker=%s, want wss://wsmqtt.example.com/mqtt", ws.Broker)
|
||||
}
|
||||
if ws.Username != "corescope" {
|
||||
t.Errorf("username=%s, want corescope", ws.Username)
|
||||
}
|
||||
if ws.Password != "s3cr3t" {
|
||||
t.Errorf("password=%s, want s3cr3t", ws.Password)
|
||||
}
|
||||
|
||||
sources := cfg.ResolvedSources()
|
||||
if sources[1].Broker != "wss://wsmqtt.example.com/mqtt" {
|
||||
t.Errorf("ResolvedSources wss broker=%s, want unchanged", sources[1].Broker)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBufferSizeOrDefault(t *testing.T) {
|
||||
if got := (&Config{}).IngestBufferSizeOrDefault(); got != 50000 {
|
||||
t.Fatalf("default: want 50000, got %d", got)
|
||||
}
|
||||
if got := (&Config{IngestBufferSize: 10}).IngestBufferSizeOrDefault(); got != 10 {
|
||||
t.Fatalf("override: want 10, got %d", got)
|
||||
}
|
||||
if got := (&Config{IngestBufferSize: -5}).IngestBufferSizeOrDefault(); got != 50000 {
|
||||
t.Fatalf("invalid negative should fall back to default, got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,8 +5,6 @@ import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -160,7 +158,7 @@ func TestHandleMessageChannelMessage(t *testing.T) {
|
||||
payload := []byte(`{"text":"Alice: Hello everyone","channel_idx":3,"SNR":5.0,"RSSI":-95,"score":10,"direction":"rx","sender_timestamp":1700000000}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/channel/2", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -220,7 +218,7 @@ func TestHandleMessageChannelMessageEmptyText(t *testing.T) {
|
||||
store, source := newTestContext(t)
|
||||
|
||||
msg := &mockMessage{topic: "meshcore/message/channel/1", payload: []byte(`{"text":""}`)}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -235,7 +233,7 @@ func TestHandleMessageChannelNoSender(t *testing.T) {
|
||||
store, source := newTestContext(t)
|
||||
|
||||
msg := &mockMessage{topic: "meshcore/message/channel/1", payload: []byte(`{"text":"no sender here"}`)}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM nodes").Scan(&count); err != nil {
|
||||
@@ -252,7 +250,7 @@ func TestHandleMessageDirectMessage(t *testing.T) {
|
||||
payload := []byte(`{"text":"Bob: Hey there","sender_timestamp":1700000000,"SNR":3.0,"rssi":-100,"Score":8,"Direction":"tx"}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/direct/abc123", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -296,7 +294,7 @@ func TestHandleMessageDirectMessageEmptyText(t *testing.T) {
|
||||
store, source := newTestContext(t)
|
||||
|
||||
msg := &mockMessage{topic: "meshcore/message/direct/abc", payload: []byte(`{"text":""}`)}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -311,7 +309,7 @@ func TestHandleMessageDirectNoSender(t *testing.T) {
|
||||
store, source := newTestContext(t)
|
||||
|
||||
msg := &mockMessage{topic: "meshcore/message/direct/xyz", payload: []byte(`{"text":"message with no colon"}`)}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -330,7 +328,7 @@ func TestHandleMessageUppercaseScoreDirection(t *testing.T) {
|
||||
payload := []byte(`{"raw":"` + rawHex + `","Score":9.0,"Direction":"tx"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var score *float64
|
||||
var direction *string
|
||||
@@ -351,7 +349,7 @@ func TestHandleMessageChannelLowercaseFields(t *testing.T) {
|
||||
|
||||
payload := []byte(`{"text":"Test: msg","snr":3.0,"rssi":-90,"Score":5,"Direction":"rx"}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/channel/0", payload: payload}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -367,7 +365,7 @@ func TestHandleMessageDirectLowercaseFields(t *testing.T) {
|
||||
|
||||
payload := []byte(`{"text":"Test: msg","snr":2.0,"rssi":-85,"score":7,"direction":"tx"}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/direct/xyz", payload: payload}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -390,7 +388,7 @@ func TestHandleMessageAdvertWithTelemetry(t *testing.T) {
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
// Should have created transmission, node, and observer
|
||||
var txCount, nodeCount, obsCount int
|
||||
@@ -430,12 +428,7 @@ func TestHandleMessageAdvertGeoFiltered(t *testing.T) {
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
// Legacy silent-drop behavior is now opt-in via ForeignAdverts.Mode="drop"
|
||||
// (#730). The new default — flag — is covered by foreign_advert_test.go.
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{
|
||||
GeoFilter: gf,
|
||||
ForeignAdverts: &ForeignAdvertConfig{Mode: "drop"},
|
||||
})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{GeoFilter: gf})
|
||||
|
||||
// Geo-filtered adverts should not create nodes
|
||||
var nodeCount int
|
||||
@@ -443,7 +436,7 @@ func TestHandleMessageAdvertGeoFiltered(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if nodeCount != 0 {
|
||||
t.Errorf("nodes=%d, want 0 (geo-filtered advert in drop mode should not create node)", nodeCount)
|
||||
t.Errorf("nodes=%d, want 0 (geo-filtered advert should not create node)", nodeCount)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -672,7 +665,7 @@ func TestHandleMessageCorruptedAdvertNoNode(t *testing.T) {
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM nodes").Scan(&count); err != nil {
|
||||
@@ -694,7 +687,7 @@ func TestHandleMessageNonAdvertPacket(t *testing.T) {
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -755,13 +748,8 @@ func TestDecodeAdvertSensorNoName(t *testing.T) {
|
||||
// --- db.go: OpenStore error path (invalid dir) ---
|
||||
|
||||
func TestOpenStoreInvalidPath(t *testing.T) {
|
||||
// Create a regular file then try to open a DB inside it — impossible on all platforms.
|
||||
f, err := os.CreateTemp(t.TempDir(), "not-a-dir")
|
||||
if err != nil {
|
||||
t.Fatalf("setup: %v", err)
|
||||
}
|
||||
f.Close()
|
||||
_, err = OpenStore(filepath.Join(f.Name(), "db.sqlite"))
|
||||
// Path under /dev/null can't create directory
|
||||
_, err := OpenStore("/dev/null/impossible/path/db.sqlite")
|
||||
if err == nil {
|
||||
t.Error("should error on impossible path")
|
||||
}
|
||||
@@ -876,7 +864,7 @@ func TestHandleMessageChannelLongSender(t *testing.T) {
|
||||
longText := "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA: msg"
|
||||
payload := []byte(`{"text":"` + longText + `"}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/channel/1", payload: payload}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM nodes").Scan(&count); err != nil {
|
||||
@@ -895,7 +883,7 @@ func TestHandleMessageDirectLongSender(t *testing.T) {
|
||||
longText := "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB: msg"
|
||||
payload := []byte(`{"text":"` + longText + `"}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/direct/abc", payload: payload}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -912,7 +900,7 @@ func TestHandleMessageDirectUppercaseScoreDirection(t *testing.T) {
|
||||
|
||||
payload := []byte(`{"text":"X: hi","Score":6,"Direction":"rx"}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/direct/d1", payload: payload}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -942,7 +930,7 @@ func TestHandleMessageChannelUppercaseScoreDirection(t *testing.T) {
|
||||
|
||||
payload := []byte(`{"text":"Y: hi","Score":4,"Direction":"tx"}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/channel/5", payload: payload}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count); err != nil {
|
||||
@@ -973,7 +961,7 @@ func TestHandleMessageRawLowercaseScore(t *testing.T) {
|
||||
rawHex := "0A00D69FD7A5A7475DB07337749AE61FA53A4788E976"
|
||||
payload := []byte(`{"raw":"` + rawHex + `","score":3.5}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var score *float64
|
||||
if err := store.db.QueryRow("SELECT score FROM observations LIMIT 1").Scan(&score); err != nil {
|
||||
@@ -992,7 +980,7 @@ func TestHandleMessageStatusNoOrigin(t *testing.T) {
|
||||
topic: "meshcore/LAX/obs5/status",
|
||||
payload: []byte(`{"model":"L1"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM observers WHERE id = 'obs5'").Scan(&count); err != nil {
|
||||
|
||||
+79
-1037
File diff suppressed because it is too large
Load Diff
+16
-853
@@ -554,89 +554,18 @@ func TestInsertTransmissionUpdatesObserverLastSeen(t *testing.T) {
|
||||
PathJSON: "[]",
|
||||
DecodedJSON: `{"type":"TXT_MSG"}`,
|
||||
}
|
||||
before := time.Now().Unix()
|
||||
if _, err := s.InsertTransmission(data); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
after := time.Now().Unix()
|
||||
|
||||
// Verify last_seen was updated to INGEST time, not envelope time (#1465).
|
||||
// Verify last_seen was updated
|
||||
var lastSeenAfter string
|
||||
s.db.QueryRow("SELECT last_seen FROM observers WHERE id = ?", "obs1").Scan(&lastSeenAfter)
|
||||
if lastSeenAfter == oldTime {
|
||||
t.Error("observer last_seen was NOT updated after packet insertion — low-traffic observers will appear offline")
|
||||
}
|
||||
ls, err := time.Parse(time.RFC3339, lastSeenAfter)
|
||||
if err != nil {
|
||||
t.Fatalf("last_seen %q not RFC3339: %v", lastSeenAfter, err)
|
||||
}
|
||||
if ls.Unix() < before-5 || ls.Unix() > after+5 {
|
||||
t.Errorf("expected last_seen ≈ server now (in [%d, %d]), got %s (epoch %d). "+
|
||||
"observer.last_seen must use ingest time, not envelope time (#1465).",
|
||||
before, after, lastSeenAfter, ls.Unix())
|
||||
}
|
||||
}
|
||||
|
||||
func TestLastPacketAtUpdatedOnPacketOnly(t *testing.T) {
|
||||
s, err := OpenStore(tempDBPath(t))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s.Close()
|
||||
|
||||
// Insert observer via status path — last_packet_at should be NULL
|
||||
if err := s.UpsertObserver("obs1", "Observer1", "SJC", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var lastPacketAt sql.NullString
|
||||
s.db.QueryRow("SELECT last_packet_at FROM observers WHERE id = ?", "obs1").Scan(&lastPacketAt)
|
||||
if lastPacketAt.Valid {
|
||||
t.Fatalf("expected last_packet_at to be NULL after UpsertObserver, got %s", lastPacketAt.String)
|
||||
}
|
||||
|
||||
// Insert a packet from this observer — last_packet_at should be set
|
||||
data := &PacketData{
|
||||
RawHex: "0A00D69F",
|
||||
Timestamp: "2026-04-24T12:00:00Z",
|
||||
ObserverID: "obs1",
|
||||
Hash: "lastpackettest123456",
|
||||
RouteType: 2,
|
||||
PayloadType: 2,
|
||||
PathJSON: "[]",
|
||||
DecodedJSON: `{"type":"TXT_MSG"}`,
|
||||
}
|
||||
before := time.Now().Unix()
|
||||
if _, err := s.InsertTransmission(data); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
after := time.Now().Unix()
|
||||
|
||||
s.db.QueryRow("SELECT last_packet_at FROM observers WHERE id = ?", "obs1").Scan(&lastPacketAt)
|
||||
if !lastPacketAt.Valid {
|
||||
t.Fatal("expected last_packet_at to be non-NULL after InsertTransmission")
|
||||
}
|
||||
// last_packet_at, like last_seen, is "when did the analyzer last receive a
|
||||
// packet from this observer" — an ingest-time question, independent of the
|
||||
// envelope timestamp. See #1465.
|
||||
lp, err := time.Parse(time.RFC3339, lastPacketAt.String)
|
||||
if err != nil {
|
||||
t.Fatalf("last_packet_at %q not RFC3339: %v", lastPacketAt.String, err)
|
||||
}
|
||||
if lp.Unix() < before-5 || lp.Unix() > after+5 {
|
||||
t.Errorf("expected last_packet_at ≈ server now (in [%d, %d]), got %s (epoch %d)",
|
||||
before, after, lastPacketAt.String, lp.Unix())
|
||||
}
|
||||
|
||||
// UpsertObserver again (status path) — last_packet_at should NOT change
|
||||
if err := s.UpsertObserver("obs1", "Observer1", "SJC", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var lastPacketAtAfterStatus sql.NullString
|
||||
s.db.QueryRow("SELECT last_packet_at FROM observers WHERE id = ?", "obs1").Scan(&lastPacketAtAfterStatus)
|
||||
if !lastPacketAtAfterStatus.Valid || lastPacketAtAfterStatus.String != lastPacketAt.String {
|
||||
t.Errorf("UpsertObserver should not change last_packet_at; expected %s, got %v", lastPacketAt.String, lastPacketAtAfterStatus)
|
||||
if lastSeenAfter != "2026-03-25T01:00:00Z" {
|
||||
t.Errorf("expected last_seen=2026-03-25T01:00:00Z, got %s", lastSeenAfter)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -658,7 +587,7 @@ func TestEndToEndIngest(t *testing.T) {
|
||||
msg := &MQTTPacketMessage{
|
||||
Raw: rawHex,
|
||||
}
|
||||
pktData := BuildPacketData(msg, decoded, "obs1", "SJC", nil)
|
||||
pktData := BuildPacketData(msg, decoded, "obs1", "SJC")
|
||||
if _, err := s.InsertTransmission(pktData); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -846,14 +775,13 @@ func TestBuildPacketData(t *testing.T) {
|
||||
snr := 5.0
|
||||
rssi := -100.0
|
||||
msg := &MQTTPacketMessage{
|
||||
Raw: rawHex,
|
||||
SNR: &snr,
|
||||
RSSI: &rssi,
|
||||
Origin: "test-observer",
|
||||
Timestamp: "2026-05-16T10:00:00Z",
|
||||
Raw: rawHex,
|
||||
SNR: &snr,
|
||||
RSSI: &rssi,
|
||||
Origin: "test-observer",
|
||||
}
|
||||
|
||||
pkt := BuildPacketData(msg, decoded, "obs123", "SJC", nil)
|
||||
pkt := BuildPacketData(msg, decoded, "obs123", "SJC")
|
||||
|
||||
if pkt.RawHex != rawHex {
|
||||
t.Errorf("rawHex mismatch")
|
||||
@@ -883,11 +811,7 @@ func TestBuildPacketData(t *testing.T) {
|
||||
t.Errorf("payloadType mismatch")
|
||||
}
|
||||
if pkt.Timestamp == "" {
|
||||
t.Errorf("timestamp must be populated (server ingest time, #1370 reverts #1233)")
|
||||
}
|
||||
if pkt.Timestamp == "2026-05-16T10:00:00Z" {
|
||||
t.Errorf("timestamp=%s; must NOT be the envelope value (#1370 reverts #1233's "+
|
||||
"premise that envelope timestamp is trustworthy — buggy client clocks poison ordering)", pkt.Timestamp)
|
||||
t.Error("timestamp should be set")
|
||||
}
|
||||
if pkt.DecodedJSON == "" || pkt.DecodedJSON == "{}" {
|
||||
t.Error("decodedJSON should be populated")
|
||||
@@ -902,7 +826,7 @@ func TestBuildPacketDataWithHops(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
msg := &MQTTPacketMessage{Raw: raw}
|
||||
pkt := BuildPacketData(msg, decoded, "", "", nil)
|
||||
pkt := BuildPacketData(msg, decoded, "", "")
|
||||
|
||||
if pkt.PathJSON == "[]" {
|
||||
t.Error("pathJSON should contain hops")
|
||||
@@ -915,7 +839,7 @@ func TestBuildPacketDataWithHops(t *testing.T) {
|
||||
func TestBuildPacketDataNilSNRRSSI(t *testing.T) {
|
||||
decoded, _ := DecodePacket("0A00"+strings.Repeat("00", 10), nil, false)
|
||||
msg := &MQTTPacketMessage{Raw: "0A00" + strings.Repeat("00", 10)}
|
||||
pkt := BuildPacketData(msg, decoded, "", "", nil)
|
||||
pkt := BuildPacketData(msg, decoded, "", "")
|
||||
|
||||
if pkt.SNR != nil {
|
||||
t.Errorf("SNR should be nil")
|
||||
@@ -1716,7 +1640,7 @@ func TestBuildPacketDataScoreAndDirection(t *testing.T) {
|
||||
Direction: &dir,
|
||||
}
|
||||
|
||||
pkt := BuildPacketData(msg, decoded, "obs1", "SJC", nil)
|
||||
pkt := BuildPacketData(msg, decoded, "obs1", "SJC")
|
||||
if pkt.Score == nil || *pkt.Score != 42.0 {
|
||||
t.Errorf("Score=%v, want 42.0", pkt.Score)
|
||||
}
|
||||
@@ -1728,7 +1652,7 @@ func TestBuildPacketDataScoreAndDirection(t *testing.T) {
|
||||
func TestBuildPacketDataNilScoreDirection(t *testing.T) {
|
||||
decoded, _ := DecodePacket("0A00"+strings.Repeat("00", 10), nil, false)
|
||||
msg := &MQTTPacketMessage{Raw: "0A00" + strings.Repeat("00", 10)}
|
||||
pkt := BuildPacketData(msg, decoded, "", "", nil)
|
||||
pkt := BuildPacketData(msg, decoded, "", "")
|
||||
|
||||
if pkt.Score != nil {
|
||||
t.Errorf("Score should be nil, got %v", *pkt.Score)
|
||||
@@ -2160,7 +2084,7 @@ func TestBuildPacketData_TraceUsesPayloadHops(t *testing.T) {
|
||||
}
|
||||
|
||||
msg := &MQTTPacketMessage{Raw: rawHex}
|
||||
pd := BuildPacketData(msg, decoded, "test-obs", "TST", nil)
|
||||
pd := BuildPacketData(msg, decoded, "test-obs", "TST")
|
||||
|
||||
// For TRACE: path_json MUST be the payload-decoded route hops, NOT the SNR bytes
|
||||
expectedPathJSON := `["67","33","D6","33","67"]`
|
||||
@@ -2192,771 +2116,10 @@ func TestBuildPacketData_NonTracePathJSON(t *testing.T) {
|
||||
}
|
||||
|
||||
msg := &MQTTPacketMessage{Raw: rawHex}
|
||||
pd := BuildPacketData(msg, decoded, "obs1", "TST", nil)
|
||||
pd := BuildPacketData(msg, decoded, "obs1", "TST")
|
||||
|
||||
expectedPathJSON := `["AA","BB"]`
|
||||
if pd.PathJSON != expectedPathJSON {
|
||||
t.Errorf("path_json = %s, want %s", pd.PathJSON, expectedPathJSON)
|
||||
}
|
||||
}
|
||||
|
||||
func TestScopeNameMigration(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Verify column exists
|
||||
rows, err := store.db.Query("PRAGMA table_info(transmissions)")
|
||||
if err != nil {
|
||||
t.Fatalf("PRAGMA: %v", err)
|
||||
}
|
||||
found := false
|
||||
for rows.Next() {
|
||||
var cid int
|
||||
var colName, colType string
|
||||
var notNull, pk int
|
||||
var dflt interface{}
|
||||
if err := rows.Scan(&cid, &colName, &colType, ¬Null, &dflt, &pk); err == nil && colName == "scope_name" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
rows.Close()
|
||||
if !found {
|
||||
t.Fatal("scope_name column not found in transmissions")
|
||||
}
|
||||
|
||||
// Verify column actually stores and retrieves values (NULL and non-NULL).
|
||||
_, err = store.db.Exec(`INSERT INTO transmissions (raw_hex, hash, first_seen, route_type, payload_type, scope_name)
|
||||
VALUES ('aabb', 'hash1', '2026-01-01T00:00:00Z', 0, 5, '#belgium')`)
|
||||
if err != nil {
|
||||
t.Fatalf("insert scoped row: %v", err)
|
||||
}
|
||||
_, err = store.db.Exec(`INSERT INTO transmissions (raw_hex, hash, first_seen, route_type, payload_type, scope_name)
|
||||
VALUES ('ccdd', 'hash2', '2026-01-01T00:00:01Z', 0, 5, NULL)`)
|
||||
if err != nil {
|
||||
t.Fatalf("insert unscoped row: %v", err)
|
||||
}
|
||||
|
||||
var name string
|
||||
if err := store.db.QueryRow(`SELECT scope_name FROM transmissions WHERE hash = 'hash1'`).Scan(&name); err != nil {
|
||||
t.Fatalf("read scope_name: %v", err)
|
||||
}
|
||||
if name != "#belgium" {
|
||||
t.Errorf("scope_name = %q, want #belgium", name)
|
||||
}
|
||||
|
||||
var nullScope interface{}
|
||||
if err := store.db.QueryRow(`SELECT scope_name FROM transmissions WHERE hash = 'hash2'`).Scan(&nullScope); err != nil {
|
||||
t.Fatalf("read null scope_name: %v", err)
|
||||
}
|
||||
if nullScope != nil {
|
||||
t.Errorf("scope_name for unscoped = %v, want nil", nullScope)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Feature 3: default_scope column on nodes (#899) ---
|
||||
|
||||
func TestUpdateNodeDefaultScope(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Insert a node into nodes and inactive_nodes so both tables can be updated.
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name) VALUES ('pk1', 'Node1')`); err != nil {
|
||||
t.Fatalf("insert node: %v", err)
|
||||
}
|
||||
if _, err := store.db.Exec(`INSERT INTO inactive_nodes (public_key, name) VALUES ('pk1', 'Node1')`); err != nil {
|
||||
t.Fatalf("insert inactive node: %v", err)
|
||||
}
|
||||
|
||||
// First call: writes scope to both tables.
|
||||
if err := store.UpdateNodeDefaultScope("pk1", "#belgium"); err != nil {
|
||||
t.Fatalf("UpdateNodeDefaultScope: %v", err)
|
||||
}
|
||||
var got string
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = 'pk1'`).Scan(&got); err != nil {
|
||||
t.Fatalf("read nodes.default_scope: %v", err)
|
||||
}
|
||||
if got != "#belgium" {
|
||||
t.Errorf("nodes.default_scope = %q, want #belgium", got)
|
||||
}
|
||||
var gotInactive string
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM inactive_nodes WHERE public_key = 'pk1'`).Scan(&gotInactive); err != nil {
|
||||
t.Fatalf("read inactive_nodes.default_scope: %v", err)
|
||||
}
|
||||
if gotInactive != "#belgium" {
|
||||
t.Errorf("inactive_nodes.default_scope = %q, want #belgium", gotInactive)
|
||||
}
|
||||
|
||||
// Second call with same value: short-circuit, no redundant UPDATE (verify no error and value stable).
|
||||
if err := store.UpdateNodeDefaultScope("pk1", "#belgium"); err != nil {
|
||||
t.Fatalf("UpdateNodeDefaultScope short-circuit: %v", err)
|
||||
}
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = 'pk1'`).Scan(&got); err != nil {
|
||||
t.Fatalf("read after short-circuit: %v", err)
|
||||
}
|
||||
if got != "#belgium" {
|
||||
t.Errorf("after short-circuit nodes.default_scope = %q, want #belgium", got)
|
||||
}
|
||||
|
||||
// Third call with different value: updates both tables.
|
||||
if err := store.UpdateNodeDefaultScope("pk1", "#eu"); err != nil {
|
||||
t.Fatalf("UpdateNodeDefaultScope update: %v", err)
|
||||
}
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = 'pk1'`).Scan(&got); err != nil {
|
||||
t.Fatalf("read after update: %v", err)
|
||||
}
|
||||
if got != "#eu" {
|
||||
t.Errorf("after update nodes.default_scope = %q, want #eu", got)
|
||||
}
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM inactive_nodes WHERE public_key = 'pk1'`).Scan(&gotInactive); err != nil {
|
||||
t.Fatalf("read inactive after update: %v", err)
|
||||
}
|
||||
if gotInactive != "#eu" {
|
||||
t.Errorf("after update inactive_nodes.default_scope = %q, want #eu", gotInactive)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Issue #888: Backfill path_json from raw_hex ---
|
||||
|
||||
func TestBackfillPathJsonFromRawHex(t *testing.T) {
|
||||
dbPath := tempDBPath(t)
|
||||
s, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Insert a transmission with payload_type != TRACE (e.g. 0x01)
|
||||
// raw_hex: header 0x05 (route FLOOD, payload 0x01), path byte 0x42 (hash_size=2, count=2),
|
||||
// hops: AABB, CCDD, then some payload bytes
|
||||
rawHex := "0542AABBCCDD0000000000000000000000000000"
|
||||
s.db.Exec(`INSERT INTO transmissions (raw_hex, hash, first_seen, payload_type) VALUES (?, 'h1', '2025-01-01T00:00:00Z', 1)`, rawHex)
|
||||
|
||||
// Insert observation with raw_hex but empty path_json
|
||||
s.db.Exec(`INSERT INTO observations (transmission_id, timestamp, raw_hex, path_json) VALUES (1, 1000, ?, '[]')`, rawHex)
|
||||
// Insert observation with raw_hex and NULL path_json
|
||||
s.db.Exec(`INSERT INTO observations (transmission_id, timestamp, raw_hex, path_json) VALUES (1, 1001, ?, NULL)`, rawHex)
|
||||
// Insert observation with existing path_json (should NOT be overwritten)
|
||||
s.db.Exec(`INSERT INTO observations (transmission_id, timestamp, raw_hex, path_json) VALUES (1, 1002, ?, '["XX","YY"]')`, rawHex)
|
||||
|
||||
// Insert a TRACE transmission (payload_type = 0x09) — should be skipped
|
||||
traceRaw := "2604302D0D2359FEE7B100000000006733D63367"
|
||||
s.db.Exec(`INSERT INTO transmissions (raw_hex, hash, first_seen, payload_type) VALUES (?, 'h2', '2025-01-01T00:00:00Z', 9)`, traceRaw)
|
||||
s.db.Exec(`INSERT INTO observations (transmission_id, timestamp, raw_hex, path_json) VALUES (2, 1003, ?, '[]')`, traceRaw)
|
||||
|
||||
// Remove the migration marker so it runs again on reopen
|
||||
s.db.Exec(`DELETE FROM _migrations WHERE name = 'backfill_path_json_from_raw_hex_v1'`)
|
||||
s.Close()
|
||||
|
||||
// Reopen — backfill is now async, must trigger explicitly
|
||||
s2, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s2.Close()
|
||||
|
||||
// Trigger async backfill and wait for completion
|
||||
s2.BackfillPathJSONAsync()
|
||||
deadline := time.Now().Add(10 * time.Second)
|
||||
var migCount int
|
||||
for time.Now().Before(deadline) {
|
||||
s2.db.QueryRow("SELECT COUNT(*) FROM _migrations WHERE name = 'backfill_path_json_from_raw_hex_v1'").Scan(&migCount)
|
||||
if migCount == 1 {
|
||||
break
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
if migCount != 1 {
|
||||
t.Fatalf("migration not recorded")
|
||||
}
|
||||
|
||||
// Row 1 (was '[]') is NOT re-processed by the backfill — '[]' means
|
||||
// "already attempted, no hops" and is excluded by the WHERE to avoid the
|
||||
// infinite-loop bug fixed in #1119. It must remain '[]'.
|
||||
var pj1 string
|
||||
s2.db.QueryRow("SELECT path_json FROM observations WHERE id = 1").Scan(&pj1)
|
||||
if pj1 != "[]" {
|
||||
t.Errorf("row 1 path_json = %q, want %q (must not re-process '[]' rows after #1119)", pj1, "[]")
|
||||
}
|
||||
|
||||
// Row 2 (was NULL) should now have decoded hops
|
||||
var pj2 string
|
||||
s2.db.QueryRow("SELECT path_json FROM observations WHERE id = 2").Scan(&pj2)
|
||||
if pj2 != `["AABB","CCDD"]` {
|
||||
t.Errorf("row 2 path_json = %q, want %q", pj2, `["AABB","CCDD"]`)
|
||||
}
|
||||
|
||||
// Row 3 (had existing data) should NOT be overwritten
|
||||
var pj3 string
|
||||
s2.db.QueryRow("SELECT path_json FROM observations WHERE id = 3").Scan(&pj3)
|
||||
if pj3 != `["XX","YY"]` {
|
||||
t.Errorf("row 3 path_json = %q, want %q (should not be overwritten)", pj3, `["XX","YY"]`)
|
||||
}
|
||||
|
||||
// Row 4 (TRACE) should NOT be updated
|
||||
var pj4 string
|
||||
s2.db.QueryRow("SELECT path_json FROM observations WHERE id = 4").Scan(&pj4)
|
||||
if pj4 != "[]" {
|
||||
t.Errorf("row 4 (TRACE) path_json = %q, want %q (should be skipped)", pj4, "[]")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCleanupLegacyNullHashTimestamp(t *testing.T) {
|
||||
path := tempDBPath(t)
|
||||
|
||||
// Create a bare-bones DB with legacy bad data
|
||||
db, err := sql.Open("sqlite", path+"?_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
db.Exec(`CREATE TABLE IF NOT EXISTS transmissions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
raw_hex TEXT NOT NULL,
|
||||
hash TEXT NOT NULL,
|
||||
first_seen TEXT NOT NULL,
|
||||
route_type INTEGER,
|
||||
payload_type INTEGER,
|
||||
payload_version INTEGER,
|
||||
decoded_json TEXT,
|
||||
created_at TEXT DEFAULT (datetime('now')),
|
||||
channel_hash TEXT DEFAULT NULL
|
||||
)`)
|
||||
db.Exec(`CREATE TABLE IF NOT EXISTS observations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
transmission_id INTEGER NOT NULL REFERENCES transmissions(id),
|
||||
observer_idx INTEGER,
|
||||
direction TEXT,
|
||||
snr REAL,
|
||||
rssi REAL,
|
||||
score INTEGER,
|
||||
path_json TEXT,
|
||||
timestamp INTEGER NOT NULL
|
||||
)`)
|
||||
db.Exec(`CREATE TABLE IF NOT EXISTS _migrations (name TEXT PRIMARY KEY)`)
|
||||
db.Exec(`CREATE TABLE IF NOT EXISTS nodes (public_key TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL, last_seen TEXT, first_seen TEXT, advert_count INTEGER DEFAULT 0, battery_mv INTEGER, temperature_c REAL)`)
|
||||
db.Exec(`CREATE TABLE IF NOT EXISTS observers (id TEXT PRIMARY KEY, name TEXT, iata TEXT, last_seen TEXT, first_seen TEXT, packet_count INTEGER DEFAULT 0, model TEXT, firmware TEXT, client_version TEXT, radio TEXT, battery_mv INTEGER, uptime_secs INTEGER, noise_floor REAL, inactive INTEGER DEFAULT 0, last_packet_at TEXT DEFAULT NULL)`)
|
||||
|
||||
// Insert good transmission
|
||||
db.Exec(`INSERT INTO transmissions (id, raw_hex, hash, first_seen) VALUES (1, 'aabb', 'abc123', '2024-01-01T00:00:00Z')`)
|
||||
db.Exec(`INSERT INTO observations (transmission_id, observer_idx, timestamp) VALUES (1, 1, 1704067200)`)
|
||||
|
||||
// Insert bad: empty hash
|
||||
db.Exec(`INSERT INTO transmissions (id, raw_hex, hash, first_seen) VALUES (2, 'ccdd', '', '2024-01-01T00:00:00Z')`)
|
||||
db.Exec(`INSERT INTO observations (transmission_id, observer_idx, timestamp) VALUES (2, 1, 1704067200)`)
|
||||
|
||||
// Insert bad: empty first_seen
|
||||
db.Exec(`INSERT INTO transmissions (id, raw_hex, hash, first_seen) VALUES (3, 'eeff', 'def456', '')`)
|
||||
db.Exec(`INSERT INTO observations (transmission_id, observer_idx, timestamp) VALUES (3, 2, 1704067200)`)
|
||||
|
||||
db.Close()
|
||||
|
||||
// Now open via OpenStore which should run the migration
|
||||
s, err := OpenStore(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s.Close()
|
||||
|
||||
// Good transmission should remain
|
||||
var count int
|
||||
s.db.QueryRow("SELECT COUNT(*) FROM transmissions WHERE id = 1").Scan(&count)
|
||||
if count != 1 {
|
||||
t.Error("good transmission should not be deleted")
|
||||
}
|
||||
|
||||
// Bad transmissions should be gone
|
||||
s.db.QueryRow("SELECT COUNT(*) FROM transmissions WHERE id = 2").Scan(&count)
|
||||
if count != 0 {
|
||||
t.Errorf("transmission with empty hash should be deleted, got count=%d", count)
|
||||
}
|
||||
s.db.QueryRow("SELECT COUNT(*) FROM transmissions WHERE id = 3").Scan(&count)
|
||||
if count != 0 {
|
||||
t.Errorf("transmission with empty first_seen should be deleted, got count=%d", count)
|
||||
}
|
||||
|
||||
// Observations for bad transmissions should be gone
|
||||
s.db.QueryRow("SELECT COUNT(*) FROM observations WHERE transmission_id IN (2, 3)").Scan(&count)
|
||||
if count != 0 {
|
||||
t.Errorf("observations for bad transmissions should be deleted, got count=%d", count)
|
||||
}
|
||||
|
||||
// Observation for good transmission should remain
|
||||
s.db.QueryRow("SELECT COUNT(*) FROM observations WHERE transmission_id = 1").Scan(&count)
|
||||
if count != 1 {
|
||||
t.Error("observation for good transmission should remain")
|
||||
}
|
||||
|
||||
// Migration marker should exist
|
||||
var migCount int
|
||||
s.db.QueryRow("SELECT COUNT(*) FROM _migrations WHERE name = 'cleanup_legacy_null_hash_ts'").Scan(&migCount)
|
||||
if migCount != 1 {
|
||||
t.Error("migration marker cleanup_legacy_null_hash_ts should be recorded")
|
||||
}
|
||||
|
||||
// Idempotent: opening again should not error
|
||||
s.Close()
|
||||
s2, err := OpenStore(path)
|
||||
if err != nil {
|
||||
t.Fatal("second open should not fail:", err)
|
||||
}
|
||||
s2.Close()
|
||||
}
|
||||
|
||||
func TestBuildPacketDataRegionFromPayload(t *testing.T) {
|
||||
msg := &MQTTPacketMessage{Raw: "0102030405060708", Region: "PDX"}
|
||||
decoded := &DecodedPacket{
|
||||
Header: Header{RouteType: 1, PayloadType: 3},
|
||||
}
|
||||
pkt := BuildPacketData(msg, decoded, "obs1", "SJC", nil)
|
||||
// When payload has region, it should override the topic-derived region
|
||||
if pkt.Region != "PDX" {
|
||||
t.Fatalf("expected region PDX from payload, got %q", pkt.Region)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildPacketDataRegionFallsBackToTopic(t *testing.T) {
|
||||
msg := &MQTTPacketMessage{Raw: "0102030405060708"}
|
||||
decoded := &DecodedPacket{
|
||||
Header: Header{RouteType: 1, PayloadType: 3},
|
||||
}
|
||||
pkt := BuildPacketData(msg, decoded, "obs1", "SJC", nil)
|
||||
if pkt.Region != "SJC" {
|
||||
t.Fatalf("expected region SJC from topic, got %q", pkt.Region)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// TestBackfillPathJSONAsync verifies that the path_json backfill does NOT block
|
||||
// OpenStore from returning. MQTT connect happens immediately after OpenStore;
|
||||
// if the backfill is synchronous, MQTT would be delayed indefinitely on large DBs.
|
||||
// This test creates pending backfill rows, opens the store, and asserts that
|
||||
// OpenStore returns before the migration is recorded — proving async execution.
|
||||
func TestBackfillPathJSONAsync(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "async_test.db")
|
||||
|
||||
// Bootstrap schema manually so we can insert test data BEFORE OpenStore
|
||||
db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// Create tables manually (minimal schema for this test)
|
||||
_, err = db.Exec(`
|
||||
CREATE TABLE _migrations (name TEXT PRIMARY KEY);
|
||||
CREATE TABLE transmissions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
raw_hex TEXT NOT NULL,
|
||||
hash TEXT NOT NULL UNIQUE,
|
||||
first_seen TEXT NOT NULL,
|
||||
route_type INTEGER,
|
||||
payload_type INTEGER,
|
||||
payload_version INTEGER,
|
||||
decoded_json TEXT,
|
||||
created_at TEXT DEFAULT (datetime('now')),
|
||||
channel_hash TEXT
|
||||
);
|
||||
CREATE TABLE observers (
|
||||
id TEXT PRIMARY KEY,
|
||||
name TEXT,
|
||||
iata TEXT,
|
||||
last_seen TEXT,
|
||||
first_seen TEXT,
|
||||
packet_count INTEGER DEFAULT 0,
|
||||
model TEXT,
|
||||
firmware TEXT,
|
||||
client_version TEXT,
|
||||
radio TEXT,
|
||||
battery_mv INTEGER,
|
||||
uptime_secs INTEGER,
|
||||
noise_floor REAL,
|
||||
inactive INTEGER DEFAULT 0,
|
||||
last_packet_at TEXT
|
||||
);
|
||||
CREATE TABLE nodes (
|
||||
public_key TEXT PRIMARY KEY,
|
||||
name TEXT, role TEXT, lat REAL, lon REAL,
|
||||
last_seen TEXT, first_seen TEXT, advert_count INTEGER DEFAULT 0,
|
||||
battery_mv INTEGER, temperature_c REAL
|
||||
);
|
||||
CREATE TABLE inactive_nodes (
|
||||
public_key TEXT PRIMARY KEY,
|
||||
name TEXT, role TEXT, lat REAL, lon REAL,
|
||||
last_seen TEXT, first_seen TEXT, advert_count INTEGER DEFAULT 0,
|
||||
battery_mv INTEGER, temperature_c REAL
|
||||
);
|
||||
CREATE TABLE observations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
transmission_id INTEGER NOT NULL REFERENCES transmissions(id),
|
||||
observer_idx INTEGER,
|
||||
direction TEXT,
|
||||
snr REAL, rssi REAL, score INTEGER,
|
||||
path_json TEXT,
|
||||
timestamp INTEGER NOT NULL,
|
||||
raw_hex TEXT
|
||||
);
|
||||
CREATE UNIQUE INDEX idx_observations_dedup ON observations(transmission_id, observer_idx, COALESCE(path_json, ''));
|
||||
CREATE INDEX idx_observations_transmission_id ON observations(transmission_id);
|
||||
CREATE INDEX idx_observations_observer_idx ON observations(observer_idx);
|
||||
CREATE INDEX idx_observations_timestamp ON observations(timestamp);
|
||||
CREATE TABLE observer_metrics (
|
||||
observer_id TEXT NOT NULL,
|
||||
timestamp TEXT NOT NULL,
|
||||
noise_floor REAL, tx_air_secs INTEGER, rx_air_secs INTEGER,
|
||||
recv_errors INTEGER, battery_mv INTEGER,
|
||||
packets_sent INTEGER, packets_recv INTEGER,
|
||||
PRIMARY KEY (observer_id, timestamp)
|
||||
);
|
||||
CREATE TABLE dropped_packets (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
hash TEXT, raw_hex TEXT, reason TEXT NOT NULL,
|
||||
observer_id TEXT, observer_name TEXT,
|
||||
node_pubkey TEXT, node_name TEXT,
|
||||
dropped_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
`)
|
||||
if err != nil {
|
||||
t.Fatal("bootstrap schema:", err)
|
||||
}
|
||||
|
||||
// Mark all migrations as done EXCEPT the path_json backfill
|
||||
for _, m := range []string{
|
||||
"advert_count_unique_v1", "noise_floor_real_v1", "node_telemetry_v1",
|
||||
"obs_timestamp_index_v1", "observer_metrics_v1", "observer_metrics_ts_idx",
|
||||
"observers_inactive_v1", "observer_metrics_packets_v1", "channel_hash_v1",
|
||||
"dropped_packets_v1", "observations_raw_hex_v1", "observers_last_packet_at_v1",
|
||||
"cleanup_legacy_null_hash_ts",
|
||||
} {
|
||||
db.Exec(`INSERT INTO _migrations (name) VALUES (?)`, m)
|
||||
}
|
||||
|
||||
// Insert a transmission + observations with NULL path_json and valid raw_hex
|
||||
// raw_hex "0102AABBCCDD0000" has 2-hop path decodable by packetpath
|
||||
rawHex := "41020304AABBCCDD05060708"
|
||||
_, err = db.Exec(`INSERT INTO transmissions (raw_hex, hash, first_seen, payload_type) VALUES (?, 'hash1', '2025-01-01T00:00:00Z', 4)`, rawHex)
|
||||
if err != nil {
|
||||
t.Fatal("insert tx:", err)
|
||||
}
|
||||
// Insert 100 observations needing backfill
|
||||
for i := 0; i < 100; i++ {
|
||||
_, err = db.Exec(`INSERT INTO observations (transmission_id, observer_idx, timestamp, raw_hex, path_json) VALUES (1, ?, ?, ?, NULL)`,
|
||||
i+1, 1700000000+i, rawHex)
|
||||
if err != nil {
|
||||
// dedup index might fire — use unique observer_idx
|
||||
t.Fatalf("insert obs %d: %v", i, err)
|
||||
}
|
||||
}
|
||||
db.Close()
|
||||
|
||||
// Now open store via OpenStore — this must return QUICKLY (non-blocking)
|
||||
start := time.Now()
|
||||
store, err := OpenStoreWithInterval(dbPath, 300)
|
||||
elapsed := time.Since(start)
|
||||
if err != nil {
|
||||
t.Fatal("OpenStore:", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// OpenStore must return in under 2 seconds (backfill is no longer in applySchema)
|
||||
if elapsed > 2*time.Second {
|
||||
t.Fatalf("OpenStore blocked for %v — backfill must not run in applySchema", elapsed)
|
||||
}
|
||||
|
||||
// Backfill must NOT be recorded yet — it hasn't been triggered
|
||||
var done int
|
||||
err = store.db.QueryRow("SELECT 1 FROM _migrations WHERE name = 'backfill_path_json_from_raw_hex_v1'").Scan(&done)
|
||||
if err == nil {
|
||||
t.Fatal("migration recorded during OpenStore — backfill must be async via BackfillPathJSONAsync()")
|
||||
}
|
||||
|
||||
// Now trigger the async backfill (simulates what main.go does after OpenStore)
|
||||
store.BackfillPathJSONAsync()
|
||||
|
||||
// Wait for backfill to complete (should be very fast with 100 rows)
|
||||
deadline := time.Now().Add(10 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
err = store.db.QueryRow("SELECT 1 FROM _migrations WHERE name = 'backfill_path_json_from_raw_hex_v1'").Scan(&done)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal("backfill never completed within 10s")
|
||||
}
|
||||
|
||||
// Verify backfill actually worked — observations should have non-NULL path_json
|
||||
var nullCount int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM observations WHERE path_json IS NULL").Scan(&nullCount)
|
||||
if nullCount > 0 {
|
||||
t.Errorf("backfill left %d observations with NULL path_json", nullCount)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBackfillPathJSONAsyncMethodExists verifies the async backfill API surface
|
||||
// exists — BackfillPathJSONAsync must be callable independently from OpenStore.
|
||||
func TestBackfillPathJSONAsyncMethodExists(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "method_test.db")
|
||||
store, err := OpenStoreWithInterval(dbPath, 300)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// BackfillPathJSONAsync must exist as a method on *Store
|
||||
// This is a compile-time check — if the method doesn't exist, the test won't compile.
|
||||
store.BackfillPathJSONAsync()
|
||||
}
|
||||
|
||||
// TestBackfillPathJSONAsync_BracketRowsTerminate exercises the infinite-loop bug
|
||||
// from issue #1119. Observations whose path_json is already '[]' (meaning a prior
|
||||
// backfill pass attempted to decode them and found no hops) must NOT be re-selected
|
||||
// by the WHERE clause — otherwise the loop rewrites the same '[]' value forever
|
||||
// and never records the migration marker.
|
||||
//
|
||||
// This test seeds N rows with path_json='[]' and a raw_hex that DecodePathFromRawHex
|
||||
// resolves to zero hops. With the bug, the backfill loops infinitely re-UPDATEing
|
||||
// the same rows back to '[]', batch is never empty, migration marker is never
|
||||
// written. With the fix, no rows match → the very first batch is empty → migration
|
||||
// is recorded immediately.
|
||||
func TestBackfillPathJSONAsync_BracketRowsTerminate(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "bracket_terminate.db")
|
||||
|
||||
// Bootstrap a minimal schema directly so we can seed pre-existing '[]' rows
|
||||
// before OpenStore runs.
|
||||
db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
_, err = db.Exec(`
|
||||
CREATE TABLE _migrations (name TEXT PRIMARY KEY);
|
||||
CREATE TABLE transmissions (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
raw_hex TEXT NOT NULL,
|
||||
hash TEXT NOT NULL UNIQUE,
|
||||
first_seen TEXT NOT NULL,
|
||||
route_type INTEGER,
|
||||
payload_type INTEGER,
|
||||
payload_version INTEGER,
|
||||
decoded_json TEXT,
|
||||
created_at TEXT DEFAULT (datetime('now')),
|
||||
channel_hash TEXT
|
||||
);
|
||||
CREATE TABLE observers (
|
||||
id TEXT PRIMARY KEY, name TEXT, iata TEXT,
|
||||
last_seen TEXT, first_seen TEXT, packet_count INTEGER DEFAULT 0,
|
||||
model TEXT, firmware TEXT, client_version TEXT, radio TEXT,
|
||||
battery_mv INTEGER, uptime_secs INTEGER, noise_floor REAL,
|
||||
inactive INTEGER DEFAULT 0, last_packet_at TEXT
|
||||
);
|
||||
CREATE TABLE nodes (
|
||||
public_key TEXT PRIMARY KEY, name TEXT, role TEXT,
|
||||
lat REAL, lon REAL, last_seen TEXT, first_seen TEXT,
|
||||
advert_count INTEGER DEFAULT 0, battery_mv INTEGER, temperature_c REAL
|
||||
);
|
||||
CREATE TABLE inactive_nodes (
|
||||
public_key TEXT PRIMARY KEY, name TEXT, role TEXT,
|
||||
lat REAL, lon REAL, last_seen TEXT, first_seen TEXT,
|
||||
advert_count INTEGER DEFAULT 0, battery_mv INTEGER, temperature_c REAL
|
||||
);
|
||||
CREATE TABLE observations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
transmission_id INTEGER NOT NULL REFERENCES transmissions(id),
|
||||
observer_idx INTEGER, direction TEXT,
|
||||
snr REAL, rssi REAL, score INTEGER,
|
||||
path_json TEXT,
|
||||
timestamp INTEGER NOT NULL,
|
||||
raw_hex TEXT
|
||||
);
|
||||
CREATE UNIQUE INDEX idx_observations_dedup ON observations(transmission_id, observer_idx, COALESCE(path_json, ''));
|
||||
CREATE INDEX idx_observations_transmission_id ON observations(transmission_id);
|
||||
CREATE INDEX idx_observations_observer_idx ON observations(observer_idx);
|
||||
CREATE INDEX idx_observations_timestamp ON observations(timestamp);
|
||||
CREATE TABLE observer_metrics (
|
||||
observer_id TEXT NOT NULL, timestamp TEXT NOT NULL,
|
||||
noise_floor REAL, tx_air_secs INTEGER, rx_air_secs INTEGER,
|
||||
recv_errors INTEGER, battery_mv INTEGER,
|
||||
packets_sent INTEGER, packets_recv INTEGER,
|
||||
PRIMARY KEY (observer_id, timestamp)
|
||||
);
|
||||
CREATE TABLE dropped_packets (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
hash TEXT, raw_hex TEXT, reason TEXT NOT NULL,
|
||||
observer_id TEXT, observer_name TEXT,
|
||||
node_pubkey TEXT, node_name TEXT,
|
||||
dropped_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
`)
|
||||
if err != nil {
|
||||
t.Fatal("bootstrap schema:", err)
|
||||
}
|
||||
|
||||
// Mark all migrations done EXCEPT backfill_path_json_from_raw_hex_v1.
|
||||
for _, m := range []string{
|
||||
"advert_count_unique_v1", "noise_floor_real_v1", "node_telemetry_v1",
|
||||
"obs_timestamp_index_v1", "observer_metrics_v1", "observer_metrics_ts_idx",
|
||||
"observers_inactive_v1", "observer_metrics_packets_v1", "channel_hash_v1",
|
||||
"dropped_packets_v1", "observations_raw_hex_v1", "observers_last_packet_at_v1",
|
||||
"cleanup_legacy_null_hash_ts",
|
||||
} {
|
||||
db.Exec(`INSERT INTO _migrations (name) VALUES (?)`, m)
|
||||
}
|
||||
|
||||
// raw_hex producing ZERO hops via DecodePathFromRawHex:
|
||||
// DIRECT route (type=2), payload_type=2, version=0 → header 0x0A; path byte 0x00.
|
||||
// (See internal/packetpath/path_test.go: TestDecodePathFromRawHex_ZeroHops.)
|
||||
rawHex := "0A00DEADBEEF"
|
||||
_, err = db.Exec(`INSERT INTO transmissions (raw_hex, hash, first_seen, payload_type) VALUES (?, 'h_brackets', '2025-01-01T00:00:00Z', 2)`, rawHex)
|
||||
if err != nil {
|
||||
t.Fatal("insert tx:", err)
|
||||
}
|
||||
const seedCount = 100
|
||||
for i := 0; i < seedCount; i++ {
|
||||
_, err = db.Exec(`INSERT INTO observations (transmission_id, observer_idx, timestamp, raw_hex, path_json) VALUES (1, ?, ?, ?, '[]')`,
|
||||
i+1, 1700000000+i, rawHex)
|
||||
if err != nil {
|
||||
t.Fatalf("insert obs %d: %v", i, err)
|
||||
}
|
||||
}
|
||||
db.Close()
|
||||
|
||||
store, err := OpenStoreWithInterval(dbPath, 300)
|
||||
if err != nil {
|
||||
t.Fatal("OpenStore:", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Trigger backfill. With the bug, every iteration re-fetches all 100 rows
|
||||
// (because '[]' matches the WHERE), rewrites them to '[]', sleeps 50ms, repeats.
|
||||
// The loop never terminates and the migration marker is never written.
|
||||
store.BackfillPathJSONAsync()
|
||||
|
||||
// Generous deadline: with the fix the marker is written essentially immediately.
|
||||
// With the bug the marker is never written within any bounded time.
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
var done int
|
||||
for time.Now().Before(deadline) {
|
||||
err = store.db.QueryRow("SELECT 1 FROM _migrations WHERE name = 'backfill_path_json_from_raw_hex_v1'").Scan(&done)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("issue #1119: backfill never recorded migration marker within 5s — infinite loop on path_json='[]' rows")
|
||||
}
|
||||
|
||||
// Verify the seeded '[]' rows still have '[]' (sanity — neither bug nor fix
|
||||
// should change their value), and that there are no NULL/empty path_json rows
|
||||
// the backfill should have processed.
|
||||
var bracketCount int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM observations WHERE path_json = '[]'").Scan(&bracketCount)
|
||||
if bracketCount != seedCount {
|
||||
t.Errorf("expected %d rows with path_json='[]', got %d", seedCount, bracketCount)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSchemaMultibyteSupColumns verifies that the multibyte_sup_v1 migration adds
|
||||
// the expected columns and is idempotent across multiple OpenStore calls.
|
||||
func TestSchemaMultibyteSupColumns(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
for _, table := range []string{"nodes", "inactive_nodes"} {
|
||||
rows, err := store.db.Query("PRAGMA table_info(" + table + ")")
|
||||
if err != nil {
|
||||
t.Fatalf("PRAGMA table_info(%s): %v", table, err)
|
||||
}
|
||||
var foundSup, foundEvid bool
|
||||
for rows.Next() {
|
||||
var cid int
|
||||
var name, colType string
|
||||
var notNull, pk int
|
||||
var dflt interface{}
|
||||
if rows.Scan(&cid, &name, &colType, ¬Null, &dflt, &pk) == nil {
|
||||
if name == "multibyte_sup" {
|
||||
foundSup = true
|
||||
}
|
||||
if name == "multibyte_evidence" {
|
||||
foundEvid = true
|
||||
}
|
||||
}
|
||||
}
|
||||
rows.Close()
|
||||
if !foundSup {
|
||||
t.Errorf("table %s: multibyte_sup column missing", table)
|
||||
}
|
||||
if !foundEvid {
|
||||
t.Errorf("table %s: multibyte_evidence column missing", table)
|
||||
}
|
||||
}
|
||||
|
||||
// Verify migration is present. As of #1324 follow-up the migration
|
||||
// lives in internal/dbschema (column-probe + idempotent ALTER), not
|
||||
// in the legacy _migrations marker table — so we just re-assert the
|
||||
// columns exist and the second OpenStore is a no-op.
|
||||
store.Close()
|
||||
store2, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore (second open): %v", err)
|
||||
}
|
||||
store2.Close()
|
||||
}
|
||||
|
||||
// TestUpdateNodeDefaultScope_EmptyScopeIsNoop is the DB-layer defense-in-depth
|
||||
// regression test for #1534. Even if the call-site guard at main.go:720 is
|
||||
// later removed or refactored, the DB function MUST refuse to overwrite a
|
||||
// previously-correct default_scope with the empty string. This is the
|
||||
// belt-and-braces guard recommended by adversarial review (MAJOR-2) and
|
||||
// dijkstra review (MINOR-2).
|
||||
func TestUpdateNodeDefaultScope_EmptyScopeIsNoop(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, default_scope) VALUES ('pk1', 'Node1', '#belgium')`); err != nil {
|
||||
t.Fatalf("insert node: %v", err)
|
||||
}
|
||||
if _, err := store.db.Exec(`INSERT INTO inactive_nodes (public_key, name, default_scope) VALUES ('pk1', 'Node1', '#belgium')`); err != nil {
|
||||
t.Fatalf("insert inactive node: %v", err)
|
||||
}
|
||||
|
||||
// Empty-scope call must be a silent no-op (return nil), NOT overwrite.
|
||||
if err := store.UpdateNodeDefaultScope("pk1", ""); err != nil {
|
||||
t.Fatalf("UpdateNodeDefaultScope(\"\") returned error: %v (want nil)", err)
|
||||
}
|
||||
|
||||
var got string
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = 'pk1'`).Scan(&got); err != nil {
|
||||
t.Fatalf("read nodes.default_scope: %v", err)
|
||||
}
|
||||
if got != "#belgium" {
|
||||
t.Errorf("nodes.default_scope after empty-scope call = %q, want #belgium (DB-layer guard missing — #1534)", got)
|
||||
}
|
||||
var gotInactive string
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM inactive_nodes WHERE public_key = 'pk1'`).Scan(&gotInactive); err != nil {
|
||||
t.Fatalf("read inactive_nodes.default_scope: %v", err)
|
||||
}
|
||||
if gotInactive != "#belgium" {
|
||||
t.Errorf("inactive_nodes.default_scope after empty-scope call = %q, want #belgium (DB-layer guard missing — #1534)", gotInactive)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,115 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestWriterStarvationVisibleInPerf reproduces the #1339 class of bug:
|
||||
// one component (neighbor_builder) holds the writer connection for an
|
||||
// extended period; a second component (mqtt_handler) firing concurrent
|
||||
// writes must show observable wait_ms in the perf snapshot.
|
||||
//
|
||||
// This is the gate test for issue #1340: SQLite write-lock instrumentation
|
||||
// per component. If the wait_ms percentile collapses to zero, the
|
||||
// observability gap remains and the regression class is invisible again.
|
||||
//
|
||||
// Runs ~60s — guarded by testing.Short() so fast unit-test passes can
|
||||
// skip it locally, but CI runs `go test ./...` without -short.
|
||||
func TestWriterStarvationVisibleInPerf(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping 60s starvation test in short mode")
|
||||
}
|
||||
|
||||
// Isolate from samples accumulated by earlier tests in the same
|
||||
// package run — without this the mqtt_handler component already
|
||||
// has ~thousand fast InsertTransmission samples and the 5 slow
|
||||
// follower samples can't move p99 above 50s.
|
||||
ResetWriterStatsForTest()
|
||||
|
||||
s, err := OpenStore(tempDBPath(t))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s.Close()
|
||||
|
||||
const blockDur = 60 * time.Second
|
||||
|
||||
// Blocker: acquire the writer via the wrapped Tx path, tag as
|
||||
// neighbor_builder, sleep 60s while holding the single conn,
|
||||
// then commit. This monopolises the writer for the duration.
|
||||
blockStarted := make(chan struct{})
|
||||
blockerDone := make(chan struct{})
|
||||
go func() {
|
||||
defer close(blockerDone)
|
||||
err := s.WriterTx("neighbor_builder", func(tx *sql.Tx) error {
|
||||
if _, err := tx.Exec(`UPDATE nodes SET name = name WHERE 0`); err != nil {
|
||||
return err
|
||||
}
|
||||
close(blockStarted)
|
||||
time.Sleep(blockDur)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
t.Errorf("blocker tx: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for the blocker to be inside its transaction.
|
||||
<-blockStarted
|
||||
// Small safety margin so the blocker is firmly holding the conn.
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
// Now fire several mqtt_handler writes. Each will block on the
|
||||
// single writer connection until the blocker commits.
|
||||
const followers = 5
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(followers)
|
||||
for i := 0; i < followers; i++ {
|
||||
i := i
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
_, err := s.WriterExec(
|
||||
"mqtt_handler",
|
||||
`INSERT OR IGNORE INTO _migrations (name) VALUES (?)`,
|
||||
fmt.Sprintf("writer_starvation_test_%d", i),
|
||||
)
|
||||
if err != nil {
|
||||
t.Errorf("mqtt follower %d: %v", i, err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
<-blockerDone
|
||||
|
||||
snap := s.WriterStatsSnapshot()
|
||||
mqtt, ok := snap["mqtt_handler"]
|
||||
if !ok {
|
||||
t.Fatalf("no perf snapshot for mqtt_handler component (got components: %v)", componentKeys(snap))
|
||||
}
|
||||
if mqtt.Count < followers {
|
||||
t.Fatalf("expected at least %d mqtt_handler samples, got %d", followers, mqtt.Count)
|
||||
}
|
||||
// This is the gate assertion. With instrumentation present the
|
||||
// follower writes should each register ~60s of wait_ms; p99 must
|
||||
// be well above 50_000ms. With instrumentation missing or broken
|
||||
// the percentile collapses to zero and this fails — which is the
|
||||
// exact regression class #1340 is meant to prevent.
|
||||
if mqtt.WaitMsP99 <= 50_000 {
|
||||
t.Fatalf("mqtt_handler wait_ms p99 = %.1fms, want > 50000ms; "+
|
||||
"writer starvation is invisible to /api/perf — issue #1340 not fixed",
|
||||
mqtt.WaitMsP99)
|
||||
}
|
||||
}
|
||||
|
||||
func componentKeys(m map[string]WriterStatsSnapshot) []string {
|
||||
out := make([]string, 0, len(m))
|
||||
for k := range m {
|
||||
out = append(out, k)
|
||||
}
|
||||
return out
|
||||
}
|
||||
@@ -1,63 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"log"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestHandleMessageDecodeErrorLog_PII — issue #1211 round-0 fix shipped without
|
||||
// a test. Asserts the decode-error log line:
|
||||
// (a) includes structured fields: topic, observer prefix, payload length
|
||||
// (b) observer substring is at most 8 chars
|
||||
// (c) full observer ID is NOT present in the output
|
||||
//
|
||||
// A bare `log.Printf("... observer=%s ...", obs)` would leak the full ID.
|
||||
func TestHandleMessageDecodeErrorLog_PII_Issue1211(t *testing.T) {
|
||||
store, source := newTestContext(t)
|
||||
|
||||
// Use a 64-char observer ID; the prefix MUST be capped at 8 chars in logs.
|
||||
observerID := "abcdef0123456789aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
// Malformed raw — pathByte=0xF6 claims 216 path bytes in a tiny buffer.
|
||||
// This triggers the decode-error path under test.
|
||||
rawHex := "12F6AAAAAAAAAAAAAAAAAAAAAAAAAA"
|
||||
topic := "meshcore/SJC/" + observerID + "/packets"
|
||||
payload := []byte(`{"raw":"` + rawHex + `"}`)
|
||||
msg := &mockMessage{topic: topic, payload: payload}
|
||||
|
||||
var buf bytes.Buffer
|
||||
orig := log.Writer()
|
||||
log.SetOutput(&buf)
|
||||
defer log.SetOutput(orig)
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
|
||||
out := buf.String()
|
||||
if !strings.Contains(out, "decode error") {
|
||||
t.Fatalf("expected decode-error log; got:\n%s", out)
|
||||
}
|
||||
// (a) structured fields present
|
||||
if !strings.Contains(out, "topic=") {
|
||||
t.Errorf("log missing topic=; got:\n%s", out)
|
||||
}
|
||||
if !strings.Contains(out, "observer=") {
|
||||
t.Errorf("log missing observer=; got:\n%s", out)
|
||||
}
|
||||
if !strings.Contains(out, "rawHexLen=") {
|
||||
t.Errorf("log missing rawHexLen=; got:\n%s", out)
|
||||
}
|
||||
// (c) full observer ID must NOT appear
|
||||
if strings.Contains(out, observerID) {
|
||||
t.Errorf("log leaked full observer ID; got:\n%s", out)
|
||||
}
|
||||
// (b) observer substring capped at 8 chars — the 9th char ('2') after the
|
||||
// 8-char prefix must NOT appear adjacent to the prefix.
|
||||
if strings.Contains(out, "abcdef01234") {
|
||||
t.Errorf("log observer field longer than 8 chars; got:\n%s", out)
|
||||
}
|
||||
// Positive: 8-char prefix must be present in the log
|
||||
if !strings.Contains(out, "abcdef01") {
|
||||
t.Errorf("log missing 8-char observer prefix; got:\n%s", out)
|
||||
}
|
||||
}
|
||||
+15
-385
@@ -109,15 +109,6 @@ type Payload struct {
|
||||
MAC string `json:"mac,omitempty"`
|
||||
EncryptedData string `json:"encryptedData,omitempty"`
|
||||
ExtraHash string `json:"extraHash,omitempty"`
|
||||
// Extended ACK fields per firmware 1.16.0 (issue #1610) —
|
||||
// firmware/src/helpers/BaseChatMesh.cpp:218-234. ACK payloads grew from
|
||||
// always-4 bytes to 4/5/6 (4-byte truncated sha256 CRC, optional 1-byte
|
||||
// attempt counter, optional 1-byte RNG byte added in commit a130a95a).
|
||||
// AckLen is the wire payload length; AckAttempt/AckRand are surfaced
|
||||
// only when the sender included them (legacy 4-byte ACKs leave them nil).
|
||||
AckLen *int `json:"ackLen,omitempty"`
|
||||
AckAttempt *int `json:"ackAttempt,omitempty"`
|
||||
AckRand *int `json:"ackRand,omitempty"`
|
||||
PubKey string `json:"pubKey,omitempty"`
|
||||
Timestamp uint32 `json:"timestamp,omitempty"`
|
||||
TimestampISO string `json:"timestampISO,omitempty"`
|
||||
@@ -135,45 +126,16 @@ type Payload struct {
|
||||
ChannelHashHex string `json:"channelHashHex,omitempty"`
|
||||
DecryptionStatus string `json:"decryptionStatus,omitempty"`
|
||||
Channel string `json:"channel,omitempty"`
|
||||
// GRP_DATA (PAYLOAD_TYPE_GRP_DATA=0x06) inner fields, decoded after
|
||||
// channel decrypt per firmware/src/helpers/BaseChatMesh.cpp:382-385.
|
||||
DataType *int `json:"dataType,omitempty"`
|
||||
DataLen *int `json:"dataLen,omitempty"`
|
||||
DecryptedBlob string `json:"decryptedBlob,omitempty"`
|
||||
Text string `json:"text,omitempty"`
|
||||
Sender string `json:"sender,omitempty"`
|
||||
SenderTimestamp uint32 `json:"sender_timestamp,omitempty"`
|
||||
EphemeralPubKey string `json:"ephemeralPubKey,omitempty"`
|
||||
PathData string `json:"pathData,omitempty"`
|
||||
SNRValues []float64 `json:"snrValues,omitempty"`
|
||||
Tag uint32 `json:"tag,omitempty"`
|
||||
AuthCode uint32 `json:"authCode,omitempty"`
|
||||
TraceFlags *int `json:"traceFlags,omitempty"`
|
||||
RawHex string `json:"raw,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
// MULTIPART (PAYLOAD_TYPE_MULTIPART=0x0A) inner fields, decoded per
|
||||
// firmware/src/Mesh.cpp:289 — byte0 = (remaining<<4) | inner_type.
|
||||
Remaining *int `json:"remaining,omitempty"`
|
||||
InnerType *int `json:"innerType,omitempty"`
|
||||
InnerTypeName string `json:"innerTypeName,omitempty"`
|
||||
InnerAckCrc string `json:"innerAckCrc,omitempty"`
|
||||
// Extended ACK inner fields (issue #1610) — when the multipart inner
|
||||
// blob is a v1.16+ extended ACK (5 or 6 bytes after the byte0 header),
|
||||
// surface the same attempt/rand bytes as the top-level decoder.
|
||||
InnerAckLen *int `json:"innerAckLen,omitempty"`
|
||||
InnerAckAttempt *int `json:"innerAckAttempt,omitempty"`
|
||||
InnerAckRand *int `json:"innerAckRand,omitempty"`
|
||||
InnerPayload string `json:"innerPayload,omitempty"`
|
||||
// CONTROL (PAYLOAD_TYPE_CONTROL=0x0B) byte0 flags, per
|
||||
// firmware/src/Mesh.cpp:69 — byte0 high-bit marks zero-hop direct subset.
|
||||
CtrlFlags string `json:"ctrlFlags,omitempty"`
|
||||
CtrlZeroHop *bool `json:"ctrlZeroHop,omitempty"`
|
||||
CtrlLength *int `json:"ctrlLength,omitempty"`
|
||||
// RAW_CUSTOM (PAYLOAD_TYPE_RAW_CUSTOM=0x0F) — application-defined per
|
||||
// firmware/src/Mesh.cpp:577 (createRawData). Exposes the bare envelope
|
||||
// shape (length + leading tag) so consumers can triage by app id.
|
||||
RawLength *int `json:"rawLength,omitempty"`
|
||||
FirstByteTag string `json:"firstByteTag,omitempty"`
|
||||
}
|
||||
|
||||
// DecodedPacket is the full decoded result.
|
||||
@@ -184,7 +146,6 @@ type DecodedPacket struct {
|
||||
Payload Payload `json:"payload"`
|
||||
Raw string `json:"raw"`
|
||||
Anomaly string `json:"anomaly,omitempty"`
|
||||
payloadRaw []byte
|
||||
}
|
||||
|
||||
func decodeHeader(b byte) Header {
|
||||
@@ -210,35 +171,9 @@ func decodeHeader(b byte) Header {
|
||||
}
|
||||
}
|
||||
|
||||
// Firmware-derived limits — see firmware/src/MeshCore.h:19,21.
const (
	maxPathSize      = 64  // MAX_PATH_SIZE — total path bytes allowed
	maxPacketPayload = 184 // MAX_PACKET_PAYLOAD — max raw payload bytes
)

// isValidPathLen reports whether a wire path byte is acceptable, mirroring
// firmware Packet::isValidPathLen (firmware/src/Packet.cpp:13-18).
//
// The upper two bits encode hash_size-1 and the lower six bits encode
// hash_count. A hash_size of 4 is reserved and always rejected; otherwise the
// total path length (hash_count*hash_size) must fit within MAX_PATH_SIZE.
func isValidPathLen(pathByte byte) bool {
	const reservedHashSize = 4

	size := int(pathByte>>6) + 1
	if size == reservedHashSize {
		return false
	}
	count := int(pathByte & 0x3F)
	return count*size <= maxPathSize
}
|
||||
|
||||
func decodePath(pathByte byte, buf []byte, offset int) (Path, int, error) {
|
||||
func decodePath(pathByte byte, buf []byte, offset int) (Path, int) {
|
||||
hashSize := int(pathByte>>6) + 1
|
||||
hashCount := int(pathByte & 0x3F)
|
||||
// Exact mirror of firmware Packet::isValidPathLen (Packet.cpp:13-18).
|
||||
// hash_size==4 is reserved and is rejected by firmware regardless of
|
||||
// hash_count, so we must reject 0xC0 etc even on zero-hop packets —
|
||||
// firmware never emits them, so an on-wire pathByte with the upper
|
||||
// 2 bits set to 11 is by definition malformed/adversarial.
|
||||
if !isValidPathLen(pathByte) {
|
||||
return Path{}, 0, fmt.Errorf("invalid path encoding: pathByte 0x%02X (hash_size=%d hash_count=%d) violates firmware validity (Packet.cpp:13-18, MAX_PATH_SIZE=%d)", pathByte, hashSize, hashCount, maxPathSize)
|
||||
}
|
||||
totalBytes := hashSize * hashCount
|
||||
hops := make([]string, 0, hashCount)
|
||||
|
||||
@@ -255,7 +190,7 @@ func decodePath(pathByte byte, buf []byte, offset int) (Path, int, error) {
|
||||
HashSize: hashSize,
|
||||
HashCount: hashCount,
|
||||
Hops: hops,
|
||||
}, totalBytes, nil
|
||||
}, totalBytes
|
||||
}
|
||||
|
||||
// isTransportRoute delegates to packetpath.IsTransportRoute.
|
||||
@@ -281,27 +216,10 @@ func decodeAck(buf []byte) Payload {
|
||||
return Payload{Type: "ACK", Error: "too short", RawHex: hex.EncodeToString(buf)}
|
||||
}
|
||||
checksum := binary.LittleEndian.Uint32(buf[0:4])
|
||||
ackLen := len(buf)
|
||||
if ackLen > 6 {
|
||||
ackLen = 6
|
||||
}
|
||||
p := Payload{
|
||||
return Payload{
|
||||
Type: "ACK",
|
||||
ExtraHash: fmt.Sprintf("%08x", checksum),
|
||||
AckLen: &ackLen,
|
||||
}
|
||||
// Firmware 1.16.0 extended ACK (issue #1610): 5th byte is the attempt
|
||||
// counter (commit f6e6fdaa), 6th byte is a random byte added so identical
|
||||
// attempts still hash uniquely (commit a130a95a).
|
||||
if len(buf) >= 5 {
|
||||
attempt := int(buf[4])
|
||||
p.AckAttempt = &attempt
|
||||
}
|
||||
if len(buf) >= 6 {
|
||||
rnd := int(buf[5])
|
||||
p.AckRand = &rnd
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
func decodeAdvert(buf []byte, validateSignatures bool) Payload {
|
||||
@@ -381,13 +299,6 @@ func decodeAdvert(buf []byte, validateSignatures bool) Payload {
|
||||
}
|
||||
name := string(appdata[off:nameEnd])
|
||||
name = sanitizeName(name)
|
||||
// Firmware writes the node name into a 32-byte buffer
|
||||
// (MAX_ADVERT_DATA_SIZE, firmware/src/MeshCore.h:11). Truncate
|
||||
// here so adversarial on-wire adverts can't pollute Payload.Name
|
||||
// with bytes firmware would never emit.
|
||||
if len(name) > 32 {
|
||||
name = name[:32]
|
||||
}
|
||||
p.Name = name
|
||||
off = nameEnd
|
||||
// Skip null terminator(s)
|
||||
@@ -398,17 +309,6 @@ func decodeAdvert(buf []byte, validateSignatures bool) Payload {
|
||||
|
||||
// Telemetry bytes after name: battery_mv(2 LE) + temperature_c(2 LE, signed, /100)
|
||||
// Only sensor nodes (advType=4) carry telemetry bytes.
|
||||
//
|
||||
// Firmware derivation (see firmware/src/helpers/SensorMesh.h and the
|
||||
// SensorHost::handleAdvert path in firmware/src/helpers/SensorMesh.cpp:
|
||||
// the sensor builds appdata as <flags+adv_type><pubkey?><name\0>
|
||||
// followed by two little-endian uint16 fields appended verbatim:
|
||||
// appdata[name_end+0..1] = battery voltage in millivolts (uint16 LE,
|
||||
// valid 0 < mv ≤ 10000)
|
||||
// appdata[name_end+2..3] = temperature × 100 (int16 LE, divide by 100
|
||||
// for °C; valid raw -5000..10000 → -50..100 °C)
|
||||
// We accept only adverts whose flags.Sensor bit is set (firmware
|
||||
// AdvertDataHelpers.h:7-12, ADV_TYPE_SENSOR=4) before parsing telemetry.
|
||||
if p.Flags.Sensor && off+4 <= len(appdata) {
|
||||
batteryMv := int(binary.LittleEndian.Uint16(appdata[off : off+2]))
|
||||
tempRaw := int16(binary.LittleEndian.Uint16(appdata[off+2 : off+4]))
|
||||
@@ -525,22 +425,6 @@ func decryptChannelMessage(ciphertextHex, macHex, channelKeyHex string) (*channe
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// knownChannelCasing maps the lower-cased form of each well-known channel
// name to its canonical display name. Custom/user channels are intentionally
// absent so their casing is never altered.
var knownChannelCasing = map[string]string{
	"public": "Public",
}

// normalizeChannelName returns the canonical casing for a well-known channel
// name (e.g. "public" → "Public"). The lookup is done on the lower-cased
// input; any name without an entry in knownChannelCasing is returned
// unchanged, since the intended casing of a custom channel cannot be known.
func normalizeChannelName(name string) string {
	canonical, wellKnown := knownChannelCasing[strings.ToLower(name)]
	if !wellKnown {
		return name
	}
	return canonical
}
|
||||
|
||||
func decodeGrpTxt(buf []byte, channelKeys map[string]string) Payload {
|
||||
if len(buf) < 3 {
|
||||
return Payload{Type: "GRP_TXT", Error: "too short", RawHex: hex.EncodeToString(buf)}
|
||||
@@ -565,7 +449,7 @@ func decodeGrpTxt(buf []byte, channelKeys map[string]string) Payload {
|
||||
}
|
||||
return Payload{
|
||||
Type: "CHAN",
|
||||
Channel: normalizeChannelName(name),
|
||||
Channel: name,
|
||||
ChannelHash: channelHash,
|
||||
ChannelHashHex: channelHashHex,
|
||||
DecryptionStatus: "decrypted",
|
||||
@@ -594,200 +478,6 @@ func decodeGrpTxt(buf []byte, channelKeys map[string]string) Payload {
|
||||
}
|
||||
}
|
||||
|
||||
// decodeGrpData decodes PAYLOAD_TYPE_GRP_DATA (0x06). Outer envelope is the
|
||||
// same shape as GRP_TXT (channel_hash(1)+MAC(2)+ciphertext) — see
|
||||
// firmware/src/helpers/BaseChatMesh.cpp:476,500. When the channel key matches,
|
||||
// the decrypted inner is parsed per firmware/src/helpers/BaseChatMesh.cpp:382-385
|
||||
// as data_type(uint16 LE) + data_len(1) + blob(data_len).
|
||||
func decodeGrpData(buf []byte, channelKeys map[string]string) Payload {
|
||||
if len(buf) < 3 {
|
||||
return Payload{Type: "GRP_DATA", Error: "too short", RawHex: hex.EncodeToString(buf)}
|
||||
}
|
||||
channelHash := int(buf[0])
|
||||
channelHashHex := fmt.Sprintf("%02X", buf[0])
|
||||
mac := hex.EncodeToString(buf[1:3])
|
||||
encryptedData := hex.EncodeToString(buf[3:])
|
||||
|
||||
hasKeys := len(channelKeys) > 0
|
||||
if hasKeys && len(encryptedData) >= 10 {
|
||||
for name, key := range channelKeys {
|
||||
plain, err := decryptChannelBlock(encryptedData, mac, key)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// Inner: data_type(uint16 LE) + data_len(1) + blob (firmware:382-385).
|
||||
if len(plain) < 3 {
|
||||
return Payload{
|
||||
Type: "GRP_DATA",
|
||||
Channel: name,
|
||||
ChannelHash: channelHash,
|
||||
ChannelHashHex: channelHashHex,
|
||||
DecryptionStatus: "decrypted",
|
||||
Error: "inner too short",
|
||||
}
|
||||
}
|
||||
dataType := int(binary.LittleEndian.Uint16(plain[0:2]))
|
||||
dataLen := int(plain[2])
|
||||
if 3+dataLen > len(plain) {
|
||||
return Payload{
|
||||
Type: "GRP_DATA",
|
||||
Channel: name,
|
||||
ChannelHash: channelHash,
|
||||
ChannelHashHex: channelHashHex,
|
||||
DecryptionStatus: "decrypted",
|
||||
DataType: &dataType,
|
||||
DataLen: &dataLen,
|
||||
Error: "inner data_len exceeds buffer",
|
||||
}
|
||||
}
|
||||
blob := hex.EncodeToString(plain[3 : 3+dataLen])
|
||||
return Payload{
|
||||
Type: "GRP_DATA",
|
||||
Channel: name,
|
||||
ChannelHash: channelHash,
|
||||
ChannelHashHex: channelHashHex,
|
||||
DecryptionStatus: "decrypted",
|
||||
DataType: &dataType,
|
||||
DataLen: &dataLen,
|
||||
DecryptedBlob: blob,
|
||||
}
|
||||
}
|
||||
return Payload{
|
||||
Type: "GRP_DATA",
|
||||
ChannelHash: channelHash,
|
||||
ChannelHashHex: channelHashHex,
|
||||
DecryptionStatus: "decryption_failed",
|
||||
MAC: mac,
|
||||
EncryptedData: encryptedData,
|
||||
}
|
||||
}
|
||||
|
||||
return Payload{
|
||||
Type: "GRP_DATA",
|
||||
ChannelHash: channelHash,
|
||||
ChannelHashHex: channelHashHex,
|
||||
DecryptionStatus: "no_key",
|
||||
MAC: mac,
|
||||
EncryptedData: encryptedData,
|
||||
}
|
||||
}
|
||||
|
||||
// decodeMultipart decodes PAYLOAD_TYPE_MULTIPART (0x0A) per
|
||||
// firmware/src/Mesh.cpp:287-310. byte0 = (remaining<<4) | inner_type;
|
||||
// when inner_type == PAYLOAD_TYPE_ACK the next 4 bytes are an ack_crc.
|
||||
func decodeMultipart(buf []byte) Payload {
|
||||
if len(buf) < 1 {
|
||||
return Payload{Type: "MULTIPART", Error: "too short", RawHex: hex.EncodeToString(buf)}
|
||||
}
|
||||
remaining := int(buf[0] >> 4)
|
||||
innerType := int(buf[0] & 0x0F)
|
||||
innerName := payloadTypeNames[innerType]
|
||||
if innerName == "" {
|
||||
innerName = "UNKNOWN"
|
||||
}
|
||||
p := Payload{
|
||||
Type: "MULTIPART",
|
||||
Remaining: &remaining,
|
||||
InnerType: &innerType,
|
||||
InnerTypeName: innerName,
|
||||
}
|
||||
if innerType == PayloadACK && len(buf) >= 5 {
|
||||
// ack_crc is little-endian; surface as canonical big-endian hex
|
||||
// to match decodeAck's extraHash convention.
|
||||
crc := binary.LittleEndian.Uint32(buf[1:5])
|
||||
p.InnerAckCrc = fmt.Sprintf("%08x", crc)
|
||||
// Firmware 1.16.0 extended ACK (issue #1610): inner ACK blob may be
|
||||
// 5 or 6 bytes (payload_len = 1 + ack_len) instead of always 4.
|
||||
ackLen := len(buf) - 1
|
||||
if ackLen > 6 {
|
||||
ackLen = 6
|
||||
}
|
||||
p.InnerAckLen = &ackLen
|
||||
if len(buf) >= 6 {
|
||||
attempt := int(buf[5])
|
||||
p.InnerAckAttempt = &attempt
|
||||
}
|
||||
if len(buf) >= 7 {
|
||||
rnd := int(buf[6])
|
||||
p.InnerAckRand = &rnd
|
||||
}
|
||||
} else if len(buf) > 1 {
|
||||
p.InnerPayload = hex.EncodeToString(buf[1:])
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
// decodeControl decodes PAYLOAD_TYPE_CONTROL (0x0B) byte0 flags per
|
||||
// firmware/src/Mesh.cpp:69 (high-bit set ⇒ zero-hop direct subset).
|
||||
func decodeControl(buf []byte) Payload {
|
||||
if len(buf) < 1 {
|
||||
return Payload{Type: "CONTROL", Error: "too short", RawHex: hex.EncodeToString(buf)}
|
||||
}
|
||||
zeroHop := buf[0]&0x80 != 0
|
||||
length := len(buf)
|
||||
return Payload{
|
||||
Type: "CONTROL",
|
||||
CtrlFlags: fmt.Sprintf("%02x", buf[0]),
|
||||
CtrlZeroHop: &zeroHop,
|
||||
CtrlLength: &length,
|
||||
RawHex: hex.EncodeToString(buf),
|
||||
}
|
||||
}
|
||||
|
||||
// decodeRawCustom decodes PAYLOAD_TYPE_RAW_CUSTOM (0x0F). The payload is
// application-defined per firmware/src/Mesh.cpp:577 (createRawData), so only
// the envelope shape is surfaced: the total length, the full payload as hex,
// and — when the payload is non-empty — its first byte as two upper-case hex
// digits in FirstByteTag.
func decodeRawCustom(buf []byte) Payload {
	size := len(buf)

	var tag string
	if size != 0 {
		tag = fmt.Sprintf("%02X", buf[0])
	}

	return Payload{
		Type:         "RAW_CUSTOM",
		RawLength:    &size,
		RawHex:       hex.EncodeToString(buf),
		FirstByteTag: tag,
	}
}
|
||||
|
||||
// decryptChannelBlock performs the MAC verify + AES-128-ECB decrypt step shared
// by GRP_TXT and GRP_DATA, returning the raw plaintext block (no further
// parsing). See firmware/src/helpers/BaseChatMesh.cpp:376-391.
//
// All three arguments are hex strings: the ciphertext, the 2-byte MAC, and
// the 16-byte channel key. The MAC is the first two bytes of HMAC-SHA256 over
// the ciphertext, keyed with the channel key zero-padded to 32 bytes.
//
// An error is returned when an argument is not valid hex or has the wrong
// length, when the MAC does not match, or when the ciphertext length is not a
// multiple of the AES block size. On success the returned slice has the same
// length as the ciphertext.
func decryptChannelBlock(ciphertextHex, macHex, channelKeyHex string) ([]byte, error) {
	channelKey, err := hex.DecodeString(channelKeyHex)
	if err != nil || len(channelKey) != 16 {
		return nil, fmt.Errorf("invalid channel key")
	}
	macBytes, err := hex.DecodeString(macHex)
	if err != nil || len(macBytes) != 2 {
		return nil, fmt.Errorf("invalid MAC")
	}
	ciphertext, err := hex.DecodeString(ciphertextHex)
	if err != nil || len(ciphertext) == 0 {
		return nil, fmt.Errorf("invalid ciphertext")
	}

	// The HMAC key is the 16-byte channel key followed by 16 zero bytes.
	channelSecret := make([]byte, 32)
	copy(channelSecret, channelKey)
	h := hmac.New(sha256.New, channelSecret)
	h.Write(ciphertext)
	calc := h.Sum(nil)

	// Compare the truncated MAC in constant time, as crypto/hmac recommends,
	// rather than with short-circuiting byte comparisons.
	if !hmac.Equal(calc[:2], macBytes) {
		return nil, fmt.Errorf("MAC verification failed")
	}
	if len(ciphertext)%aes.BlockSize != 0 {
		return nil, fmt.Errorf("ciphertext not aligned to AES block size")
	}

	block, err := aes.NewCipher(channelKey)
	if err != nil {
		return nil, err
	}
	// ECB: every 16-byte block is decrypted independently.
	plain := make([]byte, len(ciphertext))
	for i := 0; i < len(ciphertext); i += aes.BlockSize {
		block.Decrypt(plain[i:i+aes.BlockSize], ciphertext[i:i+aes.BlockSize])
	}
	return plain, nil
}
|
||||
|
||||
func decodeAnonReq(buf []byte) Payload {
|
||||
if len(buf) < 35 {
|
||||
return Payload{Type: "ANON_REQ", Error: "too short", RawHex: hex.EncodeToString(buf)}
|
||||
@@ -847,20 +537,12 @@ func decodePayload(payloadType int, buf []byte, channelKeys map[string]string, v
|
||||
return decodeAdvert(buf, validateSignatures)
|
||||
case PayloadGRP_TXT:
|
||||
return decodeGrpTxt(buf, channelKeys)
|
||||
case PayloadGRP_DATA:
|
||||
return decodeGrpData(buf, channelKeys)
|
||||
case PayloadANON_REQ:
|
||||
return decodeAnonReq(buf)
|
||||
case PayloadPATH:
|
||||
return decodePathPayload(buf)
|
||||
case PayloadTRACE:
|
||||
return decodeTrace(buf)
|
||||
case PayloadMULTIPART:
|
||||
return decodeMultipart(buf)
|
||||
case PayloadCONTROL:
|
||||
return decodeControl(buf)
|
||||
case PayloadRAW_CUSTOM:
|
||||
return decodeRawCustom(buf)
|
||||
default:
|
||||
return Payload{Type: "UNKNOWN", RawHex: hex.EncodeToString(buf)}
|
||||
}
|
||||
@@ -901,26 +583,10 @@ func DecodePacket(hexString string, channelKeys map[string]string, validateSigna
|
||||
pathByte := buf[offset]
|
||||
offset++
|
||||
|
||||
path, bytesConsumed, decodeErr := decodePath(pathByte, buf, offset)
|
||||
if decodeErr != nil {
|
||||
return nil, decodeErr
|
||||
}
|
||||
path, bytesConsumed := decodePath(pathByte, buf, offset)
|
||||
offset += bytesConsumed
|
||||
|
||||
// Bounds check: pathByte is wire-supplied (hash_size in upper 2 bits,
|
||||
// hash_count in lower 6 bits → up to 4*63=252 claimed path bytes). A
|
||||
// malformed packet can claim more bytes than the buffer holds — without
|
||||
// this guard `buf[offset:]` panics with `slice bounds out of range
|
||||
// [offset:len(buf)]`. See issue #1211 (prod observed [218:15]).
|
||||
if offset > len(buf) {
|
||||
return nil, fmt.Errorf("packet path length (%d bytes claimed by pathByte 0x%02X) exceeds buffer (%d bytes)", bytesConsumed, pathByte, len(buf))
|
||||
}
|
||||
|
||||
payloadBuf := buf[offset:]
|
||||
// Firmware caps payload at MAX_PACKET_PAYLOAD=184 (firmware/src/MeshCore.h:19).
|
||||
if len(payloadBuf) > maxPacketPayload {
|
||||
return nil, fmt.Errorf("packet payload (%d bytes) exceeds firmware MAX_PACKET_PAYLOAD=%d (MeshCore.h:19)", len(payloadBuf), maxPacketPayload)
|
||||
}
|
||||
payload := decodePayload(header.PayloadType, payloadBuf, channelKeys, validateSignatures)
|
||||
|
||||
// TRACE packets store hop IDs in the payload (buf[9:]) rather than the header
|
||||
@@ -933,9 +599,6 @@ func DecodePacket(hexString string, channelKeys map[string]string, validateSigna
|
||||
// We expose hopsCompleted (count of SNR bytes) so consumers can distinguish
|
||||
// how far the trace got vs the full intended route.
|
||||
var anomaly string
|
||||
if header.PayloadType == PayloadTRACE && payload.Error != "" {
|
||||
anomaly = fmt.Sprintf("TRACE payload decode failed: %s", payload.Error)
|
||||
}
|
||||
if header.PayloadType == PayloadTRACE && payload.PathData != "" {
|
||||
// Flag anomalous routing — firmware only sends TRACE as DIRECT
|
||||
if header.RouteType != RouteDirect && header.RouteType != RouteTransportDirect {
|
||||
@@ -943,21 +606,6 @@ func DecodePacket(hexString string, channelKeys map[string]string, validateSigna
|
||||
}
|
||||
// The header path hops count represents SNR entries = completed hops
|
||||
hopsCompleted := path.HashCount
|
||||
// Extract per-hop SNR from header path bytes (int8, quarter-dB encoding).
|
||||
// Mirrors cmd/server/decoder.go — must be done at ingest time so SNR
|
||||
// values are persisted in decoded_json (server endpoint serves DB as-is).
|
||||
if hopsCompleted > 0 && len(path.Hops) >= hopsCompleted {
|
||||
snrVals := make([]float64, 0, hopsCompleted)
|
||||
for i := 0; i < hopsCompleted; i++ {
|
||||
b, err := hex.DecodeString(path.Hops[i])
|
||||
if err == nil && len(b) == 1 {
|
||||
snrVals = append(snrVals, float64(int8(b[0]))/4.0)
|
||||
}
|
||||
}
|
||||
if len(snrVals) > 0 {
|
||||
payload.SNRValues = snrVals
|
||||
}
|
||||
}
|
||||
pathBytes, err := hex.DecodeString(payload.PathData)
|
||||
if err == nil && payload.TraceFlags != nil {
|
||||
// path_sz from flags byte is a power-of-two exponent per firmware:
|
||||
@@ -991,7 +639,6 @@ func DecodePacket(hexString string, channelKeys map[string]string, validateSigna
|
||||
Payload: payload,
|
||||
Raw: strings.ToUpper(hexString),
|
||||
Anomaly: anomaly,
|
||||
payloadRaw: payloadBuf,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -1109,13 +756,8 @@ func ValidateAdvert(p *Payload) (bool, string) {
|
||||
|
||||
if p.Flags != nil {
|
||||
role := advertRole(p.Flags)
|
||||
// Accept canonical labels plus "none" (ADV_TYPE_NONE=0) and the
|
||||
// "type-N" placeholders we now return for ADV_TYPE 5-15 (FUTURE)
|
||||
// — see firmware/src/helpers/AdvertDataHelpers.h:7-12.
|
||||
validRoles := map[string]bool{
|
||||
"repeater": true, "companion": true, "room": true, "sensor": true, "none": true,
|
||||
}
|
||||
if !validRoles[role] && !strings.HasPrefix(role, "type-") {
|
||||
validRoles := map[string]bool{"repeater": true, "companion": true, "room": true, "sensor": true}
|
||||
if !validRoles[role] {
|
||||
return false, fmt.Sprintf("unknown role: %s", role)
|
||||
}
|
||||
}
|
||||
@@ -1135,29 +777,17 @@ func sanitizeName(s string) string {
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// advertRole returns a stable role label for an advert. Follows firmware
|
||||
// ADV_TYPE_* constants in firmware/src/helpers/AdvertDataHelpers.h:7-12:
|
||||
// 0 NONE, 1 CHAT, 2 REPEATER, 3 ROOM, 4 SENSOR, 5-15 FUTURE.
|
||||
// Previously this coerced both 0 (NONE) and 5-15 (FUTURE) to "companion",
|
||||
// silently relabelling unknown/reserved types — see issue #1279 P1 #3.
|
||||
func advertRole(f *AdvertFlags) string {
|
||||
if f == nil {
|
||||
return "companion"
|
||||
}
|
||||
switch f.Type {
|
||||
case 0:
|
||||
return "none"
|
||||
case 1:
|
||||
return "companion"
|
||||
case 2:
|
||||
if f.Repeater {
|
||||
return "repeater"
|
||||
case 3:
|
||||
return "room"
|
||||
case 4:
|
||||
return "sensor"
|
||||
default:
|
||||
return fmt.Sprintf("type-%d", f.Type)
|
||||
}
|
||||
if f.Room {
|
||||
return "room"
|
||||
}
|
||||
if f.Sensor {
|
||||
return "sensor"
|
||||
}
|
||||
return "companion"
|
||||
}
|
||||
|
||||
func epochToISO(epoch uint32) string {
|
||||
|
||||
@@ -1,97 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// --- Issue #1211 round-1 protocol-correctness regressions ---
|
||||
// See cmd/server/decoder_bounds_test.go for full firmware citations
|
||||
// (firmware/src/Packet.cpp:13-18, firmware/src/MeshCore.h:19-21).
|
||||
|
||||
// pathByte=0xF6 → hash_size=4 (reserved), hash_count=54.
|
||||
// Buffer holds all 216 claimed bytes so the OOB guard does NOT catch.
|
||||
func TestDecodePacketRejectsReservedHashSize_Issue1211(t *testing.T) {
|
||||
raw := "12F6" + strings.Repeat("AB", 216) + strings.Repeat("CD", 8)
|
||||
pkt, err := DecodePacket(raw, nil, false)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error rejecting reserved hash_size=4 (firmware Packet.cpp:13-18); got nil, pkt=%+v", pkt)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "path") {
|
||||
t.Errorf("error should mention path; got %q", err)
|
||||
}
|
||||
}
|
||||
|
||||
// pathByte=0xBF → hash_size=3, hash_count=63, total=189 > MAX_PATH_SIZE=64.
|
||||
func TestDecodePacketRejectsOversizedPath_Issue1211(t *testing.T) {
|
||||
raw := "12BF" + strings.Repeat("AB", 189) + strings.Repeat("CD", 8)
|
||||
pkt, err := DecodePacket(raw, nil, false)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error rejecting hash_count*hash_size > 64; got nil, pkt=%+v", pkt)
|
||||
}
|
||||
}
|
||||
|
||||
// Payload > MAX_PACKET_PAYLOAD (184).
|
||||
func TestDecodePacketRejectsOversizedPayload_Issue1211(t *testing.T) {
|
||||
raw := "1200" + strings.Repeat("AA", 200)
|
||||
pkt, err := DecodePacket(raw, nil, false)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error rejecting payload > MAX_PACKET_PAYLOAD=184 (firmware MeshCore.h:19); got nil, pkt=%+v", pkt)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "payload") {
|
||||
t.Errorf("error should mention payload; got %q", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodePath_RejectsReservedHashSize_Issue1211(t *testing.T) {
|
||||
buf := make([]byte, 216)
|
||||
for i := range buf {
|
||||
buf[i] = 0xAB
|
||||
}
|
||||
_, _, err := decodePath(0xF6, buf, 0)
|
||||
if err == nil {
|
||||
t.Fatalf("decodePath should reject pathByte=0xF6 (hash_size=4 reserved); got nil err")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodePath_RejectsOversizedPath_Issue1211(t *testing.T) {
|
||||
buf := make([]byte, 189)
|
||||
_, _, err := decodePath(0xBF, buf, 0)
|
||||
if err == nil {
|
||||
t.Fatalf("decodePath should reject hash_count*hash_size=189 > MAX_PATH_SIZE=64; got nil err")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodePath_AcceptsValidEncodings_Issue1211(t *testing.T) {
|
||||
buf := []byte{0x01, 0x02, 0x03, 0x04, 0x05}
|
||||
path, consumed, err := decodePath(0x05, buf, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("decodePath rejected valid encoding: %v", err)
|
||||
}
|
||||
if consumed != 5 {
|
||||
t.Errorf("consumed=%d, want 5", consumed)
|
||||
}
|
||||
if path.HashCount != 5 || path.HashSize != 1 {
|
||||
t.Errorf("decode wrong: hashCount=%d hashSize=%d", path.HashCount, path.HashSize)
|
||||
}
|
||||
}
|
||||
|
||||
// Kent #1 — pin tautological assertion: error MUST mention "path length"
|
||||
// AND "exceeds buffer", not just non-nil. Uses firmware-valid pathByte
|
||||
// that exhausts a small buffer, so the OOB guard fires (not validity).
|
||||
func TestDecodePacketBoundsFromWireErrorPhrasing_Issue1211(t *testing.T) {
|
||||
raw := "120A" + strings.Repeat("AA", 5)
|
||||
_, err := DecodePacket(raw, nil, false)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "path length") {
|
||||
t.Errorf("error missing 'path length'; got %q", err)
|
||||
}
|
||||
if !strings.Contains(err.Error(), "exceeds buffer") {
|
||||
t.Errorf("error missing 'exceeds buffer'; got %q", err)
|
||||
}
|
||||
}
|
||||
|
||||
var _ = hex.EncodeToString
|
||||
+22
-205
@@ -447,28 +447,6 @@ func TestValidateAdvert(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodePacketPayloadRaw(t *testing.T) {
|
||||
// Build a minimal TRANSPORT_FLOOD packet (route_type=0):
|
||||
// header(1) + transport_codes(4) + path_len(1) + payload(N)
|
||||
// Header 0x00 = route_type=TRANSPORT_FLOOD, payload_type=0, version=0
|
||||
// Code1=9A52, Code2=0000, path_len=0x00 (0 hops, hash_size=1)
|
||||
payload := []byte("hello")
|
||||
raw := []byte{0x00, 0x9A, 0x52, 0x00, 0x00, 0x00}
|
||||
raw = append(raw, payload...)
|
||||
hexStr := strings.ToUpper(hex.EncodeToString(raw))
|
||||
|
||||
decoded, err := DecodePacket(hexStr, nil, false)
|
||||
if err != nil {
|
||||
t.Fatalf("DecodePacket: %v", err)
|
||||
}
|
||||
if decoded.TransportCodes == nil {
|
||||
t.Fatal("expected TransportCodes, got nil")
|
||||
}
|
||||
if string(decoded.payloadRaw) != string(payload) {
|
||||
t.Errorf("payloadRaw = %v, want %v", decoded.payloadRaw, payload)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeGrpTxtShort(t *testing.T) {
|
||||
p := decodeGrpTxt([]byte{0x01, 0x02}, nil)
|
||||
if p.Error != "too short" {
|
||||
@@ -653,28 +631,21 @@ func TestDecodeEncryptedPayloadValid(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestDecodePayloadGRPData(t *testing.T) {
|
||||
// GRP_DATA (0x06) decoder added for #1279 P0 #1 — envelope only when no
|
||||
// channel key matches (firmware/src/helpers/BaseChatMesh.cpp:500).
|
||||
buf := []byte{0x01, 0x02, 0x03}
|
||||
p := decodePayload(PayloadGRP_DATA, buf, nil, false)
|
||||
if p.Type != "GRP_DATA" {
|
||||
t.Errorf("type=%s, want GRP_DATA", p.Type)
|
||||
if p.Type != "UNKNOWN" {
|
||||
t.Errorf("type=%s, want UNKNOWN", p.Type)
|
||||
}
|
||||
if p.RawHex != "010203" {
|
||||
t.Errorf("rawHex=%s, want 010203", p.RawHex)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodePayloadRAWCustom(t *testing.T) {
|
||||
// #1279 P2 #5: RAW_CUSTOM (0x0F) now exposes envelope shape (length +
|
||||
// first-byte tag) per firmware/src/Mesh.cpp:577 (createRawData).
|
||||
buf := []byte{0xFF, 0xFE}
|
||||
p := decodePayload(PayloadRAW_CUSTOM, buf, nil, false)
|
||||
if p.Type != "RAW_CUSTOM" {
|
||||
t.Errorf("type=%s, want RAW_CUSTOM", p.Type)
|
||||
}
|
||||
if p.RawLength == nil || *p.RawLength != 2 {
|
||||
t.Errorf("rawLength missing or wrong, want 2")
|
||||
}
|
||||
if p.FirstByteTag != "FF" {
|
||||
t.Errorf("firstByteTag=%q, want FF", p.FirstByteTag)
|
||||
if p.Type != "UNKNOWN" {
|
||||
t.Errorf("type=%s, want UNKNOWN", p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1126,24 +1097,24 @@ func TestDecodeHeaderUnknownTypes(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestDecodePayloadMultipart(t *testing.T) {
|
||||
// MULTIPART (0x0A) now decoded — #1279 P0 #2 (firmware/src/Mesh.cpp:289).
|
||||
// MULTIPART (0x0A) falls through to default → UNKNOWN
|
||||
p := decodePayload(PayloadMULTIPART, []byte{0x01, 0x02}, nil, false)
|
||||
if p.Type != "MULTIPART" {
|
||||
t.Errorf("MULTIPART type=%s, want MULTIPART", p.Type)
|
||||
if p.Type != "UNKNOWN" {
|
||||
t.Errorf("MULTIPART type=%s, want UNKNOWN", p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodePayloadControl(t *testing.T) {
|
||||
// CONTROL (0x0B) now decoded — #1279 P1 #4 (firmware/src/Mesh.cpp:69).
|
||||
// CONTROL (0x0B) falls through to default → UNKNOWN
|
||||
p := decodePayload(PayloadCONTROL, []byte{0x01, 0x02}, nil, false)
|
||||
if p.Type != "CONTROL" {
|
||||
t.Errorf("CONTROL type=%s, want CONTROL", p.Type)
|
||||
if p.Type != "UNKNOWN" {
|
||||
t.Errorf("CONTROL type=%s, want UNKNOWN", p.Type)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodePathTruncatedBuffer(t *testing.T) {
|
||||
// path byte claims 5 hops of 2 bytes = 10 bytes, but only 4 available
|
||||
path, consumed, _ := decodePath(0x45, []byte{0xAA, 0x11, 0xBB, 0x22}, 0)
|
||||
path, consumed := decodePath(0x45, []byte{0xAA, 0x11, 0xBB, 0x22}, 0)
|
||||
if path.HashCount != 5 {
|
||||
t.Errorf("hashCount=%d, want 5", path.HashCount)
|
||||
}
|
||||
@@ -1737,15 +1708,15 @@ func TestZeroHopTransportDirectHashSize(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestZeroHopTransportDirectHashSizeWithNonZeroUpperBits(t *testing.T) {
|
||||
// pathByte=0xC0 → hash_size bits=11 (4, reserved per firmware Packet.cpp:13-18).
|
||||
// Firmware Packet::isValidPathLen rejects this regardless of hash_count,
|
||||
// because hash_size==4 is reserved. Go decoder must mirror that — even
|
||||
// when hash_count==0, an attacker-emitted 0xC0 byte should not be
|
||||
// silently accepted; firmware never emits hash_size==4.
|
||||
// TRANSPORT_DIRECT (RouteType=3) + REQ (PayloadType=0) → header byte = 0x03
|
||||
// 4 bytes transport codes + pathByte=0xC0 → hash_count=0, hash_size bits=11 → should still get HashSize=0
|
||||
hex := "03" + "11223344" + "C0" + repeatHex("AA", 20)
|
||||
_, err := DecodePacket(hex, nil, false)
|
||||
if err == nil {
|
||||
t.Fatalf("DecodePacket(pathByte=0xC0) succeeded; want error mirroring firmware Packet.cpp:13-18 (hash_size==4 reserved)")
|
||||
pkt, err := DecodePacket(hex, nil, false)
|
||||
if err != nil {
|
||||
t.Fatalf("DecodePacket failed: %v", err)
|
||||
}
|
||||
if pkt.Path.HashSize != 0 {
|
||||
t.Errorf("TRANSPORT_DIRECT zero-hop with hash_size bits set: want HashSize=0, got %d", pkt.Path.HashSize)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1955,157 +1926,3 @@ func TestDecodePathFromRawHex_Transport(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeTracePayloadFailSetsAnomaly(t *testing.T) {
|
||||
// Issue #889: TRACE packet with payload too short to decode (< 9 bytes)
|
||||
// should still return a DecodedPacket (observation stored) but with Anomaly
|
||||
// set to warn operators that the decode was degraded.
|
||||
// Packet: header 0x26 (TRACE+DIRECT), pathByte 0x00, payload 4 bytes (too short).
|
||||
pkt, err := DecodePacket("2600aabbccdd", nil, false)
|
||||
if err != nil {
|
||||
t.Fatalf("DecodePacket error: %v", err)
|
||||
}
|
||||
if pkt.Payload.Type != "TRACE" {
|
||||
t.Fatalf("payload type=%s, want TRACE", pkt.Payload.Type)
|
||||
}
|
||||
if pkt.Payload.Error == "" {
|
||||
t.Fatal("expected payload.Error to indicate decode failure")
|
||||
}
|
||||
// The key assertion: Anomaly must be set when TRACE decode fails
|
||||
if pkt.Anomaly == "" {
|
||||
t.Error("expected Anomaly to be set when TRACE payload decode fails but observation is stored")
|
||||
}
|
||||
}
|
||||
|
||||
// TestDecodeTraceExtractsSNRValues verifies that for TRACE packets, the header
|
||||
// path bytes are interpreted as int8 SNR values (quarter-dB) and exposed via
|
||||
// payload.SNRValues. Mirrors logic in cmd/server/decoder.go (issue: SNR values
|
||||
// extracted by server but never written into decoded_json by ingestor).
|
||||
//
|
||||
// Packet 26022FF8116A23A80000000001C0DE1000DEDE:
|
||||
// header 0x26 → TRACE (pt=9), DIRECT (rt=2)
|
||||
// pathByte 0x02 → hash_size=1, hash_count=2
|
||||
// header path: 2F F8 → SNR = [int8(0x2F)/4, int8(0xF8)/4] = [11.75, -2.0]
|
||||
// payload (15B): tag=116A23A8 auth=00000000 flags=0x01 pathData=C0DE1000DEDE
|
||||
func TestDecodeTraceExtractsSNRValues(t *testing.T) {
|
||||
pkt, err := DecodePacket("26022FF8116A23A80000000001C0DE1000DEDE", nil, false)
|
||||
if err != nil {
|
||||
t.Fatalf("DecodePacket error: %v", err)
|
||||
}
|
||||
if pkt.Payload.Type != "TRACE" {
|
||||
t.Fatalf("payload type=%s, want TRACE", pkt.Payload.Type)
|
||||
}
|
||||
if len(pkt.Payload.SNRValues) != 2 {
|
||||
t.Fatalf("len(SNRValues)=%d, want 2 (got %v)", len(pkt.Payload.SNRValues), pkt.Payload.SNRValues)
|
||||
}
|
||||
if pkt.Payload.SNRValues[0] != 11.75 {
|
||||
t.Errorf("SNRValues[0]=%v, want 11.75", pkt.Payload.SNRValues[0])
|
||||
}
|
||||
if pkt.Payload.SNRValues[1] != -2.0 {
|
||||
t.Errorf("SNRValues[1]=%v, want -2.0", pkt.Payload.SNRValues[1])
|
||||
}
|
||||
}
|
||||
|
||||
// TestDecodePacketBoundsFromWire — regression for issue #1211.
|
||||
//
|
||||
// A malformed packet on the wire claimed pathByte=0xF6 (hash_size=4, hash_count=54
|
||||
// → 216 path bytes) inside a 15-byte buffer. decodePath() returned bytesConsumed=216
|
||||
// without bounds-check, causing the outer slice `payloadBuf := buf[offset:]` to
|
||||
// blow up with `slice bounds out of range [218:15]`.
|
||||
//
|
||||
// Expected behaviour: DecodePacket MUST NOT panic on any input. If the path
|
||||
// length claimed by the wire byte exceeds the buffer, it should return a
|
||||
// clean error.
|
||||
func TestDecodePacketBoundsFromWire_Issue1211(t *testing.T) {
|
||||
// 15-byte buffer: header=0x12 (rt=DIRECT, pt=ADVERT), pathByte=0xF6
|
||||
// (hash_size=4, hash_count=54 → claims 216 path bytes), + 13 garbage bytes.
|
||||
raw := "12F6" + strings.Repeat("AA", 13)
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Fatalf("DecodePacket panicked on malformed input: %v", r)
|
||||
}
|
||||
}()
|
||||
pkt, err := DecodePacket(raw, nil, false)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error for malformed packet (path claims 216 bytes in 15-byte buf), got nil; pkt=%+v", pkt)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDecodePacketFuzzTruncated — sweep the decoder with truncated payloads.
|
||||
// Zero panics is the acceptance bar.
|
||||
//
|
||||
// Adv M2: the original loop ran 256*256*20 = 1.3M iterations on every
|
||||
// `go test` (in both packages, so 2.6M total). That is not "fuzzing" — it
|
||||
// is an expensive deterministic sweep that runs in the default unit-test
|
||||
// path with no opt-in. We now:
|
||||
//
|
||||
// - gate the exhaustive sweep on !testing.Short() so `go test -short`
|
||||
// skips it (CI's unit gate runs short)
|
||||
// - keep the full sweep under `go test ./...` to preserve coverage
|
||||
// - prefer `go test -fuzz=FuzzDecodePacketTruncated` for actual
|
||||
// randomized fuzzing (see FuzzDecodePacketTruncated below)
|
||||
func TestDecodePacketFuzzTruncated_Issue1211(t *testing.T) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Fatalf("DecodePacket panicked during fuzz: %v", r)
|
||||
}
|
||||
}()
|
||||
if testing.Short() {
|
||||
t.Skip("skipping exhaustive sweep in -short mode; use FuzzDecodePacketTruncated")
|
||||
}
|
||||
// Sweep every pathByte value with a short tail.
|
||||
for hdr := 0; hdr < 256; hdr++ {
|
||||
for pb := 0; pb < 256; pb++ {
|
||||
for tail := 0; tail < 20; tail++ {
|
||||
raw := hex.EncodeToString([]byte{byte(hdr), byte(pb)}) + strings.Repeat("00", tail)
|
||||
_, _ = DecodePacket(raw, nil, false)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FuzzDecodePacketTruncated — native go fuzz target. Run with:
|
||||
//
|
||||
// go test -fuzz=FuzzDecodePacketTruncated -fuzztime=30s ./cmd/ingestor
|
||||
//
|
||||
// Zero panics regardless of input is the acceptance bar.
|
||||
func FuzzDecodePacketTruncated(f *testing.F) {
|
||||
seeds := [][]byte{
|
||||
{0x12, 0xF6, 0xAA, 0xAA, 0xAA},
|
||||
{0x12, 0x00},
|
||||
{0x03, 0x11, 0x22, 0x33, 0x44, 0xC0, 0xAA, 0xAA, 0xAA},
|
||||
}
|
||||
for _, s := range seeds {
|
||||
f.Add(s)
|
||||
}
|
||||
f.Fuzz(func(t *testing.T, data []byte) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Fatalf("DecodePacket panicked on input %x: %v", data, r)
|
||||
}
|
||||
}()
|
||||
_, _ = DecodePacket(hex.EncodeToString(data), nil, false)
|
||||
})
|
||||
}
|
||||
|
||||
// TestDecodeAdvertOversizedNameTruncated asserts decodeAdvert truncates the
|
||||
// advert name to firmware's MAX_ADVERT_DATA_SIZE=32 (firmware/src/MeshCore.h:11).
|
||||
// Firmware writes the node name into a 32-byte buffer, so any on-wire advert
|
||||
// carrying >32 bytes of name data is adversarial — the Go decoder must not
|
||||
// surface attacker-controlled bytes beyond what firmware would ever emit.
|
||||
func TestDecodeAdvertOversizedNameTruncated(t *testing.T) {
|
||||
pubkey := repeatHex("AA", 32)
|
||||
timestamp := "78563412"
|
||||
signature := repeatHex("BB", 64)
|
||||
flags := "81" // chat(1) | hasName(0x80), no location, no feat1/2
|
||||
// 64-byte ASCII 'X' name with no null terminator (firmware buffer is 32 bytes).
|
||||
name := repeatHex("58", 64)
|
||||
hex := "1200" + pubkey + timestamp + signature + flags + name
|
||||
pkt, err := DecodePacket(hex, nil, false)
|
||||
if err != nil {
|
||||
t.Fatalf("DecodePacket: %v", err)
|
||||
}
|
||||
if got := len(pkt.Payload.Name); got > 32 {
|
||||
t.Errorf("name length=%d, want <=32 (MAX_ADVERT_DATA_SIZE firmware/src/MeshCore.h:11)", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,112 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestHandleMessageAdvertForeign_FlagModeStoresWithFlag asserts that when an
|
||||
// ADVERT comes from a node whose GPS is OUTSIDE the configured geofilter,
|
||||
// the ingestor (in default "flag" mode) stores the node and marks it foreign,
|
||||
// instead of silently dropping it (#730).
|
||||
func TestHandleMessageAdvertForeign_FlagModeStoresWithFlag(t *testing.T) {
|
||||
store, source := newTestContext(t)
|
||||
|
||||
// Real ADVERT raw hex from existing TestHandleMessageAdvertGeoFiltered.
|
||||
// Decoder will produce a node with a known GPS — the test below just
|
||||
// asserts that with a tight geofilter that EXCLUDES that GPS, the node
|
||||
// is still stored AND tagged as foreign.
|
||||
rawHex := "120046D62DE27D4C5194D7821FC5A34A45565DCC2537B300B9AB6275255CEFB65D840CE5C169C94C9AED39E8BCB6CB6EB0335497A198B33A1A610CD3B03D8DCFC160900E5244280323EE0B44CACAB8F02B5B38B91CFA18BD067B0B5E63E94CFC85F758A8530B9240933402E0E6B8F84D5252322D52"
|
||||
|
||||
latMin, latMax := -1.0, 1.0
|
||||
lonMin, lonMax := -1.0, 1.0
|
||||
gf := &GeoFilterConfig{
|
||||
LatMin: &latMin, LatMax: &latMax,
|
||||
LonMin: &lonMin, LonMax: &lonMax,
|
||||
}
|
||||
|
||||
msg := &mockMessage{
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
// Default mode (no ForeignAdverts.Mode set) MUST be "flag", per #730 design.
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{GeoFilter: gf})
|
||||
|
||||
var nodeCount int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM nodes").Scan(&nodeCount); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if nodeCount != 1 {
|
||||
t.Fatalf("nodes=%d, want 1 (foreign advert should be stored, not dropped, in flag mode)", nodeCount)
|
||||
}
|
||||
|
||||
var foreign int
|
||||
if err := store.db.QueryRow("SELECT foreign_advert FROM nodes").Scan(&foreign); err != nil {
|
||||
t.Fatalf("foreign_advert column missing or unreadable: %v", err)
|
||||
}
|
||||
if foreign != 1 {
|
||||
t.Errorf("foreign_advert=%d, want 1", foreign)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleMessageAdvertForeign_DropModeStillDrops asserts the legacy
|
||||
// drop-on-foreign behavior is preserved when ForeignAdverts.Mode = "drop".
|
||||
func TestHandleMessageAdvertForeign_DropModeStillDrops(t *testing.T) {
|
||||
store, source := newTestContext(t)
|
||||
|
||||
rawHex := "120046D62DE27D4C5194D7821FC5A34A45565DCC2537B300B9AB6275255CEFB65D840CE5C169C94C9AED39E8BCB6CB6EB0335497A198B33A1A610CD3B03D8DCFC160900E5244280323EE0B44CACAB8F02B5B38B91CFA18BD067B0B5E63E94CFC85F758A8530B9240933402E0E6B8F84D5252322D52"
|
||||
|
||||
latMin, latMax := -1.0, 1.0
|
||||
lonMin, lonMax := -1.0, 1.0
|
||||
gf := &GeoFilterConfig{
|
||||
LatMin: &latMin, LatMax: &latMax,
|
||||
LonMin: &lonMin, LonMax: &lonMax,
|
||||
}
|
||||
|
||||
msg := &mockMessage{
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
cfg := &Config{
|
||||
GeoFilter: gf,
|
||||
ForeignAdverts: &ForeignAdvertConfig{Mode: "drop"},
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, cfg)
|
||||
|
||||
var nodeCount int
|
||||
if err := store.db.QueryRow("SELECT COUNT(*) FROM nodes").Scan(&nodeCount); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if nodeCount != 0 {
|
||||
t.Errorf("nodes=%d, want 0 (drop mode preserves legacy silent-drop behavior)", nodeCount)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleMessageAdvertInRegion_NotFlaggedForeign asserts in-region
|
||||
// adverts are NOT marked foreign.
|
||||
func TestHandleMessageAdvertInRegion_NotFlaggedForeign(t *testing.T) {
|
||||
store, source := newTestContext(t)
|
||||
|
||||
rawHex := "120046D62DE27D4C5194D7821FC5A34A45565DCC2537B300B9AB6275255CEFB65D840CE5C169C94C9AED39E8BCB6CB6EB0335497A198B33A1A610CD3B03D8DCFC160900E5244280323EE0B44CACAB8F02B5B38B91CFA18BD067B0B5E63E94CFC85F758A8530B9240933402E0E6B8F84D5252322D52"
|
||||
|
||||
// Wide-open geofilter: every coord passes.
|
||||
latMin, latMax := -90.0, 90.0
|
||||
lonMin, lonMax := -180.0, 180.0
|
||||
gf := &GeoFilterConfig{
|
||||
LatMin: &latMin, LatMax: &latMax,
|
||||
LonMin: &lonMin, LonMax: &lonMax,
|
||||
}
|
||||
msg := &mockMessage{
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{GeoFilter: gf})
|
||||
|
||||
var foreign int
|
||||
err := store.db.QueryRow("SELECT foreign_advert FROM nodes").Scan(&foreign)
|
||||
if err != nil {
|
||||
t.Fatalf("query foreign_advert: %v", err)
|
||||
}
|
||||
if foreign != 0 {
|
||||
t.Errorf("foreign_advert=%d, want 0 (in-region node)", foreign)
|
||||
}
|
||||
}
|
||||
@@ -1,94 +0,0 @@
|
||||
package main
|
||||
|
||||
// Tests for #1143: ingestor must populate transmissions.from_pubkey at
|
||||
// write time (cheap — already parsing decoded_json) so attribution queries
|
||||
// don't rely on JSON substring matches.
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestInsertTransmission_FromPubkeyPopulatedForAdvert(t *testing.T) {
|
||||
s, err := OpenStore(tempDBPath(t))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s.Close()
|
||||
|
||||
const pk = "f7181c468dfe7c55aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
data := &PacketData{
|
||||
RawHex: "AABBCC",
|
||||
Timestamp: "2026-03-25T00:00:00Z",
|
||||
ObserverID: "obs1",
|
||||
Hash: "advert_hash_1143",
|
||||
RouteType: 1,
|
||||
PayloadType: 4, // ADVERT
|
||||
PayloadVersion: 0,
|
||||
PathJSON: "[]",
|
||||
DecodedJSON: `{"type":"ADVERT","pubKey":"` + pk + `","name":"X"}`,
|
||||
FromPubkey: pk,
|
||||
}
|
||||
if _, err := s.InsertTransmission(data); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var got sql.NullString
|
||||
s.db.QueryRow("SELECT from_pubkey FROM transmissions WHERE hash = ?", data.Hash).Scan(&got)
|
||||
if !got.Valid || got.String != pk {
|
||||
t.Fatalf("from_pubkey = %v (valid=%v), want %q", got.String, got.Valid, pk)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInsertTransmission_FromPubkeyNullForNonAdvert(t *testing.T) {
|
||||
s, err := OpenStore(tempDBPath(t))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s.Close()
|
||||
|
||||
data := &PacketData{
|
||||
RawHex: "AA",
|
||||
Timestamp: "2026-03-25T00:00:00Z",
|
||||
ObserverID: "obs1",
|
||||
Hash: "txt_hash_1143",
|
||||
RouteType: 1,
|
||||
PayloadType: 2, // TXT_MSG
|
||||
PayloadVersion: 0,
|
||||
PathJSON: "[]",
|
||||
DecodedJSON: `{"type":"TXT_MSG"}`,
|
||||
// FromPubkey deliberately empty — non-ADVERTs don't carry one.
|
||||
}
|
||||
if _, err := s.InsertTransmission(data); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var got sql.NullString
|
||||
s.db.QueryRow("SELECT from_pubkey FROM transmissions WHERE hash = ?", data.Hash).Scan(&got)
|
||||
if got.Valid {
|
||||
t.Fatalf("from_pubkey for non-ADVERT must be NULL, got %q", got.String)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildPacketData_PopulatesFromPubkey(t *testing.T) {
|
||||
const pk = "deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef"
|
||||
msg := &MQTTPacketMessage{Raw: "AA", Origin: "obs"}
|
||||
decoded := &DecodedPacket{
|
||||
Header: Header{PayloadType: PayloadADVERT},
|
||||
Payload: Payload{Type: "ADVERT", PubKey: pk},
|
||||
}
|
||||
pd := BuildPacketData(msg, decoded, "obs", "", nil)
|
||||
if pd.FromPubkey != pk {
|
||||
t.Fatalf("BuildPacketData FromPubkey = %q, want %q", pd.FromPubkey, pk)
|
||||
}
|
||||
|
||||
// Non-ADVERT: must not carry a pubkey.
|
||||
decoded2 := &DecodedPacket{
|
||||
Header: Header{PayloadType: 2},
|
||||
Payload: Payload{Type: "TXT_MSG"},
|
||||
}
|
||||
pd2 := BuildPacketData(msg, decoded2, "obs", "", nil)
|
||||
if pd2.FromPubkey != "" {
|
||||
t.Fatalf("BuildPacketData FromPubkey for non-ADVERT = %q, want empty", pd2.FromPubkey)
|
||||
}
|
||||
}
|
||||
@@ -17,18 +17,6 @@ require github.com/meshcore-analyzer/packetpath v0.0.0
|
||||
|
||||
replace github.com/meshcore-analyzer/packetpath => ../../internal/packetpath
|
||||
|
||||
require github.com/meshcore-analyzer/dbconfig v0.0.0
|
||||
|
||||
replace github.com/meshcore-analyzer/dbconfig => ../../internal/dbconfig
|
||||
|
||||
require github.com/meshcore-analyzer/perfio v0.0.0
|
||||
|
||||
replace github.com/meshcore-analyzer/perfio => ../../internal/perfio
|
||||
|
||||
require github.com/meshcore-analyzer/dbschema v0.0.0
|
||||
|
||||
replace github.com/meshcore-analyzer/dbschema => ../../internal/dbschema
|
||||
|
||||
require (
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
@@ -43,11 +31,3 @@ require (
|
||||
modernc.org/mathutil v1.6.0 // indirect
|
||||
modernc.org/memory v1.8.0 // indirect
|
||||
)
|
||||
|
||||
require github.com/meshcore-analyzer/prunequeue v0.0.0
|
||||
|
||||
replace github.com/meshcore-analyzer/prunequeue => ../../internal/prunequeue
|
||||
|
||||
require github.com/meshcore-analyzer/mbcapqueue v0.0.0
|
||||
|
||||
replace github.com/meshcore-analyzer/mbcapqueue => ../../internal/mbcapqueue
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
// IngestBuffer sits between MQTT message receipt and the DB write path
// (#1608).
//
// The ingestor has to subscribe to MQTT as soon as it boots, yet the single
// SQLite writer (#1283) may be occupied for minutes by a startup migration
// (for instance a large CREATE INDEX) or by a prune. Every QoS-0 packet that
// arrives in that window would otherwise be lost. Received work is therefore
// parked in a bounded FIFO, and one consumer goroutine drains it after
// Ready() has been called, i.e. once the write path is free.
//
// Using exactly one consumer keeps the single-writer invariant: jobs execute
// one at a time, just as paho's in-order handler ran them previously. Submit
// never blocks the MQTT delivery goroutine; when the FIFO is full the job is
// dropped and counted, which bounds memory. Because the original messages
// are replayed, buffering introduces NO duplicates (unlike a QoS-1
// broker-side queue).
type IngestBuffer struct {
	// jobs is the bounded FIFO: Submit sends into it, the consumer receives.
	jobs chan func()
	// ready is closed by Ready(), stop by Stop(), and done by the consumer
	// goroutine when it returns.
	ready, stop, done chan struct{}
	// dropped counts jobs rejected by Submit because jobs was full.
	dropped atomic.Int64
	// One sync.Once per lifecycle call makes Start/Ready/Stop idempotent.
	startOnce, readyOnce, stopOnce sync.Once

	// dropLogMu protects the time-based drop-log throttle below (PR #1623
	// round-1 fix to #1609 M1). Logging every drop during a sustained stall
	// could flood the log at the MQTT inbound rate, so the FIRST drop of a
	// stall is always logged and later drops are summarized at most once
	// per second until the stall ends.
	dropLogMu      sync.Mutex
	stallActive    bool      // set on the first drop, cleared by the next successful Submit
	stallStart     time.Time // start of the current stall
	stallStartDrop int64     // dropped counter value just before the stall began
	lastSummaryAt  time.Time // when the most recent drop/summary line was written
}
|
||||
|
||||
// dropLogSummaryInterval bounds how often a summary line may be written
// while a stall persists. It is a variable rather than a constant so that
// tests can shorten it.
var dropLogSummaryInterval = 1000 * time.Millisecond
|
||||
|
||||
// NewIngestBuffer returns a buffer holding up to capacity pending jobs.
|
||||
// Non-positive capacity is clamped to 1 and a WARN is logged so the
|
||||
// misconfiguration is visible (PR #1609 m2 — silent clamp hid bad
|
||||
// ingestBufferSize values).
|
||||
func NewIngestBuffer(capacity int) *IngestBuffer {
|
||||
if capacity < 1 {
|
||||
log.Printf("[ingest-buffer] WARN: requested capacity %d < 1, clamping to 1 — check ingestBufferSize config; default is 50000", capacity)
|
||||
capacity = 1
|
||||
}
|
||||
return &IngestBuffer{
|
||||
jobs: make(chan func(), capacity),
|
||||
ready: make(chan struct{}),
|
||||
stop: make(chan struct{}),
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Submit enqueues a job without blocking. If the buffer is full the job is
|
||||
// dropped and the dropped counter is incremented. Safe for concurrent callers.
|
||||
//
|
||||
// Ordering invariant: callers MUST call Start() before the first Submit().
|
||||
// Submit only enqueues — without a running consumer, jobs sit in the channel
|
||||
// and (once cap is reached) are silently dropped until Start()+Ready() run.
|
||||
//
|
||||
// Drop logging (PR #1623 round-1 fix to #1609 M1) uses a time-based
|
||||
// throttle to stay loud-on-stall-start without flooding under sustained
|
||||
// stalls:
|
||||
// - the FIRST drop of a stall logs immediately
|
||||
// - subsequent drops are summarized at most once per second
|
||||
// - when the next Submit succeeds, a "drained" recovery line is
|
||||
// emitted so operators can quantify the burst
|
||||
//
|
||||
// All log lines include the buffer capacity for operator triage.
|
||||
func (b *IngestBuffer) Submit(job func()) {
|
||||
select {
|
||||
case b.jobs <- job:
|
||||
b.maybeLogRecovery()
|
||||
default:
|
||||
n := b.dropped.Add(1)
|
||||
b.logDrop(n)
|
||||
}
|
||||
}
|
||||
|
||||
// logDrop emits a drop log line under the time-based throttle. The first
|
||||
// drop of a stall always logs; subsequent drops summarize at most once
|
||||
// per dropLogSummaryInterval.
|
||||
func (b *IngestBuffer) logDrop(n int64) {
|
||||
b.dropLogMu.Lock()
|
||||
defer b.dropLogMu.Unlock()
|
||||
now := time.Now()
|
||||
if !b.stallActive {
|
||||
b.stallActive = true
|
||||
b.stallStart = now
|
||||
b.stallStartDrop = n - 1 // last successful Submit -> this is the 1st drop of the stall
|
||||
b.lastSummaryAt = now
|
||||
log.Printf("[ingest-buffer] WARNING: buffer full (cap %d), dropped %d message(s) total — write path stalled, raise ingestBufferSize or investigate slow writer", cap(b.jobs), n)
|
||||
return
|
||||
}
|
||||
if now.Sub(b.lastSummaryAt) >= dropLogSummaryInterval {
|
||||
b.lastSummaryAt = now
|
||||
stallDrops := n - b.stallStartDrop
|
||||
log.Printf("[ingest-buffer] WARNING: buffer full (cap %d), %d drop(s) in current stall, %d total — write path still stalled", cap(b.jobs), stallDrops, n)
|
||||
}
|
||||
}
|
||||
|
||||
// maybeLogRecovery is called from the success branch of Submit. If a
|
||||
// stall was active, it logs a recovery line summarizing the burst and
|
||||
// clears the stall state.
|
||||
func (b *IngestBuffer) maybeLogRecovery() {
|
||||
b.dropLogMu.Lock()
|
||||
defer b.dropLogMu.Unlock()
|
||||
if !b.stallActive {
|
||||
return
|
||||
}
|
||||
stallDrops := b.dropped.Load() - b.stallStartDrop
|
||||
dur := time.Since(b.stallStart)
|
||||
log.Printf("[ingest-buffer] INFO: buffer drained, %d drop(s) over %s (cap %d) — write path recovered", stallDrops, dur.Round(time.Millisecond), cap(b.jobs))
|
||||
b.stallActive = false
|
||||
}
|
||||
|
||||
// Start launches the consumer goroutine. It blocks until Ready() is called
|
||||
// (or Stop() fires, whichever comes first), then drains buffered jobs and
|
||||
// runs newly-submitted ones serially, in FIFO order. Idempotent.
|
||||
//
|
||||
// Lifecycle: Stop() closes b.stop, which causes the consumer to exit via
|
||||
// the stop-select arm (after draining any queued jobs if Ready() had
|
||||
// already fired). The b.jobs channel is never closed — closing it would
|
||||
// race with concurrent Submit() callers and panic; instead jobs is
|
||||
// garbage-collected with the buffer once all references drop. Done() is
|
||||
// closed when the consumer goroutine returns.
|
||||
func (b *IngestBuffer) Start() {
|
||||
b.startOnce.Do(func() {
|
||||
go func() {
|
||||
defer close(b.done)
|
||||
select {
|
||||
case <-b.ready:
|
||||
case <-b.stop:
|
||||
// Stopped before Ready — exit immediately. Pending jobs
|
||||
// are discarded; the buffer was never authorized to drain.
|
||||
return
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case job := <-b.jobs:
|
||||
job()
|
||||
case <-b.stop:
|
||||
// Stop after Ready — drain whatever is queued so
|
||||
// shutdown is graceful, then exit. b.jobs is never
|
||||
// closed (see Start godoc), so a default-case
|
||||
// non-blocking receive is the correct drain idiom.
|
||||
for {
|
||||
select {
|
||||
case job := <-b.jobs:
|
||||
job()
|
||||
default:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
}
|
||||
|
||||
// Ready signals that the write path is available; the consumer begins
|
||||
// draining. Idempotent.
|
||||
//
|
||||
// Ordering invariant: Start() MUST have been called before Ready() takes
|
||||
// effect. Calling Ready() without a prior Start() simply closes the ready
|
||||
// channel — nothing drains until a later Start() runs its consumer goroutine.
|
||||
func (b *IngestBuffer) Ready() {
|
||||
b.readyOnce.Do(func() { close(b.ready) })
|
||||
}
|
||||
|
||||
// Dropped returns the number of jobs dropped due to a full buffer.
|
||||
func (b *IngestBuffer) Dropped() int64 { return b.dropped.Load() }
|
||||
|
||||
// Pending returns the current queue depth (best-effort; for observability).
|
||||
func (b *IngestBuffer) Pending() int { return len(b.jobs) }
|
||||
|
||||
// Stop signals the consumer goroutine to exit. Test-hygiene helper so unit
|
||||
// tests don't leak the goroutine that Start() spawns. Idempotent / safe to
|
||||
// call without a prior Start(). After Stop() the consumer exits and Done()
|
||||
// is closed.
|
||||
func (b *IngestBuffer) Stop() {
|
||||
b.stopOnce.Do(func() { close(b.stop) })
|
||||
}
|
||||
|
||||
// Done returns a channel that is closed after the consumer goroutine has
|
||||
// exited. If Start() was never called, Done() never closes.
|
||||
func (b *IngestBuffer) Done() <-chan struct{} {
|
||||
return b.done
|
||||
}
|
||||
@@ -1,274 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestIngestBuffer_BuffersUntilReady(t *testing.T) {
|
||||
b := NewIngestBuffer(10)
|
||||
t.Cleanup(b.Stop)
|
||||
var ran atomic.Int64
|
||||
b.Start()
|
||||
for i := 0; i < 3; i++ {
|
||||
b.Submit(func() { ran.Add(1) })
|
||||
}
|
||||
time.Sleep(30 * time.Millisecond)
|
||||
if ran.Load() != 0 {
|
||||
t.Fatalf("jobs ran before Ready(): %d", ran.Load())
|
||||
}
|
||||
b.Ready()
|
||||
deadline := time.Now().Add(time.Second)
|
||||
for ran.Load() < 3 && time.Now().Before(deadline) {
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
}
|
||||
if ran.Load() != 3 {
|
||||
t.Fatalf("want 3 ran after Ready, got %d", ran.Load())
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_FIFOOrder(t *testing.T) {
|
||||
b := NewIngestBuffer(10)
|
||||
t.Cleanup(b.Stop)
|
||||
out := make(chan int, 5)
|
||||
b.Start()
|
||||
for i := 0; i < 5; i++ {
|
||||
i := i
|
||||
b.Submit(func() { out <- i })
|
||||
}
|
||||
b.Ready()
|
||||
for want := 0; want < 5; want++ {
|
||||
select {
|
||||
case got := <-out:
|
||||
if got != want {
|
||||
t.Fatalf("order: want %d got %d", want, got)
|
||||
}
|
||||
case <-time.After(time.Second):
|
||||
t.Fatalf("timeout waiting for job %d", want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_DropsWhenFull(t *testing.T) {
|
||||
b := NewIngestBuffer(2)
|
||||
t.Cleanup(b.Stop) // never Ready()'d -> nothing drains
|
||||
for i := 0; i < 5; i++ {
|
||||
b.Submit(func() {})
|
||||
}
|
||||
if got := b.Dropped(); got != 3 {
|
||||
t.Fatalf("want 3 dropped (cap 2, 5 submitted), got %d", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_ProcessesAfterReady(t *testing.T) {
|
||||
b := NewIngestBuffer(10)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Start()
|
||||
b.Ready()
|
||||
done := make(chan struct{})
|
||||
b.Submit(func() { close(done) })
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("job submitted after Ready was not processed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_SerialExecution(t *testing.T) {
|
||||
b := NewIngestBuffer(50)
|
||||
t.Cleanup(b.Stop)
|
||||
var inFlight atomic.Int32
|
||||
var overlap atomic.Bool
|
||||
var wg sync.WaitGroup
|
||||
b.Start()
|
||||
const n = 20
|
||||
wg.Add(n)
|
||||
for i := 0; i < n; i++ {
|
||||
b.Submit(func() {
|
||||
if inFlight.Add(1) > 1 {
|
||||
overlap.Store(true)
|
||||
}
|
||||
time.Sleep(time.Millisecond)
|
||||
inFlight.Add(-1)
|
||||
wg.Done()
|
||||
})
|
||||
}
|
||||
b.Ready()
|
||||
wg.Wait()
|
||||
if overlap.Load() {
|
||||
t.Fatal("jobs overlapped — consumer is not serial (violates single-writer)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestBuffer_ConcurrentSubmitSafe(t *testing.T) {
|
||||
b := NewIngestBuffer(20000)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Start()
|
||||
var wg sync.WaitGroup
|
||||
for g := 0; g < 8; g++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < 1000; i++ {
|
||||
b.Submit(func() {})
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
b.Ready()
|
||||
// Assertion is the absence of a race/panic; run under -race in CI.
|
||||
}
|
||||
|
||||
// TestIngestBuffer_StopUnblocksConsumer guards the consumer-goroutine leak
|
||||
// described in PR #1609 review m1: Start() blocks on <-b.ready forever if
|
||||
// Ready() is never called, leaking the goroutine in test runs. Stop() must
|
||||
// signal the consumer to exit cleanly without requiring Ready().
|
||||
func TestIngestBuffer_StopUnblocksConsumer(t *testing.T) {
|
||||
b := NewIngestBuffer(10)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Start()
|
||||
// Do NOT call Ready(). The consumer must exit purely because of Stop().
|
||||
b.Stop()
|
||||
select {
|
||||
case <-b.Done():
|
||||
// good — consumer goroutine returned
|
||||
case <-time.After(time.Second):
|
||||
t.Fatal("Stop() did not unblock the consumer goroutine within 1s (Done() never closed)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewIngestBuffer_WarnsOnSubOneClamp asserts that constructing the
|
||||
// buffer with a non-positive capacity emits a WARN log line. Silent
|
||||
// clamping (PR #1609 review m2) hid misconfigurations like
|
||||
// ingestBufferSize=-1 or 0-from-default-not-applied paths.
|
||||
func TestNewIngestBuffer_WarnsOnSubOneClamp(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
oldOut := log.Writer()
|
||||
oldFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(oldOut)
|
||||
log.SetFlags(oldFlags)
|
||||
})
|
||||
|
||||
b := NewIngestBuffer(0)
|
||||
t.Cleanup(b.Stop)
|
||||
|
||||
got := buf.String()
|
||||
if !strings.Contains(got, "WARN") || !strings.Contains(got, "ingest-buffer") {
|
||||
t.Fatalf("expected WARN log on sub-one clamp, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIngestBuffer_DropLogThrottle asserts the time-based throttle (PR
|
||||
// #1623 round-1 fix to #1609 M1): the FIRST drop of a stall logs
|
||||
// immediately (loud), then subsequent drops within the same stall are
|
||||
// rate-limited to at most one summary line per second, and a recovery
|
||||
// line is emitted when Submit succeeds again. This prevents log-flood
|
||||
// under sustained stalls (potentially hundreds of MB/min) while
|
||||
// preserving "loud the instant the stall starts".
|
||||
func TestIngestBuffer_DropLogThrottle(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
oldOut := log.Writer()
|
||||
oldFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(oldOut)
|
||||
log.SetFlags(oldFlags)
|
||||
})
|
||||
|
||||
b := NewIngestBuffer(2)
|
||||
t.Cleanup(b.Stop)
|
||||
// Fill to capacity (no Ready() — nothing drains).
|
||||
for i := 0; i < 2; i++ {
|
||||
b.Submit(func() {})
|
||||
}
|
||||
// 100 drops in tight loop (well under 1s).
|
||||
for i := 0; i < 100; i++ {
|
||||
b.Submit(func() {})
|
||||
}
|
||||
|
||||
got := buf.String()
|
||||
lines := strings.Count(got, "buffer full")
|
||||
if lines < 1 {
|
||||
t.Fatalf("expected the FIRST drop to log immediately; got 0 'buffer full' lines:\n%s", got)
|
||||
}
|
||||
if lines > 2 {
|
||||
t.Fatalf("expected at most 2 'buffer full' lines for 100 drops in <1s (first + at-most-one summary), got %d:\n%s", lines, got)
|
||||
}
|
||||
// Every line must include the capacity for operator triage.
|
||||
if !strings.Contains(got, "cap 2") {
|
||||
t.Fatalf("expected every drop log line to include 'cap 2', got:\n%s", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIngestBuffer_DropLogFirstAlwaysImmediate guards the "loud the
|
||||
// instant the stall starts" half of the throttle contract from PR
|
||||
// #1623: even a single drop must log immediately, not be silently
|
||||
// absorbed by the per-second summary window.
|
||||
func TestIngestBuffer_DropLogFirstAlwaysImmediate(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
oldOut := log.Writer()
|
||||
oldFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(oldOut)
|
||||
log.SetFlags(oldFlags)
|
||||
})
|
||||
|
||||
b := NewIngestBuffer(1)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Submit(func() {}) // fills cap=1
|
||||
b.Submit(func() {}) // first drop
|
||||
got := buf.String()
|
||||
if !strings.Contains(got, "buffer full") {
|
||||
t.Fatalf("expected FIRST drop to log immediately; got:\n%s", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIngestBuffer_DropLogRecoveryAfterDrain guards the recovery-line
|
||||
// half of the throttle contract: once Submit succeeds again after one
|
||||
// or more drops, a "recovered" / "drained" line must be emitted so
|
||||
// operators can quantify the burst (PR #1623).
|
||||
func TestIngestBuffer_DropLogRecoveryAfterDrain(t *testing.T) {
|
||||
var buf bytes.Buffer
|
||||
oldOut := log.Writer()
|
||||
oldFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
t.Cleanup(func() {
|
||||
log.SetOutput(oldOut)
|
||||
log.SetFlags(oldFlags)
|
||||
})
|
||||
|
||||
b := NewIngestBuffer(1)
|
||||
t.Cleanup(b.Stop)
|
||||
b.Submit(func() {}) // fills cap=1
|
||||
for i := 0; i < 3; i++ {
|
||||
b.Submit(func() {}) // drops
|
||||
}
|
||||
// Drain: start consumer and Ready(), wait for queue to empty.
|
||||
b.Start()
|
||||
b.Ready()
|
||||
deadline := time.Now().Add(time.Second)
|
||||
for b.Pending() > 0 && time.Now().Before(deadline) {
|
||||
time.Sleep(2 * time.Millisecond)
|
||||
}
|
||||
// Now a successful Submit should trigger the recovery line.
|
||||
b.Submit(func() {})
|
||||
// Give the goroutine + log a moment.
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
got := buf.String()
|
||||
if !strings.Contains(got, "drained") && !strings.Contains(got, "recovered") {
|
||||
t.Fatalf("expected a 'drained'/'recovered' log line after stall ended; got:\n%s", got)
|
||||
}
|
||||
}
|
||||
@@ -1,126 +0,0 @@
|
||||
package main
|
||||
|
||||
// Regression test for issue #1370 — counters PR #1233 (commit 498fbc03).
|
||||
//
|
||||
// PR #1233 made the ingestor use the MQTT envelope's "timestamp" field as
|
||||
// transmissions.first_seen / observations.timestamp, on the premise that
|
||||
// uploaders stamp it at radio receive and the value is trustworthy.
|
||||
//
|
||||
// That premise FAILS for observers whose own clock is wrong. Staging
|
||||
// Voodoo3 tx 304114 in channel #test had 5 observations:
|
||||
// - 4 from Voodoo3 stamped "18:42" — Voodoo3's broken client clock,
|
||||
// - 1 from another observer stamped "01:42" — the actual receive time.
|
||||
// Voodoo3 ingested first, so first_seen locked at "18:42" and the
|
||||
// /api/channels row showed the channel as last-active 7h+ in the past.
|
||||
//
|
||||
// Fix: revert the storage path — packet/observation timestamps are
|
||||
// server ingest time (time.Now() at the ingestor). Envelope timestamp
|
||||
// stays usable for observer.last_seen (PR #1233's MAX/MIN guard there
|
||||
// is fine and unrelated to the channel-ordering bug).
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Raw packet path: envelope reports timestamp 7h in the past
|
||||
// (simulating Voodoo3's broken client clock). After ingest,
|
||||
// transmissions.first_seen and observations.timestamp must reflect
|
||||
// SERVER wall clock, not the bogus envelope value.
|
||||
func TestHandleMessage_PacketTimestamp_IgnoresStaleEnvelope_1370(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
stale := time.Now().UTC().Add(-7 * time.Hour).Format(time.RFC3339)
|
||||
before := time.Now().Unix()
|
||||
|
||||
rawHex := "0A00D69FD7A5A7475DB07337749AE61FA53A4788E976"
|
||||
payload := []byte(`{"raw":"` + rawHex + `","SNR":5.5,"RSSI":-100.0,"origin":"voodoo3","timestamp":"` + stale + `"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/voodoo3/packets", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
after := time.Now().Unix()
|
||||
|
||||
// ─── transmissions.first_seen ───────────────────────────────────────
|
||||
var firstSeen string
|
||||
if err := store.db.QueryRow(`SELECT first_seen FROM transmissions LIMIT 1`).Scan(&firstSeen); err != nil {
|
||||
t.Fatalf("scan first_seen: %v", err)
|
||||
}
|
||||
fsParsed, err := time.Parse(time.RFC3339, firstSeen)
|
||||
if err != nil {
|
||||
t.Fatalf("first_seen %q not RFC3339: %v", firstSeen, err)
|
||||
}
|
||||
if fsParsed.Unix() < before-5 || fsParsed.Unix() > after+5 {
|
||||
t.Errorf("transmissions.first_seen = %q (epoch %d); want in [%d, %d] (server wall clock). "+
|
||||
"Envelope reported stale %q (7h ago) — PR #1233's premise that envelope timestamp is trustworthy is FALSE for buggy-clock observers. Issue #1370.",
|
||||
firstSeen, fsParsed.Unix(), before, after, stale)
|
||||
}
|
||||
|
||||
// ─── observations.timestamp (epoch) ─────────────────────────────────
|
||||
var obsTs int64
|
||||
if err := store.db.QueryRow(`SELECT timestamp FROM observations LIMIT 1`).Scan(&obsTs); err != nil {
|
||||
t.Fatalf("scan observations.timestamp: %v", err)
|
||||
}
|
||||
if obsTs < before-5 || obsTs > after+5 {
|
||||
t.Errorf("observations.timestamp = %d; want in [%d, %d] (server wall clock). Envelope stale = %q. Issue #1370.",
|
||||
obsTs, before, after, stale)
|
||||
}
|
||||
}
|
||||
|
||||
// Channel-message (BLE companion) path: envelope timestamp stale → stored
|
||||
// transmissions.first_seen must still be server wall clock.
|
||||
func TestHandleMessage_ChannelPath_PacketTimestamp_IgnoresStaleEnvelope_1370(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
stale := time.Now().UTC().Add(-7 * time.Hour).Format(time.RFC3339)
|
||||
before := time.Now().Unix()
|
||||
|
||||
payload := []byte(`{"text":"Voodoo3: tst hmdpt","channel_idx":3,"SNR":5.0,"RSSI":-95,"timestamp":"` + stale + `","sender_timestamp":` + strconv.FormatInt(time.Now().Unix(), 10) + `}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/channel/3", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
after := time.Now().Unix()
|
||||
|
||||
var firstSeen string
|
||||
if err := store.db.QueryRow(`SELECT first_seen FROM transmissions LIMIT 1`).Scan(&firstSeen); err != nil {
|
||||
t.Fatalf("scan first_seen: %v", err)
|
||||
}
|
||||
fsParsed, err := time.Parse(time.RFC3339, firstSeen)
|
||||
if err != nil {
|
||||
t.Fatalf("first_seen %q not RFC3339: %v", firstSeen, err)
|
||||
}
|
||||
if fsParsed.Unix() < before-5 || fsParsed.Unix() > after+5 {
|
||||
t.Errorf("channel-path transmissions.first_seen = %q (epoch %d); want in [%d, %d] (server wall clock). Envelope stale = %q. Issue #1370.",
|
||||
firstSeen, fsParsed.Unix(), before, after, stale)
|
||||
}
|
||||
}
|
||||
|
||||
// DM (BLE companion direct-message) path: same revert applies.
|
||||
func TestHandleMessage_DMPath_PacketTimestamp_IgnoresStaleEnvelope_1370(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
stale := time.Now().UTC().Add(-7 * time.Hour).Format(time.RFC3339)
|
||||
before := time.Now().Unix()
|
||||
|
||||
payload := []byte(`{"text":"Voodoo3: hello","SNR":5.0,"RSSI":-95,"timestamp":"` + stale + `"}`)
|
||||
msg := &mockMessage{topic: "meshcore/message/direct/voodoo3", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
after := time.Now().Unix()
|
||||
|
||||
var firstSeen string
|
||||
if err := store.db.QueryRow(`SELECT first_seen FROM transmissions LIMIT 1`).Scan(&firstSeen); err != nil {
|
||||
t.Fatalf("scan first_seen: %v", err)
|
||||
}
|
||||
fsParsed, err := time.Parse(time.RFC3339, firstSeen)
|
||||
if err != nil {
|
||||
t.Fatalf("first_seen %q not RFC3339: %v", firstSeen, err)
|
||||
}
|
||||
if fsParsed.Unix() < before-5 || fsParsed.Unix() > after+5 {
|
||||
t.Errorf("DM-path transmissions.first_seen = %q (epoch %d); want in [%d, %d] (server wall clock). Envelope stale = %q. Issue #1370.",
|
||||
firstSeen, fsParsed.Unix(), before, after, stale)
|
||||
}
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
package main
|
||||
|
||||
// Tests for issue #1279 P2 item 5: ingestor RAW_CUSTOM exposure.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestDecodeRawCustomExposesLengthAndTag(t *testing.T) {
|
||||
// header = (1<<6)|(0x0F<<2)|1 = 0x7D ; path byte = 0x00 ; payload = A5 DE AD BE EF
|
||||
hexStr := "7D00A5DEADBEEF"
|
||||
pkt, err := DecodePacket(hexStr, nil, false)
|
||||
if err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if pkt.Payload.Type != "RAW_CUSTOM" {
|
||||
t.Fatalf("payload type = %q, want RAW_CUSTOM", pkt.Payload.Type)
|
||||
}
|
||||
if pkt.Payload.RawLength == nil || *pkt.Payload.RawLength != 5 {
|
||||
got := -1
|
||||
if pkt.Payload.RawLength != nil {
|
||||
got = *pkt.Payload.RawLength
|
||||
}
|
||||
t.Errorf("RawLength=%d, want 5", got)
|
||||
}
|
||||
if !strings.EqualFold(pkt.Payload.FirstByteTag, "A5") {
|
||||
t.Errorf("FirstByteTag=%q, want A5", pkt.Payload.FirstByteTag)
|
||||
}
|
||||
}
|
||||
@@ -1,211 +0,0 @@
|
||||
package main
|
||||
|
||||
// Tests for issue #1279 P0+P1 decoder additions.
|
||||
//
|
||||
// Each test uses firmware-derived wire vectors:
|
||||
// - GRP_DATA outer: firmware/src/helpers/BaseChatMesh.cpp:500 (createGroupDatagram)
|
||||
// - GRP_DATA inner: firmware/src/helpers/BaseChatMesh.cpp:382-385
|
||||
// - MULTIPART byte0: firmware/src/Mesh.cpp:289
|
||||
// - MULTIPART ACK inner: firmware/src/Mesh.cpp:292-307
|
||||
// - CONTROL byte0 flags: firmware/src/Mesh.cpp:69 + createControlData at Mesh.cpp:609
|
||||
// - advertRole label rules: firmware/src/helpers/AdvertDataHelpers.h:7-12
|
||||
|
||||
import (
|
||||
"crypto/aes"
|
||||
"crypto/hmac"
|
||||
"crypto/sha256"
|
||||
"encoding/binary"
|
||||
"encoding/hex"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// --- P0 #1: GRP_DATA decoder ---
|
||||
|
||||
// buildChannelEncrypted encrypts arbitrary inner bytes with the channel
// key/MAC scheme firmware uses for both GRP_TXT and GRP_DATA (see
// BaseChatMesh.cpp:376-391: AES-128-ECB, HMAC-SHA256-trunc-2 MAC).
//
// channelKeyHex is the hex-encoded channel key and inner is the plaintext.
// inner is zero-padded up to a multiple of the AES block size (no padding
// is added when it is already aligned) and encrypted block by block. The
// MAC is the first two bytes of HMAC-SHA256 over the ciphertext, keyed with
// the channel key zero-extended to 32 bytes. Both results are returned
// hex-encoded.
//
// This is a test helper without access to *testing.T, so a malformed key
// panics with a descriptive message instead of surfacing later as a
// nil-pointer dereference.
func buildChannelEncrypted(channelKeyHex string, inner []byte) (ctHex, macHex string) {
	key, err := hex.DecodeString(channelKeyHex)
	if err != nil {
		panic("buildChannelEncrypted: invalid channel key hex: " + err.Error())
	}
	block, err := aes.NewCipher(key)
	if err != nil {
		panic("buildChannelEncrypted: invalid channel key: " + err.Error())
	}

	// Copy before padding so the caller's slice is never modified.
	plain := append([]byte{}, inner...)
	if rem := len(plain) % aes.BlockSize; rem != 0 {
		plain = append(plain, make([]byte, aes.BlockSize-rem)...)
	}

	// ECB: each block is encrypted independently.
	ct := make([]byte, len(plain))
	for i := 0; i < len(plain); i += aes.BlockSize {
		block.Encrypt(ct[i:i+aes.BlockSize], plain[i:i+aes.BlockSize])
	}

	// The HMAC secret is the key zero-extended to 32 bytes.
	secret := make([]byte, 32)
	copy(secret, key)
	h := hmac.New(sha256.New, secret)
	h.Write(ct)
	mac := h.Sum(nil)
	return hex.EncodeToString(ct), hex.EncodeToString(mac[:2])
}
|
||||
|
||||
func TestDecodeGrpDataNoKey(t *testing.T) {
|
||||
// Envelope alone (no key in store).
|
||||
buf := []byte{0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11}
|
||||
p := decodeGrpData(buf, nil)
|
||||
if p.Type != "GRP_DATA" {
|
||||
t.Fatalf("type=%q want GRP_DATA", p.Type)
|
||||
}
|
||||
if p.ChannelHash != 0xAA {
|
||||
t.Errorf("channelHash=%d want 170", p.ChannelHash)
|
||||
}
|
||||
if p.ChannelHashHex != "AA" {
|
||||
t.Errorf("channelHashHex=%q want AA", p.ChannelHashHex)
|
||||
}
|
||||
if p.MAC != "bbcc" {
|
||||
t.Errorf("mac=%q want bbcc", p.MAC)
|
||||
}
|
||||
if p.EncryptedData != "ddeeff11" {
|
||||
t.Errorf("encryptedData=%q want ddeeff11", p.EncryptedData)
|
||||
}
|
||||
if p.DecryptionStatus != "no_key" {
|
||||
t.Errorf("decryptionStatus=%q want no_key", p.DecryptionStatus)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeGrpDataDecryptedInner(t *testing.T) {
|
||||
// Inner per BaseChatMesh.cpp:382-385: data_type(uint16 LE) + data_len(1) + blob.
|
||||
key := "2cc3d22840e086105ad73443da2cacb8"
|
||||
blob := []byte{0x10, 0x20, 0x30, 0x40, 0x50}
|
||||
inner := []byte{0x34, 0x12, byte(len(blob))} // data_type = 0x1234
|
||||
inner = append(inner, blob...)
|
||||
ctHex, macHex := buildChannelEncrypted(key, inner)
|
||||
|
||||
buf := []byte{0xAB}
|
||||
mb, _ := hex.DecodeString(macHex)
|
||||
buf = append(buf, mb...)
|
||||
cb, _ := hex.DecodeString(ctHex)
|
||||
buf = append(buf, cb...)
|
||||
|
||||
p := decodeGrpData(buf, map[string]string{"test": key})
|
||||
if p.Type != "GRP_DATA" {
|
||||
t.Fatalf("type=%q want GRP_DATA", p.Type)
|
||||
}
|
||||
if p.DecryptionStatus != "decrypted" {
|
||||
t.Fatalf("decryptionStatus=%q want decrypted", p.DecryptionStatus)
|
||||
}
|
||||
if p.DataType == nil || *p.DataType != 0x1234 {
|
||||
t.Errorf("dataType=%v want 0x1234", p.DataType)
|
||||
}
|
||||
if p.DataLen == nil || *p.DataLen != 5 {
|
||||
t.Errorf("dataLen=%v want 5", p.DataLen)
|
||||
}
|
||||
if p.DecryptedBlob != hex.EncodeToString(blob) {
|
||||
t.Errorf("decryptedBlob=%q want %q", p.DecryptedBlob, hex.EncodeToString(blob))
|
||||
}
|
||||
if p.Channel != "test" {
|
||||
t.Errorf("channel=%q want test", p.Channel)
|
||||
}
|
||||
}
|
||||
|
||||
// --- P0 #2: MULTIPART decoder ---
|
||||
|
||||
func TestDecodeMultipartAck(t *testing.T) {
|
||||
// remaining=3, inner_type=PAYLOAD_TYPE_ACK(0x03), ack_crc=0xDEADBEEF.
|
||||
// byte0 = (3<<4) | 3 = 0x33; next 4 bytes are LE crc.
|
||||
buf := []byte{0x33, 0xEF, 0xBE, 0xAD, 0xDE}
|
||||
p := decodeMultipart(buf)
|
||||
if p.Type != "MULTIPART" {
|
||||
t.Fatalf("type=%q want MULTIPART", p.Type)
|
||||
}
|
||||
if p.Remaining == nil || *p.Remaining != 3 {
|
||||
t.Errorf("remaining=%v want 3", p.Remaining)
|
||||
}
|
||||
if p.InnerType == nil || *p.InnerType != 0x03 {
|
||||
t.Errorf("innerType=%v want 3", p.InnerType)
|
||||
}
|
||||
if p.InnerTypeName != "ACK" {
|
||||
t.Errorf("innerTypeName=%q want ACK", p.InnerTypeName)
|
||||
}
|
||||
if p.InnerAckCrc != "deadbeef" {
|
||||
t.Errorf("innerAckCrc=%q want deadbeef", p.InnerAckCrc)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeMultipartNonAck(t *testing.T) {
|
||||
// remaining=2, inner_type=0x02 (TXT_MSG), arbitrary inner payload.
|
||||
buf := []byte{0x22, 0x01, 0x02, 0x03}
|
||||
p := decodeMultipart(buf)
|
||||
if p.Remaining == nil || *p.Remaining != 2 {
|
||||
t.Errorf("remaining=%v want 2", p.Remaining)
|
||||
}
|
||||
if p.InnerType == nil || *p.InnerType != 0x02 {
|
||||
t.Errorf("innerType=%v want 2", p.InnerType)
|
||||
}
|
||||
if p.InnerTypeName != "TXT_MSG" {
|
||||
t.Errorf("innerTypeName=%q want TXT_MSG", p.InnerTypeName)
|
||||
}
|
||||
if p.InnerPayload != "010203" {
|
||||
t.Errorf("innerPayload=%q want 010203", p.InnerPayload)
|
||||
}
|
||||
if p.InnerAckCrc != "" {
|
||||
t.Errorf("non-ACK should not surface innerAckCrc, got %q", p.InnerAckCrc)
|
||||
}
|
||||
}
|
||||
|
||||
// --- P1 #3: advertRole label fix ---
|
||||
|
||||
// TestAdvertRoleLabelsRawType checks the label advertRole produces for each
// raw advert type value.
func TestAdvertRoleLabelsRawType(t *testing.T) {
	// Firmware: ADV_TYPE_NONE=0, CHAT=1, REPEATER=2, ROOM=3, SENSOR=4,
	// 5..15 FUTURE.
	type roleCase struct {
		rawType int
		label   string
	}
	table := []roleCase{
		{rawType: 0, label: "none"},
		{rawType: 1, label: "companion"},
		{rawType: 2, label: "repeater"},
		{rawType: 3, label: "room"},
		{rawType: 4, label: "sensor"},
		{rawType: 5, label: "type-5"},
		{rawType: 15, label: "type-15"},
	}

	for idx := range table {
		c := table[idx]
		flags := &AdvertFlags{
			Type:     c.rawType,
			Repeater: c.rawType == 2,
			Room:     c.rawType == 3,
			Sensor:   c.rawType == 4,
		}
		if got := advertRole(flags); got != c.label {
			t.Errorf("advertRole(type=%d) = %q, want %q", c.rawType, got, c.label)
		}
	}
}
|
||||
|
||||
// --- P1 #4: CONTROL byte0 flags ---
|
||||
|
||||
func TestDecodeControlZeroHop(t *testing.T) {
|
||||
// byte0 = 0x81 (high-bit set ⇒ zero-hop), followed by 3 app bytes.
|
||||
buf := []byte{0x81, 0xAA, 0xBB, 0xCC}
|
||||
p := decodeControl(buf)
|
||||
if p.Type != "CONTROL" {
|
||||
t.Fatalf("type=%q want CONTROL", p.Type)
|
||||
}
|
||||
if p.CtrlFlags != "81" {
|
||||
t.Errorf("ctrlFlags=%q want 81", p.CtrlFlags)
|
||||
}
|
||||
if p.CtrlZeroHop == nil || !*p.CtrlZeroHop {
|
||||
t.Errorf("ctrlZeroHop=%v want true", p.CtrlZeroHop)
|
||||
}
|
||||
if p.CtrlLength == nil || *p.CtrlLength != 4 {
|
||||
t.Errorf("ctrlLength=%v want 4", p.CtrlLength)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeControlMultiHop(t *testing.T) {
|
||||
// byte0 = 0x01 (high-bit clear ⇒ not zero-hop subset).
|
||||
buf := []byte{0x01, 0x42}
|
||||
p := decodeControl(buf)
|
||||
if p.CtrlFlags != "01" {
|
||||
t.Errorf("ctrlFlags=%q want 01", p.CtrlFlags)
|
||||
}
|
||||
if p.CtrlZeroHop == nil || *p.CtrlZeroHop {
|
||||
t.Errorf("ctrlZeroHop=%v want false", p.CtrlZeroHop)
|
||||
}
|
||||
if p.CtrlLength == nil || *p.CtrlLength != 2 {
|
||||
t.Errorf("ctrlLength=%v want 2", p.CtrlLength)
|
||||
}
|
||||
}
|
||||
|
||||
// silence unused-import diagnostics for stub-phase builds
|
||||
var _ = binary.LittleEndian
|
||||
@@ -1,98 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// TestIngestorPruneOldPackets enforces #1283: the writer for
|
||||
// transmissions retention lives on the ingestor's *Store. Before the fix,
|
||||
// this lived on cmd/server/*DB and raced with ingestor INSERTs. After
|
||||
// the fix, ingestor owns it and runs it on its own write-locked handle.
|
||||
func TestIngestorPruneOldPackets(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "prune.db")
|
||||
store, err := OpenStore(path)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
old := time.Now().UTC().AddDate(0, 0, -10).Format(time.RFC3339)
|
||||
new := time.Now().UTC().Format(time.RFC3339)
|
||||
for i, ts := range []string{old, old, new} {
|
||||
_, err := store.db.Exec(
|
||||
`INSERT INTO transmissions (raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json)
|
||||
VALUES (?, ?, ?, 0, 1, 1, '{}')`,
|
||||
"AA", "h"+string(rune('a'+i)), ts,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("seed tx: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
n, err := store.PruneOldPackets(5)
|
||||
if err != nil {
|
||||
t.Fatalf("PruneOldPackets: %v", err)
|
||||
}
|
||||
if n != 2 {
|
||||
t.Fatalf("expected 2 pruned, got %d", n)
|
||||
}
|
||||
|
||||
var remaining int
|
||||
if err := store.db.QueryRow(`SELECT COUNT(*) FROM transmissions`).Scan(&remaining); err != nil {
|
||||
t.Fatalf("count: %v", err)
|
||||
}
|
||||
if remaining != 1 {
|
||||
t.Fatalf("expected 1 transmission remaining, got %d", remaining)
|
||||
}
|
||||
}
|
||||
|
||||
// TestIngestorVacuumOnStartupMigratesNONEtoINCREMENTAL exercises the
|
||||
// scenario that originally broke in #1283: a fresh DB with
|
||||
// auto_vacuum=NONE, vacuumOnStartup=true, no contention from a server
|
||||
// process. The ingestor must complete the VACUUM and flip auto_vacuum to
|
||||
// INCREMENTAL. Before the fix, the migration ran inside cmd/server and
|
||||
// hit SQLITE_BUSY because the ingestor (sharing the container) was
|
||||
// already writing.
|
||||
func TestIngestorVacuumOnStartupMigratesNONEtoINCREMENTAL(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "vac.db")
|
||||
|
||||
// Create a NONE-auto_vacuum DB (simulates an older deployment).
|
||||
seed, err := sql.Open("sqlite", path+"?_pragma=journal_mode(WAL)")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
seed.SetMaxOpenConns(1)
|
||||
if _, err := seed.Exec(`CREATE TABLE dummy(id INTEGER PRIMARY KEY)`); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
var before int
|
||||
seed.QueryRow("PRAGMA auto_vacuum").Scan(&before)
|
||||
if before != 0 {
|
||||
t.Fatalf("precondition: auto_vacuum=%d, want 0", before)
|
||||
}
|
||||
seed.Close()
|
||||
|
||||
store, err := OpenStore(path)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
cfg := &Config{DB: &DBConfig{VacuumOnStartup: true}}
|
||||
store.CheckAutoVacuum(cfg)
|
||||
|
||||
var after int
|
||||
if err := store.db.QueryRow("PRAGMA auto_vacuum").Scan(&after); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if after != 2 {
|
||||
t.Fatalf("expected auto_vacuum=2 after ingestor VACUUM, got %d", after)
|
||||
}
|
||||
}
|
||||
@@ -1,134 +0,0 @@
|
||||
package main
|
||||
|
||||
// Tests for issue #1610: firmware 1.16.0 extended ACK support.
|
||||
//
|
||||
// Wire vectors are synthetic, derived by hand from the firmware spec:
|
||||
// - Variable-length ACK on the wire:
|
||||
// firmware/src/Mesh.cpp:545-575 createAck/createMultiAck (commit f6e6fdaa)
|
||||
// - 5-byte ACK = 4-byte truncated sha256 CRC + 1-byte attempt counter:
|
||||
// firmware/src/helpers/BaseChatMesh.cpp:218-232 (commit f6e6fdaa)
|
||||
// - 6-byte ACK = 5-byte + 1-byte RNG (so identical attempts get unique hash):
|
||||
// firmware/src/helpers/BaseChatMesh.cpp:219-234 (commit a130a95a)
|
||||
// - Multipart ACK inner blob: firmware/src/Mesh.cpp:292-307 — byte0 then
|
||||
// ack bytes, payload_len = 1 + ack_len.
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// --- top-level ACK (decodeAck) ---
|
||||
|
||||
func TestDecodeAckLegacy4Byte(t *testing.T) {
|
||||
// Backwards-compat: 4-byte ACK leaves the new optional fields nil.
|
||||
buf := []byte{0xAA, 0xBB, 0xCC, 0xDD}
|
||||
p := decodeAck(buf)
|
||||
if p.ExtraHash != "ddccbbaa" {
|
||||
t.Errorf("extraHash=%q want ddccbbaa", p.ExtraHash)
|
||||
}
|
||||
if p.AckLen == nil || *p.AckLen != 4 {
|
||||
t.Errorf("ackLen=%v want 4", p.AckLen)
|
||||
}
|
||||
if p.AckAttempt != nil {
|
||||
t.Errorf("ackAttempt=%v want nil for legacy 4-byte ACK", *p.AckAttempt)
|
||||
}
|
||||
if p.AckRand != nil {
|
||||
t.Errorf("ackRand=%v want nil for legacy 4-byte ACK", *p.AckRand)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeAck5ByteExtended(t *testing.T) {
|
||||
// v1.16 sender (commit f6e6fdaa): 4-byte CRC + 1-byte attempt.
|
||||
buf := []byte{0xAA, 0xBB, 0xCC, 0xDD, 0x07}
|
||||
p := decodeAck(buf)
|
||||
if p.ExtraHash != "ddccbbaa" {
|
||||
t.Errorf("extraHash=%q want ddccbbaa", p.ExtraHash)
|
||||
}
|
||||
if p.AckLen == nil || *p.AckLen != 5 {
|
||||
t.Errorf("ackLen=%v want 5", p.AckLen)
|
||||
}
|
||||
if p.AckAttempt == nil || *p.AckAttempt != 7 {
|
||||
t.Errorf("ackAttempt=%v want 7", p.AckAttempt)
|
||||
}
|
||||
if p.AckRand != nil {
|
||||
t.Errorf("ackRand=%v want nil for 5-byte ACK", *p.AckRand)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeAck6ByteExtended(t *testing.T) {
|
||||
// v1.16 sender (commit a130a95a): 4-byte CRC + 1-byte attempt + 1-byte RNG.
|
||||
buf := []byte{0xAA, 0xBB, 0xCC, 0xDD, 0x02, 0x5A}
|
||||
p := decodeAck(buf)
|
||||
if p.ExtraHash != "ddccbbaa" {
|
||||
t.Errorf("extraHash=%q want ddccbbaa", p.ExtraHash)
|
||||
}
|
||||
if p.AckLen == nil || *p.AckLen != 6 {
|
||||
t.Errorf("ackLen=%v want 6", p.AckLen)
|
||||
}
|
||||
if p.AckAttempt == nil || *p.AckAttempt != 2 {
|
||||
t.Errorf("ackAttempt=%v want 2", p.AckAttempt)
|
||||
}
|
||||
if p.AckRand == nil || *p.AckRand != 0x5A {
|
||||
t.Errorf("ackRand=%v want 90", p.AckRand)
|
||||
}
|
||||
}
|
||||
|
||||
// --- multipart-with-ACK (decodeMultipart) ---
|
||||
|
||||
// buildMultipartAckByte0: remaining<<4 | PayloadACK (0x02).
|
||||
func buildMultipartAckByte0(remaining int) byte {
|
||||
return byte((remaining<<4)&0xF0) | byte(PayloadACK&0x0F)
|
||||
}
|
||||
|
||||
func TestDecodeMultipartAck4ByteLegacy(t *testing.T) {
|
||||
// Pre-1.16 inner ACK is 4 bytes → ackLen=4, attempt/rand nil.
|
||||
buf := []byte{buildMultipartAckByte0(3), 0xAA, 0xBB, 0xCC, 0xDD}
|
||||
p := decodeMultipart(buf)
|
||||
if p.InnerAckCrc != "ddccbbaa" {
|
||||
t.Errorf("innerAckCrc=%q want ddccbbaa", p.InnerAckCrc)
|
||||
}
|
||||
if p.InnerAckLen == nil || *p.InnerAckLen != 4 {
|
||||
t.Errorf("innerAckLen=%v want 4", p.InnerAckLen)
|
||||
}
|
||||
if p.InnerAckAttempt != nil {
|
||||
t.Errorf("innerAckAttempt=%v want nil", *p.InnerAckAttempt)
|
||||
}
|
||||
if p.InnerAckRand != nil {
|
||||
t.Errorf("innerAckRand=%v want nil", *p.InnerAckRand)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeMultipartAck5Byte(t *testing.T) {
|
||||
// v1.16: byte0 + 4-byte CRC + 1-byte attempt → payload_len = 6.
|
||||
buf := []byte{buildMultipartAckByte0(1), 0xAA, 0xBB, 0xCC, 0xDD, 0x09}
|
||||
p := decodeMultipart(buf)
|
||||
if p.InnerAckCrc != "ddccbbaa" {
|
||||
t.Errorf("innerAckCrc=%q want ddccbbaa", p.InnerAckCrc)
|
||||
}
|
||||
if p.InnerAckLen == nil || *p.InnerAckLen != 5 {
|
||||
t.Errorf("innerAckLen=%v want 5", p.InnerAckLen)
|
||||
}
|
||||
if p.InnerAckAttempt == nil || *p.InnerAckAttempt != 9 {
|
||||
t.Errorf("innerAckAttempt=%v want 9", p.InnerAckAttempt)
|
||||
}
|
||||
if p.InnerAckRand != nil {
|
||||
t.Errorf("innerAckRand=%v want nil for 5-byte inner ACK", *p.InnerAckRand)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDecodeMultipartAck6Byte(t *testing.T) {
|
||||
// v1.16: byte0 + 4-byte CRC + 1-byte attempt + 1-byte RNG → payload_len = 7.
|
||||
buf := []byte{buildMultipartAckByte0(0), 0xAA, 0xBB, 0xCC, 0xDD, 0x04, 0xC3}
|
||||
p := decodeMultipart(buf)
|
||||
if p.InnerAckCrc != "ddccbbaa" {
|
||||
t.Errorf("innerAckCrc=%q want ddccbbaa", p.InnerAckCrc)
|
||||
}
|
||||
if p.InnerAckLen == nil || *p.InnerAckLen != 6 {
|
||||
t.Errorf("innerAckLen=%v want 6", p.InnerAckLen)
|
||||
}
|
||||
if p.InnerAckAttempt == nil || *p.InnerAckAttempt != 4 {
|
||||
t.Errorf("innerAckAttempt=%v want 4", p.InnerAckAttempt)
|
||||
}
|
||||
if p.InnerAckRand == nil || *p.InnerAckRand != 0xC3 {
|
||||
t.Errorf("innerAckRand=%v want 195", p.InnerAckRand)
|
||||
}
|
||||
}
|
||||
@@ -1,84 +0,0 @@
|
||||
package main
|
||||
|
||||
// Test for issue #1690 — every observation insert must denormalize the
|
||||
// transmission's last_seen so cold-load can filter on effective recency.
|
||||
//
|
||||
// Setup: insert a transmission whose first/last seen are both 7 days ago.
|
||||
// Then insert a fresh observation against the same hash. Post-fix the
|
||||
// transmissions.last_seen column must reflect the new observation time.
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestIssue1690_LastSeenUpdatedOnObservation(t *testing.T) {
|
||||
s, err := OpenStore(tempDBPath(t))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s.Close()
|
||||
|
||||
hash := "abcdef1690cafebabe"
|
||||
weekAgo := time.Now().UTC().Add(-7 * 24 * time.Hour).Format(time.RFC3339)
|
||||
snr, rssi := 5.5, -100.0
|
||||
|
||||
first := &PacketData{
|
||||
RawHex: "0A00",
|
||||
Timestamp: weekAgo,
|
||||
ObserverID: "obs1",
|
||||
Hash: hash,
|
||||
RouteType: 2,
|
||||
PayloadType: 2,
|
||||
PayloadVersion: 0,
|
||||
PathJSON: "[]",
|
||||
DecodedJSON: `{"type":"TXT_MSG"}`,
|
||||
SNR: &snr,
|
||||
RSSI: &rssi,
|
||||
}
|
||||
if _, err := s.InsertTransmission(first); err != nil {
|
||||
t.Fatalf("seed insert: %v", err)
|
||||
}
|
||||
|
||||
// Sanity: confirm the seed last_seen is the 7d-ago time.
|
||||
var seededLastSeen int64
|
||||
if err := s.db.QueryRow(`SELECT COALESCE(last_seen, 0) FROM transmissions WHERE hash = ?`, hash).Scan(&seededLastSeen); err != nil {
|
||||
t.Fatalf("seed select last_seen: %v (column missing? post-fix must add it)", err)
|
||||
}
|
||||
weekAgoUnix, _ := time.Parse(time.RFC3339, weekAgo)
|
||||
if seededLastSeen != weekAgoUnix.Unix() {
|
||||
t.Logf("seed last_seen=%d expected %d (allowed for fresh column)", seededLastSeen, weekAgoUnix.Unix())
|
||||
}
|
||||
|
||||
// New observation: nowSec timestamp.
|
||||
nowSec := time.Now().UTC().Unix()
|
||||
nowStr := time.Unix(nowSec, 0).UTC().Format(time.RFC3339)
|
||||
second := &PacketData{
|
||||
RawHex: "0A00",
|
||||
Timestamp: nowStr,
|
||||
ObserverID: "obs2", // different observer → new observation row
|
||||
Hash: hash,
|
||||
RouteType: 2,
|
||||
PayloadType: 2,
|
||||
PayloadVersion: 0,
|
||||
PathJSON: "[]",
|
||||
DecodedJSON: `{"type":"TXT_MSG"}`,
|
||||
SNR: &snr,
|
||||
RSSI: &rssi,
|
||||
}
|
||||
if _, err := s.InsertTransmission(second); err != nil {
|
||||
t.Fatalf("second insert: %v", err)
|
||||
}
|
||||
|
||||
var ls int64
|
||||
if err := s.db.QueryRow(`SELECT last_seen FROM transmissions WHERE hash = ?`, hash).Scan(&ls); err != nil {
|
||||
t.Fatalf("post-insert select last_seen: %v", err)
|
||||
}
|
||||
// The post-fix writer must bump last_seen to at least the new observation's
|
||||
// epoch second. We allow ±2s slack for the unix-second round trip.
|
||||
if ls < nowSec-2 {
|
||||
t.Errorf("transmissions.last_seen=%d after fresh observation; expected ≥ %d (a recent unix-second). "+
|
||||
"Pre-fix the column is never updated on re-observation — the original cold-load bug (#1690).",
|
||||
ls, nowSec)
|
||||
}
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
package main
|
||||
|
||||
import "fmt"
|
||||
|
||||
// formatStatusLog formats the "status: name (iata)" log line emitted on
|
||||
// MQTT status messages. name + iata are MQTT-controlled and routed
|
||||
// through sanitizeLogString so CR/LF/control bytes cannot inject forged
|
||||
// log lines.
|
||||
//
|
||||
// See audit-input-vulns-20260603 follow-up to #1540 — call site
|
||||
// cmd/ingestor/main.go:531.
|
||||
func formatStatusLog(tag, name, iata string) string {
|
||||
return fmt.Sprintf("MQTT [%s] status: %s (%s)", tag, sanitizeLogString(name), sanitizeLogString(iata))
|
||||
}
|
||||
|
||||
// formatChannelMessageLog formats the "channel message: chN from S" log line
|
||||
// emitted on MQTT channel messages. channelIdx + sender are MQTT-controlled.
|
||||
//
|
||||
// Call site cmd/ingestor/main.go:854.
|
||||
func formatChannelMessageLog(tag, channelIdx, sender string) string {
|
||||
return fmt.Sprintf("MQTT [%s] channel message: ch%s from %s", tag, sanitizeLogString(channelIdx), sanitizeLogString(sender))
|
||||
}
|
||||
|
||||
// formatDirectMessageLog formats the "direct message from S" log line
|
||||
// emitted on MQTT DM messages. sender is MQTT-controlled.
|
||||
//
|
||||
// Call site cmd/ingestor/main.go:940.
|
||||
func formatDirectMessageLog(tag, sender string) string {
|
||||
return fmt.Sprintf("MQTT [%s] direct message from %s", tag, sanitizeLogString(sender))
|
||||
}
|
||||
@@ -1,53 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestFormatStatusLog_SanitizesMQTTFields pins the status log line at
|
||||
// cmd/ingestor/main.go:531 — MQTT-derived name + iata must not be able to
|
||||
// inject CR/LF/control bytes into the log stream.
|
||||
func TestFormatStatusLog_SanitizesMQTTFields(t *testing.T) {
|
||||
got := formatStatusLog("ds1", "evil\r\n[FAKE LOG LINE]", "X\nY")
|
||||
if strings.ContainsAny(got, "\r\n") {
|
||||
t.Fatalf("formatStatusLog leaked CR/LF: %q", got)
|
||||
}
|
||||
if strings.Contains(got, "[FAKE LOG LINE]") && !strings.Contains(got, "?[FAKE LOG LINE]") {
|
||||
t.Fatalf("formatStatusLog passed injection payload through unmodified: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestFormatChannelMessageLog_SanitizesMQTTFields pins
|
||||
// cmd/ingestor/main.go:854 — channelIdx + sender are MQTT-controlled.
|
||||
func TestFormatChannelMessageLog_SanitizesMQTTFields(t *testing.T) {
|
||||
got := formatChannelMessageLog("ds1", "0\r\n[FAKE]", "evil\nguy")
|
||||
if strings.ContainsAny(got, "\r\n") {
|
||||
t.Fatalf("formatChannelMessageLog leaked CR/LF: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestFormatDirectMessageLog_SanitizesMQTTFields pins
|
||||
// cmd/ingestor/main.go:940 — sender is MQTT-controlled.
|
||||
func TestFormatDirectMessageLog_SanitizesMQTTFields(t *testing.T) {
|
||||
got := formatDirectMessageLog("ds1", "evil\r\n[FAKE LOG LINE] something")
|
||||
if strings.ContainsAny(got, "\r\n") {
|
||||
t.Fatalf("formatDirectMessageLog leaked CR/LF: %q", got)
|
||||
}
|
||||
if !strings.Contains(got, "??[FAKE LOG LINE]") {
|
||||
t.Fatalf("formatDirectMessageLog did not sanitize injection payload: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// Sanity: legitimate input passes through untouched apart from tag framing.
|
||||
func TestFormatLogs_LegitInputUnchanged(t *testing.T) {
|
||||
if got := formatStatusLog("ds1", "alpha-node", "BG"); got != "MQTT [ds1] status: alpha-node (BG)" {
|
||||
t.Fatalf("unexpected status line: %q", got)
|
||||
}
|
||||
if got := formatChannelMessageLog("ds1", "3", "bob"); got != "MQTT [ds1] channel message: ch3 from bob" {
|
||||
t.Fatalf("unexpected channel line: %q", got)
|
||||
}
|
||||
if got := formatDirectMessageLog("ds1", "bob"); got != "MQTT [ds1] direct message from bob" {
|
||||
t.Fatalf("unexpected DM line: %q", got)
|
||||
}
|
||||
}
|
||||
+94
-718
File diff suppressed because it is too large
Load Diff
+33
-438
@@ -1,19 +1,12 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"database/sql"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
mqtt "github.com/eclipse/paho.mqtt.golang"
|
||||
)
|
||||
|
||||
func TestToFloat64(t *testing.T) {
|
||||
@@ -137,7 +130,7 @@ func TestHandleMessageRawPacket(t *testing.T) {
|
||||
payload := []byte(`{"raw":"` + rawHex + `","SNR":5.5,"RSSI":-100.0,"origin":"myobs"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
@@ -154,7 +147,7 @@ func TestHandleMessageRawPacketAdvert(t *testing.T) {
|
||||
payload := []byte(`{"raw":"` + rawHex + `"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
// Should create a node from the ADVERT
|
||||
var count int
|
||||
@@ -176,7 +169,7 @@ func TestHandleMessageInvalidJSON(t *testing.T) {
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: []byte(`not json`)}
|
||||
|
||||
// Should not panic
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
@@ -193,7 +186,7 @@ func TestHandleMessageStatusTopic(t *testing.T) {
|
||||
payload: []byte(`{"origin":"MyObserver"}`),
|
||||
}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var name, iata string
|
||||
err := store.db.QueryRow("SELECT name, iata FROM observers WHERE id = 'obs1'").Scan(&name, &iata)
|
||||
@@ -214,11 +207,11 @@ func TestHandleMessageSkipStatusTopics(t *testing.T) {
|
||||
|
||||
// meshcore/status should be skipped
|
||||
msg1 := &mockMessage{topic: "meshcore/status", payload: []byte(`{"raw":"0A00"}`)}
|
||||
handleMessage(store, "test", source, msg1, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg1, nil, &Config{})
|
||||
|
||||
// meshcore/events/connection should be skipped
|
||||
msg2 := &mockMessage{topic: "meshcore/events/connection", payload: []byte(`{"raw":"0A00"}`)}
|
||||
handleMessage(store, "test", source, msg2, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg2, nil, &Config{})
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
@@ -237,7 +230,7 @@ func TestHandleMessageIATAFilter(t *testing.T) {
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
@@ -250,7 +243,7 @@ func TestHandleMessageIATAFilter(t *testing.T) {
|
||||
topic: "meshcore/LAX/obs2/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg2, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg2, nil, &Config{})
|
||||
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
if count != 1 {
|
||||
@@ -268,7 +261,7 @@ func TestHandleMessageIATAFilterNoRegion(t *testing.T) {
|
||||
topic: "meshcore",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
// No region part → filter doesn't apply, message goes through
|
||||
// Actually the code checks len(parts) > 1 for IATA filter
|
||||
@@ -284,7 +277,7 @@ func TestHandleMessageNoRawHex(t *testing.T) {
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"type":"companion","data":"something"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
@@ -302,7 +295,7 @@ func TestHandleMessageBadRawHex(t *testing.T) {
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"ZZZZ"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
@@ -319,7 +312,7 @@ func TestHandleMessageWithSNRRSSIAsNumbers(t *testing.T) {
|
||||
payload := []byte(`{"raw":"` + rawHex + `","SNR":7.2,"RSSI":-95}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var snr, rssi *float64
|
||||
store.db.QueryRow("SELECT snr, rssi FROM observations LIMIT 1").Scan(&snr, &rssi)
|
||||
@@ -338,7 +331,7 @@ func TestHandleMessageMinimalTopic(t *testing.T) {
|
||||
topic: "meshcore/SJC",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
@@ -359,7 +352,7 @@ func TestHandleMessageCorruptedAdvert(t *testing.T) {
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
// Transmission should be inserted (even if advert is invalid)
|
||||
var count int
|
||||
@@ -385,7 +378,7 @@ func TestHandleMessageNoObserverID(t *testing.T) {
|
||||
topic: "packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `","origin":"obs1"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
@@ -407,7 +400,7 @@ func TestHandleMessageSNRNotFloat(t *testing.T) {
|
||||
// SNR as a string value — should not parse as float
|
||||
payload := []byte(`{"raw":"` + rawHex + `","SNR":"bad","RSSI":"bad"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
@@ -423,7 +416,7 @@ func TestHandleMessageOriginExtraction(t *testing.T) {
|
||||
rawHex := "0A00D69FD7A5A7475DB07337749AE61FA53A4788E976"
|
||||
payload := []byte(`{"raw":"` + rawHex + `","origin":"MyOrigin"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
// Verify origin was extracted to observer name
|
||||
var name string
|
||||
@@ -446,7 +439,7 @@ func TestHandleMessagePanicRecovery(t *testing.T) {
|
||||
}
|
||||
|
||||
// Should not panic — the defer/recover should catch it
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
}
|
||||
|
||||
func TestHandleMessageStatusOriginFallback(t *testing.T) {
|
||||
@@ -458,7 +451,7 @@ func TestHandleMessageStatusOriginFallback(t *testing.T) {
|
||||
topic: "meshcore/SJC/obs1/status",
|
||||
payload: []byte(`{"type":"status"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var name string
|
||||
err := store.db.QueryRow("SELECT name FROM observers WHERE id = 'obs1'").Scan(&name)
|
||||
@@ -484,20 +477,18 @@ func TestEpochToISO(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestAdvertRole(t *testing.T) {
|
||||
// advertRole now keys off AdvertFlags.Type (firmware ADV_TYPE_*) — see
|
||||
// firmware/src/helpers/AdvertDataHelpers.h:7-12 and issue #1279 P1 #3.
|
||||
tests := []struct {
|
||||
name string
|
||||
flags *AdvertFlags
|
||||
want string
|
||||
}{
|
||||
{"none (type 0)", &AdvertFlags{Type: 0}, "none"},
|
||||
{"companion (type 1)", &AdvertFlags{Type: 1, Chat: true}, "companion"},
|
||||
{"repeater (type 2)", &AdvertFlags{Type: 2, Repeater: true}, "repeater"},
|
||||
{"room (type 3)", &AdvertFlags{Type: 3, Room: true}, "room"},
|
||||
{"sensor (type 4)", &AdvertFlags{Type: 4, Sensor: true}, "sensor"},
|
||||
{"future type-5", &AdvertFlags{Type: 5}, "type-5"},
|
||||
{"nil flags falls back to companion", nil, "companion"},
|
||||
{"repeater", &AdvertFlags{Repeater: true}, "repeater"},
|
||||
{"room", &AdvertFlags{Room: true}, "room"},
|
||||
{"sensor", &AdvertFlags{Sensor: true}, "sensor"},
|
||||
{"companion (default)", &AdvertFlags{Chat: true}, "companion"},
|
||||
{"companion (no flags)", &AdvertFlags{}, "companion"},
|
||||
{"repeater takes priority", &AdvertFlags{Repeater: true, Room: true}, "repeater"},
|
||||
{"room before sensor", &AdvertFlags{Room: true, Sensor: true}, "room"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -616,41 +607,8 @@ func TestLoadChannelKeysHashChannelsNormalization(t *testing.T) {
|
||||
if _, ok := keys["#Spaced"]; !ok {
|
||||
t.Error("should derive key for #Spaced (trimmed)")
|
||||
}
|
||||
// 3 derived + builtins (Public)
|
||||
expected := 3 + len(builtinChannelKeys())
|
||||
if len(keys) != expected {
|
||||
t.Errorf("expected %d keys, got %d", expected, len(keys))
|
||||
}
|
||||
}
|
||||
|
||||
// Default Public channel must always be present from the built-in floor,
|
||||
// regardless of whether a rainbow file is provided.
|
||||
func TestLoadChannelKeysBuiltinPublic(t *testing.T) {
|
||||
t.Setenv("CHANNEL_KEYS_PATH", "")
|
||||
dir := t.TempDir()
|
||||
cfgPath := filepath.Join(dir, "config.json")
|
||||
cfg := &Config{}
|
||||
|
||||
keys := loadChannelKeys(cfg, cfgPath)
|
||||
|
||||
if got := keys["Public"]; got != "8b3387e9c5cdea6ac9e5edbaa115cd72" {
|
||||
t.Errorf("Public key = %q, want firmware-default 8b3387e9c5cdea6ac9e5edbaa115cd72", got)
|
||||
}
|
||||
}
|
||||
|
||||
// Explicit config and rainbow entries must still override the built-in floor.
|
||||
func TestLoadChannelKeysBuiltinOverridable(t *testing.T) {
|
||||
t.Setenv("CHANNEL_KEYS_PATH", "")
|
||||
dir := t.TempDir()
|
||||
cfgPath := filepath.Join(dir, "config.json")
|
||||
cfg := &Config{
|
||||
ChannelKeys: map[string]string{"Public": "deadbeefdeadbeefdeadbeefdeadbeef"},
|
||||
}
|
||||
|
||||
keys := loadChannelKeys(cfg, cfgPath)
|
||||
|
||||
if got := keys["Public"]; got != "deadbeefdeadbeefdeadbeefdeadbeef" {
|
||||
t.Errorf("Public key = %q, want explicit override deadbeef...", got)
|
||||
if len(keys) != 3 {
|
||||
t.Errorf("expected 3 keys, got %d", len(keys))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -682,7 +640,7 @@ func TestHandleMessageWithLowercaseSNRRSSI(t *testing.T) {
|
||||
payload := []byte(`{"raw":"` + rawHex + `","snr":5.5,"rssi":-102}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var snr, rssi *float64
|
||||
store.db.QueryRow("SELECT snr, rssi FROM observations LIMIT 1").Scan(&snr, &rssi)
|
||||
@@ -703,7 +661,7 @@ func TestHandleMessageSNRRSSIUppercaseWins(t *testing.T) {
|
||||
payload := []byte(`{"raw":"` + rawHex + `","SNR":7.2,"snr":1.0,"RSSI":-95,"rssi":-50}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var snr, rssi *float64
|
||||
store.db.QueryRow("SELECT snr, rssi FROM observations LIMIT 1").Scan(&snr, &rssi)
|
||||
@@ -723,7 +681,7 @@ func TestHandleMessageNoSNRRSSI(t *testing.T) {
|
||||
payload := []byte(`{"raw":"` + rawHex + `"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs1/packets", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var snr, rssi *float64
|
||||
store.db.QueryRow("SELECT snr, rssi FROM observations LIMIT 1").Scan(&snr, &rssi)
|
||||
@@ -794,7 +752,7 @@ func TestIATAFilterDoesNotDropStatusMessages(t *testing.T) {
|
||||
topic: "meshcore/BFL/bfl-obs1/status",
|
||||
payload: []byte(`{"origin":"BFLObserver","stats":{"noise_floor":-105.0}}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, msg, nil, &Config{})
|
||||
|
||||
var name string
|
||||
var noiseFloor *float64
|
||||
@@ -815,373 +773,10 @@ func TestIATAFilterDoesNotDropStatusMessages(t *testing.T) {
|
||||
topic: "meshcore/BFL/bfl-obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, pktMsg, nil, nil, &Config{})
|
||||
handleMessage(store, "test", source, pktMsg, nil, &Config{})
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&count)
|
||||
if count != 0 {
|
||||
t.Error("packet from out-of-region BFL should still be filtered by IATA")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadRegionKeys(t *testing.T) {
|
||||
cfg := &Config{HashRegions: []string{"#belgium", "eu", " #Test ", "", "#belgium"}}
|
||||
keys := loadRegionKeys(cfg)
|
||||
|
||||
// Deduplication + normalization
|
||||
if len(keys) != 3 {
|
||||
t.Fatalf("len(keys) = %d, want 3", len(keys))
|
||||
}
|
||||
// Pre-computed: SHA256("#belgium")[:16]. Hardcoded so a change to the key
|
||||
// derivation algorithm (hash function, truncation length) breaks this test
|
||||
// even if both sides were updated together.
|
||||
wantBelgium, _ := hex.DecodeString("7085b78ed010599094f8c8e7d1aa0e27")
|
||||
if got := keys["#belgium"]; !bytes.Equal(got, wantBelgium) {
|
||||
t.Errorf("#belgium key mismatch: got %x, want %x", got, wantBelgium)
|
||||
}
|
||||
// "eu" should be normalized to "#eu"
|
||||
if _, ok := keys["#eu"]; !ok {
|
||||
t.Error("expected #eu key")
|
||||
}
|
||||
// " #Test " should be normalized to "#Test"
|
||||
if _, ok := keys["#Test"]; !ok {
|
||||
t.Error("expected #Test key")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatchScope(t *testing.T) {
|
||||
// Fixed known-answer vectors only — no in-test HMAC computation.
|
||||
// Keys and Code1 values are pre-computed externally so a wrong algorithm
|
||||
// that produces consistent wrong results on both sides would still fail.
|
||||
|
||||
// Vector 1: "#test"/payloadType=5/"hello" → Code1=2AB5
|
||||
// Key = SHA256("#test")[:16] = 9cd8fcf22a47333b591d96a2b848b73f
|
||||
testKey, _ := hex.DecodeString("9cd8fcf22a47333b591d96a2b848b73f")
|
||||
testKeys := map[string][]byte{"#test": testKey}
|
||||
if got := matchScope(testKeys, 5, []byte("hello"), "2AB5"); got != "#test" {
|
||||
t.Errorf("#test vector: matchScope = %q, want #test", got)
|
||||
}
|
||||
|
||||
// Vector 2: "#belgium"/payloadType=5/"hello" → Code1=4A75
|
||||
// Key = SHA256("#belgium")[:16] = 7085b78ed010599094f8c8e7d1aa0e27
|
||||
belgiumKey, _ := hex.DecodeString("7085b78ed010599094f8c8e7d1aa0e27")
|
||||
belgiumKeys := map[string][]byte{"#belgium": belgiumKey}
|
||||
if got := matchScope(belgiumKeys, 5, []byte("hello"), "4A75"); got != "#belgium" {
|
||||
t.Errorf("#belgium vector: matchScope = %q, want #belgium", got)
|
||||
}
|
||||
|
||||
// Code1=0000 (unscoped transport) → no region matched
|
||||
if got := matchScope(belgiumKeys, 5, []byte("hello"), "0000"); got != "" {
|
||||
t.Errorf("unscoped: matchScope = %q, want empty", got)
|
||||
}
|
||||
|
||||
// Code1 present but matches no configured region → empty string
|
||||
if got := matchScope(belgiumKeys, 5, []byte("hello"), "BEEF"); got != "" {
|
||||
t.Errorf("no match: matchScope = %q, want empty", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildPacketDataScopeMatching(t *testing.T) {
|
||||
// Fixed known-answer packet: TRANSPORT_FLOOD, payloadType=5, payload="hello",
|
||||
// Code1=2AB5 (pre-computed for region "#test").
|
||||
// header=0x14 (route_type=0 FLOOD, payloadType=5 → 5<<2), Code1=[0x2A,0xB5],
|
||||
// Code2=[0,0], path_len=0, payload="hello" (68 65 6C 6C 6F).
|
||||
const rawHex = "142AB500000068656C6C6F"
|
||||
key, _ := hex.DecodeString("9cd8fcf22a47333b591d96a2b848b73f") // SHA256("#test")[:16]
|
||||
regionKeys := map[string][]byte{"#test": key}
|
||||
|
||||
decoded, err := DecodePacket(rawHex, nil, false)
|
||||
if err != nil {
|
||||
t.Fatalf("DecodePacket: %v", err)
|
||||
}
|
||||
|
||||
msg := &MQTTPacketMessage{Raw: rawHex}
|
||||
pktData := BuildPacketData(msg, decoded, "obs1", "region1", regionKeys)
|
||||
if pktData.ScopeName != "#test" {
|
||||
t.Errorf("ScopeName = %q, want #test", pktData.ScopeName)
|
||||
}
|
||||
if !pktData.IsTransportScoped {
|
||||
t.Error("IsTransportScoped should be true")
|
||||
}
|
||||
}
|
||||
|
||||
// TestMQTTConnectRetryTimeoutDoesNotBlock verifies that WaitTimeout returns within
|
||||
// the deadline for an unreachable broker when ConnectRetry=true (#910). Previously,
|
||||
// token.Wait() would block forever in this configuration.
|
||||
func TestMQTTConnectRetryTimeoutDoesNotBlock(t *testing.T) {
|
||||
opts := mqtt.NewClientOptions().
|
||||
AddBroker("tcp://127.0.0.1:1"). // port 1 — nothing listening, fast refusal
|
||||
SetConnectRetry(true).
|
||||
SetAutoReconnect(true)
|
||||
|
||||
client := mqtt.NewClient(opts)
|
||||
token := client.Connect()
|
||||
defer client.Disconnect(100)
|
||||
|
||||
start := time.Now()
|
||||
connected := token.WaitTimeout(3 * time.Second)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
if connected {
|
||||
t.Skip("port 1 unexpectedly accepted a connection — skipping")
|
||||
}
|
||||
if elapsed > 4*time.Second {
|
||||
t.Errorf("WaitTimeout blocked for %v — token.Wait() would block forever with ConnectRetry=true", elapsed)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBL1_GoroutineLeakOnHardFailure reproduces BLOCKER 1: without Disconnect()
|
||||
// on the error path, Paho's internal retry goroutines leak when a client is
|
||||
// discarded after Connect() with ConnectRetry=true.
|
||||
//
|
||||
// We prove the leak by creating N clients WITHOUT Disconnect — goroutines grow
|
||||
// proportionally. The fix (client.Disconnect(0) before continue) prevents this.
|
||||
func TestBL1_GoroutineLeakOnHardFailure(t *testing.T) {
|
||||
runtime.GC()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
baseline := runtime.NumGoroutine()
|
||||
|
||||
// Create multiple clients connected to unreachable broker, WITHOUT disconnecting.
|
||||
// Each one spawns Paho retry goroutines that accumulate.
|
||||
const numClients = 10
|
||||
clients := make([]mqtt.Client, numClients)
|
||||
for i := 0; i < numClients; i++ {
|
||||
opts := mqtt.NewClientOptions().
|
||||
AddBroker("tcp://127.0.0.1:1").
|
||||
SetConnectRetry(true).
|
||||
SetAutoReconnect(true).
|
||||
SetConnectTimeout(500 * time.Millisecond)
|
||||
c := mqtt.NewClient(opts)
|
||||
tok := c.Connect()
|
||||
tok.WaitTimeout(1 * time.Second)
|
||||
clients[i] = c
|
||||
}
|
||||
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
leaked := runtime.NumGoroutine()
|
||||
goroutineGrowth := leaked - baseline
|
||||
|
||||
// Clean up to not actually leak in test
|
||||
for _, c := range clients {
|
||||
c.Disconnect(0)
|
||||
}
|
||||
|
||||
t.Logf("baseline=%d, after %d undisconnected clients=%d, growth=%d",
|
||||
baseline, numClients, leaked, goroutineGrowth)
|
||||
|
||||
// With ConnectRetry=true, each Connect() spawns retry goroutines.
|
||||
// Without Disconnect, these accumulate. Verify growth is meaningful.
|
||||
if goroutineGrowth < 3 {
|
||||
t.Skip("Connect didn't spawn enough extra goroutines to measure leak")
|
||||
}
|
||||
|
||||
// The fix: calling client.Disconnect(0) on the error path prevents accumulation.
|
||||
// Anti-tautology: removing the Disconnect(0) call from main.go's error path
|
||||
// would cause goroutine accumulation proportional to failed broker count.
|
||||
t.Logf("CONFIRMED: %d leaked goroutines from %d clients without Disconnect — fix adds Disconnect(0) on error path", goroutineGrowth, numClients)
|
||||
}
|
||||
|
||||
// TestBL2_ZeroConnectedFatals verifies BLOCKER 2: when all brokers are unreachable,
|
||||
// connectedCount==0 must be detected. We test the logic directly — if only timed-out
|
||||
// clients exist (appended to clients slice) but connectedCount is 0, the guard triggers.
|
||||
func TestBL2_ZeroConnectedFatals(t *testing.T) {
|
||||
// Simulate the connection loop result: 1 timed-out client, 0 connected
|
||||
var clients []mqtt.Client
|
||||
connectedCount := 0
|
||||
|
||||
// Create a client that times out (unreachable broker)
|
||||
opts := mqtt.NewClientOptions().
|
||||
AddBroker("tcp://127.0.0.1:1").
|
||||
SetConnectRetry(true).
|
||||
SetAutoReconnect(true)
|
||||
|
||||
client := mqtt.NewClient(opts)
|
||||
token := client.Connect()
|
||||
if !token.WaitTimeout(2 * time.Second) {
|
||||
// Timed out — PR #926 appends to clients
|
||||
clients = append(clients, client)
|
||||
}
|
||||
defer func() {
|
||||
for _, c := range clients {
|
||||
c.Disconnect(0)
|
||||
}
|
||||
}()
|
||||
|
||||
// OLD bug: len(clients) == 0 would be false (1 timed-out client in list)
|
||||
// → ingestor would silently run with zero connections
|
||||
if len(clients) == 0 {
|
||||
t.Fatal("expected timed-out client to be in clients slice")
|
||||
}
|
||||
|
||||
// NEW fix: connectedCount == 0 catches this
|
||||
if connectedCount != 0 {
|
||||
t.Errorf("connectedCount should be 0, got %d", connectedCount)
|
||||
}
|
||||
|
||||
// The real code does: if connectedCount == 0 { log.Fatal(...) }
|
||||
// This test proves len(clients) > 0 but connectedCount == 0 — the old guard
|
||||
// would have missed it.
|
||||
if len(clients) > 0 && connectedCount == 0 {
|
||||
t.Log("BL2 confirmed: old guard len(clients)==0 would NOT fatal; new guard connectedCount==0 correctly catches zero-connected state")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleMessageObserverIATAWhitelist(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
cfg := &Config{
|
||||
ObserverIATAWhitelist: []string{"ARN"},
|
||||
}
|
||||
|
||||
// Message from non-whitelisted region GOT — should be dropped
|
||||
handleMessage(store, "test", source, &mockMessage{
|
||||
topic: "meshcore/GOT/obs1/status",
|
||||
payload: []byte(`{"origin":"node1","noise_floor":-110}`),
|
||||
}, nil, nil, cfg)
|
||||
|
||||
var count int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM observers WHERE id='obs1'").Scan(&count)
|
||||
if count != 0 {
|
||||
t.Error("observer from non-whitelisted IATA GOT should be dropped")
|
||||
}
|
||||
|
||||
// Message from whitelisted region ARN — should be accepted
|
||||
handleMessage(store, "test", source, &mockMessage{
|
||||
topic: "meshcore/ARN/obs2/status",
|
||||
payload: []byte(`{"origin":"node2","noise_floor":-105}`),
|
||||
}, nil, nil, cfg)
|
||||
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM observers WHERE id='obs2'").Scan(&count)
|
||||
if count != 1 {
|
||||
t.Errorf("observer from whitelisted IATA ARN should be accepted, got count=%d", count)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBuildPacketDataScopeMatchingNoMatch covers the #1534 regression: a
|
||||
// transport-scoped advert from a non-matching region carries
|
||||
// IsTransportScoped=true and ScopeName="". The default_scope update guard
|
||||
// must skip these packets so previously-correct scopes aren't overwritten
|
||||
// with the empty string.
|
||||
func TestBuildPacketDataScopeMatchingNoMatch(t *testing.T) {
|
||||
// Code1=2AB5 is the precomputed code for region "#test" (payload="hello",
|
||||
// payloadType=5). Build a region-key map for a DIFFERENT region so
|
||||
// matchScope() finds no match and returns "".
|
||||
const rawHex = "142AB500000068656C6C6F"
|
||||
otherKey, _ := hex.DecodeString("aabbccddeeff00112233445566778899")
|
||||
regionKeys := map[string][]byte{"#other": otherKey}
|
||||
|
||||
decoded, err := DecodePacket(rawHex, nil, false)
|
||||
if err != nil {
|
||||
t.Fatalf("DecodePacket: %v", err)
|
||||
}
|
||||
msg := &MQTTPacketMessage{Raw: rawHex}
|
||||
pktData := BuildPacketData(msg, decoded, "obs1", "region1", regionKeys)
|
||||
|
||||
if !pktData.IsTransportScoped {
|
||||
t.Fatalf("precondition: IsTransportScoped should be true (Code1 != 0000)")
|
||||
}
|
||||
if pktData.ScopeName != "" {
|
||||
t.Fatalf("precondition: ScopeName should be empty (no region match), got %q", pktData.ScopeName)
|
||||
}
|
||||
|
||||
// Regression assertion: when ScopeName is empty, the guard must skip the
|
||||
// UpdateNodeDefaultScope call so an empty value never overwrites a
|
||||
// previously-correct default_scope (#1534).
|
||||
if shouldUpdateDefaultScope(pktData) {
|
||||
t.Errorf("shouldUpdateDefaultScope = true for empty ScopeName; want false (would overwrite default_scope with \"\")")
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleMessageAdvert_EmptyScopeSkipsDefaultScopeUpdate is the call-site
|
||||
// regression test for #1534. It drives a transport-scoped ADVERT whose
|
||||
// region key does NOT match any configured region (so ScopeName=="") through
|
||||
// handleMessage end-to-end and asserts that a pre-existing default_scope on
|
||||
// the node is NOT overwritten with the empty string. This anchors the
|
||||
// call-site guard at main.go:720 — a future refactor that drops the
|
||||
// `if shouldUpdateDefaultScope(...)` wrapper and calls
|
||||
// `store.UpdateNodeDefaultScope(pubkey, pktData.ScopeName)` unconditionally
|
||||
// would re-introduce the #1534 bug and fail this test.
|
||||
func TestHandleMessageAdvert_EmptyScopeSkipsDefaultScopeUpdate(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
// A transport-scoped ADVERT: header byte 0x10 = route_type 0
|
||||
// (TRANSPORT_FLOOD) + payload_type 4 (ADVERT). Code1=AABB (non-zero, so
|
||||
// IsTransportScoped becomes true), Code2=0000, path_byte=00, then a
|
||||
// 100-byte ADVERT payload (32-byte pubkey starting 46D62D… + 4-byte ts
|
||||
// + 64-byte signature) reused from TestHandleMessageAdvertWithTelemetry.
|
||||
const rawHex = "10AABB00000046D62DE27D4C5194D7821FC5A34A45565DCC2537B300B9AB6275255CEFB65D840CE5C169C94C9AED39E8BCB6CB6EB0335497A198B33A1A610CD3B03D8DCFC160900E5244280323EE0B44CACAB8F02B5B38B91CFA18BD067B0B5E63E94CFC85F758A8530B9240933402E0E6B8F84D5252322D52"
|
||||
const pubkey = "46d62de27d4c5194d7821fc5a34a45565dcc2537b300b9ab6275255cefb65d84"
|
||||
|
||||
// Pre-seed the node with a non-empty default_scope so we can detect an
|
||||
// erroneous overwrite with "".
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, default_scope) VALUES (?, 'Node1', '#belgium')`, pubkey); err != nil {
|
||||
t.Fatalf("seed node: %v", err)
|
||||
}
|
||||
|
||||
// Empty regionKeys → matchScope() returns "" for any Code1 → ScopeName "".
|
||||
msg := &mockMessage{
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, map[string][]byte{}, &Config{})
|
||||
|
||||
var got sql.NullString
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = ?`, pubkey).Scan(&got); err != nil {
|
||||
t.Fatalf("read default_scope: %v", err)
|
||||
}
|
||||
if !got.Valid || got.String != "#belgium" {
|
||||
t.Errorf("default_scope after empty-scope advert = %q (valid=%v), want #belgium — call-site guard at main.go:720 is missing or broken (#1534)", got.String, got.Valid)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleMessageAdvert_MatchedScopeUpdatesDefaultScope is the positive
|
||||
// counterpart: a transport-scoped ADVERT whose Code1 matches a configured
|
||||
// region key MUST cause default_scope to be updated to the matched region
|
||||
// name. Together with the empty-scope test above this proves the call-site
|
||||
// branch routes correctly for both ScopeName states.
|
||||
func TestHandleMessageAdvert_MatchedScopeUpdatesDefaultScope(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
// Same ADVERT bytes; this time we compute the matching region key for
|
||||
// the (payloadType=4, payload=<advert bytes>) tuple so matchScope() will
|
||||
// return "#de".
|
||||
const advertBytes = "46D62DE27D4C5194D7821FC5A34A45565DCC2537B300B9AB6275255CEFB65D840CE5C169C94C9AED39E8BCB6CB6EB0335497A198B33A1A610CD3B03D8DCFC160900E5244280323EE0B44CACAB8F02B5B38B91CFA18BD067B0B5E63E94CFC85F758A8530B9240933402E0E6B8F84D5252322D52"
|
||||
const pubkey = "46d62de27d4c5194d7821fc5a34a45565dcc2537b300b9ab6275255cefb65d84"
|
||||
|
||||
advertRaw, _ := hex.DecodeString(advertBytes)
|
||||
// Derive the region key whose HMAC produces Code1 we can plant in the
|
||||
// header. Choose key = first 16 bytes of HMAC-SHA256(zeros, advertBytes)
|
||||
// is non-deterministic to find; instead pick an arbitrary key and
|
||||
// compute Code1 from it, then build the packet around that Code1.
|
||||
regionKey, _ := hex.DecodeString("0123456789abcdef0123456789abcdef")
|
||||
mac := hmacSHA256(regionKey, append([]byte{4}, advertRaw...))
|
||||
// Per firmware (#1534 helper logic): Code1 is the first 2 bytes of the
|
||||
// HMAC, sentinel-shifted so 0x0000 → 0x0001 and 0xFFFF → 0xFFFE.
|
||||
code := uint16(mac[0]) | (uint16(mac[1]) << 8)
|
||||
if code == 0x0000 {
|
||||
code = 0x0001
|
||||
} else if code == 0xFFFF {
|
||||
code = 0xFFFE
|
||||
}
|
||||
code1 := fmt.Sprintf("%02X%02X", byte(code&0xFF), byte(code>>8))
|
||||
rawHex := "10" + code1 + "000000" + advertBytes
|
||||
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, default_scope) VALUES (?, 'Node1', '#old')`, pubkey); err != nil {
|
||||
t.Fatalf("seed node: %v", err)
|
||||
}
|
||||
|
||||
msg := &mockMessage{
|
||||
topic: "meshcore/SJC/obs1/packets",
|
||||
payload: []byte(`{"raw":"` + rawHex + `"}`),
|
||||
}
|
||||
handleMessage(store, "test", source, msg, nil, map[string][]byte{"#de": regionKey}, &Config{})
|
||||
|
||||
var got sql.NullString
|
||||
if err := store.db.QueryRow(`SELECT default_scope FROM nodes WHERE public_key = ?`, pubkey).Scan(&got); err != nil {
|
||||
t.Fatalf("read default_scope: %v", err)
|
||||
}
|
||||
if !got.Valid || got.String != "#de" {
|
||||
t.Errorf("default_scope after matched-scope advert = %q (valid=%v), want #de", got.String, got.Valid)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,221 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/meshcore-analyzer/dbschema"
|
||||
)
|
||||
|
||||
// PruneOldPackets deletes transmissions (and their child observations)
|
||||
// older than `days`. Returns count of transmissions deleted.
|
||||
//
|
||||
// Owned by the ingestor per #1283: the writer process is the only one
|
||||
// allowed to hold the DB write lock; previously this lived in
|
||||
// cmd/server/db.go and raced ingestor INSERTs (SQLITE_BUSY).
|
||||
func (s *Store) PruneOldPackets(days int) (int64, error) {
|
||||
if days <= 0 {
|
||||
return 0, nil
|
||||
}
|
||||
cutoff := time.Now().UTC().AddDate(0, 0, -days).Format(time.RFC3339)
|
||||
|
||||
// Tagged for writer-perf visibility (#1340).
|
||||
var n int64
|
||||
err := s.WriterTx("prune_packets", func(tx *sql.Tx) error {
|
||||
// Delete child observations first (no CASCADE in SQLite).
|
||||
if _, err := tx.Exec(`DELETE FROM observations WHERE transmission_id IN (
|
||||
SELECT id FROM transmissions WHERE first_seen < ?
|
||||
)`, cutoff); err != nil {
|
||||
return fmt.Errorf("prune observations: %w", err)
|
||||
}
|
||||
|
||||
res, err := tx.Exec(`DELETE FROM transmissions WHERE first_seen < ?`, cutoff)
|
||||
if err != nil {
|
||||
return fmt.Errorf("prune transmissions: %w", err)
|
||||
}
|
||||
n, _ = res.RowsAffected()
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if n > 0 {
|
||||
log.Printf("[prune] deleted %d transmissions older than %d days", n, days)
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// SoftDeleteBlacklistedObservers marks observers in the blacklist as
|
||||
// inactive=1 so they are hidden from API responses. Owned by ingestor
|
||||
// per #1287. Runs once at startup.
|
||||
func (s *Store) SoftDeleteBlacklistedObservers(blacklist []string) {
|
||||
n, err := dbschema.SoftDeleteBlacklistedObservers(s.db, blacklist)
|
||||
if err != nil {
|
||||
log.Printf("[observer-blacklist] warning: soft-delete failed: %v", err)
|
||||
return
|
||||
}
|
||||
if n > 0 {
|
||||
log.Printf("[observer-blacklist] soft-deleted %d blacklisted observer(s)", n)
|
||||
}
|
||||
}
|
||||
|
||||
// PruneNeighborEdges deletes rows older than maxAgeDays from
|
||||
// neighbor_edges. Owned by the ingestor per #1287 (was in cmd/server).
|
||||
// Returns DB rows deleted.
|
||||
func (s *Store) PruneNeighborEdges(maxAgeDays int) (int64, error) {
|
||||
if maxAgeDays <= 0 {
|
||||
return 0, nil
|
||||
}
|
||||
cutoff := time.Now().UTC().Add(-time.Duration(maxAgeDays) * 24 * time.Hour).Format(time.RFC3339)
|
||||
res, err := s.db.Exec("DELETE FROM neighbor_edges WHERE last_seen < ?", cutoff)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("prune neighbor_edges: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
if n > 0 {
|
||||
log.Printf("[neighbor-prune] removed %d DB rows older than %d days", n, maxAgeDays)
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// ─── from_pubkey backfill (#1143) ──────────────────────────────────────────
|
||||
//
|
||||
// Moved from cmd/server/from_pubkey_migration.go in #1287. Runs from the
|
||||
// ingestor's maintenance loop. Populates transmissions.from_pubkey for
|
||||
// ADVERT rows whose value is still NULL, by parsing decoded_json.pubKey.
|
||||
|
||||
// FromPubkeyBackfillStats is the progress snapshot of the from_pubkey
// backfill, shaped for JSON exposure (e.g. via /api/healthz). Per the original
// notes, the ingestor publishes it through stats_file.go so the server can
// read it without writing to the database.
type FromPubkeyBackfillStats struct {
	// Total is the number of rows the backfill set out to process.
	Total int64 `json:"total"`
	// Processed is the number of rows handled so far.
	Processed int64 `json:"processed"`
	// Done reports whether the backfill has finished.
	Done bool `json:"done"`
}
|
||||
|
||||
// BackfillFromPubkey scans transmissions where from_pubkey IS NULL and
|
||||
// payload_type = 4 (ADVERT) and populates from_pubkey from decoded_json.
|
||||
// Chunked + yields between batches. Safe to call repeatedly; once a row
|
||||
// is set to either "" or hex it never matches the WHERE clause again.
|
||||
func (s *Store) BackfillFromPubkey(chunkSize int, yieldDuration time.Duration, progress func(total, processed int64, done bool)) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("[backfill] from_pubkey panic recovered: %v", r)
|
||||
}
|
||||
if progress != nil {
|
||||
progress(0, 0, true) // signal done; values overwritten below if collected
|
||||
}
|
||||
}()
|
||||
if chunkSize <= 0 {
|
||||
chunkSize = 5000
|
||||
}
|
||||
|
||||
var total int64
|
||||
if err := s.db.QueryRow(
|
||||
"SELECT COUNT(*) FROM transmissions WHERE from_pubkey IS NULL AND payload_type = 4",
|
||||
).Scan(&total); err != nil {
|
||||
log.Printf("[backfill] from_pubkey count error: %v", err)
|
||||
return
|
||||
}
|
||||
if total == 0 {
|
||||
log.Println("[backfill] from_pubkey: nothing to do")
|
||||
if progress != nil {
|
||||
progress(0, 0, true)
|
||||
}
|
||||
return
|
||||
}
|
||||
if progress != nil {
|
||||
progress(total, 0, false)
|
||||
}
|
||||
log.Printf("[backfill] from_pubkey starting: %d ADVERT rows", total)
|
||||
|
||||
stmt, err := s.db.Prepare("UPDATE transmissions SET from_pubkey = ? WHERE id = ?")
|
||||
if err != nil {
|
||||
log.Printf("[backfill] from_pubkey prepare: %v", err)
|
||||
return
|
||||
}
|
||||
defer stmt.Close()
|
||||
|
||||
var processed int64
|
||||
for {
|
||||
rows, err := s.db.Query(
|
||||
"SELECT id, decoded_json FROM transmissions WHERE from_pubkey IS NULL AND payload_type = 4 LIMIT ?",
|
||||
chunkSize)
|
||||
if err != nil {
|
||||
log.Printf("[backfill] from_pubkey select: %v", err)
|
||||
return
|
||||
}
|
||||
type row struct {
|
||||
id int64
|
||||
pk string
|
||||
}
|
||||
batch := make([]row, 0, chunkSize)
|
||||
for rows.Next() {
|
||||
var id int64
|
||||
var dj sql.NullString
|
||||
if err := rows.Scan(&id, &dj); err != nil {
|
||||
continue
|
||||
}
|
||||
batch = append(batch, row{id: id, pk: extractPubkeyFromAdvertJSON(dj.String)})
|
||||
}
|
||||
rows.Close()
|
||||
if len(batch) == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
log.Printf("[backfill] from_pubkey begin tx: %v", err)
|
||||
return
|
||||
}
|
||||
txStmt := tx.Stmt(stmt)
|
||||
for _, b := range batch {
|
||||
// Sentinel: "" = scanned-no-pubkey (so the WHERE clause
|
||||
// won't keep rescanning this row). hex = real pubkey.
|
||||
var val interface{} = ""
|
||||
if b.pk != "" {
|
||||
val = b.pk
|
||||
}
|
||||
if _, err := txStmt.Exec(val, b.id); err != nil {
|
||||
log.Printf("[backfill] from_pubkey update id=%d: %v", b.id, err)
|
||||
}
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
log.Printf("[backfill] from_pubkey commit: %v", err)
|
||||
return
|
||||
}
|
||||
processed += int64(len(batch))
|
||||
if progress != nil {
|
||||
progress(total, processed, false)
|
||||
}
|
||||
if len(batch) < chunkSize {
|
||||
break
|
||||
}
|
||||
if yieldDuration > 0 {
|
||||
time.Sleep(yieldDuration)
|
||||
}
|
||||
}
|
||||
log.Printf("[backfill] from_pubkey complete: %d rows processed", processed)
|
||||
if progress != nil {
|
||||
progress(total, processed, true)
|
||||
}
|
||||
}
|
||||
|
||||
// extractPubkeyFromAdvertJSON decodes s as a JSON object and returns the value
// of its "pubKey" member. It returns "" when s is empty, is not a valid JSON
// object, has no "pubKey" member, or the member is not a JSON string.
func extractPubkeyFromAdvertJSON(s string) string {
	if len(s) == 0 {
		return ""
	}

	var fields map[string]interface{}
	if json.Unmarshal([]byte(s), &fields) != nil {
		return ""
	}

	// A failed assertion (member missing or not a string) leaves pubKey as "".
	pubKey, _ := fields["pubKey"].(string)
	return pubKey
}
|
||||
@@ -1,26 +0,0 @@
|
||||
package main
|
||||
|
||||
import "runtime/debug"
|
||||
|
||||
// applyMemoryLimit decides whether the ingestor should install a Go soft
// memory limit and, if so, installs it. See #1010.
//
// Precedence:
//  1. envSet is true (the GOMEMLIMIT env var is present and was already
//     parsed by the runtime at startup): nothing is changed; returns
//     (0, "env").
//  2. runtimeMaxMB > 0 (config runtime.maxMemoryMB): the limit is set to
//     runtimeMaxMB MiB via debug.SetMemoryLimit; returns (limit, "config").
//  3. Otherwise nothing is changed; returns (0, "none").
//
// The first result is the limit in bytes that this call installed, or 0 when
// it installed none.
func applyMemoryLimit(runtimeMaxMB int, envSet bool) (int64, string) {
	switch {
	case envSet:
		return 0, "env"
	case runtimeMaxMB <= 0:
		return 0, "none"
	}

	const bytesPerMiB = 1 << 20
	limitBytes := int64(runtimeMaxMB) * bytesPerMiB
	debug.SetMemoryLimit(limitBytes)
	return limitBytes, "config"
}
|
||||
@@ -1,71 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"runtime/debug"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestApplyMemoryLimit_FromEnv: when GOMEMLIMIT env var is set, the runtime
|
||||
// already parsed it. Our function MUST NOT override and MUST report env source.
|
||||
func TestApplyMemoryLimit_FromEnv(t *testing.T) {
|
||||
t.Setenv("GOMEMLIMIT", "850MiB")
|
||||
defer debug.SetMemoryLimit(-1)
|
||||
|
||||
limit, source := applyMemoryLimit(512, true /* envSet */)
|
||||
if source != "env" {
|
||||
t.Fatalf("expected source=env, got %q", source)
|
||||
}
|
||||
if limit != 0 {
|
||||
t.Fatalf("expected limit=0 (not set by us), got %d", limit)
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyMemoryLimit_FromConfig: when env is unset and runtime.maxMemoryMB
|
||||
// is set, derive a limit of exactly runtimeMaxMB * 1 MiB (no headroom — the
|
||||
// ingestor's working set is bounded by MQTT batch decode, not packet store).
|
||||
func TestApplyMemoryLimit_FromConfig(t *testing.T) {
|
||||
defer debug.SetMemoryLimit(-1)
|
||||
|
||||
limit, source := applyMemoryLimit(512, false /* envSet */)
|
||||
if source != "config" {
|
||||
t.Fatalf("expected source=config, got %q", source)
|
||||
}
|
||||
want := int64(512) * 1024 * 1024
|
||||
if limit != want {
|
||||
t.Fatalf("expected limit=%d, got %d", want, limit)
|
||||
}
|
||||
cur := debug.SetMemoryLimit(-1)
|
||||
if cur != want {
|
||||
t.Fatalf("runtime memory limit not set: want=%d got=%d", want, cur)
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyMemoryLimit_None: neither env nor config — no limit applied,
|
||||
// default behavior preserved.
|
||||
func TestApplyMemoryLimit_None(t *testing.T) {
|
||||
defer debug.SetMemoryLimit(-1)
|
||||
debug.SetMemoryLimit(int64(1<<63 - 1)) // math.MaxInt64 = "no limit"
|
||||
|
||||
limit, source := applyMemoryLimit(0, false)
|
||||
if source != "none" {
|
||||
t.Fatalf("expected source=none, got %q", source)
|
||||
}
|
||||
if limit != 0 {
|
||||
t.Fatalf("expected limit=0, got %d", limit)
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyMemoryLimit_EnvWinsOverConfig: env set AND config set → env wins,
|
||||
// our function does not override. Locks the precedence triage specified.
|
||||
func TestApplyMemoryLimit_EnvWinsOverConfig(t *testing.T) {
|
||||
t.Setenv("GOMEMLIMIT", "1GiB")
|
||||
defer debug.SetMemoryLimit(-1)
|
||||
|
||||
limit, source := applyMemoryLimit(512, true /* envSet */)
|
||||
if source != "env" {
|
||||
t.Fatalf("expected source=env when both set, got %q", source)
|
||||
}
|
||||
if limit != 0 {
|
||||
t.Fatalf("expected limit=0 when env wins, got %d", limit)
|
||||
}
|
||||
}
|
||||
@@ -1,76 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestBuildMQTTOpts_ReconnectSettings(t *testing.T) {
|
||||
source := MQTTSource{
|
||||
Broker: "tcp://localhost:1883",
|
||||
Name: "test",
|
||||
}
|
||||
opts := buildMQTTOpts(source)
|
||||
|
||||
if opts.MaxReconnectInterval != 30*time.Second {
|
||||
t.Errorf("MaxReconnectInterval = %v, want 30s", opts.MaxReconnectInterval)
|
||||
}
|
||||
if opts.ConnectTimeout != 10*time.Second {
|
||||
t.Errorf("ConnectTimeout = %v, want 10s", opts.ConnectTimeout)
|
||||
}
|
||||
if opts.WriteTimeout != 10*time.Second {
|
||||
t.Errorf("WriteTimeout = %v, want 10s", opts.WriteTimeout)
|
||||
}
|
||||
if !opts.AutoReconnect {
|
||||
t.Error("AutoReconnect should be true")
|
||||
}
|
||||
if !opts.ConnectRetry {
|
||||
t.Error("ConnectRetry should be true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildMQTTOpts_Credentials(t *testing.T) {
|
||||
source := MQTTSource{
|
||||
Broker: "tcp://broker:1883",
|
||||
Username: "user1",
|
||||
Password: "pass1",
|
||||
}
|
||||
opts := buildMQTTOpts(source)
|
||||
|
||||
if opts.Username != "user1" {
|
||||
t.Errorf("Username = %q, want %q", opts.Username, "user1")
|
||||
}
|
||||
if opts.Password != "pass1" {
|
||||
t.Errorf("Password = %q, want %q", opts.Password, "pass1")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildMQTTOpts_TLS_InsecureSkipVerify(t *testing.T) {
|
||||
f := false
|
||||
source := MQTTSource{
|
||||
Broker: "ssl://broker:8883",
|
||||
RejectUnauthorized: &f,
|
||||
}
|
||||
opts := buildMQTTOpts(source)
|
||||
|
||||
if opts.TLSConfig == nil {
|
||||
t.Fatal("TLSConfig should be set")
|
||||
}
|
||||
if !opts.TLSConfig.InsecureSkipVerify {
|
||||
t.Error("InsecureSkipVerify should be true when RejectUnauthorized=false")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildMQTTOpts_TLS_SSL_Prefix(t *testing.T) {
|
||||
source := MQTTSource{
|
||||
Broker: "ssl://broker:8883",
|
||||
}
|
||||
opts := buildMQTTOpts(source)
|
||||
|
||||
if opts.TLSConfig == nil {
|
||||
t.Fatal("TLSConfig should be set for ssl:// brokers")
|
||||
}
|
||||
if opts.TLSConfig.InsecureSkipVerify {
|
||||
t.Error("InsecureSkipVerify should be false by default")
|
||||
}
|
||||
}
|
||||
@@ -1,248 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/tls"
|
||||
"log"
|
||||
"net/url"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PR #1216 r1 item 5 (kent #1 / adv MAJOR-2): the original assertion was
|
||||
// tautological — it only checked OnConnectAttempt != nil, which passes
|
||||
// even if the handler is a no-op. This version invokes the wired handler,
|
||||
// captures log output, and asserts the OBSERVABLE behaviour operators
|
||||
// rely on during a #1212-class outage:
|
||||
// - the configured source tag appears in the log line
|
||||
// - the broker URL appears in the log line
|
||||
// - the per-source AttemptCount increments on every invocation (proving
|
||||
// the handler is wired to the right state, not just a stub)
|
||||
// - the tlsCfg passed in is returned unchanged (no surprise TLS rewrite)
|
||||
func TestBuildMQTTOpts_InstrumentsConnectionAttempt(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
source := MQTTSource{Broker: "tcp://localhost:1883", Name: "obs-tag"}
|
||||
opts := buildMQTTOpts(source)
|
||||
|
||||
if opts.OnConnectAttempt == nil {
|
||||
t.Fatal("OnConnectAttempt must be wired in buildMQTTOpts (#1212 / PR #1216 r1)")
|
||||
}
|
||||
|
||||
// Register the liveness state so the handler can find it and increment
|
||||
// the attempt counter (same wiring main.go does).
|
||||
liveness := &SourceLivenessState{Tag: "obs-tag", Broker: source.Broker}
|
||||
if err := registerLivenessState(liveness); err != nil {
|
||||
t.Fatalf("test setup: registerLivenessState: %v", err)
|
||||
}
|
||||
|
||||
// Capture log output via log.SetOutput. Save/restore so other tests
|
||||
// running serially don't lose their writer.
|
||||
var buf bytes.Buffer
|
||||
origOut := log.Writer()
|
||||
origFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
defer func() {
|
||||
log.SetOutput(origOut)
|
||||
log.SetFlags(origFlags)
|
||||
}()
|
||||
|
||||
brokerURL, err := url.Parse(source.Broker)
|
||||
if err != nil {
|
||||
t.Fatalf("test setup: parse broker url: %v", err)
|
||||
}
|
||||
tlsIn := &tls.Config{ServerName: "sentinel.test"}
|
||||
|
||||
// Invoke the handler twice — operators need to see attempt # increment
|
||||
// per dial to gauge backoff progress.
|
||||
tlsOut1 := opts.OnConnectAttempt(brokerURL, tlsIn)
|
||||
tlsOut2 := opts.OnConnectAttempt(brokerURL, tlsIn)
|
||||
|
||||
if tlsOut1 != tlsIn || tlsOut2 != tlsIn {
|
||||
t.Errorf("OnConnectAttempt must pass tlsCfg through unchanged (got %p, %p; want %p)", tlsOut1, tlsOut2, tlsIn)
|
||||
}
|
||||
|
||||
logOut := buf.String()
|
||||
if !strings.Contains(logOut, "obs-tag") {
|
||||
t.Errorf("log output must include the source tag for operator grep; got %q", logOut)
|
||||
}
|
||||
if !strings.Contains(logOut, source.Broker) {
|
||||
t.Errorf("log output must include the broker URL so operators can correlate against config; got %q", logOut)
|
||||
}
|
||||
if !strings.Contains(logOut, "#1") || !strings.Contains(logOut, "#2") {
|
||||
t.Errorf("log output must show attempt #1 and #2 across the two invocations (per-source counter); got %q", logOut)
|
||||
}
|
||||
|
||||
if got := atomic.LoadInt64(&liveness.AttemptCount); got != 2 {
|
||||
t.Errorf("AttemptCount must increment per dial (got %d after 2 invocations, want 2)", got)
|
||||
}
|
||||
}
|
||||
|
||||
// RED: the watchdog acceptance criterion from #1212 — even when the client
|
||||
// reports connected, if NO packets have flowed for >threshold, log a warning.
|
||||
// This is a separate detection layer that catches "silently dead" sockets
|
||||
// (broker accepted TCP but stopped forwarding, half-open TCP, etc.).
|
||||
func TestMQTTStallWatchdog_FiresOnSilentSource(t *testing.T) {
|
||||
state := &SourceLivenessState{Tag: "test", Broker: "tcp://x:1883"}
|
||||
atomic.StoreInt64(&state.LastMessageUnix, time.Now().Add(-10*time.Minute).Unix())
|
||||
state.IsConnectedFn = func() bool { return true }
|
||||
|
||||
msg, kind := checkSourceLiveness(state, 5*time.Minute, time.Now())
|
||||
if kind != LivenessStalled {
|
||||
t.Fatalf("watchdog should flag stall when source connected but no message for 10m (threshold 5m); got kind=%v msg=%q", kind, msg)
|
||||
}
|
||||
if !strings.Contains(msg, "no messages") {
|
||||
t.Errorf("stall message should mention 'no messages'; got %q", msg)
|
||||
}
|
||||
if !strings.Contains(msg, "test") {
|
||||
t.Errorf("stall message should include the source tag; got %q", msg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMQTTStallWatchdog_QuietWhenRecent(t *testing.T) {
|
||||
state := &SourceLivenessState{Tag: "test", Broker: "tcp://x:1883"}
|
||||
atomic.StoreInt64(&state.LastMessageUnix, time.Now().Add(-30*time.Second).Unix())
|
||||
state.IsConnectedFn = func() bool { return true }
|
||||
|
||||
_, kind := checkSourceLiveness(state, 5*time.Minute, time.Now())
|
||||
if kind != LivenessOK {
|
||||
t.Fatal("watchdog should NOT flag stall when last message was 30s ago and threshold is 5m")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMQTTStallWatchdog_QuietWhenDisconnected(t *testing.T) {
|
||||
// When disconnected, paho's own reconnect logging covers it — the
|
||||
// watchdog should only fire for the silent-while-connected case.
|
||||
state := &SourceLivenessState{Tag: "test", Broker: "tcp://x:1883"}
|
||||
atomic.StoreInt64(&state.LastMessageUnix, time.Now().Add(-1*time.Hour).Unix())
|
||||
state.IsConnectedFn = func() bool { return false }
|
||||
|
||||
_, kind := checkSourceLiveness(state, 5*time.Minute, time.Now())
|
||||
if kind != LivenessDisconnected {
|
||||
t.Fatalf("watchdog must classify a !IsConnected source as LivenessDisconnected (silent state), not LivenessOK — r2 item 1 prevents disconnect→recovery mis-classification; got kind=%v", kind)
|
||||
}
|
||||
}
|
||||
|
||||
// snapshotAndResetRegistry isolates the package-level livenessRegistry for a
|
||||
// single test. Returns a restore func to defer. Without this, parallel or
|
||||
// previously-registered sources leak into the watchdog goroutine under test.
|
||||
func snapshotAndResetRegistry(t *testing.T) func() {
|
||||
t.Helper()
|
||||
livenessRegistryMu.Lock()
|
||||
saved := livenessRegistry
|
||||
livenessRegistry = map[string]*SourceLivenessState{}
|
||||
livenessRegistryMu.Unlock()
|
||||
return func() {
|
||||
livenessRegistryMu.Lock()
|
||||
livenessRegistry = saved
|
||||
livenessRegistryMu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
// RED-then-GREEN: the watchdog GOROUTINE (not just checkSourceLiveness) must
|
||||
// fan out emits across the registry on each tick, AND must exit cleanly when
|
||||
// the stop signal fires. Originally runLivenessWatchdog used `for range
|
||||
// t.C` — ticker.Stop() does not close the channel, so the goroutine
|
||||
// leaked past shutdown. This test asserts both:
|
||||
// - tick → emit for every stalled source in the registry
|
||||
// - stop → goroutine returns within a short bound
|
||||
func TestMQTTStallWatchdog_LoopEmitsAndStopsCleanly(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
s1 := &SourceLivenessState{Tag: "alpha", Broker: "tcp://a:1883", IsConnectedFn: func() bool { return true }}
|
||||
s2 := &SourceLivenessState{Tag: "beta", Broker: "tcp://b:1883", IsConnectedFn: func() bool { return true }}
|
||||
atomic.StoreInt64(&s1.LastMessageUnix, time.Now().Add(-10*time.Minute).Unix())
|
||||
atomic.StoreInt64(&s2.LastMessageUnix, time.Now().Add(-10*time.Minute).Unix())
|
||||
registerLivenessState(s1)
|
||||
registerLivenessState(s2)
|
||||
|
||||
tick := make(chan time.Time, 1)
|
||||
done := make(chan struct{})
|
||||
|
||||
var mu sync.Mutex
|
||||
var emits []string
|
||||
emit := func(args ...any) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
if len(args) > 0 {
|
||||
if s, ok := args[0].(string); ok {
|
||||
emits = append(emits, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
exited := make(chan struct{})
|
||||
go func() {
|
||||
runLivenessWatchdogLoop(tick, done, 5*time.Minute, emit)
|
||||
close(exited)
|
||||
}()
|
||||
|
||||
tick <- time.Now()
|
||||
// Drain: wait briefly for the emits to land. Polling instead of sleeping
|
||||
// keeps the test fast on a healthy machine.
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
mu.Lock()
|
||||
n := len(emits)
|
||||
mu.Unlock()
|
||||
if n >= 2 {
|
||||
break
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
mu.Lock()
|
||||
got := append([]string(nil), emits...)
|
||||
mu.Unlock()
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("expected 2 stall emits (alpha+beta), got %d: %v", len(got), got)
|
||||
}
|
||||
|
||||
close(done)
|
||||
select {
|
||||
case <-exited:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("watchdog goroutine did not exit within 2s of stop — ticker leak regression")
|
||||
}
|
||||
}
|
||||
|
||||
// PR #1216 r1 item 6 (kent #2 / adv MAJOR-3): the original test had no
|
||||
// assertions gating behaviour — it called stop() and trusted `-race` to
|
||||
// catch leaks. `-race` does NOT detect goroutine leaks. This version
|
||||
// captures runtime.NumGoroutine() before/after and asserts the watchdog's
|
||||
// goroutine actually exited. Allows ±1 slack for unrelated runtime
|
||||
// bookkeeping (gc, finalizer).
|
||||
func TestMQTTStallWatchdog_RunStopsCleanly(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
// Settle: let any prior-test goroutines finish before sampling baseline.
|
||||
runtime.GC()
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
before := runtime.NumGoroutine()
|
||||
|
||||
stop := runLivenessWatchdog(10*time.Millisecond, 5*time.Minute)
|
||||
// Let the watchdog run a few ticks so we're sure it's truly spawned.
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
if mid := runtime.NumGoroutine(); mid <= before {
|
||||
t.Fatalf("watchdog goroutine did not spawn: before=%d mid=%d", before, mid)
|
||||
}
|
||||
|
||||
stop()
|
||||
|
||||
// Poll for the goroutine count to return to baseline (±1 slack).
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
var after int
|
||||
for time.Now().Before(deadline) {
|
||||
runtime.Gosched()
|
||||
after = runtime.NumGoroutine()
|
||||
if after <= before+1 {
|
||||
return
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
t.Fatalf("watchdog goroutine leaked: before=%d after=%d (delta %d) — stop() did not signal the loop to exit", before, after, after-before)
|
||||
}
|
||||
@@ -1,410 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
	// livenessHeartbeatInterval is the minimum gap between "still stalled"
	// reminders for a source that has already had its initial WARN emitted.
	livenessHeartbeatInterval = time.Hour

	// forceReconnectThrottle is the minimum gap between two forced
	// reconnects of the SAME source; it is enforced in maybeForceReconnect.
	forceReconnectThrottle = 60 * time.Second
)
|
||||
|
||||
// LivenessKind is the verdict the watchdog reaches for one source on one
// scan. The transition logic switches on it to decide whether to emit.
type LivenessKind int

const (
	// LivenessOK: nothing to report (healthy, or not enough information).
	LivenessOK LivenessKind = iota
	// LivenessStalled: connected, but the last message is older than the
	// threshold.
	LivenessStalled
	// LivenessNeverReceived: connected, but no message has ever been
	// recorded and the cold-start grace has elapsed.
	LivenessNeverReceived
	// LivenessRecovered is declared for the recovery emit;
	// checkSourceLiveness does not return it.
	LivenessRecovered
	// LivenessHeartbeat is declared for the still-stalled reminder;
	// checkSourceLiveness does not return it.
	LivenessHeartbeat
	// LivenessDisconnected (PR #1216 r2 item 1): the client reports it is
	// not connected. Kept distinct from LivenessOK so a TCP drop on an
	// already-alerting source is not mistaken for "messages flowing again".
	LivenessDisconnected
)
|
||||
|
||||
// SourceLivenessState is the per-source record the stall watchdog (#1212)
// works from: a handful of unix-second clocks plus two optional hooks into
// the MQTT client. Every int64 field is read and written with sync/atomic,
// so a state may be shared between the watchdog goroutine and the code that
// calls the Mark* methods.
type SourceLivenessState struct {
	// Tag identifies the source; it is the registry key and appears in
	// every watchdog log line.
	Tag string
	// Broker is the broker URL, used only in log/error text.
	Broker string

	// LastMessageUnix is the time of the last successfully WRITTEN MQTT
	// message (set by MarkMessage, cleared by MarkReconnected). Zero means
	// "none since registration / last reconnect".
	LastMessageUnix int64 // atomic
	// LastReceiptUnix (PR #1609 M1) is the time of the last message
	// RECEIPT, set by MarkReceipt and exposed via SnapshotLivenessClocks.
	// It is kept apart from LastMessageUnix so a live broker with a stuck
	// write path can be told apart from a fully stalled source.
	LastReceiptUnix int64 // atomic
	// FirstConnectedAt (PR #1216 r2 item 2) is stamped once by
	// registerLivenessState and never reset. The "NEVER received" check
	// measures against it, so reconnect churn cannot re-arm that grace
	// window.
	FirstConnectedAt int64 // atomic
	// StartedAt is stamped at registration and re-stamped by
	// MarkReconnected (transient-stall tracking).
	StartedAt int64 // atomic
	// LastAlertUnix is the time of the last WARN or heartbeat emit; zero
	// means no alert is outstanding.
	LastAlertUnix int64 // atomic

	// IsConnectedFn reports whether the client currently believes it is
	// connected. A nil value makes checkSourceLiveness return LivenessOK.
	IsConnectedFn func() bool

	// ForceReconnectFn (#1335) is invoked by the watchdog when a source is
	// found stalled, to make the client drop its socket and reconnect
	// (half-open sockets still report connected, so auto-reconnect never
	// fires on its own). May be nil, in which case the watchdog skips the
	// reconnect. Calls are rate-limited per source by
	// forceReconnectThrottle.
	ForceReconnectFn func()
	// LastForceReconnectUnix is the time of the most recent forced
	// reconnect; read by the watchdog to enforce the throttle.
	LastForceReconnectUnix int64 // atomic

	// AttemptCount counts connection attempts for this source so each
	// attempt can be logged with its own number.
	AttemptCount int64 // atomic
}
|
||||
|
||||
// MarkMessage records the time of a received MQTT message. Cheap; safe to
|
||||
// call from the message-handling hot path.
|
||||
func (s *SourceLivenessState) MarkMessage(now time.Time) {
|
||||
atomic.StoreInt64(&s.LastMessageUnix, now.Unix())
|
||||
}
|
||||
|
||||
// MarkReceipt records the time of an MQTT message receipt — stamped at the
|
||||
// paho receipt callback BEFORE the message enters the ingest buffer. PR
|
||||
// #1609 M1: kept separate from LastMessageUnix so the watchdog/healthz can
|
||||
// distinguish "broker alive, write path stuck" (LastReceiptUnix fresh,
|
||||
// LastMessageUnix stale) from "everything stalled" (both stale). Cheap;
|
||||
// safe to call from the message-handling hot path.
|
||||
func (s *SourceLivenessState) MarkReceipt(now time.Time) {
|
||||
atomic.StoreInt64(&s.LastReceiptUnix, now.Unix())
|
||||
}
|
||||
|
||||
// MarkReconnected clears stale liveness state so the watchdog does not
|
||||
// false-alarm on a pre-outage timestamp after paho re-establishes the
|
||||
// connection (PR #1216 r1 item 2). Resets LastMessageUnix, re-stamps
|
||||
// StartedAt (transient-stall window restarts), and clears LastAlertUnix
|
||||
// (edge-trigger re-arms).
|
||||
//
|
||||
// PR #1216 r2 item 2: FirstConnectedAt is INTENTIONALLY not touched here.
|
||||
// Under broker flap (CONNECT ok, SUBSCRIBE ACL-denied — exact #1212
|
||||
// class) r1 reset StartedAt on every reconnect, indefinitely re-arming
|
||||
// the cold-start grace and silencing the headline "NEVER received"
|
||||
// alarm. Cold-start grace now reads FirstConnectedAt instead, so the
|
||||
// alarm fires after the FIRST grace window regardless of reconnect
|
||||
// churn.
|
||||
func (s *SourceLivenessState) MarkReconnected(now time.Time) {
|
||||
atomic.StoreInt64(&s.LastMessageUnix, 0)
|
||||
atomic.StoreInt64(&s.StartedAt, now.Unix())
|
||||
atomic.StoreInt64(&s.LastAlertUnix, 0)
|
||||
}
|
||||
|
||||
// checkSourceLiveness returns (message, kind) describing the source's
|
||||
// liveness state. kind==LivenessOK means quiet/healthy; kind==
|
||||
// LivenessDisconnected means paho is not connected (silent state — no
|
||||
// emit, no recovery). Any other kind indicates the caller may want to
|
||||
// emit (subject to edge-trigger).
|
||||
//
|
||||
// Cold-start (PR #1216 r1 item 1, r2 item 2): when LastMessageUnix==0,
|
||||
// the source has never published a single message. If FirstConnectedAt
|
||||
// was stamped at registration and more than `threshold` has elapsed,
|
||||
// this is the #1212 failure class — wrong channel hash, ACL drops
|
||||
// SUBSCRIBE, half-open TCP after CONNECT, or a broker that loops
|
||||
// CONNECT-then-disconnect. We emit a DISTINCT "NEVER received" alarm
|
||||
// so operators can grep for it independently of generic stalls. Using
|
||||
// FirstConnectedAt (not the reconnect-reset StartedAt) ensures broker
|
||||
// flap cannot silence this alarm.
|
||||
func checkSourceLiveness(s *SourceLivenessState, threshold time.Duration, now time.Time) (string, LivenessKind) {
|
||||
if s == nil || s.IsConnectedFn == nil {
|
||||
return "", LivenessOK
|
||||
}
|
||||
if !s.IsConnectedFn() {
|
||||
// paho's reconnect handler covers the disconnected case. Return
|
||||
// a DISTINCT kind so the transition engine does not mis-classify
|
||||
// disconnect as recovery (PR #1216 r2 item 1).
|
||||
return "", LivenessDisconnected
|
||||
}
|
||||
last := atomic.LoadInt64(&s.LastMessageUnix)
|
||||
if last == 0 {
|
||||
firstConnected := atomic.LoadInt64(&s.FirstConnectedAt)
|
||||
if firstConnected == 0 {
|
||||
// Registration didn't stamp FirstConnectedAt — conservative: stay quiet.
|
||||
return "", LivenessOK
|
||||
}
|
||||
sinceFirst := now.Sub(time.Unix(firstConnected, 0))
|
||||
if sinceFirst < threshold {
|
||||
return "", LivenessOK
|
||||
}
|
||||
msg := fmt.Sprintf("MQTT [%s] WATCHDOG: client reports connected to %s but has NEVER received a message in %s (threshold %s) — check channel hash / subscribe ACL / half-open TCP",
|
||||
s.Tag, s.Broker, sinceFirst.Round(time.Second), threshold)
|
||||
return msg, LivenessNeverReceived
|
||||
}
|
||||
silentFor := now.Sub(time.Unix(last, 0))
|
||||
if silentFor < threshold {
|
||||
return "", LivenessOK
|
||||
}
|
||||
msg := fmt.Sprintf("MQTT [%s] WATCHDOG: client reports connected to %s but no messages received for %s (threshold %s) — possible half-open socket or upstream stall",
|
||||
s.Tag, s.Broker, silentFor.Round(time.Second), threshold)
|
||||
return msg, LivenessStalled
|
||||
}
|
||||
|
||||
// livenessRegistry is a package-level lookup so handleMessage (called with
|
||||
// only `tag string`) can mark liveness without threading the state through
|
||||
// every call site. Reads dominate (per message); writes happen once per
|
||||
// source at startup.
|
||||
var (
|
||||
livenessRegistry = map[string]*SourceLivenessState{}
|
||||
livenessRegistryMu sync.RWMutex
|
||||
)
|
||||
|
||||
// registerLivenessState publishes a state to the registry by tag. Returns
|
||||
// an error on tag collision (PR #1216 r1 item 4) so operators see a
|
||||
// startup misconfiguration instead of silently losing AttemptCount and
|
||||
// LastMessageUnix for the clobbered source. The collision case is real:
|
||||
// two MQTT sources with empty Name fall back to Broker; two sources with
|
||||
// duplicate Name; copy-paste in config.json. Caller (main) decides whether
|
||||
// to fatal or just log and skip. The first registration remains
|
||||
// authoritative — we do NOT overwrite.
|
||||
//
|
||||
// Also stamps StartedAt (transient-stall window) and FirstConnectedAt
|
||||
// (cold-start grace anchor — never reset; see r2 item 2 in
|
||||
// MarkReconnected) so the cold-start watchdog has its clocks.
|
||||
func registerLivenessState(s *SourceLivenessState) error {
|
||||
livenessRegistryMu.Lock()
|
||||
defer livenessRegistryMu.Unlock()
|
||||
if existing, ok := livenessRegistry[s.Tag]; ok {
|
||||
return fmt.Errorf("liveness registry: duplicate tag %q (existing broker=%s, new broker=%s) — fix config so each MQTT source has a unique Name", s.Tag, existing.Broker, s.Broker)
|
||||
}
|
||||
nowUnix := time.Now().Unix()
|
||||
if atomic.LoadInt64(&s.StartedAt) == 0 {
|
||||
atomic.StoreInt64(&s.StartedAt, nowUnix)
|
||||
}
|
||||
if atomic.LoadInt64(&s.FirstConnectedAt) == 0 {
|
||||
atomic.StoreInt64(&s.FirstConnectedAt, nowUnix)
|
||||
}
|
||||
livenessRegistry[s.Tag] = s
|
||||
return nil
|
||||
}
|
||||
|
||||
// registerLivenessOrSkip (PR #1216 r2 item 3) is the main-callsite wrapper
|
||||
// that replaces the previous log.Fatalf on tag collision. Fatal at
|
||||
// startup over a config typo would kill the entire ingestor and recreate
|
||||
// the #1212 total-ingest-stop class this PR exists to prevent. On
|
||||
// collision we log ERROR + skip — the MQTT source still attempts to
|
||||
// connect, it just won't be tracked by the liveness watchdog. Returns
|
||||
// true iff the source was registered.
|
||||
func registerLivenessOrSkip(s *SourceLivenessState) bool {
|
||||
if err := registerLivenessState(s); err != nil {
|
||||
log.Printf("[ingestor] ERROR: source tag collision %q — skipping duplicate liveness registration, this source will connect but will not be tracked by the watchdog (%v)", s.Tag, err)
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// markLivenessForTag is the hot-path entry point: O(1) map lookup +
|
||||
// atomic store. Safe to call for unknown tags (no-op). Updates
|
||||
// LastMessageUnix (post-write clock).
|
||||
func markLivenessForTag(tag string, now time.Time) {
|
||||
livenessRegistryMu.RLock()
|
||||
s := livenessRegistry[tag]
|
||||
livenessRegistryMu.RUnlock()
|
||||
if s != nil {
|
||||
s.MarkMessage(now)
|
||||
}
|
||||
}
|
||||
|
||||
// markReceiptForTag is the hot-path entry point used at MQTT receipt
|
||||
// (BEFORE the message is buffered/written). Updates LastReceiptUnix only.
|
||||
// PR #1609 M1 — separates broker-liveness signal from write-path
|
||||
// liveness so /healthz can show a stalled writer with a live broker.
|
||||
func markReceiptForTag(tag string, now time.Time) {
|
||||
livenessRegistryMu.RLock()
|
||||
s := livenessRegistry[tag]
|
||||
livenessRegistryMu.RUnlock()
|
||||
if s != nil {
|
||||
s.MarkReceipt(now)
|
||||
}
|
||||
}
|
||||
|
||||
// SnapshotLivenessClocks returns the per-source receipt vs write-path
|
||||
// liveness pair for every registered source. Read-only; safe to call
|
||||
// from the stats-file writer. PR #1609 M1.
|
||||
func SnapshotLivenessClocks() map[string]SourceLivenessSnapshot {
|
||||
livenessRegistryMu.RLock()
|
||||
defer livenessRegistryMu.RUnlock()
|
||||
if len(livenessRegistry) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]SourceLivenessSnapshot, len(livenessRegistry))
|
||||
for tag, s := range livenessRegistry {
|
||||
out[tag] = SourceLivenessSnapshot{
|
||||
LastReceiptUnix: atomic.LoadInt64(&s.LastReceiptUnix),
|
||||
LastMessageUnix: atomic.LoadInt64(&s.LastMessageUnix),
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// runLivenessWatchdog starts a goroutine that scans the registry every
|
||||
// `interval` and logs a warning for any source that has been silent while
|
||||
// connected for more than `threshold`. Returns a stop function that halts
|
||||
// the ticker AND signals the goroutine to exit (time.Ticker.Stop does NOT
|
||||
// close the channel, so a naive `for range t.C` would leak). interval
|
||||
// should be a fraction of threshold (e.g. threshold/5) so detection
|
||||
// latency is bounded.
|
||||
func runLivenessWatchdog(interval, threshold time.Duration) (stop func()) {
|
||||
t := time.NewTicker(interval)
|
||||
done := make(chan struct{})
|
||||
go runLivenessWatchdogLoop(t.C, done, threshold, log.Print)
|
||||
return func() {
|
||||
t.Stop()
|
||||
close(done)
|
||||
}
|
||||
}
|
||||
|
||||
// runLivenessWatchdogLoop is the goroutine body, extracted so tests can
|
||||
// drive it with a synthetic tick channel and capture log output without
|
||||
// racing on the real ticker.
|
||||
//
|
||||
// Edge-triggered (PR #1216 r1 item 3):
|
||||
// - quiet → stalled / never-received: emit WARN once, record LastAlertUnix
|
||||
// - still stalled, < heartbeat interval since last alert: suppress
|
||||
// - still stalled, ≥ heartbeat interval since last alert: emit reminder,
|
||||
// refresh LastAlertUnix
|
||||
// - stalled → flowing: emit recovery INFO once, clear LastAlertUnix
|
||||
//
|
||||
// Without this, the original loop re-emitted the same WARN on every 60s
|
||||
// tick (60 alerts/hr/source) — the kind of log flood that trains ops to
|
||||
// mute alerts and miss the next real outage.
|
||||
func runLivenessWatchdogLoop(tick <-chan time.Time, done <-chan struct{}, threshold time.Duration, emit func(...any)) {
|
||||
for {
|
||||
select {
|
||||
case <-done:
|
||||
return
|
||||
case now, ok := <-tick:
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
livenessRegistryMu.RLock()
|
||||
states := make([]*SourceLivenessState, 0, len(livenessRegistry))
|
||||
for _, s := range livenessRegistry {
|
||||
states = append(states, s)
|
||||
}
|
||||
livenessRegistryMu.RUnlock()
|
||||
for _, s := range states {
|
||||
msg, kind := checkSourceLiveness(s, threshold, now)
|
||||
processLivenessTransition(s, kind, msg, now, emit)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// processLivenessTransition applies the edge-trigger rules and updates
|
||||
// LastAlertUnix accordingly. Separated for testability and to keep the
|
||||
// loop body small.
|
||||
func processLivenessTransition(s *SourceLivenessState, kind LivenessKind, msg string, now time.Time, emit func(...any)) {
|
||||
lastAlert := atomic.LoadInt64(&s.LastAlertUnix)
|
||||
switch kind {
|
||||
case LivenessStalled, LivenessNeverReceived:
|
||||
if lastAlert == 0 {
|
||||
// First detection — fire WARN edge.
|
||||
emit(msg)
|
||||
atomic.StoreInt64(&s.LastAlertUnix, now.Unix())
|
||||
// #1335: ONLY LivenessStalled (paho reports connected but no
|
||||
// messages past threshold — classic half-open TCP) gets
|
||||
// force-reconnected. LivenessNeverReceived is almost always
|
||||
// an ACL deny / wrong channel hash — a new TCP socket won't
|
||||
// fix it and would just churn the broker. The distinct
|
||||
// "NEVER received" alarm is the right operator signal for
|
||||
// that class.
|
||||
if kind == LivenessStalled {
|
||||
maybeForceReconnect(s, now, emit)
|
||||
}
|
||||
return
|
||||
}
|
||||
// Already alerted; only re-emit on heartbeat interval to avoid log flood.
|
||||
if now.Sub(time.Unix(lastAlert, 0)) >= livenessHeartbeatInterval {
|
||||
emit(fmt.Sprintf("MQTT [%s] WATCHDOG heartbeat: still stalled — %s", s.Tag, msg))
|
||||
atomic.StoreInt64(&s.LastAlertUnix, now.Unix())
|
||||
// Heartbeat re-emit on a still-Stalled source: try another
|
||||
// force-reconnect IF the throttle window has elapsed. Under
|
||||
// a persistent broker issue this caps at one attempt per
|
||||
// heartbeat (1h) — orders of magnitude under any rate
|
||||
// limit and well within "don't hammer the broker".
|
||||
if kind == LivenessStalled {
|
||||
maybeForceReconnect(s, now, emit)
|
||||
}
|
||||
}
|
||||
case LivenessOK:
|
||||
if lastAlert != 0 {
|
||||
// Recovered: emit INFO once, clear the cooldown.
|
||||
emit(fmt.Sprintf("MQTT [%s] WATCHDOG INFO: messages flowing again (recovered)", s.Tag))
|
||||
atomic.StoreInt64(&s.LastAlertUnix, 0)
|
||||
}
|
||||
case LivenessDisconnected:
|
||||
// PR #1216 r2 item 1: disconnect is NOT recovery. Stay completely
|
||||
// silent — paho's reconnect handler already logs the drop — and
|
||||
// preserve LastAlertUnix so the WARN edge can re-fire if/when
|
||||
// the source comes back stalled. Clearing the cooldown here
|
||||
// would mean a flapping source spams the WARN every cycle.
|
||||
}
|
||||
}
|
||||
|
||||
// maybeForceReconnect invokes ForceReconnectFn IFF (a) one is wired and
|
||||
// (b) the throttle window (forceReconnectThrottle) has elapsed since
|
||||
// the most recent forced reconnect for this source. Logs WATCHDOG
|
||||
// telemetry before/after so operators can correlate the reconnect with
|
||||
// downstream paho ConnectionAttempt/OnConnect lines.
|
||||
func maybeForceReconnect(s *SourceLivenessState, now time.Time, emit func(...any)) {
|
||||
if s.ForceReconnectFn == nil {
|
||||
return
|
||||
}
|
||||
lastForce := atomic.LoadInt64(&s.LastForceReconnectUnix)
|
||||
if lastForce != 0 && now.Sub(time.Unix(lastForce, 0)) < forceReconnectThrottle {
|
||||
emit(fmt.Sprintf("MQTT [%s] WATCHDOG suppressing forced reconnect (last attempt %s ago, throttle %s)",
|
||||
s.Tag, now.Sub(time.Unix(lastForce, 0)).Round(time.Second), forceReconnectThrottle))
|
||||
return
|
||||
}
|
||||
atomic.StoreInt64(&s.LastForceReconnectUnix, now.Unix())
|
||||
emit(fmt.Sprintf("MQTT [%s] WATCHDOG forcing reconnect (half-open TCP suspected — paho.IsConnected==true but no messages)", s.Tag))
|
||||
// Run in a goroutine: ForceReconnectFn typically calls
|
||||
// client.Disconnect(250) which blocks up to 250ms, then
|
||||
// client.Connect() which can block on the connect timeout. The
|
||||
// watchdog goroutine must not stall a per-tick scan over a single
|
||||
// slow source.
|
||||
go func() {
|
||||
s.ForceReconnectFn()
|
||||
emit(fmt.Sprintf("MQTT [%s] WATCHDOG reconnect attempt issued", s.Tag))
|
||||
}()
|
||||
}
|
||||
|
||||
@@ -1,174 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Issue #1335 — staging's lincomatic source stalls: paho reports
|
||||
// IsConnected==true but no messages arrive for 1h+. The PR #1216
|
||||
// watchdog DETECTS this (LivenessStalled) but only LOGS — it never
|
||||
// forces paho to drop the half-open TCP socket and reconnect, so the
|
||||
// source stays silently broken until container restart.
|
||||
//
|
||||
// Fix: on transition INTO LivenessStalled, invoke a per-source
|
||||
// ForceReconnectFn (wired in main.go to client.Disconnect(250) +
|
||||
// client.Connect()). Throttled by forceReconnectThrottle so a
|
||||
// stall→reconnect→re-stall loop self-recovers without hammering the
|
||||
// broker.
|
||||
|
||||
// RED on master: ForceReconnectFn is never invoked because the
|
||||
// transition engine does not call it. After the fix, the WARN edge on
|
||||
// LivenessStalled MUST fire force-reconnect exactly once.
|
||||
func TestMQTTStallWatchdog_ForceReconnectOnStallEdge(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
now := time.Now()
|
||||
var reconnectCount atomic.Int32
|
||||
s := &SourceLivenessState{
|
||||
Tag: "stalled-half-open",
|
||||
Broker: "tcp://halfopen.example:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
ForceReconnectFn: func() { reconnectCount.Add(1) },
|
||||
}
|
||||
atomic.StoreInt64(&s.LastMessageUnix, now.Add(-10*time.Minute).Unix())
|
||||
atomic.StoreInt64(&s.StartedAt, now.Add(-20*time.Minute).Unix())
|
||||
if err := registerLivenessState(s); err != nil {
|
||||
t.Fatalf("setup: %v", err)
|
||||
}
|
||||
|
||||
var mu sync.Mutex
|
||||
var emits []string
|
||||
emit := func(args ...any) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
if len(args) > 0 {
|
||||
if str, ok := args[0].(string); ok {
|
||||
emits = append(emits, str)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
processLivenessTransition(s, LivenessStalled, "10m silent", now, emit)
|
||||
|
||||
// ForceReconnectFn runs in a goroutine (the production code can't
|
||||
// block the watchdog tick on a slow Disconnect+Connect). Wait
|
||||
// briefly for it to land before asserting.
|
||||
waitForReconnect(t, &reconnectCount, 1, 2*time.Second)
|
||||
|
||||
if got := reconnectCount.Load(); got != 1 {
|
||||
t.Fatalf("LivenessStalled transition MUST force-reconnect exactly once; got %d invocations (emits=%v)", got, emits)
|
||||
}
|
||||
}
|
||||
|
||||
// Throttle: a second LivenessStalled transition within the throttle
|
||||
// window MUST NOT fire a second reconnect (no broker hammering).
|
||||
func TestMQTTStallWatchdog_ForceReconnectThrottled(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
now := time.Now()
|
||||
var reconnectCount atomic.Int32
|
||||
s := &SourceLivenessState{
|
||||
Tag: "throttled",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
ForceReconnectFn: func() { reconnectCount.Add(1) },
|
||||
}
|
||||
if err := registerLivenessState(s); err != nil {
|
||||
t.Fatalf("setup: %v", err)
|
||||
}
|
||||
|
||||
emit := func(args ...any) {}
|
||||
|
||||
// First stall edge → fires.
|
||||
processLivenessTransition(s, LivenessStalled, "stall 1", now, emit)
|
||||
waitForReconnect(t, &reconnectCount, 1, 2*time.Second)
|
||||
// Simulate paho reconnect cycle: MarkReconnected clears the alert
|
||||
// cooldown, then the source goes stalled again 5s later.
|
||||
s.MarkReconnected(now.Add(5 * time.Second))
|
||||
processLivenessTransition(s, LivenessStalled, "stall 2", now.Add(10*time.Second), emit)
|
||||
// Give a stray goroutine a chance to land (it shouldn't, due to throttle).
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
if got := reconnectCount.Load(); got != 1 {
|
||||
t.Fatalf("force-reconnect MUST be throttled within %s; got %d invocations", forceReconnectThrottle, got)
|
||||
}
|
||||
|
||||
// After the throttle window, a fresh stall edge MAY fire again.
|
||||
s.MarkReconnected(now.Add(30 * time.Second))
|
||||
processLivenessTransition(s, LivenessStalled, "stall 3", now.Add(forceReconnectThrottle+30*time.Second), emit)
|
||||
waitForReconnect(t, &reconnectCount, 2, 2*time.Second)
|
||||
if got := reconnectCount.Load(); got != 2 {
|
||||
t.Fatalf("after throttle window, force-reconnect must re-arm; got %d invocations", got)
|
||||
}
|
||||
}
|
||||
|
||||
// NeverReceived (cold-start ACL-deny / never-flowed) MUST NOT
|
||||
// force-reconnect. A SUBSCRIBE ACL deny is not fixed by a new TCP
|
||||
// socket; reconnecting just churns the broker. Operators get the
|
||||
// distinct "NEVER received" alarm so they can address the ACL.
|
||||
func TestMQTTStallWatchdog_NoForceReconnectOnNeverReceived(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
now := time.Now()
|
||||
var reconnectCount atomic.Int32
|
||||
s := &SourceLivenessState{
|
||||
Tag: "acl-denied",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
ForceReconnectFn: func() { reconnectCount.Add(1) },
|
||||
}
|
||||
if err := registerLivenessState(s); err != nil {
|
||||
t.Fatalf("setup: %v", err)
|
||||
}
|
||||
|
||||
emit := func(args ...any) {}
|
||||
processLivenessTransition(s, LivenessNeverReceived, "no msgs ever", now, emit)
|
||||
// Settle any (incorrect) goroutine before counting.
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
if got := reconnectCount.Load(); got != 0 {
|
||||
t.Fatalf("LivenessNeverReceived must NOT force-reconnect (likely ACL deny — TCP churn won't help); got %d invocations", got)
|
||||
}
|
||||
}
|
||||
|
||||
// Safety: a source with no ForceReconnectFn wired (e.g. tests, or a
|
||||
// source registered before the wiring was added) MUST NOT panic when
|
||||
// LivenessStalled fires.
|
||||
func TestMQTTStallWatchdog_NilForceReconnectFnIsSafe(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
now := time.Now()
|
||||
s := &SourceLivenessState{
|
||||
Tag: "no-reconnect-fn",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
// ForceReconnectFn deliberately nil.
|
||||
}
|
||||
if err := registerLivenessState(s); err != nil {
|
||||
t.Fatalf("setup: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Fatalf("nil ForceReconnectFn must be a safe no-op; panicked: %v", r)
|
||||
}
|
||||
}()
|
||||
processLivenessTransition(s, LivenessStalled, "stalled", now, func(args ...any) {})
|
||||
}
|
||||
|
||||
// waitForReconnect blocks until count reaches at least want or timeout
// has elapsed, polling every 5ms. It returns silently in both cases;
// callers assert on the counter afterwards. Polling is needed because
// ForceReconnectFn is invoked from a goroutine in production
// (Disconnect+Connect can block on broker IO), so the counter cannot be
// read synchronously.
func waitForReconnect(t *testing.T, count *atomic.Int32, want int32, timeout time.Duration) {
	t.Helper()
	const pollEvery = 5 * time.Millisecond
	for stopAt := time.Now().Add(timeout); time.Now().Before(stopAt); time.Sleep(pollEvery) {
		if count.Load() >= want {
			return
		}
	}
}
|
||||
@@ -1,43 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestSourceLivenessState_ReceiptVsWriteSeparate asserts that the receipt-
|
||||
// time and post-write liveness clocks are independent (PR #1609 review
|
||||
// MAJOR M1): stamping at receipt must NOT advance the post-write clock so
|
||||
// the watchdog/healthz can distinguish "broker alive, write path stuck"
|
||||
// from "everything fine". Without separation, /healthz reports "fresh"
|
||||
// while the writer is stalled and the ingest buffer is filling.
|
||||
func TestSourceLivenessState_ReceiptVsWriteSeparate(t *testing.T) {
|
||||
s := &SourceLivenessState{Tag: "t"}
|
||||
now := time.Now()
|
||||
|
||||
// Receipt at T0; post-write never happens (writer stalled).
|
||||
s.MarkReceipt(now)
|
||||
|
||||
gotReceipt := atomic.LoadInt64(&s.LastReceiptUnix)
|
||||
gotWrite := atomic.LoadInt64(&s.LastMessageUnix)
|
||||
if gotReceipt != now.Unix() {
|
||||
t.Fatalf("LastReceiptUnix: want %d, got %d", now.Unix(), gotReceipt)
|
||||
}
|
||||
if gotWrite != 0 {
|
||||
t.Fatalf("LastMessageUnix MUST stay 0 while writer stalled (only MarkReceipt called); got %d — receipt is double-stamping the write clock and /healthz will lie about ingestion freshness", gotWrite)
|
||||
}
|
||||
|
||||
// Write completes later: only MarkMessage advances LastMessageUnix.
|
||||
later := now.Add(5 * time.Second)
|
||||
s.MarkMessage(later)
|
||||
|
||||
gotReceipt2 := atomic.LoadInt64(&s.LastReceiptUnix)
|
||||
gotWrite2 := atomic.LoadInt64(&s.LastMessageUnix)
|
||||
if gotReceipt2 != now.Unix() {
|
||||
t.Fatalf("MarkMessage must not move LastReceiptUnix backwards or forwards; want %d, got %d", now.Unix(), gotReceipt2)
|
||||
}
|
||||
if gotWrite2 != later.Unix() {
|
||||
t.Fatalf("LastMessageUnix after MarkMessage: want %d, got %d", later.Unix(), gotWrite2)
|
||||
}
|
||||
}
|
||||
@@ -1,286 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PR #1216 round-1 review fixes. Tests are RED before the fix lands:
|
||||
// - Item 1: cold-start blind spot — silent-from-start source never alarmed.
|
||||
// - Item 2: reconnect reset — stale LastMessageUnix triggers false stall after recovery.
|
||||
// - Item 3: log flood — every-60s rescan re-emits same WARN forever.
|
||||
// - Item 4: tag collision in registerLivenessState silently overwrites prior state.
|
||||
|
||||
// waitFor blocks until *emits holds at least want entries, checking every
// 10ms under mu, and fails the test with t.Fatalf if timeout elapses
// first. Goroutine-based tests use it to make sure one tick has been
// fully processed before they mutate state, so edge transitions are
// observed deterministically.
func waitFor(t *testing.T, mu *sync.Mutex, emits *[]string, want int, timeout time.Duration) {
	t.Helper()

	// seen returns the current number of recorded emits.
	seen := func() int {
		mu.Lock()
		defer mu.Unlock()
		return len(*emits)
	}

	for stopAt := time.Now().Add(timeout); time.Now().Before(stopAt); time.Sleep(10 * time.Millisecond) {
		if seen() >= want {
			return
		}
	}

	// Timed out: report what was captured, reading it under the lock.
	mu.Lock()
	defer mu.Unlock()
	t.Fatalf("timeout waiting for %d emits; got %d: %v", want, len(*emits), *emits)
}
|
||||
|
||||
// Item 1 (RED): a source that connects but never receives a message is
|
||||
// invisible to the current watchdog (LastMessageUnix==0 → skip). This is
|
||||
// the exact #1212 failure class — wrong channel hash, ACL drops SUBSCRIBE,
|
||||
// half-open TCP after CONNECT. Fix: stamp StartedAt at registration; when
|
||||
// LastMessageUnix==0 AND now-StartedAt > threshold, alarm with a distinct
|
||||
// "NEVER received" message.
|
||||
func TestMQTTStallWatchdog_FiresOnSilentFromStart(t *testing.T) {
|
||||
now := time.Now()
|
||||
state := &SourceLivenessState{
|
||||
Tag: "cold",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
}
|
||||
atomic.StoreInt64(&state.StartedAt, now.Add(-10*time.Minute).Unix())
|
||||
atomic.StoreInt64(&state.FirstConnectedAt, now.Add(-10*time.Minute).Unix())
|
||||
// LastMessageUnix stays 0 — never received anything.
|
||||
|
||||
msg, kind := checkSourceLiveness(state, 5*time.Minute, now)
|
||||
if kind != LivenessNeverReceived {
|
||||
t.Fatalf("expected LivenessNeverReceived for silent-from-start source after threshold; got kind=%v msg=%q", kind, msg)
|
||||
}
|
||||
if !strings.Contains(strings.ToUpper(msg), "NEVER") {
|
||||
t.Errorf("cold-start alarm must mention NEVER received to distinguish from generic stall; got %q", msg)
|
||||
}
|
||||
if !strings.Contains(msg, "cold") {
|
||||
t.Errorf("alarm must include source tag; got %q", msg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMQTTStallWatchdog_QuietDuringColdStartGrace(t *testing.T) {
|
||||
now := time.Now()
|
||||
state := &SourceLivenessState{
|
||||
Tag: "warming-up",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
}
|
||||
atomic.StoreInt64(&state.StartedAt, now.Add(-30*time.Second).Unix())
|
||||
atomic.StoreInt64(&state.FirstConnectedAt, now.Add(-30*time.Second).Unix())
|
||||
|
||||
_, kind := checkSourceLiveness(state, 5*time.Minute, now)
|
||||
if kind != LivenessOK {
|
||||
t.Fatalf("must NOT alarm during cold-start grace (30s in, threshold 5m); got kind=%v", kind)
|
||||
}
|
||||
}
|
||||
|
||||
// Item 2 (RED): after a long outage + paho reconnect, LastMessageUnix is
|
||||
// still 2h-old → watchdog screams "stalled for 2h" immediately. Fix: reset
|
||||
// LastMessageUnix (and the cold-start clock) on OnConnect. This test
|
||||
// asserts the reset method does what's required so the next watchdog scan
|
||||
// stays quiet for the grace window.
|
||||
func TestMQTTStallWatchdog_OnReconnectResetsClocks(t *testing.T) {
|
||||
now := time.Now()
|
||||
state := &SourceLivenessState{
|
||||
Tag: "flaky",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
}
|
||||
// 2-hour-old timestamp from before the outage.
|
||||
atomic.StoreInt64(&state.LastMessageUnix, now.Add(-2*time.Hour).Unix())
|
||||
atomic.StoreInt64(&state.StartedAt, now.Add(-3*time.Hour).Unix())
|
||||
// Stale alert cooldown from before the outage too — must NOT carry forward.
|
||||
atomic.StoreInt64(&state.LastAlertUnix, now.Add(-90*time.Minute).Unix())
|
||||
|
||||
state.MarkReconnected(now)
|
||||
|
||||
if last := atomic.LoadInt64(&state.LastMessageUnix); last != 0 {
|
||||
t.Errorf("LastMessageUnix must be cleared on reconnect so a stale pre-outage timestamp does not trip the watchdog; got %d", last)
|
||||
}
|
||||
if started := atomic.LoadInt64(&state.StartedAt); started != now.Unix() {
|
||||
t.Errorf("StartedAt must be re-stamped on reconnect so the cold-start grace window restarts; got %d want %d", started, now.Unix())
|
||||
}
|
||||
if alert := atomic.LoadInt64(&state.LastAlertUnix); alert != 0 {
|
||||
t.Errorf("LastAlertUnix must be cleared on reconnect so edge-trigger re-arms; got %d", alert)
|
||||
}
|
||||
|
||||
// Now drive checkSourceLiveness immediately after reconnect: must NOT alarm.
|
||||
_, kind := checkSourceLiveness(state, 5*time.Minute, now.Add(1*time.Second))
|
||||
if kind != LivenessOK {
|
||||
t.Fatalf("watchdog must stay quiet immediately after MarkReconnected; got kind=%v", kind)
|
||||
}
|
||||
}
|
||||
|
||||
// Item 3 (RED): the watchdog loop currently re-emits the same WARN on every
|
||||
// 60s tick (60 alerts/hr/source). Fix: edge-trigger — emit WARN once on
|
||||
// quiet→stalled transition, INFO once on stalled→flowing recovery, and an
|
||||
// hourly heartbeat while still stalled. Asserts: 3 consecutive ticks on a
|
||||
// stalled source produce exactly ONE WARN.
|
||||
func TestMQTTStallWatchdog_EdgeTriggeredEmitsOnlyOnce(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
now := time.Now()
|
||||
s := &SourceLivenessState{
|
||||
Tag: "stuck",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
}
|
||||
atomic.StoreInt64(&s.LastMessageUnix, now.Add(-10*time.Minute).Unix())
|
||||
atomic.StoreInt64(&s.StartedAt, now.Add(-20*time.Minute).Unix())
|
||||
registerLivenessState(s)
|
||||
|
||||
var mu sync.Mutex
|
||||
var emits []string
|
||||
emit := func(args ...any) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
if len(args) > 0 {
|
||||
if str, ok := args[0].(string); ok {
|
||||
emits = append(emits, str)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tick := make(chan time.Time, 3)
|
||||
done := make(chan struct{})
|
||||
exited := make(chan struct{})
|
||||
go func() {
|
||||
runLivenessWatchdogLoop(tick, done, 5*time.Minute, emit)
|
||||
close(exited)
|
||||
}()
|
||||
|
||||
// Three back-to-back ticks within the heartbeat window. Only the first
|
||||
// should emit a WARN; the other two must be suppressed (edge-triggered).
|
||||
tick <- now
|
||||
tick <- now.Add(30 * time.Second)
|
||||
tick <- now.Add(60 * time.Second)
|
||||
|
||||
// Wait for ticks to drain.
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
mu.Lock()
|
||||
n := len(emits)
|
||||
mu.Unlock()
|
||||
if n >= 1 && time.Since(deadline.Add(-2*time.Second)) > 200*time.Millisecond {
|
||||
break
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
close(done)
|
||||
<-exited
|
||||
|
||||
mu.Lock()
|
||||
got := append([]string(nil), emits...)
|
||||
mu.Unlock()
|
||||
|
||||
warns := 0
|
||||
for _, e := range got {
|
||||
if strings.Contains(e, "WATCHDOG") || strings.Contains(e, "stalled") || strings.Contains(strings.ToUpper(e), "WARN") {
|
||||
warns++
|
||||
}
|
||||
}
|
||||
if warns != 1 {
|
||||
t.Fatalf("expected exactly 1 stall WARN across 3 consecutive scans (edge-trigger); got %d: %v", warns, got)
|
||||
}
|
||||
}
|
||||
|
||||
// Item 3 (RED): on stalled→flowing transition, a recovery INFO must fire
|
||||
// exactly once. Future ticks must stay silent until a new stall edge.
|
||||
func TestMQTTStallWatchdog_RecoveryEmitOnce(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
now := time.Now()
|
||||
s := &SourceLivenessState{
|
||||
Tag: "src-b",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
}
|
||||
atomic.StoreInt64(&s.LastMessageUnix, now.Add(-10*time.Minute).Unix())
|
||||
atomic.StoreInt64(&s.StartedAt, now.Add(-20*time.Minute).Unix())
|
||||
registerLivenessState(s)
|
||||
|
||||
var mu sync.Mutex
|
||||
var emits []string
|
||||
emit := func(args ...any) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
if len(args) > 0 {
|
||||
if str, ok := args[0].(string); ok {
|
||||
emits = append(emits, str)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tick := make(chan time.Time, 4)
|
||||
done := make(chan struct{})
|
||||
exited := make(chan struct{})
|
||||
go func() {
|
||||
runLivenessWatchdogLoop(tick, done, 5*time.Minute, emit)
|
||||
close(exited)
|
||||
}()
|
||||
|
||||
tick <- now // → WARN
|
||||
// Wait for the goroutine to drain that tick and record the WARN edge
|
||||
// before we mutate state — otherwise we race the loop and the first
|
||||
// emit observes the "recovered" timestamp instead of the stall.
|
||||
waitFor(t, &mu, &emits, 1, 2*time.Second)
|
||||
// Source recovers: a recent message arrives.
|
||||
atomic.StoreInt64(&s.LastMessageUnix, now.Add(30*time.Second).Unix())
|
||||
tick <- now.Add(60 * time.Second) // → recovery INFO
|
||||
waitFor(t, &mu, &emits, 2, 2*time.Second)
|
||||
tick <- now.Add(120 * time.Second) // → silent
|
||||
tick <- now.Add(180 * time.Second) // → silent
|
||||
|
||||
// Brief settle so any (incorrect) extra emits land before we count.
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
close(done)
|
||||
<-exited
|
||||
|
||||
mu.Lock()
|
||||
got := append([]string(nil), emits...)
|
||||
mu.Unlock()
|
||||
|
||||
infos := 0
|
||||
for _, e := range got {
|
||||
upper := strings.ToUpper(e)
|
||||
if strings.Contains(upper, "RECOVER") || strings.Contains(upper, "FLOWING") {
|
||||
infos++
|
||||
}
|
||||
}
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("expected exactly 2 emits (1 WARN + 1 recovery INFO); got %d: %v", len(got), got)
|
||||
}
|
||||
if infos != 1 {
|
||||
t.Fatalf("expected exactly 1 recovery INFO emit; got %d (all=%v)", infos, got)
|
||||
}
|
||||
}
|
||||
|
||||
// Item 4 (RED): registerLivenessState silently overwrites on tag collision
|
||||
// (empty-Name + same broker, duplicate Name). Must detect & report.
|
||||
func TestRegisterLivenessState_DetectsTagCollision(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
a := &SourceLivenessState{Tag: "dup", Broker: "tcp://a:1883"}
|
||||
b := &SourceLivenessState{Tag: "dup", Broker: "tcp://b:1883"}
|
||||
|
||||
if err := registerLivenessState(a); err != nil {
|
||||
t.Fatalf("first registration must succeed; got %v", err)
|
||||
}
|
||||
if err := registerLivenessState(b); err == nil {
|
||||
t.Fatal("second registration with same tag must return a collision error (current behavior silently clobbers)")
|
||||
}
|
||||
|
||||
// And the registry must still hold the FIRST registration — clobbering
|
||||
// AttemptCount/LastMessageUnix invisibly is the bug.
|
||||
livenessRegistryMu.RLock()
|
||||
got := livenessRegistry["dup"]
|
||||
livenessRegistryMu.RUnlock()
|
||||
if got != a {
|
||||
t.Errorf("on collision, first registration must remain authoritative (got pointer for broker=%s)", got.Broker)
|
||||
}
|
||||
}
|
||||
@@ -1,228 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PR #1216 round-2 review fixes. Tests RED before the fix lands.
|
||||
//
|
||||
// r1 closed the cold-start blind spot but introduced three new failure
|
||||
// modes that r2 must eliminate:
|
||||
//
|
||||
// r2 #1 — checkSourceLiveness returns LivenessOK for BOTH "messages
|
||||
// flowing" AND "disconnected/never-connected". A stalled source
|
||||
// whose TCP eventually RSTs trips processLivenessTransition's
|
||||
// recovery branch and emits "messages flowing again (recovered)"
|
||||
// while going from silently broken to overtly broken. Fix: a
|
||||
// distinct LivenessDisconnected kind that the transition
|
||||
// function treats as a silent (no-emit) state, so the alert
|
||||
// cooldown does not collapse on a non-event.
|
||||
//
|
||||
// r2 #2 — MarkReconnected re-stamps StartedAt on every reconnect, so
|
||||
// the cold-start grace clock restarts forever under a broker
|
||||
// flap (CONNECT ok, SUBSCRIBE ACL-denied — the exact #1212
|
||||
// shape). The headline "NEVER received" alarm never fires.
|
||||
// Fix: separate FirstConnectedAt (set once at registration,
|
||||
// never reset) from StartedAt (free to reset on reconnect for
|
||||
// transient-stall tracking). Cold-start grace must use
|
||||
// FirstConnectedAt.
|
||||
//
|
||||
// r2 #3 — main.go calls log.Fatalf on a tag collision in the liveness
|
||||
// registry, killing the entire ingestor over one config typo.
|
||||
// That recreates the #1212 total-ingest-stop failure class
|
||||
// this PR exists to prevent. Fix: log an ERROR and skip
|
||||
// liveness registration for the duplicate — the MQTT source
|
||||
// still attempts to connect, just isn't tracked by the
|
||||
// watchdog (the first registration remains authoritative).
|
||||
|
||||
// r2 #1 RED: a stalled source whose connection then drops must NOT emit
|
||||
// "recovered". The current code does — checkSourceLiveness returns
|
||||
// LivenessOK for both genuine recovery and disconnection, so
|
||||
// processLivenessTransition sees lastAlert!=0 + kind==LivenessOK and
|
||||
// fires the recovery INFO. Operators reading the log think the source
|
||||
// healed when it actually died.
|
||||
func TestMQTTStallWatchdog_NoFalseRecoveryOnDisconnect(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
now := time.Now()
|
||||
var connected atomic.Bool
|
||||
connected.Store(true)
|
||||
|
||||
s := &SourceLivenessState{
|
||||
Tag: "drops-after-stall",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return connected.Load() },
|
||||
}
|
||||
atomic.StoreInt64(&s.LastMessageUnix, now.Add(-10*time.Minute).Unix())
|
||||
atomic.StoreInt64(&s.StartedAt, now.Add(-20*time.Minute).Unix())
|
||||
if err := registerLivenessState(s); err != nil {
|
||||
t.Fatalf("setup: registerLivenessState: %v", err)
|
||||
}
|
||||
|
||||
var mu sync.Mutex
|
||||
var emits []string
|
||||
emit := func(args ...any) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
if len(args) > 0 {
|
||||
if str, ok := args[0].(string); ok {
|
||||
emits = append(emits, str)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tick := make(chan time.Time, 2)
|
||||
done := make(chan struct{})
|
||||
exited := make(chan struct{})
|
||||
go func() {
|
||||
runLivenessWatchdogLoop(tick, done, 5*time.Minute, emit)
|
||||
close(exited)
|
||||
}()
|
||||
|
||||
// Tick 1: source connected + 10m silent → WARN edge.
|
||||
tick <- now
|
||||
waitFor(t, &mu, &emits, 1, 2*time.Second)
|
||||
|
||||
// The TCP socket RSTs — paho flips IsConnected to false. The watchdog
|
||||
// must NOT interpret this as recovery; the source went from silently
|
||||
// broken to overtly broken.
|
||||
connected.Store(false)
|
||||
tick <- now.Add(60 * time.Second)
|
||||
|
||||
// Settle so any (incorrect) extra emits land before we count.
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
close(done)
|
||||
<-exited
|
||||
|
||||
mu.Lock()
|
||||
got := append([]string(nil), emits...)
|
||||
mu.Unlock()
|
||||
|
||||
for _, e := range got {
|
||||
upper := strings.ToUpper(e)
|
||||
if strings.Contains(upper, "RECOVER") || strings.Contains(upper, "FLOWING AGAIN") {
|
||||
t.Fatalf("watchdog must NOT emit recovery INFO when a stalled source disconnects; got %q (all=%v)", e, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// r2 #2 RED: a broker that ACKs CONNECT but denies SUBSCRIBE causes paho
|
||||
// to loop CONNECT → drop → CONNECT → drop. Each reconnect calls
|
||||
// MarkReconnected, which re-stamps StartedAt=now and resets the
|
||||
// cold-start grace clock. After 30 minutes of flapping, the source has
|
||||
// still NEVER received a message, but the "NEVER received" alarm never
|
||||
// fires because sinceStart is always sub-threshold. Fix: track
|
||||
// FirstConnectedAt separately from StartedAt; the cold-start check must
|
||||
// use the former.
|
||||
func TestMQTTStallWatchdog_ColdStartSurvivesBrokerFlap(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
t0 := time.Now()
|
||||
s := &SourceLivenessState{
|
||||
Tag: "flapping-acl-deny",
|
||||
Broker: "tcp://acl-denied:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
}
|
||||
// First registration stamps FirstConnectedAt (and StartedAt) at t0.
|
||||
if err := registerLivenessState(s); err != nil {
|
||||
t.Fatalf("setup: registerLivenessState: %v", err)
|
||||
}
|
||||
|
||||
// Paho keeps re-establishing the TCP/MQTT session every minute. No
|
||||
// message ever arrives because SUBSCRIBE is denied. Each reconnect
|
||||
// resets StartedAt.
|
||||
for i := 1; i <= 6; i++ {
|
||||
s.MarkReconnected(t0.Add(time.Duration(i) * time.Minute))
|
||||
}
|
||||
|
||||
// 6m after the very first connection — well past the 5m cold-start
|
||||
// threshold. The headline alarm must fire.
|
||||
now := t0.Add(6*time.Minute + 30*time.Second)
|
||||
_, kind := checkSourceLiveness(s, 5*time.Minute, now)
|
||||
if kind != LivenessNeverReceived {
|
||||
t.Fatalf("under broker flap (#1212 ACL-deny class), cold-start alarm must fire based on FirstConnectedAt, not the most recent reconnect; got kind=%v", kind)
|
||||
}
|
||||
}
|
||||
|
||||
// Sanity check: a single transient reconnect WITHIN the cold-start window
|
||||
// must NOT prematurely trip the NeverReceived alarm — the grace was
|
||||
// designed for that. This guards against an over-correction where r2
|
||||
// switches blindly to FirstConnectedAt and ignores legitimate startup
|
||||
// jitter.
|
||||
func TestMQTTStallWatchdog_TransientReconnectDuringGraceStaysQuiet(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
t0 := time.Now()
|
||||
s := &SourceLivenessState{
|
||||
Tag: "transient-reconnect",
|
||||
Broker: "tcp://x:1883",
|
||||
IsConnectedFn: func() bool { return true },
|
||||
}
|
||||
if err := registerLivenessState(s); err != nil {
|
||||
t.Fatalf("setup: registerLivenessState: %v", err)
|
||||
}
|
||||
|
||||
// 30s in, one transient reconnect.
|
||||
s.MarkReconnected(t0.Add(30 * time.Second))
|
||||
|
||||
// 1m after registration — still inside the 5m grace.
|
||||
_, kind := checkSourceLiveness(s, 5*time.Minute, t0.Add(1*time.Minute))
|
||||
if kind != LivenessOK {
|
||||
t.Fatalf("during cold-start grace, transient reconnects must stay quiet; got kind=%v", kind)
|
||||
}
|
||||
}
|
||||
|
||||
// r2 #3 RED: tag collision must not kill the ingestor. main.go currently
|
||||
// log.Fatalf's, which recreates the #1212 total-ingest-stop class this
|
||||
// PR exists to prevent. registerLivenessOrSkip is the small helper main
|
||||
// will call instead: log an ERROR + skip liveness registration for the
|
||||
// duplicate, return false so the caller knows the source is connecting
|
||||
// untracked. The first registration remains authoritative.
|
||||
func TestRegisterLivenessOrSkip_LogsErrorAndDoesNotExitOnCollision(t *testing.T) {
|
||||
defer snapshotAndResetRegistry(t)()
|
||||
|
||||
var buf bytes.Buffer
|
||||
origOut := log.Writer()
|
||||
origFlags := log.Flags()
|
||||
log.SetOutput(&buf)
|
||||
log.SetFlags(0)
|
||||
defer func() {
|
||||
log.SetOutput(origOut)
|
||||
log.SetFlags(origFlags)
|
||||
}()
|
||||
|
||||
a := &SourceLivenessState{Tag: "dup", Broker: "tcp://a:1883"}
|
||||
b := &SourceLivenessState{Tag: "dup", Broker: "tcp://b:1883"}
|
||||
|
||||
if ok := registerLivenessOrSkip(a); !ok {
|
||||
t.Fatalf("first registration must succeed; helper returned false (log=%q)", buf.String())
|
||||
}
|
||||
if ok := registerLivenessOrSkip(b); ok {
|
||||
t.Fatalf("second registration with same tag must return false (skip); helper returned true (log=%q)", buf.String())
|
||||
}
|
||||
|
||||
logOut := buf.String()
|
||||
if !strings.Contains(logOut, "ERROR") {
|
||||
t.Errorf("collision must be logged at ERROR severity so operators see it without it crashing the process; got %q", logOut)
|
||||
}
|
||||
if !strings.Contains(logOut, "dup") {
|
||||
t.Errorf("collision log must include the offending tag; got %q", logOut)
|
||||
}
|
||||
if !strings.Contains(strings.ToLower(logOut), "skip") {
|
||||
t.Errorf("collision log must say the duplicate is being skipped so operators know the source is untracked; got %q", logOut)
|
||||
}
|
||||
|
||||
// And the registry still holds the FIRST registration.
|
||||
livenessRegistryMu.RLock()
|
||||
got := livenessRegistry["dup"]
|
||||
livenessRegistryMu.RUnlock()
|
||||
if got != a {
|
||||
t.Errorf("first registration must remain authoritative after collision-skip; got pointer for broker=%s", got.Broker)
|
||||
}
|
||||
}
|
||||
@@ -1,221 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
	"database/sql"
	"encoding/json"
	"errors"
	"log"
	"os"

	"github.com/meshcore-analyzer/mbcapqueue"
)
|
||||
|
||||
// MultibyteCapPersistStats reports what one RunMultibyteCapPersist pass
// did; the counts are meant for /api/healthz exposure and logging.
type MultibyteCapPersistStats struct {
	// ReadEntries is the number of entries read from the snapshot.
	ReadEntries int
	// UpdatedActive is the number of rows updated in nodes.
	UpdatedActive int64
	// UpdatedInactive is the number of rows updated in inactive_nodes.
	UpdatedInactive int64
	// Skipped is the number of entries skipped (status=="unknown").
	Skipped int
}
|
||||
|
||||
// RunMultibyteCapPersist consumes the latest multi-byte capability snapshot
|
||||
// written by the server (internal/mbcapqueue) and persists it to nodes /
|
||||
// inactive_nodes. Owned by the ingestor per #1287: the server is read-only
|
||||
// since #1289 and cannot UPDATE these columns itself.
|
||||
//
|
||||
// INVARIANT (canonical owner): multibyte_sup / multibyte_evidence are
|
||||
// derived/cached columns. The server COMPUTES the value during its
|
||||
// analytics cycle (from observed packets) and writes a snapshot file;
|
||||
// this function is the ONLY runtime path that mutates those columns
|
||||
// (the schema itself is added by internal/dbschema). The server MUST
|
||||
// NOT execute any UPDATE on nodes.multibyte_* — see
|
||||
// cmd/server/readonly_invariant_test.go for the enforcement.
|
||||
//
|
||||
// Data-destruction guard: entries with Status=="unknown" (sup==0) are
|
||||
// NEVER persisted — we never overwrite a previously confirmed/suspected
|
||||
// DB value with a snapshot blank. Same guarantee the original
|
||||
// server-side helper enforced before relocation.
|
||||
//
|
||||
// Safe to call from a ticker; no-op when no snapshot has been written
|
||||
// (cold start), when the snapshot is empty, when the snapshot is
|
||||
// malformed (#1386), or when running against a legacy DB that
|
||||
// pre-dates the multibyte_sup migration (#1386).
|
||||
func (s *Store) RunMultibyteCapPersist() (MultibyteCapPersistStats, error) {
|
||||
var stats MultibyteCapPersistStats
|
||||
snap, err := mbcapqueue.ReadSnapshot(s.path)
|
||||
if err != nil {
|
||||
// os.ErrNotExist is the steady state until the server's first
|
||||
// analytics cycle completes — silent no-op. A malformed file
|
||||
// is operator-actionable: log it (but still no-op, no error
|
||||
// surfaced to the ticker — a corrupt snapshot must not stop
|
||||
// the maintenance loop).
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return stats, nil
|
||||
}
|
||||
// All other ReadSnapshot errors today are wrap-arounds of
|
||||
// io / unmarshal failures — both classify as "malformed
|
||||
// snapshot on disk" from this loop's perspective.
|
||||
var jsonErr *json.SyntaxError
|
||||
if errors.As(err, &jsonErr) || isMalformedSnapshotErr(err) {
|
||||
log.Printf("[multibyte-persist] malformed snapshot on disk (no-op): %v", err)
|
||||
return stats, nil
|
||||
}
|
||||
log.Printf("[multibyte-persist] read snapshot: %v (no-op)", err)
|
||||
return stats, nil
|
||||
}
|
||||
stats.ReadEntries = len(snap.Entries)
|
||||
if len(snap.Entries) == 0 {
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// Defensive schema check: a legacy DB that pre-dates the
|
||||
// multibyte_sup migration would fail at tx.Prepare with a SQL
|
||||
// error. Detect early and skip cleanly so the ticker keeps
|
||||
// running on heterogeneous deployments.
|
||||
if !s.hasMultibyteSupColumns() {
|
||||
log.Printf("[multibyte-persist] schema missing: nodes.multibyte_sup not present on this DB (legacy schema) — skipping %d entries", stats.ReadEntries)
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return stats, err
|
||||
}
|
||||
defer tx.Rollback() //nolint:errcheck
|
||||
// Combined dispatch: each pubkey lives in exactly one of nodes /
|
||||
// inactive_nodes. The pre-#1386 implementation issued one UPDATE
|
||||
// against each table per entry — 50% guaranteed-empty. We now
|
||||
// look up the table once, then issue the matching UPDATE.
|
||||
stmtN, err := tx.Prepare(`UPDATE nodes SET multibyte_sup=?, multibyte_evidence=? WHERE public_key=?`)
|
||||
if err != nil {
|
||||
return stats, err
|
||||
}
|
||||
defer stmtN.Close()
|
||||
stmtI, err := tx.Prepare(`UPDATE inactive_nodes SET multibyte_sup=?, multibyte_evidence=? WHERE public_key=?`)
|
||||
if err != nil {
|
||||
return stats, err
|
||||
}
|
||||
defer stmtI.Close()
|
||||
// Membership probe: one indexed PK lookup. Cheap; avoids the
|
||||
// guaranteed-miss second UPDATE.
|
||||
stmtProbe, err := tx.Prepare(`SELECT 1 FROM nodes WHERE public_key=? LIMIT 1`)
|
||||
if err != nil {
|
||||
return stats, err
|
||||
}
|
||||
defer stmtProbe.Close()
|
||||
|
||||
for _, e := range snap.Entries {
|
||||
sup := multibyteStatusToInt(e.Status)
|
||||
if sup == 0 {
|
||||
stats.Skipped++
|
||||
continue
|
||||
}
|
||||
// Probe once. If hit, UPDATE nodes; else UPDATE inactive_nodes.
|
||||
var hit int
|
||||
if err := stmtProbe.QueryRow(e.PublicKey).Scan(&hit); err == nil {
|
||||
if r, err := stmtN.Exec(sup, e.Evidence, e.PublicKey); err == nil {
|
||||
if n, _ := r.RowsAffected(); n > 0 {
|
||||
stats.UpdatedActive += n
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if r, err := stmtI.Exec(sup, e.Evidence, e.PublicKey); err == nil {
|
||||
if n, _ := r.RowsAffected(); n > 0 {
|
||||
stats.UpdatedInactive += n
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
return stats, err
|
||||
}
|
||||
if stats.UpdatedActive+stats.UpdatedInactive > 0 {
|
||||
log.Printf("[multibyte-persist] applied snapshot: %d entries (%d skipped); updated %d active + %d inactive nodes",
|
||||
stats.ReadEntries, stats.Skipped, stats.UpdatedActive, stats.UpdatedInactive)
|
||||
}
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
// isMalformedSnapshotErr returns true if err looks like a JSON parse /
|
||||
// IO-truncation failure surfaced by mbcapqueue.ReadSnapshot. The
|
||||
// queue wraps errors with %w but mbcapqueue currently formats with
|
||||
// %w only for "read:"/"unmarshal:" prefixes — we substring-match
|
||||
// those so the operator-actionable log message is unambiguous.
|
||||
func isMalformedSnapshotErr(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
msg := err.Error()
|
||||
for _, frag := range []string{"unmarshal", "invalid character", "unexpected end of JSON"} {
|
||||
if containsCI(msg, frag) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// containsCI reports whether sub occurs in s, ignoring ASCII letter
// case. Only the bytes 'A'..'Z' are folded; every other byte (including
// the bytes of multi-byte UTF-8 sequences) must match exactly. An empty
// sub always matches.
//
// The scan is hand-rolled so that this file does not need to import
// the strings package.
func containsCI(s, sub string) bool {
	if sub == "" {
		return true
	}
	// fold maps an ASCII upper-case byte to its lower-case form.
	fold := func(c byte) byte {
		if 'A' <= c && c <= 'Z' {
			return c + ('a' - 'A')
		}
		return c
	}
	lastStart := len(s) - len(sub)
candidates:
	for start := 0; start <= lastStart; start++ {
		for k := 0; k < len(sub); k++ {
			if fold(s[start+k]) != fold(sub[k]) {
				continue candidates
			}
		}
		return true
	}
	return false
}
|
||||
|
||||
// hasMultibyteSupColumns probes whether the active DB carries the
|
||||
// multibyte_sup column on the `nodes` table. Used to short-circuit
|
||||
// RunMultibyteCapPersist on legacy DBs that pre-date the
|
||||
// internal/dbschema migration (#1386).
|
||||
func (s *Store) hasMultibyteSupColumns() bool {
|
||||
rows, err := s.db.Query(`PRAGMA table_info(nodes)`)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer rows.Close()
|
||||
for rows.Next() {
|
||||
var cid int
|
||||
var name, ctype string
|
||||
var notnull, pk int
|
||||
var dflt interface{}
|
||||
if err := rows.Scan(&cid, &name, &ctype, ¬null, &dflt, &pk); err != nil {
|
||||
return false
|
||||
}
|
||||
if name == "multibyte_sup" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// multibyteStatusToInt converts a snapshot status string into the
// integer stored in the multibyte_sup column: "confirmed" maps to 2,
// "suspected" maps to 1, and every other value (including "unknown"
// and the empty string) maps to 0. The comparison is case-sensitive.
// The original comment notes this mirrors the mapping the server used
// before the persist step was relocated.
func multibyteStatusToInt(status string) int {
	if status == "confirmed" {
		return 2
	}
	if status == "suspected" {
		return 1
	}
	return 0
}
|
||||
@@ -1,54 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"database/sql"
|
||||
"log"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// captureLogs points the standard library logger at a fresh in-memory
// buffer and returns that buffer. A cleanup registered on t puts back
// the writer and flags that were in effect when captureLogs was called,
// so the redirection lasts only for the calling test.
func captureLogs(t *testing.T) *bytes.Buffer {
	t.Helper()
	savedWriter, savedFlags := log.Writer(), log.Flags()
	captured := new(bytes.Buffer)
	log.SetOutput(captured)
	t.Cleanup(func() {
		log.SetOutput(savedWriter)
		log.SetFlags(savedFlags)
	})
	return captured
}
|
||||
|
||||
// logContains reports whether the text currently held in buf contains
// substr, comparing both sides after lower-casing them. The buffer is
// only read via String, so its contents are left in place.
func logContains(buf *bytes.Buffer, substr string) bool {
	haystack := strings.ToLower(buf.String())
	needle := strings.ToLower(substr)
	return strings.Contains(haystack, needle)
}
|
||||
|
||||
// columnExists reports whether table has a column named col, using
// `PRAGMA table_info(<table>)` on db. The comparison on the column name
// is exact (case-sensitive).
//
// The table name is concatenated into the PRAGMA text, so it must be a
// trusted identifier supplied by the test itself. Any query, scan or
// iteration failure aborts the calling test via t.Fatalf rather than
// being reported as "column absent".
func columnExists(t *testing.T, db *sql.DB, table, col string) bool {
	t.Helper()
	rows, err := db.Query("PRAGMA table_info(" + table + ")")
	if err != nil {
		t.Fatalf("PRAGMA table_info(%s): %v", table, err)
	}
	defer rows.Close()
	for rows.Next() {
		// table_info yields six columns per row; only name is compared.
		var cid int
		var name, ctype string
		var notnull, pk int
		var dfltValue sql.NullString
		if err := rows.Scan(&cid, &name, &ctype, &notnull, &dfltValue, &pk); err != nil {
			t.Fatalf("scan PRAGMA: %v", err)
		}
		if name == col {
			return true
		}
	}
	// Surface an error that ended iteration early instead of silently
	// answering "no such column".
	if err := rows.Err(); err != nil {
		t.Fatalf("iterate PRAGMA table_info(%s): %v", table, err)
	}
	return false
}
|
||||
@@ -1,369 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/meshcore-analyzer/mbcapqueue"
|
||||
)
|
||||
|
||||
// TestRunMultibyteCapPersist_AppliesSnapshot enforces the architectural
|
||||
// invariant from #1289 + #1322 + #1324 follow-up: the multi-byte
|
||||
// capability columns (multibyte_sup / multibyte_evidence) on
|
||||
// nodes / inactive_nodes MUST be written by the ingestor, NEVER by the
|
||||
// read-only server. The server publishes a snapshot file via
|
||||
// internal/mbcapqueue; the ingestor's maintenance loop applies it here.
|
||||
//
|
||||
// Pre-relocation (PR #1324 as-shipped), the server held a write handle
|
||||
// and executed UPDATE … nodes SET multibyte_sup directly — which is
|
||||
// impossible after #1289 made the server's *sql.DB read-only. This test
|
||||
// asserts the relocated path: snapshot in → UPDATEs out, from the
|
||||
// ingestor side.
|
||||
func TestRunMultibyteCapPersist_AppliesSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Seed two nodes: one active, one inactive.
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, role, last_seen, multibyte_sup, multibyte_evidence)
|
||||
VALUES ('aa11', 'Alpha', 'repeater', '2026-01-01T00:00:00Z', 0, NULL)`); err != nil {
|
||||
t.Fatalf("seed nodes: %v", err)
|
||||
}
|
||||
if _, err := store.db.Exec(`INSERT INTO inactive_nodes (public_key, name, role, last_seen, multibyte_sup, multibyte_evidence)
|
||||
VALUES ('bb22', 'Bravo', 'repeater', '2025-01-01T00:00:00Z', 0, NULL)`); err != nil {
|
||||
t.Fatalf("seed inactive_nodes: %v", err)
|
||||
}
|
||||
// Seed a third node already confirmed, then send "unknown" for it —
|
||||
// the data-destruction guard must keep its DB value.
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, role, last_seen, multibyte_sup, multibyte_evidence)
|
||||
VALUES ('cc33', 'Charlie', 'repeater', '2026-01-01T00:00:00Z', 2, 'advert')`); err != nil {
|
||||
t.Fatalf("seed cc33: %v", err)
|
||||
}
|
||||
|
||||
snap := mbcapqueue.Snapshot{Entries: []mbcapqueue.Entry{
|
||||
{PublicKey: "aa11", Status: "confirmed", Evidence: "advert"},
|
||||
{PublicKey: "bb22", Status: "suspected", Evidence: "path"},
|
||||
{PublicKey: "cc33", Status: "unknown"}, // must NOT overwrite
|
||||
}}
|
||||
if err := mbcapqueue.WriteSnapshot(dbPath, snap); err != nil {
|
||||
t.Fatalf("WriteSnapshot: %v", err)
|
||||
}
|
||||
// Sanity: snapshot file landed where we expect.
|
||||
if _, err := os.Stat(filepath.Join(filepath.Dir(dbPath), mbcapqueue.QueueDirName, mbcapqueue.SnapshotFileName)); err != nil {
|
||||
t.Fatalf("snapshot not on disk: %v", err)
|
||||
}
|
||||
|
||||
stats, err := store.RunMultibyteCapPersist()
|
||||
if err != nil {
|
||||
t.Fatalf("RunMultibyteCapPersist: %v", err)
|
||||
}
|
||||
if stats.ReadEntries != 3 {
|
||||
t.Errorf("ReadEntries = %d, want 3", stats.ReadEntries)
|
||||
}
|
||||
if stats.Skipped != 1 {
|
||||
t.Errorf("Skipped = %d, want 1 (the unknown entry)", stats.Skipped)
|
||||
}
|
||||
if stats.UpdatedActive == 0 {
|
||||
t.Errorf("UpdatedActive = 0; expected aa11 to be updated in nodes")
|
||||
}
|
||||
if stats.UpdatedInactive == 0 {
|
||||
t.Errorf("UpdatedInactive = 0; expected bb22 to be updated in inactive_nodes")
|
||||
}
|
||||
|
||||
// Verify DB state.
|
||||
var sup int
|
||||
var evid string
|
||||
if err := store.db.QueryRow(`SELECT multibyte_sup, COALESCE(multibyte_evidence,'') FROM nodes WHERE public_key='aa11'`).Scan(&sup, &evid); err != nil {
|
||||
t.Fatalf("read aa11: %v", err)
|
||||
}
|
||||
if sup != 2 || evid != "advert" {
|
||||
t.Errorf("aa11 after persist: sup=%d evid=%q, want sup=2 evid=advert", sup, evid)
|
||||
}
|
||||
if err := store.db.QueryRow(`SELECT multibyte_sup, COALESCE(multibyte_evidence,'') FROM inactive_nodes WHERE public_key='bb22'`).Scan(&sup, &evid); err != nil {
|
||||
t.Fatalf("read bb22: %v", err)
|
||||
}
|
||||
if sup != 1 || evid != "path" {
|
||||
t.Errorf("bb22 after persist: sup=%d evid=%q, want sup=1 evid=path", sup, evid)
|
||||
}
|
||||
// Data-destruction guard: cc33 must still be confirmed=2/'advert'.
|
||||
if err := store.db.QueryRow(`SELECT multibyte_sup, COALESCE(multibyte_evidence,'') FROM nodes WHERE public_key='cc33'`).Scan(&sup, &evid); err != nil {
|
||||
t.Fatalf("read cc33: %v", err)
|
||||
}
|
||||
if sup != 2 || evid != "advert" {
|
||||
t.Errorf("cc33 was overwritten by unknown entry: sup=%d evid=%q, want sup=2 evid=advert", sup, evid)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunMultibyteCapPersist_NoSnapshot_NoOp verifies that the persist
|
||||
// step is a clean no-op when the server hasn't written a snapshot yet
|
||||
// (cold start; the analytics cycle takes ~15s after server boot).
|
||||
func TestRunMultibyteCapPersist_NoSnapshot_NoOp(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
stats, err := store.RunMultibyteCapPersist()
|
||||
if err != nil {
|
||||
t.Fatalf("RunMultibyteCapPersist (no snapshot): %v", err)
|
||||
}
|
||||
if stats.ReadEntries != 0 || stats.UpdatedActive != 0 || stats.UpdatedInactive != 0 {
|
||||
t.Errorf("expected zero-valued stats on cold start, got %+v", stats)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunMultibyteCapPersist_RoundTrip exercises the full end-to-end
|
||||
// contract claimed by PR #1324: the server writes a snapshot, the
|
||||
// ingestor persists it, and after a simulated restart (close + reopen
|
||||
// the store) the DB still carries the persisted state.
|
||||
//
|
||||
// The audit (#1386) flagged this as the #1 missing test: the two halves
|
||||
// (persist / read-back) were each tested in isolation, but no single
|
||||
// test proved the persist path produces a database state the loader
|
||||
// can later consume — so a column-rename or snapshot-version drift
|
||||
// would slip past.
|
||||
func TestRunMultibyteCapPersist_RoundTrip(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
|
||||
// --- Phase 1: open store, seed, persist snapshot ---
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, role, last_seen, multibyte_sup, multibyte_evidence)
|
||||
VALUES ('dd44', 'Delta', 'repeater', '2026-01-01T00:00:00Z', 0, NULL)`); err != nil {
|
||||
t.Fatalf("seed: %v", err)
|
||||
}
|
||||
if _, err := store.db.Exec(`INSERT INTO inactive_nodes (public_key, name, role, last_seen, multibyte_sup, multibyte_evidence)
|
||||
VALUES ('ee55', 'Echo', 'companion', '2025-12-01T00:00:00Z', 0, NULL)`); err != nil {
|
||||
t.Fatalf("seed inactive: %v", err)
|
||||
}
|
||||
snap := mbcapqueue.Snapshot{Entries: []mbcapqueue.Entry{
|
||||
{PublicKey: "dd44", Status: "confirmed", Evidence: "advert"},
|
||||
{PublicKey: "ee55", Status: "suspected", Evidence: "path"},
|
||||
}}
|
||||
if err := mbcapqueue.WriteSnapshot(dbPath, snap); err != nil {
|
||||
t.Fatalf("WriteSnapshot: %v", err)
|
||||
}
|
||||
if _, err := store.RunMultibyteCapPersist(); err != nil {
|
||||
t.Fatalf("RunMultibyteCapPersist: %v", err)
|
||||
}
|
||||
// Capture original state for round-trip comparison.
|
||||
var origActiveSup, origInactiveSup int
|
||||
var origActiveEvid, origInactiveEvid string
|
||||
if err := store.db.QueryRow(`SELECT multibyte_sup, COALESCE(multibyte_evidence,'') FROM nodes WHERE public_key='dd44'`).Scan(&origActiveSup, &origActiveEvid); err != nil {
|
||||
t.Fatalf("read dd44 (phase1): %v", err)
|
||||
}
|
||||
if err := store.db.QueryRow(`SELECT multibyte_sup, COALESCE(multibyte_evidence,'') FROM inactive_nodes WHERE public_key='ee55'`).Scan(&origInactiveSup, &origInactiveEvid); err != nil {
|
||||
t.Fatalf("read ee55 (phase1): %v", err)
|
||||
}
|
||||
// Simulate restart: drop the in-memory Store entirely.
|
||||
if err := store.Close(); err != nil {
|
||||
t.Fatalf("Close: %v", err)
|
||||
}
|
||||
|
||||
// --- Phase 2: fresh Store, verify persisted state survived ---
|
||||
store2, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore (reopen): %v", err)
|
||||
}
|
||||
defer store2.Close()
|
||||
var sup int
|
||||
var evid string
|
||||
if err := store2.db.QueryRow(`SELECT multibyte_sup, COALESCE(multibyte_evidence,'') FROM nodes WHERE public_key='dd44'`).Scan(&sup, &evid); err != nil {
|
||||
t.Fatalf("read dd44 after reopen: %v", err)
|
||||
}
|
||||
if sup != origActiveSup || evid != origActiveEvid {
|
||||
t.Errorf("dd44 after restart: sup=%d evid=%q, want sup=%d evid=%q", sup, evid, origActiveSup, origActiveEvid)
|
||||
}
|
||||
if sup != 2 || evid != "advert" {
|
||||
t.Errorf("dd44 after restart: sup=%d evid=%q, want sup=2 evid=advert", sup, evid)
|
||||
}
|
||||
if err := store2.db.QueryRow(`SELECT multibyte_sup, COALESCE(multibyte_evidence,'') FROM inactive_nodes WHERE public_key='ee55'`).Scan(&sup, &evid); err != nil {
|
||||
t.Fatalf("read ee55 after reopen: %v", err)
|
||||
}
|
||||
if sup != origInactiveSup || evid != origInactiveEvid {
|
||||
t.Errorf("ee55 after restart: sup=%d evid=%q, want sup=%d evid=%q", sup, evid, origInactiveSup, origInactiveEvid)
|
||||
}
|
||||
if sup != 1 || evid != "path" {
|
||||
t.Errorf("ee55 after restart: sup=%d evid=%q, want sup=1 evid=path", sup, evid)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunMultibyteCapPersist_MalformedSnapshot verifies the persist
|
||||
// path is safe against a corrupted/truncated snapshot file: it must
|
||||
// return without error (no-op), MUST NOT crash, AND MUST log a warning
|
||||
// distinguishing the malformed case from the steady-state "no
|
||||
// snapshot yet" cold-start case.
|
||||
//
|
||||
// Audit (#1386, kent-beck) flagged: "Snapshot file malformed /
|
||||
// truncated / wrong-version — RunMultibyteCapPersist error vs.
|
||||
// silent-skip behavior is unspecified by any test."
|
||||
func TestRunMultibyteCapPersist_MalformedSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Write malformed JSON directly to the snapshot path.
|
||||
if err := mbcapqueue.EnsureDir(dbPath); err != nil {
|
||||
t.Fatalf("EnsureDir: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(mbcapqueue.SnapshotPath(dbPath), []byte("not-json{{{garbage"), 0o644); err != nil {
|
||||
t.Fatalf("write malformed: %v", err)
|
||||
}
|
||||
|
||||
// Capture log output to assert the warning is emitted.
|
||||
logBuf := captureLogs(t)
|
||||
|
||||
// Must not panic.
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Fatalf("RunMultibyteCapPersist panicked on malformed snapshot: %v", r)
|
||||
}
|
||||
}()
|
||||
stats, err := store.RunMultibyteCapPersist()
|
||||
if err != nil {
|
||||
t.Errorf("RunMultibyteCapPersist on malformed snapshot returned error %v; expected silent no-op", err)
|
||||
}
|
||||
if stats.ReadEntries != 0 || stats.UpdatedActive != 0 || stats.UpdatedInactive != 0 {
|
||||
t.Errorf("expected zero-valued stats on malformed snapshot, got %+v", stats)
|
||||
}
|
||||
if !logContains(logBuf, "malformed") && !logContains(logBuf, "invalid") && !logContains(logBuf, "corrupt") {
|
||||
t.Errorf("expected log to mention malformed/invalid/corrupt snapshot; got: %s", logBuf.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunMultibyteCapPersist_MissingSchemaColumns verifies the persist
|
||||
// path is a clean no-op on a legacy DB that doesn't yet have the
|
||||
// multibyte_sup / multibyte_evidence columns. Currently the persist
|
||||
// would fail at tx.Prepare with a SQL error; the audit requires it
|
||||
// skip cleanly instead.
|
||||
//
|
||||
// We simulate a legacy DB by DROPping the columns post-migration
|
||||
// (SQLite ≥ 3.35 supports ALTER TABLE DROP COLUMN).
|
||||
func TestRunMultibyteCapPersist_MissingSchemaColumns(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Drop the multibyte columns from both tables to simulate a legacy DB.
|
||||
for _, stmt := range []string{
|
||||
`ALTER TABLE nodes DROP COLUMN multibyte_sup`,
|
||||
`ALTER TABLE nodes DROP COLUMN multibyte_evidence`,
|
||||
`ALTER TABLE inactive_nodes DROP COLUMN multibyte_sup`,
|
||||
`ALTER TABLE inactive_nodes DROP COLUMN multibyte_evidence`,
|
||||
} {
|
||||
if _, err := store.db.Exec(stmt); err != nil {
|
||||
t.Fatalf("simulate legacy DB (%q): %v", stmt, err)
|
||||
}
|
||||
}
|
||||
// Confirm columns are gone.
|
||||
if columnExists(t, store.db, "nodes", "multibyte_sup") {
|
||||
t.Fatalf("setup failed: nodes.multibyte_sup still present after DROP")
|
||||
}
|
||||
|
||||
snap := mbcapqueue.Snapshot{Entries: []mbcapqueue.Entry{
|
||||
{PublicKey: "ff66", Status: "confirmed", Evidence: "advert"},
|
||||
}}
|
||||
if err := mbcapqueue.WriteSnapshot(dbPath, snap); err != nil {
|
||||
t.Fatalf("WriteSnapshot: %v", err)
|
||||
}
|
||||
|
||||
logBuf := captureLogs(t)
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Fatalf("RunMultibyteCapPersist panicked on legacy DB: %v", r)
|
||||
}
|
||||
}()
|
||||
stats, err := store.RunMultibyteCapPersist()
|
||||
if err != nil {
|
||||
t.Errorf("RunMultibyteCapPersist on legacy DB returned error %v; expected clean skip", err)
|
||||
}
|
||||
if stats.UpdatedActive != 0 || stats.UpdatedInactive != 0 {
|
||||
t.Errorf("expected zero writes on legacy DB, got %+v", stats)
|
||||
}
|
||||
// Must explicitly detect + log the skip — otherwise the "clean skip"
|
||||
// is silent UPDATE-affected-zero accident, not defensive code.
|
||||
if !logContains(logBuf, "legacy") && !logContains(logBuf, "schema") && !logContains(logBuf, "multibyte_sup") {
|
||||
t.Errorf("expected explicit log on missing schema columns; got: %s", logBuf.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestRunMultibyteCapPersist_PreservesConfirmedOnUnknown is the
|
||||
// data-destruction guard the PR claims to enforce: a snapshot Entry
|
||||
// with status="unknown" must NEVER overwrite an existing "confirmed"
|
||||
// (or "suspected") DB row. The audit's mutation test: revert the
|
||||
// `if sup == 0 { continue }` guard in multibyte_persist.go — this
|
||||
// test must fail.
|
||||
func TestRunMultibyteCapPersist_PreservesConfirmedOnUnknown(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Seed a confirmed active node and a suspected inactive node.
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, role, last_seen, multibyte_sup, multibyte_evidence)
|
||||
VALUES ('gg77', 'Golf', 'repeater', '2026-01-01T00:00:00Z', 2, 'advert')`); err != nil {
|
||||
t.Fatalf("seed gg77: %v", err)
|
||||
}
|
||||
if _, err := store.db.Exec(`INSERT INTO inactive_nodes (public_key, name, role, last_seen, multibyte_sup, multibyte_evidence)
|
||||
VALUES ('hh88', 'Hotel', 'companion', '2025-12-01T00:00:00Z', 1, 'path')`); err != nil {
|
||||
t.Fatalf("seed hh88: %v", err)
|
||||
}
|
||||
|
||||
// Snapshot has only "unknown" entries for both — must skip both.
|
||||
snap := mbcapqueue.Snapshot{Entries: []mbcapqueue.Entry{
|
||||
{PublicKey: "gg77", Status: "unknown"},
|
||||
{PublicKey: "hh88", Status: "unknown"},
|
||||
}}
|
||||
if err := mbcapqueue.WriteSnapshot(dbPath, snap); err != nil {
|
||||
t.Fatalf("WriteSnapshot: %v", err)
|
||||
}
|
||||
|
||||
stats, err := store.RunMultibyteCapPersist()
|
||||
if err != nil {
|
||||
t.Fatalf("RunMultibyteCapPersist: %v", err)
|
||||
}
|
||||
if stats.Skipped != 2 {
|
||||
t.Errorf("Skipped = %d, want 2 (both unknown entries)", stats.Skipped)
|
||||
}
|
||||
if stats.UpdatedActive != 0 || stats.UpdatedInactive != 0 {
|
||||
t.Errorf("expected zero updates, got %+v", stats)
|
||||
}
|
||||
|
||||
// Verify the existing values were NOT clobbered.
|
||||
var sup int
|
||||
var evid string
|
||||
if err := store.db.QueryRow(`SELECT multibyte_sup, COALESCE(multibyte_evidence,'') FROM nodes WHERE public_key='gg77'`).Scan(&sup, &evid); err != nil {
|
||||
t.Fatalf("read gg77: %v", err)
|
||||
}
|
||||
if sup != 2 || evid != "advert" {
|
||||
t.Errorf("gg77 was clobbered by unknown snapshot: sup=%d evid=%q, want sup=2 evid=advert", sup, evid)
|
||||
}
|
||||
if err := store.db.QueryRow(`SELECT multibyte_sup, COALESCE(multibyte_evidence,'') FROM inactive_nodes WHERE public_key='hh88'`).Scan(&sup, &evid); err != nil {
|
||||
t.Fatalf("read hh88: %v", err)
|
||||
}
|
||||
if sup != 1 || evid != "path" {
|
||||
t.Errorf("hh88 was clobbered by unknown snapshot: sup=%d evid=%q, want sup=1 evid=path", sup, evid)
|
||||
}
|
||||
}
|
||||
@@ -1,335 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
	// NeighborEdgesBuilderInterval is the default period between
	// neighbor_edges rebuild ticks. StartNeighborEdgesBuilder falls back
	// to it when given a non-positive interval. Per the original
	// comment, the server reads on the same 60s cadence (see
	// cmd/server/neighbor_recomputer.go).
	NeighborEdgesBuilderInterval = 60 * time.Second

	// neighborBuilderMaxBatch is the LIMIT applied to the observation
	// scan of a single builder tick (#1339). The original comment gives
	// the rationale: with max_open_conns=1 an unbounded scan over a
	// multi-million-row table would hold the SQLite write lock for
	// minutes and starve MQTT ingest; with the cap, a backlog is
	// drained 50k rows at a time over successive ticks.
	neighborBuilderMaxBatch = 50000

	// neighborBuilderSlowTickThreshold is the wall-clock budget for one
	// builder tick. A tick that takes longer is logged as SLOW so a
	// regression of #1339 is noticed quickly; the fuller
	// instrumentation work is tracked in #1340.
	neighborBuilderSlowTickThreshold = 5 * time.Second

	// payloadADVERT is the payload_type value that identifies an ADVERT
	// packet. Per the original comment it duplicates the constant in
	// cmd/server/decoder.go instead of importing it, so the ingestor
	// binary stays independent of the server package.
	payloadADVERT = 0x04
)
|
||||
|
||||
// edgeRow is a single pending upsert into neighbor_edges. Per the
// original comment, the (a, b) pair is already in canonical order
// (a <= b) by the time a row is built.
type edgeRow struct {
	a  string // first endpoint of the edge
	b  string // second endpoint of the edge
	ts string // timestamp of the observation the edge came from
}
|
||||
|
||||
// StartNeighborEdgesBuilder launches the periodic builder. On each
|
||||
// tick it rescans recent observations + transmissions and upserts
|
||||
// derived neighbor_edges rows. Builder is the only writer to
|
||||
// neighbor_edges (#1287).
|
||||
//
|
||||
// The function returns a stop closure. Initial build runs synchronously
|
||||
// before the ticker starts so the server's first snapshot load picks
|
||||
// up real data instead of an empty table.
|
||||
func (s *Store) StartNeighborEdgesBuilder(interval time.Duration) func() {
|
||||
if interval <= 0 {
|
||||
interval = NeighborEdgesBuilderInterval
|
||||
}
|
||||
stop := make(chan struct{})
|
||||
done := make(chan struct{})
|
||||
|
||||
// Synchronous warm-up: on a fresh DB this is a full scan; on a DB
|
||||
// with persisted neighbor_edges (most restarts), the watermark
|
||||
// short-circuits it into a delta scan. Loop until the per-tick
|
||||
// batch cap stops triggering so we drain any backlog before
|
||||
// returning — first server load needs a fully-populated table.
|
||||
wuStart := time.Now()
|
||||
var wuTotal int
|
||||
// Prime the prefix index (#1547) so the very first
|
||||
// InsertTransmission after startup can resolve hop prefixes.
|
||||
if err := s.RefreshPrefixIndex(); err != nil {
|
||||
log.Printf("[neighbor-build] initial prefix-index refresh error: %v", err)
|
||||
}
|
||||
// Prime the neighbor graph (#1560) so the context-aware resolver
|
||||
// has adjacency data on the very first InsertTransmission.
|
||||
if err := s.RefreshNeighborGraph(); err != nil {
|
||||
log.Printf("[neighbor-build] initial neighbor-graph refresh error: %v", err)
|
||||
}
|
||||
for {
|
||||
n, err := s.buildAndPersistNeighborEdges()
|
||||
if err != nil {
|
||||
log.Printf("[neighbor-build] initial build error: %v", err)
|
||||
break
|
||||
}
|
||||
wuTotal += n
|
||||
if n < neighborBuilderMaxBatch {
|
||||
break
|
||||
}
|
||||
}
|
||||
log.Printf("[neighbor-build] initial build: %d edges upserted in %s", wuTotal, time.Since(wuStart))
|
||||
|
||||
var stopOnce sync.Once
|
||||
go func() {
|
||||
defer close(done)
|
||||
t := time.NewTicker(interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-t.C:
|
||||
start := time.Now()
|
||||
// Refresh the prefix index alongside the edges build
|
||||
// (#1547) so new nodes become resolvable within a tick.
|
||||
if err := s.RefreshPrefixIndex(); err != nil {
|
||||
log.Printf("[neighbor-build] prefix-index refresh error: %v", err)
|
||||
}
|
||||
n, err := s.buildAndPersistNeighborEdges()
|
||||
// Refresh the neighbor-graph snapshot after the edges
|
||||
// build (#1560) so the context-aware resolver picks up
|
||||
// newly persisted adjacencies on the next ingest.
|
||||
if grErr := s.RefreshNeighborGraph(); grErr != nil {
|
||||
log.Printf("[neighbor-build] neighbor-graph refresh error: %v", grErr)
|
||||
}
|
||||
dur := time.Since(start)
|
||||
if err != nil {
|
||||
log.Printf("[neighbor-build] tick error after %s: %v", dur, err)
|
||||
} else if n > 0 {
|
||||
log.Printf("[neighbor-build] tick: %d edges in %s (delta from watermark)", n, dur)
|
||||
}
|
||||
if dur > neighborBuilderSlowTickThreshold {
|
||||
log.Printf("[neighbor-build] SLOW tick: %s — possible regression of #1339", dur)
|
||||
}
|
||||
case <-stop:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return func() {
|
||||
stopOnce.Do(func() { close(stop) })
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(5 * time.Second):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// buildAndPersistNeighborEdges scans transmissions + observations,
|
||||
// extracts edge candidates (originator↔first-hop on ADVERTs;
|
||||
// observer↔last-hop on all packet types) and upserts them into
|
||||
// neighbor_edges. Returns count of attempted upserts.
|
||||
//
|
||||
// Watermark / delta semantics (#1339): the builder derives a watermark
|
||||
// from MAX(neighbor_edges.last_seen). On an empty edges table (fresh
|
||||
// DB), watermark is 0 and the builder does a full warm-up scan. On
|
||||
// every subsequent call, the SELECT is restricted to observations
|
||||
// whose timestamp is strictly greater than the watermark, bounded by
|
||||
// neighborBuilderMaxBatch. neighbor_edges itself is the persistence —
|
||||
// no metadata table or in-memory state is required, and restarts
|
||||
// resume cleanly from whatever the table reflects.
|
||||
//
|
||||
// Trade-off (documented for #1340 follow-up): an anomalously-old
|
||||
// observation that arrives AFTER its timestamp has already been
|
||||
// crossed by the watermark will be skipped. Acceptable for an
|
||||
// approximate neighbor graph; a periodic full-rebuild can be added
|
||||
// later if needed.
|
||||
//
|
||||
// Resolution of hop-prefix → full pubkey is done via a one-shot
|
||||
// SELECT of (lowered) pubkey prefixes from nodes. Prefixes with
|
||||
// multiple candidates are skipped (matches the conservative
|
||||
// resolution rule in cmd/server/extractEdgesFromObs).
|
||||
func (s *Store) buildAndPersistNeighborEdges() (int, error) {
|
||||
prefixIdx, err := buildPrefixIndex(s.db)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("build prefix index: %w", err)
|
||||
}
|
||||
|
||||
// Derive the watermark from the existing edges table. RFC3339
|
||||
// → epoch seconds so it can be compared against observations.timestamp
|
||||
// (stored as INTEGER unix epoch). On an empty edges table both the
|
||||
// query and the parse return zero → full warm-up scan.
|
||||
var watermarkRFC sql.NullString
|
||||
if err := s.db.QueryRow(`SELECT MAX(last_seen) FROM neighbor_edges`).Scan(&watermarkRFC); err != nil {
|
||||
return 0, fmt.Errorf("read watermark: %w", err)
|
||||
}
|
||||
var watermarkEpoch int64
|
||||
if watermarkRFC.Valid && watermarkRFC.String != "" {
|
||||
if t, parseErr := time.Parse(time.RFC3339, watermarkRFC.String); parseErr == nil {
|
||||
watermarkEpoch = t.Unix()
|
||||
}
|
||||
}
|
||||
|
||||
rows, err := s.db.Query(`SELECT
|
||||
t.payload_type,
|
||||
t.decoded_json,
|
||||
COALESCE(t.from_pubkey, ''),
|
||||
COALESCE(o.path_json, ''),
|
||||
COALESCE(obs.id, '') AS observer_id,
|
||||
o.timestamp
|
||||
FROM observations o
|
||||
JOIN transmissions t ON t.id = o.transmission_id
|
||||
LEFT JOIN observers obs ON obs.rowid = o.observer_idx
|
||||
WHERE o.timestamp > ?
|
||||
ORDER BY o.timestamp
|
||||
LIMIT ?`, watermarkEpoch, neighborBuilderMaxBatch)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("scan observations: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var edges []edgeRow
|
||||
for rows.Next() {
|
||||
var payloadType sql.NullInt64
|
||||
var decodedJSON, fromPubkey, pathJSON, observerID string
|
||||
var epochTs int64
|
||||
if err := rows.Scan(&payloadType, &decodedJSON, &fromPubkey, &pathJSON, &observerID, &epochTs); err != nil {
|
||||
continue
|
||||
}
|
||||
fromNode := strings.ToLower(fromPubkey)
|
||||
if fromNode == "" {
|
||||
fromNode = strings.ToLower(extractPubkeyFromAdvertJSON(decodedJSON))
|
||||
}
|
||||
isAdvert := payloadType.Valid && payloadType.Int64 == int64(payloadADVERT)
|
||||
ts := time.Unix(epochTs, 0).UTC().Format(time.RFC3339)
|
||||
observerPK := strings.ToLower(observerID)
|
||||
path := parsePathArray(pathJSON)
|
||||
|
||||
if len(path) == 0 {
|
||||
if isAdvert && fromNode != "" && fromNode != observerPK && observerPK != "" {
|
||||
edges = append(edges, canonEdge(fromNode, observerPK, ts))
|
||||
}
|
||||
continue
|
||||
}
|
||||
if isAdvert && fromNode != "" {
|
||||
if resolved, ok := resolvePrefix(prefixIdx, path[0]); ok && resolved != fromNode {
|
||||
edges = append(edges, canonEdge(fromNode, resolved, ts))
|
||||
}
|
||||
}
|
||||
if observerPK != "" {
|
||||
last := path[len(path)-1]
|
||||
if resolved, ok := resolvePrefix(prefixIdx, last); ok && resolved != observerPK {
|
||||
edges = append(edges, canonEdge(observerPK, resolved, ts))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(edges) == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// Wrap the whole edge-persist tx under writer-perf instrumentation
|
||||
// (#1340). Slow neighbor-builder ticks (the #1339 root cause) now
|
||||
// show up on /api/perf under component=neighbor_builder.
|
||||
var inserted int
|
||||
err = s.WriterTx("neighbor_builder", func(tx *sql.Tx) error {
|
||||
stmt, err := tx.Prepare(`INSERT INTO neighbor_edges (node_a, node_b, count, last_seen)
|
||||
VALUES (?, ?, 1, ?)
|
||||
ON CONFLICT(node_a, node_b) DO UPDATE SET
|
||||
count = count + 1,
|
||||
last_seen = MAX(last_seen, excluded.last_seen)`)
|
||||
if err != nil {
|
||||
return fmt.Errorf("prepare: %w", err)
|
||||
}
|
||||
defer stmt.Close()
|
||||
var firstErr error
|
||||
for _, e := range edges {
|
||||
if _, err := stmt.Exec(e.a, e.b, e.ts); err != nil && firstErr == nil {
|
||||
firstErr = err
|
||||
}
|
||||
}
|
||||
if firstErr != nil {
|
||||
return fmt.Errorf("upsert: %w", firstErr)
|
||||
}
|
||||
inserted = len(edges)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return inserted, nil
|
||||
}
|
||||
|
||||
// canonEdge orders the pair so node_a <= node_b (matches the existing
|
||||
// schema convention used by the loader and the bridge recomputer).
|
||||
func canonEdge(a, b, ts string) edgeRow {
|
||||
if a > b {
|
||||
a, b = b, a
|
||||
}
|
||||
return edgeRow{a, b, ts}
|
||||
}
|
||||
|
||||
// parsePathArray decodes a path_json blob into its list of hop strings.
// The empty string, the literal "[]", and any input that fails to
// unmarshal into a []string all yield nil.
func parsePathArray(s string) []string {
	switch s {
	case "", "[]":
		return nil
	}
	var hops []string
	if err := json.Unmarshal([]byte(s), &hops); err != nil {
		return nil
	}
	return hops
}
|
||||
|
||||
// prefixIndex maps a hop prefix (lowercase hex) → all full pubkeys whose
// public_key starts with that prefix. Prefixes with > 1 candidate are
// considered ambiguous and skipped during resolution.
type prefixIndex map[string][]string

// buildPrefixIndex reads nodes.public_key and builds the prefix → pubkey
// map. Each key is indexed under its 1-, 2-, 3-, 4-, 6- and 8-byte
// prefixes (two hex characters per byte), the hop prefix lengths the
// firmware uses. Memory cost is O(nodes × len(prefixLens)).
//
// Keys are lowercased before indexing, and a key that appears more than
// once after lowercasing is indexed only once so that it cannot become
// ambiguous with itself. Rows that fail to scan are skipped; an error
// from the query or from row iteration is returned with a nil index.
func buildPrefixIndex(db *sql.DB) (prefixIndex, error) {
	rows, err := db.Query(`SELECT public_key FROM nodes`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	// Prefix lengths in hex characters, ascending.
	prefixLens := [...]int{1 * 2, 2 * 2, 3 * 2, 4 * 2, 6 * 2, 8 * 2}
	idx := make(prefixIndex, 1024)
	seen := make(map[string]struct{}, 1024)
	for rows.Next() {
		var pk string
		if err := rows.Scan(&pk); err != nil {
			// Best effort: skip rows that cannot be read as a string.
			continue
		}
		pkLower := strings.ToLower(pk)
		if _, dup := seen[pkLower]; dup {
			continue
		}
		seen[pkLower] = struct{}{}
		for _, n := range prefixLens {
			if len(pkLower) < n {
				// Lengths ascend, so every remaining one is too long as well.
				break
			}
			prefix := pkLower[:n]
			idx[prefix] = append(idx[prefix], pkLower)
		}
	}
	// Distinguish "no more rows" from "iteration failed": a truncated
	// index would make ambiguous prefixes look unique.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return idx, nil
}
|
||||
|
||||
// resolvePrefix returns the single resolved pubkey if exactly one
|
||||
// candidate matches, otherwise (zero || multiple), it returns ok=false
|
||||
// (matches the conservative server-side resolver in
|
||||
// cmd/server/extractEdgesFromObs).
|
||||
func resolvePrefix(idx prefixIndex, hop string) (string, bool) {
|
||||
h := strings.ToLower(hop)
|
||||
candidates := idx[h]
|
||||
if len(candidates) != 1 {
|
||||
return "", false
|
||||
}
|
||||
return candidates[0], true
|
||||
}
|
||||
@@ -1,195 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestNeighborEdgesBuilderDeltaScan enforces issue #1339:
|
||||
// after the initial (warm-up) full build, subsequent ticks of
|
||||
// buildAndPersistNeighborEdges MUST scan only observations newer
|
||||
// than the most recent edge already persisted. The watermark is
|
||||
// derived from MAX(neighbor_edges.last_seen) — neighbor_edges itself
|
||||
// is the persistence, no separate metadata table.
|
||||
//
|
||||
// RED expectations:
|
||||
// 1. After warm-up that produces edges, a second build with NO new
|
||||
// observations is a fast no-op (<1s) and writes nothing.
|
||||
// 2. After inserting K observations with timestamps strictly newer
|
||||
// than the prior MAX(last_seen), the next build upserts exactly
|
||||
// K edges in <1s.
|
||||
// 3. Initial build (empty neighbor_edges) still does a full scan
|
||||
// (warm-up preserved).
|
||||
func TestNeighborEdgesBuilderDeltaScan(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("synthetic 100k-row benchmark; skipped in -short")
|
||||
}
|
||||
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "delta.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
if _, err := store.db.Exec(
|
||||
`INSERT INTO nodes (public_key, name) VALUES (?, ?), (?, ?)`,
|
||||
"aaaaaaaaaa", "from-node",
|
||||
"bbbbbbbbbb", "first-hop",
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := store.db.Exec(
|
||||
`INSERT INTO observers (id, name) VALUES (?, ?)`,
|
||||
"obs-1", "observer-1",
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
var obsRowid int64
|
||||
if err := store.db.QueryRow(`SELECT rowid FROM observers WHERE id = ?`, "obs-1").Scan(&obsRowid); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Baseline timestamps: a contiguous block ending at baselineMaxTs.
|
||||
const baseline = 100_000
|
||||
const baselineStartTs int64 = 1735689600 // 2025-01-01 UTC
|
||||
baselineMaxTs := baselineStartTs + int64(baseline) - 1
|
||||
|
||||
tx, err := store.db.Begin()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
txStmt, err := tx.Prepare(`INSERT INTO transmissions
|
||||
(raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json, from_pubkey)
|
||||
VALUES ('', ?, ?, 0, ?, 0, '{}', 'aaaaaaaaaa')`)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
obsStmt, err := tx.Prepare(`INSERT INTO observations
|
||||
(transmission_id, observer_idx, path_json, timestamp) VALUES (?, ?, '["bb"]', ?)`)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for i := 0; i < baseline; i++ {
|
||||
res, err := txStmt.Exec(fmt.Sprintf("h%d", i), baselineStartTs+int64(i), payloadADVERT)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
txID, _ := res.LastInsertId()
|
||||
if _, err := obsStmt.Exec(txID, obsRowid, baselineStartTs+int64(i)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
if err := tx.Commit(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Initial warm-up: drain to completion (StartNeighborEdgesBuilder
|
||||
// does the same — call directly so the test doesn't depend on the
|
||||
// goroutine harness). Full scan allowed because neighbor_edges
|
||||
// starts empty.
|
||||
for {
|
||||
n, err := store.buildAndPersistNeighborEdges()
|
||||
if err != nil {
|
||||
t.Fatalf("warm-up build: %v", err)
|
||||
}
|
||||
if n == 0 || n < 50000 {
|
||||
break
|
||||
}
|
||||
}
|
||||
var edgesAfterWarmup int
|
||||
if err := store.db.QueryRow(`SELECT COUNT(*) FROM neighbor_edges`).Scan(&edgesAfterWarmup); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if edgesAfterWarmup == 0 {
|
||||
t.Fatal("warm-up produced 0 edges; can't establish a watermark")
|
||||
}
|
||||
// Sanity: MAX(last_seen) should reflect the baseline tail timestamp.
|
||||
var maxLastSeen string
|
||||
if err := store.db.QueryRow(`SELECT MAX(last_seen) FROM neighbor_edges`).Scan(&maxLastSeen); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
wantMax := time.Unix(baselineMaxTs, 0).UTC().Format(time.RFC3339)
|
||||
if maxLastSeen != wantMax {
|
||||
t.Fatalf("MAX(last_seen) after warm-up: want %s, got %s", wantMax, maxLastSeen)
|
||||
}
|
||||
|
||||
// Tick #2: NO new observations. Expect no-op + fast.
|
||||
noopStart := time.Now()
|
||||
n2, err := store.buildAndPersistNeighborEdges()
|
||||
if err != nil {
|
||||
t.Fatalf("noop build: %v", err)
|
||||
}
|
||||
noopDur := time.Since(noopStart)
|
||||
if n2 != 0 {
|
||||
t.Fatalf("expected 0 edges on empty-delta tick; got %d (#1339)", n2)
|
||||
}
|
||||
if noopDur > time.Second {
|
||||
t.Fatalf("empty-delta build took %v; expected <1s — builder is "+
|
||||
"still doing a full table scan. (#1339)", noopDur)
|
||||
}
|
||||
|
||||
// Tick #3: insert K observations with timestamps strictly newer
|
||||
// than baselineMaxTs.
|
||||
const delta = 100
|
||||
deltaStartTs := baselineMaxTs + 1
|
||||
tx2, err := store.db.Begin()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
txStmt2, err := tx2.Prepare(`INSERT INTO transmissions
|
||||
(raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json, from_pubkey)
|
||||
VALUES ('', ?, ?, 0, ?, 0, '{}', 'aaaaaaaaaa')`)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
obsStmt2, err := tx2.Prepare(`INSERT INTO observations
|
||||
(transmission_id, observer_idx, path_json, timestamp) VALUES (?, ?, '["bb"]', ?)`)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for i := 0; i < delta; i++ {
|
||||
res, err := txStmt2.Exec(fmt.Sprintf("d%d", i), deltaStartTs+int64(i), payloadADVERT)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
txID, _ := res.LastInsertId()
|
||||
if _, err := obsStmt2.Exec(txID, obsRowid, deltaStartTs+int64(i)); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
if err := tx2.Commit(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
deltaStart := time.Now()
|
||||
n3, err := store.buildAndPersistNeighborEdges()
|
||||
if err != nil {
|
||||
t.Fatalf("delta build: %v", err)
|
||||
}
|
||||
deltaDur := time.Since(deltaStart)
|
||||
// Each ADVERT observation with a non-empty path produces 2 edge
|
||||
// candidates (from↔hop[0] and observer↔hop[-1]). The watermark
|
||||
// must clamp the scan to the delta rows ONLY — anything more
|
||||
// proves the WHERE clause was bypassed.
|
||||
if n3 != delta*2 {
|
||||
t.Fatalf("expected %d edges upserted (delta only, 2 per advert obs); got %d. "+
|
||||
"Builder must only scan observations with timestamp > MAX(neighbor_edges.last_seen). (#1339)",
|
||||
delta*2, n3)
|
||||
}
|
||||
if deltaDur > 500*time.Millisecond {
|
||||
t.Fatalf("delta build of %d rows took %v; expected <500ms. (#1339)", delta, deltaDur)
|
||||
}
|
||||
|
||||
// Sanity: MAX(last_seen) advanced.
|
||||
var maxLastSeen2 string
|
||||
if err := store.db.QueryRow(`SELECT MAX(last_seen) FROM neighbor_edges`).Scan(&maxLastSeen2); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if maxLastSeen2 <= maxLastSeen {
|
||||
t.Fatalf("MAX(last_seen) did not advance: was %s, now %s", maxLastSeen, maxLastSeen2)
|
||||
}
|
||||
}
|
||||
@@ -1,87 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestNeighborEdgesBuilderUpsertsFromObservations enforces issue
|
||||
// #1287 Option 4: the INGESTOR builds neighbor_edges from raw
|
||||
// observations/transmissions and persists them. Server is read-only.
|
||||
//
|
||||
// Synthesize a tiny DB with one ADVERT observation whose path[0]
|
||||
// uniquely resolves to a known node, then assert the builder writes
|
||||
// the expected edge.
|
||||
func TestNeighborEdgesBuilderUpsertsFromObservations(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "build.db")
|
||||
|
||||
// Open via the ingestor's normal opener so applySchema and
|
||||
// dbschema.Apply both run (the builder requires neighbor_edges +
|
||||
// observers.iata etc.).
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Seed two nodes whose pubkey prefixes will be used as hops.
|
||||
if _, err := store.db.Exec(
|
||||
`INSERT INTO nodes (public_key, name) VALUES (?, ?), (?, ?)`,
|
||||
"aaaaaaaaaa", "from-node",
|
||||
"bbbbbbbbbb", "first-hop",
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Seed one observer.
|
||||
if _, err := store.db.Exec(
|
||||
`INSERT INTO observers (id, name) VALUES (?, ?)`,
|
||||
"obs-1", "observer-1",
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
var obsRowid int64
|
||||
if err := store.db.QueryRow(`SELECT rowid FROM observers WHERE id = ?`, "obs-1").Scan(&obsRowid); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Insert one ADVERT transmission with from_pubkey = aaaaa…
|
||||
res, err := store.db.Exec(
|
||||
`INSERT INTO transmissions (raw_hex, hash, first_seen, route_type, payload_type, payload_version, decoded_json, from_pubkey)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
"", "h1", "2026-01-01T00:00:00Z", 0, payloadADVERT, 0, "{}", "aaaaaaaaaa",
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
txID, _ := res.LastInsertId()
|
||||
|
||||
// Insert one observation whose path[0] = "bb" (2-hex prefix unique
|
||||
// to bbbbb… in the nodes table). Expected edge: a↔b.
|
||||
if _, err := store.db.Exec(
|
||||
`INSERT INTO observations (transmission_id, observer_idx, path_json, timestamp) VALUES (?, ?, ?, ?)`,
|
||||
txID, obsRowid, `["bb"]`, int64(1735689600),
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
n, err := store.buildAndPersistNeighborEdges()
|
||||
if err != nil {
|
||||
t.Fatalf("buildAndPersistNeighborEdges: %v", err)
|
||||
}
|
||||
if n == 0 {
|
||||
t.Fatal("expected at least 1 edge upserted, got 0")
|
||||
}
|
||||
|
||||
var got int
|
||||
if err := store.db.QueryRow(`SELECT COUNT(*) FROM neighbor_edges WHERE node_a = ? AND node_b = ?`, "aaaaaaaaaa", "bbbbbbbbbb").Scan(&got); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if got != 1 {
|
||||
t.Fatalf("expected the a↔b edge to be persisted; got %d rows", got)
|
||||
}
|
||||
}
|
||||
|
||||
// (test ends here)
|
||||
|
||||
@@ -1,97 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestNormalizeChannelName(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
// Known channel: "public" should be normalized to "Public"
|
||||
{"public", "Public"},
|
||||
{"Public", "Public"},
|
||||
{"PUBLIC", "Public"},
|
||||
// Hashtag channels should be left untouched
|
||||
{"#LongFast", "#LongFast"},
|
||||
{"#wardrive", "#wardrive"},
|
||||
// Custom/unknown channels should be left untouched
|
||||
{"myChannel", "myChannel"},
|
||||
{"testchannel", "testchannel"},
|
||||
// Empty string
|
||||
{"", ""},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := normalizeChannelName(tt.input)
|
||||
if got != tt.expected {
|
||||
t.Errorf("normalizeChannelName(%q) = %q, want %q", tt.input, got, tt.expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadChannelKeys_NormalizesKnownDisplayNames(t *testing.T) {
|
||||
// Verify that known channel keys with wrong casing get normalized
|
||||
cfg := &Config{
|
||||
ChannelKeys: map[string]string{
|
||||
"public": "8b3387e9c5cdea6ac9e5edbaa115cd72",
|
||||
},
|
||||
}
|
||||
|
||||
keys := loadChannelKeys(cfg, "/dev/null")
|
||||
|
||||
// Should have "Public" (normalized) not "public" (raw)
|
||||
if _, ok := keys["public"]; ok {
|
||||
t.Error("Expected 'public' to be normalized to 'Public'")
|
||||
}
|
||||
if _, ok := keys["Public"]; !ok {
|
||||
t.Error("Expected 'Public' key to exist in loaded channel keys")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadChannelKeys_LeavesCustomNamesUntouched(t *testing.T) {
|
||||
// Verify that custom channel names are NOT normalized
|
||||
cfg := &Config{
|
||||
ChannelKeys: map[string]string{
|
||||
"myCustomChannel": "deadbeef12345678",
|
||||
},
|
||||
}
|
||||
|
||||
keys := loadChannelKeys(cfg, "/dev/null")
|
||||
|
||||
// Should keep "myCustomChannel" as-is
|
||||
if _, ok := keys["myCustomChannel"]; !ok {
|
||||
t.Error("Expected 'myCustomChannel' to be left untouched")
|
||||
}
|
||||
// Should NOT have "MyCustomChannel"
|
||||
if _, ok := keys["MyCustomChannel"]; ok {
|
||||
t.Error("Custom channel names should NOT be auto-capitalized")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadChannelKeys_DuplicateCasingLogsWarning(t *testing.T) {
|
||||
// Verify that config with both "public" and "Public" resolves deterministically:
|
||||
// the canonical (already-normalized) form should win.
|
||||
cfg := &Config{
|
||||
ChannelKeys: map[string]string{
|
||||
"public": "8b3387e9c5cdea6ac9e5edbaa115cd72",
|
||||
"Public": "differentkey1234567",
|
||||
},
|
||||
}
|
||||
|
||||
keys := loadChannelKeys(cfg, "/dev/null")
|
||||
|
||||
// After normalization, only one key should exist: "Public"
|
||||
// The canonical form ("Public") should win over the lowercase form ("public")
|
||||
if _, ok := keys["public"]; ok {
|
||||
t.Error("Expected 'public' to be normalized away")
|
||||
}
|
||||
if _, ok := keys["Public"]; !ok {
|
||||
t.Error("Expected 'Public' key to exist")
|
||||
}
|
||||
// Assert the canonical form's value won, not just any value
|
||||
if keys["Public"] != "differentkey1234567" {
|
||||
t.Errorf("Expected canonical 'Public' value to win, got %q", keys["Public"])
|
||||
}
|
||||
}
|
||||
@@ -1,43 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestIngestorIsObserverBlacklisted(t *testing.T) {
|
||||
cfg := &Config{
|
||||
ObserverBlacklist: []string{"OBS1", "obs2"},
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
id string
|
||||
want bool
|
||||
}{
|
||||
{"OBS1", true},
|
||||
{"obs1", true},
|
||||
{"OBS2", true},
|
||||
{"obs3", false},
|
||||
{"", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := cfg.IsObserverBlacklisted(tt.id)
|
||||
if got != tt.want {
|
||||
t.Errorf("IsObserverBlacklisted(%q) = %v, want %v", tt.id, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestorIsObserverBlacklistedEmpty(t *testing.T) {
|
||||
cfg := &Config{}
|
||||
if cfg.IsObserverBlacklisted("anything") {
|
||||
t.Error("empty blacklist should not match")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestorIsObserverBlacklistedNil(t *testing.T) {
|
||||
var cfg *Config
|
||||
if cfg.IsObserverBlacklisted("anything") {
|
||||
t.Error("nil config should not match")
|
||||
}
|
||||
}
|
||||
@@ -1,109 +0,0 @@
|
||||
package main
|
||||
|
||||
// Regression tests for issue #1465 — observer.last_seen MUST always reflect
|
||||
// ingest time (server wall clock), never the MQTT envelope timestamp. Observers
|
||||
// with broken clocks (wrong TZ, RTC drift, replayed retained messages) must
|
||||
// NOT be able to drag the analyzer's "last heard from" field into the past
|
||||
// or future.
|
||||
//
|
||||
// Per-packet rxTime semantics (envelope time with naive-clamp from #1464)
|
||||
// are out of scope here — those continue to use envelope time. This file
|
||||
// asserts only the observer.last_seen path.
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Status path: envelope timestamp is a well-formed RFC3339 value 3h in the
|
||||
// past. observer.last_seen must be server wall clock, NOT the envelope value.
|
||||
func TestStatusMessage_ObserverLastSeen_AlwaysIngestTime_PastEnvelope_1465(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
stale := time.Now().UTC().Add(-3 * time.Hour).Format(time.RFC3339)
|
||||
before := time.Now().Unix()
|
||||
|
||||
payload := []byte(`{"status":"online","origin":"obs-past","timestamp":"` + stale + `"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs-past/status", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
after := time.Now().Unix()
|
||||
|
||||
var lastSeen string
|
||||
if err := store.db.QueryRow(`SELECT last_seen FROM observers WHERE id = ?`, "obs-past").Scan(&lastSeen); err != nil {
|
||||
t.Fatalf("scan last_seen: %v", err)
|
||||
}
|
||||
ls, err := time.Parse(time.RFC3339, lastSeen)
|
||||
if err != nil {
|
||||
t.Fatalf("last_seen %q not RFC3339: %v", lastSeen, err)
|
||||
}
|
||||
if ls.Unix() < before-5 || ls.Unix() > after+5 {
|
||||
t.Errorf("observer.last_seen = %q (epoch %d); want in [%d, %d] (server wall clock). "+
|
||||
"Envelope reported well-formed stale %q (3h ago) — must NOT drag last_seen into the past. Issue #1465.",
|
||||
lastSeen, ls.Unix(), before, after, stale)
|
||||
}
|
||||
}
|
||||
|
||||
// Status path: envelope timestamp 5 min in the future. observer.last_seen
|
||||
// must still be server wall clock.
|
||||
func TestStatusMessage_ObserverLastSeen_AlwaysIngestTime_FutureEnvelope_1465(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
future := time.Now().UTC().Add(5 * time.Minute).Format(time.RFC3339)
|
||||
before := time.Now().Unix()
|
||||
|
||||
payload := []byte(`{"status":"online","origin":"obs-future","timestamp":"` + future + `"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs-future/status", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
after := time.Now().Unix()
|
||||
|
||||
var lastSeen string
|
||||
if err := store.db.QueryRow(`SELECT last_seen FROM observers WHERE id = ?`, "obs-future").Scan(&lastSeen); err != nil {
|
||||
t.Fatalf("scan last_seen: %v", err)
|
||||
}
|
||||
ls, err := time.Parse(time.RFC3339, lastSeen)
|
||||
if err != nil {
|
||||
t.Fatalf("last_seen %q not RFC3339: %v", lastSeen, err)
|
||||
}
|
||||
if ls.Unix() < before-5 || ls.Unix() > after+5 {
|
||||
t.Errorf("observer.last_seen = %q (epoch %d); want in [%d, %d] (server wall clock). "+
|
||||
"Envelope reported well-formed future %q (5 min ahead) — must NOT drag last_seen into the future. Issue #1465.",
|
||||
lastSeen, ls.Unix(), before, after, future)
|
||||
}
|
||||
}
|
||||
|
||||
// Packet path: a transmission whose envelope timestamp is 3h in the past
|
||||
// MUST still bump observer.last_seen to server wall clock — observer is
|
||||
// clearly alive (we just ingested a packet from it), regardless of what
|
||||
// its clock claims.
|
||||
func TestPacketMessage_ObserverLastSeen_AlwaysIngestTime_PastEnvelope_1465(t *testing.T) {
|
||||
store := newTestStore(t)
|
||||
source := MQTTSource{Name: "test"}
|
||||
|
||||
stale := time.Now().UTC().Add(-3 * time.Hour).Format(time.RFC3339)
|
||||
before := time.Now().Unix()
|
||||
|
||||
rawHex := "0A00D69FD7A5A7475DB07337749AE61FA53A4788E976"
|
||||
payload := []byte(`{"raw":"` + rawHex + `","SNR":5.5,"RSSI":-100.0,"origin":"obs-pkt","timestamp":"` + stale + `"}`)
|
||||
msg := &mockMessage{topic: "meshcore/SJC/obs-pkt/packets", payload: payload}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, &Config{})
|
||||
after := time.Now().Unix()
|
||||
|
||||
var lastSeen string
|
||||
if err := store.db.QueryRow(`SELECT last_seen FROM observers WHERE id = ?`, "obs-pkt").Scan(&lastSeen); err != nil {
|
||||
t.Fatalf("scan last_seen: %v", err)
|
||||
}
|
||||
ls, err := time.Parse(time.RFC3339, lastSeen)
|
||||
if err != nil {
|
||||
t.Fatalf("last_seen %q not RFC3339: %v", lastSeen, err)
|
||||
}
|
||||
if ls.Unix() < before-5 || ls.Unix() > after+5 {
|
||||
t.Errorf("packet-path observer.last_seen = %q (epoch %d); want in [%d, %d] (server wall clock). "+
|
||||
"Envelope stale = %q. Observer just delivered a packet; last_seen must be NOW. Issue #1465.",
|
||||
lastSeen, ls.Unix(), before, after, stale)
|
||||
}
|
||||
}
|
||||
@@ -1,96 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Regression test for #1044: observer metadata (model, firmware, battery_mv,
|
||||
// noise_floor) is silently dropped when an MQTT status payload arrives, even
|
||||
// though the same payload's `radio` and `client_version` fields ARE persisted.
|
||||
//
|
||||
// Real-world payload captured from the production MQTT bridge:
|
||||
//
|
||||
// {"status":"online","origin":"TestObserver","origin_id":"AABBCCDD",
|
||||
// "radio":"910.5250244,62.5,7,5",
|
||||
// "model":"Heltec V3",
|
||||
// "firmware_version":"1.12.0-test",
|
||||
// "client_version":"meshcoretomqtt/1.0.8.0",
|
||||
// "stats":{"battery_mv":4209,"uptime_secs":75821,"noise_floor":-109,
|
||||
// "tx_air_secs":80,"rx_air_secs":1903,"recv_errors":934}}
|
||||
func TestStatusMessageMetadataPersisted_Issue1044(t *testing.T) {
|
||||
const payload = `{"status":"online","origin":"TestObserver","origin_id":"AABBCCDD","radio":"910.5250244,62.5,7,5","model":"Heltec V3","firmware_version":"1.12.0-test","client_version":"meshcoretomqtt/1.0.8.0","stats":{"battery_mv":4209,"uptime_secs":75821,"noise_floor":-109,"tx_air_secs":80,"rx_air_secs":1903,"recv_errors":934}}`
|
||||
|
||||
var msg map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(payload), &msg); err != nil {
|
||||
t.Fatalf("unmarshal: %v", err)
|
||||
}
|
||||
|
||||
meta := extractObserverMeta(msg)
|
||||
if meta == nil {
|
||||
t.Fatal("extractObserverMeta returned nil for a payload that contains model/firmware/battery_mv")
|
||||
}
|
||||
if meta.Model == nil || *meta.Model != "Heltec V3" {
|
||||
t.Errorf("meta.Model = %v, want \"Heltec V3\"", meta.Model)
|
||||
}
|
||||
if meta.Firmware == nil || *meta.Firmware != "1.12.0-test" {
|
||||
t.Errorf("meta.Firmware = %v, want \"1.12.0-test\"", meta.Firmware)
|
||||
}
|
||||
if meta.ClientVersion == nil || *meta.ClientVersion != "meshcoretomqtt/1.0.8.0" {
|
||||
t.Errorf("meta.ClientVersion = %v, want \"meshcoretomqtt/1.0.8.0\"", meta.ClientVersion)
|
||||
}
|
||||
if meta.Radio == nil || *meta.Radio != "910.5250244,62.5,7,5" {
|
||||
t.Errorf("meta.Radio = %v, want radio string", meta.Radio)
|
||||
}
|
||||
if meta.BatteryMv == nil || *meta.BatteryMv != 4209 {
|
||||
t.Errorf("meta.BatteryMv = %v, want 4209", meta.BatteryMv)
|
||||
}
|
||||
if meta.NoiseFloor == nil || *meta.NoiseFloor != -109 {
|
||||
t.Errorf("meta.NoiseFloor = %v, want -109", meta.NoiseFloor)
|
||||
}
|
||||
if meta.UptimeSecs == nil || *meta.UptimeSecs != 75821 {
|
||||
t.Errorf("meta.UptimeSecs = %v, want 75821", meta.UptimeSecs)
|
||||
}
|
||||
|
||||
// Now drive the meta through UpsertObserver and verify the row.
|
||||
s, err := OpenStore(tempDBPath(t))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer s.Close()
|
||||
|
||||
if err := s.UpsertObserver("AABBCCDD", "TestObserver", "SJC", meta); err != nil {
|
||||
t.Fatalf("UpsertObserver: %v", err)
|
||||
}
|
||||
|
||||
var (
|
||||
gotModel, gotFirmware, gotClientVersion, gotRadio string
|
||||
gotBattery int
|
||||
gotUptime int64
|
||||
gotNoise float64
|
||||
)
|
||||
err = s.db.QueryRow(`SELECT model, firmware, client_version, radio,
|
||||
battery_mv, uptime_secs, noise_floor
|
||||
FROM observers WHERE id = 'AABBCCDD'`).Scan(
|
||||
&gotModel, &gotFirmware, &gotClientVersion, &gotRadio,
|
||||
&gotBattery, &gotUptime, &gotNoise,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("scan observer row: %v", err)
|
||||
}
|
||||
if gotModel != "Heltec V3" {
|
||||
t.Errorf("DB model = %q, want \"Heltec V3\"", gotModel)
|
||||
}
|
||||
if gotFirmware != "1.12.0-test" {
|
||||
t.Errorf("DB firmware = %q, want \"1.12.0-test\"", gotFirmware)
|
||||
}
|
||||
if gotBattery != 4209 {
|
||||
t.Errorf("DB battery_mv = %d, want 4209", gotBattery)
|
||||
}
|
||||
if gotUptime != 75821 {
|
||||
t.Errorf("DB uptime_secs = %d, want 75821", gotUptime)
|
||||
}
|
||||
if gotNoise != -109 {
|
||||
t.Errorf("DB noise_floor = %f, want -109", gotNoise)
|
||||
}
|
||||
}
|
||||
@@ -1,225 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// Context-aware hop resolver — full restore of pre-#1289 hop
|
||||
// disambiguation semantics, ported into the ingestor (where the
|
||||
// neighbor graph + node directory now live, per #1283).
|
||||
//
|
||||
// Why this exists (issues #1547 / #1560):
|
||||
// The naive `resolvePath` only resolves hops whose prefix is unique
|
||||
// in the node table. On a >2K-node mesh the dominant case is 1-byte
|
||||
// prefix collisions (multiple candidates per prefix). Without
|
||||
// adjacency disambiguation those hops always serialize as `nil`
|
||||
// and the resolved_path remains effectively empty for the largest
|
||||
// meshes — the very deployments that need it most.
|
||||
//
|
||||
// Algorithm (ported from cmd/server/store.go @ commit 450236d5
|
||||
// `pm.resolveWithContext`, intersected with the disambiguation gating
|
||||
// from PR #1144 / #1352):
|
||||
//
|
||||
// For each hop:
|
||||
// 1. Collect candidate pubkeys by prefix-match (existing prefixIndex).
|
||||
// 2. len==0 → nil.
|
||||
// 3. len==1 → that pubkey.
|
||||
// 4. len>1 → filter by NeighborGraph adjacency to the anchor:
|
||||
// - hop 0 anchor = fromPubkey (ADVERT originator) if known;
|
||||
// - hop i (i>0) anchor = previous resolved hop's pubkey;
|
||||
// if the previous hop did not resolve, the chain breaks
|
||||
// and subsequent >1-candidate hops fall to nil.
|
||||
// Surviving candidates after filter:
|
||||
// - exactly 1 → use it
|
||||
// - 0 or >1 → nil (cannot disambiguate further)
|
||||
//
|
||||
// This is the conservative tier-1 variant. Pre-#1289 also carried
|
||||
// tier-2 (geo proximity), tier-3 (GPS preference), tier-4 (obs-count
|
||||
// fallback) — those were noisy in practice and are intentionally NOT
|
||||
// ported here; this PR is a regression restore, not an enhancement.
|
||||
|
||||
// NeighborGraph is the in-memory adjacency snapshot consulted by the
// context-aware resolver. Keys are stored lowercased: adj[x] is the set
// of nodes that share an edge with x.
type NeighborGraph struct {
	adj map[string]map[string]struct{}
}

// NewNeighborGraph returns a graph with no edges and a ready-to-use
// adjacency map.
func NewNeighborGraph() *NeighborGraph {
	g := new(NeighborGraph)
	g.adj = map[string]map[string]struct{}{}
	return g
}
|
||||
|
||||
// AddEdge adds an undirected adjacency a↔b. Self-loops and empty
|
||||
// endpoints are ignored.
|
||||
func (g *NeighborGraph) AddEdge(a, b string) {
|
||||
a = strings.ToLower(a)
|
||||
b = strings.ToLower(b)
|
||||
if a == "" || b == "" || a == b {
|
||||
return
|
||||
}
|
||||
if g.adj[a] == nil {
|
||||
g.adj[a] = make(map[string]struct{})
|
||||
}
|
||||
if g.adj[b] == nil {
|
||||
g.adj[b] = make(map[string]struct{})
|
||||
}
|
||||
g.adj[a][b] = struct{}{}
|
||||
g.adj[b][a] = struct{}{}
|
||||
}
|
||||
|
||||
// IsAdjacent reports whether a and b appear together in any neighbor edge.
|
||||
func (g *NeighborGraph) IsAdjacent(a, b string) bool {
|
||||
if g == nil {
|
||||
return false
|
||||
}
|
||||
a = strings.ToLower(a)
|
||||
b = strings.ToLower(b)
|
||||
if a == "" || b == "" {
|
||||
return false
|
||||
}
|
||||
nbrs, ok := g.adj[a]
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
_, present := nbrs[b]
|
||||
return present
|
||||
}
|
||||
|
||||
// neighborGraphHolder caches the graph for the InsertTransmission hot
|
||||
// path. atomic.Value lets the 60s rebuild publish without a read-side
|
||||
// lock.
|
||||
type neighborGraphHolder struct {
|
||||
v atomic.Value // holds *NeighborGraph
|
||||
}
|
||||
|
||||
func (h *neighborGraphHolder) load() *NeighborGraph {
|
||||
if v := h.v.Load(); v != nil {
|
||||
return v.(*NeighborGraph)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *neighborGraphHolder) store(g *NeighborGraph) {
|
||||
h.v.Store(g)
|
||||
}
|
||||
|
||||
// loadNeighborGraph reads neighbor_edges and returns an in-memory
|
||||
// adjacency snapshot. Safe to call against a fresh DB (returns an
|
||||
// empty graph).
|
||||
func loadNeighborGraph(db *sql.DB) (*NeighborGraph, error) {
|
||||
rows, err := db.Query(`SELECT node_a, node_b FROM neighbor_edges`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
g := NewNeighborGraph()
|
||||
for rows.Next() {
|
||||
var a, b string
|
||||
if err := rows.Scan(&a, &b); err != nil {
|
||||
continue
|
||||
}
|
||||
g.AddEdge(a, b)
|
||||
}
|
||||
return g, nil
|
||||
}
|
||||
|
||||
// resolveHopWithContext resolves a single hop using NeighborGraph
|
||||
// adjacency to the anchor. Returns nil when the hop cannot be
|
||||
// disambiguated.
|
||||
//
|
||||
// exclude is a set of pubkeys to discard from the candidate pool
|
||||
// (typically the prior hops already resolved on the path — a packet
|
||||
// does not revisit a node).
|
||||
//
|
||||
// Behavior matrix:
|
||||
// len(candidates) | anchor | graph | result
|
||||
// 0 | — | — | nil
|
||||
// 1 | — | — | candidates[0]
|
||||
// >1 | "" or no graph|— | nil
|
||||
// >1 | non-empty | set | unique adjacent candidate
|
||||
// (or nil if 0 or >1 survive)
|
||||
func resolveHopWithContext(hop string, anchor string, graph *NeighborGraph, idx prefixIndex, exclude map[string]struct{}) *string {
|
||||
if idx == nil {
|
||||
return nil
|
||||
}
|
||||
h := strings.ToLower(hop)
|
||||
candidates := idx[h]
|
||||
switch len(candidates) {
|
||||
case 0:
|
||||
return nil
|
||||
case 1:
|
||||
pk := candidates[0]
|
||||
if _, skip := exclude[pk]; skip {
|
||||
return nil
|
||||
}
|
||||
return &pk
|
||||
}
|
||||
if graph == nil || anchor == "" {
|
||||
return nil
|
||||
}
|
||||
var match string
|
||||
survivors := 0
|
||||
for _, cand := range candidates {
|
||||
if _, skip := exclude[cand]; skip {
|
||||
continue
|
||||
}
|
||||
if graph.IsAdjacent(anchor, cand) {
|
||||
survivors++
|
||||
if survivors > 1 {
|
||||
return nil
|
||||
}
|
||||
match = cand
|
||||
}
|
||||
}
|
||||
if survivors == 1 {
|
||||
return &match
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// resolvePathWithContext walks the hop list, anchoring hop 0 on
|
||||
// fromPubkey (for ADVERTs) and each subsequent hop on the previous
|
||||
// resolved hop. Previously-resolved pubkeys (plus the originator) are
|
||||
// excluded from later candidate pools so the walk doesn't revisit a
|
||||
// node. Returns a `[]*string` shape compatible with
|
||||
// marshalResolvedPath (and the all-nil clobber-guard from PR #1548).
|
||||
func resolvePathWithContext(hops []string, fromPubkey string, graph *NeighborGraph, idx prefixIndex) []*string {
|
||||
if len(hops) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]*string, len(hops))
|
||||
if idx == nil {
|
||||
return out
|
||||
}
|
||||
prevAnchor := strings.ToLower(fromPubkey)
|
||||
seen := make(map[string]struct{}, len(hops)+1)
|
||||
if prevAnchor != "" {
|
||||
seen[prevAnchor] = struct{}{}
|
||||
}
|
||||
for i, hop := range hops {
|
||||
r := resolveHopWithContext(hop, prevAnchor, graph, idx, seen)
|
||||
out[i] = r
|
||||
if r != nil {
|
||||
lc := strings.ToLower(*r)
|
||||
seen[lc] = struct{}{}
|
||||
prevAnchor = lc
|
||||
} else {
|
||||
prevAnchor = ""
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// RefreshNeighborGraph loads the latest neighbor_edges snapshot and
|
||||
// publishes it atomically. Called on startup and once per neighbor-
|
||||
// edges builder tick (60s) alongside RefreshPrefixIndex.
|
||||
func (s *Store) RefreshNeighborGraph() error {
|
||||
g, err := loadNeighborGraph(s.db)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.neighborGraph.store(g)
|
||||
return nil
|
||||
}
|
||||
@@ -1,106 +0,0 @@
|
||||
// Package main: ingestor-side processor for prune-request marker files
|
||||
// written by the read-only server (see internal/prunequeue).
|
||||
//
|
||||
// The server cannot DELETE because it opens SQLite mode=ro (#1283/#1289).
|
||||
// Instead, the server writes request-<id>.json under <dataDir>/prune-requests/
|
||||
// and the ingestor consumes it here.
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/meshcore-analyzer/prunequeue"
|
||||
)
|
||||
|
||||
// DeleteNodesByPubkeys deletes nodes by public key. Returns the count deleted.
|
||||
// Only the ingestor calls this (server has no write handle).
|
||||
func (s *Store) DeleteNodesByPubkeys(pubkeys []string) (int64, error) {
|
||||
if len(pubkeys) == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
// Chunk to keep statements under SQLite's variable limit (default 999).
|
||||
const chunk = 500
|
||||
var total int64
|
||||
for start := 0; start < len(pubkeys); start += chunk {
|
||||
end := start + chunk
|
||||
if end > len(pubkeys) {
|
||||
end = len(pubkeys)
|
||||
}
|
||||
batch := pubkeys[start:end]
|
||||
placeholders := strings.Repeat("?,", len(batch))
|
||||
placeholders = placeholders[:len(placeholders)-1]
|
||||
args := make([]interface{}, len(batch))
|
||||
for i, pk := range batch {
|
||||
args[i] = pk
|
||||
}
|
||||
// Cascade cleanup: a node row carries the canonical identity, but
|
||||
// observations/transmissions reference the pubkey too via observer
|
||||
// metadata and originator fields. There are no FK constraints in
|
||||
// the current schema (#669 review note), so we explicitly clear
|
||||
// the most obvious follow-on rows that would otherwise become
|
||||
// orphans visible to operators.
|
||||
//
|
||||
// Conservative scope: only the `nodes` row is removed here. The
|
||||
// referenced observation/transmission history is retained for
|
||||
// audit; operators can run the regular packet-retention prune to
|
||||
// age it out. If a future schema introduces FKs, revisit.
|
||||
res, err := s.db.Exec("DELETE FROM nodes WHERE public_key IN ("+placeholders+")", args...)
|
||||
if err != nil {
|
||||
return total, fmt.Errorf("delete batch [%d:%d]: %w", start, end, err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
total += n
|
||||
}
|
||||
return total, nil
|
||||
}
|
||||
|
||||
// RunPendingPruneRequests scans the prune-requests/ directory next to the
|
||||
// SQLite database and processes any request-<id>.json markers written by
|
||||
// the server. Each request is honored verbatim — the server is responsible
|
||||
// for the TOCTOU snapshot (only pubkeys that were still outside the
|
||||
// geofilter at confirm time). After running DELETE, the ingestor writes
|
||||
// result-<id>.json and removes the request file (atomic, via os.Rename in
|
||||
// prunequeue.WriteResult).
|
||||
//
|
||||
// Safe to call from a ticker — no-op when the queue is empty.
|
||||
func (s *Store) RunPendingPruneRequests() {
|
||||
paths, err := prunequeue.ListPending(s.path)
|
||||
if err != nil {
|
||||
log.Printf("[prune-queue] list pending failed: %v", err)
|
||||
return
|
||||
}
|
||||
if len(paths) == 0 {
|
||||
return
|
||||
}
|
||||
for _, p := range paths {
|
||||
req, err := prunequeue.ReadRequest(p)
|
||||
if err != nil {
|
||||
log.Printf("[prune-queue] read %s failed: %v — removing", p, err)
|
||||
_ = os.Remove(p)
|
||||
continue
|
||||
}
|
||||
log.Printf("[prune-queue] processing request %s: %d pubkey(s) (%s)",
|
||||
req.ID, len(req.Pubkeys), req.Reason)
|
||||
start := time.Now()
|
||||
deleted, derr := s.DeleteNodesByPubkeys(req.Pubkeys)
|
||||
res := prunequeue.Result{
|
||||
ID: req.ID,
|
||||
RequestedAt: req.RequestedAt,
|
||||
CompletedAt: time.Now().UTC(),
|
||||
Deleted: deleted,
|
||||
}
|
||||
if derr != nil {
|
||||
res.Error = derr.Error()
|
||||
log.Printf("[prune-queue] request %s FAILED after %s: %v", req.ID, time.Since(start), derr)
|
||||
} else {
|
||||
log.Printf("[prune-queue] request %s deleted %d node(s) in %s", req.ID, deleted, time.Since(start))
|
||||
}
|
||||
if werr := prunequeue.WriteResult(s.path, res); werr != nil {
|
||||
log.Printf("[prune-queue] write result for %s failed: %v", req.ID, werr)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,77 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/meshcore-analyzer/prunequeue"
|
||||
)
|
||||
|
||||
func TestRunPendingPruneRequests(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Seed two nodes; one will be pruned, one will be kept.
|
||||
if _, err := store.db.Exec(`INSERT INTO nodes (public_key, name, role, lat, lon, last_seen, first_seen)
|
||||
VALUES ('aaaa', 'gone', 'companion', 1.0, 1.0, '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z'),
|
||||
('bbbb', 'kept', 'companion', 2.0, 2.0, '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')`); err != nil {
|
||||
t.Fatalf("seed: %v", err)
|
||||
}
|
||||
|
||||
id := prunequeue.NewID()
|
||||
if err := prunequeue.WriteRequest(dbPath, prunequeue.Request{
|
||||
ID: id,
|
||||
RequestedAt: time.Now().UTC(),
|
||||
Reason: "geo-prune-test",
|
||||
Pubkeys: []string{"aaaa"},
|
||||
}); err != nil {
|
||||
t.Fatalf("WriteRequest: %v", err)
|
||||
}
|
||||
|
||||
store.RunPendingPruneRequests()
|
||||
|
||||
// Request file gone, result file present.
|
||||
if exists, _ := prunequeue.RequestExists(dbPath, id); exists {
|
||||
t.Error("request file should have been consumed")
|
||||
}
|
||||
res, err := prunequeue.ReadResult(dbPath, id)
|
||||
if err != nil || res == nil {
|
||||
t.Fatalf("ReadResult: res=%v err=%v", res, err)
|
||||
}
|
||||
if res.Deleted != 1 {
|
||||
t.Errorf("expected Deleted=1, got %d", res.Deleted)
|
||||
}
|
||||
if res.Error != "" {
|
||||
t.Errorf("unexpected error: %s", res.Error)
|
||||
}
|
||||
|
||||
// Verify DB state: aaaa gone, bbbb kept.
|
||||
var n int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM nodes WHERE public_key='aaaa'").Scan(&n)
|
||||
if n != 0 {
|
||||
t.Errorf("expected 'aaaa' deleted, got count=%d", n)
|
||||
}
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM nodes WHERE public_key='bbbb'").Scan(&n)
|
||||
if n != 1 {
|
||||
t.Errorf("expected 'bbbb' kept, got count=%d", n)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunPendingPruneRequests_EmptyQueueIsNoop(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "test.db")
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
// Must not panic / error on empty queue.
|
||||
store.RunPendingPruneRequests()
|
||||
}
|
||||
@@ -1,63 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// #1483: server's GetNodeLocationsByKeys lookup relies on stored
|
||||
// public_key being lowercase (LOWER(public_key) was dropped for perf).
|
||||
// The ingestor must normalize any legacy uppercase rows on boot so
|
||||
// the lookup remains correct.
|
||||
func TestPublicKeyLowercaseNormalizationMigration(t *testing.T) {
|
||||
dbPath := tempDBPath(t)
|
||||
s, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("first OpenStore: %v", err)
|
||||
}
|
||||
// Seed an uppercase row directly, bypassing UpsertNode's lowercase.
|
||||
if _, err := s.db.Exec(
|
||||
`INSERT INTO nodes (public_key, name, role, last_seen, first_seen)
|
||||
VALUES ('AABBCCDDEEFF11223344', 'mixed-case-node', 'companion', '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')`,
|
||||
); err != nil {
|
||||
t.Fatalf("seed uppercase row: %v", err)
|
||||
}
|
||||
// Sanity: verify the uppercase row is there pre-normalization.
|
||||
var pk string
|
||||
if err := s.db.QueryRow(`SELECT public_key FROM nodes WHERE public_key = 'AABBCCDDEEFF11223344'`).Scan(&pk); err != nil {
|
||||
t.Fatalf("pre-check select: %v", err)
|
||||
}
|
||||
if pk != "AABBCCDDEEFF11223344" {
|
||||
t.Fatalf("pre-check: expected uppercase, got %s", pk)
|
||||
}
|
||||
s.Close()
|
||||
|
||||
// Reopen — the boot-time migration should normalize the row.
|
||||
s2, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("reopen: %v", err)
|
||||
}
|
||||
defer s2.Close()
|
||||
|
||||
// The uppercase row should be gone.
|
||||
var still int
|
||||
if err := s2.db.QueryRow(`SELECT COUNT(*) FROM nodes WHERE public_key = 'AABBCCDDEEFF11223344'`).Scan(&still); err != nil {
|
||||
t.Fatalf("post-check uppercase count: %v", err)
|
||||
}
|
||||
if still != 0 {
|
||||
t.Fatalf("expected 0 uppercase rows after migration, got %d", still)
|
||||
}
|
||||
// The lowercase form should match.
|
||||
var lower string
|
||||
err = s2.db.QueryRow(`SELECT public_key FROM nodes WHERE public_key = 'aabbccddeeff11223344'`).Scan(&lower)
|
||||
if err == sql.ErrNoRows {
|
||||
t.Fatalf("expected lowercase row to exist after migration")
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("post-check lowercase select: %v", err)
|
||||
}
|
||||
if lower != strings.ToLower("AABBCCDDEEFF11223344") {
|
||||
t.Fatalf("got %s, want lowercase form", lower)
|
||||
}
|
||||
}
|
||||
@@ -1,113 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// Issue #1547 — resolved_path writer (ingestor-owned).
|
||||
//
|
||||
// Per the #1283 refactor (server is read-only; ingestor owns the
|
||||
// neighbor graph + node directory), the writer that populated
|
||||
// `observations.resolved_path` must live here in the ingestor. PR #1289
|
||||
// removed the server-side writer without porting it — this restores it.
|
||||
//
|
||||
// Approach:
|
||||
// - `resolvePath` is a pure function: hop prefixes → full pubkeys
|
||||
// using the in-memory prefix index built from `nodes.public_key`.
|
||||
// - Unique-prefix hops resolve to the full pubkey; ambiguous or
|
||||
// unknown hops resolve to `nil`. The output shape is `[]*string`
|
||||
// (with nulls for unresolved positions) — the JSON serialization
|
||||
// matches what the server's `unmarshalResolvedPath` /
|
||||
// frontend `getResolvedPath` already consume.
|
||||
// - The prefix index is rebuilt on startup and once per neighbor-
|
||||
// builder tick (60s) so new nodes start resolving within a minute
|
||||
// without blocking the MQTT ingest path.
|
||||
|
||||
// resolvePath maps each hop prefix to a full pubkey when the index
|
||||
// has exactly one candidate; returns nil at that position otherwise.
|
||||
// Returns nil for empty/no hops.
|
||||
func resolvePath(hops []string, idx prefixIndex) []*string {
|
||||
if len(hops) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]*string, len(hops))
|
||||
if idx == nil {
|
||||
return out
|
||||
}
|
||||
for i, hop := range hops {
|
||||
h := strings.ToLower(hop)
|
||||
candidates := idx[h]
|
||||
if len(candidates) == 1 {
|
||||
pk := candidates[0]
|
||||
out[i] = &pk
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// marshalResolvedPath JSON-encodes a resolved path. It returns "" when
// the input is empty, when every element is nil, or when encoding
// fails (the writer treats "" as SQL NULL).
//
// Why all-nil maps to "": the UPSERT in InsertTransmission uses
//
//	resolved_path = COALESCE(excluded.resolved_path, resolved_path)
//
// Emitting "[null,null]" would pass through nilIfEmpty() as a non-NULL
// string, and the COALESCE would then OVERWRITE a previously stored
// good resolved_path on re-ingest. Returning "" lets nilIfEmpty produce
// SQL NULL so the COALESCE falls through to the existing value.
// See issue #1547 / PR #1548 reviewer findings.
func marshalResolvedPath(rp []*string) string {
	// An empty slice never enters the loop, so it is covered by the
	// same "nothing resolved" exit as the all-nil case.
	hasResolved := false
	for i := range rp {
		if rp[i] != nil {
			hasResolved = true
			break
		}
	}
	if !hasResolved {
		return ""
	}

	encoded, err := json.Marshal(rp)
	if err != nil {
		return ""
	}
	return string(encoded)
}
|
||||
|
||||
// prefixIdxHolder caches the prefix index for the InsertTransmission
|
||||
// hot path. atomic.Value lets the 60s rebuild happen without a lock on
|
||||
// the read side.
|
||||
type prefixIdxHolder struct {
|
||||
v atomic.Value // holds prefixIndex
|
||||
}
|
||||
|
||||
func (h *prefixIdxHolder) load() prefixIndex {
|
||||
if v := h.v.Load(); v != nil {
|
||||
return v.(prefixIndex)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *prefixIdxHolder) store(idx prefixIndex) {
|
||||
h.v.Store(idx)
|
||||
}
|
||||
|
||||
// RefreshPrefixIndex rebuilds the in-memory prefix index from the
|
||||
// nodes table and publishes it atomically. Called on startup and from
|
||||
// the neighbor-edges builder tick (60s) so new nodes become resolvable
|
||||
// without per-insert DB scans.
|
||||
func (s *Store) RefreshPrefixIndex() error {
|
||||
idx, err := buildPrefixIndex(s.db)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.prefixIdx.store(idx)
|
||||
return nil
|
||||
}
|
||||
@@ -1,446 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// unmarshalResolvedPathLocal decodes a stored resolved_path JSON string
// into a []*string (JSON null becomes a nil element). It returns nil
// for the empty string and for input that fails to decode.
func unmarshalResolvedPathLocal(s string) []*string {
	if s == "" {
		return nil
	}
	var decoded []*string
	if err := json.Unmarshal([]byte(s), &decoded); err != nil {
		return nil
	}
	return decoded
}
|
||||
|
||||
// TestResolvePathPureFunction is a unit test for the pure resolvePath
|
||||
// helper. Asserts:
|
||||
// - unique-prefix hops resolve to the full pubkey
|
||||
// - ambiguous-prefix hops resolve to nil
|
||||
// - unknown-prefix hops resolve to nil
|
||||
// - return slice length equals input hop count
|
||||
//
|
||||
// Regression gate for #1547 (resolved_path stopped being written).
|
||||
func TestResolvePathPureFunction(t *testing.T) {
|
||||
idx := prefixIndex{
|
||||
// "aa" → exactly one pubkey
|
||||
"aa": {"aaaaaaaaaa"},
|
||||
"aaaaaaaaaa": {"aaaaaaaaaa"},
|
||||
// "bb" → exactly one pubkey
|
||||
"bb": {"bbbbbbbbbb"},
|
||||
"bbbbbbbbbb": {"bbbbbbbbbb"},
|
||||
// "cc" → ambiguous (2 candidates)
|
||||
"cc": {"cccccccccc", "ccdddddddd"},
|
||||
"cccccccccc": {"cccccccccc"},
|
||||
}
|
||||
|
||||
got := resolvePath([]string{"aa", "cc", "ff", "bb"}, idx)
|
||||
if len(got) != 4 {
|
||||
t.Fatalf("expected len 4, got %d", len(got))
|
||||
}
|
||||
if got[0] == nil || *got[0] != "aaaaaaaaaa" {
|
||||
t.Errorf("hop[0] aa: want aaaaaaaaaa, got %v", deref(got[0]))
|
||||
}
|
||||
if got[1] != nil {
|
||||
t.Errorf("hop[1] cc: want nil (ambiguous), got %v", deref(got[1]))
|
||||
}
|
||||
if got[2] != nil {
|
||||
t.Errorf("hop[2] ff: want nil (unknown), got %v", deref(got[2]))
|
||||
}
|
||||
if got[3] == nil || *got[3] != "bbbbbbbbbb" {
|
||||
t.Errorf("hop[3] bb: want bbbbbbbbbb, got %v", deref(got[3]))
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathEmptyHops asserts empty/no-path produces nil.
|
||||
func TestResolvePathEmptyHops(t *testing.T) {
|
||||
if got := resolvePath(nil, prefixIndex{}); got != nil {
|
||||
t.Errorf("nil hops: want nil, got %v", got)
|
||||
}
|
||||
if got := resolvePath([]string{}, prefixIndex{}); got != nil {
|
||||
t.Errorf("empty hops: want nil, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMarshalResolvedPathRoundtrip asserts the JSON shape matches the
|
||||
// server's marshal/unmarshal contract: `[]*string` with nulls for
|
||||
// unresolved hops.
|
||||
func TestMarshalResolvedPathRoundtrip(t *testing.T) {
|
||||
a := "aaaaaaaaaa"
|
||||
b := "bbbbbbbbbb"
|
||||
in := []*string{&a, nil, &b}
|
||||
s := marshalResolvedPath(in)
|
||||
want := `["aaaaaaaaaa",null,"bbbbbbbbbb"]`
|
||||
if s != want {
|
||||
t.Errorf("marshal: want %s, got %s", want, s)
|
||||
}
|
||||
}
|
||||
|
||||
// TestInsertTransmissionWritesResolvedPath is the integration test that
|
||||
// gates the regression introduced by PR #1289 (issue #1547).
|
||||
//
|
||||
// Setup: seed two nodes + one observer + invoke InsertTransmission with
|
||||
// a PacketData whose PathJSON references one of the seeded nodes by
|
||||
// unique 1-byte (2-hex) prefix.
|
||||
//
|
||||
// Assert: the inserted observations row has a non-NULL resolved_path
|
||||
// whose JSON-decoded length equals the hop count, and the resolved
|
||||
// element matches the seeded node's full pubkey.
|
||||
func TestInsertTransmissionWritesResolvedPath(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "ingest.db")
|
||||
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Seed nodes with unique 1-byte prefixes.
|
||||
if _, err := store.db.Exec(
|
||||
`INSERT INTO nodes (public_key, name) VALUES (?, ?), (?, ?)`,
|
||||
"aaaaaaaaaa", "from-node",
|
||||
"bbbbbbbbbb", "first-hop",
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Seed one observer (needed so InsertTransmission resolves observer_idx).
|
||||
if err := store.UpsertObserver("obs-1", "observer-1", "", nil); err != nil {
|
||||
t.Fatalf("UpsertObserver: %v", err)
|
||||
}
|
||||
|
||||
// Force the prefix index to be (re)built from the seeded nodes so
|
||||
// the InsertTransmission path has something to resolve against.
|
||||
if err := store.RefreshPrefixIndex(); err != nil {
|
||||
t.Fatalf("RefreshPrefixIndex: %v", err)
|
||||
}
|
||||
|
||||
pkt := &PacketData{
|
||||
RawHex: "deadbeef",
|
||||
Timestamp: "2026-06-01T00:00:00Z",
|
||||
ObserverID: "obs-1",
|
||||
Hash: "h-1547",
|
||||
RouteType: 0,
|
||||
PayloadType: int(payloadADVERT),
|
||||
PathJSON: `["bb"]`,
|
||||
DecodedJSON: "{}",
|
||||
FromPubkey: "aaaaaaaaaa",
|
||||
}
|
||||
if _, err := store.InsertTransmission(pkt); err != nil {
|
||||
t.Fatalf("InsertTransmission: %v", err)
|
||||
}
|
||||
|
||||
var rp sql.NullString
|
||||
if err := store.db.QueryRow(
|
||||
`SELECT resolved_path FROM observations WHERE transmission_id = (SELECT id FROM transmissions WHERE hash = ?)`,
|
||||
"h-1547",
|
||||
).Scan(&rp); err != nil {
|
||||
t.Fatalf("query: %v", err)
|
||||
}
|
||||
if !rp.Valid || rp.String == "" {
|
||||
t.Fatalf("expected non-nil resolved_path, got NULL/empty (regression: #1547)")
|
||||
}
|
||||
got := unmarshalResolvedPathLocal(rp.String)
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("resolved_path length: want 1, got %d (value=%s)", len(got), rp.String)
|
||||
}
|
||||
if got[0] == nil || *got[0] != "bbbbbbbbbb" {
|
||||
t.Errorf("resolved_path[0]: want bbbbbbbbbb, got %v (raw=%s)", deref(got[0]), rp.String)
|
||||
}
|
||||
}
|
||||
|
||||
// deref renders a *string for test failure messages: the pointed-to
// value, or the placeholder "<nil>" for a nil pointer.
func deref(p *string) string {
	if p != nil {
		return *p
	}
	return "<nil>"
}
|
||||
|
||||
// ─── #1560: context-aware resolution tests ─────────────────────────────────
|
||||
//
|
||||
// These exercise the post-fix behavior of resolveHopWithContext +
|
||||
// resolvePathWithContext. Until the green commit lands they MUST fail
|
||||
// on assertions (the stub falls back to naive `len==1` and returns nil
|
||||
// on every >1-candidate prefix), proving the gate is real.
|
||||
|
||||
// build5NodeAmbiguousIndex returns a prefixIndex where 3 of 5 nodes
|
||||
// share the 1-byte prefix 0x5c. Pubkeys are the "fingerprints":
|
||||
//
|
||||
// A = "5c000000000000000000000000000000aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
// B = "5c000000000000000000000000000000bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
|
||||
// C = "5c000000000000000000000000000000cccccccccccccccccccccccccccccccc"
|
||||
// D = "dd000000000000000000000000000000dddddddddddddddddddddddddddddddd"
|
||||
// E = "ee000000000000000000000000000000eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"
|
||||
func build5NodeAmbiguousIndex() (idx prefixIndex, A, B, C, D, E string) {
|
||||
A = "5c000000000000000000000000000000aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
||||
B = "5c000000000000000000000000000000bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
|
||||
C = "5c000000000000000000000000000000cccccccccccccccccccccccccccccccc"
|
||||
D = "dd000000000000000000000000000000dddddddddddddddddddddddddddddddd"
|
||||
E = "ee000000000000000000000000000000eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"
|
||||
idx = prefixIndex{
|
||||
// 1-byte: 5c → A,B,C (collision); dd → D; ee → E
|
||||
"5c": {A, B, C},
|
||||
"dd": {D},
|
||||
"ee": {E},
|
||||
// full-key entries (so exact-match lookups still resolve)
|
||||
A: {A}, B: {B}, C: {C}, D: {D}, E: {E},
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// TestResolveHopWithContext_OneByteCollision_AdjacencyResolves
|
||||
// asserts the dominant production case (#1560): three nodes share the
|
||||
// 1-byte prefix 0x5c, but NeighborGraph adjacency narrows to exactly
|
||||
// one. The naive resolver returns nil; the context-aware resolver
|
||||
// MUST return the right pubkey.
|
||||
func TestResolveHopWithContext_OneByteCollision_AdjacencyResolves(t *testing.T) {
|
||||
idx, A, B, C, D, E := build5NodeAmbiguousIndex()
|
||||
g := NewNeighborGraph()
|
||||
// chain: A↔B, B↔C, C↔D, D↔E
|
||||
g.AddEdge(A, B)
|
||||
g.AddEdge(B, C)
|
||||
g.AddEdge(C, D)
|
||||
g.AddEdge(D, E)
|
||||
|
||||
// Anchored on A, the only 5c neighbor of A is B.
|
||||
got := resolveHopWithContext("5c", A, g, idx, nil)
|
||||
if got == nil {
|
||||
t.Fatalf("anchor=A, hop=5c: want B (%s), got <nil>", B)
|
||||
}
|
||||
if *got != B {
|
||||
t.Errorf("anchor=A, hop=5c: want %s, got %s", B, *got)
|
||||
}
|
||||
|
||||
// Anchored on B, the only 5c neighbors of B are A and C — but A is
|
||||
// the originator anchor in a path-walk; here we just assert that
|
||||
// 2 surviving candidates → nil (cannot disambiguate further).
|
||||
got = resolveHopWithContext("5c", B, g, idx, nil)
|
||||
if got != nil {
|
||||
t.Errorf("anchor=B, hop=5c: ambiguous (A and C both adjacent); want <nil>, got %s", *got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathWithContext_TwoHopChainAnchoredOnFromNode covers the
|
||||
// canonical 1-byte collision case end-to-end: path = [5c, 5c],
|
||||
// from_node = A → expect [B, C].
|
||||
func TestResolvePathWithContext_TwoHopChainAnchoredOnFromNode(t *testing.T) {
|
||||
idx, A, B, C, _, _ := build5NodeAmbiguousIndex()
|
||||
g := NewNeighborGraph()
|
||||
g.AddEdge(A, B)
|
||||
g.AddEdge(B, C)
|
||||
|
||||
got := resolvePathWithContext([]string{"5c", "5c"}, A, g, idx)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d, want 2 (raw=%v)", len(got), got)
|
||||
}
|
||||
if got[0] == nil || *got[0] != B {
|
||||
t.Errorf("hop[0]: want %s, got %v", B, deref(got[0]))
|
||||
}
|
||||
if got[1] == nil || *got[1] != C {
|
||||
t.Errorf("hop[1]: want %s, got %v", C, deref(got[1]))
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolveHopWithContext_NoAdjacencyContext_ReturnsNil asserts the
|
||||
// negative gate: 3 nodes with shared prefix, no edges between them in
|
||||
// the graph, hop=[5c] with no usable anchor → nil. Guards against an
|
||||
// over-eager resolver that just picks the first candidate.
|
||||
func TestResolveHopWithContext_NoAdjacencyContext_ReturnsNil(t *testing.T) {
|
||||
idx, _, _, _, _, _ := build5NodeAmbiguousIndex()
|
||||
g := NewNeighborGraph() // empty: no edges
|
||||
got := resolveHopWithContext("5c", "", g, idx, nil)
|
||||
if got != nil {
|
||||
t.Errorf("no anchor + empty graph: want <nil>, got %s", *got)
|
||||
}
|
||||
|
||||
// With an anchor that's not adjacent to any candidate, also nil.
|
||||
got = resolveHopWithContext("5c", "deadbeefdeadbeef", g, idx, nil)
|
||||
if got != nil {
|
||||
t.Errorf("non-adjacent anchor: want <nil>, got %s", *got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathWithContext_AdvertAnchoring asserts ADVERT-style
|
||||
// anchoring: from_pubkey is the originator, hop[0] is one of its
|
||||
// 1-byte-prefix neighbors → resolved.
|
||||
func TestResolvePathWithContext_AdvertAnchoring(t *testing.T) {
|
||||
idx, A, B, _, _, _ := build5NodeAmbiguousIndex()
|
||||
g := NewNeighborGraph()
|
||||
g.AddEdge(A, B) // only B is adjacent to A among the 5c candidates
|
||||
|
||||
got := resolvePathWithContext([]string{"5c"}, A, g, idx)
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d, want 1", len(got))
|
||||
}
|
||||
if got[0] == nil || *got[0] != B {
|
||||
t.Errorf("ADVERT anchored on A, hop=5c: want %s, got %v", B, deref(got[0]))
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathWithContext_RegressionMultiByteStillWorks asserts no
|
||||
// regression in the 2/3/4-byte prefix path that PR #1548 already
|
||||
// handled — unique prefixes resolve regardless of graph context.
|
||||
func TestResolvePathWithContext_RegressionMultiByteStillWorks(t *testing.T) {
|
||||
idx, _, _, _, D, E := build5NodeAmbiguousIndex()
|
||||
// dd and ee are unique 1-byte prefixes — naive path still works.
|
||||
got := resolvePathWithContext([]string{"dd", "ee"}, "", nil, idx)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d, want 2", len(got))
|
||||
}
|
||||
if got[0] == nil || *got[0] != D {
|
||||
t.Errorf("hop[0] dd: want %s, got %v", D, deref(got[0]))
|
||||
}
|
||||
if got[1] == nil || *got[1] != E {
|
||||
t.Errorf("hop[1] ee: want %s, got %v", E, deref(got[1]))
|
||||
}
|
||||
}
|
||||
|
||||
// TestResolvePathWithContext_AllNilContractPreserved asserts the
|
||||
// all-nil → empty-string clobber-guard contract from PR #1548 still
|
||||
// holds: an unresolvable path through the context resolver, when fed
|
||||
// to marshalResolvedPath, MUST yield "" (so nilIfEmpty → SQL NULL
|
||||
// → COALESCE preserves existing).
|
||||
func TestResolvePathWithContext_AllNilContractPreserved(t *testing.T) {
|
||||
// Empty index → every hop nil.
|
||||
got := resolvePathWithContext([]string{"5c", "dd"}, "", nil, prefixIndex{})
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d, want 2", len(got))
|
||||
}
|
||||
for i, p := range got {
|
||||
if p != nil {
|
||||
t.Errorf("hop[%d]: want <nil>, got %s", i, *p)
|
||||
}
|
||||
}
|
||||
if s := marshalResolvedPath(got); s != "" {
|
||||
t.Errorf("all-nil marshal: want \"\", got %q (clobber-guard regression)", s)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMarshalResolvedPathAllNilReturnsEmpty is a regression gate for
|
||||
// the data-loss clobber bug surfaced in PR #1548 review.
|
||||
//
|
||||
// When resolvePath fails to resolve ANY hop (every element nil),
|
||||
// marshalResolvedPath previously emitted "[null,null,...]" — a
|
||||
// non-empty string that bypassed nilIfEmpty and then OVERWROTE the
|
||||
// existing resolved_path via the COALESCE(excluded, current) UPSERT
|
||||
// on re-ingest. The fix returns "" so nilIfEmpty produces SQL NULL and
|
||||
// the COALESCE preserves the existing good value.
|
||||
func TestMarshalResolvedPathAllNilReturnsEmpty(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in []*string
|
||||
}{
|
||||
{"one-nil", []*string{nil}},
|
||||
{"two-nils", []*string{nil, nil}},
|
||||
{"three-nils", []*string{nil, nil, nil}},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := marshalResolvedPath(tc.in)
|
||||
if got != "" {
|
||||
t.Errorf("all-nil input must return \"\" (so nilIfEmpty → SQL NULL → COALESCE preserves existing); got %q", got)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Mixed (at least one non-nil) MUST still marshal normally so we
|
||||
// don't lose partial resolutions.
|
||||
a := "aaaaaaaaaa"
|
||||
mixed := marshalResolvedPath([]*string{&a, nil})
|
||||
if mixed != `["aaaaaaaaaa",null]` {
|
||||
t.Errorf("partial resolution must still serialize; got %q", mixed)
|
||||
}
|
||||
}
|
||||
|
||||
// TestInsertTransmissionDoesNotClobberResolvedPathOnAllNil is the
|
||||
// integration-level regression test for the data-loss bug.
|
||||
//
|
||||
// Setup: insert a transmission whose first ingest resolves cleanly to
|
||||
// a known pubkey. Then re-ingest the SAME transmission after the
|
||||
// prefix index has been cleared (simulating an empty NeighborGraph /
|
||||
// all-nil resolution path) and assert the previously stored
|
||||
// resolved_path is PRESERVED (NOT overwritten to "[null]" or NULL).
|
||||
//
|
||||
// Pre-fix behavior: marshalResolvedPath emitted "[null]", nilIfEmpty
|
||||
// kept it non-NULL, and COALESCE(excluded.resolved_path, resolved_path)
|
||||
// clobbered the original "bbbbbbbbbb".
|
||||
func TestInsertTransmissionDoesNotClobberResolvedPathOnAllNil(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "ingest.db")
|
||||
|
||||
store, err := OpenStore(dbPath)
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
if _, err := store.db.Exec(
|
||||
`INSERT INTO nodes (public_key, name) VALUES (?, ?), (?, ?)`,
|
||||
"aaaaaaaaaa", "from-node",
|
||||
"bbbbbbbbbb", "first-hop",
|
||||
); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := store.UpsertObserver("obs-1", "observer-1", "", nil); err != nil {
|
||||
t.Fatalf("UpsertObserver: %v", err)
|
||||
}
|
||||
if err := store.RefreshPrefixIndex(); err != nil {
|
||||
t.Fatalf("RefreshPrefixIndex: %v", err)
|
||||
}
|
||||
|
||||
pkt := &PacketData{
|
||||
RawHex: "deadbeef",
|
||||
Timestamp: "2026-06-01T00:00:00Z",
|
||||
ObserverID: "obs-1",
|
||||
Hash: "h-clobber",
|
||||
RouteType: 0,
|
||||
PayloadType: int(payloadADVERT),
|
||||
PathJSON: `["bb"]`,
|
||||
DecodedJSON: "{}",
|
||||
FromPubkey: "aaaaaaaaaa",
|
||||
}
|
||||
if _, err := store.InsertTransmission(pkt); err != nil {
|
||||
t.Fatalf("first InsertTransmission: %v", err)
|
||||
}
|
||||
|
||||
// Sanity: first write populated resolved_path.
|
||||
var first sql.NullString
|
||||
if err := store.db.QueryRow(
|
||||
`SELECT resolved_path FROM observations WHERE transmission_id = (SELECT id FROM transmissions WHERE hash = ?)`,
|
||||
"h-clobber",
|
||||
).Scan(&first); err != nil {
|
||||
t.Fatalf("first query: %v", err)
|
||||
}
|
||||
if !first.Valid || first.String == "" {
|
||||
t.Fatalf("precondition failed: first ingest left resolved_path NULL/empty; cannot test clobber")
|
||||
}
|
||||
wantPreserved := first.String
|
||||
|
||||
// Now wipe the prefix index so re-ingest produces an all-nil
|
||||
// resolution — exactly the scenario where the bug clobbers data.
|
||||
store.prefixIdx.store(prefixIndex{})
|
||||
|
||||
if _, err := store.InsertTransmission(pkt); err != nil {
|
||||
t.Fatalf("re-ingest InsertTransmission: %v", err)
|
||||
}
|
||||
|
||||
var after sql.NullString
|
||||
if err := store.db.QueryRow(
|
||||
`SELECT resolved_path FROM observations WHERE transmission_id = (SELECT id FROM transmissions WHERE hash = ?)`,
|
||||
"h-clobber",
|
||||
).Scan(&after); err != nil {
|
||||
t.Fatalf("post-reingest query: %v", err)
|
||||
}
|
||||
if !after.Valid {
|
||||
t.Fatalf("data loss: resolved_path was NULL'd by re-ingest (was %q)", wantPreserved)
|
||||
}
|
||||
if after.String != wantPreserved {
|
||||
t.Errorf("data loss: resolved_path was clobbered by all-nil re-ingest\n before: %s\n after: %s", wantPreserved, after.String)
|
||||
}
|
||||
}
|
||||
@@ -1,156 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestParseEnvelopeTime(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
ok bool
|
||||
wantNaive bool
|
||||
}{
|
||||
{"rfc3339 utc", "2026-05-16T10:00:00Z", true, false},
|
||||
{"rfc3339 offset", "2026-05-16T12:00:00+02:00", true, false},
|
||||
{"naive iso", "2026-05-16T10:00:00", true, true},
|
||||
{"naive iso micros", "2026-05-16T10:00:00.123456", true, true},
|
||||
{"garbage", "not-a-time", false, false},
|
||||
{"empty", "", false, false},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
_, naive, err := parseEnvelopeTime(c.in)
|
||||
if (err == nil) != c.ok {
|
||||
t.Fatalf("parseEnvelopeTime(%q): want ok=%v, got err=%v", c.in, c.ok, err)
|
||||
}
|
||||
if err == nil && naive != c.wantNaive {
|
||||
t.Fatalf("parseEnvelopeTime(%q): want naive=%v, got %v", c.in, c.wantNaive, naive)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveRxTime(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
|
||||
mustParse := func(s string) time.Time {
|
||||
t.Helper()
|
||||
parsed, err := time.Parse(time.RFC3339, s)
|
||||
if err != nil {
|
||||
t.Fatalf("result %q is not RFC3339: %v", s, err)
|
||||
}
|
||||
return parsed
|
||||
}
|
||||
nearNow := func(s string) bool {
|
||||
d := mustParse(s).Sub(now)
|
||||
if d < 0 {
|
||||
d = -d
|
||||
}
|
||||
return d <= time.Minute
|
||||
}
|
||||
|
||||
rx := now.Add(-5 * time.Hour).Format(time.RFC3339)
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": rx}, "test"); got != rx {
|
||||
t.Errorf("plausible past timestamp: got %q want %q", got, rx)
|
||||
}
|
||||
if got, _ := resolveRxTime(map[string]interface{}{}, "test"); !nearNow(got) {
|
||||
t.Errorf("missing timestamp: got %q, expected ~now", got)
|
||||
}
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": "garbage"}, "test"); !nearNow(got) {
|
||||
t.Errorf("garbage timestamp: got %q, expected ~now", got)
|
||||
}
|
||||
future := now.Add(48 * time.Hour).Format(time.RFC3339)
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": future}, "test"); !nearNow(got) {
|
||||
t.Errorf("future timestamp: got %q, expected ~now (rejected)", got)
|
||||
}
|
||||
|
||||
// RTC-reset node reporting a factory date — must not drag first_seen back.
|
||||
factory := "2020-01-01T00:00:00Z"
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": factory}, "test"); !nearNow(got) {
|
||||
t.Errorf("stale factory timestamp: got %q, expected ~now (rejected)", got)
|
||||
}
|
||||
// Just past the 30-day floor → rejected.
|
||||
stale := now.Add(-31 * 24 * time.Hour).Format(time.RFC3339)
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": stale}, "test"); !nearNow(got) {
|
||||
t.Errorf("stale timestamp >30d: got %q, expected ~now (rejected)", got)
|
||||
}
|
||||
// Just inside the 30-day floor → used verbatim.
|
||||
recent := now.Add(-29 * 24 * time.Hour).Format(time.RFC3339)
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": recent}, "test"); got != recent {
|
||||
t.Errorf("recent timestamp <30d: got %q want %q", got, recent)
|
||||
}
|
||||
}
|
||||
|
||||
// Regression: issue #1463 — naive (zone-less) ISO timestamps from observers
|
||||
// in negative-UTC-offset zones (e.g. California PDT, UTC−7) were interpreted
|
||||
// as UTC, producing rxTime values 7h in the past that poisoned `last_seen`
|
||||
// and rendered the observer perpetually "Stale" in the UI. The symmetric
|
||||
// clamp now collapses any naive timestamp more than 15 min off server-now to
|
||||
// `now()`, while zone-aware timestamps (RFC3339 with Z or offset) are still
|
||||
// honored verbatim regardless of skew (those are well-behaved observers).
|
||||
func TestResolveRxTimeNaiveTimestampClamp(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
|
||||
mustParse := func(s string) time.Time {
|
||||
t.Helper()
|
||||
parsed, err := time.Parse(time.RFC3339, s)
|
||||
if err != nil {
|
||||
t.Fatalf("result %q is not RFC3339: %v", s, err)
|
||||
}
|
||||
return parsed
|
||||
}
|
||||
nearNow := func(s string) bool {
|
||||
d := mustParse(s).Sub(now)
|
||||
if d < 0 {
|
||||
d = -d
|
||||
}
|
||||
return d <= time.Minute
|
||||
}
|
||||
|
||||
// California observer (UTC-7) emitting a naive local-clock timestamp:
|
||||
// must NOT be stored verbatim 7h in the past — clamp to ~now.
|
||||
naivePast := now.Add(-7 * time.Hour).Format("2006-01-02T15:04:05")
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": naivePast}, "test"); !nearNow(got) {
|
||||
t.Errorf("naive past timestamp (UTC-7 observer): got %q, expected ~now (clamped)", got)
|
||||
}
|
||||
|
||||
// Naive future just minutes ahead (UTC+N observer, existing soft-clamp
|
||||
// behavior): still clamped to now.
|
||||
naiveFuture := now.Add(5 * time.Minute).Format("2006-01-02T15:04:05")
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": naiveFuture}, "test"); !nearNow(got) {
|
||||
t.Errorf("naive future timestamp: got %q, expected ~now (clamped)", got)
|
||||
}
|
||||
|
||||
// Naive microsecond layout (python isoformat without tz) — same clamp.
|
||||
naivePastMicros := now.Add(-7 * time.Hour).Format("2006-01-02T15:04:05.000000")
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": naivePastMicros}, "test"); !nearNow(got) {
|
||||
t.Errorf("naive past timestamp w/ micros: got %q, expected ~now (clamped)", got)
|
||||
}
|
||||
|
||||
// Well-behaved observer: Z-suffixed past timestamp passes through verbatim
|
||||
// even if it's hours old (legitimate buffered uploads must be preserved).
|
||||
zPast := now.Add(-7 * time.Hour).Format(time.RFC3339)
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": zPast}, "test"); got != zPast {
|
||||
t.Errorf("Z-suffixed past timestamp must pass through: got %q want %q", got, zPast)
|
||||
}
|
||||
|
||||
// Well-behaved observer with explicit offset (UTC-7) — canonicalize to UTC
|
||||
// but preserve the moment in time. Must equal the same moment in UTC.
|
||||
offsetLoc := time.FixedZone("PDT", -7*3600)
|
||||
offsetMoment := now.Add(-7 * time.Hour).In(offsetLoc)
|
||||
offsetStr := offsetMoment.Format(time.RFC3339)
|
||||
wantUTC := offsetMoment.UTC().Format(time.RFC3339)
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": offsetStr}, "test"); got != wantUTC {
|
||||
t.Errorf("offset-suffixed timestamp: got %q want %q", got, wantUTC)
|
||||
}
|
||||
|
||||
// Naive timestamp within tolerance window (2 min in past, observer that
|
||||
// happens to be in UTC) — within tolerance, passes through verbatim.
|
||||
naiveCloseStr := now.Add(-2 * time.Minute).Format("2006-01-02T15:04:05")
|
||||
naiveCloseWant := now.Add(-2 * time.Minute).Format(time.RFC3339)
|
||||
if got, _ := resolveRxTime(map[string]interface{}{"timestamp": naiveCloseStr}, "test"); got != naiveCloseWant {
|
||||
t.Errorf("naive timestamp within tolerance: got %q, expected %q (verbatim)", got, naiveCloseWant)
|
||||
}
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
package main
|
||||
|
||||
import "strings"
|
||||
|
||||
// sanitizeLogString neutralises ASCII control characters in a
// node-controlled string (advert name, observer origin, channel name) so it
// cannot inject fake lines into the log stream. Every rune below 0x20 —
// which includes CR (\r), LF (\n), TAB (\t) and NUL (\x00) — and 0x7F (DEL)
// is replaced with '?'. All other runes, including multibyte UTF-8
// (Cyrillic, emoji), are kept.
//
// This deliberately removes more than sanitizeName, which is documented to
// keep \t and \n because they can occur in legitimately stored display
// names; a log sink wants neither.
//
// See audit-input-vulns-20260603 (LOW — log injection via newline in advert
// name) and references at cmd/ingestor/main.go:659,689.
func sanitizeLogString(s string) string {
	// strings.Map walks s rune by rune, so multibyte sequences stay intact.
	return strings.Map(func(r rune) rune {
		if r < 0x20 || r == 0x7f {
			return '?'
		}
		return r
	}, s)
}
|
||||
@@ -1,32 +0,0 @@
|
||||
package main
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestSanitizeLogString covers the log-injection defense added to fix
|
||||
// audit-input-vulns-20260603 (LOW — log injection via newline in advert name).
|
||||
func TestSanitizeLogString(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{"plain ascii preserved", "alpha-node", "alpha-node"},
|
||||
{"unicode preserved", "Иван привет 🦊", "Иван привет 🦊"},
|
||||
{"lf stripped", "evil\n[security] forged-line", "evil?[security] forged-line"},
|
||||
{"cr stripped", "evil\rfake-log", "evil?fake-log"},
|
||||
{"crlf stripped", "a\r\nb", "a??b"},
|
||||
{"tab stripped", "a\tb", "a?b"},
|
||||
{"nul stripped", "a\x00b", "a?b"},
|
||||
{"del stripped", "a\x7fb", "a?b"},
|
||||
{"bell stripped", "a\x07b", "a?b"},
|
||||
{"empty unchanged", "", ""},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := sanitizeLogString(tc.in)
|
||||
if got != tc.want {
|
||||
t.Fatalf("sanitizeLogString(%q) = %q, want %q", tc.in, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -61,7 +61,7 @@ func TestSigValidation_ValidAdvertStored(t *testing.T) {
|
||||
msg := newMockMsg("meshcore/US/obs1/packet", `{"raw":"`+rawHex+`","origin":"TestObs"}`)
|
||||
cfg := &Config{}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, cfg)
|
||||
handleMessage(store, "test", source, msg, nil, cfg)
|
||||
|
||||
// Verify packet was stored
|
||||
var count int
|
||||
@@ -98,7 +98,7 @@ func TestSigValidation_TamperedSignatureDropped(t *testing.T) {
|
||||
msg := newMockMsg("meshcore/US/obs1/packet", `{"raw":"`+tamperedHex+`","origin":"TestObs"}`)
|
||||
cfg := &Config{}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, cfg)
|
||||
handleMessage(store, "test", source, msg, nil, cfg)
|
||||
|
||||
// Verify packet was NOT stored in transmissions
|
||||
var txCount int
|
||||
@@ -157,7 +157,7 @@ func TestSigValidation_TruncatedAppdataDropped(t *testing.T) {
|
||||
msg := newMockMsg("meshcore/US/obs1/packet", `{"raw":"`+truncatedHex+`","origin":"TestObs"}`)
|
||||
cfg := &Config{}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, cfg)
|
||||
handleMessage(store, "test", source, msg, nil, cfg)
|
||||
|
||||
var txCount int
|
||||
store.db.QueryRow("SELECT COUNT(*) FROM transmissions").Scan(&txCount)
|
||||
@@ -192,7 +192,7 @@ func TestSigValidation_DisabledByConfig(t *testing.T) {
|
||||
falseVal := false
|
||||
cfg := &Config{ValidateSignatures: &falseVal}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, cfg)
|
||||
handleMessage(store, "test", source, msg, nil, cfg)
|
||||
|
||||
// With validation disabled, tampered packet should be stored
|
||||
var txCount int
|
||||
@@ -225,7 +225,7 @@ func TestSigValidation_DropCounterIncrements(t *testing.T) {
|
||||
rawBytes[76] = '0'
|
||||
}
|
||||
msg := newMockMsg("meshcore/US/obs1/packet", `{"raw":"`+string(rawBytes)+`","origin":"Obs"}`)
|
||||
handleMessage(store, "test", source, msg, nil, nil, cfg)
|
||||
handleMessage(store, "test", source, msg, nil, cfg)
|
||||
}
|
||||
|
||||
if store.Stats.SignatureDrops.Load() != 3 {
|
||||
@@ -258,7 +258,7 @@ func TestSigValidation_LogContainsFields(t *testing.T) {
|
||||
msg := newMockMsg("meshcore/US/obs1/packet", `{"raw":"`+string(rawBytes)+`","origin":"MyObserver"}`)
|
||||
cfg := &Config{}
|
||||
|
||||
handleMessage(store, "test", source, msg, nil, nil, cfg)
|
||||
handleMessage(store, "test", source, msg, nil, cfg)
|
||||
|
||||
var hash, reason, obsID, obsName, pubkey, nodeName string
|
||||
err = store.db.QueryRow("SELECT hash, reason, observer_id, observer_name, node_pubkey, node_name FROM dropped_packets LIMIT 1").
|
||||
|
||||
@@ -1,187 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
	"sort"
	"sync"
	"sync/atomic"
	"time"
)
|
||||
|
||||
// SourceStatusSnapshot is the per-MQTT-source connection state and counter
// view written to the ingestor stats file (under "source_statuses") and
// consumed by cmd/server's /api/mqtt/status handler (#1043).
//
// The three *Unix fields are unix seconds (0 = "never"). ConnectCount,
// DisconnectCount and PacketsTotal are cumulative counters. PacketsLast5m
// is a sliding 5-minute count derived from a per-second ring buffer.
// Broker carries the raw broker URL; masking is left to the server side.
type SourceStatusSnapshot struct {
	Name               string `json:"name"`
	Broker             string `json:"broker"`
	Connected          bool   `json:"connected"`
	LastConnectUnix    int64  `json:"lastConnectUnix"`
	LastDisconnectUnix int64  `json:"lastDisconnectUnix"`
	LastPacketUnix     int64  `json:"lastPacketUnix"`
	ConnectCount       int64  `json:"connectCount"`
	DisconnectCount    int64  `json:"disconnectCount"`
	PacketsTotal       int64  `json:"packetsTotal"`
	PacketsLast5m      int64  `json:"packetsLast5m"`
	LastError          string `json:"lastError,omitempty"`
}
|
||||
|
||||
// sourceStatusState is the in-memory per-source counter set. The scalar
// connection and packet fields are typed sync/atomic values, so updating
// them takes no lock. The 5-minute sliding window is a 300-element
// per-second ring (one slot per second); every access to the ring —
// including the per-packet increment in MarkPacket — happens under ringMu.
//
// Memory: one state per source (typically 1-5 in production). Two arrays
// of 300 int64 slots = 4.8KB/source — fine.
type sourceStatusState struct {
	name   string
	broker string // raw broker URL — server-side handler masks the password

	connected          atomic.Bool
	lastConnectUnix    atomic.Int64
	lastDisconnectUnix atomic.Int64
	lastPacketUnix     atomic.Int64
	connectCount       atomic.Int64
	disconnectCount    atomic.Int64
	packetsTotal       atomic.Int64

	// 5-minute sliding window: per-second buckets keyed by unix second.
	// Stored as parallel arrays so we can both zero-out a stale slot AND
	// know whether a slot's contents are still inside the window.
	ringMu    sync.Mutex
	ringSec   [300]int64 // unix second this slot represents (0 = unused)
	ringCount [300]int64 // packets received in that second

	// lastError is written on connect/disconnect and read by snapshot;
	// errMu guards it.
	errMu     sync.RWMutex
	lastError string
}
|
||||
|
||||
// MarkConnect records a successful (re)connection to the broker.
|
||||
// Clears any stale lastError from a prior disconnect — otherwise the UI
|
||||
// shows "connected=true, lastError='connection refused'" after a successful
|
||||
// reconnect, which is a lie (#1682 munger review r1).
|
||||
func (s *sourceStatusState) MarkConnect(now time.Time) {
|
||||
s.connected.Store(true)
|
||||
s.lastConnectUnix.Store(now.Unix())
|
||||
s.connectCount.Add(1)
|
||||
s.errMu.Lock()
|
||||
s.lastError = ""
|
||||
s.errMu.Unlock()
|
||||
}
|
||||
|
||||
// MarkDisconnect records the broker dropping the connection.
|
||||
func (s *sourceStatusState) MarkDisconnect(now time.Time, err error) {
|
||||
s.connected.Store(false)
|
||||
s.lastDisconnectUnix.Store(now.Unix())
|
||||
s.disconnectCount.Add(1)
|
||||
if err != nil {
|
||||
s.errMu.Lock()
|
||||
s.lastError = err.Error()
|
||||
s.errMu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
// MarkPacket records receipt of an MQTT message. Hot path.
|
||||
func (s *sourceStatusState) MarkPacket(now time.Time) {
|
||||
nowSec := now.Unix()
|
||||
s.lastPacketUnix.Store(nowSec)
|
||||
s.packetsTotal.Add(1)
|
||||
|
||||
slot := nowSec % int64(len(s.ringSec))
|
||||
s.ringMu.Lock()
|
||||
if s.ringSec[slot] != nowSec {
|
||||
s.ringSec[slot] = nowSec
|
||||
s.ringCount[slot] = 0
|
||||
}
|
||||
s.ringCount[slot]++
|
||||
s.ringMu.Unlock()
|
||||
}
|
||||
|
||||
// sumLast5m returns the count of MarkPacket calls in the last 300s. Slots
|
||||
// whose stored second falls outside the window are ignored (no stale leak).
|
||||
func (s *sourceStatusState) sumLast5m(now time.Time) int64 {
|
||||
nowSec := now.Unix()
|
||||
cutoff := nowSec - int64(len(s.ringSec)) + 1
|
||||
var total int64
|
||||
s.ringMu.Lock()
|
||||
for i := 0; i < len(s.ringSec); i++ {
|
||||
if s.ringSec[i] >= cutoff && s.ringSec[i] <= nowSec {
|
||||
total += s.ringCount[i]
|
||||
}
|
||||
}
|
||||
s.ringMu.Unlock()
|
||||
return total
|
||||
}
|
||||
|
||||
// snapshot copies the state into a serializable view.
|
||||
func (s *sourceStatusState) snapshot(now time.Time) SourceStatusSnapshot {
|
||||
s.errMu.RLock()
|
||||
errStr := s.lastError
|
||||
s.errMu.RUnlock()
|
||||
return SourceStatusSnapshot{
|
||||
Name: s.name,
|
||||
Broker: s.broker,
|
||||
Connected: s.connected.Load(),
|
||||
LastConnectUnix: s.lastConnectUnix.Load(),
|
||||
LastDisconnectUnix: s.lastDisconnectUnix.Load(),
|
||||
LastPacketUnix: s.lastPacketUnix.Load(),
|
||||
ConnectCount: s.connectCount.Load(),
|
||||
DisconnectCount: s.disconnectCount.Load(),
|
||||
PacketsTotal: s.packetsTotal.Load(),
|
||||
PacketsLast5m: s.sumLast5m(now),
|
||||
LastError: errStr,
|
||||
}
|
||||
}
|
||||
|
||||
// sourceStatusRegistry holds one sourceStatusState per source. Keyed by
// tag (which is the source Name, or the Broker URL if the operator left
// the name blank). sourceStatusRegistryMu guards both the map contents and
// the map variable itself, which resetSourceStatusRegistry swaps for a
// fresh map.
var (
	sourceStatusRegistryMu sync.RWMutex
	sourceStatusRegistry   = map[string]*sourceStatusState{}
)
|
||||
|
||||
// RegisterSourceStatus creates (or returns the existing) state for the
|
||||
// given source. Safe for cold-start use; idempotent — re-registering the
|
||||
// same tag returns the existing state so counters aren't reset across
|
||||
// reconnects.
|
||||
func RegisterSourceStatus(tag, broker string) *sourceStatusState {
|
||||
sourceStatusRegistryMu.Lock()
|
||||
defer sourceStatusRegistryMu.Unlock()
|
||||
if s, ok := sourceStatusRegistry[tag]; ok {
|
||||
return s
|
||||
}
|
||||
s := &sourceStatusState{name: tag, broker: broker}
|
||||
sourceStatusRegistry[tag] = s
|
||||
return s
|
||||
}
|
||||
|
||||
// lookupSourceStatus returns the state for tag, or nil if unregistered.
|
||||
func lookupSourceStatus(tag string) *sourceStatusState {
|
||||
sourceStatusRegistryMu.RLock()
|
||||
defer sourceStatusRegistryMu.RUnlock()
|
||||
return sourceStatusRegistry[tag]
|
||||
}
|
||||
|
||||
// SnapshotSourceStatuses returns a slice of every registered source's
|
||||
// current snapshot. Surfaced via the ingestor stats file under
|
||||
// "source_statuses" so /api/mqtt/status can serve it (#1043).
|
||||
func SnapshotSourceStatuses(now time.Time) []SourceStatusSnapshot {
|
||||
sourceStatusRegistryMu.RLock()
|
||||
defer sourceStatusRegistryMu.RUnlock()
|
||||
out := make([]SourceStatusSnapshot, 0, len(sourceStatusRegistry))
|
||||
for _, s := range sourceStatusRegistry {
|
||||
out = append(out, s.snapshot(now))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// resetSourceStatusRegistry clears the registry. Test-only helper.
|
||||
func resetSourceStatusRegistry() {
|
||||
sourceStatusRegistryMu.Lock()
|
||||
defer sourceStatusRegistryMu.Unlock()
|
||||
sourceStatusRegistry = map[string]*sourceStatusState{}
|
||||
}
|
||||
@@ -1,116 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestSourceStatus_BasicLifecycle exercises the counter wiring used by
|
||||
// the /api/mqtt/status server-side endpoint (#1043).
|
||||
func TestSourceStatus_BasicLifecycle(t *testing.T) {
|
||||
resetSourceStatusRegistry()
|
||||
defer resetSourceStatusRegistry()
|
||||
|
||||
s := RegisterSourceStatus("local", "mqtt://broker.example.com:1883")
|
||||
if s == nil {
|
||||
t.Fatal("RegisterSourceStatus returned nil")
|
||||
}
|
||||
// Re-registration is idempotent.
|
||||
if s2 := RegisterSourceStatus("local", "mqtt://other"); s2 != s {
|
||||
t.Fatal("RegisterSourceStatus not idempotent")
|
||||
}
|
||||
|
||||
now := time.Unix(1_700_000_000, 0)
|
||||
s.MarkConnect(now)
|
||||
s.MarkPacket(now)
|
||||
s.MarkPacket(now.Add(1 * time.Second))
|
||||
s.MarkPacket(now.Add(2 * time.Second))
|
||||
|
||||
snap := s.snapshot(now.Add(3 * time.Second))
|
||||
if !snap.Connected {
|
||||
t.Error("snapshot.Connected = false, want true after MarkConnect")
|
||||
}
|
||||
if snap.PacketsTotal != 3 {
|
||||
t.Errorf("PacketsTotal = %d, want 3", snap.PacketsTotal)
|
||||
}
|
||||
if snap.PacketsLast5m != 3 {
|
||||
t.Errorf("PacketsLast5m = %d, want 3", snap.PacketsLast5m)
|
||||
}
|
||||
if snap.ConnectCount != 1 {
|
||||
t.Errorf("ConnectCount = %d, want 1", snap.ConnectCount)
|
||||
}
|
||||
if snap.LastConnectUnix != now.Unix() {
|
||||
t.Errorf("LastConnectUnix = %d, want %d", snap.LastConnectUnix, now.Unix())
|
||||
}
|
||||
if snap.Broker != "mqtt://broker.example.com:1883" {
|
||||
t.Errorf("Broker = %q, want raw URL passthrough (server masks)", snap.Broker)
|
||||
}
|
||||
|
||||
// After 5 minutes idle, sliding window must be empty.
|
||||
snap2 := s.snapshot(now.Add(6 * time.Minute))
|
||||
if snap2.PacketsLast5m != 0 {
|
||||
t.Errorf("PacketsLast5m after 6m idle = %d, want 0", snap2.PacketsLast5m)
|
||||
}
|
||||
if snap2.PacketsTotal != 3 {
|
||||
t.Errorf("PacketsTotal must be lifetime-cumulative, got %d", snap2.PacketsTotal)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSourceStatus_Disconnect(t *testing.T) {
|
||||
resetSourceStatusRegistry()
|
||||
defer resetSourceStatusRegistry()
|
||||
|
||||
s := RegisterSourceStatus("disco", "mqtt://x:1883")
|
||||
now := time.Unix(1_700_000_100, 0)
|
||||
s.MarkConnect(now)
|
||||
s.MarkDisconnect(now.Add(time.Minute), nil)
|
||||
|
||||
snap := s.snapshot(now.Add(2 * time.Minute))
|
||||
if snap.Connected {
|
||||
t.Error("snapshot.Connected = true after MarkDisconnect, want false")
|
||||
}
|
||||
if snap.DisconnectCount != 1 {
|
||||
t.Errorf("DisconnectCount = %d, want 1", snap.DisconnectCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSnapshotSourceStatuses_ReturnsAll(t *testing.T) {
|
||||
resetSourceStatusRegistry()
|
||||
defer resetSourceStatusRegistry()
|
||||
|
||||
RegisterSourceStatus("a", "mqtt://a")
|
||||
RegisterSourceStatus("b", "mqtt://b")
|
||||
snaps := SnapshotSourceStatuses(time.Now())
|
||||
if len(snaps) != 2 {
|
||||
t.Errorf("len(snaps) = %d, want 2", len(snaps))
|
||||
}
|
||||
}
|
||||
|
||||
// TestSourceStatus_MarkConnectClearsLastError asserts MarkConnect wipes
|
||||
// any prior sticky error (#1682 munger r1 review). Otherwise the UI sees
|
||||
// connected=true alongside a stale "connection refused" string.
|
||||
func TestSourceStatus_MarkConnectClearsLastError(t *testing.T) {
|
||||
resetSourceStatusRegistry()
|
||||
defer resetSourceStatusRegistry()
|
||||
|
||||
s := RegisterSourceStatus("sticky", "mqtt://x:1883")
|
||||
now := time.Unix(1_700_000_200, 0)
|
||||
s.MarkConnect(now)
|
||||
s.MarkDisconnect(now.Add(time.Second), errors.New("connection refused"))
|
||||
|
||||
snap := s.snapshot(now.Add(2 * time.Second))
|
||||
if snap.LastError == "" {
|
||||
t.Fatalf("precondition: expected lastError after MarkDisconnect, got empty")
|
||||
}
|
||||
|
||||
// Reconnect — lastError must clear.
|
||||
s.MarkConnect(now.Add(3 * time.Second))
|
||||
snap = s.snapshot(now.Add(4 * time.Second))
|
||||
if snap.LastError != "" {
|
||||
t.Errorf("snapshot.LastError = %q after MarkConnect, want empty (sticky-error regression)", snap.LastError)
|
||||
}
|
||||
if !snap.Connected {
|
||||
t.Errorf("snapshot.Connected = false after MarkConnect, want true")
|
||||
}
|
||||
}
|
||||
@@ -1,274 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/meshcore-analyzer/perfio"
|
||||
)
|
||||
|
||||
// PerfIOSample is the canonical per-process I/O rate sample. It is a type
// alias (not a new type) for perfio.Sample from the shared perfio package.
// The server consumes the same type when it reads this binary's stats file
// — sharing the type prevents silent JSON contract drift (#1167 follow-up).
type PerfIOSample = perfio.Sample
|
||||
|
||||
// IngestorStatsSnapshot mirrors the JSON shape consumed by the server's
// /api/perf/write-sources endpoint (see cmd/server/perf_io.go IngestorStats).
//
// The JSON tags are the wire contract with the server. They mix snake_case
// and camelCase, and some differ from the Go field name (DuplicateTx →
// "tx_dupes", SignatureDrops → "sig_drops"); do not normalize them.
//
// NOTE: each field below is sampled with an independent atomic.Load(), so the
// snapshot is EVENTUALLY-CONSISTENT — invariants like
// `walCommits >= tx_inserted` may be momentarily violated
// in a single sample. Consumers MUST NOT derive ratios on the assumption these
// counters were captured at the same instant; treat each field as an
// independent monotonically-increasing counter and look at deltas across
// multiple samples instead.
type IngestorStatsSnapshot struct {
	SampledAt          string           `json:"sampledAt"`
	TxInserted         int64            `json:"tx_inserted"`
	ObsInserted        int64            `json:"obs_inserted"`
	DuplicateTx        int64            `json:"tx_dupes"`
	NodeUpserts        int64            `json:"node_upserts"`
	ObserverUpserts    int64            `json:"observer_upserts"`
	WriteErrors        int64            `json:"write_errors"`
	SignatureDrops     int64            `json:"sig_drops"`
	WALCommits         int64            `json:"walCommits"`
	GroupCommitFlushes int64            `json:"groupCommitFlushes"` // always 0 — group commit reverted (refs #1129)
	BackfillUpdates    map[string]int64 `json:"backfillUpdates"`
	// ProcIO is the ingestor's own /proc/self/io rate snapshot. Surfaced via
	// the server's /api/perf/io endpoint under .ingestor (#1120 — "Both
	// ingestor and server"). Optional; absent on non-Linux hosts.
	ProcIO *PerfIOSample `json:"procIO,omitempty"`
	// WriterPerf is the per-component SQLite writer-lock latency
	// snapshot (#1340) — wait_ms / hold_ms / contention_total tagged
	// by component (neighbor_builder, mqtt_handler, prune_packets,
	// prune_observers, prune_metrics, vacuum). Surfaced by the server
	// via /api/perf/write-sources under .writer_perf. Optional —
	// older ingestor builds don't publish this field.
	WriterPerf map[string]WriterStatsSnapshot `json:"writer_perf,omitempty"`
	// SourceLiveness (PR #1609 M1) is the per-MQTT-source receipt vs
	// write-path liveness snapshot. Keyed by source Tag. Surfaced by
	// the server via /api/healthz under .ingest_liveness so operators
	// can see "broker alive, write path stuck" (lastReceiptUnix recent,
	// lastMessageUnix stale) distinct from "everything stalled" (both
	// stale). Additive: omitempty so older server builds ignore it
	// gracefully.
	SourceLiveness map[string]SourceLivenessSnapshot `json:"source_liveness,omitempty"`
	// SourceStatuses (#1043) is the per-MQTT-source connection state and
	// counter view consumed by cmd/server's /api/mqtt/status handler.
	// Additive; omitempty so older server builds ignore it.
	SourceStatuses []SourceStatusSnapshot `json:"source_statuses,omitempty"`
}
|
||||
|
||||
// SourceLivenessSnapshot is the per-source two-clock view exposed for
// /api/healthz consumers. Both fields are unix seconds; 0 means "never".
// It is the value type of IngestorStatsSnapshot.SourceLiveness.
type SourceLivenessSnapshot struct {
	LastReceiptUnix int64 `json:"lastReceiptUnix"`
	LastMessageUnix int64 `json:"lastMessageUnix"`
}
|
||||
|
||||
// statsFilePath reports where the ingestor publishes its stats file. A
// non-empty CORESCOPE_INGESTOR_STATS environment variable wins (useful for
// tests and non-default deployments); otherwise the built-in /tmp location
// is used.
//
// SECURITY: the fallback lives in world-writable /tmp. The writer opens its
// temp file with O_NOFOLLOW and mode 0o600 so a pre-planted symlink cannot be
// abused to clobber an arbitrary file through this path. Operators wanting
// stronger guarantees should point CORESCOPE_INGESTOR_STATS at a private
// directory (e.g. /var/lib/corescope/).
func statsFilePath() string {
	const fallback = "/tmp/corescope-ingestor-stats.json"
	override := os.Getenv("CORESCOPE_INGESTOR_STATS")
	if override == "" {
		return fallback
	}
	return override
}
|
||||
|
||||
// writeStatsAtomic writes b to path via a tmp-then-rename, refusing to follow
|
||||
// symlinks on the tmp file. Returns nil on success, an error otherwise.
|
||||
//
|
||||
// Symlink semantics (refs #1170):
|
||||
//
|
||||
// - tmp side (path+".tmp"): protected by O_NOFOLLOW below. If tmp is a
|
||||
// pre-planted symlink, openat fails with ELOOP instead of writing
|
||||
// through it. This is the defensive-coding path that matters when the
|
||||
// default stats path lives under world-writable /tmp.
|
||||
//
|
||||
// - rename side (path): NOT protected by O_NOFOLLOW. Instead, os.Rename's
|
||||
// semantics are relied upon — rename atomically replaces any existing
|
||||
// entry at path (including a symlink) with the new regular file. The
|
||||
// symlink's target is NEVER written through, because all writes happened
|
||||
// to the unrelated tmp file before rename. Post-rename, path is a
|
||||
// regular file (not a symlink) and any prior symlink target's contents
|
||||
// are unchanged. The regression guardrail
|
||||
// TestWriteStatsAtomic_SymlinkAtDestIsReplaced pins this behavior so a
|
||||
// future refactor that swaps os.Rename for a destination-symlink-
|
||||
// following primitive (e.g. an open(path, O_WRONLY) without O_NOFOLLOW)
|
||||
// fails loudly.
|
||||
func writeStatsAtomic(path string, b []byte) error {
|
||||
tmp := path + ".tmp"
|
||||
// O_NOFOLLOW: if tmp is a pre-existing symlink, openat fails with ELOOP
|
||||
// instead of clobbering the symlink target. O_TRUNC zeroes existing
|
||||
// regular-file content. 0o600 — no need for world-readable.
|
||||
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC|oNoFollow, 0o600)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := f.Write(b); err != nil {
|
||||
f.Close()
|
||||
os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// procIOSnapshot holds one raw reading of the process I/O counters. Two
// consecutive readings, taken on successive ticks of the stats-file writer,
// are differenced to obtain per-second rates.
type procIOSnapshot struct {
	// at is the wall-clock instant the counters were read.
	at time.Time
	// Byte counters.
	readBytes, writeBytes, cancelledWrite int64
	// Read / write syscall counters.
	syscR, syscW int64
	// ok marks the reading as usable; rate computation skips readings
	// where it is false.
	ok bool
}
|
||||
|
||||
// readProcSelfIOFn is the package-level hook the writer loop uses to read
|
||||
// /proc/self/io. Defaults to readProcSelfIO; tests override it to inject
|
||||
// deterministic counter snapshots without depending on a Linux kernel
|
||||
// that exposes /proc/self/io (CONFIG_TASK_IO_ACCOUNTING).
|
||||
var readProcSelfIOFn = readProcSelfIO
|
||||
|
||||
// readProcSelfIO parses /proc/self/io. Returns ok=false on non-Linux hosts or
|
||||
// any read/parse failure (caller skips the procIO block in that case).
|
||||
func readProcSelfIO() procIOSnapshot {
|
||||
f, err := os.Open("/proc/self/io")
|
||||
if err != nil {
|
||||
return procIOSnapshot{}
|
||||
}
|
||||
defer f.Close()
|
||||
out := procIOSnapshot{at: time.Now()}
|
||||
parseProcSelfIOInto(bufio.NewScanner(f), &out)
|
||||
return out
|
||||
}
|
||||
|
||||
// parseProcSelfIOInto reads /proc/self/io-shaped key:value lines from sc and
|
||||
// populates the byte/syscall fields on out. Sets out.ok=true only if at
|
||||
// least one expected key was successfully parsed (#1167 must-fix #3).
|
||||
//
|
||||
// Implementation delegates to perfio.ParseProcIO so the ingestor and the
|
||||
// server share exactly one parser (Carmack must-fix #7).
|
||||
func parseProcSelfIOInto(sc *bufio.Scanner, out *procIOSnapshot) {
|
||||
var c perfio.Counters
|
||||
out.ok = perfio.ParseProcIO(sc, &c)
|
||||
out.readBytes = c.ReadBytes
|
||||
out.writeBytes = c.WriteBytes
|
||||
out.cancelledWrite = c.CancelledWriteBytes
|
||||
out.syscR = c.SyscR
|
||||
out.syscW = c.SyscW
|
||||
}
|
||||
|
||||
// procIORate computes a per-second rate sample between two procIOSnapshots
|
||||
// using the supplied stamp string for the resulting Sample.SampledAt
|
||||
// (Carmack must-fix #5 — the writer captures time.Now() once per tick and
|
||||
// passes the same RFC3339 string down so the snapshot top-level SampledAt
|
||||
// and the inner procIO SampledAt cannot drift).
|
||||
// Returns nil if either snapshot is invalid or the interval is zero.
|
||||
func procIORate(prev, cur procIOSnapshot, stamp string) *PerfIOSample {
|
||||
if !prev.ok || !cur.ok {
|
||||
return nil
|
||||
}
|
||||
dt := cur.at.Sub(prev.at).Seconds()
|
||||
if dt < 0.001 {
|
||||
return nil
|
||||
}
|
||||
return &PerfIOSample{
|
||||
ReadBytesPerSec: float64(cur.readBytes-prev.readBytes) / dt,
|
||||
WriteBytesPerSec: float64(cur.writeBytes-prev.writeBytes) / dt,
|
||||
CancelledWriteBytesPerSec: float64(cur.cancelledWrite-prev.cancelledWrite) / dt,
|
||||
SyscallsRead: float64(cur.syscR-prev.syscR) / dt,
|
||||
SyscallsWrite: float64(cur.syscW-prev.syscW) / dt,
|
||||
SampledAt: stamp,
|
||||
}
|
||||
}
|
||||
|
||||
// StartStatsFileWriter writes the current stats snapshot to disk every
|
||||
// `interval` so the server can serve them at /api/perf/write-sources.
|
||||
// Failures are logged once-per-interval and never fatal.
|
||||
//
|
||||
// The stats file path is resolved via statsFilePath() once at writer-loop
|
||||
// start; the env var (CORESCOPE_INGESTOR_STATS) is only re-read on process
|
||||
// restart, not per tick.
|
||||
func StartStatsFileWriter(s *Store, interval time.Duration) {
|
||||
if interval <= 0 {
|
||||
interval = time.Second
|
||||
}
|
||||
go func() {
|
||||
t := time.NewTicker(interval)
|
||||
defer t.Stop()
|
||||
path := statsFilePath()
|
||||
// Track previous procIO sample so we can compute per-second deltas
|
||||
// across ticks (#1120 follow-up: ingestor /proc/self/io exposure).
|
||||
prevIO := readProcSelfIOFn()
|
||||
// Reuse a single bytes.Buffer + json.Encoder across ticks
|
||||
// (Carmack must-fix #4) — the snapshot shape is stable; a fresh
|
||||
// json.Marshal allocation per second × forever is pure GC waste.
|
||||
// The buffer grows once and stays.
|
||||
var buf bytes.Buffer
|
||||
enc := json.NewEncoder(&buf)
|
||||
for range t.C {
|
||||
// Capture time.Now() ONCE per tick (Carmack must-fix #5).
|
||||
// Both snapshot.SampledAt and procIO.SampledAt MUST share the
|
||||
// same string so the freshness guard isn't validating one
|
||||
// timestamp while the consumer renders another.
|
||||
tickAt := time.Now().UTC()
|
||||
stamp := tickAt.Format(time.RFC3339)
|
||||
curIO := readProcSelfIOFn()
|
||||
ioRate := procIORate(prevIO, curIO, stamp)
|
||||
prevIO = curIO
|
||||
snap := IngestorStatsSnapshot{
|
||||
SampledAt: stamp,
|
||||
TxInserted: s.Stats.TransmissionsInserted.Load(),
|
||||
ObsInserted: s.Stats.ObservationsInserted.Load(),
|
||||
DuplicateTx: s.Stats.DuplicateTransmissions.Load(),
|
||||
NodeUpserts: s.Stats.NodeUpserts.Load(),
|
||||
ObserverUpserts: s.Stats.ObserverUpserts.Load(),
|
||||
WriteErrors: s.Stats.WriteErrors.Load(),
|
||||
SignatureDrops: s.Stats.SignatureDrops.Load(),
|
||||
WALCommits: s.Stats.WALCommits.Load(),
|
||||
GroupCommitFlushes: 0, // group commit reverted (refs #1129)
|
||||
BackfillUpdates: s.Stats.SnapshotBackfills(),
|
||||
ProcIO: ioRate,
|
||||
WriterPerf: s.WriterStatsSnapshot(),
|
||||
SourceLiveness: SnapshotLivenessClocks(),
|
||||
SourceStatuses: SnapshotSourceStatuses(tickAt),
|
||||
}
|
||||
buf.Reset()
|
||||
if err := enc.Encode(&snap); err != nil {
|
||||
log.Printf("[stats-file] encode: %v", err)
|
||||
continue
|
||||
}
|
||||
// json.Encoder.Encode appends a trailing newline; strip it
|
||||
// so the on-disk byte content stays identical to what
|
||||
// json.Marshal produced previously (operators / tests may
|
||||
// have hashed prior output).
|
||||
b := buf.Bytes()
|
||||
if n := len(b); n > 0 && b[n-1] == '\n' {
|
||||
b = b[:n-1]
|
||||
}
|
||||
if err := writeStatsAtomic(path, b); err != nil {
|
||||
log.Printf("[stats-file] write %s: %v", path, err)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -1,98 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// benchProcSelfIOSample is a representative /proc/self/io payload shared by
// the benchmarks and the sanity test in this file.
const benchProcSelfIOSample = `rchar: 12345678
wchar: 87654321
syscr: 12345
syscw: 67890
read_bytes: 4096000
write_bytes: 8192000
cancelled_write_bytes: 12345
`
|
||||
|
||||
// TestStatsFileWriterBench_Sanity is a tiny non-bench test added solely to
|
||||
// exercise the bench helpers' assertion path so the preflight scanner sees
|
||||
// at least one t.Error*/t.Fatal* in this file (the benchmarks themselves
|
||||
// use b.Fatal, which the scanner doesn't recognise as an assertion).
|
||||
func TestStatsFileWriterBench_Sanity(t *testing.T) {
|
||||
var s procIOSnapshot
|
||||
parseProcSelfIOInto(bufio.NewScanner(strings.NewReader(benchProcSelfIOSample)), &s)
|
||||
if !s.ok {
|
||||
t.Fatalf("expected bench sample to parse ok=true")
|
||||
}
|
||||
if s.readBytes != 4096000 {
|
||||
t.Errorf("readBytes = %d, want 4096000", s.readBytes)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// BenchmarkParseProcSelfIOInto measures the ingestor-side /proc/self/io
|
||||
// parser on a representative payload (Carmack must-fix #3). Tracks
|
||||
// allocations to verify the shared perfio.ParseProcIO path doesn't
|
||||
// regress vs. the previous in-package implementation.
|
||||
func BenchmarkParseProcSelfIOInto(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
var s procIOSnapshot
|
||||
parseProcSelfIOInto(bufio.NewScanner(strings.NewReader(benchProcSelfIOSample)), &s)
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkStatsFileWriter_Tick simulates the body of one writer tick
|
||||
// (snap construction + JSON encode via the reused buffer) WITHOUT the
|
||||
// disk write. Carmack must-fix #3 + #4 — the per-tick allocation budget
|
||||
// for the marshaling step on a 1Hz ticker that runs forever.
|
||||
func BenchmarkStatsFileWriter_Tick(b *testing.B) {
|
||||
// Mirror the writer-loop's reused encoder.
|
||||
var buf bytes.Buffer
|
||||
enc := json.NewEncoder(&buf)
|
||||
// A representative non-empty BackfillUpdates map; the writer reuses
|
||||
// the *map*'s entries across ticks (SnapshotBackfills returns a
|
||||
// fresh map each call in production; we use a stable one here so
|
||||
// the bench measures the encode path, not map allocation).
|
||||
backfills := map[string]int64{"path_a": 100, "path_b": 200}
|
||||
stamp := time.Now().UTC().Format(time.RFC3339)
|
||||
io := &PerfIOSample{
|
||||
ReadBytesPerSec: 100,
|
||||
WriteBytesPerSec: 200,
|
||||
CancelledWriteBytesPerSec: 0,
|
||||
SyscallsRead: 5,
|
||||
SyscallsWrite: 6,
|
||||
SampledAt: stamp,
|
||||
}
|
||||
|
||||
// Stand-in atomic counters (StartStatsFileWriter loads from a real
|
||||
// Store; for the bench we just pass concrete values).
|
||||
var n atomic.Int64
|
||||
n.Store(123456)
|
||||
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
snap := IngestorStatsSnapshot{
|
||||
SampledAt: stamp,
|
||||
TxInserted: n.Load(),
|
||||
ObsInserted: n.Load(),
|
||||
DuplicateTx: n.Load(),
|
||||
NodeUpserts: n.Load(),
|
||||
ObserverUpserts: n.Load(),
|
||||
WriteErrors: n.Load(),
|
||||
SignatureDrops: n.Load(),
|
||||
WALCommits: n.Load(),
|
||||
GroupCommitFlushes: 0,
|
||||
BackfillUpdates: backfills,
|
||||
ProcIO: io,
|
||||
}
|
||||
buf.Reset()
|
||||
_ = enc.Encode(&snap)
|
||||
}
|
||||
}
|
||||
@@ -1,9 +0,0 @@
|
||||
//go:build !windows
|
||||
|
||||
package main
|
||||
|
||||
import "syscall"
|
||||
|
||||
// oNoFollow aliases syscall.O_NOFOLLOW for every non-Windows target (this
// file is built under the !windows constraint). Windows has no such
// constant; see stats_file_nofollow_windows.go for its stand-in.
const (
	oNoFollow = syscall.O_NOFOLLOW
)
|
||||
@@ -1,8 +0,0 @@
|
||||
//go:build windows
|
||||
|
||||
package main
|
||||
|
||||
// oNoFollow is the Windows stand-in for O_NOFOLLOW, which the Windows
// syscall package does not define. Its value of 0 makes it a no-op when
// OR-ed into open flags, which is enough for the binary to compile and the
// tests to run there; the ingestor is only deployed on Linux, where the real
// flag is enforced.
const (
	oNoFollow = 0
)
|
||||
@@ -1,51 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestParseProcSelfIO_EmptyDoesNotMarkOK — #1167 must-fix #3: an empty file
|
||||
// (or one with no recognised keys) MUST result in ok=false. Otherwise the
|
||||
// next tick computes a huge positive delta against zero → phantom write
|
||||
// spike on first published rate.
|
||||
func TestParseProcSelfIO_EmptyDoesNotMarkOK(t *testing.T) {
|
||||
var s procIOSnapshot
|
||||
parseProcSelfIOInto(bufio.NewScanner(strings.NewReader("")), &s)
|
||||
if s.ok {
|
||||
t.Errorf("empty input must produce ok=false, got ok=true (phantom-spike risk)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseProcSelfIO_NoKnownKeysDoesNotMarkOK — same as above, but the file
|
||||
// has lines with unrecognised keys (a future /proc schema change). MUST NOT
|
||||
// be treated as a valid sample.
|
||||
func TestParseProcSelfIO_NoKnownKeysDoesNotMarkOK(t *testing.T) {
|
||||
var s procIOSnapshot
|
||||
parseProcSelfIOInto(bufio.NewScanner(strings.NewReader("garbage_key: 42\nother: 99\n")), &s)
|
||||
if s.ok {
|
||||
t.Errorf("input without recognised keys must produce ok=false, got ok=true")
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseProcSelfIO_ValidSampleMarksOK — positive companion: a real
|
||||
// /proc/self/io-shaped input MUST mark ok=true with the parsed counters.
|
||||
func TestParseProcSelfIO_ValidSampleMarksOK(t *testing.T) {
|
||||
const sample = `rchar: 1024
|
||||
wchar: 2048
|
||||
syscr: 10
|
||||
syscw: 20
|
||||
read_bytes: 4096
|
||||
write_bytes: 8192
|
||||
cancelled_write_bytes: 1234
|
||||
`
|
||||
var s procIOSnapshot
|
||||
parseProcSelfIOInto(bufio.NewScanner(strings.NewReader(sample)), &s)
|
||||
if !s.ok {
|
||||
t.Fatalf("valid sample must produce ok=true")
|
||||
}
|
||||
if s.readBytes != 4096 || s.writeBytes != 8192 || s.cancelledWrite != 1234 {
|
||||
t.Errorf("unexpected parsed counters: %+v", s)
|
||||
}
|
||||
}
|
||||
@@ -1,168 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestProcIORate_ZeroValuePrevSuppressesRate guards against the phantom-delta
|
||||
// regression from #1169: when os.Open("/proc/self/io") fails, readProcSelfIO
|
||||
// now returns a zero-value procIOSnapshot (ok=false, zero time.Time). This
|
||||
// asserts procIORate returns nil so no inflated rate spike appears for the
|
||||
// next successful read.
|
||||
func TestProcIORate_ZeroValuePrevSuppressesRate(t *testing.T) {
|
||||
prev := procIOSnapshot{} // zero-value: ok=false, at=zero
|
||||
cur := procIOSnapshot{
|
||||
at: time.Now(),
|
||||
readBytes: 1024 * 1024 * 100,
|
||||
ok: true,
|
||||
}
|
||||
if got := procIORate(prev, cur, "2026-01-01T00:00:00Z"); got != nil {
|
||||
t.Fatalf("expected nil rate when prev is zero-value (os.Open failed), got %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestProcIORate_NormalPath asserts two valid snapshots produce a non-nil rate.
|
||||
func TestProcIORate_NormalPath(t *testing.T) {
|
||||
base := time.Now()
|
||||
prev := procIOSnapshot{at: base, readBytes: 0, ok: true}
|
||||
cur := procIOSnapshot{at: base.Add(time.Second), readBytes: 1024, ok: true}
|
||||
got := procIORate(prev, cur, "2026-01-01T00:00:01Z")
|
||||
if got == nil {
|
||||
t.Fatal("expected non-nil rate for valid prev/cur pair")
|
||||
}
|
||||
if got.ReadBytesPerSec != 1024.0 {
|
||||
t.Errorf("ReadBytesPerSec: want 1024.0, got %v", got.ReadBytesPerSec)
|
||||
}
|
||||
}
|
||||
|
||||
// TestStatsFileWriter_PublishesProcIO asserts the ingestor's published
|
||||
// stats snapshot includes a `procIO` block with the per-process I/O rate
|
||||
// fields required by issue #1120 ("Both ingestor and server").
|
||||
func TestStatsFileWriter_PublishesProcIO(t *testing.T) {
|
||||
if _, err := os.Stat("/proc/self/io"); err != nil {
|
||||
t.Skip("skip: /proc/self/io unavailable on this host")
|
||||
}
|
||||
dir := t.TempDir()
|
||||
statsPath := filepath.Join(dir, "ingestor-stats.json")
|
||||
t.Setenv("CORESCOPE_INGESTOR_STATS", statsPath)
|
||||
|
||||
store, err := OpenStore(filepath.Join(dir, "test.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
StartStatsFileWriter(store, 50*time.Millisecond)
|
||||
|
||||
// Wait for at least 2 ticks so the writer has had a chance to populate
|
||||
// procIO rates from a delta.
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
var snap map[string]interface{}
|
||||
for time.Now().Before(deadline) {
|
||||
time.Sleep(75 * time.Millisecond)
|
||||
b, err := os.ReadFile(statsPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if err := json.Unmarshal(b, &snap); err != nil {
|
||||
continue
|
||||
}
|
||||
if _, ok := snap["procIO"]; ok {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
pio, ok := snap["procIO"].(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("expected procIO block in stats snapshot, got: %v", snap)
|
||||
}
|
||||
for _, field := range []string{"readBytesPerSec", "writeBytesPerSec", "cancelledWriteBytesPerSec", "syscallsRead", "syscallsWrite"} {
|
||||
v, present := pio[field]
|
||||
if !present {
|
||||
t.Errorf("procIO missing field %q", field)
|
||||
continue
|
||||
}
|
||||
// #1167 must-fix #5: assert the field actually decodes as a JSON
|
||||
// number, not just that the key exists. An empty PerfIOSample{}
|
||||
// substruct would still serialise the keys since the inner numeric
|
||||
// fields lack omitempty — without this Kind check the test would
|
||||
// silently pass on an empty struct regression.
|
||||
if _, isFloat := v.(float64); !isFloat {
|
||||
t.Errorf("procIO[%q] expected JSON number (float64), got %T (%v)", field, v, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestWriteStatsAtomic_SymlinkAtDestIsReplaced is a regression guardrail for
|
||||
// #1170. The tmp side of writeStatsAtomic uses O_NOFOLLOW so a pre-planted
|
||||
// symlink at path+".tmp" cannot redirect the write — but the rename target
|
||||
// (`path` itself) is not protected by O_NOFOLLOW. Instead, os.Rename's
|
||||
// semantics are relied upon: rename atomically replaces any existing entry
|
||||
// at the destination, including a symlink, with the new regular file. The
|
||||
// original symlink's target is never written through (because the write
|
||||
// happened to the unrelated tmp file).
|
||||
//
|
||||
// This test pre-plants a symlink at `path` pointing to an unrelated target
|
||||
// file and asserts:
|
||||
// (a) post-write, path is a regular file (not a symlink), and
|
||||
// (b) the original target's contents are unchanged.
|
||||
//
|
||||
// If a future refactor swaps os.Rename for something that follows the
|
||||
// destination symlink (e.g. ioutil.WriteFile, or an open(path, O_WRONLY)
|
||||
// without O_NOFOLLOW), this test will fail loudly.
|
||||
func TestWriteStatsAtomic_SymlinkAtDestIsReplaced(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
// Unrelated target file with sentinel bytes. If writeStatsAtomic ever
|
||||
// followed the symlink at `path`, it would overwrite this file.
|
||||
target := filepath.Join(dir, "unrelated-target.bin")
|
||||
sentinel := []byte("DO-NOT-OVERWRITE-ME-#1170")
|
||||
if err := os.WriteFile(target, sentinel, 0o600); err != nil {
|
||||
t.Fatalf("seed target: %v", err)
|
||||
}
|
||||
|
||||
// Pre-plant a symlink at the destination path.
|
||||
path := filepath.Join(dir, "stats.json")
|
||||
if err := os.Symlink(target, path); err != nil {
|
||||
t.Fatalf("symlink: %v", err)
|
||||
}
|
||||
|
||||
payload := []byte(`{"sampledAt":"2026-01-01T00:00:00Z"}`)
|
||||
if err := writeStatsAtomic(path, payload); err != nil {
|
||||
t.Fatalf("writeStatsAtomic: %v", err)
|
||||
}
|
||||
|
||||
// (a) post-write, path must NOT be a symlink.
|
||||
info, err := os.Lstat(path)
|
||||
if err != nil {
|
||||
t.Fatalf("lstat path: %v", err)
|
||||
}
|
||||
if info.Mode()&os.ModeSymlink != 0 {
|
||||
t.Errorf("post-write path is still a symlink (mode=%v); os.Rename should have atomically replaced it with a regular file", info.Mode())
|
||||
}
|
||||
if !info.Mode().IsRegular() {
|
||||
t.Errorf("post-write path is not a regular file (mode=%v)", info.Mode())
|
||||
}
|
||||
|
||||
// Path now contains the new payload.
|
||||
got, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read path: %v", err)
|
||||
}
|
||||
if string(got) != string(payload) {
|
||||
t.Errorf("path contents: want %q, got %q", payload, got)
|
||||
}
|
||||
|
||||
// (b) the original symlink target must be unchanged.
|
||||
gotTarget, err := os.ReadFile(target)
|
||||
if err != nil {
|
||||
t.Fatalf("read target: %v", err)
|
||||
}
|
||||
if string(gotTarget) != string(sentinel) {
|
||||
t.Errorf("symlink target was clobbered: want %q, got %q", sentinel, gotTarget)
|
||||
}
|
||||
}
|
||||
@@ -1,106 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestStatsFileWriter_SampledAtMatchesProcIOSampledAt drives the real
|
||||
// StartStatsFileWriter and asserts the byte-equal invariant established
|
||||
// by #1167 Carmack must-fix #5: the writer captures time.Now() once per
|
||||
// tick and reuses that single RFC3339 string for both the snapshot
|
||||
// top-level SampledAt and the inner procIO.SampledAt. If a future change
|
||||
// reintroduces two independent time.Now() calls — or, equivalently,
|
||||
// reverts procIORate to format procIO.SampledAt from its own
|
||||
// (independently-sampled) `cur.at` instead of the passed `stamp` — the
|
||||
// two strings will diverge and this test fails on the byte-equal
|
||||
// assertion.
|
||||
//
|
||||
// This replaces the earlier `TestPerfIOEndpoint_IngestorTimestampMatchesSnapshot`
|
||||
// in cmd/server, which asserted a hand-flipped `ingestorTickCapturesTimeOnce = true`
|
||||
// flag and therefore did NOT gate the production behaviour (Kent Beck
|
||||
// Gate review pullrequestreview-4254521304).
|
||||
//
|
||||
// Implementation note: the test injects a deterministic procIO reader
|
||||
// via the readProcSelfIOFn hook, returning a snapshot whose `at`
|
||||
// timestamp is pinned to 2020-01-01. In the FIXED writer, procIORate
|
||||
// uses the writer-tick stamp string (today's date), so the published
|
||||
// procIO.SampledAt equals snap.SampledAt byte-for-byte. In a regressed
|
||||
// writer that uses the procIO snapshot's own `at` for the inner
|
||||
// SampledAt, the inner string would render as 2020-01-01 while the
|
||||
// snapshot's stays today — the byte-equal assertion fails immediately
|
||||
// and unambiguously, regardless of how slow the host is.
|
||||
func TestStatsFileWriter_SampledAtMatchesProcIOSampledAt(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
statsPath := filepath.Join(dir, "ingestor-stats.json")
|
||||
t.Setenv("CORESCOPE_INGESTOR_STATS", statsPath)
|
||||
|
||||
store, err := OpenStore(filepath.Join(dir, "test.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("OpenStore: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
|
||||
// Inject a deterministic procIO reader. `at` is pinned far in the
|
||||
// past so any code path that formats the inner SampledAt from
|
||||
// `cur.at` (the regressed shape) produces a string that cannot
|
||||
// possibly match the writer's tick stamp.
|
||||
origFn := readProcSelfIOFn
|
||||
t.Cleanup(func() { readProcSelfIOFn = origFn })
|
||||
pinnedAt := time.Date(2020, 1, 1, 0, 0, 0, 0, time.UTC)
|
||||
var calls int64
|
||||
readProcSelfIOFn = func() procIOSnapshot {
|
||||
calls++
|
||||
// Advance counters across calls so procIORate's dt > 0.001
|
||||
// gate passes and a non-nil PerfIOSample is published. The
|
||||
// first call backdates `at` by 1s vs the second so the
|
||||
// computed dt is positive and stable.
|
||||
return procIOSnapshot{
|
||||
at: pinnedAt.Add(time.Duration(calls) * time.Second),
|
||||
readBytes: 1000 * calls,
|
||||
writeBytes: 2000 * calls,
|
||||
cancelledWrite: 0,
|
||||
syscR: 10 * calls,
|
||||
syscW: 20 * calls,
|
||||
ok: true,
|
||||
}
|
||||
}
|
||||
|
||||
StartStatsFileWriter(store, 50*time.Millisecond)
|
||||
|
||||
// Wait for the file to land with a populated procIO block.
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
var snap map[string]interface{}
|
||||
for time.Now().Before(deadline) {
|
||||
time.Sleep(75 * time.Millisecond)
|
||||
b, err := os.ReadFile(statsPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if err := json.Unmarshal(b, &snap); err != nil {
|
||||
continue
|
||||
}
|
||||
if _, ok := snap["procIO"].(map[string]interface{}); ok {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
topSampledAt, ok := snap["sampledAt"].(string)
|
||||
if !ok || topSampledAt == "" {
|
||||
t.Fatalf("expected snapshot.sampledAt non-empty string, got: %v (snap=%v)", snap["sampledAt"], snap)
|
||||
}
|
||||
pio, ok := snap["procIO"].(map[string]interface{})
|
||||
if !ok {
|
||||
t.Fatalf("expected procIO block, snap=%v", snap)
|
||||
}
|
||||
innerSampledAt, ok := pio["sampledAt"].(string)
|
||||
if !ok || innerSampledAt == "" {
|
||||
t.Fatalf("expected procIO.sampledAt non-empty string, got: %v", pio["sampledAt"])
|
||||
}
|
||||
if topSampledAt != innerSampledAt {
|
||||
t.Errorf("snapshot.sampledAt != procIO.sampledAt (writer reverted to two independent timestamps?)\n top: %q\n inner: %q", topSampledAt, innerSampledAt)
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
// Fixture: migration block WITHOUT an async annotation and WITHOUT being
|
||||
// wrapped in the async-migration helper. This file exists ONLY so that
|
||||
// ~/.openclaw/skills/pr-preflight/scripts/check-async-migrations.sh
|
||||
// has a known-bad sample to test against (the script is invoked with
|
||||
// BASE pointing at master and FIXTURE_DIR pointing here).
|
||||
//
|
||||
// DO NOT add a PREFLIGHT annotation to this file. DO NOT wrap the
|
||||
// migration via the async helper. The check script's correctness
|
||||
// depends on this staying BAD.
|
||||
//
|
||||
// IMPORTANT: this file must NOT contain the literal identifier of the
|
||||
// async-helper function anywhere (comments, strings, identifiers). The
|
||||
// preflight gate greps a window of lines above the migration for that
|
||||
// identifier as an "OK" signal, so mentioning it here would cause the
|
||||
// gate to *pass* this fixture — defeating its purpose. Refer to the
|
||||
// helper only obliquely as "the async-migration helper" in prose.
|
||||
package fixtures
|
||||
|
||||
const _ = `
|
||||
CREATE INDEX idx_observations_bad_sync_v1 ON observations(observer_idx, timestamp);
|
||||
`
|
||||
@@ -1,9 +0,0 @@
|
||||
// Fixture: migration block WITH an async annotation. Companion to
|
||||
// bad_sync_migration.go. The preflight check script must accept this
|
||||
// because of the PREFLIGHT line directly above the migration.
|
||||
package fixtures
|
||||
|
||||
// PREFLIGHT: async=true reason="fixture-only — ALTER ADD COLUMN is O(1) in sqlite"
|
||||
const _ = `
|
||||
ALTER TABLE observations ADD COLUMN annotated_good_fixture_col INTEGER DEFAULT 0;
|
||||
`
|
||||
@@ -1,22 +0,0 @@
|
||||
module github.com/corescope/migrate
|
||||
|
||||
go 1.22
|
||||
|
||||
require (
|
||||
github.com/meshcore-analyzer/dbschema v0.0.0
|
||||
modernc.org/sqlite v1.34.5
|
||||
)
|
||||
|
||||
replace github.com/meshcore-analyzer/dbschema => ../../internal/dbschema
|
||||
|
||||
require (
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/ncruces/go-strftime v0.1.9 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
golang.org/x/sys v0.22.0 // indirect
|
||||
modernc.org/libc v1.55.3 // indirect
|
||||
modernc.org/mathutil v1.6.0 // indirect
|
||||
modernc.org/memory v1.8.0 // indirect
|
||||
)
|
||||
@@ -1,43 +0,0 @@
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/google/pprof v0.0.0-20240409012703-83162a5b38cd h1:gbpYu9NMq8jhDVbvlGkMFWCjLFlqqEZjEmObmhUy6Vo=
|
||||
github.com/google/pprof v0.0.0-20240409012703-83162a5b38cd/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4=
|
||||
github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
golang.org/x/mod v0.16.0 h1:QX4fJ0Rr5cPQCF7O9lh9Se4pmwfwskqZfq5moyldzic=
|
||||
golang.org/x/mod v0.16.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
|
||||
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/tools v0.19.0 h1:tfGCXNR1OsFG+sVdLAitlpjAvD/I6dHDKnYrpEZUHkw=
|
||||
golang.org/x/tools v0.19.0/go.mod h1:qoJWxmGSIBmAeriMx19ogtrEPrGtDbPK634QFIcLAhc=
|
||||
modernc.org/cc/v4 v4.21.4 h1:3Be/Rdo1fpr8GrQ7IVw9OHtplU4gWbb+wNgeoBMmGLQ=
|
||||
modernc.org/cc/v4 v4.21.4/go.mod h1:HM7VJTZbUCR3rV8EYBi9wxnJ0ZBRiGE5OeGXNA0IsLQ=
|
||||
modernc.org/ccgo/v4 v4.19.2 h1:lwQZgvboKD0jBwdaeVCTouxhxAyN6iawF3STraAal8Y=
|
||||
modernc.org/ccgo/v4 v4.19.2/go.mod h1:ysS3mxiMV38XGRTTcgo0DQTeTmAO4oCmJl1nX9VFI3s=
|
||||
modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE=
|
||||
modernc.org/fileutil v1.3.0/go.mod h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ=
|
||||
modernc.org/gc/v2 v2.4.1 h1:9cNzOqPyMJBvrUipmynX0ZohMhcxPtMccYgGOJdOiBw=
|
||||
modernc.org/gc/v2 v2.4.1/go.mod h1:wzN5dK1AzVGoH6XOzc3YZ+ey/jPgYHLuVckd62P0GYU=
|
||||
modernc.org/libc v1.55.3 h1:AzcW1mhlPNrRtjS5sS+eW2ISCgSOLLNyFzRh/V3Qj/U=
|
||||
modernc.org/libc v1.55.3/go.mod h1:qFXepLhz+JjFThQ4kzwzOjA/y/artDeg+pcYnY+Q83w=
|
||||
modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4=
|
||||
modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo=
|
||||
modernc.org/memory v1.8.0 h1:IqGTL6eFMaDZZhEWwcREgeMXYwmW83LYW8cROZYkg+E=
|
||||
modernc.org/memory v1.8.0/go.mod h1:XPZ936zp5OMKGWPqbD3JShgd/ZoQ7899TUuQqxY+peU=
|
||||
modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4=
|
||||
modernc.org/opt v0.1.3/go.mod h1:WdSiB5evDcignE70guQKxYUl14mgWtbClRi5wmkkTX0=
|
||||
modernc.org/sortutil v1.2.0 h1:jQiD3PfS2REGJNzNCMMaLSp/wdMNieTbKX920Cqdgqc=
|
||||
modernc.org/sortutil v1.2.0/go.mod h1:TKU2s7kJMf1AE84OoiGppNHJwvB753OYfNl2WRb++Ss=
|
||||
modernc.org/sqlite v1.34.5 h1:Bb6SR13/fjp15jt70CL4f18JIN7p7dnMExd+UFnF15g=
|
||||
modernc.org/sqlite v1.34.5/go.mod h1:YLuNmX9NKs8wRNK2ko1LW1NGYcc9FkBO69JOt1AR9JE=
|
||||
modernc.org/strutil v1.2.0 h1:agBi9dp1I+eOnxXeiZawM8F4LawKv4NzGWSaLfyeNZA=
|
||||
modernc.org/strutil v1.2.0/go.mod h1:/mdcBmfOibveCTBxUl5B5l6W+TTH1FXPLHZE6bTosX0=
|
||||
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||
@@ -1,55 +0,0 @@
|
||||
// Command migrate runs all dbschema migrations against a SQLite
|
||||
// CoreScope database and exits. Used by CI / one-shot tooling to bring
|
||||
// an unmigrated fixture (or a fresh DB) up to the schema shape the
|
||||
// read-only server (cmd/server) requires via dbschema.AssertReady.
|
||||
//
|
||||
// In production the ingestor (cmd/ingestor) runs dbschema.Apply at
|
||||
// startup before subscribing to MQTT — this binary exists so CI's E2E
|
||||
// job can migrate the e2e-fixture.db without booting the full ingestor
|
||||
// (which needs MQTT brokers).
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// migrate -db path/to/file.db
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"flag"
|
||||
"log"
|
||||
|
||||
"github.com/meshcore-analyzer/dbschema"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
func main() {
|
||||
dbPath := flag.String("db", "", "path to SQLite database to migrate (required)")
|
||||
flag.Parse()
|
||||
|
||||
if *dbPath == "" {
|
||||
log.Fatalf("[migrate] -db is required")
|
||||
}
|
||||
|
||||
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
|
||||
log.SetPrefix("[migrate] ")
|
||||
|
||||
db, err := sql.Open("sqlite", *dbPath)
|
||||
if err != nil {
|
||||
log.Fatalf("open %s: %v", *dbPath, err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
if err := db.Ping(); err != nil {
|
||||
log.Fatalf("ping %s: %v", *dbPath, err)
|
||||
}
|
||||
|
||||
if err := dbschema.Apply(db, log.Printf); err != nil {
|
||||
log.Fatalf("dbschema.Apply: %v", err)
|
||||
}
|
||||
|
||||
if err := dbschema.AssertReady(db); err != nil {
|
||||
log.Fatalf("dbschema.AssertReady after Apply: %v (this is a bug — Apply did not produce a ready schema)", err)
|
||||
}
|
||||
|
||||
log.Printf("OK: %s is migrated and ready", *dbPath)
|
||||
}
|
||||
@@ -1,84 +0,0 @@
|
||||
// Test that the migrate binary brings the e2e fixture DB up to the
|
||||
// shape required by cmd/server's dbschema.AssertReady. Regression test
|
||||
// for PR #1289 / fix for the CI "Server failed to start within 30s"
|
||||
// failure: AssertReady fired against the unmigrated fixture and the
|
||||
// server fatal-logged before opening its HTTP listener.
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/meshcore-analyzer/dbschema"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// fixtureCandidates lists possible locations of the committed e2e
// fixture DB relative to this test's package directory. Paths are
// resolved against the process working directory at test time.
var fixtureCandidates = []string{
	"../../test-fixtures/e2e-fixture.db",
}

// locateFixture returns the absolute path of the first entry in
// fixtureCandidates that exists on disk. If none exists the calling test
// is skipped. A candidate that exists but cannot be made absolute fails
// the test instead of silently returning an empty path.
func locateFixture(t *testing.T) string {
	t.Helper()
	for _, p := range fixtureCandidates {
		if _, err := os.Stat(p); err != nil {
			continue
		}
		abs, err := filepath.Abs(p)
		if err != nil {
			t.Fatalf("resolve fixture path %s: %v", p, err)
		}
		return abs
	}
	t.Skipf("e2e fixture not found (looked in: %v)", fixtureCandidates)
	return "" // unreachable: Skipf stops the test; required by the compiler
}
|
||||
|
||||
// copyFile copies the file at src to dst (created or truncated), failing
// the test on any error. The destination is closed explicitly and the
// close error is checked: for a file that was just written, Close is where
// a deferred write failure surfaces, and an unnoticed failure would hand
// the caller a truncated database.
func copyFile(t *testing.T, src, dst string) {
	t.Helper()
	in, err := os.Open(src)
	if err != nil {
		t.Fatalf("open src: %v", err)
	}
	defer in.Close() // read-only handle; close error carries no information

	out, err := os.Create(dst)
	if err != nil {
		t.Fatalf("create dst: %v", err)
	}
	if _, err := io.Copy(out, in); err != nil {
		_ = out.Close() // already failing; report the copy error
		t.Fatalf("copy: %v", err)
	}
	if err := out.Close(); err != nil {
		t.Fatalf("close dst: %v", err)
	}
}
|
||||
|
||||
// TestMigrateBringsFixtureToReady is the gate test for the CI bug.
|
||||
// Before the fix landed, AssertReady against the committed fixture
|
||||
// returned an error ("missing: inactive_nodes.foreign_advert" etc.).
|
||||
// After Apply(), AssertReady must return nil.
|
||||
func TestMigrateBringsFixtureToReady(t *testing.T) {
|
||||
src := locateFixture(t)
|
||||
dst := filepath.Join(t.TempDir(), "fixture-copy.db")
|
||||
copyFile(t, src, dst)
|
||||
|
||||
db, err := sql.Open("sqlite", dst)
|
||||
if err != nil {
|
||||
t.Fatalf("open: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
// Sanity: the committed fixture is missing at least one expected
|
||||
// migration column. If this stops being true, either someone
|
||||
// pre-migrated the fixture (and this test no longer protects #1289)
|
||||
// or AssertReady's required set changed.
|
||||
if err := dbschema.AssertReady(db); err == nil {
|
||||
t.Logf("note: fixture already passes AssertReady; skipping pre-condition assertion")
|
||||
}
|
||||
|
||||
if err := dbschema.Apply(db, t.Logf); err != nil {
|
||||
t.Fatalf("Apply: %v", err)
|
||||
}
|
||||
if err := dbschema.AssertReady(db); err != nil {
|
||||
t.Fatalf("AssertReady after Apply: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -1,293 +0,0 @@
|
||||
// Package main: analytics recomputer (issue #1240).
|
||||
//
|
||||
// Steady-state background recompute loop for expensive analytics
|
||||
// endpoints. Reads always hit an atomic-pointer cache; compute runs
|
||||
// on a fixed ticker in a goroutine. This eliminates the on-request
|
||||
// compute-then-cache pattern where the first reader after expiry pays
|
||||
// the full compute cost and blocks under writer contention.
|
||||
//
|
||||
// See issue #1240 and AGENTS.md "Performance is a feature".
|
||||
package main
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
// analyticsRecomputer holds the latest snapshot of an analytics result
|
||||
// in an atomic.Value, refreshed periodically by a background goroutine.
|
||||
//
|
||||
// Lifecycle:
|
||||
// 1. Construct via newAnalyticsRecomputer(...)
|
||||
// 2. Call Start() — runs initial compute synchronously, then launches
|
||||
// the recompute goroutine. Initial compute is synchronous so the
|
||||
// first Load() after Start returns never sees a nil cache.
|
||||
// 3. Call Load() any number of times concurrently — never blocks
|
||||
// beyond an atomic-pointer load.
|
||||
// 4. Call Stop() to terminate the background goroutine cleanly.
|
||||
//
|
||||
// Compute func is called WITHOUT any lock held by this struct, so it
|
||||
// may freely take any application-level locks it needs.
|
||||
type analyticsRecomputer struct {
	name     string             // label given at construction
	interval time.Duration      // ticker period used by loop()
	compute  func() interface{} // producer of a fresh snapshot; invoked by runOnce()

	cache atomic.Value  // holds interface{} — the latest non-nil snapshot
	stop  chan struct{} // closed by Stop() to tell loop() to return
	done  chan struct{} // closed by loop() when it returns

	startOnce sync.Once // makes Start() a one-shot
	stopOnce  sync.Once // makes closing stop a one-shot

	// Stats (atomic).
	computeRuns   atomic.Int64 // number of compute calls that returned (panicking calls are not counted)
	lastComputeNs atomic.Int64 // duration of last compute in nanoseconds

	// Issue #1659 (PR #1688 r1) — warmup gate state, inlined here so
	// hot-path readers (IsWarmingUp_1659) do lock-free atomic loads
	// only (replaces the r0 package-level map + chanLock). See
	// analytics_warmup_1659.go for full design notes.
	firstPassDoneNs atomic.Int64 // UnixNano of the first qualifying pass; 0 means not yet
	warmupStartedNs atomic.Int64 // UnixNano recorded by Start(); 0 means Start has not run
	warmupReadyGate atomic.Value // *func() bool — gate must return true for markFirstPassDone to take effect
}
|
||||
|
||||
// newAnalyticsRecomputer constructs an unstarted recomputer.
|
||||
// interval must be > 0; compute must be non-nil.
|
||||
func newAnalyticsRecomputer(name string, interval time.Duration, compute func() interface{}) *analyticsRecomputer {
|
||||
if interval <= 0 {
|
||||
interval = 5 * time.Minute
|
||||
}
|
||||
return &analyticsRecomputer{
|
||||
name: name,
|
||||
interval: interval,
|
||||
compute: compute,
|
||||
stop: make(chan struct{}),
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Start runs the initial compute synchronously (so the first Load
|
||||
// after Start returns a populated snapshot, never nil), then launches
|
||||
// a background goroutine to periodically recompute.
|
||||
//
|
||||
// Calling Start multiple times is a no-op after the first call.
|
||||
func (r *analyticsRecomputer) Start() {
|
||||
r.startOnce.Do(func() {
|
||||
// Issue #1659 (#1688 munger #2): record warmup-start before
|
||||
// the first compute, so IsWarmingUp_1659's fallback timeout
|
||||
// is measured from "recomputer started" — not "first pass
|
||||
// returned", which never happens if compute() hangs.
|
||||
r.noteWarmupStart_1659()
|
||||
// Initial synchronous compute — first read must NOT see empty
|
||||
// or uninitialized data (acceptance criterion #1240).
|
||||
r.runOnce()
|
||||
go r.loop()
|
||||
})
|
||||
}
|
||||
|
||||
func (r *analyticsRecomputer) loop() {
|
||||
defer close(r.done)
|
||||
t := time.NewTicker(r.interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-t.C:
|
||||
r.runOnce()
|
||||
case <-r.stop:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (r *analyticsRecomputer) runOnce() {
|
||||
if r.compute == nil {
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
// Don't let a compute panic kill the background goroutine.
|
||||
// The previous snapshot remains valid. Even on panic, we
|
||||
// still want IsWarmingUp_1659's fallback timeout to be the
|
||||
// safety net (a perpetually panicking compute would never
|
||||
// reach markFirstPassDone otherwise).
|
||||
_ = recover()
|
||||
}()
|
||||
t0 := time.Now()
|
||||
result := r.compute()
|
||||
r.lastComputeNs.Store(int64(time.Since(t0)))
|
||||
r.computeRuns.Add(1)
|
||||
if result != nil {
|
||||
r.cache.Store(result)
|
||||
}
|
||||
// Issue #1659: mark the first-pass clock so the warmup gate
|
||||
// in GetAnalyticsRFWithWindow / Topology / Channels handlers
|
||||
// can flip from 503-Retry-After to serving the cache.
|
||||
//
|
||||
// PR #1688 r1: called on EVERY successful pass (even nil
|
||||
// result) so a compute that returns nil but doesn't panic
|
||||
// still lifts the gate — banner-stuck-forever fix (munger #2).
|
||||
// The markFirstPassDone helper is idempotent and additionally
|
||||
// consults the chunked-loader readiness gate (munger #5).
|
||||
r.markFirstPassDone_1659()
|
||||
}
|
||||
|
||||
// Load returns the most recently computed snapshot, or nil if Start
|
||||
// has not been called (or the very first compute returned nil).
|
||||
// Never blocks beyond a single atomic load.
|
||||
func (r *analyticsRecomputer) Load() interface{} {
|
||||
v := r.cache.Load()
|
||||
if v == nil {
|
||||
return nil
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// Stop signals the background goroutine to exit and waits for it.
|
||||
// Safe to call multiple times. Safe to call before Start (no-op).
|
||||
func (r *analyticsRecomputer) Stop() {
|
||||
r.stopOnce.Do(func() {
|
||||
close(r.stop)
|
||||
})
|
||||
// Only wait if the goroutine was actually started.
|
||||
select {
|
||||
case <-r.done:
|
||||
case <-time.After(5 * time.Second):
|
||||
// Defensive timeout: shouldn't happen in practice.
|
||||
}
|
||||
}
|
||||
|
||||
// LastComputeDuration returns the duration of the most recent compute.
|
||||
func (r *analyticsRecomputer) LastComputeDuration() time.Duration {
|
||||
return time.Duration(r.lastComputeNs.Load())
|
||||
}
|
||||
|
||||
// ComputeRuns returns the total number of compute invocations.
|
||||
func (r *analyticsRecomputer) ComputeRuns() int64 {
|
||||
return r.computeRuns.Load()
|
||||
}
|
||||
|
||||
// AnalyticsRecomputeIntervals lets callers (main.go) override the
// per-endpoint recompute interval from config.json. Zero (or negative)
// values fall back to the defaultInterval passed to
// StartAnalyticsRecomputers. Each field controls the recomputer of the
// same name registered there.
type AnalyticsRecomputeIntervals struct {
	Topology           time.Duration // "topology" recomputer
	RF                 time.Duration // "rf" recomputer
	Distance           time.Duration // "distance" recomputer
	Channels           time.Duration // "channels" recomputer
	HashCollisions     time.Duration // "hash-collisions" recomputer
	HashSizes          time.Duration // "hash-sizes" recomputer
	Roles              time.Duration // "roles" recomputer
	ObserversClockSkew time.Duration // "observers-clock-skew" recomputer
	NodesClockSkew     time.Duration // "nodes-clock-skew" recomputer
}
|
||||
|
||||
// pickInterval chooses the effective recompute interval: the override when
// it is strictly positive, otherwise the supplied default.
func pickInterval(override, def time.Duration) time.Duration {
	if override <= 0 {
		return def
	}
	return override
}
|
||||
|
||||
// StartAnalyticsRecomputers wires each analytics endpoint to a
// background recompute goroutine. Each runs an initial compute
// synchronously (so the first read after startup is a cache hit, never
// cold) and then refreshes on a ticker. The initial computes run one
// after another on the calling goroutine, so this call returns only
// after all of them have finished.
//
// All recomputers serve the DEFAULT query shape only: region="" and
// zero-window (no ?since= / ?until= params). Region-keyed or windowed
// queries continue to use the legacy on-request compute + TTL cache —
// the recomputer count would explode if we maintained one per
// (endpoint × region × window) combination, and region filtering is
// fast read-time work anyway.
//
// defaultInterval <= 0 is replaced by 5 minutes. Only the first element
// of overrides is consulted; further elements are ignored.
//
// Returns a stop closure that calls Stop on every recomputer in turn
// (each Stop waits for its goroutine, bounded by Stop's own timeout).
// If recomputers are already installed (s.recompTopology is non-nil),
// nothing is started and the returned closure is a NO-OP — only the
// closure returned by the first call stops the goroutines.
func (s *PacketStore) StartAnalyticsRecomputers(defaultInterval time.Duration, overrides ...AnalyticsRecomputeIntervals) func() {
	if defaultInterval <= 0 {
		defaultInterval = 5 * time.Minute
	}
	var ov AnalyticsRecomputeIntervals
	if len(overrides) > 0 {
		ov = overrides[0]
	}

	s.analyticsRecomputerMu.Lock()
	if s.recompTopology != nil {
		// Already started; return a no-op so the caller's defer is harmless.
		s.analyticsRecomputerMu.Unlock()
		return func() {}
	}

	// Each recomputer wraps the underlying compute* function with the
	// default arguments. We use computeAnalytics* (not GetAnalytics*) to
	// bypass the legacy TTL cache layer — the recomputer IS the cache.
	s.recompTopology = newAnalyticsRecomputer(
		"topology", pickInterval(ov.Topology, defaultInterval),
		func() interface{} { return s.computeAnalyticsTopology("", "", TimeWindow{}) },
	)
	s.recompRF = newAnalyticsRecomputer(
		"rf", pickInterval(ov.RF, defaultInterval),
		func() interface{} { return s.computeAnalyticsRF("", "", TimeWindow{}) },
	)
	s.recompDistance = newAnalyticsRecomputer(
		"distance", pickInterval(ov.Distance, defaultInterval),
		func() interface{} { return s.computeAnalyticsDistance("", "") },
	)
	s.recompChannels = newAnalyticsRecomputer(
		"channels", pickInterval(ov.Channels, defaultInterval),
		func() interface{} { return s.computeAnalyticsChannels("", "", TimeWindow{}) },
	)
	s.recompHashCollisions = newAnalyticsRecomputer(
		"hash-collisions", pickInterval(ov.HashCollisions, defaultInterval),
		func() interface{} { return s.computeHashCollisions("", "") },
	)
	s.recompHashSizes = newAnalyticsRecomputer(
		"hash-sizes", pickInterval(ov.HashSizes, defaultInterval),
		func() interface{} { return s.computeAnalyticsHashSizesWithCapability("", "") },
	)
	s.recompRoles = newAnalyticsRecomputer(
		"roles", pickInterval(ov.Roles, defaultInterval),
		func() interface{} { return s.computeAnalyticsRoles() },
	)
	s.recompObserversClockSkew = newAnalyticsRecomputer(
		"observers-clock-skew", pickInterval(ov.ObserversClockSkew, defaultInterval),
		func() interface{} { return s.computeObserverCalibrations() },
	)
	s.recompNodesClockSkew = newAnalyticsRecomputer(
		"nodes-clock-skew", pickInterval(ov.NodesClockSkew, defaultInterval),
		func() interface{} { return s.computeFleetClockSkew() },
	)
	// Snapshot the nine recomputers while still holding the lock; the
	// slice is what Start and the returned stop closure iterate over.
	all := []*analyticsRecomputer{
		s.recompTopology, s.recompRF, s.recompDistance,
		s.recompChannels, s.recompHashCollisions, s.recompHashSizes,
		s.recompRoles,
		s.recompObserversClockSkew, s.recompNodesClockSkew,
	}
	s.analyticsRecomputerMu.Unlock()

	// Issue #1659 (PR #1688 r1, munger #5): wire the chunked-loader
	// readiness gate on the three warmup-gated recomputers (RF,
	// Topology, Channels). markFirstPassDone_1659 will refuse to
	// flip first-pass-done until s.LoadComplete() reports true —
	// i.e. the cold-load has populated all observations. Otherwise
	// the FIRST recomputer pass runs against the post-restart in-RAM
	// slice and the gate opens on partial data (the original #1659
	// bug class).
	loadCompleteGate := s.LoadComplete
	s.recompRF.setWarmupReadyGate_1659(loadCompleteGate)
	s.recompTopology.setWarmupReadyGate_1659(loadCompleteGate)
	s.recompChannels.setWarmupReadyGate_1659(loadCompleteGate)

	// Start runs outside the mutex: each Start performs a synchronous
	// compute, and the compute funcs are free to take store locks.
	for _, rc := range all {
		rc.Start()
	}

	return func() {
		for _, rc := range all {
			rc.Stop()
		}
	}
}
|
||||
@@ -1,174 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// numGoroutinesForTest returns the current goroutine count as reported by
// runtime.NumGoroutine.
func numGoroutinesForTest() int { return runtime.NumGoroutine() }
|
||||
|
||||
// TestAnalyticsRecomputerSteadyStateLatency asserts that issue #1240's
// steady-state background recompute is in place: reads of the common
// analytics endpoints (region="") return from cache in <50ms p99 even
// under simulated ingest load. The endpoint measured here is
// GetAnalyticsDistance("", "").
//
// On master (pre-fix), GetAnalyticsTopology holds s.mu.RLock for the
// entire compute. Concurrent ingest writers (s.mu.Lock) starve readers
// or vice versa, producing per-read latencies in the hundreds of
// milliseconds. The cache TTL doesn't help: after every expiry one
// reader still pays the full compute cost.
//
// Post-fix, a default-shape read (region="" and zero window) must
// Load() from the background-refreshed atomic snapshot — never blocking
// under writer contention.
//
// Skipped under -short because it is timing-sensitive.
func TestAnalyticsRecomputerSteadyStateLatency(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping latency timing test in -short mode")
	}

	db := setupTestDB(t)
	defer db.Close()
	store := NewPacketStore(db, nil)

	// Populate with enough records to make on-request compute non-trivial.
	const N = 20000
	hops := make([]distHopRecord, N)
	for i := 0; i < N; i++ {
		hops[i] = distHopRecord{
			FromName: "A", FromPk: "aa",
			ToName: "B", ToPk: "bb",
			Dist:       float64(i%500) + 0.5,
			Type:       []string{"R↔R", "C↔R", "C↔C"}[i%3],
			Hash:       "h",
			Timestamp:  "2024-01-01T00:00:00Z",
			HourBucket: "2024-01-01-00",
		}
	}
	store.mu.Lock()
	store.distHops = hops
	store.mu.Unlock()

	// Start the recomputer infrastructure with a 10ms tick so background
	// recomputes keep running for the duration of the test. (Historical
	// note from the RED/GREEN commits: against a stub that does not wire
	// background recompute, the latency assertion below is what fails.)
	stop := store.StartAnalyticsRecomputers(10 * time.Millisecond)
	defer stop()

	// Give the initial compute a moment to populate.
	time.Sleep(50 * time.Millisecond)

	// Simulated writer: contend for s.mu.Lock. This is what makes the
	// non-recomputer path miss the latency target — the old
	// GetAnalyticsTopology grabs s.mu.RLock for the entire compute and
	// blocks behind every writer cycle.
	var stopWriters atomic.Bool
	var writerWg sync.WaitGroup
	const Writers = 4
	writerWg.Add(Writers)
	for w := 0; w < Writers; w++ {
		go func() {
			defer writerWg.Done()
			for !stopWriters.Load() {
				store.mu.Lock()
				// Trivial mutation: extend distHops by one and shrink back.
				store.distHops = append(store.distHops, distHopRecord{
					Dist: 1, Hash: "x", Timestamp: "2024-01-01T00:00:00Z",
				})
				store.distHops = store.distHops[:len(store.distHops)-1]
				store.mu.Unlock()
				// Brief pause to keep the lock-cycle rate realistic.
				time.Sleep(100 * time.Microsecond)
			}
		}()
	}

	// 100 concurrent reads, each timing a single call. Every reader
	// writes only its own latencies[i] slot, so no lock is needed.
	const Readers = 100
	latencies := make([]time.Duration, Readers)
	var rwg sync.WaitGroup
	rwg.Add(Readers)
	for i := 0; i < Readers; i++ {
		i := i
		go func() {
			defer rwg.Done()
			t0 := time.Now()
			r := store.GetAnalyticsDistance("", "")
			latencies[i] = time.Since(t0)
			if r == nil {
				t.Errorf("reader %d got nil result", i)
			}
		}()
	}
	rwg.Wait()
	stopWriters.Store(true)
	writerWg.Wait()

	// Sort ascending so percentiles can be read off by index.
	sort.Slice(latencies, func(i, j int) bool { return latencies[i] < latencies[j] })
	p50 := latencies[Readers/2]
	p99 := latencies[(Readers*99)/100]

	t.Logf("analytics distance read latency: p50=%v p99=%v max=%v",
		p50, p99, latencies[Readers-1])

	// p99 budget: 50ms. Atomic-pointer load + JSON-shape map return
	// should be sub-millisecond; 50ms leaves margin for goroutine
	// scheduling jitter under concurrent test runs.
	const budget = 50 * time.Millisecond
	if p99 > budget {
		t.Fatalf("p99 read latency %v exceeds %v budget (issue #1240 not in effect)", p99, budget)
	}
}
|
||||
|
||||
// TestAnalyticsRecomputerShutdownNoLeak asserts the background
// goroutines started by StartAnalyticsRecomputers exit cleanly when
// the returned stop function is called — no leak across server
// shutdown (issue #1240 acceptance criterion).
//
// The check is deliberately loose: it only requires the goroutine count
// after stop to be strictly lower than the count taken while the
// recomputers were running.
func TestAnalyticsRecomputerShutdownNoLeak(t *testing.T) {
	db := setupTestDB(t)
	defer db.Close()
	store := NewPacketStore(db, nil)

	// Use a tight tick so we know recompute is actually running (not
	// just blocked on the ticker).
	stop := store.StartAnalyticsRecomputers(20 * time.Millisecond)

	// Snapshot active goroutines a beat after start.
	time.Sleep(80 * time.Millisecond)
	startGoroutines := runtimeNumGoroutine()

	stop()

	// After stop returns, give the scheduler a beat to reap exits.
	// Poll for up to 2s and leave early once the count has clearly
	// dropped.
	deadline := time.Now().Add(2 * time.Second)
	var endGoroutines int
	for time.Now().Before(deadline) {
		endGoroutines = runtimeNumGoroutine()
		// Early-exit threshold only: StartAnalyticsRecomputers registers
		// nine recomputers (one goroutine each); a drop of five or more
		// is taken as sufficient evidence that they are exiting.
		if endGoroutines <= startGoroutines-5 {
			break
		}
		time.Sleep(20 * time.Millisecond)
	}

	// We expect roughly nine fewer goroutines than the snapshot taken
	// DURING recompute (one per registered recomputer), but only assert
	// "fewer" since test runners can have flaky goroutine counts.
	if endGoroutines >= startGoroutines {
		t.Fatalf("goroutine leak after stop: %d → %d (expected fewer)",
			startGoroutines, endGoroutines)
	}
	t.Logf("goroutines: during=%d after=%d (Δ=%d)",
		startGoroutines, endGoroutines, startGoroutines-endGoroutines)
}
|
||||
|
||||
// runtimeNumGoroutine returns the current goroutine count. It simply
// forwards to numGoroutinesForTest, which wraps runtime.NumGoroutine.
func runtimeNumGoroutine() int {
	// Thin indirection; the runtime import lives with numGoroutinesForTest.
	return numGoroutinesForTest()
}
|
||||
@@ -1,212 +0,0 @@
|
||||
// Package main: issue #1659 — analytics warmup gating.
|
||||
//
|
||||
// Problem: after server restart, recompRF (and recompTopology /
|
||||
// recompChannels) cache the FIRST computation, which immediately after
|
||||
// boot is just the small in-RAM-observations slice (background
|
||||
// chunk-loader has not yet backfilled history). The recomputer then
|
||||
// serves that small slice from GetAnalyticsRFWithWindow's default
|
||||
// shortcut for an entire recompute interval, while the client pins it
|
||||
// via CLIENT_TTL.analyticsRF. UX: cards show a tiny "post-restart"
|
||||
// window even when the user selects "All data".
|
||||
//
|
||||
// Fix (r1 — addresses #1688 review munger #5):
|
||||
//
|
||||
// The first-pass-done signal is NOT enough on its own — the FIRST
|
||||
// recomputer pass at boot can complete against the post-restart slice
|
||||
// BEFORE the chunked loader (#1008 / chunked_load.go) has populated
|
||||
// the full observation set. Marking the gate ready in that window
|
||||
// reproduces the original #1659 bug.
|
||||
//
|
||||
// Two correctness invariants:
|
||||
//
|
||||
// 1. (#1688 munger #5) Only mark first-pass-done when BOTH:
|
||||
// a. a recomputer pass has completed, AND
|
||||
// b. the chunked loader has finished (s.LoadComplete()).
|
||||
// The gate's `readyGate` callback is wired by
|
||||
// StartAnalyticsRecomputers to `store.LoadComplete`. Passes that
|
||||
// complete while loadComplete is still false leave the gate in
|
||||
// the warming-up state; the NEXT pass after loadComplete flips
|
||||
// true is the one that opens the gate.
|
||||
//
|
||||
// 2. (#1688 munger #2 + kent-beck #2) The gate MUST lift in bounded
|
||||
// time. If compute() panics on every pass, hangs indefinitely,
|
||||
// or returns nil forever, an unguarded gate would leave the
|
||||
// 503 banner permanent. Two safeguards:
|
||||
// a. compute() panics are already caught by runOnce()'s
|
||||
// defer recover(); we additionally call markFirstPassDone
|
||||
// on EVERY pass (even nil-result), so a recomputer that
|
||||
// returns nil but doesn't panic still flips the gate.
|
||||
// b. A hard fallback timeout (warmupForceTimeout, 60s by
|
||||
// default) elapsed since the recomputer was started (Start())
|
||||
// forces IsWarmingUp_1659() to false — degraded mode
|
||||
// (serve whatever cache exists, possibly empty) is
|
||||
// strictly better than a permanent 503.
|
||||
//
|
||||
// Concurrency (#1688 munger #3):
|
||||
//
|
||||
// The previous r0 design used a package-level map keyed by recomputer
|
||||
// pointer, guarded by a global chanLock. Every default-shape analytics
|
||||
// request acquired that lock — a serialization point on a hot path.
|
||||
//
|
||||
// r1 inlines the warmup fields directly on `analyticsRecomputer`:
|
||||
// - firstPassDoneNs atomic.Int64
|
||||
// - warmupStartedNs atomic.Int64
|
||||
// - warmupReadyGate atomic.Value (holds *func() bool; nil pointer = no gate)
|
||||
//
|
||||
// Reads on the hot path are lock-free atomic loads. No package-level
|
||||
// state, no map lookups, no mutex.
|
||||
//
|
||||
// Tests: analytics_warmup_1659_test.go.
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// warmupForceTimeout is the deadline after which IsWarmingUp_1659()
|
||||
// flips false regardless of whether a successful first pass has run.
|
||||
// Operators get degraded analytics (possibly empty until the next
|
||||
// successful compute) instead of a permanent 503 banner.
|
||||
//
|
||||
// Var (not const) so tests can shorten it.
|
||||
var warmupForceTimeout = 60 * time.Second
|
||||
|
||||
// setWarmupReadyGate wires a callback that the recomputer consults
|
||||
// before honoring a markFirstPassDone_1659() request. When the gate
|
||||
// returns false, the warmup state is preserved across the pass —
|
||||
// equivalent to "this pass doesn't count; we need at least one pass
|
||||
// AFTER the gate flips true".
|
||||
//
|
||||
// nil callback means "no extra gating" (legacy behavior).
|
||||
//
|
||||
// Called from StartAnalyticsRecomputers; safe to call before Start().
|
||||
func (r *analyticsRecomputer) setWarmupReadyGate_1659(gate func() bool) {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
if gate == nil {
|
||||
r.warmupReadyGate.Store((*func() bool)(nil))
|
||||
return
|
||||
}
|
||||
r.warmupReadyGate.Store(&gate)
|
||||
}
|
||||
|
||||
func (r *analyticsRecomputer) loadWarmupReadyGate_1659() func() bool {
|
||||
v := r.warmupReadyGate.Load()
|
||||
if v == nil {
|
||||
return nil
|
||||
}
|
||||
p, ok := v.(*func() bool)
|
||||
if !ok || p == nil {
|
||||
return nil
|
||||
}
|
||||
return *p
|
||||
}
|
||||
|
||||
// markFirstPassDone_1659 is called from analyticsRecomputer.runOnce()
|
||||
// after every compute attempt (success OR nil result; panics are
|
||||
// caught upstream and never reach here).
|
||||
//
|
||||
// The gate flip is conditional on the readyGate (when set) reporting
|
||||
// true — this implements the munger #5 fix: first-pass-done must
|
||||
// require BOTH a recomputer pass complete AND the chunked loader to
|
||||
// have finished populating the in-RAM observation set.
|
||||
//
|
||||
// Idempotent: only the FIRST successful flip wins; subsequent calls
|
||||
// observe a non-zero firstPassDoneNs and return immediately.
|
||||
func (r *analyticsRecomputer) markFirstPassDone_1659() {
|
||||
if r.firstPassDoneNs.Load() != 0 {
|
||||
return
|
||||
}
|
||||
if gate := r.loadWarmupReadyGate_1659(); gate != nil && !gate() {
|
||||
return
|
||||
}
|
||||
r.firstPassDoneNs.CompareAndSwap(0, time.Now().UnixNano())
|
||||
}
|
||||
|
||||
// FirstPassDoneAt_1659 reports the time the first full compute pass
|
||||
// completed (subject to the readyGate). Returns zero time if no
|
||||
// qualifying pass has completed yet.
|
||||
func (r *analyticsRecomputer) FirstPassDoneAt_1659() time.Time {
|
||||
if r == nil {
|
||||
return time.Time{}
|
||||
}
|
||||
ns := r.firstPassDoneNs.Load()
|
||||
if ns == 0 {
|
||||
return time.Time{}
|
||||
}
|
||||
return time.Unix(0, ns)
|
||||
}
|
||||
|
||||
// IsWarmingUp_1659 reports true when the recomputer has not yet
|
||||
// completed a qualifying first pass AND the fallback timeout has not
|
||||
// yet elapsed. Handlers for the default-shape request must return
|
||||
// 503 + Retry-After: 5 while this is true.
|
||||
//
|
||||
// Fallback timeout (warmupForceTimeout) prevents a permanent 503 in
|
||||
// pathological compute paths (perpetual panic, perpetual nil, hang).
|
||||
//
|
||||
// Lock-free: pure atomic loads.
|
||||
func (r *analyticsRecomputer) IsWarmingUp_1659() bool {
|
||||
if r == nil {
|
||||
// No recomputer registered → treat as ready; the handler
|
||||
// falls through to the legacy compute path.
|
||||
return false
|
||||
}
|
||||
if r.firstPassDoneNs.Load() != 0 {
|
||||
return false
|
||||
}
|
||||
startedNs := r.warmupStartedNs.Load()
|
||||
if startedNs != 0 {
|
||||
if time.Since(time.Unix(0, startedNs)) >= warmupForceTimeout {
|
||||
// Forced-ready: gate has been stuck too long. Stop
|
||||
// serving 503; let the handler serve whatever is in
|
||||
// the cache (possibly empty).
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// noteWarmupStart_1659 records the moment the recomputer was launched
|
||||
// (called once from Start). Used by IsWarmingUp_1659 to compute the
|
||||
// fallback-timeout elapsed window.
|
||||
func (r *analyticsRecomputer) noteWarmupStart_1659() {
|
||||
if r == nil {
|
||||
return
|
||||
}
|
||||
r.warmupStartedNs.CompareAndSwap(0, time.Now().UnixNano())
|
||||
}
|
||||
|
||||
// writeAnalyticsWarmup503 writes the standard "still warming up" reply:
// HTTP 503 with Retry-After: 5 and a JSON body carrying an error string and
// retry_after_s. A failure to write the body is deliberately ignored.
func writeAnalyticsWarmup503(w http.ResponseWriter) {
	const payload = `{"error":"analytics warming up","retry_after_s":5}`

	// Headers must be set before WriteHeader for them to be sent.
	hdr := w.Header()
	hdr.Set("Retry-After", "5")
	hdr.Set("Content-Type", "application/json")

	w.WriteHeader(http.StatusServiceUnavailable)
	_, _ = w.Write([]byte(payload))
}
|
||||
|
||||
// installWarmupBlocker_1659 is a test-only helper that registers the
|
||||
// RF / topology / channels recomputers with a compute function that
|
||||
// blocks on the supplied channel. firstPassDoneNs therefore stays
|
||||
// zero, simulating the post-restart warmup window for the warmup test.
|
||||
//
|
||||
// We bypass StartAnalyticsRecomputers entirely and wire the
|
||||
// recomputers manually so the background goroutines never fire. The
|
||||
// test only needs the *analyticsRecomputer pointers to be non-nil and
|
||||
// in the warmup state.
|
||||
func (s *PacketStore) installWarmupBlocker_1659(block <-chan struct{}) {
|
||||
blockCompute := func() interface{} {
|
||||
<-block
|
||||
return nil
|
||||
}
|
||||
s.analyticsRecomputerMu.Lock()
|
||||
defer s.analyticsRecomputerMu.Unlock()
|
||||
s.recompRF = newAnalyticsRecomputer("rf-test-block", time.Hour, blockCompute)
|
||||
s.recompTopology = newAnalyticsRecomputer("topo-test-block", time.Hour, blockCompute)
|
||||
s.recompChannels = newAnalyticsRecomputer("chan-test-block", time.Hour, blockCompute)
|
||||
// Do NOT call Start() — leaving firstPassDoneNs at zero is exactly
|
||||
// the warmup state the test wants to exercise.
|
||||
}
|
||||
@@ -1,330 +0,0 @@
|
||||
// Package main: issue #1659 — analytics warmup gating.
|
||||
//
|
||||
// After a server restart, the analytics recomputer caches the FIRST
|
||||
// computation (a small in-RAM slice) and serves it via the default
|
||||
// region="", zero-window shortcut in GetAnalyticsRFWithWindow until the
|
||||
// next periodic recompute fires. The client-side CLIENT_TTL.analyticsRF
|
||||
// then pins that small slice on the page even after the server flips
|
||||
// to steady-state.
|
||||
//
|
||||
// Fix: each recomputer carries a firstPassDoneAt timestamp set ONLY
|
||||
// after a full-range compute completes. While firstPassDoneAt is zero
|
||||
// AND the request is the default-shape (region="" && area="" &&
|
||||
// window.IsZero()), the handler returns 503 + Retry-After: 5 with a
|
||||
// JSON body the client recognizes and retries with backoff.
|
||||
//
|
||||
// These tests are the RED contract: they must FAIL on the assertion
|
||||
// (not a build error) when the warmup gate is absent, and PASS once
|
||||
// the fix lands.
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// TestAnalyticsRF_WarmupReturns503 asserts that immediately after the
|
||||
// server starts — before any analytics recomputer has finished its
|
||||
// first full-range pass — GET /api/analytics/rf returns 503 with
|
||||
// Retry-After: 5 and a JSON body shaped as
|
||||
// {"error":"analytics warming up","retry_after_s":5}.
|
||||
//
|
||||
// This is the core acceptance criterion (c) from #1659.
|
||||
func TestAnalyticsRF_WarmupReturns503(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
// Register recomputers but DO NOT let them complete a first pass.
|
||||
// We install a compute func that blocks until we release it, so the
|
||||
// recomputer's firstPassDoneAt stays zero.
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
store.installWarmupBlocker_1659(block) // helper added in GREEN
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/rf", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("expected 503 during warmup, got %d (body=%s)", w.Code, w.Body.String())
|
||||
}
|
||||
if got := w.Header().Get("Retry-After"); got != "5" {
|
||||
t.Fatalf("expected Retry-After: 5, got %q", got)
|
||||
}
|
||||
var resp map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("invalid JSON body: %v (raw=%s)", err, w.Body.String())
|
||||
}
|
||||
if resp["error"] != "analytics warming up" {
|
||||
t.Fatalf("expected error='analytics warming up', got %v", resp["error"])
|
||||
}
|
||||
if v, ok := resp["retry_after_s"].(float64); !ok || v != 5 {
|
||||
t.Fatalf("expected retry_after_s=5, got %v", resp["retry_after_s"])
|
||||
}
|
||||
}
|
||||
|
||||
// TestAnalyticsRF_AfterFirstPassReturns200 asserts the post-warmup
|
||||
// happy path: once the recomputer's first full-range compute completes,
|
||||
// the handler serves the cached snapshot as 200.
|
||||
func TestAnalyticsRF_AfterFirstPassReturns200(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
// #1688 r1: the warmup gate now ALSO requires LoadComplete() to be
|
||||
// true before first-pass-done flips (munger #5). Tests that don't
|
||||
// exercise the chunked loader must flip it manually to model a
|
||||
// production server that has finished cold-loading.
|
||||
store.loadComplete.Store(true)
|
||||
|
||||
stop := store.StartAnalyticsRecomputers(50 * time.Millisecond)
|
||||
defer stop()
|
||||
|
||||
// Wait for the synchronous first-pass to complete. Start() runs
|
||||
// the initial compute synchronously, so by the time it returns
|
||||
// firstPassDoneAt should be set. We poll a brief moment to keep
|
||||
// the test robust to scheduling.
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if store.recompRF != nil && !store.recompRF.FirstPassDoneAt_1659().IsZero() {
|
||||
break
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
if store.recompRF == nil || store.recompRF.FirstPassDoneAt_1659().IsZero() {
|
||||
t.Fatal("recompRF.firstPassDoneAt never flipped after Start()")
|
||||
}
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/rf", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200 after first pass, got %d (body=%s)", w.Code, w.Body.String())
|
||||
}
|
||||
if got := w.Header().Get("Retry-After"); got != "" {
|
||||
t.Fatalf("expected no Retry-After header on 200, got %q", got)
|
||||
}
|
||||
// Body should be a valid JSON object (the RF analytics map).
|
||||
var resp map[string]interface{}
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("invalid JSON body: %v", err)
|
||||
}
|
||||
if len(resp) == 0 {
|
||||
t.Fatal("expected non-empty RF analytics response after first pass")
|
||||
}
|
||||
}
|
||||
|
||||
// TestAnalyticsRF_WindowedRequestNotGated asserts that even during
|
||||
// warmup, a request with an explicit time window (?since=/?until=) or
|
||||
// region/area filter is NOT gated by the warmup flag — those queries
|
||||
// bypass the recomputer entirely and hit the legacy compute-then-cache
|
||||
// path, which is unaffected by the first-pass bug.
|
||||
func TestAnalyticsRF_WindowedRequestNotGated(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
store.installWarmupBlocker_1659(block)
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
// Explicit window — should bypass warmup gate.
|
||||
req := httptest.NewRequest("GET", "/api/analytics/rf?window=1h", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code == http.StatusServiceUnavailable {
|
||||
t.Fatalf("windowed request must NOT be gated by warmup (got 503)")
|
||||
}
|
||||
}
|
||||
|
||||
// === PR #1688 r1 — new test cases ===
|
||||
|
||||
// TestAnalyticsTopology_WarmupReturns503 — kent-beck #1: topology
|
||||
// gate is symmetric with RF; assert the same 503 contract.
|
||||
func TestAnalyticsTopology_WarmupReturns503(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
store.installWarmupBlocker_1659(block)
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/topology", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("topology: expected 503 during warmup, got %d", w.Code)
|
||||
}
|
||||
if got := w.Header().Get("Retry-After"); got != "5" {
|
||||
t.Fatalf("topology: expected Retry-After: 5, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAnalyticsChannels_WarmupReturns503 — kent-beck #1: channels
|
||||
// gate is symmetric with RF; assert the same 503 contract.
|
||||
func TestAnalyticsChannels_WarmupReturns503(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
store.installWarmupBlocker_1659(block)
|
||||
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/analytics/channels", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusServiceUnavailable {
|
||||
t.Fatalf("channels: expected 503 during warmup, got %d", w.Code)
|
||||
}
|
||||
if got := w.Header().Get("Retry-After"); got != "5" {
|
||||
t.Fatalf("channels: expected Retry-After: 5, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarmup_GateBlockedUntilLoadComplete — munger #5 correctness:
|
||||
// the chunked loader readiness MUST gate first-pass-done. A recomputer
|
||||
// pass that completes while LoadComplete() is false must NOT lift the
|
||||
// gate; a SUBSEQUENT pass after LoadComplete() flips true must lift it.
|
||||
func TestWarmup_GateBlockedUntilLoadComplete(t *testing.T) {
|
||||
db := setupTestDB(t)
|
||||
defer db.Close()
|
||||
store := NewPacketStore(db, nil)
|
||||
// LoadComplete starts false — chunked loader still running.
|
||||
|
||||
called := make(chan struct{}, 16)
|
||||
rc := newAnalyticsRecomputer("test-rf", time.Hour, func() interface{} {
|
||||
called <- struct{}{}
|
||||
return map[string]int{"x": 1}
|
||||
})
|
||||
rc.setWarmupReadyGate_1659(store.LoadComplete)
|
||||
rc.Start()
|
||||
defer rc.Stop()
|
||||
|
||||
// First pass already ran synchronously in Start(). Gate must still
|
||||
// be warming up because LoadComplete() is false.
|
||||
<-called
|
||||
if !rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected IsWarmingUp_1659=true while LoadComplete()=false (munger #5 bug)")
|
||||
}
|
||||
if !rc.FirstPassDoneAt_1659().IsZero() {
|
||||
t.Fatalf("expected FirstPassDoneAt zero while LoadComplete()=false")
|
||||
}
|
||||
|
||||
// Now flip the loader and trigger another pass.
|
||||
store.loadComplete.Store(true)
|
||||
rc.runOnce()
|
||||
if rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected gate to lift after LoadComplete()=true + another pass")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarmup_NilResultStillLiftsGate — munger #2 / kent-beck #2:
|
||||
// a compute that returns nil but doesn't panic must still flip the
|
||||
// gate (the cache stays empty but the banner does NOT get stuck).
|
||||
func TestWarmup_NilResultStillLiftsGate(t *testing.T) {
|
||||
rc := newAnalyticsRecomputer("test-nil", time.Hour, func() interface{} {
|
||||
return nil
|
||||
})
|
||||
rc.Start()
|
||||
defer rc.Stop()
|
||||
|
||||
if rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("nil-result compute must still lift warmup gate after first pass")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarmup_PanicEventuallyLiftsGate — munger #2 / kent-beck #2:
|
||||
// a compute that ALWAYS panics must not leave the gate stuck forever.
|
||||
// The fallback timeout (warmupForceTimeout) is the safety net.
|
||||
func TestWarmup_PanicEventuallyLiftsGate(t *testing.T) {
|
||||
prev := warmupForceTimeout
|
||||
warmupForceTimeout = 50 * time.Millisecond
|
||||
defer func() { warmupForceTimeout = prev }()
|
||||
|
||||
rc := newAnalyticsRecomputer("test-panic", time.Hour, func() interface{} {
|
||||
panic("compute boom")
|
||||
})
|
||||
rc.Start()
|
||||
defer rc.Stop()
|
||||
|
||||
// Panic was recovered inside runOnce; firstPassDoneNs is still 0.
|
||||
if rc.FirstPassDoneAt_1659().IsZero() == false {
|
||||
t.Fatalf("panicking compute should not have set firstPassDoneNs")
|
||||
}
|
||||
// But after warmupForceTimeout elapses, the gate must lift.
|
||||
time.Sleep(80 * time.Millisecond)
|
||||
if rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected fallback timeout to lift gate after warmupForceTimeout (got still-warming)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestWarmup_TimeoutLiftsHangingCompute — munger #2 / kent-beck #2:
|
||||
// hung compute (blocks indefinitely on a channel) must not result in
|
||||
// permanent 503. Fallback timeout lifts it.
|
||||
func TestWarmup_TimeoutLiftsHangingCompute(t *testing.T) {
|
||||
prev := warmupForceTimeout
|
||||
warmupForceTimeout = 50 * time.Millisecond
|
||||
defer func() { warmupForceTimeout = prev }()
|
||||
|
||||
block := make(chan struct{})
|
||||
defer close(block)
|
||||
rc := newAnalyticsRecomputer("test-hang", time.Hour, func() interface{} {
|
||||
<-block
|
||||
return nil
|
||||
})
|
||||
// Don't call Start (would block forever on synchronous initial
|
||||
// compute). Just simulate "we noted warmup start, compute is
|
||||
// hanging in another goroutine".
|
||||
rc.noteWarmupStart_1659()
|
||||
go rc.runOnce()
|
||||
|
||||
if !rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected initial state to be warming-up")
|
||||
}
|
||||
time.Sleep(80 * time.Millisecond)
|
||||
if rc.IsWarmingUp_1659() {
|
||||
t.Fatalf("expected fallback timeout to lift hung-compute warmup")
|
||||
}
|
||||
}
|
||||
@@ -1,400 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
func mustExecDB(t *testing.T, db *DB, q string) {
|
||||
t.Helper()
|
||||
if _, err := db.conn.Exec(q); err != nil {
|
||||
t.Fatalf("exec %q: %v", q, err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAreaEntryParsing(t *testing.T) {
|
||||
raw := `{
|
||||
"port": 3000,
|
||||
"areas": {
|
||||
"BEL": {
|
||||
"label": "Belgium",
|
||||
"polygon": [[50.0, 2.5], [51.5, 2.5], [51.5, 6.4], [50.0, 6.4]]
|
||||
},
|
||||
"BOX": {
|
||||
"label": "Bounding Box Area",
|
||||
"latMin": 50.0, "latMax": 51.5, "lonMin": 2.5, "lonMax": 6.4
|
||||
}
|
||||
}
|
||||
}`
|
||||
var cfg Config
|
||||
if err := json.Unmarshal([]byte(raw), &cfg); err != nil {
|
||||
t.Fatalf("unmarshal: %v", err)
|
||||
}
|
||||
if len(cfg.Areas) != 2 {
|
||||
t.Fatalf("want 2 areas, got %d", len(cfg.Areas))
|
||||
}
|
||||
bel := cfg.Areas["BEL"]
|
||||
if bel.Label != "Belgium" {
|
||||
t.Errorf("label: want Belgium, got %q", bel.Label)
|
||||
}
|
||||
if len(bel.Polygon) != 4 {
|
||||
t.Errorf("polygon: want 4 points, got %d", len(bel.Polygon))
|
||||
}
|
||||
box := cfg.Areas["BOX"]
|
||||
if box.LatMin == nil || *box.LatMin != 50.0 {
|
||||
t.Error("LatMin not parsed")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetNodePubkeysInArea_Polygon(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('pk-inside', 50.85, 4.35)`)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('pk-outside', 48.0, 4.35)`)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('pk-nogps', NULL, NULL)`)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('pk-zero', 0.0, 0.0)`)
|
||||
|
||||
entry := AreaEntry{
|
||||
Label: "Belgium",
|
||||
Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}},
|
||||
}
|
||||
pks, err := db.GetNodePubkeysInArea(entry)
|
||||
if err != nil {
|
||||
t.Fatalf("GetNodePubkeysInArea: %v", err)
|
||||
}
|
||||
if len(pks) != 1 || pks[0] != "pk-inside" {
|
||||
t.Errorf("want [pk-inside], got %v", pks)
|
||||
}
|
||||
}
|
||||
|
||||
// newTestStoreWithDB builds a minimal PacketStore wired to the given DB and config.
|
||||
func newTestStoreWithDB(t *testing.T, db *DB, cfg *Config) *PacketStore {
|
||||
t.Helper()
|
||||
return &PacketStore{
|
||||
db: db,
|
||||
config: cfg,
|
||||
byNode: make(map[string][]*StoreTx),
|
||||
byTxID: make(map[int]*StoreTx),
|
||||
byObsID: make(map[int]*StoreObs),
|
||||
byObserver: make(map[string][]*StoreObs),
|
||||
byHash: make(map[string]*StoreTx),
|
||||
byPayloadType: make(map[int][]*StoreTx),
|
||||
nodeHashes: make(map[string]map[string]bool),
|
||||
byPathHop: make(map[string][]*StoreTx),
|
||||
advertPubkeys: make(map[string]int),
|
||||
rfCache: make(map[string]*cachedResult),
|
||||
topoCache: make(map[string]*cachedResult),
|
||||
hashCache: make(map[string]*cachedResult),
|
||||
collisionCache: make(map[string]*cachedResult),
|
||||
chanCache: make(map[string]*cachedResult),
|
||||
distCache: make(map[string]*cachedResult),
|
||||
subpathCache: make(map[string]*cachedResult),
|
||||
regionObsCache: make(map[string]map[string]bool),
|
||||
areaNodeCache: make(map[string]map[string]bool),
|
||||
areaNodeCacheTimes: make(map[string]time.Time),
|
||||
rfCacheTTL: 15 * time.Second,
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveAreaNodes_UnknownKey(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
cfg := &Config{Areas: map[string]AreaEntry{
|
||||
"BEL": {Label: "Belgium", Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}}},
|
||||
}}
|
||||
s := newTestStoreWithDB(t, db, cfg)
|
||||
result := s.resolveAreaNodes("UNKNOWN")
|
||||
if result != nil {
|
||||
t.Errorf("want nil for unknown area, got %v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveAreaNodes_CacheHit(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('pk1', 50.85, 4.35)`)
|
||||
|
||||
cfg := &Config{Areas: map[string]AreaEntry{
|
||||
"BEL": {Label: "Belgium", Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}}},
|
||||
}}
|
||||
s := newTestStoreWithDB(t, db, cfg)
|
||||
|
||||
r1 := s.resolveAreaNodes("BEL")
|
||||
if !r1["pk1"] {
|
||||
t.Fatal("pk1 should be in area BEL on first call")
|
||||
}
|
||||
|
||||
// Delete node so a live DB query would return nothing — second call must use cache.
|
||||
mustExecDB(t, db, `DELETE FROM nodes WHERE public_key = 'pk1'`)
|
||||
|
||||
r2 := s.resolveAreaNodes("BEL")
|
||||
if !r2["pk1"] {
|
||||
t.Fatal("cache hit should still return pk1 after DB delete")
|
||||
}
|
||||
}
|
||||
|
||||
// ingestAdvert adds a synthetic ADVERT packet to the store's in-memory packet list.
|
||||
func ingestAdvert(t *testing.T, s *PacketStore, hash, decodedJSON string) {
|
||||
t.Helper()
|
||||
pt := PayloadADVERT
|
||||
tx := &StoreTx{
|
||||
Hash: hash,
|
||||
FirstSeen: "2026-01-01T00:00:00Z",
|
||||
PayloadType: &pt,
|
||||
DecodedJSON: decodedJSON,
|
||||
}
|
||||
s.mu.Lock()
|
||||
s.packets = append(s.packets, tx)
|
||||
s.byHash[hash] = tx
|
||||
s.byPayloadType[PayloadADVERT] = append(s.byPayloadType[PayloadADVERT], tx)
|
||||
s.mu.Unlock()
|
||||
}
|
||||
|
||||
func TestFilterPacketsByArea(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('inside-node', 50.85, 4.35)`)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('outside-node', 48.0, 4.35)`)
|
||||
|
||||
cfg := &Config{Areas: map[string]AreaEntry{
|
||||
"BEL": {Label: "Belgium", Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}}},
|
||||
}}
|
||||
s := newTestStoreWithDB(t, db, cfg)
|
||||
|
||||
ingestAdvert(t, s, "hash-in", `{"public_key":"inside-node","name":"Inside"}`)
|
||||
ingestAdvert(t, s, "hash-out", `{"public_key":"outside-node","name":"Outside"}`)
|
||||
|
||||
result := s.QueryPackets(PacketQuery{Limit: 50, Area: "BEL"})
|
||||
if result.Total != 1 {
|
||||
t.Fatalf("want 1 packet in area BEL, got %d (packets: %v)", result.Total, result.Packets)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAnalyticsRFAreaFilter(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('inside-node', 50.85, 4.35)`)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('outside-node', 48.0, 4.35)`)
|
||||
|
||||
cfg := &Config{Areas: map[string]AreaEntry{
|
||||
"BEL": {Label: "Belgium", Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}}},
|
||||
}}
|
||||
s := newTestStoreWithDB(t, db, cfg)
|
||||
|
||||
ingestAdvert(t, s, "hash-rf-in", `{"public_key":"inside-node","name":"Inside"}`)
|
||||
ingestAdvert(t, s, "hash-rf-out", `{"public_key":"outside-node","name":"Outside"}`)
|
||||
|
||||
result := s.GetAnalyticsRF("", "BEL")
|
||||
if result == nil {
|
||||
t.Fatal("GetAnalyticsRF returned nil")
|
||||
}
|
||||
total, _ := result["totalTransmissions"].(int)
|
||||
if total != 1 {
|
||||
t.Errorf("want totalTransmissions=1 for BEL, got %d", total)
|
||||
}
|
||||
}
|
||||
|
||||
// ingestChanMsg adds a synthetic GRP_TXT packet with the given sender pubkey and channel hash.
|
||||
func ingestChanMsg(t *testing.T, s *PacketStore, hash, senderPK string, chanHash int) {
|
||||
t.Helper()
|
||||
pt := PayloadGRP_TXT
|
||||
decodedJSON := fmt.Sprintf(`{"public_key":%q,"channelHash":%d}`, senderPK, chanHash)
|
||||
tx := &StoreTx{
|
||||
Hash: hash,
|
||||
FirstSeen: "2026-01-01T00:00:00Z",
|
||||
PayloadType: &pt,
|
||||
DecodedJSON: decodedJSON,
|
||||
}
|
||||
s.mu.Lock()
|
||||
s.packets = append(s.packets, tx)
|
||||
s.byHash[hash] = tx
|
||||
s.byPayloadType[PayloadGRP_TXT] = append(s.byPayloadType[PayloadGRP_TXT], tx)
|
||||
s.mu.Unlock()
|
||||
}
|
||||
|
||||
func TestAnalyticsChannelsAreaFilter(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('inside-node', 50.85, 4.35)`)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('outside-node', 48.0, 4.35)`)
|
||||
|
||||
cfg := &Config{Areas: map[string]AreaEntry{
|
||||
"BEL": {Label: "Belgium", Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}}},
|
||||
}}
|
||||
s := newTestStoreWithDB(t, db, cfg)
|
||||
|
||||
// inside-node sends on channel hash 42, outside-node on channel hash 99.
|
||||
ingestChanMsg(t, s, "ch-in", "inside-node", 42)
|
||||
ingestChanMsg(t, s, "ch-out", "outside-node", 99)
|
||||
|
||||
unfiltered := s.GetAnalyticsChannels("", "")
|
||||
filtered := s.GetAnalyticsChannels("", "BEL")
|
||||
if filtered == nil {
|
||||
t.Fatal("GetAnalyticsChannels returned nil")
|
||||
}
|
||||
unfilteredCount, _ := unfiltered["activeChannels"].(int)
|
||||
filteredCount, _ := filtered["activeChannels"].(int)
|
||||
if unfilteredCount != 2 {
|
||||
t.Errorf("want 2 active channels unfiltered, got %d", unfilteredCount)
|
||||
}
|
||||
if filteredCount != 1 {
|
||||
t.Errorf("want 1 active channel for BEL, got %d", filteredCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetNodePubkeysInArea_BoundingBox(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('in', 50.5, 5.0)`)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('out', 52.0, 5.0)`)
|
||||
|
||||
minLat, maxLat, minLon, maxLon := 50.0, 51.5, 2.5, 6.4
|
||||
entry := AreaEntry{LatMin: &minLat, LatMax: &maxLat, LonMin: &minLon, LonMax: &maxLon}
|
||||
pks, err := db.GetNodePubkeysInArea(entry)
|
||||
if err != nil {
|
||||
t.Fatalf("%v", err)
|
||||
}
|
||||
if len(pks) != 1 || pks[0] != "in" {
|
||||
t.Errorf("want [in], got %v", pks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleConfigAreas(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
cfg := &Config{Areas: map[string]AreaEntry{
|
||||
"BEL": {Label: "Belgium", Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}}},
|
||||
"MST": {Label: "Maastricht"},
|
||||
}}
|
||||
|
||||
r := mux.NewRouter()
|
||||
srv := &Server{db: db, cfg: cfg}
|
||||
r.HandleFunc("/api/config/areas", srv.handleConfigAreas).Methods("GET")
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/config/areas", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != 200 {
|
||||
t.Fatalf("want 200, got %d", w.Code)
|
||||
}
|
||||
var result []map[string]string
|
||||
if err := json.NewDecoder(w.Body).Decode(&result); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if len(result) != 2 {
|
||||
t.Fatalf("want 2 areas, got %d", len(result))
|
||||
}
|
||||
keys := map[string]bool{}
|
||||
for _, entry := range result {
|
||||
keys[entry["key"]] = true
|
||||
if entry["label"] == "" {
|
||||
t.Errorf("missing label for key %q", entry["key"])
|
||||
}
|
||||
}
|
||||
if !keys["BEL"] || !keys["MST"] {
|
||||
t.Errorf("expected BEL and MST, got %v", keys)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleConfigAreasEmpty(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
cfg := &Config{}
|
||||
|
||||
r := mux.NewRouter()
|
||||
srv := &Server{db: db, cfg: cfg}
|
||||
r.HandleFunc("/api/config/areas", srv.handleConfigAreas).Methods("GET")
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/config/areas", nil)
|
||||
w := httptest.NewRecorder()
|
||||
r.ServeHTTP(w, req)
|
||||
|
||||
var result []interface{}
|
||||
if err := json.NewDecoder(w.Body).Decode(&result); err != nil {
|
||||
t.Fatalf("decode: %v", err)
|
||||
}
|
||||
if len(result) != 0 {
|
||||
t.Errorf("want empty array, got %v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveAreaNodes_CalledBeforeRLock(t *testing.T) {
|
||||
// Verify resolveAreaNodes doesn't deadlock when called concurrently with writes.
|
||||
// This test catches the anti-pattern where resolveAreaNodes (which does a DB
|
||||
// query) is called while holding s.mu.RLock().
|
||||
db := setupTestDBv2(t)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('n1', 50.85, 4.35)`)
|
||||
|
||||
cfg := &Config{Areas: map[string]AreaEntry{
|
||||
"BEL": {Label: "Belgium", Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}}},
|
||||
}}
|
||||
s := newTestStoreWithDB(t, db, cfg)
|
||||
ingestAdvert(t, s, "h1", `{"public_key":"n1","name":"N1"}`)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < 5; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
s.GetBulkHealth(10, "", "BEL")
|
||||
}()
|
||||
}
|
||||
wg.Wait() // must not deadlock
|
||||
}
|
||||
|
||||
func TestResolveAreaNodes_PerKeyTTL(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('bel-node', 50.85, 4.35)`)
|
||||
mustExecDB(t, db, `INSERT INTO nodes (public_key, lat, lon) VALUES ('nl-node', 52.4, 4.9)`)
|
||||
|
||||
cfg := &Config{Areas: map[string]AreaEntry{
|
||||
"BEL": {Label: "Belgium", Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}}},
|
||||
"NL": {Label: "Netherlands", Polygon: [][2]float64{{51.5, 3.4}, {53.6, 3.4}, {53.6, 7.2}, {51.5, 7.2}}},
|
||||
}}
|
||||
s := newTestStoreWithDB(t, db, cfg)
|
||||
|
||||
// Populate both keys into cache.
|
||||
r1 := s.resolveAreaNodes("BEL")
|
||||
if !r1["bel-node"] {
|
||||
t.Fatal("bel-node should be in BEL")
|
||||
}
|
||||
r2 := s.resolveAreaNodes("NL")
|
||||
if !r2["nl-node"] {
|
||||
t.Fatal("nl-node should be in NL")
|
||||
}
|
||||
|
||||
// Delete both nodes from DB to prove cache still serves them.
|
||||
mustExecDB(t, db, `DELETE FROM nodes`)
|
||||
|
||||
// BEL cache should still be warm (not evicted by NL query).
|
||||
r3 := s.resolveAreaNodes("BEL")
|
||||
if !r3["bel-node"] {
|
||||
t.Error("BEL cache was evicted by NL query (global TTL bug)")
|
||||
}
|
||||
// NL cache should still be warm too.
|
||||
r4 := s.resolveAreaNodes("NL")
|
||||
if !r4["nl-node"] {
|
||||
t.Error("NL cache was evicted unexpectedly")
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetBulkHealth_AreaBypassesCap(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
|
||||
// Insert 510 nodes inside BEL — all at 50.85, 4.35.
|
||||
for i := 0; i < 510; i++ {
|
||||
mustExecDB(t, db, fmt.Sprintf(
|
||||
`INSERT INTO nodes (public_key, lat, lon) VALUES ('node-%d', 50.85, 4.35)`, i,
|
||||
))
|
||||
}
|
||||
|
||||
cfg := &Config{Areas: map[string]AreaEntry{
|
||||
"BEL": {Label: "Belgium", Polygon: [][2]float64{{50.0, 2.5}, {51.5, 2.5}, {51.5, 6.4}, {50.0, 6.4}}},
|
||||
}}
|
||||
s := newTestStoreWithDB(t, db, cfg)
|
||||
|
||||
// With limit=10 but area filter active, all 510 in-area nodes must be returned.
|
||||
result := s.GetBulkHealth(10, "", "BEL")
|
||||
if len(result) != 510 {
|
||||
t.Errorf("want 510 nodes from area BEL, got %d", len(result))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,132 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// TestBackfillAsyncChunked verifies that backfillResolvedPathsAsync processes
|
||||
// observations in chunks, yields between batches, and sets the completion flag.
|
||||
func TestBackfillAsyncChunked(t *testing.T) {
|
||||
store := &PacketStore{
|
||||
packets: make([]*StoreTx, 0),
|
||||
byHash: make(map[string]*StoreTx),
|
||||
byTxID: make(map[int]*StoreTx),
|
||||
byObsID: make(map[int]*StoreObs),
|
||||
}
|
||||
|
||||
// No pending observations → should complete immediately.
|
||||
backfillResolvedPathsAsync(store, "", 100, time.Millisecond, 24)
|
||||
if !store.backfillComplete.Load() {
|
||||
t.Fatal("expected backfillComplete to be true with empty store")
|
||||
}
|
||||
}
|
||||
|
||||
// TestBackfillStatusHeader verifies the X-CoreScope-Status header is set correctly.
|
||||
func TestBackfillStatusHeader(t *testing.T) {
|
||||
store := &PacketStore{
|
||||
packets: make([]*StoreTx, 0),
|
||||
byHash: make(map[string]*StoreTx),
|
||||
byTxID: make(map[int]*StoreTx),
|
||||
byObsID: make(map[int]*StoreObs),
|
||||
}
|
||||
|
||||
srv := &Server{store: store}
|
||||
|
||||
handler := srv.backfillStatusMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(200)
|
||||
}))
|
||||
|
||||
// Before backfill completes → backfilling
|
||||
req := httptest.NewRequest("GET", "/api/stats", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, req)
|
||||
if got := rec.Header().Get("X-CoreScope-Status"); got != "backfilling" {
|
||||
t.Fatalf("expected 'backfilling', got %q", got)
|
||||
}
|
||||
|
||||
// After backfill completes → ready
|
||||
store.backfillComplete.Store(true)
|
||||
rec = httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, req)
|
||||
if got := rec.Header().Get("X-CoreScope-Status"); got != "ready" {
|
||||
t.Fatalf("expected 'ready', got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestStatsBackfillFields verifies /api/stats includes backfill fields.
|
||||
func TestStatsBackfillFields(t *testing.T) {
|
||||
db := setupTestDBv2(t)
|
||||
defer db.Close()
|
||||
seedV2Data(t, db)
|
||||
|
||||
store := &PacketStore{
|
||||
db: db,
|
||||
packets: make([]*StoreTx, 0),
|
||||
byHash: make(map[string]*StoreTx),
|
||||
byTxID: make(map[int]*StoreTx),
|
||||
byObsID: make(map[int]*StoreObs),
|
||||
loaded: true,
|
||||
}
|
||||
|
||||
cfg := &Config{Port: 0}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
// While backfilling
|
||||
req := httptest.NewRequest("GET", "/api/stats", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
router.ServeHTTP(rec, req)
|
||||
|
||||
var resp map[string]interface{}
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("failed to parse stats response: %v", err)
|
||||
}
|
||||
|
||||
if backfilling, ok := resp["backfilling"]; !ok {
|
||||
t.Fatal("missing 'backfilling' field in stats response")
|
||||
} else if backfilling != true {
|
||||
t.Fatalf("expected backfilling=true, got %v", backfilling)
|
||||
}
|
||||
|
||||
if _, ok := resp["backfillProgress"]; !ok {
|
||||
t.Fatal("missing 'backfillProgress' field in stats response")
|
||||
}
|
||||
|
||||
// Check header
|
||||
if got := rec.Header().Get("X-CoreScope-Status"); got != "backfilling" {
|
||||
t.Fatalf("expected X-CoreScope-Status=backfilling, got %q", got)
|
||||
}
|
||||
|
||||
// After backfill completes
|
||||
store.backfillComplete.Store(true)
|
||||
// Invalidate stats cache
|
||||
srv.statsMu.Lock()
|
||||
srv.statsCache = nil
|
||||
srv.statsMu.Unlock()
|
||||
|
||||
rec = httptest.NewRecorder()
|
||||
router.ServeHTTP(rec, req)
|
||||
|
||||
resp = nil
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("failed to parse stats response: %v", err)
|
||||
}
|
||||
|
||||
if backfilling, ok := resp["backfilling"]; !ok || backfilling != false {
|
||||
t.Fatalf("expected backfilling=false after completion, got %v", backfilling)
|
||||
}
|
||||
|
||||
if got := rec.Header().Get("X-CoreScope-Status"); got != "ready" {
|
||||
t.Fatalf("expected X-CoreScope-Status=ready, got %q", got)
|
||||
}
|
||||
}
|
||||
@@ -1,89 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// handleBackup streams a consistent SQLite snapshot of the analyzer DB.
|
||||
//
|
||||
// Requires API-key authentication (mounted via requireAPIKey in routes.go).
|
||||
//
|
||||
// Strategy: SQLite's `VACUUM INTO 'path'` produces an atomic, defragmented
|
||||
// copy of the current database into a new file. It runs at READ ISOLATION
|
||||
// against the source DB (works on our read-only connection) and never
|
||||
// blocks concurrent writers — the ingestor keeps writing to the WAL while
|
||||
// the snapshot is taken from a consistent read transaction.
|
||||
//
|
||||
// Response:
|
||||
//
|
||||
// 200 OK
|
||||
// Content-Type: application/octet-stream
|
||||
// Content-Disposition: attachment; filename="corescope-backup-<unix>.db"
|
||||
// <body: complete SQLite database file>
|
||||
//
|
||||
// The temp file is removed after the response is fully written, regardless
|
||||
// of whether the client successfully consumed the stream.
|
||||
func (s *Server) handleBackup(w http.ResponseWriter, r *http.Request) {
|
||||
if s.db == nil || s.db.conn == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "database unavailable")
|
||||
return
|
||||
}
|
||||
|
||||
ts := time.Now().UTC().Unix()
|
||||
clientIP := r.Header.Get("X-Forwarded-For")
|
||||
if clientIP == "" {
|
||||
clientIP = r.RemoteAddr
|
||||
}
|
||||
log.Printf("[backup] generating backup for client %s", clientIP)
|
||||
|
||||
// Stage the snapshot in the OS temp dir so we never touch the live DB
|
||||
// directory (avoids confusing operators / accidental WAL clobber).
|
||||
tmpDir, err := os.MkdirTemp("", "corescope-backup-")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "tempdir failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
defer func() {
|
||||
if rmErr := os.RemoveAll(tmpDir); rmErr != nil {
|
||||
log.Printf("[backup] cleanup error: %v", rmErr)
|
||||
}
|
||||
}()
|
||||
|
||||
snapshotPath := filepath.Join(tmpDir, fmt.Sprintf("corescope-backup-%d.db", ts))
|
||||
|
||||
// SQLite parses the path literal — escape any single quotes defensively.
|
||||
// (mkdtemp output won't contain quotes, but be paranoid for future-proofing.)
|
||||
escaped := strings.ReplaceAll(snapshotPath, "'", "''")
|
||||
if _, err := s.db.conn.ExecContext(r.Context(), fmt.Sprintf("VACUUM INTO '%s'", escaped)); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "snapshot failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
f, err := os.Open(snapshotPath)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "open snapshot failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
stat, err := f.Stat()
|
||||
if err == nil {
|
||||
w.Header().Set("Content-Length", fmt.Sprintf("%d", stat.Size()))
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/octet-stream")
|
||||
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=\"corescope-backup-%d.db\"", ts))
|
||||
w.Header().Set("X-Content-Type-Options", "nosniff")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
|
||||
if _, err := io.Copy(w, f); err != nil {
|
||||
// Headers already flushed; just log. Client will see truncated stream.
|
||||
log.Printf("[backup] stream error: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -1,55 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// sqliteMagic is the 16-byte file header identifying a valid SQLite 3 database.
|
||||
// See https://www.sqlite.org/fileformat.html#magic_header_string
|
||||
const sqliteMagic = "SQLite format 3\x00"
|
||||
|
||||
func TestBackupRequiresAPIKey(t *testing.T) {
|
||||
_, router := setupTestServerWithAPIKey(t, "test-secret-key-strong-enough")
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/backup", nil)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
if w.Code != http.StatusUnauthorized {
|
||||
t.Fatalf("expected 401 without API key, got %d (body: %s)", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestBackupReturnsValidSQLiteSnapshot(t *testing.T) {
|
||||
const apiKey = "test-secret-key-strong-enough"
|
||||
_, router := setupTestServerWithAPIKey(t, apiKey)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/backup", nil)
|
||||
req.Header.Set("X-API-Key", apiKey)
|
||||
w := httptest.NewRecorder()
|
||||
router.ServeHTTP(w, req)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d (body: %s)", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
ct := w.Header().Get("Content-Type")
|
||||
if ct != "application/octet-stream" {
|
||||
t.Errorf("expected Content-Type application/octet-stream, got %q", ct)
|
||||
}
|
||||
|
||||
cd := w.Header().Get("Content-Disposition")
|
||||
if !strings.HasPrefix(cd, "attachment;") || !strings.Contains(cd, "filename=\"corescope-backup-") || !strings.HasSuffix(cd, ".db\"") {
|
||||
t.Errorf("expected Content-Disposition attachment with corescope-backup-<ts>.db filename, got %q", cd)
|
||||
}
|
||||
|
||||
body := w.Body.Bytes()
|
||||
if len(body) < len(sqliteMagic) {
|
||||
t.Fatalf("backup body too short (%d bytes) — expected SQLite file", len(body))
|
||||
}
|
||||
if got := string(body[:len(sqliteMagic)]); got != sqliteMagic {
|
||||
t.Fatalf("expected SQLite magic header %q, got %q", sqliteMagic, got)
|
||||
}
|
||||
}
|
||||
@@ -162,7 +162,7 @@ func createTestDBWithAgedPackets(t *testing.T, numRecent, numOld int) string {
|
||||
}
|
||||
execOrFail(`CREATE TABLE transmissions (id INTEGER PRIMARY KEY, raw_hex TEXT, hash TEXT, first_seen TEXT, route_type INTEGER, payload_type INTEGER, payload_version INTEGER, decoded_json TEXT)`)
|
||||
execOrFail(`CREATE TABLE observations (id INTEGER PRIMARY KEY, transmission_id INTEGER, observer_id TEXT, observer_name TEXT, direction TEXT, snr REAL, rssi REAL, score INTEGER, path_json TEXT, timestamp TEXT, raw_hex TEXT)`)
|
||||
execOrFail(`CREATE TABLE observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`)
|
||||
execOrFail(`CREATE TABLE observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT)`)
|
||||
execOrFail(`CREATE TABLE nodes (pubkey TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL, last_seen TEXT, first_seen TEXT, frequency REAL)`)
|
||||
execOrFail(`CREATE TABLE schema_version (version INTEGER)`)
|
||||
execOrFail(`INSERT INTO schema_version (version) VALUES (1)`)
|
||||
@@ -172,20 +172,16 @@ func createTestDBWithAgedPackets(t *testing.T, numRecent, numOld int) string {
|
||||
id := 1
|
||||
// Insert old packets (48 hours ago)
|
||||
for i := 0; i < numOld; i++ {
|
||||
oldT := now.Add(-48 * time.Hour).Add(time.Duration(i) * time.Second)
|
||||
ts := oldT.Format(time.RFC3339)
|
||||
ts := now.Add(-48 * time.Hour).Add(time.Duration(i) * time.Second).Format(time.RFC3339)
|
||||
conn.Exec("INSERT INTO transmissions VALUES (?,?,?,?,0,4,1,?)", id, "aa", fmt.Sprintf("old%d", i), ts, `{}`)
|
||||
// observations.timestamp is INTEGER (unix seconds) in production schema
|
||||
// — keep the fixture consistent so the RFC3339 subquery matches.
|
||||
conn.Exec("INSERT INTO observations VALUES (?,?,?,?,?,?,?,?,?,?,?)", id, id, "obs1", "Obs1", "RX", -10.0, -80.0, 5, `[]`, oldT.Unix(), "")
|
||||
conn.Exec("INSERT INTO observations VALUES (?,?,?,?,?,?,?,?,?,?,?)", id, id, "obs1", "Obs1", "RX", -10.0, -80.0, 5, `[]`, ts, "")
|
||||
id++
|
||||
}
|
||||
// Insert recent packets (within last hour)
|
||||
for i := 0; i < numRecent; i++ {
|
||||
newT := now.Add(-30 * time.Minute).Add(time.Duration(i) * time.Second)
|
||||
ts := newT.Format(time.RFC3339)
|
||||
ts := now.Add(-30 * time.Minute).Add(time.Duration(i) * time.Second).Format(time.RFC3339)
|
||||
conn.Exec("INSERT INTO transmissions VALUES (?,?,?,?,0,4,1,?)", id, "bb", fmt.Sprintf("new%d", i), ts, `{}`)
|
||||
conn.Exec("INSERT INTO observations VALUES (?,?,?,?,?,?,?,?,?,?,?)", id, id, "obs1", "Obs1", "RX", -10.0, -80.0, 5, `[]`, newT.Unix(), "")
|
||||
conn.Exec("INSERT INTO observations VALUES (?,?,?,?,?,?,?,?,?,?,?)", id, id, "obs1", "Obs1", "RX", -10.0, -80.0, 5, `[]`, ts, "")
|
||||
id++
|
||||
}
|
||||
return dbPath
|
||||
@@ -321,7 +317,7 @@ func createTestDBAt(tb testing.TB, dbPath string, numTx int) {
|
||||
direction TEXT, snr REAL, rssi REAL, score INTEGER,
|
||||
path_json TEXT, timestamp TEXT, raw_hex TEXT
|
||||
)`)
|
||||
execOrFail(`CREATE TABLE IF NOT EXISTS observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`)
|
||||
execOrFail(`CREATE TABLE IF NOT EXISTS observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT)`)
|
||||
execOrFail(`CREATE TABLE IF NOT EXISTS nodes (
|
||||
pubkey TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL,
|
||||
last_seen TEXT, first_seen TEXT, frequency REAL
|
||||
@@ -372,7 +368,7 @@ func createTestDBWithObs(tb testing.TB, dbPath string, numTx int) {
|
||||
id INTEGER PRIMARY KEY, transmission_id INTEGER, observer_id TEXT, observer_name TEXT,
|
||||
direction TEXT, snr REAL, rssi REAL, score INTEGER, path_json TEXT, timestamp TEXT, raw_hex TEXT
|
||||
)`)
|
||||
execOrFail(`CREATE TABLE IF NOT EXISTS observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT, iata TEXT)`)
|
||||
execOrFail(`CREATE TABLE IF NOT EXISTS observers (rowid INTEGER PRIMARY KEY, id TEXT, name TEXT)`)
|
||||
execOrFail(`CREATE TABLE IF NOT EXISTS nodes (
|
||||
pubkey TEXT PRIMARY KEY, name TEXT, role TEXT, lat REAL, lon REAL,
|
||||
last_seen TEXT, first_seen TEXT, frequency REAL
|
||||
|
||||
@@ -1,123 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/mux"
|
||||
)
|
||||
|
||||
// TestBridgeScore_HandleNodesSurface verifies that /api/nodes
|
||||
// includes a `bridge_score` field on repeater rows after the bridge
|
||||
// recomputer has run. Drives the line-graph A-B-C-D through the full
|
||||
// pipeline: insert nodes, populate the neighbor graph, force a
|
||||
// recompute, hit the handler, parse the response. Issue #672 axis 2.
|
||||
func TestBridgeScore_HandleNodesSurface(t *testing.T) {
|
||||
db := setupCapabilityTestDB(t)
|
||||
defer db.conn.Close()
|
||||
// handleNodes/db.GetNodes selects a foreign_advert column not in
|
||||
// the minimal capability-test schema.
|
||||
if _, err := db.conn.Exec(`ALTER TABLE nodes ADD COLUMN foreign_advert INTEGER DEFAULT 0`); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Four repeater nodes in a line.
|
||||
pks := []string{
|
||||
"aaaa000000000000000000000000000000000000000000000000000000000000",
|
||||
"bbbb000000000000000000000000000000000000000000000000000000000000",
|
||||
"cccc000000000000000000000000000000000000000000000000000000000000",
|
||||
"dddd000000000000000000000000000000000000000000000000000000000000",
|
||||
}
|
||||
recent := time.Now().UTC().Format("2006-01-02T15:04:05.000Z")
|
||||
for _, pk := range pks {
|
||||
if _, err := db.conn.Exec(`INSERT INTO nodes
|
||||
(public_key, name, role, lat, lon, last_seen, first_seen, advert_count)
|
||||
VALUES (?, ?, 'repeater', 37.5, -122.0, ?, ?, 10)`,
|
||||
pk, "node-"+pk[:4], recent, recent); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
store := NewPacketStore(db, nil)
|
||||
// Build neighbor graph with the line A-B-C-D. Add each edge
|
||||
// `count` times so its time-decayed Score saturates.
|
||||
g := NewNeighborGraph()
|
||||
now := time.Now()
|
||||
obs := "obs-test"
|
||||
snr := 5.0
|
||||
for i := 0; i < 10; i++ {
|
||||
g.upsertEdge(pks[0], pks[1], "aa", obs, &snr, now)
|
||||
g.upsertEdge(pks[1], pks[2], "bb", obs, &snr, now)
|
||||
g.upsertEdge(pks[2], pks[3], "cc", obs, &snr, now)
|
||||
}
|
||||
store.graph.Store(g)
|
||||
|
||||
// Direct invocation of the recomputer's compute path — bypassing
|
||||
// StartBridgeScoreRecomputer's package-level once-flag (which is
|
||||
// problematic across tests).
|
||||
recomputeBridgeScoresSafe(store)
|
||||
|
||||
snap := store.GetBridgeScoreMap()
|
||||
if len(snap) == 0 {
|
||||
t.Fatalf("expected non-empty bridge score snapshot, got empty")
|
||||
}
|
||||
// Sanity: middle nodes b/c must be positive, ends must be zero.
|
||||
if snap[pks[1]] <= 0 || snap[pks[2]] <= 0 {
|
||||
t.Errorf("middle nodes should have positive bridge: b=%v c=%v",
|
||||
snap[pks[1]], snap[pks[2]])
|
||||
}
|
||||
if snap[pks[0]] != 0 || snap[pks[3]] != 0 {
|
||||
t.Errorf("end nodes should have zero bridge: a=%v d=%v",
|
||||
snap[pks[0]], snap[pks[3]])
|
||||
}
|
||||
|
||||
// Wire a Server, call handleNodes, parse the response.
|
||||
cfg := &Config{Port: 3000}
|
||||
hub := NewHub()
|
||||
srv := NewServer(db, cfg, hub)
|
||||
srv.store = store
|
||||
|
||||
router := mux.NewRouter()
|
||||
srv.RegisterRoutes(router)
|
||||
|
||||
req := httptest.NewRequest("GET", "/api/nodes?limit=100", nil)
|
||||
rr := httptest.NewRecorder()
|
||||
router.ServeHTTP(rr, req)
|
||||
|
||||
if rr.Code != 200 {
|
||||
t.Fatalf("handleNodes status: want 200, got %d body=%s", rr.Code, rr.Body.String())
|
||||
}
|
||||
var resp struct {
|
||||
Nodes []map[string]interface{} `json:"nodes"`
|
||||
}
|
||||
if err := json.Unmarshal(rr.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode: %v body=%s", err, rr.Body.String())
|
||||
}
|
||||
gotBy := map[string]map[string]interface{}{}
|
||||
for _, n := range resp.Nodes {
|
||||
if pk, _ := n["public_key"].(string); pk != "" {
|
||||
gotBy[pk] = n
|
||||
}
|
||||
}
|
||||
for _, pk := range pks {
|
||||
n, ok := gotBy[pk]
|
||||
if !ok {
|
||||
t.Errorf("node %s missing from response", pk[:4])
|
||||
continue
|
||||
}
|
||||
if _, has := n["bridge_score"]; !has {
|
||||
t.Errorf("node %s: bridge_score field absent from response", pk[:4])
|
||||
}
|
||||
}
|
||||
// Middle node B must report a non-zero bridge_score; end node A
|
||||
// must report exactly zero. These two assertions together prevent
|
||||
// a "field present but always 0" regression.
|
||||
if v, _ := gotBy[pks[1]]["bridge_score"].(float64); v <= 0 {
|
||||
t.Errorf("middle node B bridge_score in API response should be > 0, got %v", v)
|
||||
}
|
||||
if v, _ := gotBy[pks[0]]["bridge_score"].(float64); v != 0 {
|
||||
t.Errorf("end node A bridge_score in API response should be 0, got %v", v)
|
||||
}
|
||||
}
|
||||
@@ -1,198 +0,0 @@
|
||||
// Package main: bridge-axis recomputer (issue #672 axis 2 of 4).
|
||||
//
|
||||
// Steady-state background loop that recomputes the per-pubkey bridge
|
||||
// centrality score over the in-memory NeighborGraph and stores the
|
||||
// resulting map atomically. handleNodes reads via a single atomic
|
||||
// load — no lock contention with ingest or with other recomputers
|
||||
// (same pattern as #1240 / #1248).
|
||||
//
|
||||
// Interval default: 5 minutes. The graph itself rebuilds asynchronously
|
||||
// on its own schedule (path_inspect.go); a 5-minute cadence here is
|
||||
// well within the freshness budget for a structural metric (centrality
|
||||
// changes slowly — a new edge or evicted node nudges scores by
|
||||
// fractions of a percent).
|
||||
//
|
||||
// Cost (Brandes + Dijkstra): O(V · (E + V log V)). Staging-scale ~600
|
||||
// nodes / ~2 000 edges ≈ ~4.8M ops, well under 100 ms in practice. On
|
||||
// host-fleet scale (5 000 nodes / 30 000 edges) it is still seconds,
|
||||
// running in a background goroutine off the request path.
|
||||
package main
|
||||
|
||||
import (
	"log"
	"sync"
	"time"
)
|
||||
|
||||
// bridgeRecomputerDefaultInterval is how often the bridge score map is
|
||||
// rebuilt. 5 minutes mirrors analytics_recomputer (#1240) and
|
||||
// repeater_enrich_recomputer (#1262); centrality is a slow-moving
|
||||
// structural signal and does not warrant tighter cadence.
|
||||
const bridgeRecomputerDefaultInterval = 5 * time.Minute
|
||||
|
||||
// bridgeRecompStartedMu serializes start of the bridge recomputer.
|
||||
// We do not currently expose Stop publicly — the goroutine lives for
|
||||
// the lifetime of the process — but keeping the started flag local
|
||||
// (instead of on PacketStore) avoids further field churn in store.go.
|
||||
var (
|
||||
bridgeRecompStartedMu sync.Mutex
|
||||
bridgeRecompStarted bool
|
||||
)
|
||||
|
||||
// StartBridgeScoreRecomputer launches the bridge-centrality recomputer
|
||||
// (issue #672 axis 2). It performs an initial synchronous compute so
|
||||
// that the very first /api/nodes after server start hits a populated
|
||||
// snapshot instead of returning bridge_score=0 for every node, then
|
||||
// reschedules every `interval` (default 5min if <= 0).
|
||||
//
|
||||
// Idempotent: subsequent calls are no-ops and return a no-op stop
|
||||
// closure.
|
||||
func (s *PacketStore) StartBridgeScoreRecomputer(interval time.Duration) func() {
|
||||
if interval <= 0 {
|
||||
interval = bridgeRecomputerDefaultInterval
|
||||
}
|
||||
|
||||
bridgeRecompStartedMu.Lock()
|
||||
if bridgeRecompStarted {
|
||||
bridgeRecompStartedMu.Unlock()
|
||||
return func() {}
|
||||
}
|
||||
bridgeRecompStarted = true
|
||||
stop := make(chan struct{})
|
||||
done := make(chan struct{})
|
||||
bridgeRecompStartedMu.Unlock()
|
||||
|
||||
// Initial synchronous prewarm — see comment above.
|
||||
recomputeBridgeScoresSafe(s)
|
||||
|
||||
var stopOnce sync.Once
|
||||
go func() {
|
||||
defer close(done)
|
||||
t := time.NewTicker(interval)
|
||||
defer t.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-t.C:
|
||||
recomputeBridgeScoresSafe(s)
|
||||
case <-stop:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
return func() {
|
||||
stopOnce.Do(func() {
|
||||
close(stop)
|
||||
})
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(5 * time.Second):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// recomputeBridgeScoresSafe runs ComputeBridgeScores over the current
|
||||
// neighbor graph and installs the result. Panics in compute are
|
||||
// swallowed (defensive) so the goroutine never dies; the previous
|
||||
// snapshot remains valid.
|
||||
func recomputeBridgeScoresSafe(s *PacketStore) {
|
||||
defer func() { _ = recover() }()
|
||||
graph := s.graph.Load()
|
||||
if graph == nil {
|
||||
// No graph yet — install an empty map so readers get a defined
|
||||
// zero rather than a nil sentinel (handleNodes treats both as
|
||||
// 0.0, but an explicit empty snapshot avoids "is this ready
|
||||
// yet?" confusion in operator-facing tooling).
|
||||
empty := map[string]float64{}
|
||||
s.bridgeScoreMap.Store(&empty)
|
||||
return
|
||||
}
|
||||
now := time.Now()
|
||||
edges := bridgeEdgesFromGraph(graph, now)
|
||||
scores := ComputeBridgeScores(edges)
|
||||
s.bridgeScoreMap.Store(&scores)
|
||||
}
|
||||
|
||||
// bridgeEdgesFromGraph snapshots the NeighborGraph into a flat slice
|
||||
// of BridgeEdge tuples with weight = Score(now) * Confidence(), per
|
||||
// the convention established by #1235. Edges with unresolved B
|
||||
// endpoints (no concrete pubkey yet — only a hop prefix) are skipped:
|
||||
// they contribute no betweenness signal because the second endpoint
|
||||
// is unknown.
|
||||
func bridgeEdgesFromGraph(graph *NeighborGraph, now time.Time) []BridgeEdge {
|
||||
all := graph.AllEdges()
|
||||
out := make([]BridgeEdge, 0, len(all))
|
||||
for _, e := range all {
|
||||
if e == nil {
|
||||
continue
|
||||
}
|
||||
if e.NodeA == "" || e.NodeB == "" {
|
||||
// Unresolved (prefix-only) — no defined second endpoint.
|
||||
continue
|
||||
}
|
||||
w := e.Score(now) * e.Confidence()
|
||||
if w < bridgeMinWeightEpsilon {
|
||||
continue
|
||||
}
|
||||
out = append(out, BridgeEdge{A: e.NodeA, B: e.NodeB, Weight: w})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// GetBridgeScore returns the bridge centrality score for a pubkey in
|
||||
// [0, 1], or 0 if the recomputer has not run yet or the pubkey is not
|
||||
// in the graph. Lookup is case-insensitive (the score map keys are
|
||||
// lowercase, matching byPathHop convention).
|
||||
func (s *PacketStore) GetBridgeScore(pubkey string) float64 {
|
||||
if pubkey == "" {
|
||||
return 0
|
||||
}
|
||||
snap := s.bridgeScoreMap.Load()
|
||||
if snap == nil {
|
||||
return 0
|
||||
}
|
||||
m := *snap
|
||||
if v, ok := m[pubkey]; ok {
|
||||
return v
|
||||
}
|
||||
// Try lowercase form.
|
||||
lc := pubkey
|
||||
for i := 0; i < len(lc); i++ {
|
||||
if lc[i] >= 'A' && lc[i] <= 'Z' {
|
||||
b := []byte(pubkey)
|
||||
for j := i; j < len(b); j++ {
|
||||
if b[j] >= 'A' && b[j] <= 'Z' {
|
||||
b[j] += 'a' - 'A'
|
||||
}
|
||||
}
|
||||
lc = string(b)
|
||||
break
|
||||
}
|
||||
}
|
||||
if v, ok := m[lc]; ok {
|
||||
return v
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// GetBridgeScoreMap returns a defensive copy-by-reference of the
|
||||
// current bridge score snapshot. Nil-safe: returns an empty map if
|
||||
// no snapshot has been installed yet. Map is read-only by convention
|
||||
// — callers MUST NOT mutate it (the snapshot is shared across all
|
||||
// concurrent readers).
|
||||
func (s *PacketStore) GetBridgeScoreMap() map[string]float64 {
|
||||
snap := s.bridgeScoreMap.Load()
|
||||
if snap == nil {
|
||||
return map[string]float64{}
|
||||
}
|
||||
return *snap
|
||||
}
|
||||
|
||||
// resetBridgeRecomputerForTest is a test-only helper to allow the
|
||||
// integration test to re-Start the recomputer in a fresh process
|
||||
// (which would otherwise be blocked by the package-level
|
||||
// bridgeRecompStarted flag). Production code must not call this.
|
||||
func resetBridgeRecomputerForTest() {
|
||||
bridgeRecompStartedMu.Lock()
|
||||
bridgeRecompStarted = false
|
||||
bridgeRecompStartedMu.Unlock()
|
||||
}
|
||||
@@ -1,206 +0,0 @@
|
||||
// Package main: bridge axis of repeater usefulness score (issue #672,
|
||||
// axis 2 of 4). The "Bridge" signal is the betweenness centrality of a
|
||||
// node in the (undirected, weighted) neighbor graph: a high value means
|
||||
// the node lies on many shortest paths between other pairs and is hence
|
||||
// structurally important — removing it would force traffic around or
|
||||
// fragment the mesh.
|
||||
//
|
||||
// Algorithm: Brandes' algorithm (1) with Dijkstra for weighted
|
||||
// shortest paths. Complexity O(V · (E + V log V)). For the staging
|
||||
// graph (~600 nodes, ~2 000 edges) this is ~4.8M ops — trivial,
|
||||
// completes in milliseconds. We accumulate raw betweenness across all
|
||||
// sources, halve (an undirected pair is counted from each endpoint
|
||||
// once), then normalize by the max observed value so the per-node
|
||||
// score is in [0, 1].
|
||||
//
|
||||
// Edge weight follows the convention established by #1235: the
|
||||
// affinity score (count + recency decay) is multiplied by the
|
||||
// observer-diversity confidence — stronger, more corroborated
|
||||
// neighborships are preferred when there is a choice of paths.
|
||||
// Geo-rejected edges are already excluded from the input graph at
|
||||
// build time (#1230) so we don't have to re-filter here.
|
||||
//
|
||||
// For Dijkstra we need a DISTANCE (lower = better) not an affinity
|
||||
// (higher = better), so we convert: cost = 1 / max(epsilon, weight).
|
||||
// epsilon avoids divide-by-zero on a degenerate zero-weight edge.
|
||||
//
|
||||
// (1) Brandes, "A Faster Algorithm for Betweenness Centrality" (2001).
|
||||
package main
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
"math"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// BridgeEdge is the edge tuple consumed by ComputeBridgeScores. A and B are
// the endpoint identifiers (trimmed and lowercased internally before use as
// map keys). Weight is an affinity: higher means a stronger connection.
// Edges whose weight is below bridgeMinWeightEpsilon, NaN, or infinite are
// ignored.
type BridgeEdge struct {
	A, B   string
	Weight float64
}

// bridgeMinWeightEpsilon is the smallest weight accepted for an edge.
// Anything below it is dropped before the weight is inverted into a
// distance, which also rules out division by zero.
const bridgeMinWeightEpsilon = 1e-9

// ComputeBridgeScores returns, for every node that appears on at least one
// accepted edge, its betweenness centrality on the undirected weighted graph
// defined by edges, scaled so the largest value is 1. Keys are the trimmed,
// lowercased endpoint strings. Nodes with no betweenness are present with
// value 0. If every node has zero betweenness (for example a single edge)
// all values stay 0. An input with no accepted edges yields an empty map.
//
// Edge handling:
//   - empty endpoints and self-loops (A == B after normalization) are skipped;
//   - weights that are NaN, ±Inf, or below bridgeMinWeightEpsilon are
//     skipped. NaN must be tested explicitly because it fails every ordered
//     comparison, and +Inf would invert to a zero-length edge, which breaks
//     the strict settle order that the back-propagation step relies on;
//   - the distance used for shortest paths is 1/weight; for duplicate edges
//     between the same pair the smallest distance (highest weight) wins.
//
// The algorithm is Brandes' betweenness accumulation with a Dijkstra
// single-source search per node. The function touches no package state.
func ComputeBridgeScores(edges []BridgeEdge) map[string]float64 {
	// 1. Build the adjacency map with distance = 1/weight.
	adj := make(map[string]map[string]float64)
	addOrMerge := func(a, b string, dist float64) {
		m, ok := adj[a]
		if !ok {
			m = make(map[string]float64)
			adj[a] = m
		}
		if existing, has := m[b]; !has || dist < existing {
			m[b] = dist
		}
	}
	for _, e := range edges {
		a := strings.ToLower(strings.TrimSpace(e.A))
		b := strings.ToLower(strings.TrimSpace(e.B))
		if a == "" || b == "" || a == b {
			continue
		}
		w := e.Weight
		if math.IsNaN(w) || math.IsInf(w, 0) || w < bridgeMinWeightEpsilon {
			continue
		}
		dist := 1.0 / w
		addOrMerge(a, b, dist)
		addOrMerge(b, a, dist)
	}
	if len(adj) == 0 {
		return map[string]float64{}
	}

	nodes := make([]string, 0, len(adj))
	for n := range adj {
		nodes = append(nodes, n)
	}

	bc := make(map[string]float64, len(nodes))
	for _, n := range nodes {
		bc[n] = 0
	}

	// 2. One Dijkstra-based single-source shortest-path pass per node.
	for _, s := range nodes {
		stack := make([]string, 0, len(nodes)) // nodes in settle order
		pred := make(map[string][]string, len(nodes))
		sigma := make(map[string]float64, len(nodes)) // shortest-path counts
		dist := make(map[string]float64, len(nodes))
		for _, n := range nodes {
			sigma[n] = 0
			dist[n] = math.Inf(1)
		}
		sigma[s] = 1
		dist[s] = 0

		pq := &bridgePQ{}
		heap.Init(pq)
		heap.Push(pq, bridgePQItem{node: s, dist: 0})

		visited := make(map[string]bool, len(nodes))
		for pq.Len() > 0 {
			top := heap.Pop(pq).(bridgePQItem)
			v := top.node
			if visited[v] {
				// Stale queue entry left behind by a later, shorter relaxation.
				continue
			}
			visited[v] = true
			stack = append(stack, v)

			for w, edgeDist := range adj[v] {
				if visited[w] {
					// A settled vertex already has its final distance and
					// path count. Skipping it keeps the tolerance-based tie
					// test below from adding v as a predecessor of a vertex
					// that precedes it in settle order.
					continue
				}
				alt := dist[v] + edgeDist
				if alt < dist[w]-1e-12 {
					// Strictly shorter: restart w's predecessor list.
					dist[w] = alt
					sigma[w] = sigma[v]
					pred[w] = append(pred[w][:0], v)
					heap.Push(pq, bridgePQItem{node: w, dist: alt})
				} else if math.Abs(alt-dist[w]) <= 1e-12 {
					// Tie: one more family of shortest paths reaches w via v.
					sigma[w] += sigma[v]
					pred[w] = append(pred[w], v)
				}
			}
		}

		// 3. Back-propagate dependencies in reverse settle order.
		delta := make(map[string]float64, len(nodes))
		for i := len(stack) - 1; i >= 0; i-- {
			w := stack[i]
			for _, v := range pred[w] {
				if sigma[w] == 0 {
					continue
				}
				delta[v] += (sigma[v] / sigma[w]) * (1.0 + delta[w])
			}
			if w != s {
				bc[w] += delta[w]
			}
		}
	}

	// 4. Each unordered pair was counted from both endpoints; halve.
	for k := range bc {
		bc[k] /= 2.0
	}

	// 5. Scale by the maximum so results lie in [0, 1]. With a zero maximum
	// everything is left at zero.
	maxBC := 0.0
	for _, v := range bc {
		if v > maxBC {
			maxBC = v
		}
	}
	if maxBC > 0 {
		for k, v := range bc {
			bc[k] = v / maxBC
		}
	}
	return bc
}

// ─── min-heap for Dijkstra ─────────────────────────────────────────────────────

// bridgePQItem is one queue entry: a node and the tentative distance it was
// pushed with.
type bridgePQItem struct {
	node string
	dist float64
}

// bridgePQ implements container/heap.Interface as a min-heap on dist.
type bridgePQ []bridgePQItem

func (h bridgePQ) Len() int            { return len(h) }
func (h bridgePQ) Less(i, j int) bool  { return h[i].dist < h[j].dist }
func (h bridgePQ) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *bridgePQ) Push(x interface{}) { *h = append(*h, x.(bridgePQItem)) }

// Pop removes and returns the last element, as container/heap requires.
func (h *bridgePQ) Pop() interface{} {
	old := *h
	n := len(old)
	it := old[n-1]
	*h = old[:n-1]
	return it
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user