diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 0c648fc9..21ee8b4f 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -83,6 +83,9 @@ jobs: - name: Verify Dockerfile COPY invariants (issue #1316) run: bash scripts/check-dockerfile-internal-pkgs.sh + - name: Staging disk-monitor unit tests (issue #1684) + run: bash scripts/staging/test-disk-monitor.sh + - name: Lint CSS variables (issue #1128) run: | set -e diff --git a/DEPLOY.md b/DEPLOY.md index 450c3e1f..cc3a28a1 100644 --- a/DEPLOY.md +++ b/DEPLOY.md @@ -129,3 +129,98 @@ docker compose pull && docker compose up -d | `./manage.sh setup` | Copy `docker-compose.example.yml`, edit env vars | `manage.sh` remains available for advanced use cases (building from source, custom patches, development). Pre-built images are recommended for most production deployments. + +## Staging VM — disk-usage monitor & cleanup (#1684) + +The staging VM ran out of disk during a hot-patch (#1684). To prevent +repeats, two scripts live in `scripts/staging/`: + +- `disk-monitor.sh [mount]` — reads `df -P` for the mount (default `/`), classifies usage against + `<80 ok / >=80 warn / >=90 error / >=95 alert`, emits to stderr + + journald (via `logger`). Returns non-zero on `error|alert` so + systemd surfaces the unit as failed. +- `disk-cleanup.sh` — removes `/tmp` snapshot files + (`staging-snap.*`, `cs-*`, `node-compile-cache`) older than 7 days, + then runs `docker builder prune --filter "until=72h"` and + `docker image prune --filter "until=72h" --filter "label!=keep"`. Set + `CORESCOPE_CLEANUP_DRY_RUN=1` to log without deleting. 
+ +### Install on the staging host + +SSH to `STAGING_HOST` as the staging operator user and: + +```bash +sudo install -m 0755 scripts/staging/disk-monitor.sh /usr/local/bin/corescope-disk-monitor +sudo install -m 0755 scripts/staging/disk-cleanup.sh /usr/local/bin/corescope-disk-cleanup + +# 15-minute monitor +sudo tee /etc/systemd/system/corescope-disk-monitor.service >/dev/null <<'UNIT' +[Unit] +Description=CoreScope staging disk-usage monitor (issue #1684) +[Service] +Type=oneshot +ExecStart=/usr/local/bin/corescope-disk-monitor / +UNIT + +sudo tee /etc/systemd/system/corescope-disk-monitor.timer >/dev/null <<'UNIT' +[Unit] +Description=Run CoreScope disk-usage monitor every 15 minutes +[Timer] +OnBootSec=5min +OnUnitActiveSec=15min +Unit=corescope-disk-monitor.service +[Install] +WantedBy=timers.target +UNIT + +# Daily cleanup at 03:30 local +sudo tee /etc/systemd/system/corescope-disk-cleanup.service >/dev/null <<'UNIT' +[Unit] +Description=CoreScope staging disk cleanup (issue #1684) +[Service] +Type=oneshot +ExecStart=/usr/local/bin/corescope-disk-cleanup +UNIT + +sudo tee /etc/systemd/system/corescope-disk-cleanup.timer >/dev/null <<'UNIT' +[Unit] +Description=Run CoreScope disk cleanup daily at off-peak +[Timer] +OnCalendar=*-*-* 03:30:00 +Persistent=true +Unit=corescope-disk-cleanup.service +[Install] +WantedBy=timers.target +UNIT + +sudo systemctl daemon-reload +sudo systemctl enable --now corescope-disk-monitor.timer corescope-disk-cleanup.timer +``` + +`STAGING_HOST` is the staging VM hostname/IP — the operator supplies it; it is +not committed to the repo. + +### Inspecting alerts + +```bash +journalctl -t corescope-disk-monitor --since '-1d' +journalctl -t corescope-disk-cleanup --since '-7d' +systemctl list-timers | grep corescope-disk +``` + +`logger` priorities map: `ok→info`, `warn→warning`, `error→err`, +`alert→alert` (syslog severity 1; only `emerg` is more severe). 
Wire
+`journalctl -p alert ...` to whatever ops channel the operator
+prefers; use `-p err` to also catch the `error` tier.
+
+### Notes on `staging-snap.db` root cause (#1684 phase 3)
+
+`grep -rn staging-snap.db cmd/ public/ scripts/` returns **zero**
+hits in the repo. The 4.4 GB orphan was a manual debugging artifact,
+not produced by any committed code. The `disk-cleanup.sh` retention
+rule (anything matching `staging-snap.*` in `/tmp` older than 7 days)
+prevents recurrence without needing source-side TTL changes.
+
+If a future feature legitimately needs persistent snapshot DBs, put
+them under `/var/lib/corescope/snapshots/` with explicit rotation —
+not in `/tmp`, which is ephemeral by definition.
diff --git a/scripts/staging/disk-cleanup.sh b/scripts/staging/disk-cleanup.sh
new file mode 100755
index 00000000..7ee9cea8
--- /dev/null
+++ b/scripts/staging/disk-cleanup.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# disk-cleanup.sh — daily staging VM cleanup (issue #1684).
+#
+# Removes orphaned /tmp snapshots older than 7 days and prunes Docker
+# build cache + unused images older than 72h (images labelled `keep`
+# are spared).
+#
+# Designed to run from a daily systemd timer at off-peak. Idempotent.
+# Set CORESCOPE_CLEANUP_DRY_RUN=1 to log without deleting.
+
+set -euo pipefail
+
+DRY_RUN="${CORESCOPE_CLEANUP_DRY_RUN:-0}"
+LOG_TAG="corescope-disk-cleanup"
+
+# log MESSAGE...
+# Writes "$LOG_TAG: MESSAGE" to stderr and, when `logger` is installed,
+# sends MESSAGE to syslog under $LOG_TAG. Syslog delivery is best-effort:
+# a failing `logger` must not abort the cleanup under `set -e`.
+# Always returns 0.
+log() {
+  echo "$LOG_TAG: $*" >&2
+  if command -v logger >/dev/null 2>&1; then
+    logger -t "$LOG_TAG" -- "$*" || true
+  fi
+}
+
+# run_or_dry COMMAND [ARG...]
+# Logs the command line, then runs it unless DRY_RUN is "1".
+# Returns the command's exit status (0 in dry-run mode).
+run_or_dry() {
+  if [ "$DRY_RUN" = "1" ]; then
+    log "DRY_RUN: $*"
+  else
+    log "exec: $*"
+    "$@"
+  fi
+}
+
+# ----- /tmp snapshot retention ----------------------------------------------
+# Anything in /tmp matching known snapshot/cache patterns older than 7 days dies.
+# -mindepth 1 avoids touching /tmp itself; -maxdepth 2 limits blast radius.
+# cleanup_tmp
+# Lists (dry-run) or deletes entries at most two levels below /tmp whose
+# name matches a known snapshot/cache pattern and whose mtime is more than
+# 7 days old. Returns non-zero if the scan or the removal fails.
+cleanup_tmp() {
+  log "scanning /tmp for snapshots older than 7d"
+  # -print gives an audit trail of every match; -prune keeps find from
+  # descending into a matched directory that is about to be removed.
+  local find_args=(
+    /tmp -mindepth 1 -maxdepth 2 -mtime +7
+    \(
+      -name 'staging-snap.*' -o
+      -name 'cs-*' -o
+      -name 'node-compile-cache'
+    \)
+    -print -prune
+  )
+  if [ "$DRY_RUN" = "1" ]; then
+    find "${find_args[@]}" | while IFS= read -r f; do
+      log "DRY_RUN: would rm -rf $f"
+    done
+  else
+    find "${find_args[@]}" -exec rm -rf -- {} +
+  fi
+}
+
+# ----- Docker prune ---------------------------------------------------------
+# cleanup_docker
+# Prunes the Docker build cache and unused images older than 72h (images
+# labelled `keep` are excluded). Skips quietly when docker is not installed.
+# Both prunes are attempted; returns non-zero if either one fails.
+cleanup_docker() {
+  if ! command -v docker >/dev/null 2>&1; then
+    log "docker not installed; skipping docker prune"
+    return 0
+  fi
+  local rc=0
+  run_or_dry docker builder prune -af --filter "until=72h" || rc=1
+  run_or_dry docker image prune -af --filter "until=72h" --filter "label!=keep" || rc=1
+  return "$rc"
+}
+
+# main
+# Runs both cleanup phases. A failure in one phase (for example an
+# unreadable directory under /tmp) is logged but does not prevent the other
+# phase from running. Returns non-zero if any phase failed.
+main() {
+  local rc=0
+  log "starting (dry_run=$DRY_RUN)"
+  cleanup_tmp || { log "tmp cleanup failed"; rc=1; }
+  cleanup_docker || { log "docker cleanup failed"; rc=1; }
+  if [ "$rc" -eq 0 ]; then
+    log "done"
+  else
+    log "finished with errors"
+  fi
+  return "$rc"
+}
+
+# Only run main when executed directly (not when sourced).
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  main "$@"
+fi
diff --git a/scripts/staging/disk-monitor.sh b/scripts/staging/disk-monitor.sh
new file mode 100755
index 00000000..fb5c98c6
--- /dev/null
+++ b/scripts/staging/disk-monitor.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# disk-monitor.sh — staging VM disk-usage monitor (issue #1684).
+#
+# Reads `df` for a mount point, classifies usage against thresholds, and
+# emits a single line to stderr (and journald via systemd) at the matching
+# severity. Designed to be invoked by a 15-minute systemd timer; output
+# goes to journald which the operator can wire to alerts as needed.
+#
+# Pure-bash helpers (parse_df_percent, classify_threshold) are sourced by
+# scripts/staging/test-disk-monitor.sh — keep them side-effect free.
+
+# NOTE: this also runs when the file is sourced, so a shell that sources
+# it (the unit tests do) inherits these options.
+set -euo pipefail
+
+# ----- pure helpers (testable) -----------------------------------------------
+
+# parse_df_percent DF_OUTPUT
+# Extracts the Use% column (column 5) from a 2-line `df -P` output and
+# strips the trailing '%'. Echoes the integer percent (0-100).
Returns
+# non-zero if the input doesn't look like df output.
+parse_df_percent() {
+  local input="$1"
+  # `df -P` for one mount prints a header line plus one data line; the
+  # data line is the last one.
+  local data
+  data="$(printf '%s\n' "$input" | tail -n1)"
+  # Column 5 is Use% (e.g. "81%").
+  local pct
+  pct="$(printf '%s\n' "$data" | awk '{print $5}')"
+  case "$pct" in
+    *%) ;;
+    *) return 1 ;;
+  esac
+  pct="${pct%\%}"
+  # Enforce the documented contract: only a bare integer is ever echoed.
+  case "$pct" in
+    ''|*[!0-9]*) return 1 ;;
+  esac
+  printf '%s\n' "$pct"
+}
+
+# classify_threshold PCT
+# Echoes one of: ok | warn | error | alert based on the issue #1684 spec:
+#   <80 ok ; >=80 warn ; >=90 error ; >=95 alert
+# Returns non-zero if input is not an integer 0-100.
+classify_threshold() {
+  local pct="$1"
+  case "$pct" in
+    ''|*[!0-9]*) return 1 ;;
+  esac
+  # Drop leading zeros, then bound the length before any numeric test: a
+  # digit string too long for integer comparison makes every `[` test
+  # error out, which would otherwise fall through to "ok".
+  pct="${pct#"${pct%%[!0]*}"}"
+  pct="${pct:-0}"
+  if [ "${#pct}" -gt 3 ] || [ "$pct" -gt 100 ]; then
+    return 1
+  fi
+  if [ "$pct" -ge 95 ]; then
+    echo alert
+  elif [ "$pct" -ge 90 ]; then
+    echo error
+  elif [ "$pct" -ge 80 ]; then
+    echo warn
+  else
+    echo ok
+  fi
+}
+
+# severity_priority SEVERITY
+# Echoes the syslog priority for `logger -p`. Maps to the canonical
+# syslog severity ladder (RFC 5424): alert=1, crit=2, err=3, warning=4,
+# info=6. We deliberately use `alert` (not `crit`) for the >=95% case so
+# a downstream `journalctl -p alert` filter matches it.
+#   ok=info  warn=warning  error=err  alert=alert
+# Returns non-zero for an unknown severity.
+severity_priority() {
+  case "$1" in
+    ok) echo user.info ;;
+    warn) echo user.warning ;;
+    error) echo user.err ;;
+    alert) echo user.alert ;;
+    *) return 1 ;;
+  esac
+}
+
+# ----- main -----------------------------------------------------------------
+
+# _emit PRIORITY MESSAGE
+# Writes MESSAGE to stderr (captured by journald under systemd) and, when
+# `logger` is installed, to syslog at PRIORITY so syslog-based collectors
+# see the priority. Syslog delivery is best-effort: a logger failure must
+# not change the monitor's exit status.
+_emit() {
+  echo "$2" >&2
+  if command -v logger >/dev/null 2>&1; then
+    logger -t corescope-disk-monitor -p "$1" -- "$2" || true
+  fi
+}
+
+# main [MOUNT]
+# Reports disk usage for MOUNT (default "/") as one log line.
+# Exit codes: 0 ok|warn, 1 error|alert or when no reading could be taken.
+main() {
+  local mount="${1:-/}"
+  local df_out pct severity prio
+  # A failure to obtain a reading is reported at `err` instead of exiting
+  # silently: a monitor that dies without a log line is indistinguishable
+  # from one that never ran.
+  if ! df_out="$(df -P "$mount")" ||
+    ! pct="$(parse_df_percent "$df_out")" ||
+    ! severity="$(classify_threshold "$pct")" ||
+    ! prio="$(severity_priority "$severity")"; then
+    _emit user.err "disk-monitor mount=$mount severity=error reason=usage-unreadable"
+    return 1
+  fi
+  _emit "$prio" "disk-monitor mount=$mount used=${pct}% severity=$severity"
+  # Exit codes: 0 ok|warn, 1 error|alert (so timers can surface failures).
+  case "$severity" in
+    ok|warn) return 0 ;;
+    *) return 1 ;;
+  esac
+}
+
+# Only run main when executed directly (not when sourced by tests).
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  main "$@"
+fi
diff --git a/scripts/staging/test-disk-monitor.sh b/scripts/staging/test-disk-monitor.sh
new file mode 100755
index 00000000..e327cef8
--- /dev/null
+++ b/scripts/staging/test-disk-monitor.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+# test-disk-monitor.sh — unit tests for scripts/staging/disk-monitor.sh
+# (issue #1684). Pure bash, no external deps. Sources the script and
+# exercises its pure helpers against table-driven cases.
+#
+# Run: bash scripts/staging/test-disk-monitor.sh
+# Exits non-zero if any case fails.
+
+set -u
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# shellcheck source=disk-monitor.sh
+. "$SCRIPT_DIR/disk-monitor.sh"
+
+PASS=0
+FAIL=0
+
+# assert_eq LABEL EXPECTED ACTUAL
+# Counts a pass when EXPECTED equals ACTUAL; otherwise counts a failure and
+# prints the mismatch to stderr.
+assert_eq() {
+  local label="$1" expected="$2" actual="$3"
+  if [ "$expected" = "$actual" ]; then
+    PASS=$((PASS + 1))
+    # echo "PASS: $label"
+  else
+    FAIL=$((FAIL + 1))
+    echo "FAIL: $label — expected '$expected' got '$actual'" >&2
+  fi
+}
+
+# ----- classify_threshold ---------------------------------------------------
+# Spec from issue #1684: <80 ok ; >=80 warn ; >=90 error ; >=95 alert
+assert_eq "classify 0" "ok" "$(classify_threshold 0)"
+assert_eq "classify 50" "ok" "$(classify_threshold 50)"
+assert_eq "classify 79" "ok" "$(classify_threshold 79)"
+assert_eq "classify 80" "warn" "$(classify_threshold 80)"
+assert_eq "classify 85" "warn" "$(classify_threshold 85)"
+assert_eq "classify 89" "warn" "$(classify_threshold 89)"
+assert_eq "classify 90" "error" "$(classify_threshold 90)"
+assert_eq "classify 94" "error" "$(classify_threshold 94)"
+assert_eq "classify 95" "alert" "$(classify_threshold 95)"
+assert_eq "classify 100" "alert" "$(classify_threshold 100)"
+# Leading zeros are still a valid percentage.
+assert_eq "classify 007" "ok" "$(classify_threshold 007)"
+
+# Invalid inputs return non-zero (no echo expected).
+if classify_threshold "abc" >/dev/null 2>&1; then
+  FAIL=$((FAIL + 1))
+  echo "FAIL: classify 'abc' — expected non-zero exit" >&2
+else
+  PASS=$((PASS + 1))
+fi
+if classify_threshold 150 >/dev/null 2>&1; then
+  FAIL=$((FAIL + 1))
+  echo "FAIL: classify 150 — expected non-zero exit" >&2
+else
+  PASS=$((PASS + 1))
+fi
+# Regression guard: a digit string too long for shell integer comparison
+# must be rejected rather than falling through to "ok".
+if classify_threshold 99999999999999999999 >/dev/null 2>&1; then
+  FAIL=$((FAIL + 1))
+  echo "FAIL: classify 99999999999999999999 — expected non-zero exit" >&2
+else
+  PASS=$((PASS + 1))
+fi
+# Regression guard: parse_df_percent must never echo a non-integer.
+if parse_df_percent "a b c d x% /" >/dev/null 2>&1; then
+  FAIL=$((FAIL + 1))
+  echo "FAIL: parse_df 'x%' — expected non-zero exit" >&2
+else
+  PASS=$((PASS + 1))
+fi
+
+# ----- parse_df_percent -----------------------------------------------------
+# Simulates `df -P /` output. Use% column 5.
+DF_OK='Filesystem 1024-blocks Used Available Capacity Mounted on
+/dev/root 30401152 17040640 13360512 57% /'
+DF_HIGH='Filesystem 1024-blocks Used Available Capacity Mounted on
+/dev/root 30401152 29401152 1000000 97% /'
+DF_FULL='Filesystem 1024-blocks Used Available Capacity Mounted on
+/dev/root 30401152 30401152 0 100% /'
+
+assert_eq "parse_df 57%" "57" "$(parse_df_percent "$DF_OK")"
+assert_eq "parse_df 97%" "97" "$(parse_df_percent "$DF_HIGH")"
+assert_eq "parse_df 100%" "100" "$(parse_df_percent "$DF_FULL")"
+
+# Input that is not df output must be rejected (documented contract of
+# parse_df_percent: non-zero when column 5 is not a percentage).
+if parse_df_percent "not df output" >/dev/null 2>&1; then
+  FAIL=$((FAIL + 1))
+  echo "FAIL: parse_df 'not df output' — expected non-zero exit" >&2
+else
+  PASS=$((PASS + 1))
+fi
+
+# Pipeline: parse_df_percent | classify_threshold (the real call path).
+assert_eq "pipe 57->ok" "ok" "$(classify_threshold "$(parse_df_percent "$DF_OK")")"
+assert_eq "pipe 97->alert" "alert" "$(classify_threshold "$(parse_df_percent "$DF_HIGH")")"
+assert_eq "pipe 100->alert" "alert" "$(classify_threshold "$(parse_df_percent "$DF_FULL")")"
+
+# ----- severity_priority ----------------------------------------------------
+assert_eq "prio ok" "user.info" "$(severity_priority ok)"
+assert_eq "prio warn" "user.warning" "$(severity_priority warn)"
+assert_eq "prio error" "user.err" "$(severity_priority error)"
+# alert maps to syslog `alert` (severity 1), NOT `crit` (severity 2).
+# Regression guard for PR #1686 r1 adv #1: previously mapped to user.crit,
+# which silently downgraded the highest-severity tier.
+assert_eq "prio alert" "user.alert" "$(severity_priority alert)"
+
+# ----- disk-cleanup.sh /tmp pattern safety ----------------------------------
+# Regression guard for PR #1686 r1 adv #2: cleanup must NOT match a bare
+# `*.db` pattern in /tmp — that would nuke unrelated SQLite session files,
+# sqlite-pkg test outputs, and any debugging artifacts. Only named prefixes
+# (`staging-snap.*`, `cs-*`, `node-compile-cache`) are allowed.
+CLEANUP_SH="$SCRIPT_DIR/disk-cleanup.sh"
+if [ ! -f "$CLEANUP_SH" ]; then
+  # A missing script must fail loudly; silently skipping would turn this
+  # regression guard into a no-op.
+  FAIL=$((FAIL + 1))
+  echo "FAIL: $CLEANUP_SH not found — cannot verify /tmp patterns" >&2
+else
+  # Reject a bare '*.db' pattern anywhere on a line, single- or
+  # double-quoted. `-e` keeps grep from parsing the leading '-' of the
+  # pattern as an option.
+  if grep -Eq -e "-name[[:space:]]+[\"']\\*\\.db[\"']" "$CLEANUP_SH"; then
+    FAIL=$((FAIL + 1))
+    echo "FAIL: disk-cleanup.sh contains bare -name '*.db' (data-loss footgun)" >&2
+  else
+    PASS=$((PASS + 1))
+  fi
+  # Sanity: the named-prefix patterns we DO want must still be present.
+  for pat in "staging-snap.\*" "cs-\*" "node-compile-cache"; do
+    if grep -Eq -e "-name[[:space:]]+'${pat}'" "$CLEANUP_SH"; then
+      PASS=$((PASS + 1))
+    else
+      FAIL=$((FAIL + 1))
+      echo "FAIL: disk-cleanup.sh missing expected -name '${pat}' pattern" >&2
+    fi
+  done
+fi
+
+echo "----"
+echo "PASS=$PASS FAIL=$FAIL"
+[ "$FAIL" -eq 0 ]