# MeshChatX/.github/workflows/bench.yml

# Benchmarks and integrity checks (push to master/dev and workflow_dispatch).
# Results are stored in the GitHub Actions cache and compared on every run.
# A commit comment with the offending numbers is posted when any metric
# exceeds 200% of the stored baseline (alert-threshold), and the job fails
# when any metric exceeds 300% of it (fail-threshold).
#
# Pinned actions, first- and third-party (bump tag and SHA together when upgrading):
#   actions/checkout@v6.0.1            8e8c483db84b4bee98b60c0593521ed34d9990e8
#   actions/setup-python@v6.2.0        a309ff8b426b58ec0e2a45f0f869d46889d02405
#   actions/setup-node@v6.1.0          395ad3262231945c25e8478fd5baf05154b1d79f
#   actions/cache@v4.2.0               1bd1e32a3bdc45362d1e726936510720a7c30a57
#   benchmark-action/github-action-benchmark@v1.22.0
#                                      a60cea5bc7b49e15c1f58f411161f99e0df48372

# Display name shown for this workflow in the Actions tab.
name: Benchmarks

# Triggers: pushes to the long-lived branches, plus manual runs.
on:
    push:
        branches: [master, dev]
    workflow_dispatch: {}

# At most one active run per workflow and ref: a newer run cancels the one
# still in progress for the same ref.
concurrency:
    cancel-in-progress: true
    group: ${{ github.workflow }}-${{ github.ref }}

# Least-privilege GITHUB_TOKEN scopes.
# contents: write is kept for the benchmark step, which posts a commit comment
# when an alert fires. This workflow only runs on push and workflow_dispatch
# (never on pull_request), so no pull-requests scope is granted.
permissions:
    contents: write

# Workflow-wide settings and toolchain pins, visible to every step.
env:
    # Opt JavaScript-based actions into the Node 24 runtime.
    FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"

    # Python toolchain.
    PYTHON_VERSION: "3.14"
    POETRY_VERSION: "2.3.4"

    # Node toolchain.
    NODE_VERSION: "24"
    PNPM_VERSION: "10.33.0"

jobs:
    # Single job: set up the Python and Node toolchains, run the backend
    # benchmarks and integrity tests, then compare the benchmark numbers
    # against the baseline kept in the Actions cache.
    bench:
        runs-on: ubuntu-latest
        timeout-minutes: 60
        steps:
            - name: Checkout
              uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8

            - name: Set up Python
              uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405
              with:
                  python-version: ${{ env.PYTHON_VERSION }}

            # NOTE(review): the script presumably installs the Poetry release
            # named by POETRY_VERSION - confirm against the script.
            - name: Install Poetry (PyPI pin)
              env:
                  POETRY_VERSION: ${{ env.POETRY_VERSION }}
              run: bash scripts/ci/github-install-poetry.sh

            - name: Cache Poetry downloads
              uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57
              with:
                  path: ~/.cache/pypoetry
                  key: ${{ runner.os }}-pypoetry-${{ hashFiles('poetry.lock') }}
                  restore-keys: |
                      ${{ runner.os }}-pypoetry-

            # pnpm is activated before setup-node because setup-node's
            # `cache: pnpm` option needs the pnpm binary to be available.
            - name: Enable pnpm (corepack)
              run: corepack enable && corepack prepare "pnpm@${PNPM_VERSION}" --activate

            - name: Set up Node
              uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f
              with:
                  node-version: ${{ env.NODE_VERSION }}
                  cache: pnpm
                  cache-dependency-path: pnpm-lock.yaml

            - name: Install dependencies
              run: bash scripts/ci/github-install-deps.sh

            - name: Setup Task
              run: sh scripts/ci/setup-task.sh

            # Cache entries are immutable: an exact key hit skips the post-job
            # save, so a fixed key would freeze the baseline at the first run.
            # A per-run key never hits exactly; the newest earlier entry is
            # restored through restore-keys, and the data file updated by the
            # benchmark step is saved under the new key when the job succeeds.
            # The broad second restore key also matches entries written under
            # the previous fixed key, so the existing baseline is not lost.
            - name: Restore benchmark baseline cache
              uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57
              with:
                  path: ./cache
                  key: ${{ runner.os }}-bench-baseline-${{ github.ref_name }}-${{ github.run_id }}
                  restore-keys: |
                      ${{ runner.os }}-bench-baseline-${{ github.ref_name }}-
                      ${{ runner.os }}-bench-baseline-

            # pipefail keeps a failing benchmark run from being masked by the
            # exit status of tee.
            - name: Run benchmarks
              run: |
                  set -euo pipefail
                  poetry run python tests/backend/run_comprehensive_benchmarks.py \
                      --json-output bench_results.json 2>&1 | tee bench_results.txt

            # Appends to the same text log produced by the benchmark step.
            - name: Run integrity tests
              run: |
                  set -euo pipefail
                  task test:integrity 2>&1 | tee -a bench_results.txt

            - name: Store and compare benchmark results
              uses: benchmark-action/github-action-benchmark@a60cea5bc7b49e15c1f58f411161f99e0df48372
              with:
                  name: MeshChatX Backend Benchmarks
                  tool: customSmallerIsBetter
                  output-file-path: bench_results.json
                  # Baseline lives in the directory restored by the cache step.
                  external-data-json-path: ./cache/benchmark-data.json
                  github-token: ${{ secrets.GITHUB_TOKEN }}
                  # GitHub shared runners have 20-40% variance even with identical
                  # code. alert-threshold posts a comment; fail-threshold fails
                  # the job (only because fail-on-alert is enabled). Sub-ms
                  # operations are especially noisy so we keep the comment bar
                  # at 2x and the hard-fail bar at 3x.
                  alert-threshold: "200%"
                  fail-threshold: "300%"
                  fail-on-alert: true
                  comment-on-alert: true
                  summary-always: true