feat(benchmarks): update benchmark workflow with JSON output and caching for results comparison

2026-05-29 16:54:00 +00:00 · 2026-04-30 12:15:08 -05:00
parent c8a1ce6240
commit 70cf79d768
2 changed files with 55 additions and 6 deletions
@@ -1,10 +1,15 @@
 # Benchmarks and integrity checks (push to main branches and workflow_dispatch).
+# Results are persisted with actions/cache and compared on every run; the job
+# fails when any metric exceeds 150% (1.5x) of the previously stored value, and
+# a commit comment is posted with the offending numbers.
 #
 # Pinned first-party actions (bump tag and SHA together when upgrading):
 #   actions/checkout@v6.0.1            8e8c483db84b4bee98b60c0593521ed34d9990e8
 #   actions/setup-python@v6.2.0        a309ff8b426b58ec0e2a45f0f869d46889d02405
 #   actions/setup-node@v6.1.0          395ad3262231945c25e8478fd5baf05154b1d79f
 #   actions/cache@v4.2.0               1bd1e32a3bdc45362d1e726936510720a7c30a57
+#   benchmark-action/github-action-benchmark@v1.22.0
+#                                      a60cea5bc7b49e15c1f58f411161f99e0df48372

 name: Benchmarks

@@ -20,7 +25,7 @@ concurrency:
    cancel-in-progress: true

 permissions:
-    contents: read
+    contents: write

 env:
    FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
@@ -71,12 +76,34 @@ jobs:
            - name: Setup Task
              run: sh scripts/ci/setup-task.sh

+            - name: Restore benchmark baseline cache
+              uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57
+              with:
+                  path: ./cache  # holds benchmark-data.json between runs
+                  key: ${{ runner.os }}-bench-baseline-${{ github.ref_name }}-${{ github.run_id }}  # unique per run: an exact key hit is never re-saved
+                  restore-keys: |  # prefix match restores the most recent earlier baseline
+                      ${{ runner.os }}-bench-baseline-
+
            - name: Run benchmarks
              run: |
                  set -euo pipefail
-                  task bench 2>&1 | tee bench_results.txt
+                  poetry run python tests/backend/run_comprehensive_benchmarks.py \
+                      --json-output bench_results.json 2>&1 | tee bench_results.txt

            - name: Run integrity tests
              run: |
                  set -euo pipefail
                  task test:integrity 2>&1 | tee -a bench_results.txt
+
+            - name: Store and compare benchmark results
+              uses: benchmark-action/github-action-benchmark@a60cea5bc7b49e15c1f58f411161f99e0df48372
+              with:
+                  name: MeshChatX Backend Benchmarks
+                  tool: customSmallerIsBetter  # lower values are better
+                  output-file-path: bench_results.json  # written by the "Run benchmarks" step via --json-output
+                  external-data-json-path: ./cache/benchmark-data.json  # stored baseline, kept under the cached ./cache path
+                  github-token: ${{ secrets.GITHUB_TOKEN }}
+                  alert-threshold: "150%"
+                  fail-on-alert: true  # fail the job when the threshold is exceeded
+                  comment-on-alert: true  # post a commit comment with the offending numbers
+                  summary-always: true
@@ -35,7 +35,7 @@ class BackendBenchmarker:
        self.db.close()
        shutil.rmtree(self.temp_dir)

-    def run_all(self, extreme=False):
+    def run_all(self, extreme=False, json_output_path=None):
        print(f"\n{'=' * 20} BACKEND BENCHMARKING START {'=' * 20}")
        print(f"Mode: {'EXTREME (Breaking Space)' if extreme else 'Standard'}")
        print(f"Base Memory: {get_memory_usage_mb():.2f} MB")
@@ -53,7 +53,7 @@ class BackendBenchmarker:

        self.bench_telephony_operations()

-        self.print_summary()
+        self.print_summary(json_output_path=json_output_path)

    def bench_extreme_message_flood(self):
        """Insert 100,000 messages with large randomized content."""
@@ -305,7 +305,7 @@ class BackendBenchmarker:
        _, res = log_call()
        self.results.append(res)

-    def print_summary(self):
+    def print_summary(self, json_output_path=None):
        print(f"\n{'=' * 20} BENCHMARK SUMMARY {'=' * 20}")
        print(f"{'Benchmark Name':40} | {'Avg Time':10} | {'Mem Delta':10}")
        print(f"{'-' * 40}-|-{'-' * 10}-|-{'-' * 10}")
@@ -316,6 +316,22 @@ class BackendBenchmarker:
        print(f"{'=' * 59}")
        print(f"Final Memory Usage: {get_memory_usage_mb():.2f} MB")

+        if json_output_path:
+            import json as _json
+
+            entries = [
+                {
+                    "name": r.name,
+                    "unit": "ms",
+                    "value": round(r.duration_ms, 3),
+                    "extra": f"Memory delta: {r.memory_delta_mb:.2f} MB",
+                }
+                for r in self.results
+            ]
+            with open(json_output_path, "w") as f:
+                _json.dump(entries, f, indent=2)
+            print(f"Benchmark JSON written to {json_output_path}")
+

 if __name__ == "__main__":
    import argparse
@@ -326,10 +342,16 @@ if __name__ == "__main__":
        action="store_true",
        help="Run extreme stress tests",
    )
+    parser.add_argument(
+        "--json-output",
+        metavar="PATH",
+        default=None,
+        help="Write benchmark results as github-action-benchmark customSmallerIsBetter JSON to PATH",
+    )
    args = parser.parse_args()

    bench = BackendBenchmarker()
    try:
-        bench.run_all(extreme=args.extreme)
+        bench.run_all(extreme=args.extreme, json_output_path=args.json_output)
    finally:
        bench.cleanup()