Skip to content

Commit 978e01e

Browse files
authored
Improve output of performance tests (#96)
* Improve output of performance tests Both in CLI and in GitHub JUnit report * Don't spit out package path * Revert workflow triggers for tests-performance
1 parent d1be7c8 commit 978e01e

File tree

5 files changed

+301
-168
lines changed

5 files changed

+301
-168
lines changed

.github/workflows/tests-performance.yml

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,26 +38,16 @@ jobs:
3838
3939
- name: Run performance tests
4040
run: |
41-
pytest --junitxml="test-results/test-performance.xml" tests/performance
42-
43-
- name: Report measurements
44-
uses: mikepenz/action-junit-report@v5
45-
if: always()
46-
with:
47-
check_name: 'Performance measurements'
48-
report_paths: "./test-results/test-performance.xml"
49-
detailed_summary: true
50-
include_passed: true
51-
include_time_in_summary: true
52-
resolve_ignore_classname: true
41+
pytest --junitxml=test-results/test-performance.xml tests/performance
5342
5443
- name: Performance validation
5544
uses: mikepenz/action-junit-report@v5
5645
if: always()
5746
with:
58-
check_name: 'Performance validation'
59-
report_paths: "./test-results/test-performance__validation.xml"
47+
check_name: Performance validation
48+
report_paths: ./test-results/test-performance.xml
6049
detailed_summary: true
6150
include_passed: true
6251
include_time_in_summary: true
6352
resolve_ignore_classname: true
53+
fail_on_failure: true

dev_requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
-e .
22
behave
3+
junit-xml
34
mock
45
pre-commit
56
pytest
@@ -9,6 +10,7 @@ pytest-retry
910
pytest-timeout
1011
pytest-xdist
1112
icecream
13+
tabulate
1214
git+https://github.com/neptune-ai/neptune-client-scale.git@main#egg=neptune-scale
1315
fastapi == 0.116.2
1416
uvicorn == 0.35.0

tests/performance/conftest.py

Lines changed: 45 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,66 +1,66 @@
11
import json
22
import os
3-
import subprocess
4-
import sys
53
import tempfile
64
from io import BytesIO
75
from pathlib import Path
86

9-
if not os.getenv("BENCHMARK_VALIDATE_FILE"):
10-
# Create a temp dir for the benchmark results:
11-
tmp_dir = tempfile.mkdtemp(prefix="neptune-query-benchmark-")
12-
report_path = Path(tmp_dir) / "benchmark.json"
7+
from .validation import (
8+
generate_junit_report,
9+
generate_text_report,
10+
)
11+
12+
# Create a temp dir for the benchmark results:
13+
tmp_dir = tempfile.mkdtemp(prefix="neptune-query-benchmark-")
14+
benchmark_json_path = Path(tmp_dir) / "benchmark.json"
15+
16+
17+
def cleanup():
18+
try:
19+
os.unlink(benchmark_json_path)
20+
except Exception:
21+
pass
22+
try:
23+
os.rmdir(tmp_dir)
24+
except Exception:
25+
pass
1326

1427

1528
def pytest_configure(config):
16-
if not os.getenv("BENCHMARK_VALIDATE_FILE"):
17-
# Perform at least 15 rounds per test
18-
# Testing at least for 10 seconds per test
19-
config.option.benchmark_min_rounds = 15
20-
config.option.benchmark_max_time = 10.0
21-
config.option.benchmark_disable_gc = True
22-
config.option.benchmark_time_unit = "ms"
23-
config.option.benchmark_sort = "name"
24-
config.option.benchmark_json = BytesIO()
25-
config.option.junitxml = "benchmark_measurement.xml"
26-
else:
27-
if config.option.xmlpath:
28-
# For --junitxml = /path/abc.xml, create /path/abc__validation.xml
29-
path = Path(config.option.xmlpath)
30-
config.option.xmlpath = str(path.with_stem(path.stem + "__validation"))
29+
# Perform at least 15 rounds per test
30+
# Testing at least for 10 seconds per test
31+
config.option.benchmark_min_rounds = 15
32+
config.option.benchmark_max_time = 10.0
33+
config.option.benchmark_disable_gc = True
34+
config.option.benchmark_time_unit = "ms"
35+
config.option.benchmark_sort = "name"
36+
config.option.benchmark_json = BytesIO()
37+
config.option.benchmark_quiet = True
38+
39+
config.option.original_xmlpath = config.option.xmlpath
40+
config.option.xmlpath = None
3141

3242

3343
def pytest_benchmark_update_json(config, benchmarks, output_json):
34-
with open(report_path, "w") as f:
44+
with open(benchmark_json_path, "w") as f:
3545
json.dump(output_json, f, indent=2)
46+
3647
with open("benchmark_results.json", "w") as f:
3748
json.dump(output_json, f, indent=2)
3849

3950

40-
def pytest_sessionfinish(session, exitstatus):
51+
def pytest_terminal_summary(terminalreporter, exitstatus, config):
4152
try:
42-
if exitstatus != 0:
43-
return
44-
45-
if os.getenv("BENCHMARK_VALIDATE_FILE"):
46-
return
47-
48-
if os.getenv("BENCHMARK_NO_VALIDATION") == "1":
49-
return
53+
# Print a report to the terminal
54+
msg = generate_text_report(benchmark_json_path)
55+
terminalreporter.ensure_newline()
56+
terminalreporter.write(msg)
57+
terminalreporter.ensure_newline()
5058

51-
# Rerun the tests in validation mode
52-
os.environ["BENCHMARK_VALIDATE_FILE"] = str(report_path)
53-
cp = subprocess.run(
54-
[sys.executable] + sys.argv + ["-W", "ignore::pytest_benchmark.logger.PytestBenchmarkWarning"]
55-
)
56-
session.exitstatus = cp.returncode
59+
# And save a nice JUnit XML
60+
if config.option.original_xmlpath:
61+
path = Path(config.option.original_xmlpath)
62+
path.parent.mkdir(parents=True, exist_ok=True)
63+
generate_junit_report(benchmark_json_path, path)
5764

5865
finally:
59-
try:
60-
os.unlink(report_path)
61-
except Exception:
62-
pass
63-
try:
64-
os.rmdir(tmp_dir)
65-
except Exception:
66-
pass
66+
cleanup()

tests/performance/decorator.py

Lines changed: 9 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,10 @@
11
import json
2-
import os
3-
import warnings
42
from dataclasses import dataclass
5-
from functools import (
6-
cache,
7-
wraps,
8-
)
93
from typing import Any
104

115
import pytest
126

137

14-
@cache
15-
def _get_benchmark_data() -> dict[tuple[str, str], dict[str, Any]]:
16-
benchmark_output_file = os.getenv("BENCHMARK_VALIDATE_FILE")
17-
if benchmark_output_file is None:
18-
raise RuntimeError("Environment variable BENCHMARK_VALIDATE_FILE is not set.")
19-
20-
stats = {}
21-
with open(benchmark_output_file) as f:
22-
data = json.load(f)
23-
for benchmark in data["benchmarks"]:
24-
name = benchmark["name"].split("[")[0] # Remove params from the name
25-
params = json.dumps(benchmark["params"], sort_keys=True)
26-
stats[name, params] = benchmark["stats"]
27-
28-
return stats
29-
30-
318
@dataclass
329
class PerformanceTestCaseSpec:
3310
fn_name: str
@@ -44,6 +21,11 @@ def get_params_for_parametrize(self):
4421
def get_params_json(self):
4522
return json.dumps(self.params, sort_keys=True)
4623

24+
def get_params_human(self):
25+
if all(type(value) in [float, int] for value in self.params.values()):
26+
return ", ".join(f"{key}={value}" for key, value in sorted(self.params.items()))
27+
return self.get_params_json()
28+
4729

4830
def expected_benchmark(*multiple_cases: dict, **single_case: dict):
4931
def wrapper(fn):
@@ -59,7 +41,7 @@ def wrapper(fn):
5941

6042
if case_param_keys != param_keys:
6143
raise ValueError(
62-
"All expected_benchmark decorators must have the same parameter keys."
44+
"All listed cases in expected_benchmark must have the same parameter keys."
6345
f"Expected {param_keys}, got {case_param_keys}"
6446
)
6547

@@ -73,93 +55,11 @@ def wrapper(fn):
7355
)
7456
)
7557

76-
if not os.getenv("BENCHMARK_VALIDATE_FILE"):
77-
pytest.mark.parametrize(
78-
",".join(param_keys),
79-
[spec.get_params_for_parametrize() for spec in specs],
80-
)(fn)
81-
return fn
82-
83-
performance_factor = float(os.getenv("BENCHMARK_PERFORMANCE_FACTOR", "1.0"))
84-
85-
@wraps(fn)
86-
def validation(*args, **kwargs):
87-
# Find the matching spec
88-
spec: PerformanceTestCaseSpec | None = None
89-
for case in specs:
90-
if all(kwargs.get(k) == v for k, v in case.params.items()):
91-
spec = case
92-
break
93-
94-
assert spec is not None, "No matching performance case found for the given parameters."
95-
96-
# Extract the actual parameters used in this test run
97-
if spec.min_p0 is None or spec.max_p80 is None or spec.max_p100 is None:
98-
warnings.warn("Benchmark thresholds not set, skipping validation.", category=UserWarning)
99-
return
100-
101-
perf_data = _get_benchmark_data()
102-
103-
assert spec.fn_name, spec.get_params_json() in perf_data
104-
stats = perf_data[spec.fn_name, spec.get_params_json()]
105-
106-
times = sorted(stats["data"])
107-
p0 = times[0]
108-
p80 = times[int(len(times) * 0.8)]
109-
p100 = times[-1]
110-
111-
adjusted_min_p0 = spec.min_p0 * performance_factor
112-
adjusted_max_p80 = spec.max_p80 * performance_factor
113-
adjusted_max_p100 = spec.max_p100 * performance_factor
114-
115-
p0_marker = "✓" if p0 >= adjusted_min_p0 else "✗"
116-
p80_marker = "✓" if p80 <= adjusted_max_p80 else "✗"
117-
p100_marker = "✓" if p100 <= adjusted_max_p100 else "✗"
118-
119-
params_human = ", ".join(f"{k}={v!r}" for k, v in spec.params.items())
120-
detailed_msg = f"""
121-
122-
Benchmark '{spec.fn_name}' with params {params_human} results:
123-
124-
{p0_marker} 0th percentile: {p0:.3f} s
125-
Unadjusted min_p0: {spec.min_p0:.3f} s
126-
Adjusted (*) min_p0: {adjusted_min_p0:.3f} s
127-
128-
{p80_marker} 80th percentile: {p80:.3f} s
129-
Unadjusted max_p80: {spec.max_p80:.3f} s
130-
Adjusted (*) max_p80: {adjusted_max_p80:.3f} s
131-
132-
{p100_marker} 100th percentile: {p100:.3f} s
133-
Unadjusted max_p100: {spec.max_p100:.3f} s
134-
Adjusted (*) max_p100: {adjusted_max_p100:.3f} s
135-
136-
(*) Use the environment variable "BENCHMARK_PERFORMANCE_FACTOR" to adjust the thresholds.
137-
138-
BENCHMARK_PERFORMANCE_FACTOR=1.0 (default) is meant to represent GitHub Actions performance.
139-
Decrease this factor if your local machine is faster than GitHub Actions.
140-
141-
"""
142-
143-
if performance_factor == 1.0:
144-
adjusted_min_p0_str = f"{adjusted_min_p0:.3f}"
145-
adjusted_max_p80_str = f"{adjusted_max_p80:.3f}"
146-
adjusted_max_p100_str = f"{adjusted_max_p100:.3f}"
147-
else:
148-
adjusted_min_p0_str = f"{adjusted_min_p0:.3f} (= {spec.min_p0:.3f} * {performance_factor})"
149-
adjusted_max_p80_str = f"{adjusted_max_p80:.3f} (= {spec.max_p80:.3f} * {performance_factor})"
150-
adjusted_max_p100_str = f"{adjusted_max_p100:.3f} (= {spec.max_p100:.3f} * {performance_factor})"
151-
152-
assert p0 >= adjusted_min_p0, f"p0 {p0:.3f} is less than expected {adjusted_min_p0_str}" + detailed_msg
153-
assert p80 <= adjusted_max_p80, f"p80 {p80:.3f} is more than expected {adjusted_max_p80_str}" + detailed_msg
154-
assert p100 <= adjusted_max_p100, (
155-
f"p100 {p100:.3f} is more than expected {adjusted_max_p100_str}" + detailed_msg
156-
)
157-
15858
pytest.mark.parametrize(
15959
",".join(param_keys),
16060
[spec.get_params_for_parametrize() for spec in specs],
161-
)(validation)
162-
163-
return validation
61+
)(fn)
62+
fn.__expected_benchmark_specs = specs
63+
return fn
16464

16565
return wrapper

0 commit comments

Comments
 (0)