Skip to content

Commit 182df3f

Browse files
committed
WIP: Manually create JUnit report
1 parent 6795660 commit 182df3f

File tree

6 files changed

+196
-140
lines changed

6 files changed

+196
-140
lines changed

.github/workflows/tests-performance.yml

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,6 @@ on:
88
- cron: '0 8 * * *' # Run at 8:00 daily
99
workflow_dispatch:
1010
push:
11-
branches:
12-
- main
13-
- dev/.*
14-
# pull_request:
15-
# paths:
16-
# - 'src/**'
17-
# - 'tests/**'
18-
# - 'dev_requirements.txt'
19-
# - 'pyproject.toml'
20-
# - '.github/workflows/tests-performance.yml'
2111

2212
jobs:
2313
test-performance:
@@ -38,25 +28,14 @@ jobs:
3828
3929
- name: Run performance tests
4030
run: |
41-
pytest --junitxml="test-results/test-performance.xml" tests/performance
42-
43-
- name: Report measurements
44-
uses: mikepenz/action-junit-report@v5
45-
if: always()
46-
with:
47-
check_name: 'Performance measurements'
48-
report_paths: "./test-results/test-performance.xml"
49-
detailed_summary: true
50-
include_passed: true
51-
include_time_in_summary: true
52-
resolve_ignore_classname: true
31+
pytest --junitxml=test-results/test-performance.xml tests/performance
5332
5433
- name: Performance validation
5534
uses: mikepenz/action-junit-report@v5
5635
if: always()
5736
with:
58-
check_name: 'Performance validation'
59-
report_paths: "./test-results/test-performance__validation.xml"
37+
check_name: Performance validation
38+
report_paths: ./test-results/test-performance__validation.xml
6039
detailed_summary: true
6140
include_passed: true
6241
include_time_in_summary: true

dev_requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
-e .
22
behave
3+
junit-xml
34
mock
45
pre-commit
56
pytest

test-performance__validation.xml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<?xml version="1.0" ?>
2+
<testsuites disabled="0" errors="0" failures="2" tests="2" time="2.9327545302003273">
3+
<testsuite disabled="0" errors="0" failures="2" name="BenchmarkResults" skipped="0" tests="2" time="2.9327545302003273">
4+
<testcase name="test_perf_output_format.test_perf_create_series_dataframe[200-50-100]" time="1.468609">
5+
<failure type="failure" message="max too big"/>
6+
</testcase>
7+
<testcase name="test_perf_output_format.test_perf_create_series_dataframe[50-200-100]" time="1.464145">
8+
<failure type="failure" message="max too big"/>
9+
</testcase>
10+
</testsuite>
11+
</testsuites>

tests/performance/conftest.py

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import json
22
import os
3-
import subprocess
4-
import sys
53
import tempfile
64
from io import BytesIO
75
from pathlib import Path
86

7+
8+
from .validation import generate_junit_report
9+
10+
911
if not os.getenv("BENCHMARK_VALIDATE_FILE"):
1012
# Create a temp dir for the benchmark results:
1113
tmp_dir = tempfile.mkdtemp(prefix="neptune-query-benchmark-")
@@ -31,36 +33,43 @@ def pytest_configure(config):
3133

3234

3335
def pytest_benchmark_update_json(config, benchmarks, output_json):
34-
with open(report_path, "w") as f:
35-
json.dump(output_json, f, indent=2)
3636
with open("benchmark_results.json", "w") as f:
3737
json.dump(output_json, f, indent=2)
3838

39+
if config.option.xmlpath:
40+
# For --junitxml = /path/abc.xml, create /path/abc__validation.xml
41+
path = Path(config.option.xmlpath)
42+
validation_report = path.with_stem(path.stem + "__validation")
43+
generate_junit_report("benchmark_results.json", validation_report)
3944

40-
def pytest_sessionfinish(session, exitstatus):
41-
try:
42-
if exitstatus != 0:
43-
return
44-
45-
if os.getenv("BENCHMARK_VALIDATE_FILE"):
46-
return
47-
48-
if os.getenv("BENCHMARK_NO_VALIDATION") == "1":
49-
return
50-
51-
# Rerun the tests in validation mode
52-
os.environ["BENCHMARK_VALIDATE_FILE"] = str(report_path)
53-
cp = subprocess.run(
54-
[sys.executable] + sys.argv + ["-W", "ignore::pytest_benchmark.logger.PytestBenchmarkWarning"]
55-
)
56-
session.exitstatus = cp.returncode
5745

58-
finally:
59-
try:
60-
os.unlink(report_path)
61-
except Exception:
62-
pass
63-
try:
64-
os.rmdir(tmp_dir)
65-
except Exception:
66-
pass
46+
# def pytest_sessionfinish(session, exitstatus):
47+
#
48+
# generate_junit_report("benchmark_results.json", "junit_report.xml")
49+
#
50+
# try:
51+
# if exitstatus != 0:
52+
# return
53+
#
54+
# if os.getenv("BENCHMARK_VALIDATE_FILE"):
55+
# return
56+
#
57+
# if os.getenv("BENCHMARK_NO_VALIDATION") == "1":
58+
# return
59+
#
60+
# # Rerun the tests in validation mode
61+
# os.environ["BENCHMARK_VALIDATE_FILE"] = str(report_path)
62+
# cp = subprocess.run(
63+
# [sys.executable] + sys.argv + ["-W", "ignore::pytest_benchmark.logger.PytestBenchmarkWarning"]
64+
# )
65+
# session.exitstatus = cp.returncode
66+
#
67+
# finally:
68+
# try:
69+
# os.unlink(report_path)
70+
# except Exception:
71+
# pass
72+
# try:
73+
# os.rmdir(tmp_dir)
74+
# except Exception:
75+
# pass

tests/performance/decorator.py

Lines changed: 86 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def wrapper(fn):
5959

6060
if case_param_keys != param_keys:
6161
raise ValueError(
62-
"All expected_benchmark decorators must have the same parameter keys."
62+
"All listed cases in expected_benchmark must have the same parameter keys."
6363
f"Expected {param_keys}, got {case_param_keys}"
6464
)
6565

@@ -73,93 +73,93 @@ def wrapper(fn):
7373
)
7474
)
7575

76-
if not os.getenv("BENCHMARK_VALIDATE_FILE"):
77-
pytest.mark.parametrize(
78-
",".join(param_keys),
79-
[spec.get_params_for_parametrize() for spec in specs],
80-
)(fn)
81-
return fn
82-
83-
performance_factor = float(os.getenv("BENCHMARK_PERFORMANCE_FACTOR", "1.0"))
84-
85-
@wraps(fn)
86-
def validation(*args, **kwargs):
87-
# Find the matching spec
88-
spec: PerformanceTestCaseSpec | None = None
89-
for case in specs:
90-
if all(kwargs.get(k) == v for k, v in case.params.items()):
91-
spec = case
92-
break
93-
94-
assert spec is not None, "No matching performance case found for the given parameters."
95-
96-
# Extract the actual parameters used in this test run
97-
if spec.min_p0 is None or spec.max_p80 is None or spec.max_p100 is None:
98-
warnings.warn("Benchmark thresholds not set, skipping validation.", category=UserWarning)
99-
return
100-
101-
perf_data = _get_benchmark_data()
102-
103-
assert spec.fn_name, spec.get_params_json() in perf_data
104-
stats = perf_data[spec.fn_name, spec.get_params_json()]
105-
106-
times = sorted(stats["data"])
107-
p0 = times[0]
108-
p80 = times[int(len(times) * 0.8)]
109-
p100 = times[-1]
110-
111-
adjusted_min_p0 = spec.min_p0 * performance_factor
112-
adjusted_max_p80 = spec.max_p80 * performance_factor
113-
adjusted_max_p100 = spec.max_p100 * performance_factor
114-
115-
p0_marker = "✓" if p0 >= adjusted_min_p0 else "✗"
116-
p80_marker = "✓" if p80 <= adjusted_max_p80 else "✗"
117-
p100_marker = "✓" if p100 <= adjusted_max_p100 else "✗"
118-
119-
params_human = ", ".join(f"{k}={v!r}" for k, v in spec.params.items())
120-
detailed_msg = f"""
121-
122-
Benchmark '{spec.fn_name}' with params {params_human} results:
123-
124-
{p0_marker} 0th percentile: {p0:.3f} s
125-
Unadjusted min_p0: {spec.min_p0:.3f} s
126-
Adjusted (*) min_p0: {adjusted_min_p0:.3f} s
127-
128-
{p80_marker} 80th percentile: {p80:.3f} s
129-
Unadjusted max_p80: {spec.max_p80:.3f} s
130-
Adjusted (*) max_p80: {adjusted_max_p80:.3f} s
131-
132-
{p100_marker} 100th percentile: {p100:.3f} s
133-
Unadjusted max_p100: {spec.max_p100:.3f} s
134-
Adjusted (*) max_p100: {adjusted_max_p100:.3f} s
135-
136-
(*) Use the environment variable "BENCHMARK_PERFORMANCE_FACTOR" to adjust the thresholds.
137-
138-
BENCHMARK_PERFORMANCE_FACTOR=1.0 (default) is meant to represent GitHub Actions performance.
139-
Decrease this factor if your local machine is faster than GitHub Actions.
140-
141-
"""
142-
143-
if performance_factor == 1.0:
144-
adjusted_min_p0_str = f"{adjusted_min_p0:.3f}"
145-
adjusted_max_p80_str = f"{adjusted_max_p80:.3f}"
146-
adjusted_max_p100_str = f"{adjusted_max_p100:.3f}"
147-
else:
148-
adjusted_min_p0_str = f"{adjusted_min_p0:.3f} (= {spec.min_p0:.3f} * {performance_factor})"
149-
adjusted_max_p80_str = f"{adjusted_max_p80:.3f} (= {spec.max_p80:.3f} * {performance_factor})"
150-
adjusted_max_p100_str = f"{adjusted_max_p100:.3f} (= {spec.max_p100:.3f} * {performance_factor})"
151-
152-
assert p0 >= adjusted_min_p0, f"p0 {p0:.3f} is less than expected {adjusted_min_p0_str}" + detailed_msg
153-
assert p80 <= adjusted_max_p80, f"p80 {p80:.3f} is more than expected {adjusted_max_p80_str}" + detailed_msg
154-
assert p100 <= adjusted_max_p100, (
155-
f"p100 {p100:.3f} is more than expected {adjusted_max_p100_str}" + detailed_msg
156-
)
157-
15876
pytest.mark.parametrize(
15977
",".join(param_keys),
16078
[spec.get_params_for_parametrize() for spec in specs],
161-
)(validation)
162-
163-
return validation
79+
)(fn)
80+
fn.__expected_benchmark_specs = specs
81+
return fn
82+
83+
# performance_factor = float(os.getenv("BENCHMARK_PERFORMANCE_FACTOR", "1.0"))
84+
#
85+
# @wraps(fn)
86+
# def validation(*args, **kwargs):
87+
# # Find the matching spec
88+
# spec: PerformanceTestCaseSpec | None = None
89+
# for case in specs:
90+
# if all(kwargs.get(k) == v for k, v in case.params.items()):
91+
# spec = case
92+
# break
93+
#
94+
# assert spec is not None, "No matching performance case found for the given parameters."
95+
#
96+
# # Extract the actual parameters used in this test run
97+
# if spec.min_p0 is None or spec.max_p80 is None or spec.max_p100 is None:
98+
# warnings.warn("Benchmark thresholds not set, skipping validation.", category=UserWarning)
99+
# return
100+
#
101+
# perf_data = _get_benchmark_data()
102+
#
103+
# assert spec.fn_name, spec.get_params_json() in perf_data
104+
# stats = perf_data[spec.fn_name, spec.get_params_json()]
105+
#
106+
# times = sorted(stats["data"])
107+
# p0 = times[0]
108+
# p80 = times[int(len(times) * 0.8)]
109+
# p100 = times[-1]
110+
#
111+
# adjusted_min_p0 = spec.min_p0 * performance_factor
112+
# adjusted_max_p80 = spec.max_p80 * performance_factor
113+
# adjusted_max_p100 = spec.max_p100 * performance_factor
114+
#
115+
# p0_marker = "✓" if p0 >= adjusted_min_p0 else "✗"
116+
# p80_marker = "✓" if p80 <= adjusted_max_p80 else "✗"
117+
# p100_marker = "✓" if p100 <= adjusted_max_p100 else "✗"
118+
#
119+
# params_human = ", ".join(f"{k}={v!r}" for k, v in spec.params.items())
120+
# detailed_msg = f"""
121+
#
122+
# Benchmark '{spec.fn_name}' with params {params_human} results:
123+
#
124+
# {p0_marker} 0th percentile: {p0:.3f} s
125+
# Unadjusted min_p0: {spec.min_p0:.3f} s
126+
# Adjusted (*) min_p0: {adjusted_min_p0:.3f} s
127+
#
128+
# {p80_marker} 80th percentile: {p80:.3f} s
129+
# Unadjusted max_p80: {spec.max_p80:.3f} s
130+
# Adjusted (*) max_p80: {adjusted_max_p80:.3f} s
131+
#
132+
# {p100_marker} 100th percentile: {p100:.3f} s
133+
# Unadjusted max_p100: {spec.max_p100:.3f} s
134+
# Adjusted (*) max_p100: {adjusted_max_p100:.3f} s
135+
#
136+
# (*) Use the environment variable "BENCHMARK_PERFORMANCE_FACTOR" to adjust the thresholds.
137+
#
138+
# BENCHMARK_PERFORMANCE_FACTOR=1.0 (default) is meant to represent GitHub Actions performance.
139+
# Decrease this factor if your local machine is faster than GitHub Actions.
140+
#
141+
# """
142+
#
143+
# if performance_factor == 1.0:
144+
# adjusted_min_p0_str = f"{adjusted_min_p0:.3f}"
145+
# adjusted_max_p80_str = f"{adjusted_max_p80:.3f}"
146+
# adjusted_max_p100_str = f"{adjusted_max_p100:.3f}"
147+
# else:
148+
# adjusted_min_p0_str = f"{adjusted_min_p0:.3f} (= {spec.min_p0:.3f} * {performance_factor})"
149+
# adjusted_max_p80_str = f"{adjusted_max_p80:.3f} (= {spec.max_p80:.3f} * {performance_factor})"
150+
# adjusted_max_p100_str = f"{adjusted_max_p100:.3f} (= {spec.max_p100:.3f} * {performance_factor})"
151+
#
152+
# assert p0 >= adjusted_min_p0, f"p0 {p0:.3f} is less than expected {adjusted_min_p0_str}" + detailed_msg
153+
# assert p80 <= adjusted_max_p80, f"p80 {p80:.3f} is more than expected {adjusted_max_p80_str}" + detailed_msg
154+
# assert p100 <= adjusted_max_p100, (
155+
# f"p100 {p100:.3f} is more than expected {adjusted_max_p100_str}" + detailed_msg
156+
# )
157+
#
158+
# pytest.mark.parametrize(
159+
# ",".join(param_keys),
160+
# [spec.get_params_for_parametrize() for spec in specs],
161+
# )(validation)
162+
#
163+
# return validation
164164

165165
return wrapper

tests/performance/validation.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import importlib
2+
import json
3+
from pathlib import Path
4+
5+
from junit_xml import TestSuite, TestCase, to_xml_report_file
6+
7+
from .decorator import PerformanceTestCaseSpec
8+
9+
10+
def get_benchmark_spec(benchmark):
    """Return the spec matching a single pytest-benchmark result entry.

    Resolves the test module and function from the benchmark's ``fullname``
    (e.g. ``tests/performance/test_x.py::test_fn[...]``), imports the module,
    and searches the spec list that the ``expected_benchmark`` decorator
    attached to the test function.

    Args:
        benchmark: one entry from the ``benchmarks`` list of a
            pytest-benchmark JSON report (must have ``fullname``, ``name``
            and ``params`` keys).

    Returns:
        The ``PerformanceTestCaseSpec`` whose params equal the benchmark's.

    Raises:
        ValueError: if no spec with matching params is found.
    """
    # "tests/performance/test_x.py::test_fn[params]" -> module path and bare fn name.
    # NOTE(review): assumes POSIX-style "/" separators in fullname — confirm on Windows.
    module_path = Path(benchmark["fullname"].split("::")[0])
    module_name = str(module_path.with_suffix("")).replace("/", ".")
    fn_name = benchmark["name"].split("[")[0]
    params = benchmark["params"]
    module = importlib.import_module(module_name)
    fn = getattr(module, fn_name)
    # Fix: the decorator stores a *list* of specs, not a single spec.
    specs: list[PerformanceTestCaseSpec] = fn.__expected_benchmark_specs
    for spec in specs:
        if spec.params == params:
            return spec
    raise ValueError(f"No matching spec found for benchmark {module_name}.{fn_name} with params {params}")
22+
23+
def generate_junit_report(benchmark_path, report_path):
    """Convert a pytest-benchmark JSON report into a JUnit XML report.

    Emits three test cases per benchmark — the p0, p80 and p100 percentiles
    of the measured times — and marks a case failed when its percentile
    violates the corresponding threshold of the matching spec.
    """
    with open(benchmark_path, "r") as f:
        data = json.load(f)

    cases = []
    for entry in data["benchmarks"]:
        # Human-readable dotted name, e.g. "test_x.test_fn[...]".
        label = entry["fullname"].replace("tests.performance.", "").replace(".py::", ".").replace("/", ".")
        spec = get_benchmark_spec(entry)

        samples = sorted(entry["stats"]["data"])
        percentiles = {
            "p0": samples[0],
            "p80": samples[int(len(samples) * 0.8)],
            "p100": samples[-1],
        }

        # One JUnit test case per percentile (insertion order: p0, p80, p100).
        per_case = {
            key: TestCase(name=f"{label}__{key}", elapsed_sec=value)
            for key, value in percentiles.items()
        }

        if spec.min_p0 is not None and percentiles["p0"] < spec.min_p0:
            per_case["p0"].add_failure_info("p0 too small")
        if spec.max_p80 is not None and percentiles["p80"] > spec.max_p80:
            per_case["p80"].add_failure_info("p80 too big")
        if spec.max_p100 is not None and percentiles["p100"] > spec.max_p100:
            per_case["p100"].add_failure_info("p100 too big")

        cases.extend(per_case.values())

    suite = TestSuite(name="BenchmarkResults", test_cases=cases)

    with open(report_path, "w") as out:
        to_xml_report_file(out, [suite], prettyprint=True)


if __name__ == "__main__":
    generate_junit_report("benchmark_results.json", "junit_report.xml")

0 commit comments

Comments
 (0)