Skip to content

Commit c356de7

Browse files
authored
Performance tests (#70)
* Performance tests * Parametrized performance tests plus BENCHMARK_PERFORMANCE_FACTOR * Remove warning supression * Adjust the expected thresholds * Rename the util module to decorator * More updates and fixes to perfomance testing * Fix parametrized performance benchmarks * More verbose messages * Properly silence pytest-benchmark warning on the validation phase * Fix aligmnment of detailed message to better work in GH UI * More beauty * Adjust thresholds * More updates * Always show the benchmark summary * Rename step to "Run performance tests" * Adjust meta params for the benchmarks * Fix format * Update docstrings * Add test for the perfomance issue introduced in nq 1.4.0 * Change the way to specify multiple cases to @expected_benchmark * Rename PerfomanceCase to PerformanceTestCaseSpec * Fail tests properly * Reformat
1 parent 7a0859a commit c356de7

File tree

8 files changed

+522
-0
lines changed

8 files changed

+522
-0
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
name: Performance tests

on:
  schedule:
    - cron: '0 8 * * *' # Run at 8:00 daily
  workflow_dispatch:
  push:
    branches:
      - main
      # NOTE: branch filters are glob patterns, not regexes. The previous
      # 'dev/.*' only matched branches literally starting with "dev/." —
      # 'dev/*' matches any branch under dev/.
      - dev/*
  pull_request:
    paths:
      - 'src/**'
      - 'tests/**'
      - 'dev_requirements.txt'
      - 'pyproject.toml'
      - '.github/workflows/tests-performance.yml'

jobs:
  test:
    runs-on: tools-gha-runners
    timeout-minutes: 30
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip &&
          pip install -r dev_requirements.txt

      - name: Run performance tests
        run: |
          pytest --junitxml="test-results/test-performance.xml" tests/performance

      - name: Report
        uses: mikepenz/action-junit-report@v5
        if: always()
        with:
          report_paths: "./test-results/test-performance*.xml"
          update_check: true
          annotate_notice: true
          job_name: "Performance tests"

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,7 @@ stream.bin
129129
MagicMock/
130130

131131
poetry.lock
132+
133+
# pytest-benchmark
134+
.benchmarks
135+
benchmark_results.json

dev_requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ behave
33
mock
44
pre-commit
55
pytest
6+
pytest-benchmark
67
pytest-mock
78
pytest-retry
89
pytest-timeout

tests/performance/__init__.py

Whitespace-only changes.

tests/performance/conftest.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import json
2+
import os
3+
import subprocess
4+
import sys
5+
import tempfile
6+
from io import BytesIO
7+
from pathlib import Path
8+
9+
# Measurement phase only (BENCHMARK_VALIDATE_FILE unset): allocate a temp
# location for the benchmark JSON report that the validation rerun will read.
# In validation mode these names are intentionally left undefined.
if not os.getenv("BENCHMARK_VALIDATE_FILE"):
    # Create a temp dir for the benchmark results:
    tmp_dir = tempfile.mkdtemp(prefix="neptune-query-benchmark-")
    report_path = Path(tmp_dir) / "benchmark.json"
13+
14+
15+
def pytest_configure(config):
    """Apply pytest-benchmark settings for the measurement phase.

    Skipped entirely when BENCHMARK_VALIDATE_FILE is set (the validation
    rerun must not re-measure).
    """
    if os.getenv("BENCHMARK_VALIDATE_FILE"):
        return

    options = config.option
    # Perform at least 15 rounds per test, sampling for at least 10 seconds.
    options.benchmark_min_rounds = 15
    options.benchmark_max_time = 10.0
    options.benchmark_disable_gc = True
    options.benchmark_time_unit = "ms"
    options.benchmark_sort = "name"
    # Request JSON output so pytest_benchmark_update_json gets invoked; the
    # in-memory buffer itself is never read back.
    options.benchmark_json = BytesIO()
25+
26+
27+
def pytest_benchmark_update_json(config, benchmarks, output_json):
    """Persist the benchmark report to both output locations.

    Writes the same JSON to the temp report (consumed by the validation
    rerun) and to benchmark_results.json in the working directory.
    """
    for destination in (report_path, "benchmark_results.json"):
        with open(destination, "w") as handle:
            json.dump(output_json, handle, indent=2)
32+
33+
34+
def pytest_sessionfinish(session, exitstatus):
    """After a successful measurement run, re-invoke pytest in validation mode.

    The rerun gets BENCHMARK_VALIDATE_FILE pointing at the JSON report written
    during this session, so the @expected_benchmark wrappers validate the
    recorded timings instead of measuring. The rerun's exit code becomes this
    session's exit status. Temp files are cleaned up in all cases.
    """
    try:
        # Only validate if the measurement run itself passed.
        if exitstatus != 0:
            return

        # We ARE the validation rerun — do not recurse.
        if os.getenv("BENCHMARK_VALIDATE_FILE"):
            return

        # Explicit opt-out of the validation phase.
        if os.getenv("BENCHMARK_NO_VALIDATION") == "1":
            return

        # Rerun the tests in validation mode
        os.environ["BENCHMARK_VALIDATE_FILE"] = str(report_path)
        cp = subprocess.run(
            [sys.executable] + sys.argv + ["-W", "ignore::pytest_benchmark.logger.PytestBenchmarkWarning"]
        )
        # Propagate the validation result as the overall result.
        session.exitstatus = cp.returncode

    finally:
        # Best-effort cleanup of the temp report and its directory; both may
        # legitimately not exist (e.g. in the validation rerun itself).
        try:
            os.unlink(report_path)
        except Exception:
            pass
        try:
            os.rmdir(tmp_dir)
        except Exception:
            pass

tests/performance/decorator.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
import json
2+
import os
3+
import warnings
4+
from dataclasses import dataclass
5+
from functools import (
6+
cache,
7+
wraps,
8+
)
9+
from typing import Any
10+
11+
import pytest
12+
13+
14+
@cache
15+
def _get_benchmark_data() -> dict[tuple[str, str], dict[str, Any]]:
16+
benchmark_output_file = os.getenv("BENCHMARK_VALIDATE_FILE")
17+
if benchmark_output_file is None:
18+
raise RuntimeError("Environment variable BENCHMARK_VALIDATE_FILE is not set.")
19+
20+
stats = {}
21+
with open(benchmark_output_file) as f:
22+
data = json.load(f)
23+
for benchmark in data["benchmarks"]:
24+
name = benchmark["name"].split("[")[0] # Remove params from the name
25+
params = json.dumps(benchmark["params"], sort_keys=True)
26+
stats[name, params] = benchmark["stats"]
27+
28+
return stats
29+
30+
31+
@dataclass
32+
class PerformanceTestCaseSpec:
33+
fn_name: str
34+
params: dict[str, Any]
35+
min_p0: float | None
36+
max_p80: float | None
37+
max_p100: float | None
38+
39+
def get_params_for_parametrize(self):
40+
if len(self.params) == 1:
41+
return list(self.params.values())[0]
42+
return tuple(self.params.values())
43+
44+
def get_params_json(self):
45+
return json.dumps(self.params, sort_keys=True)
46+
47+
48+
def expected_benchmark(*multiple_cases: dict, **single_case: dict):
    """Decorator declaring expected performance thresholds for a benchmark test.

    Usage:
        @expected_benchmark(n=100, min_p0=..., max_p80=..., max_p100=...)   # one case
        @expected_benchmark({"n": 100, ...}, {"n": 1000, ...})              # several cases

    Keys other than min_p0 / max_p80 / max_p100 become pytest parametrize
    parameters; every case must use the same parameter keys.

    In measurement mode (BENCHMARK_VALIDATE_FILE unset) the test function is
    parametrized and returned unchanged. In validation mode it is replaced by
    a wrapper that checks the recorded timings against the thresholds, scaled
    by BENCHMARK_PERFORMANCE_FACTOR.
    """

    def wrapper(fn):
        specs = []
        param_keys: set = set()

        all_cases = multiple_cases or [single_case]

        for case in all_cases:
            case_param_keys = {k for k in case.keys() if k not in ("min_p0", "max_p80", "max_p100")}
            if not param_keys:
                param_keys = case_param_keys

            if case_param_keys != param_keys:
                raise ValueError(
                    "All expected_benchmark decorators must have the same parameter keys. "
                    f"Expected {param_keys}, got {case_param_keys}"
                )

            specs.append(
                PerformanceTestCaseSpec(
                    fn_name=fn.__name__,
                    params={k: case[k] for k in param_keys},
                    min_p0=case.get("min_p0"),
                    max_p80=case.get("max_p80"),
                    max_p100=case.get("max_p100"),
                )
            )

        if not os.getenv("BENCHMARK_VALIDATE_FILE"):
            # Measurement phase: parametrize and run the original test as-is.
            pytest.mark.parametrize(
                ",".join(param_keys),
                [spec.get_params_for_parametrize() for spec in specs],
            )(fn)
            return fn

        performance_factor = float(os.getenv("BENCHMARK_PERFORMANCE_FACTOR", "1.0"))

        @wraps(fn)
        def validation(*args, **kwargs):
            # Find the matching spec
            spec: PerformanceTestCaseSpec | None = None
            for case in specs:
                if all(kwargs.get(k) == v for k, v in case.params.items()):
                    spec = case
                    break

            assert spec is not None, "No matching performance case found for the given parameters."

            # Thresholds are optional — without them we only measure.
            if spec.min_p0 is None or spec.max_p80 is None or spec.max_p100 is None:
                warnings.warn("Benchmark thresholds not set, skipping validation.", category=UserWarning)
                return

            perf_data = _get_benchmark_data()

            # BUGFIX: the original `assert spec.fn_name, spec.get_params_json() in perf_data`
            # asserted only the truthiness of fn_name — the comma made the membership
            # check the assert *message*, so a missing report entry was never caught here.
            key = (spec.fn_name, spec.get_params_json())
            assert key in perf_data, f"No benchmark stats recorded for {key}."
            stats = perf_data[key]

            # Percentiles over the raw per-round timings (seconds).
            times = sorted(stats["data"])
            p0 = times[0]
            p80 = times[int(len(times) * 0.8)]
            p100 = times[-1]

            adjusted_min_p0 = spec.min_p0 * performance_factor
            adjusted_max_p80 = spec.max_p80 * performance_factor
            adjusted_max_p100 = spec.max_p100 * performance_factor

            p0_marker = "✓" if p0 >= adjusted_min_p0 else "✗"
            p80_marker = "✓" if p80 <= adjusted_max_p80 else "✗"
            p100_marker = "✓" if p100 <= adjusted_max_p100 else "✗"

            params_human = ", ".join(f"{k}={v!r}" for k, v in spec.params.items())
            detailed_msg = f"""

Benchmark '{spec.fn_name}' with params {params_human} results:

{p0_marker} 0th percentile: {p0:.3f} s
    Unadjusted min_p0: {spec.min_p0:.3f} s
    Adjusted (*) min_p0: {adjusted_min_p0:.3f} s

{p80_marker} 80th percentile: {p80:.3f} s
    Unadjusted max_p80: {spec.max_p80:.3f} s
    Adjusted (*) max_p80: {adjusted_max_p80:.3f} s

{p100_marker} 100th percentile: {p100:.3f} s
    Unadjusted max_p100: {spec.max_p100:.3f} s
    Adjusted (*) max_p100: {adjusted_max_p100:.3f} s

(*) Use the environment variable "BENCHMARK_PERFORMANCE_FACTOR" to adjust the thresholds.

BENCHMARK_PERFORMANCE_FACTOR=1.0 (default) is meant to represent GitHub Actions performance.
Decrease this factor if your local machine is faster than GitHub Actions.

"""

            # Show the scaling factor only when it is actually in effect.
            if performance_factor == 1.0:
                adjusted_min_p0_str = f"{adjusted_min_p0:.3f}"
                adjusted_max_p80_str = f"{adjusted_max_p80:.3f}"
                adjusted_max_p100_str = f"{adjusted_max_p100:.3f}"
            else:
                adjusted_min_p0_str = f"{adjusted_min_p0:.3f} (= {spec.min_p0:.3f} * {performance_factor})"
                adjusted_max_p80_str = f"{adjusted_max_p80:.3f} (= {spec.max_p80:.3f} * {performance_factor})"
                adjusted_max_p100_str = f"{adjusted_max_p100:.3f} (= {spec.max_p100:.3f} * {performance_factor})"

            assert p0 >= adjusted_min_p0, f"p0 {p0:.3f} is less than expected {adjusted_min_p0_str}" + detailed_msg
            assert p80 <= adjusted_max_p80, f"p80 {p80:.3f} is more than expected {adjusted_max_p80_str}" + detailed_msg
            assert p100 <= adjusted_max_p100, (
                f"p100 {p100:.3f} is more than expected {adjusted_max_p100_str}" + detailed_msg
            )

        pytest.mark.parametrize(
            ",".join(param_keys),
            [spec.get_params_for_parametrize() for spec in specs],
        )(validation)

        return validation

    return wrapper

tests/performance/generate.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import random
2+
import string
3+
4+
import neptune_query.internal.retrieval.metrics as metrics
5+
from neptune_query.internal.identifiers import (
6+
AttributeDefinition,
7+
ProjectIdentifier,
8+
RunAttributeDefinition,
9+
RunIdentifier,
10+
SysId,
11+
)
12+
from neptune_query.internal.retrieval.attribute_types import FloatSeriesAggregations
13+
from neptune_query.internal.retrieval.attribute_values import AttributeValue
14+
from neptune_query.internal.retrieval.metric_buckets import TimeseriesBucket
15+
16+
# Set the random seed for reproducibility
# (every benchmark run generates identical synthetic input data)
random.seed(20250925)
18+
19+
20+
def random_alnum(length: int) -> str:
    """Return a random string of `length` lowercase letters and digits."""
    pool = string.ascii_lowercase + string.digits
    return "".join(random.choices(pool, k=length))


def random_alnum_strings(count: int, length: int) -> list[str]:
    """Return `count` independent random alphanumeric strings, each `length` chars long."""
    return [random_alnum(length) for _index in range(count)]
26+
27+
28+
def float_point_value(i: int, exp: int) -> metrics.FloatPointValue:
    """Build one synthetic metric point tuple for benchmark input.

    NOTE(review): the tuple layout (timestamp-like value, step, value, flag,
    ratio) is assumed from the field values — confirm against
    neptune_query.internal.retrieval.metrics.FloatPointValue.
    """
    return (1234567890 + i * 1000.0, float(i) + exp, float(i) * 10, False, 1.0)


# Shared run identifier used by the value helpers below.
EXPERIMENT_IDENTIFIER = RunIdentifier(ProjectIdentifier("project/abc"), SysId("XXX-1"))
33+
34+
35+
def float_series_value(path: str, exp: int):
    """Helper to create a float series value for testing.

    Builds an AttributeValue of type "float_series" whose aggregations are
    derived from `exp` (last=max=exp, average=exp/2, min=variance=0), attached
    to the shared EXPERIMENT_IDENTIFIER.
    """
    return AttributeValue(
        attribute_definition=AttributeDefinition(path, "float_series"),
        value=FloatSeriesAggregations(last=float(exp), min=0.0, max=float(exp), average=float(exp) / 2, variance=0.0),
        run_identifier=EXPERIMENT_IDENTIFIER,
    )
42+
43+
44+
def string_value(path: str, exp: int):
    """Helper to create a string value for testing.

    Builds an AttributeValue of type "string" with value "value_<exp>",
    attached to the shared EXPERIMENT_IDENTIFIER.
    """
    return AttributeValue(
        attribute_definition=AttributeDefinition(path, "string"),
        value=f"value_{exp}",
        run_identifier=EXPERIMENT_IDENTIFIER,
    )
51+
52+
53+
def bucket_metrics(experiments: int, paths: int, buckets: int) -> dict[RunAttributeDefinition, list[TimeseriesBucket]]:
    """Generate synthetic bucketed metrics: experiments * paths keys, each with `buckets` buckets."""
    return {
        run_attribute_definition(experiment, path): [bucket_metric(index=i) for i in range(buckets)]
        for experiment in range(experiments)
        for path in range(paths)
    }
59+
60+
61+
def run_attribute_definition(
    sys_id: int | str, path: int | str, attribute_type: str = "float_series"
) -> RunAttributeDefinition:
    """Build a RunAttributeDefinition for run "sysid<sys_id>" / attribute "path<path>" in project foo/bar."""
    return RunAttributeDefinition(
        RunIdentifier(ProjectIdentifier("foo/bar"), SysId(f"sysid{sys_id}")),
        AttributeDefinition(f"path{path}", attribute_type),
    )
68+
69+
70+
def bucket_metric(index: int) -> TimeseriesBucket:
    """Build one synthetic TimeseriesBucket whose fields are deterministic in `index`.

    Index 0 is a special half-open first bucket (from -inf) holding a single
    zero point; every later bucket covers [20*index, 20*(index+1)) with values
    scaled linearly by the index.
    """
    if index > 0:
        return TimeseriesBucket(
            index=index,
            from_x=20.0 * index,
            to_x=20.0 * (index + 1),
            first_x=20.0 * index + 2,
            first_y=100.0 * (index - 1) + 90.0,
            last_x=20.0 * (index + 1) - 2,
            last_y=100.0 * index,
            y_min=80.0 * index,
            y_max=110.0 * index,
            finite_point_count=10 + index,
            # NOTE(review): goes negative for index > 5 — presumably benchmarks
            # never use that many buckets per series; confirm if extended.
            nan_count=5 - index,
            positive_inf_count=2 * index,
            negative_inf_count=index,
            finite_points_sum=950.0 * index,
        )
    else:
        # First bucket: open-ended lower bound with a single zero-valued point.
        return TimeseriesBucket(
            index=index,
            from_x=float("-inf"),
            to_x=20.0,
            first_x=20.0,
            first_y=0.0,
            last_x=20.0,
            last_y=0.0,
            y_min=0.0,
            y_max=0.0,
            finite_point_count=1,
            nan_count=0,
            positive_inf_count=0,
            negative_inf_count=0,
            finite_points_sum=0.0,
        )

0 commit comments

Comments
 (0)