Test stability

ganyicz · ganyicz · commit 4bff9e9411a0 · 2026-03-02T12:15:11.000+01:00
diff --git a/.github/scripts/aggregate-stability.php b/.github/scripts/aggregate-stability.php
@@ -0,0 +1,207 @@
+<?php
+
+/**
+ * Aggregates benchmark stability results across multiple runs.
+ *
+ * Reads JSON stats files produced by `benchmark --dump` and generates
+ * a markdown report ranking parameter combinations by cross-run stability.
+ *
+ * Usage: php aggregate-stability.php <stats-directory>
+ */
+
+// Directory that holds one sub-directory of stats files per config; defaults to ./stats.
+$dir = $argv[1] ?? 'stats';
+
+// Each config directory contains stats-rep1.json .. stats-rep5.json.
+$files = glob("$dir/*/stats-rep*.json");
+
+// glob() yields [] when nothing matches and false on error; empty() covers both.
+if (empty($files)) {
+    fwrite(STDERR, "No stats files found in $dir\n");
+    exit(1);
+}
+
+// Group runs by config (iterations x rounds x warmup).
+$byConfig = [];
+
+foreach ($files as $file) {
+    $contents = file_get_contents($file);
+    $data = $contents === false ? null : json_decode($contents, true);
+
+    // Skip files that are unreadable, not a JSON object, or missing any field used
+    // below. A partial config would otherwise be grouped under a bogus key, and a
+    // non-array "benchmarks" would crash the report stage.
+    if (
+        ! is_array($data)
+        || ! isset($data['config']['iterations'], $data['config']['rounds'], $data['config']['warmup'])
+        || ! isset($data['benchmarks'])
+        || ! is_array($data['benchmarks'])
+    ) {
+        fwrite(STDERR, "Skipping invalid file: $file\n");
+        continue;
+    }
+
+    // Zero-padded so that ksort() orders configs numerically by iterations, then rounds.
+    $key = sprintf(
+        'i%05d-r%02d-w%d',
+        $data['config']['iterations'],
+        $data['config']['rounds'],
+        $data['config']['warmup'],
+    );
+
+    // All repeats of the same config share one entry; each file adds one run.
+    $byConfig[$key]['config'] = $data['config'];
+    $byConfig[$key]['runs'][] = $data['benchmarks'];
+}
+
+ksort($byConfig);
+
+/**
+ * Summarises a list of numeric samples.
+ *
+ * The standard deviation is the sample standard deviation (divisor n - 1, or 1
+ * when there is a single sample). The coefficient of variation is expressed as
+ * a percentage of the mean and is reported as 0 when the mean is not positive.
+ *
+ * @param array $values Numeric samples; may be empty.
+ *
+ * @return array Keys: mean, stddev, cv, min, max (all 0 for an empty input).
+ */
+function computeStats(array $values): array
+{
+    $n = count($values);
+
+    if ($n === 0) {
+        return ['mean' => 0, 'stddev' => 0, 'cv' => 0, 'min' => 0, 'max' => 0];
+    }
+
+    $mean = array_sum($values) / $n;
+
+    // Sum of squared deviations from the mean.
+    $squaredDeviations = 0;
+    foreach ($values as $sample) {
+        $squaredDeviations += ($sample - $mean) ** 2;
+    }
+
+    $stddev = sqrt($squaredDeviations / max($n - 1, 1));
+
+    if ($mean > 0) {
+        $cv = ($stddev / $mean) * 100;
+    } else {
+        $cv = 0;
+    }
+
+    return [
+        'mean' => $mean,
+        'stddev' => $stddev,
+        'cv' => $cv,
+        'min' => min($values),
+        'max' => max($values),
+    ];
+}
+
+// --- Report ---
+
+echo "# Benchmark Stability Report\n\n";
+
+// Overall ranking table: one row per config, averaged over all of its benchmarks.
+echo "## Ranking (sorted by Blaze cross-run CV%)\n\n";
+echo "Lower CV% = more stable across repeated runs on the same machine.\n\n";
+echo "| Config | Runs | Blaze cross-run CV% | Blade cross-run CV% | Avg Blaze within-run CV% | Verdict |\n";
+echo "|--------|------|--------------------:|--------------------:|-------------------------:|--------:|\n";
+
+$rankings = [];
+
+foreach ($byConfig as $key => $group) {
+    $config = $group['config'];
+    $runs = $group['runs'];
+    // The first run of a config defines which benchmarks are reported for it.
+    $benchNames = array_keys($runs[0]);
+
+    // With no benchmarks the averages below would divide by zero and abort the
+    // whole report, so skip this config and keep ranking the others.
+    if ($benchNames === []) {
+        fwrite(STDERR, "Skipping config without benchmarks: $key\n");
+        continue;
+    }
+
+    $blazeCrossRunCvs = [];
+    $bladeCrossRunCvs = [];
+    $blazeWithinRunCvs = [];
+
+    foreach ($benchNames as $bench) {
+        // Cross-run CV: spread of the per-run medians for this benchmark.
+        $blazeMedians = array_map(fn ($r) => $r[$bench]['blaze']['median'], $runs);
+        $bladeMedians = array_map(fn ($r) => $r[$bench]['blade']['median'], $runs);
+
+        $blazeCrossRunCvs[] = computeStats($blazeMedians)['cv'];
+        $bladeCrossRunCvs[] = computeStats($bladeMedians)['cv'];
+
+        // Within-run CV: taken as reported by each run, one value per run and benchmark.
+        foreach ($runs as $run) {
+            $blazeWithinRunCvs[] = $run[$bench]['blaze']['cv_percent'];
+        }
+    }
+
+    $avgBlazeCrossRunCv = array_sum($blazeCrossRunCvs) / count($blazeCrossRunCvs);
+    $avgBladeCrossRunCv = array_sum($bladeCrossRunCvs) / count($bladeCrossRunCvs);
+    $avgBlazeWithinRunCv = array_sum($blazeWithinRunCvs) / count($blazeWithinRunCvs);
+
+    // The verdict is driven by the Blaze cross-run average only.
+    if ($avgBlazeCrossRunCv < 3.0) {
+        $verdict = 'EXCELLENT';
+    } elseif ($avgBlazeCrossRunCv < 5.0) {
+        $verdict = 'GOOD';
+    } elseif ($avgBlazeCrossRunCv < 10.0) {
+        $verdict = 'FAIR';
+    } else {
+        $verdict = 'POOR';
+    }
+
+    $label = sprintf(
+        '%dk iter x %d rounds x %d warmup',
+        $config['iterations'] / 1000,
+        $config['rounds'],
+        $config['warmup'],
+    );
+
+    $rankings[] = [
+        'key' => $key,
+        'label' => $label,
+        'runs' => count($runs),
+        'blaze_cross_cv' => $avgBlazeCrossRunCv,
+        'blade_cross_cv' => $avgBladeCrossRunCv,
+        'blaze_within_cv' => $avgBlazeWithinRunCv,
+        'verdict' => $verdict,
+    ];
+}
+
+// Sort by blaze cross-run CV ascending (most stable first).
+usort($rankings, fn ($a, $b) => $a['blaze_cross_cv'] <=> $b['blaze_cross_cv']);
+
+foreach ($rankings as $r) {
+    printf(
+        "| %-30s | %d | %5.1f%% | %5.1f%% | %5.1f%% | %-9s |\n",
+        $r['label'],
+        $r['runs'],
+        $r['blaze_cross_cv'],
+        $r['blade_cross_cv'],
+        $r['blaze_within_cv'],
+        $r['verdict'],
+    );
+}
+
+echo "\n---\n\n";
+
+// Detailed per-config tables: one section per parameter combination, with a
+// Blade row and a Blaze row for every benchmark.
+foreach ($byConfig as $group) {
+    $settings = $group['config'];
+    $allRuns = $group['runs'];
+    $numRuns = count($allRuns);
+
+    $label = sprintf(
+        '%dk iterations x %d rounds x %d warmup',
+        $settings['iterations'] / 1000,
+        $settings['rounds'],
+        $settings['warmup'],
+    );
+
+    echo "## $label ($numRuns runs)\n\n";
+    echo "| Benchmark | Engine | Medians (per run) | Cross-run CV% | Avg within-run CV% | Stable? |\n";
+    echo "|-----------|--------|-------------------|:-------------:|:-------------------:|:-------:|\n";
+
+    // Benchmark names are taken from the first run of the config.
+    foreach (array_keys($allRuns[0]) as $benchName) {
+        foreach (['blade', 'blaze'] as $engine) {
+            $medians = [];
+            $withinCvs = [];
+
+            // Collect, per run, the median and the CV that the run reported for itself.
+            foreach ($allRuns as $run) {
+                $medians[] = $run[$benchName][$engine]['median'];
+            }
+            foreach ($allRuns as $run) {
+                $withinCvs[] = $run[$benchName][$engine]['cv_percent'];
+            }
+
+            $stats = computeStats($medians);
+            $crossRunCv = $stats['cv'];
+            $avgWithinCv = array_sum($withinCvs) / count($withinCvs);
+
+            $formattedMedians = [];
+            foreach ($medians as $median) {
+                $formattedMedians[] = sprintf('%.1f', $median);
+            }
+
+            // First threshold the cross-run CV falls under wins; anything at or above 10% is unstable.
+            $stable = 'NO';
+            foreach ([['YES', 3.0], ['OK', 5.0], ['FAIR', 10.0]] as [$tag, $limit]) {
+                if ($crossRunCv < $limit) {
+                    $stable = $tag;
+                    break;
+                }
+            }
+
+            echo sprintf(
+                "| %-25s | %-5s | %s | %.1f%% | %.1f%% | %s |\n",
+                $benchName,
+                strtoupper($engine),
+                implode(', ', $formattedMedians),
+                $crossRunCv,
+                $avgWithinCv,
+                $stable,
+            );
+        }
+    }
+
+    echo "\n";
+}
diff --git a/.github/workflows/benchmark-stability.yml b/.github/workflows/benchmark-stability.yml
@@ -0,0 +1,107 @@
+name: Benchmark Stability
+
+# Runs on every push to any branch except main.
+on:
+  push:
+    branches-ignore:
+      - main
+
+jobs:
+  # Runs the benchmark five times for each parameter combination in the matrix
+  # and uploads the resulting stats files as one artifact per combination.
+  stability:
+    runs-on: ubuntu-latest
+    strategy:
+      # Keep the remaining combinations running when one of them fails.
+      fail-fast: false
+      matrix:
+        # `label` is only used to name the uploaded artifact (stats-<label>).
+        include:
+          # Low iterations
+          - { iterations: 5000,  rounds: 5,  warmup: 2, label: 5k-r5-w2 }
+          - { iterations: 5000,  rounds: 10, warmup: 3, label: 5k-r10-w3 }
+          # Current default
+          - { iterations: 10000, rounds: 5,  warmup: 2, label: 10k-r5-w2 }
+          # More rounds
+          - { iterations: 10000, rounds: 7,  warmup: 2, label: 10k-r7-w2 }
+          - { iterations: 10000, rounds: 10, warmup: 2, label: 10k-r10-w2 }
+          # Heavier warmup
+          - { iterations: 10000, rounds: 5,  warmup: 5, label: 10k-r5-w5 }
+          - { iterations: 10000, rounds: 7,  warmup: 5, label: 10k-r7-w5 }
+          # Higher iterations
+          - { iterations: 15000, rounds: 5,  warmup: 2, label: 15k-r5-w2 }
+          - { iterations: 15000, rounds: 5,  warmup: 5, label: 15k-r5-w5 }
+          - { iterations: 15000, rounds: 7,  warmup: 3, label: 15k-r7-w3 }
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup PHP
+        uses: shivammathur/setup-php@v2
+        with:
+          php-version: '8.4'
+          tools: composer:v2
+          coverage: none
+          extensions: mbstring, dom, curl, json, libxml, xml, xmlwriter, simplexml, tokenizer
+
+      # Exposes composer's cache path as a step output for the cache step below.
+      - name: Determine composer cache directory
+        id: composer-cache
+        run: echo "dir=$(composer config cache-files-dir)" >> $GITHUB_OUTPUT
+
+      - name: Cache composer
+        uses: actions/cache@v3
+        with:
+          path: ${{ steps.composer-cache.outputs.dir }}
+          key: ${{ runner.os }}-composer-${{ hashFiles('**/composer.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-composer-
+
+      - name: Install dependencies
+        run: composer install --prefer-dist --no-progress --no-interaction
+
+      # Each repeat writes its own stats-rep<N>.json in the working directory;
+      # the aggregation script matches on that file name pattern.
+      - name: Run benchmarks (5 repeats)
+        run: |
+          for i in 1 2 3 4 5; do
+            echo "--- Repeat $i ---"
+            vendor/bin/testbench benchmark \
+              --ci \
+              --iterations=${{ matrix.iterations }} \
+              --rounds=${{ matrix.rounds }} \
+              --warmup=${{ matrix.warmup }} \
+              --dump=stats-rep${i}.json
+            echo ""
+          done
+
+      # One artifact per matrix entry; the aggregate job downloads them by the stats-* pattern.
+      - name: Upload stats
+        uses: actions/upload-artifact@v4
+        with:
+          name: stats-${{ matrix.label }}
+          path: stats-rep*.json
+
+  # Collects the stats artifacts of all matrix jobs and renders a single markdown report.
+  aggregate:
+    needs: stability
+    # Run even when some matrix jobs failed, so the remaining results are still reported.
+    if: always()
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup PHP
+        uses: shivammathur/setup-php@v2
+        with:
+          php-version: '8.4'
+
+      # The script globs stats/*/stats-rep*.json, so it relies on every artifact
+      # being placed in its own sub-directory of stats/.
+      - name: Download all stats
+        uses: actions/download-artifact@v4
+        with:
+          path: stats
+          pattern: stats-*
+
+      # The report goes to stdout; the script's diagnostics go to stderr and stay in the job log.
+      - name: Generate stability report
+        run: php .github/scripts/aggregate-stability.php stats > stability-report.md
+
+      # Appends the report to the job's step summary.
+      - name: Show report
+        run: cat stability-report.md >> $GITHUB_STEP_SUMMARY
+
+      - name: Upload report
+        uses: actions/upload-artifact@v4
+        with:
+          name: stability-report
+          path: stability-report.md
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -95,10 +95,16 @@ jobs:
         run: echo "${{ github.event.pull_request.number }}" > benchmark-result.md
 
       - name: Run benchmark
-        run: vendor/bin/testbench benchmark --ci >> benchmark-result.md
+        run: vendor/bin/testbench benchmark --ci --dump=benchmark-stats.json >> benchmark-result.md
 
       - name: Upload benchmark result
         uses: actions/upload-artifact@v4
         with:
           name: benchmark-result
           path: benchmark-result.md
+
+      - name: Upload benchmark stats
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-stats
+          path: benchmark-stats.json
diff --git a/workbench/app/Console/Commands/BenchmarkCommand.php b/workbench/app/Console/Commands/BenchmarkCommand.php