|
| 1 | +<?php |
| 2 | + |
| 3 | +/** |
| 4 | + * Aggregates benchmark stability results across multiple runs. |
| 5 | + * |
| 6 | + * Reads JSON stats files produced by `benchmark --dump` and generates |
| 7 | + * a markdown report ranking parameter combinations by cross-run stability. |
| 8 | + * |
| 9 | + * Usage: php aggregate-stability.php <stats-directory> |
| 10 | + */ |
| 11 | + |
// Directory holding one sub-directory per benchmark configuration (defaults to ./stats).
$dir = $argv[1] ?? 'stats';

// Each config directory contains stats-rep1.json .. stats-rep5.json.
$files = glob("$dir/*/stats-rep*.json");

// empty() covers both "no matches" ([]) and a glob() failure (false).
if (empty($files)) {
    fwrite(STDERR, "No stats files found in $dir\n");
    exit(1);
}

// Keys every stats file must provide under "config"; they form the grouping key below.
$requiredConfigKeys = ['iterations' => true, 'rounds' => true, 'warmup' => true];

// Group runs by config (iterations x rounds x warmup).
$byConfig = [];

foreach ($files as $file) {
    $contents = is_readable($file) ? file_get_contents($file) : false;

    if ($contents === false) {
        fwrite(STDERR, "Skipping unreadable file: $file\n");
        continue;
    }

    $data = json_decode($contents, true);

    // Reject anything the report code below cannot process: missing config keys would
    // produce a bogus grouping key, and an empty/non-array benchmark list would make the
    // per-config averages divide by zero.
    $isUsable = is_array($data)
        && isset($data['config'], $data['benchmarks'])
        && is_array($data['config'])
        && array_diff_key($requiredConfigKeys, $data['config']) === []
        && is_array($data['benchmarks'])
        && $data['benchmarks'] !== [];

    if (! $isUsable) {
        fwrite(STDERR, "Skipping invalid file: $file\n");
        continue;
    }

    // Zero-padded so that ksort() orders configs numerically by iterations, then rounds.
    $key = sprintf(
        'i%05d-r%02d-w%d',
        $data['config']['iterations'],
        $data['config']['rounds'],
        $data['config']['warmup'],
    );

    $byConfig[$key]['config'] = $data['config'];
    $byConfig[$key]['runs'][] = $data['benchmarks'];
}

// Every candidate file was rejected: fail loudly instead of emitting an empty report.
if ($byConfig === []) {
    fwrite(STDERR, "No valid stats files found in $dir\n");
    exit(1);
}

ksort($byConfig);
| 45 | + |
/**
 * Summarises a list of numeric samples.
 *
 * The standard deviation uses the sample (n - 1) denominator, falling back to 1 when
 * only a single sample is given. The coefficient of variation is expressed as a
 * percentage of the mean and is reported as 0 whenever the mean is not positive.
 * An empty list yields 0 for every statistic.
 *
 * @param array $values Numeric samples.
 *
 * @return array{mean: int|float, stddev: int|float, cv: int|float, min: mixed, max: mixed}
 */
function computeStats(array $values): array
{
    if ($values === []) {
        return array_fill_keys(['mean', 'stddev', 'cv', 'min', 'max'], 0);
    }

    $sampleCount = count($values);
    $average = array_sum($values) / $sampleCount;

    // Sum of squared deviations from the mean.
    $squaredDeviations = 0;
    foreach ($values as $sample) {
        $squaredDeviations += ($sample - $average) ** 2;
    }

    $divisor = $sampleCount > 1 ? $sampleCount - 1 : 1;
    $spread = sqrt($squaredDeviations / $divisor);

    if ($average > 0) {
        $relativeSpread = ($spread / $average) * 100;
    } else {
        $relativeSpread = 0;
    }

    return [
        'mean' => $average,
        'stddev' => $spread,
        'cv' => $relativeSpread,
        'min' => min($values),
        'max' => max($values),
    ];
}
| 72 | + |
// --- Report ---

echo "# Benchmark Stability Report\n\n";

// Overall ranking table.
echo "## Ranking (sorted by Blaze cross-run CV%)\n\n";
echo "Lower CV% = more stable across repeated runs on the same machine.\n\n";
echo "| Config | Runs | Blaze cross-run CV% | Blade cross-run CV% | Avg Blaze within-run CV% | Verdict |\n";
echo "|--------|------|--------------------:|--------------------:|-------------------------:|--------:|\n";

$rankings = [];

foreach ($byConfig as $key => $group) {
    $config = $group['config'];
    $runs = $group['runs'];

    // Collect benchmark names from every run (first-seen order) rather than only the
    // first one, so a benchmark missing from run 0 is still reported.
    $seenBenchmarks = [];
    foreach ($runs as $run) {
        if (! is_array($run)) {
            continue;
        }
        foreach (array_keys($run) as $name) {
            $seenBenchmarks[$name] = true;
        }
    }
    $benchNames = array_keys($seenBenchmarks);

    $blazeCrossRunCvs = [];
    $bladeCrossRunCvs = [];
    $blazeWithinRunCvs = [];

    foreach ($benchNames as $bench) {
        $blazeMedians = [];
        $bladeMedians = [];

        // Only use runs that actually contain the value; a missing entry would otherwise
        // be read as null, count as 0 and inflate the cross-run CV.
        foreach ($runs as $run) {
            if (isset($run[$bench]['blaze']['median'])) {
                $blazeMedians[] = $run[$bench]['blaze']['median'];
            }
            if (isset($run[$bench]['blade']['median'])) {
                $bladeMedians[] = $run[$bench]['blade']['median'];
            }
            if (isset($run[$bench]['blaze']['cv_percent'])) {
                $blazeWithinRunCvs[] = $run[$bench]['blaze']['cv_percent'];
            }
        }

        if ($blazeMedians !== []) {
            $blazeCrossRunCvs[] = computeStats($blazeMedians)['cv'];
        }
        if ($bladeMedians !== []) {
            $bladeCrossRunCvs[] = computeStats($bladeMedians)['cv'];
        }
    }

    // Without data the averages below would divide by zero; leave the config out of the ranking.
    if ($blazeCrossRunCvs === [] || $bladeCrossRunCvs === [] || $blazeWithinRunCvs === []) {
        fwrite(STDERR, "Skipping config $key in ranking: no usable benchmark data\n");
        continue;
    }

    $avgBlazeCrossRunCv = array_sum($blazeCrossRunCvs) / count($blazeCrossRunCvs);
    $avgBladeCrossRunCv = array_sum($bladeCrossRunCvs) / count($bladeCrossRunCvs);
    $avgBlazeWithinRunCv = array_sum($blazeWithinRunCvs) / count($blazeWithinRunCvs);

    // Verdict thresholds on the average Blaze cross-run CV%: <3, <5, <10, otherwise POOR.
    if ($avgBlazeCrossRunCv < 3.0) {
        $verdict = 'EXCELLENT';
    } elseif ($avgBlazeCrossRunCv < 5.0) {
        $verdict = 'GOOD';
    } elseif ($avgBlazeCrossRunCv < 10.0) {
        $verdict = 'FAIR';
    } else {
        $verdict = 'POOR';
    }

    // Use the "Nk" shorthand only for exact multiples of 1000; otherwise %d would
    // truncate (e.g. 1500 iterations would be shown as "1k").
    $iterations = $config['iterations'];
    $iterationsLabel = $iterations % 1000 === 0
        ? sprintf('%dk', $iterations / 1000)
        : sprintf('%d', $iterations);

    $label = sprintf(
        '%s iter x %d rounds x %d warmup',
        $iterationsLabel,
        $config['rounds'],
        $config['warmup'],
    );

    $rankings[] = [
        'key' => $key,
        'label' => $label,
        'runs' => count($runs),
        'blaze_cross_cv' => $avgBlazeCrossRunCv,
        'blade_cross_cv' => $avgBladeCrossRunCv,
        'blaze_within_cv' => $avgBlazeWithinRunCv,
        'verdict' => $verdict,
    ];
}

// Sort by blaze cross-run CV ascending (most stable first).
usort($rankings, fn ($a, $b) => $a['blaze_cross_cv'] <=> $b['blaze_cross_cv']);

foreach ($rankings as $r) {
    printf(
        "| %-30s | %d | %5.1f%% | %5.1f%% | %5.1f%% | %-9s |\n",
        $r['label'],
        $r['runs'],
        $r['blaze_cross_cv'],
        $r['blade_cross_cv'],
        $r['blaze_within_cv'],
        $r['verdict'],
    );
}

echo "\n---\n\n";
| 154 | + |
// Detailed per-config tables: one section per config, one row per benchmark and engine.
foreach ($byConfig as $key => $group) {
    $config = $group['config'];
    $runs = $group['runs'];
    $numRuns = count($runs);

    // Use the "Nk" shorthand only for exact multiples of 1000; otherwise %d would
    // truncate (e.g. 1500 iterations would be shown as "1k").
    $iterations = $config['iterations'];
    $iterationsLabel = $iterations % 1000 === 0
        ? sprintf('%dk', $iterations / 1000)
        : sprintf('%d', $iterations);

    $label = sprintf(
        '%s iterations x %d rounds x %d warmup',
        $iterationsLabel,
        $config['rounds'],
        $config['warmup'],
    );

    echo "## $label ($numRuns runs)\n\n";
    echo "| Benchmark | Engine | Medians (per run) | Cross-run CV% | Avg within-run CV% | Stable? |\n";
    echo "|-----------|--------|-------------------|:-------------:|:-------------------:|:-------:|\n";

    // Collect benchmark names from every run (first-seen order) rather than only the
    // first one, so a benchmark missing from run 0 still gets its rows.
    $seenBenchmarks = [];
    foreach ($runs as $run) {
        if (! is_array($run)) {
            continue;
        }
        foreach (array_keys($run) as $name) {
            $seenBenchmarks[$name] = true;
        }
    }
    $benchNames = array_keys($seenBenchmarks);

    foreach ($benchNames as $bench) {
        foreach (['blade', 'blaze'] as $engine) {
            $medians = [];
            $withinCvs = [];

            // Only runs carrying both values contribute; a missing entry would otherwise
            // be read as null, count as 0 and distort the statistics.
            foreach ($runs as $run) {
                if (isset($run[$bench][$engine]['median'], $run[$bench][$engine]['cv_percent'])) {
                    $medians[] = $run[$bench][$engine]['median'];
                    $withinCvs[] = $run[$bench][$engine]['cv_percent'];
                }
            }

            // Nothing recorded for this benchmark/engine pair: no row to print.
            if ($medians === []) {
                continue;
            }

            $stats = computeStats($medians);
            $avgWithinCv = array_sum($withinCvs) / count($withinCvs);

            $medianStr = implode(', ', array_map(fn ($v) => sprintf('%.1f', $v), $medians));

            // Stability thresholds on the cross-run CV%: <3, <5, <10, otherwise NO.
            if ($stats['cv'] < 3.0) {
                $stable = 'YES';
            } elseif ($stats['cv'] < 5.0) {
                $stable = 'OK';
            } elseif ($stats['cv'] < 10.0) {
                $stable = 'FAIR';
            } else {
                $stable = 'NO';
            }

            printf(
                "| %-25s | %-5s | %s | %.1f%% | %.1f%% | %s |\n",
                $bench,
                strtoupper($engine),
                $medianStr,
                $stats['cv'],
                $avgWithinCv,
                $stable,
            );
        }
    }

    echo "\n";
}
0 commit comments