Skip to content

Commit 4bff9e9

Browse files
committed
Test stability
1 parent 0eac8e6 commit 4bff9e9

File tree

4 files changed

+374
-31
lines changed

4 files changed

+374
-31
lines changed
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
<?php

/**
 * Aggregates benchmark stability results across multiple runs.
 *
 * Reads JSON stats files produced by `benchmark --dump` and generates
 * a markdown report ranking parameter combinations by cross-run stability.
 *
 * The report is printed on STDOUT; files that cannot be used are reported
 * on STDERR and skipped.
 *
 * Usage: php aggregate-stability.php <stats-directory>
 */

// Directory that holds one sub-directory of stats files per configuration.
$dir = $argv[1] ?? 'stats';

// Each config directory contains stats-rep1.json .. stats-rep5.json.
$files = glob("$dir/*/stats-rep*.json");

// glob() gives an empty array when nothing matches and false on error;
// empty() covers both cases.
if (empty($files)) {
    fwrite(STDERR, "No stats files found in $dir\n");
    exit(1);
}

// Group runs by config (iterations x rounds x warmup).
$byConfig = [];

foreach ($files as $file) {
    // file_get_contents() returns false on failure; do not hand that to json_decode().
    $raw = file_get_contents($file);
    $data = $raw === false ? null : json_decode($raw, true);

    // A usable file carries all three config parameters (they form the grouping
    // key below) and a benchmarks map (its keys are enumerated by the report).
    $usable = $data
        && isset(
            $data['config']['iterations'],
            $data['config']['rounds'],
            $data['config']['warmup'],
            $data['benchmarks'],
        )
        && is_array($data['benchmarks']);

    if (! $usable) {
        fwrite(STDERR, "Skipping invalid file: $file\n");
        continue;
    }

    // Zero-padded so that ksort() orders the configs numerically.
    $key = sprintf(
        'i%05d-r%02d-w%d',
        $data['config']['iterations'],
        $data['config']['rounds'],
        $data['config']['warmup'],
    );

    $byConfig[$key]['config'] = $data['config'];
    $byConfig[$key]['runs'][] = $data['benchmarks'];
}

ksort($byConfig);
45+
46+
/**
 * Summarises a list of numbers.
 *
 * The standard deviation is the sample standard deviation (divisor n - 1);
 * for a single value the divisor is clamped to 1, which yields 0. The
 * coefficient of variation is expressed in percent and is reported as 0
 * when the mean is not positive.
 *
 * @param array $values Numbers to summarise.
 *
 * @return array Map with the keys 'mean', 'stddev', 'cv', 'min' and 'max';
 *               every entry is 0 when $values is empty.
 */
function computeStats(array $values): array
{
    if ($values === []) {
        return ['mean' => 0, 'stddev' => 0, 'cv' => 0, 'min' => 0, 'max' => 0];
    }

    $count = count($values);
    $mean = array_sum($values) / $count;

    // Sum of squared deviations from the mean.
    $squaredDeviations = 0;
    foreach ($values as $value) {
        $squaredDeviations += ($value - $mean) ** 2;
    }

    $divisor = $count > 1 ? $count - 1 : 1;
    $stddev = sqrt($squaredDeviations / $divisor);

    if ($mean > 0) {
        $cv = ($stddev / $mean) * 100;
    } else {
        $cv = 0;
    }

    return [
        'mean' => $mean,
        'stddev' => $stddev,
        'cv' => $cv,
        'min' => min($values),
        'max' => max($values),
    ];
}
72+
73+
// --- Report ---

/**
 * Gathers one metric of a benchmark/engine pair from every run that reports it.
 *
 * Runs lacking the benchmark, the engine or the metric are left out, so they
 * neither raise "undefined array key" warnings nor enter the statistics as null.
 *
 * @param array      $runs   List of per-run benchmark maps (benchmark => engine => metrics).
 * @param string|int $bench  Benchmark name (array key of a run).
 * @param string     $engine Engine key, 'blade' or 'blaze'.
 * @param string     $metric Metric key, 'median' or 'cv_percent'.
 *
 * @return array The reported values, in run order.
 */
function collectRunValues(array $runs, $bench, string $engine, string $metric): array
{
    $values = [];

    foreach ($runs as $run) {
        $value = $run[$bench][$engine][$metric] ?? null;

        if ($value !== null) {
            $values[] = $value;
        }
    }

    return $values;
}

/**
 * Arithmetic mean of a list of numbers, or 0 for an empty list.
 *
 * @param array $values Numbers to average.
 *
 * @return int|float
 */
function averageOrZero(array $values)
{
    return $values === [] ? 0 : array_sum($values) / count($values);
}

echo "# Benchmark Stability Report\n\n";

// Overall ranking table.
echo "## Ranking (sorted by Blaze cross-run CV%)\n\n";
echo "Lower CV% = more stable across repeated runs on the same machine.\n\n";
echo "| Config | Runs | Blaze cross-run CV% | Blade cross-run CV% | Avg Blaze within-run CV% | Verdict |\n";
echo "|--------|------|--------------------:|--------------------:|-------------------------:|--------:|\n";

$rankings = [];

foreach ($byConfig as $key => $group) {
    $config = $group['config'];
    $runs = $group['runs'];
    // The benchmark names of the first run define the rows for this config.
    $benchNames = array_keys($runs[0]);

    $blazeCrossRunCvs = [];
    $bladeCrossRunCvs = [];
    $blazeWithinRunCvs = [];

    foreach ($benchNames as $bench) {
        $blazeMedians = collectRunValues($runs, $bench, 'blaze', 'median');
        $bladeMedians = collectRunValues($runs, $bench, 'blade', 'median');

        // Cross-run CV: spread of the per-run medians of one benchmark.
        if ($blazeMedians !== []) {
            $blazeCrossRunCvs[] = computeStats($blazeMedians)['cv'];
        }

        if ($bladeMedians !== []) {
            $bladeCrossRunCvs[] = computeStats($bladeMedians)['cv'];
        }

        // Within-run CV: the cv_percent value each run reports for itself.
        foreach (collectRunValues($runs, $bench, 'blaze', 'cv_percent') as $withinCv) {
            $blazeWithinRunCvs[] = $withinCv;
        }
    }

    // Without any Blaze median there is nothing to rank, and averaging an
    // empty list would divide by zero.
    if ($blazeCrossRunCvs === []) {
        fwrite(STDERR, "Skipping config without benchmark data: $key\n");
        continue;
    }

    $avgBlazeCrossRunCv = averageOrZero($blazeCrossRunCvs);
    $avgBladeCrossRunCv = averageOrZero($bladeCrossRunCvs);
    $avgBlazeWithinRunCv = averageOrZero($blazeWithinRunCvs);

    if ($avgBlazeCrossRunCv < 3.0) {
        $verdict = 'EXCELLENT';
    } elseif ($avgBlazeCrossRunCv < 5.0) {
        $verdict = 'GOOD';
    } elseif ($avgBlazeCrossRunCv < 10.0) {
        $verdict = 'FAIR';
    } else {
        $verdict = 'POOR';
    }

    $label = sprintf(
        '%dk iter x %d rounds x %d warmup',
        $config['iterations'] / 1000,
        $config['rounds'],
        $config['warmup'],
    );

    $rankings[] = [
        'key' => $key,
        'label' => $label,
        'runs' => count($runs),
        'blaze_cross_cv' => $avgBlazeCrossRunCv,
        'blade_cross_cv' => $avgBladeCrossRunCv,
        'blaze_within_cv' => $avgBlazeWithinRunCv,
        'verdict' => $verdict,
    ];
}

// Sort by blaze cross-run CV ascending (most stable first).
usort($rankings, fn ($a, $b) => $a['blaze_cross_cv'] <=> $b['blaze_cross_cv']);

foreach ($rankings as $r) {
    printf(
        "| %-30s | %d | %5.1f%% | %5.1f%% | %5.1f%% | %-9s |\n",
        $r['label'],
        $r['runs'],
        $r['blaze_cross_cv'],
        $r['blade_cross_cv'],
        $r['blaze_within_cv'],
        $r['verdict'],
    );
}

echo "\n---\n\n";

// Detailed per-config tables.
foreach ($byConfig as $group) {
    $config = $group['config'];
    $runs = $group['runs'];
    $numRuns = count($runs);

    $label = sprintf(
        '%dk iterations x %d rounds x %d warmup',
        $config['iterations'] / 1000,
        $config['rounds'],
        $config['warmup'],
    );

    echo "## $label ($numRuns runs)\n\n";
    echo "| Benchmark | Engine | Medians (per run) | Cross-run CV% | Avg within-run CV% | Stable? |\n";
    echo "|-----------|--------|-------------------|:-------------:|:-------------------:|:-------:|\n";

    $benchNames = array_keys($runs[0]);

    foreach ($benchNames as $bench) {
        foreach (['blade', 'blaze'] as $engine) {
            $medians = collectRunValues($runs, $bench, $engine, 'median');
            $withinCvs = collectRunValues($runs, $bench, $engine, 'cv_percent');

            // No run measured this engine: say so rather than print a
            // zero CV that would read as perfectly stable.
            if ($medians === []) {
                printf("| %-25s | %-5s | n/a | n/a | n/a | n/a |\n", $bench, strtoupper($engine));
                continue;
            }

            $stats = computeStats($medians);
            $avgWithinCv = averageOrZero($withinCvs);

            $medianStr = implode(', ', array_map(fn ($v) => sprintf('%.1f', $v), $medians));

            if ($stats['cv'] < 3.0) {
                $stable = 'YES';
            } elseif ($stats['cv'] < 5.0) {
                $stable = 'OK';
            } elseif ($stats['cv'] < 10.0) {
                $stable = 'FAIR';
            } else {
                $stable = 'NO';
            }

            printf(
                "| %-25s | %-5s | %s | %.1f%% | %.1f%% | %s |\n",
                $bench,
                strtoupper($engine),
                $medianStr,
                $stats['cv'],
                $avgWithinCv,
                $stable,
            );
        }
    }

    echo "\n";
}
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# Runs the benchmark five times for each parameter combination in the matrix,
# then aggregates the dumped stats into a markdown stability report.
name: Benchmark Stability

on:
  push:
    branches-ignore:
      - main

jobs:
  stability:
    runs-on: ubuntu-latest
    strategy:
      # Keep measuring the remaining configurations when one of them fails.
      fail-fast: false
      matrix:
        include:
          # Low iterations
          - { iterations: 5000, rounds: 5, warmup: 2, label: 5k-r5-w2 }
          - { iterations: 5000, rounds: 10, warmup: 3, label: 5k-r10-w3 }
          # Current default
          - { iterations: 10000, rounds: 5, warmup: 2, label: 10k-r5-w2 }
          # More rounds
          - { iterations: 10000, rounds: 7, warmup: 2, label: 10k-r7-w2 }
          - { iterations: 10000, rounds: 10, warmup: 2, label: 10k-r10-w2 }
          # Heavier warmup
          - { iterations: 10000, rounds: 5, warmup: 5, label: 10k-r5-w5 }
          - { iterations: 10000, rounds: 7, warmup: 5, label: 10k-r7-w5 }
          # Higher iterations
          - { iterations: 15000, rounds: 5, warmup: 2, label: 15k-r5-w2 }
          - { iterations: 15000, rounds: 5, warmup: 5, label: 15k-r5-w5 }
          - { iterations: 15000, rounds: 7, warmup: 3, label: 15k-r7-w3 }

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup PHP
        uses: shivammathur/setup-php@v2
        with:
          php-version: '8.4'
          tools: composer:v2
          coverage: none
          extensions: mbstring, dom, curl, json, libxml, xml, xmlwriter, simplexml, tokenizer

      - name: Determine composer cache directory
        id: composer-cache
        run: echo "dir=$(composer config cache-files-dir)" >> "$GITHUB_OUTPUT"

      - name: Cache composer
        uses: actions/cache@v4
        with:
          path: ${{ steps.composer-cache.outputs.dir }}
          key: ${{ runner.os }}-composer-${{ hashFiles('**/composer.lock') }}
          restore-keys: |
            ${{ runner.os }}-composer-

      - name: Install dependencies
        run: composer install --prefer-dist --no-progress --no-interaction

      - name: Run benchmarks (5 repeats)
        run: |
          for i in 1 2 3 4 5; do
            echo "--- Repeat $i ---"
            vendor/bin/testbench benchmark \
              --ci \
              --iterations=${{ matrix.iterations }} \
              --rounds=${{ matrix.rounds }} \
              --warmup=${{ matrix.warmup }} \
              --dump="stats-rep${i}.json"
            echo ""
          done

      # One artifact per configuration; the aggregate job downloads each into
      # its own directory under stats/.
      - name: Upload stats
        uses: actions/upload-artifact@v4
        with:
          name: stats-${{ matrix.label }}
          path: stats-rep*.json

  aggregate:
    needs: stability
    # Report on whatever configurations finished, even if some failed, but do
    # not start when the workflow run was cancelled.
    if: ${{ !cancelled() }}
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup PHP
        uses: shivammathur/setup-php@v2
        with:
          php-version: '8.4'

      - name: Download all stats
        uses: actions/download-artifact@v4
        with:
          path: stats
          pattern: stats-*

      - name: Generate stability report
        run: php .github/scripts/aggregate-stability.php stats > stability-report.md

      - name: Show report
        run: cat stability-report.md >> "$GITHUB_STEP_SUMMARY"

      - name: Upload report
        uses: actions/upload-artifact@v4
        with:
          name: stability-report
          path: stability-report.md

.github/workflows/ci.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,16 @@ jobs:
9595
run: echo "${{ github.event.pull_request.number }}" > benchmark-result.md
9696

9797
- name: Run benchmark
98-
run: vendor/bin/testbench benchmark --ci >> benchmark-result.md
98+
run: vendor/bin/testbench benchmark --ci --dump=benchmark-stats.json >> benchmark-result.md
9999

100100
- name: Upload benchmark result
101101
uses: actions/upload-artifact@v4
102102
with:
103103
name: benchmark-result
104104
path: benchmark-result.md
105+
106+
- name: Upload benchmark stats
107+
uses: actions/upload-artifact@v4
108+
with:
109+
name: benchmark-stats
110+
path: benchmark-stats.json

0 commit comments

Comments
 (0)