Skip to content

Commit 63991d2

Browse files
committed
Formatting
1 parent 26e75f8 commit 63991d2

File tree

2 files changed

+41
-34
lines changed

2 files changed

+41
-34
lines changed

.github/workflows/performance_score_director.yml

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
# - Runs entirely on a single machine.
22
# - The baseline is established first, then the branch under test is measured.
3-
# - Both runs fail if the benchmark error is over predefined thresholds.
4-
# - Then, if both are below thresholds and neither failed, those results must be directly comparable.
5-
# - Therefore, if the difference between the two is over the threshold, then the branch is considered to have regressed.
3+
# - Each benchmark gives a 99.9 % confidence interval.
4+
# - The confidence intervals are compared to determine if the branch under test is a regression or an improvement.
5+
# - The error threshold is expected to be below +/- 2.5 %,
6+
# but sometimes it gets higher due to the nature of public GitHub runners.
7+
# We have yet to see an error of over +/- 4 %.
8+
# With the error so high, the impact is that small regressions are not considered statistically significant.
69
name: Performance Regression Test - Score Director
710

811
on:
@@ -31,7 +34,7 @@ on:
3134

3235
jobs:
3336

34-
test:
37+
benchmark:
3538
runs-on: ubuntu-latest
3639
strategy:
3740
fail-fast: false # Jobs fail if the benchmark error is over predefined thresholds; other benchmarks continue.
@@ -70,9 +73,9 @@ jobs:
7073
working-directory: ./timefold-solver-benchmarks
7174
shell: bash
7275
run: |
73-
echo "forks=15" > scoredirector-benchmark.properties
74-
echo "warmup_iterations=5" >> scoredirector-benchmark.properties
75-
echo "measurement_iterations=15" >> scoredirector-benchmark.properties
76+
echo "forks=1" > scoredirector-benchmark.properties
77+
echo "warmup_iterations=1" >> scoredirector-benchmark.properties
78+
echo "measurement_iterations=1" >> scoredirector-benchmark.properties
7679
echo "relative_score_error_threshold=0.025" >> scoredirector-benchmark.properties
7780
echo "score_director_type=cs" >> scoredirector-benchmark.properties
7881
echo "example=${{ matrix.example }}" >> scoredirector-benchmark.properties
@@ -92,9 +95,8 @@ jobs:
9295
shell: bash
9396
run: |
9497
./run-scoredirector.sh
95-
# The benchmark gives the 99.9 % confidence interval.
96-
echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]' ./timefold-solver-benchmarks/results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
97-
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]' ./timefold-solver-benchmarks/results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
98+
echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
99+
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
98100
99101
- name: Phase 2 - Checkout timefold-solver
100102
uses: actions/checkout@v4
@@ -145,18 +147,18 @@ jobs:
145147
shell: bash
146148
run: |
147149
./run-scoredirector.sh
148-
# The benchmark gives the 99.9 % confidence interval.
149-
echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]' ./timefold-solver-benchmarks/results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
150-
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]' ./timefold-solver-benchmarks/results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
150+
echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
151+
echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
151152
152153
- name: Phase 3 - Archive benchmark data
153154
uses: actions/upload-artifact@v4
154155
with:
155156
name: results-${{ matrix.example }}-${{ github.event.inputs.baseline }}_vs_${{ github.event.inputs.branch }}
156157
path: |
158+
./timefold-solver-benchmarks/scoredirector-benchmark.properties
157159
./timefold-solver-benchmarks/results/scoredirector
158160
159-
- name: Compare baseline with the branch
161+
- name: Phase 3 - Report results
160162
working-directory: ./timefold-solver-benchmarks
161163
env:
162164
OLD_RANGE_START: ${{ steps.benchmark_baseline.outputs.RANGE_START }}
@@ -165,16 +167,29 @@ jobs:
165167
NEW_RANGE_END: ${{ steps.benchmark_new.outputs.RANGE_END }}
166168
shell: bash
167169
run: |
168-
echo "Baseline result with 99.9 % confidence: "
169-
echo " [$OLD_RANGE_START, $OLD_RANGE_END]"
170-
echo " New result with 99.9 % confidence: "
171-
echo " [$NEW_RANGE_START, $NEW_RANGE_END]"
172-
echo ""
170+
export FAIL = false
173171
if [ "$NEW_RANGE_START" -le "$OLD_RANGE_END" ] && [ "$NEW_RANGE_END" -ge "$OLD_RANGE_START" ]; then
174-
echo "Result is not statistically significant."
172+
export OLD_MEAN=$(((OLD_RANGE_END - OLD_RANGE_START)/2)+OLD_RANGE_START)
173+
export NEW_MEAN=$(((NEW_RANGE_END - NEW_RANGE_START)/2)+NEW_RANGE_START)
174+
if [ "$NEW_RANGE_START" -ge "$OLD_MEAN" ]; then
175+
echo "### Possible improvement ⁉️" >> $GITHUB_STEP_SUMMARY
176+
elif [ "$OLD_RANGE_END" -le "$NEW_MEAN" ]; then
177+
echo "### Possible regression ⁉️" >> $GITHUB_STEP_SUMMARY
178+
else
179+
echo "### Statistically insignificant result ⁉️" >> $GITHUB_STEP_SUMMARY
180+
fi
175181
elif [ "$NEW_RANGE_START" -gt "$OLD_RANGE_END" ]; then
176-
echo "Statistically significant improvement."
182+
echo "### Statistically significant improvement 🚀" >> $GITHUB_STEP_SUMMARY
177183
else
178-
echo "Statistically significant regression."
179-
exit 1
184+
echo "### Statistically significant regression 🛑" >> $GITHUB_STEP_SUMMARY
185+
export FAIL = true
186+
fi
187+
188+
echo "| | **Ref** | **Min** | **Max** |" >> $GITHUB_STEP_SUMMARY
189+
echo "|:-----:|:-----------:|:-----------:|:-----------:|" >> $GITHUB_STEP_SUMMARY
190+
echo "| _Old_ | `${{ github.event.inputs.baseline }}` | ${OLD_RANGE_START%.*} | ${OLD_RANGE_END%.*} |" >> $GITHUB_STEP_SUMMARY
191+
echo "| _New_ | `${{ github.event.inputs.branch }}` | ${NEW_RANGE_START%.*} | ${NEW_RANGE_END%.*} |" >> $GITHUB_STEP_SUMMARY
192+
193+
if [ "$FAIL" = true ]; then
194+
exit 1
180195
fi

src/main/java/ai/timefold/solver/benchmarks/micro/coldstart/Main.java

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,11 @@
3333

3434
import java.io.IOException;
3535
import java.io.InputStream;
36-
import java.util.concurrent.atomic.AtomicBoolean;
3736

3837
import ai.timefold.solver.benchmarks.micro.coldstart.jmh.TimeToFirstScoreBenchmark;
3938
import ai.timefold.solver.benchmarks.micro.coldstart.jmh.TimeToSolverFactoryBenchmark;
4039
import ai.timefold.solver.benchmarks.micro.common.AbstractMain;
4140

42-
import org.openjdk.jmh.results.Result;
4341
import org.openjdk.jmh.runner.Runner;
4442
import org.openjdk.jmh.runner.RunnerException;
4543
import org.openjdk.jmh.runner.options.ChainedOptionsBuilder;
@@ -76,9 +74,8 @@ public static void main(String[] args) throws RunnerException, IOException {
7674

7775
var relativeScoreErrorThreshold = configuration.getRelativeScoreErrorThreshold();
7876
var thresholdForPrint = ((int) Math.round(relativeScoreErrorThreshold * 10_000)) / 100.0D;
79-
var wasSuccess = new AtomicBoolean(true);
8077
runResults.forEach(result -> {
81-
Result<?> primaryResult = result.getPrimaryResult();
78+
var primaryResult = result.getPrimaryResult();
8279
var score = primaryResult.getScore();
8380
var scoreError = primaryResult.getScoreError();
8481
var relativeScoreError = scoreError / score;
@@ -87,18 +84,13 @@ public static void main(String[] args) throws RunnerException, IOException {
8784
var benchmarkName = benchParams.getBenchmark() + " " + benchParams.getParam("example");
8885
var relativeScoreErrorForPrint = ((int) Math.round(relativeScoreError * 10_000)) / 100.0D;
8986
if (relativeScoreError > relativeScoreErrorThreshold) {
90-
LOGGER.error("Score error for '{}' is too high: ± {} % (threshold: ± {} %).", benchmarkName,
87+
LOGGER.warn("Score error for '{}' is too high: ± {} % (threshold: ± {} %).", benchmarkName,
9188
relativeScoreErrorForPrint, thresholdForPrint);
92-
wasSuccess.set(false);
93-
9489
} else if (relativeScoreError > (relativeScoreErrorThreshold * 0.9)) {
95-
LOGGER.warn("Score error for '{}' approaching threshold: ± {} % (threshold: ± {} %).", benchmarkName,
90+
LOGGER.info("Score error for '{}' approaching threshold: ± {} % (threshold: ± {} %).", benchmarkName,
9691
relativeScoreErrorForPrint, thresholdForPrint);
9792
}
9893
});
99-
if (wasSuccess.get()) {
100-
System.exit(1);
101-
}
10294
}
10395

10496
private static ChainedOptionsBuilder processBenchmark(ChainedOptionsBuilder options, Configuration configuration) {

0 commit comments

Comments
 (0)