11# - Runs entirely on a single machine.
22# - The baseline is established first, then the branch under test is measured.
3- # - Both runs fail if the benchmark error is over predefined thresholds.
4- # - Then, if both are below thresholds and neither failed, those results must be directly comparable.
5- # - Therefore, if the difference between the two is over the threshold, then the branch is considered to have regressed.
3+ # - Each benchmark gives a 99.9 % confidence interval.
4+ # - The confidence intervals are compared to determine if the branch under test is a regression or an improvement.
5+ # - The error threshold is expected to be below +/- 2.5 %,
6+ # but sometimes it gets higher due to the nature of public GitHub runners.
7+ # We have yet to see an error of over +/- 4 %.
8+ # With the error so high, the impact is that small regressions are not considered statistically significant.
69name : Performance Regression Test - Score Director
710
811on :
3134
3235jobs :
3336
34- test :
37+ benchmark :
3538 runs-on : ubuntu-latest
3639 strategy :
3740 fail-fast : false # Jobs fail if the benchmark error is over predefined thresholds; other benchmarks continue.
7073 working-directory : ./timefold-solver-benchmarks
7174 shell : bash
7275 run : |
73- echo "forks=15 " > scoredirector-benchmark.properties
74- echo "warmup_iterations=5 " >> scoredirector-benchmark.properties
75- echo "measurement_iterations=15 " >> scoredirector-benchmark.properties
76+ echo "forks=1" > scoredirector-benchmark.properties
77+ echo "warmup_iterations=1" >> scoredirector-benchmark.properties
78+ echo "measurement_iterations=1" >> scoredirector-benchmark.properties
7679 echo "relative_score_error_threshold=0.025" >> scoredirector-benchmark.properties
7780 echo "score_director_type=cs" >> scoredirector-benchmark.properties
7881 echo "example=${{ matrix.example }}" >> scoredirector-benchmark.properties
9295 shell : bash
9396 run : |
9497 ./run-scoredirector.sh
95- # The benchmark gives the 99.9 % confidence interval.
96- echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]' ./timefold-solver-benchmarks/results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
97- echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]' ./timefold-solver-benchmarks/results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
98+ echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
99+ echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]' results/scoredirector/${{ github.event.inputs.baseline }}/results.json)" >> "$GITHUB_OUTPUT"
98100
99101 - name : Phase 2 - Checkout timefold-solver
100102 uses : actions/checkout@v4
@@ -145,18 +147,18 @@ jobs:
145147 shell : bash
146148 run : |
147149 ./run-scoredirector.sh
148- # The benchmark gives the 99.9 % confidence interval.
149- echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]' ./timefold-solver-benchmarks/results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
150- echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]' ./timefold-solver-benchmarks/results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
150+ echo "RANGE_START=$(jq '.[0].primaryMetric.scoreConfidence[0]' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
151+ echo "RANGE_END=$(jq '.[0].primaryMetric.scoreConfidence[1]' results/scoredirector/${{ github.event.inputs.branch }}/results.json)" >> "$GITHUB_OUTPUT"
151152
152153 - name : Phase 3 - Archive benchmark data
153154 uses : actions/upload-artifact@v4
154155 with :
155156 name : results-${{ matrix.example }}-${{ github.event.inputs.baseline }}_vs_${{ github.event.inputs.branch }}
156157 path : |
158+ ./timefold-solver-benchmarks/scoredirector-benchmark.properties
157159 ./timefold-solver-benchmarks/results/scoredirector
158160
159- - name : Compare baseline with the branch
161+ - name : Phase 3 - Report results
160162 working-directory : ./timefold-solver-benchmarks
161163 env :
162164 OLD_RANGE_START : ${{ steps.benchmark_baseline.outputs.RANGE_START }}
@@ -165,16 +167,29 @@ jobs:
165167 NEW_RANGE_END : ${{ steps.benchmark_new.outputs.RANGE_END }}
166168 shell : bash
167169 run : |
168- echo "Baseline result with 99.9 % confidence: "
169- echo " [$OLD_RANGE_START, $OLD_RANGE_END]"
170- echo " New result with 99.9 % confidence: "
171- echo " [$NEW_RANGE_START, $NEW_RANGE_END]"
172- echo ""
170+ export FAIL=false
173171 if [ "${NEW_RANGE_START%.*}" -le "${OLD_RANGE_END%.*}" ] && [ "${NEW_RANGE_END%.*}" -ge "${OLD_RANGE_START%.*}" ]; then
174- echo "Result is not statistically significant."
172+ export OLD_MEAN=$(( ( ${OLD_RANGE_END%.*} - ${OLD_RANGE_START%.*} ) / 2 + ${OLD_RANGE_START%.*} ))
173+ export NEW_MEAN=$(( ( ${NEW_RANGE_END%.*} - ${NEW_RANGE_START%.*} ) / 2 + ${NEW_RANGE_START%.*} ))
174+ if [ "${NEW_RANGE_START%.*}" -ge "$OLD_MEAN" ]; then
175+ echo "### Possible improvement ⁉️" >> $GITHUB_STEP_SUMMARY
176+ elif [ "${NEW_RANGE_END%.*}" -le "$OLD_MEAN" ]; then
177+ echo "### Possible regression ⁉️" >> $GITHUB_STEP_SUMMARY
178+ else
179+ echo "### Statistically insignificant result ⁉️" >> $GITHUB_STEP_SUMMARY
180+ fi
175181 elif [ "${NEW_RANGE_START%.*}" -gt "${OLD_RANGE_END%.*}" ]; then
176- echo "Statistically significant improvement."
182+ echo "### Statistically significant improvement 🚀" >> $GITHUB_STEP_SUMMARY
177183 else
178- echo "Statistically significant regression."
179- exit 1
184+ echo "### Statistically significant regression 🛑" >> $GITHUB_STEP_SUMMARY
185+ export FAIL=true
186+ fi
187+
188+ echo "| | **Ref** | **Min** | **Max** |" >> $GITHUB_STEP_SUMMARY
189+ echo "|:-----:|:-----------:|:-----------:|:-----------:|" >> $GITHUB_STEP_SUMMARY
190+ echo "| _Old_ | \`${{ github.event.inputs.baseline }}\` | ${OLD_RANGE_START%.*} | ${OLD_RANGE_END%.*} |" >> $GITHUB_STEP_SUMMARY
191+ echo "| _New_ | \`${{ github.event.inputs.branch }}\` | ${NEW_RANGE_START%.*} | ${NEW_RANGE_END%.*} |" >> $GITHUB_STEP_SUMMARY
192+
193+ if [ "$FAIL" = true ]; then
194+ exit 1
180195 fi
0 commit comments