microbench-ci: compare only outer run when insignificant

herkolategan · herkolategan · commit 075e523fb9d1 · 2025-03-18T12:17:58.000Z
Previously, during the comparison step all runs were compared. Since the last
run determines if the previous runs had significant changes, we now first
compare only the last outer run of each benchmark. If the last run had
significant changes, we then compare all runs to produce the final assessment.

This prevents the compare summary on CI from showing a regression when, in
fact, only an earlier run possibly had a regression, followed by an
insignificant change in the last run.

Epic: None
Release note: None
diff --git a/pkg/cmd/microbench-ci/compare.go b/pkg/cmd/microbench-ci/compare.go
@@ -121,14 +121,27 @@ func (b *Benchmark) compare(lines int) (*CompareResult, error) {
 	return &compareResult, nil
 }
 
-// compareBenchmarks compares the metrics of all benchmarks between two revisions.
+// compareBenchmarks compares the metrics of all benchmarks between two
+// revisions. It first compares only the last outer run of each benchmark. If
+// the last run had significant changes, it compares the metrics of all runs.
+// This is because the last run would only have completed with significant
+// changes if all the previous runs had them as well, and then we want to
+// include it in the final assessment. In contrast if the last run had no
+// significant changes, it is possible that the previous runs had significant
+// changes, and we don't want to include them in the final assessment.
 func (b Benchmarks) compareBenchmarks() (CompareResults, error) {
 	compareResults := make(CompareResults, 0, len(b))
 	for _, benchmark := range b {
-		compareResult, err := benchmark.compare(0)
+		compareResult, err := benchmark.compare(benchmark.Count)
 		if err != nil {
 			return nil, err
 		}
+		if compareResult.top() != NoChange {
+			compareResult, err = benchmark.compare(0)
+			if err != nil {
+				return nil, err
+			}
+		}
 		compareResults = append(compareResults, compareResult)
 	}
 	return compareResults, nil
diff --git a/pkg/cmd/microbench-ci/testdata/summary.txt b/pkg/cmd/microbench-ci/testdata/summary.txt
@@ -72,8 +72,8 @@ run group=1
 
 | Metric                      | Old Commit     | New Commit     | Delta      | Note         |
 |-----------------------------|----------------|----------------|------------|--------------|
-| ⚪ **sec/op** | 9.852m ±0% | 9.880m ±1% | ~ | p=0.084 n=20    |
-| ⚪ **allocs/op** | 10.38k ±0% | 10.38k ±0% | ~ | p=1.000 n=20    |
+| ⚪ **sec/op** | 9.852m ±0% | 9.852m ±0% | ~ | p=1.000 n=10    |
+| ⚪ **allocs/op** | 10.38k ±1% | 10.38k ±1% | ~ | p=1.000 n=10    |
 
 <details><summary>Reproduce</summary>
 
@@ -161,32 +161,22 @@ json
             "Metric": "B/op",
             "Summary": {
               "Center": 2367667,
-              "Lo": 2364281,
-              "Hi": 2369187,
-              "Confidence": 0.95861,
+              "Lo": 2358650,
+              "Hi": 2370670,
+              "Confidence": 0.97852,
               "Warnings": null
             },
             "Sample": {
               "Values": [
                 2352326,
-                2352326,
-                2358650,
                 2358650,
                 2364281,
-                2364281,
-                2365463,
                 2365463,
                 2367582,
-                2367582,
-                2367752,
                 2367752,
                 2368213,
-                2368213,
                 2369187,
-                2369187,
-                2370670,
                 2370670,
-                2375306,
                 2375306
               ],
               "Thresholds": {
@@ -199,32 +189,22 @@ json
             "Metric": "allocs/op",
             "Summary": {
               "Center": 10378.50000,
-              "Lo": 10361,
-              "Hi": 10392,
-              "Confidence": 0.95861,
+              "Lo": 10287,
+              "Hi": 10398,
+              "Confidence": 0.97852,
               "Warnings": null
             },
             "Sample": {
               "Values": [
                 10246,
-                10246,
-                10287,
                 10287,
                 10361,
-                10361,
-                10377,
                 10377,
                 10378,
-                10378,
                 10379,
-                10379,
-                10386,
                 10386,
                 10392,
-                10392,
-                10398,
                 10398,
-                10411,
                 10411
               ],
               "Thresholds": {
@@ -236,10 +216,10 @@ json
           {
             "Metric": "sec/op",
             "Summary": {
-              "Center": 0.00988,
+              "Center": 0.00985,
               "Lo": 0.00985,
-              "Hi": 0.00995,
-              "Confidence": 0.95861,
+              "Hi": 0.00985,
+              "Confidence": 0.97852,
               "Warnings": null
             },
             "Sample": {
@@ -253,17 +233,7 @@ json
                 0.00985,
                 0.00985,
                 0.00985,
-                0.00985,
-                0.00991,
-                0.00993,
-                0.00995,
-                0.00995,
-                0.00995,
-                0.00995,
-                0.00997,
-                0.00998,
-                0.00998,
-                0.01000
+                0.00985
               ],
               "Thresholds": {
                 "CompareAlpha": 0.05000
@@ -277,32 +247,22 @@ json
             "Metric": "B/op",
             "Summary": {
               "Center": 2367667,
-              "Lo": 2364281,
-              "Hi": 2369187,
-              "Confidence": 0.95861,
+              "Lo": 2358650,
+              "Hi": 2370670,
+              "Confidence": 0.97852,
               "Warnings": null
             },
             "Sample": {
               "Values": [
                 2352326,
-                2352326,
-                2358650,
                 2358650,
                 2364281,
-                2364281,
                 2365463,
-                2365463,
-                2367582,
                 2367582,
                 2367752,
-                2367752,
                 2368213,
-                2368213,
-                2369187,
                 2369187,
                 2370670,
-                2370670,
-                2375306,
                 2375306
               ],
               "Thresholds": {
@@ -315,32 +275,22 @@ json
             "Metric": "allocs/op",
             "Summary": {
               "Center": 10378.50000,
-              "Lo": 10361,
-              "Hi": 10392,
-              "Confidence": 0.95861,
+              "Lo": 10287,
+              "Hi": 10398,
+              "Confidence": 0.97852,
               "Warnings": null
             },
             "Sample": {
               "Values": [
-                10246,
                 10246,
                 10287,
-                10287,
-                10361,
                 10361,
                 10377,
-                10377,
                 10378,
-                10378,
-                10379,
                 10379,
                 10386,
-                10386,
-                10392,
                 10392,
                 10398,
-                10398,
-                10411,
                 10411
               ],
               "Thresholds": {
@@ -355,21 +305,11 @@ json
               "Center": 0,
               "Lo": 0,
               "Hi": 0,
-              "Confidence": 0.95861,
+              "Confidence": 0.97852,
               "Warnings": null
             },
             "Sample": {
               "Values": [
-                0,
-                0,
-                0,
-                0,
-                0,
-                0,
-                0,
-                0,
-                0,
-                0,
                 0,
                 0,
                 0,
@@ -393,17 +333,11 @@ json
               "Center": 0.00985,
               "Lo": 0.00985,
               "Hi": 0.00985,
-              "Confidence": 0.95861,
+              "Confidence": 0.97852,
               "Warnings": null
             },
             "Sample": {
               "Values": [
-                0.00981,
-                0.00985,
-                0.00985,
-                0.00985,
-                0.00985,
-                0.00985,
                 0.00985,
                 0.00985,
                 0.00985,
@@ -413,11 +347,7 @@ json
                 0.00985,
                 0.00985,
                 0.00985,
-                0.00987,
-                0.00988,
-                0.00990,
-                0.00993,
-                0.00998
+                0.00985
               ],
               "Thresholds": {
                 "CompareAlpha": 0.05000