chore[ci]: unify the benchmark table analysis thresholds

joseph-isaacs · joseph-isaacs · commit 8d0e5e7260fb · 2025-11-20T10:49:42.000Z
Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py
@@ -56,13 +56,20 @@ def extract_dataset_key(df):
 
 # assert df3["unit_base"].equals(df3["unit_pr"]), (df3["unit_base"], df3["unit_pr"])
 
+# Determine threshold based on benchmark name
+# Use 30% threshold for S3 benchmarks, 10% for others
+is_s3_benchmark = "s3" in benchmark_name.lower()
+threshold_pct = 30 if is_s3_benchmark else 10
+improvement_threshold = 1.0 - (threshold_pct / 100.0)  # e.g., 0.7 for 30%, 0.9 for 10%
+regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
+
 # Generate summary statistics
 df3["ratio"] = df3["value_pr"] / df3["value_base"]
 df3["remark"] = pd.Series([""] * len(df3))
 df3["remark"] = df3["remark"].case_when(
     [
-        (df3["ratio"] >= 1.3, "🚨"),
-        (df3["ratio"] <= 0.7, "🚀"),
+        (df3["ratio"] >= regression_threshold, "🚨"),
+        (df3["ratio"] <= improvement_threshold, "🚀"),
     ]
 )
 
@@ -115,13 +122,6 @@ def calculate_geo_mean(df):
     best_improvement = "No valid vortex comparisons"
     worst_regression = "No valid vortex comparisons"
 
-# Determine threshold based on benchmark name
-# Use 30% threshold for S3 benchmarks, 10% for others
-is_s3_benchmark = "s3" in benchmark_name.lower()
-threshold_pct = 30 if is_s3_benchmark else 10
-improvement_threshold = 1.0 - (threshold_pct / 100.0)  # e.g., 0.7 for 30%, 0.9 for 10%
-regression_threshold = 1.0 + (threshold_pct / 100.0)  # e.g., 1.3 for 30%, 1.1 for 10%
-
 # Count significant changes for vortex-only results
 significant_improvements = (vortex_df["ratio"] < improvement_threshold).sum()
 significant_regressions = (vortex_df["ratio"] > regression_threshold).sum()