Commit faf6346

fix: pr benchmark results (#4141)
To disambiguate different TPC-H runs, include the scale factor in the dataset name. In addition, we filter out duplicate entries, which can result from nightly runs when the commit is identical to the base commit of the PR, to avoid duplicate result rows.

Signed-off-by: Alexander Droste <[email protected]>
1 parent 78e085e commit faf6346

2 files changed: +9 −5 lines

bench-vortex/src/datasets/mod.rs

Lines changed: 5 additions & 5 deletions
@@ -39,12 +39,12 @@ pub enum BenchmarkDataset {
 }
 
 impl BenchmarkDataset {
-    pub fn name(&self) -> &str {
+    pub fn name(&self) -> String {
         match self {
-            BenchmarkDataset::TpcH { .. } => "tpch",
-            BenchmarkDataset::TpcDS { .. } => "tpcds",
-            BenchmarkDataset::ClickBench { .. } => "clickbench",
-            BenchmarkDataset::PublicBi { .. } => "public-bi",
+            BenchmarkDataset::TpcH { scale_factor } => format!("tpch_sf{scale_factor}"),
+            BenchmarkDataset::TpcDS { scale_factor } => format!("tpcds_sf{scale_factor}"),
+            BenchmarkDataset::ClickBench { .. } => "clickbench".to_string(),
+            BenchmarkDataset::PublicBi { .. } => "public-bi".to_string(),
         }
     }
 }

scripts/compare-benchmark-jsons.py

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,10 @@
 base = pd.read_json(sys.argv[1], lines=True)
 pr = pd.read_json(sys.argv[2], lines=True)
 
+# Filter duplicate entries which can result from nightly
+# runs if the commit is identical to the base commit of the PR.
+base = base.drop_duplicates(subset=["name", "storage"], keep="last")
+
 base_commit_id = set(base["commit_id"].unique())
 pr_commit_id = set(pr["commit_id"].unique())
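
To make the effect concrete, here is a minimal sketch (not part of the commit) of how the two changes combine in the comparison step. It assumes the benchmark JSON lines contain at least the name, storage, and commit_id columns used by the script above, that the name column carries the scale-factor-qualified dataset name, and that scale_factor formats as a plain integer; the storage value and timing column are made up for illustration.

import pandas as pd

# Hypothetical base rows in which each benchmark appears twice because a
# nightly run used the same commit as the PR's base commit. Dataset names
# now carry the scale factor ("tpch_sf1" vs "tpch_sf10"), so runs at
# different scale factors no longer collapse onto a single "tpch" key.
base = pd.DataFrame([
    {"name": "tpch_sf1",  "storage": "nvme", "commit_id": "78e085e", "time_s": 1.21},
    {"name": "tpch_sf10", "storage": "nvme", "commit_id": "78e085e", "time_s": 11.80},
    # duplicates produced by the nightly run on the identical commit
    {"name": "tpch_sf1",  "storage": "nvme", "commit_id": "78e085e", "time_s": 1.19},
    {"name": "tpch_sf10", "storage": "nvme", "commit_id": "78e085e", "time_s": 11.95},
])

# Same call as in scripts/compare-benchmark-jsons.py: keep only the last
# row per (name, storage) pair, dropping the earlier duplicates.
base = base.drop_duplicates(subset=["name", "storage"], keep="last")
print(base)  # two rows remain, one per dataset/storage combination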
