
Commit 0182ef9

Some vx-bench fixes

Signed-off-by: Adam Gutglick <[email protected]>
1 parent eab8dcb commit 0182ef9

File tree: 5 files changed, +19 −85 lines

bench-orchestrator/README.md

Lines changed: 6 additions & 34 deletions
@@ -65,7 +65,6 @@ vx-bench compare [options]
 - `--base, -b`: Base reference (`engine:format@run`)
 - `--target, -t`: Target reference (`engine:format@run`)
 - `--threshold`: Significance threshold (default: 0.10 = 10%)
-- `--markdown, -m`: Output as GitHub-compatible markdown
 
 ### `list` - List Benchmark Runs
 
@@ -129,7 +128,7 @@ git checkout feature/my-optimization
 vx-bench run tpch -e datafusion -f parquet,vortex -l feature
 
 # Compare the runs
-vx-bench compare --runs baseline,feature --markdown
+vx-bench compare --runs baseline,feature
 ```
 
 ### 2. Quick Regression Check
@@ -172,25 +171,9 @@ vx-bench run tpch \
 vx-bench compare \
   --base "datafusion:parquet@format-analysis" \
   --target "datafusion:vortex@format-analysis" \
-  --markdown
 ```
 
-### 5. CI/CD Integration
-
-Generate markdown output for pull request comments:
-
-```bash
-# Run benchmarks
-vx-bench run tpch -e datafusion -f parquet,vortex -l "pr-$PR_NUMBER"
-
-# Generate comparison for PR comment
-vx-bench compare \
-  --base "datafusion:parquet@main" \
-  --target "datafusion:parquet@pr-$PR_NUMBER" \
-  --markdown > benchmark-report.md
-```
-
-### 6. Memory Usage Analysis
+### 5. Memory Usage Analysis
 
 Track memory usage alongside performance:
 
@@ -204,7 +187,7 @@ vx-bench run tpch \
 vx-bench show memory-profiling
 ```
 
-### 7. Scale Factor Testing
+### 6. Scale Factor Testing
 
 Test performance at different data scales:
 
@@ -219,7 +202,7 @@ vx-bench run tpch -s 10 -l sf10
 vx-bench compare --runs sf1,sf10
 ```
 
-### 8. Excluding Problematic Queries
+### 7. Excluding Problematic Queries
 
 Skip queries that are known to fail or take too long:
 
@@ -228,7 +211,7 @@ Skip queries that are known to fail or take too long:
 vx-bench run tpch --exclude-queries 15,21 -l partial-run
 ```
 
-### 9. Historical Analysis
+### 8. Historical Analysis
 
 Find runs from the past week and compare trends:
 
@@ -240,7 +223,7 @@ vx-bench list --since "7 days" --benchmark tpch
 vx-bench compare --runs <run-id-1>,<run-id-2>
 ```
 
-### 10. Cleanup Old Results
+### 9. Cleanup Old Results
 
 Keep your results directory manageable:
 
@@ -291,17 +274,6 @@ Default output uses rich formatting with color-coded ratios:
 - Red (with down arrow): Regression (>10% slower)
 - Yellow: Neutral (within 10%)
 
-### Markdown Output
-
-Use `--markdown` for GitHub-compatible tables suitable for PR comments:
-
-```markdown
-| Query | base | target | Ratio |
-|-------|------|--------|-------|
-| q1 | 1.2s | 0.9s | 0.750x |
-| q2 | 2.5s | 2.6s | 1.040x |
-```
-
 ## Data Storage
 
 Results are stored in `<workspace>/target/vortex-bench/runs/`. Each run creates a directory containing:
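
Note: the `runs/` layout above pairs with the `metadata.json` that `store.py` writes when a run context exits (see that file's diff below). A minimal sketch for inspecting stored runs from Python — the paths and the `metadata.json` filename come from this commit, while treating `partial` as a top-level JSON key is an assumption based on the metadata model:

```python
# Sketch: list stored benchmark runs and their status.
# runs/ path and metadata.json come from this commit; reading
# "partial" as a top-level JSON key is an assumption.
import json
from pathlib import Path

runs_root = Path("target/vortex-bench/runs")
for run_dir in sorted(p for p in runs_root.iterdir() if p.is_dir()):
    meta_path = run_dir / "metadata.json"
    if not meta_path.exists():
        continue  # run crashed before metadata was written
    meta = json.loads(meta_path.read_text())
    status = "partial" if meta.get("partial") else "complete"
    print(f"{run_dir.name}: {status}")
```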

bench-orchestrator/bench_orchestrator/cli.py

Lines changed: 10 additions & 16 deletions
@@ -76,7 +76,7 @@ def run(
     exclude_list = parse_queries(exclude_queries)
 
     # Build options dict
-    options = {}
+    options: dict[str, str] = {}
     if scale_factor:
         options["scale_factor"] = scale_factor
 
@@ -170,7 +170,6 @@ def compare(
         typer.Option("--runs", "-r", help="Two runs to compare (comma-separated)"),
     ] = None,
     threshold: Annotated[float, typer.Option("--threshold", help="Significance threshold (default 10%)")] = 0.10,
-    markdown: Annotated[bool, typer.Option("--markdown", "-m", help="Output as markdown")] = False,
 ) -> None:
     """Compare benchmark results."""
     store = ResultStore()
@@ -244,18 +243,13 @@ def compare(
 
     reporter = BenchmarkReporter(comparison, stats, threshold)
 
-    if markdown:
-        console.print(reporter.summary())
-        console.print()
-        console.print(reporter.to_markdown(base_label, target_label))
-    else:
-        table = reporter.to_rich_table(
-            title="Benchmark Comparison",
-            base_label=base_label,
-            target_label=target_label,
-        )
-        console.print(table)
-        reporter.print_summary()
+    table = reporter.to_rich_table(
+        title="Benchmark Comparison",
+        base_label=base_label,
+        target_label=target_label,
+    )
+    console.print(table)
+    reporter.print_summary()
 
 
 @app.command("list")
@@ -288,7 +282,7 @@ def list_runs(
         return
 
     table = Table(title="Benchmark Runs")
-    table.add_column("Run ID", style="cyan")
+    table.add_column("Run ID", style="cyan", no_wrap=True)
    table.add_column("Label", style="green")
    table.add_column("Benchmark")
    table.add_column("Engines")
@@ -298,7 +292,7 @@ def list_runs(
     for run in runs:
         status = "[yellow]partial[/yellow]" if run.partial else "[green]complete[/green]"
         table.add_row(
-            run.run_id[:30] + "..." if len(run.run_id) > 30 else run.run_id,
+            run.run_id,
             run.label or "-",
             run.benchmark,
             ", ".join(run.engines),

bench-orchestrator/bench_orchestrator/comparison/reporter.py

Lines changed: 0 additions & 32 deletions
@@ -96,38 +96,6 @@ def to_rich_table(
 
         return table
 
-    def to_markdown(
-        self,
-        base_label: str = "base",
-        target_label: str = "target",
-    ) -> str:
-        """Generate markdown table (GitHub-compatible)."""
-        lines = []
-
-        # Header
-        lines.append(f"| Query | {base_label} | {target_label} | Ratio |")
-        lines.append("|-------|---------|--------|-------|")
-
-        for _, row in self.df.iterrows():
-            name = str(row.get("name", ""))
-            if "/" in name:
-                name = name.split("/")[0]
-
-            base_val = row.get("value_base", float("nan"))
-            target_val = row.get("value_target", float("nan"))
-            ratio = row.get("ratio", float("nan"))
-
-            ratio_str = f"{ratio:.3f}x" if not pd.isna(ratio) else "N/A"
-            if not pd.isna(ratio):
-                if ratio < (1.0 - self.threshold):
-                    ratio_str += " \U0001f680"  # Rocket
-                elif ratio > (1.0 + self.threshold):
-                    ratio_str += " \U0001f6a8"  # Alarm
-
-            lines.append(f"| {name} | {_format_time_ns(base_val)} | {_format_time_ns(target_val)} | {ratio_str} |")
-
-        return "\n".join(lines)
-
     def summary(self) -> str:
         """Generate summary statistics."""
         lines = ["## Summary", ""]

bench-orchestrator/bench_orchestrator/storage/store.py

Lines changed: 2 additions & 3 deletions
@@ -81,8 +81,8 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
         # Mark as partial if there was an exception
         if exc_type is not None:
             self.metadata.partial = True
-        else:
-            self.metadata.completed_at = datetime.now()
+
+        self.metadata.completed_at = datetime.now()
 
         # Write metadata
         with open(self.run_dir / "metadata.json", "w") as f:
@@ -146,7 +146,6 @@ def create_run(self, config: RunConfig, build_config: BuildConfig) -> Iterator[R
             env_triple=env_triple,
             rustflags=build_config.rustflags,
             profile=build_config.profile,
-            partial=True,  # Will be set to False on successful completion
         )
 
         ctx = RunContext(run_dir, metadata)
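
Net effect of the two `store.py` hunks: `partial` now starts false and is only flipped when the context manager exits with an exception, while `completed_at` is stamped unconditionally. A minimal sketch of the revised exit path, with a simplified stand-in for the real metadata model:

```python
# Sketch of the revised RunContext exit semantics; Metadata is a
# simplified stand-in for the real metadata model.
from dataclasses import dataclass
from datetime import datetime


@dataclass
class Metadata:
    partial: bool = False  # no longer initialized to True in create_run
    completed_at: datetime | None = None


class RunContext:
    def __init__(self, metadata: Metadata) -> None:
        self.metadata = metadata

    def __enter__(self) -> "RunContext":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Mark as partial if there was an exception
        if exc_type is not None:
            self.metadata.partial = True
        # Always record an end time, even for failed runs
        self.metadata.completed_at = datetime.now()
```

With this shape, a run that raises still records when it ended but is listed as partial.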

vortex-bench/src/statpopgen/statpopgen_benchmark.rs

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@ impl Benchmark for StatPopGenBenchmark {
         Ok(())
     }
 
+    #[allow(clippy::cast_possible_truncation)]
     fn expected_row_counts(&self) -> Option<Vec<usize>> {
         let n_rows = self.n_rows as usize;
         match self.scale_factor {
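
The new `#[allow]` presumably covers the `self.n_rows as usize` cast in `expected_row_counts`: clippy's `cast_possible_truncation` fires when the source type can be wider than the target on some platforms (e.g. a `u64` count cast to `usize` on a 32-bit target), and the attribute acknowledges that the truncation is acceptable here.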
