6 | 6 | from datetime import datetime, timedelta
7 | 7 | from typing import Annotated
8 | 8 |
| 9 | +import pandas as pd |
9 | 10 | import typer |
10 | 11 | from rich.console import Console |
11 | 12 | from rich.table import Table |
12 | 13 |
13 | | -from .comparison.analyzer import BenchmarkAnalyzer, TargetRef |
14 | | -from .comparison.reporter import BenchmarkReporter |
| 14 | +from .comparison import analyzer |
| 15 | +from .comparison.reporter import pivot_comparison_table |
15 | 16 | from .config import ( |
16 | 17 | ENGINE_FORMATS, |
17 | 18 | Benchmark, |
@@ -49,6 +50,10 @@ def parse_queries(value: str | None) -> list[int] | None:
49 | 50 | return [int(q.strip()) for q in value.split(",")]
50 | 51 |
51 | 52 |
| 53 | +def run_ref_auto_complete() -> list[str]: |
| 54 | + return [r.run_id for r in ResultStore().list_runs(limit=None)]
| 55 | + |
| 56 | + |
52 | 57 | @app.command() |
53 | 58 | def run( |
54 | 59 | benchmark: Annotated[Benchmark, typer.Argument(help="Benchmark suite to run")], |
@@ -154,103 +159,124 @@ def run( |
154 | 159 |
155 | 160 | console.print(f"\n[green]Results saved to run: {ctx.metadata.run_id}[/green]") |
156 | 161 |
| 162 | + # Show comparison table if we have multiple engine:format combinations |
| 163 | + df = store.load_results(ctx.metadata.run_id) |
| 164 | + if not df.empty: |
| 165 | + try: |
| 166 | + pivot = analyzer.compare_within_run(df) |
| 167 | + table = pivot_comparison_table(pivot) |
| 168 | + console.print() |
| 169 | + console.print(table) |
| 170 | + except ValueError: |
| 171 | + # Not enough combinations to compare |
| 172 | + pass |
| 173 | + |
157 | 174 |
158 | 175 | @app.command() |
159 | 176 | def compare( |
160 | | - base: Annotated[ |
| 177 | + runs: Annotated[ |
161 | 178 | str | None, |
162 | | - typer.Option("--base", "-b", help="Base reference (engine:format@run)"), |
| 179 | + typer.Option("--runs", "-r", help="Runs to compare (comma-separated, 2 or more)"), |
163 | 180 | ] = None, |
164 | | - target: Annotated[ |
| 181 | + run: Annotated[ |
165 | 182 | str | None, |
166 | | - typer.Option("--target", "-t", help="Target reference (engine:format@run)"), |
| 183 | + typer.Option("--run", help="Single run for within-run comparison", autocompletion=run_ref_auto_complete), |
167 | 184 | ] = None, |
168 | | - runs: Annotated[ |
| 185 | + baseline: Annotated[ |
169 | 186 | str | None, |
170 | | - typer.Option("--runs", "-r", help="Two runs to compare (comma-separated)"), |
| 187 | + typer.Option("--baseline", help="Baseline engine:format for within-run comparison"), |
171 | 188 | ] = None, |
172 | 189 | threshold: Annotated[float, typer.Option("--threshold", help="Significance threshold (default 10%)")] = 0.10, |
| 190 | + filter_engine: Annotated[ |
| 191 | + str | None, typer.Option("--engine", help="Filter only for results that use a specific engine") |
| 192 | + ] = None, |
| 193 | + filter_format: Annotated[ |
| 194 | + str | None, typer.Option("--format", help="Filter only for results that use a specific file format") |
| 195 | + ] = None, |
173 | 196 | ) -> None: |
174 | 197 | """Compare benchmark results.""" |
175 | 198 | store = ResultStore() |
176 | 199 |
177 | | - if runs: |
178 | | - # Compare two full runs |
179 | | - run_refs = [r.strip() for r in runs.split(",")] |
180 | | - if len(run_refs) != 2: |
181 | | - console.print("[red]--runs requires exactly two run references[/red]") |
| 200 | + if run: |
| 201 | + # Within-run comparison |
| 202 | + run_meta = store.get_run(run) |
| 203 | + if not run_meta: |
| 204 | + console.print(f"[red]Run not found: {run}[/red]") |
182 | 205 | raise typer.Exit(1) |
183 | 206 |
184 | | - base_run = store.get_run(run_refs[0]) |
185 | | - target_run = store.get_run(run_refs[1]) |
| 207 | + df = store.load_results(run_meta.run_id) |
186 | 208 |
187 | | - if not base_run: |
188 | | - console.print(f"[red]Run not found: {run_refs[0]}[/red]") |
| 209 | + if df.empty: |
| 210 | + console.print("[red]No results found[/red]") |
189 | 211 | raise typer.Exit(1) |
190 | | - if not target_run: |
191 | | - console.print(f"[red]Run not found: {run_refs[1]}[/red]") |
192 | | - raise typer.Exit(1) |
193 | | - |
194 | | - base_df = store.load_results(base_run.run_id) |
195 | | - target_df = store.load_results(target_run.run_id) |
196 | 212 |
197 | | - base_label = base_run.label or base_run.run_id[:20] |
198 | | - target_label = target_run.label or target_run.run_id[:20] |
| 213 | + # Parse baseline if provided |
| 214 | + baseline_engine = None |
| 215 | + baseline_format = None |
| 216 | + if baseline: |
| 217 | + if ":" in baseline: |
| 218 | + baseline_engine, baseline_format = baseline.split(":", 1) |
| 219 | + else: |
| 220 | + console.print("[red]--baseline must be engine:format[/red]") |
| 221 | + raise typer.Exit(1) |
199 | 222 |
200 | | - elif base and target: |
201 | | - # Compare specific configurations |
202 | | - base_ref = TargetRef.parse(base) |
203 | | - target_ref = TargetRef.parse(target) |
| 223 | + try: |
| 224 | + pivot = analyzer.compare_within_run(df, baseline_engine, baseline_format, filter_engine, filter_format) |
| 225 | + except ValueError as e: |
| 226 | + console.print(f"[red]{e}[/red]") |
| 227 | + raise typer.Exit(1) |
204 | 228 |
205 | | - base_run = store.get_run(base_ref.run) |
206 | | - target_run = store.get_run(target_ref.run) |
| 229 | + table = pivot_comparison_table(pivot, threshold) |
| 230 | + console.print(table) |
| 231 | + return |
207 | 232 |
208 | | - if not base_run: |
209 | | - console.print(f"[red]Run not found: {base_ref.run}[/red]") |
210 | | - raise typer.Exit(1) |
211 | | - if not target_run: |
212 | | - console.print(f"[red]Run not found: {target_ref.run}[/red]") |
| 233 | + elif runs: |
| 234 | + # Compare multiple runs (2 or more) |
| 235 | + run_refs = [r.strip() for r in runs.split(",")] |
| 236 | + if len(run_refs) < 2: |
| 237 | + console.print("[red]--runs requires at least two run references[/red]") |
213 | 238 | raise typer.Exit(1) |
214 | 239 |
215 | | - base_df = store.load_results(base_run.run_id) |
216 | | - target_df = store.load_results(target_run.run_id) |
217 | | - |
218 | | - # Apply filters |
219 | | - base_analyzer = BenchmarkAnalyzer(base_df) |
220 | | - target_analyzer = BenchmarkAnalyzer(target_df) |
| 240 | + # Load all runs |
| 241 | + run_data: list[tuple[str, pd.DataFrame]] = [] |
| 242 | + for ref in run_refs: |
| 243 | + run_meta = store.get_run(ref) |
| 244 | + if not run_meta: |
| 245 | + console.print(f"[red]Run not found: {ref}[/red]") |
| 246 | + raise typer.Exit(1) |
| 247 | + label = run_meta.label or run_meta.run_id[:16] |
| 248 | + df = store.load_results(run_meta.run_id) |
| 249 | + if df.empty: |
| 250 | + console.print(f"[red]No results for run: {ref}[/red]") |
| 251 | + raise typer.Exit(1) |
| 252 | + run_data.append((label, df)) |
| 253 | + |
| 254 | + # Use baseline option if provided, otherwise first run |
| 255 | + baseline_label = None |
| 256 | + if baseline: |
| 257 | + # Find matching label |
| 258 | + for label, _ in run_data: |
| 259 | + if baseline in label: |
| 260 | + baseline_label = label |
| 261 | + break |
| 262 | + if baseline_label is None: |
| 263 | + console.print(f"[red]Baseline not found: {baseline}[/red]") |
| 264 | + raise typer.Exit(1) |
221 | 265 |
222 | | - base_df = base_analyzer.filter_by_ref(base_ref) |
223 | | - target_df = target_analyzer.filter_by_ref(target_ref) |
| 266 | + try: |
| 267 | + pivot = analyzer.compare_runs(run_data, baseline_label, filter_engine, filter_format) |
| 268 | + except ValueError as e: |
| 269 | + console.print(f"[red]{e}[/red]") |
| 270 | + raise typer.Exit(1) |
224 | 271 |
225 | | - base_label = base |
226 | | - target_label = target |
| 272 | + table = pivot_comparison_table(pivot, threshold, row_keys=["query", "engine", "format"]) |
| 273 | + console.print(table) |
| 274 | + return |
227 | 275 |
228 | 276 | else: |
229 | | - console.print("[red]Must specify either --runs or --base/--target[/red]") |
230 | | - raise typer.Exit(1) |
231 | | - |
232 | | - if base_df.empty: |
233 | | - console.print("[red]No results found for base[/red]") |
234 | | - raise typer.Exit(1) |
235 | | - if target_df.empty: |
236 | | - console.print("[red]No results found for target[/red]") |
| 277 | + console.print("[red]Must specify either --runs or --run[/red]") |
237 | 278 | raise typer.Exit(1) |
238 | 279 |
239 | | - # Perform comparison |
240 | | - analyzer = BenchmarkAnalyzer(base_df) |
241 | | - comparison = analyzer.compare(base_df, target_df) |
242 | | - stats = analyzer.summary_stats(comparison) |
243 | | - |
244 | | - reporter = BenchmarkReporter(comparison, stats, threshold) |
245 | | - |
246 | | - table = reporter.to_rich_table( |
247 | | - title="Benchmark Comparison", |
248 | | - base_label=base_label, |
249 | | - target_label=target_label, |
250 | | - ) |
251 | | - console.print(table) |
252 | | - reporter.print_summary() |
253 | | - |
254 | 280 |
255 | 281 | @app.command("list") |
256 | 282 | def list_runs( |
@@ -305,7 +331,7 @@ def list_runs( |
305 | 331 |
306 | 332 | @app.command() |
307 | 333 | def show( |
308 | | - run_ref: Annotated[str, typer.Argument(help="Run ID, label, or 'latest'")], |
| 334 | + run_ref: Annotated[str, typer.Argument(help="Run ID, label, or 'latest'", autocompletion=run_ref_auto_complete)], |
309 | 335 | ) -> None: |
310 | 336 | """Show details of a specific run.""" |
311 | 337 | store = ResultStore() |
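
For reviewers without the comparison module open, here is a minimal, hypothetical sketch of a within-run comparison in the shape this diff calls it: a pivot of per-query timings by engine:format, optionally filtered and normalized against a baseline combination. The column names (query, engine, format, duration_s), the median aggregation, and the ratio-to-baseline output are assumptions; the real analyzer.compare_within_run in .comparison.analyzer may differ.

import pandas as pd


def compare_within_run_sketch(
    df: pd.DataFrame,
    baseline_engine: str | None = None,
    baseline_format: str | None = None,
    filter_engine: str | None = None,
    filter_format: str | None = None,
) -> pd.DataFrame:
    # Assumed result columns: query, engine, format, duration_s.
    if filter_engine:
        df = df[df["engine"] == filter_engine]
    if filter_format:
        df = df[df["format"] == filter_format]
    pivot = df.assign(combo=df["engine"] + ":" + df["format"]).pivot_table(
        index="query", columns="combo", values="duration_s", aggfunc="median"
    )
    if len(pivot.columns) < 2:
        raise ValueError("Need at least two engine:format combinations to compare")
    baseline = (
        f"{baseline_engine}:{baseline_format}"
        if baseline_engine and baseline_format
        else str(pivot.columns[0])
    )
    # Express each combination's timing as a ratio of the baseline column.
    return pivot.div(pivot[baseline], axis=0)

A ValueError from this kind of check is what the except ValueError: pass in run() above is there to swallow when only one engine:format combination was benchmarked.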
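In the same hedged spirit, a sketch of rendering such a pivot with Rich, matching how pivot_comparison_table(pivot, threshold, ...) is used above. The real implementation in .comparison.reporter takes extra arguments (e.g. row_keys) and may lay the table out differently; the ratio-vs-baseline coloring below is an assumption.

import pandas as pd
from rich.table import Table


def pivot_comparison_table_sketch(pivot: pd.DataFrame, threshold: float = 0.10) -> Table:
    # One row per query, one column per engine:format combination (ratios vs. baseline).
    table = Table(title="Benchmark Comparison")
    table.add_column("query")
    for combo in pivot.columns:
        table.add_column(str(combo), justify="right")
    for query, row in pivot.iterrows():
        cells = []
        for value in row:
            # Flag ratios that deviate from the baseline by more than the threshold.
            if value > 1 + threshold:
                cells.append(f"[red]{value:.2f}x[/red]")
            elif value < 1 - threshold:
                cells.append(f"[green]{value:.2f}x[/green]")
            else:
                cells.append(f"{value:.2f}x")
        table.add_row(str(query), *cells)
    return table

console.print(table) on the result then renders it, as the compare command does above.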