1 change: 1 addition & 0 deletions AUTHORS.rst
@@ -43,3 +43,4 @@ Authors
* Enno Gotthold - https://github.com/SchoolGuy
* Thomas B. Brunner - https://github.com/thomasbbrunner
* Hugo van Kemenade - https://github.com/hugovk
* Aarni Koskela - https://github.com/akx
23 changes: 23 additions & 0 deletions docs/comparing.rst
@@ -38,6 +38,29 @@ Example::

pytest-benchmark compare 0001 0002

Comparing between source files
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When comparing benchmarks from multiple files (e.g. a ``main`` branch run vs. a feature branch run),
the default output shows all benchmarks in a single flat table.
The ``--compare-between`` flag pivots the table so that each row is a unique benchmark,
with columns showing the metric value from each source file and the relative change::

pytest-benchmark compare --compare-between 0001 0002

Example output::

----------------------- benchmark: 9 tests, 2 sources ------------------------
Name (time in ns) 0001_f41c0c7 Min 0002_8e68892 Min Change
------------------------------------------------------------------------------
test_getattr_thread_critical 790.93 245.80 -68.9%
test_setattr_thread_critical 899.99 254.15 -71.8%
...
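
The ``Change`` column shows the percentage change relative to the first source, computed as ``(new - old) / old * 100``; for the first row above, ``(245.80 - 790.93) / 790.93 * 100`` is roughly ``-68.9%``. For time-based metrics a negative change is an improvement, while for ``ops`` (operations per second) higher values are better.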

You can control which metrics are shown per source with ``--columns`` and the sort order with ``--sort``::

pytest-benchmark compare --compare-between --sort=mean --columns=min,mean 0001 0002
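
With more than one metric, or more than two source files, the change columns are labelled per source and metric (for example ``Chg:0002_b/Min``) so that each comparison remains unambiguous.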

Plotting
--------

18 changes: 16 additions & 2 deletions src/pytest_benchmark/cli.py
@@ -19,6 +19,7 @@
from .plugin import add_display_options
from .plugin import add_global_options
from .plugin import add_histogram_options
from .table import CompareBetweenResults
from .table import TableResults
from .utils import NAME_FORMATTERS
from .utils import first_or_value
@@ -108,6 +109,11 @@ def make_parser():
)
add_display_options(compare_command.add_argument, prefix='')
add_histogram_options(compare_command.add_argument, prefix='')
compare_command.add_argument(
'--compare-between',
Owner:
This should use the prefix.

Author:
The prefix seems to be forced to '' for all compare subcommand options (and there's no prefix available in this function, as far as I can see?).

action='store_true',
help='Compare same-named benchmarks across different source files.',
)
add_glob_or_file(compare_command.add_argument)
add_csv_options(compare_command.add_argument, prefix='')

@@ -148,10 +154,18 @@ def main():
for file in storage.query():
print(file)
elif args.command == 'compare':
results_table = TableResults(
histogram = first_or_value(args.histogram, False)
if args.compare_between:
if histogram:
parser.error('--compare-between is not compatible with --histogram')
results_table_cls = CompareBetweenResults
else:
results_table_cls = TableResults

results_table = results_table_cls(
columns=args.columns,
sort=args.sort,
histogram=first_or_value(args.histogram, False),
histogram=histogram,
name_format=NAME_FORMATTERS[args.name],
logger=logger,
scale_unit=partial(
179 changes: 151 additions & 28 deletions src/pytest_benchmark/table.py
@@ -11,6 +11,20 @@

NUMBER_FMT = '{0:,.4f}'
ALIGNED_NUMBER_FMT = '{0:>{1},.4f}{2:<{3}}'
STAT_PROPS = ('min', 'max', 'mean', 'median', 'iqr', 'stddev', 'ops')


def compute_best_worst(benchmarks, progress_reporter, tr, line):
worst = {}
best = {}
for line1, prop in progress_reporter(STAT_PROPS, tr, '{line}: {value}', line=line):
# For 'ops', higher is better; for time-based metrics, lower is better
best_fn, worst_fn = (max, min) if prop == 'ops' else (min, max)
values = progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)
best[prop] = best_fn(bench[prop] for _, bench in values)
values = progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)
worst[prop] = worst_fn(bench[prop] for _, bench in values)
return best, worst


class TableResults:
@@ -22,6 +36,24 @@ def __init__(self, columns, sort, histogram, name_format, logger, scale_unit):
self.logger = logger
self.scale_unit = scale_unit

def compute_scale(self, benchmarks, best, worst):
unit, adjustment = self.scale_unit(unit='seconds', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
ops_unit, ops_adjustment = self.scale_unit(unit='operations', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
labels = {
'name': f'Name (time in {unit}s)',
'min': 'Min',
'max': 'Max',
'mean': 'Mean',
'stddev': 'StdDev',
'rounds': 'Rounds',
'iterations': 'Iterations',
'iqr': 'IQR',
'median': 'Median',
'outliers': 'Outliers',
'ops': f'OPS ({ops_unit}ops/s)' if ops_unit else 'OPS',
}
return unit, adjustment, ops_adjustment, labels

def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line('')
report_online_progress(progress_reporter, tr, 'Computing stats ...')
@@ -30,47 +62,24 @@ def display(self, tr, groups, progress_reporter=report_progress):
for bench in benchmarks:
bench['name'] = self.name_format(bench)

worst = {}
best = {}
solo = len(benchmarks) == 1
for line1, prop in progress_reporter(
('min', 'max', 'mean', 'median', 'iqr', 'stddev', 'ops'), tr, '{line}: {value}', line=line
):
if prop == 'ops':
worst[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
else:
worst[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best, worst = compute_best_worst(benchmarks, progress_reporter, tr, line)
for line1, prop in progress_reporter(('outliers', 'rounds', 'iterations'), tr, '{line}: {value}', line=line):
worst[prop] = max(
benchmark[prop] for _, benchmark in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)
)

unit, adjustment = self.scale_unit(unit='seconds', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
ops_unit, ops_adjustment = self.scale_unit(unit='operations', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
labels = {
'name': f'Name (time in {unit}s)',
'min': 'Min',
'max': 'Max',
'mean': 'Mean',
'stddev': 'StdDev',
'rounds': 'Rounds',
'iterations': 'Iterations',
'iqr': 'IQR',
'median': 'Median',
'outliers': 'Outliers',
'ops': f'OPS ({ops_unit}ops/s)' if ops_unit else 'OPS',
}
unit, adjustment, ops_adjustment, labels = self.compute_scale(benchmarks, best, worst)
widths = {
'name': 3 + max(len(labels['name']), max(len(benchmark['name']) for benchmark in benchmarks)),
'rounds': 2 + max(len(labels['rounds']), len(str(worst['rounds']))),
'iterations': 2 + max(len(labels['iterations']), len(str(worst['iterations']))),
'outliers': 2 + max(len(labels['outliers']), len(str(worst['outliers']))),
'ops': 2 + max(len(labels['ops']), len(NUMBER_FMT.format(best['ops'] * ops_adjustment))),
}
for prop in 'min', 'max', 'mean', 'stddev', 'median', 'iqr':
widths[prop] = 2 + max(len(labels[prop]), max(len(NUMBER_FMT.format(bench[prop] * adjustment)) for bench in benchmarks))
for prop in STAT_PROPS:
if prop not in widths:
widths[prop] = 2 + max(len(labels[prop]), max(len(NUMBER_FMT.format(bench[prop] * adjustment)) for bench in benchmarks))

rpadding = 0 if solo else 10
labels_line = labels['name'].ljust(widths['name']) + ''.join(
@@ -134,6 +143,120 @@ def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line(' OPS: Operations Per Second, computed as 1 / Mean')


class CompareBetweenResults(TableResults):
def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line('')
for line, (group, benchmarks) in progress_reporter(groups, tr, 'Computing stats ... group {pos}/{total}'):
# Collect sources in order of first appearance and build fullname -> {source: bench} mapping
sources = list(dict.fromkeys(bench.get('source', '') for bench in benchmarks))
bench_map = {}
for bench in benchmarks:
bench_map.setdefault(bench['fullname'], {})[bench.get('source', '')] = bench

if len(sources) < 2:
tr.write_line(f'ERROR: --compare-between requires at least 2 source files, got {len(sources)}.', red=True)
continue

# Format source labels (short)
source_labels = []
for src in sources:
parts = src.split('/')
label = parts[-1][:12] if parts[-1] else src[:12]
source_labels.append(label)

metrics = [c for c in self.columns if c in STAT_PROPS]
if not metrics:
metrics = [self.sort] if self.sort in STAT_PROPS else ['min']

all_benches = list(benchmarks)
best, worst = compute_best_worst(all_benches, progress_reporter, tr, line)
_unit, adjustment, ops_adjustment, labels = self.compute_scale(all_benches, best, worst)
adjustments = {m: (ops_adjustment if m == 'ops' else adjustment) for m in metrics}

# Sort benchmarks by the sort metric from the first source
first_source = sources[0]
sorted_names = sorted(
bench_map,
key=lambda name: bench_map[name].get(first_source, {}).get(self.sort, float('inf')),
)

# Format benchmark display names without source suffix
display_names = {}
for fullname in sorted_names:
any_bench = next(iter(bench_map[fullname].values()))
display_names[fullname] = self.name_format({**any_bench, 'source': ''})

name_width = 3 + max(len(labels['name']), max(len(n) for n in display_names.values()))

# Build column specs: (source_idx, metric, label, width) for each metric x source
col_specs = []
for metric in metrics:
adj = adjustments[metric]
for si, (src, slabel) in enumerate(zip(sources, source_labels)):
label = f'{slabel} {labels[metric]}'
max_val_width = len(label)
for fullname in sorted_names:
bench = bench_map[fullname].get(src)
if bench is not None:
max_val_width = max(max_val_width, len(NUMBER_FMT.format(bench[metric] * adj)))
col_specs.append((si, metric, label, max_val_width + 2))

# Build change column specs: (metric, source_idx, label, width) comparing first source to each subsequent
use_simple_label = len(metrics) == 1 and len(sources) == 2
change_specs = []
for metric in metrics:
for si in range(1, len(sources)):
label = 'Change' if use_simple_label else f'Chg:{source_labels[si][:6]}/{labels[metric]}'
change_specs.append((metric, si, label, max(len(label) + 2, 10)))

# Build and print header
header = labels['name'].ljust(name_width)
for _si, _metric, label, width in col_specs:
header += label.rjust(width)
for _metric, _si, label, width in change_specs:
header += label.rjust(width)

group_name = '' if group is None else f' {group!r}'
tr.write_line(
f' benchmark{group_name}: {len(sorted_names)} tests, {len(sources)} sources '.center(len(header), '-'),
yellow=True,
)
tr.write_line(header)
tr.write_line('-' * len(header), yellow=True)

# Print rows
for fullname in sorted_names:
tr.write(display_names[fullname].ljust(name_width))
row_values = {}
for si, metric, _label, width in col_specs:
bench = bench_map[fullname].get(sources[si])
if bench is None:
tr.write('N/A'.rjust(width))
else:
row_values[(si, metric)] = bench[metric]
tr.write(NUMBER_FMT.format(bench[metric] * adjustments[metric]).rjust(width))

for metric, si, _label, width in change_specs:
base_val = row_values.get((0, metric))
cmp_val = row_values.get((si, metric))
if base_val is None or cmp_val is None or base_val == 0:
tr.write('N/A'.rjust(width))
else:
pct = (cmp_val - base_val) / abs(base_val) * 100
# For ops, higher is better; for time metrics, lower is better
is_improvement = (pct > 0) if metric == 'ops' else (pct < 0)
is_regression = (pct < 0) if metric == 'ops' else (pct > 0)
tr.write(f'{pct:+.1f}%'.rjust(width), green=is_improvement, red=is_regression, bold=True)
tr.write('\n')

tr.write_line('-' * len(header), yellow=True)
tr.write_line('')

tr.write_line('Legend:')
tr.write_line(' Change: percentage change from first source to subsequent sources.')
tr.write_line(' Green: improvement, Red: regression.')


def compute_baseline_scale(baseline, value, width):
if not width:
return ''
45 changes: 44 additions & 1 deletion tests/test_cli.py
@@ -93,7 +93,7 @@ def test_help_compare(testdir, args):
' [--columns LABELS] [--name FORMAT]',
' [--time-unit COLUMN]',
' [--histogram [FILENAME-PREFIX]]',
' [--csv [FILENAME]]',
' [--compare-between] [--csv [FILENAME]]',
' [[]glob_or_file *[]]',
'',
'Compare saved runs.',
@@ -122,6 +122,8 @@ def test_help_compare(testdir, args):
' FILENAME-PREFIX-test_name.svg. If FILENAME-PREFIX',
" contains slashes ('/') then directories will be",
" created. Default: 'benchmark_*'",
' --compare-between Compare same-named benchmarks across different source',
' files.',
" --csv [FILENAME] Save a csv report. If FILENAME contains slashes ('/')",
' then directories will be created. Default:',
" 'benchmark_*'",
@@ -287,6 +289,47 @@ def test_compare(testdir, name, name_pattern_generator):
assert result.ret == 0


def test_compare_between(testdir):
common_args = [
'py.test-benchmark',
'--storage',
STORAGE,
'compare',
'--compare-between',
'--sort=min',
'--columns=min,max',
'--name=short',
]
# 2-source case: simple "Change" header
result = testdir.run(*common_args, '0001', '0002')
result.stdout.fnmatch_lines(
[
'---*--- benchmark: 1 tests, 2 sources ---*---',
'Name (time in ns) *0001_b87b9aa Min*0002_b87b9aa Min*0001_b87b9aa Max*0002_b87b9aa Max*Chg:0002_b/Min*Chg:0002_b/Max',
'---*---',
'xfast_parametrized[[]0[]] *217.3145*216.9028*11*447.3891*7*739.2997*-0.2%*-32.4%',
'---*---',
'',
'Legend:',
' Change: percentage change from first source to subsequent sources.',
]
)
assert result.ret == 0

# 3-source case: disambiguated change headers per source/metric
result = testdir.run(*common_args, '0001', '0002', '0003')
result.stdout.fnmatch_lines(
[
'---*--- benchmark: 1 tests, 3 sources ---*---',
'Name (time in ns) *Chg:0002_b/Min*Chg:0003_5/Min*Chg:0002_b/Max*Chg:0003_5/Max',
'---*---',
'xfast_parametrized[[]0[]] *217.3145*216.9028*215.6286*11*447.3891*-0.2%*-0.8%*-32.4%*-9.9%',
'---*---',
]
)
assert result.ret == 0


@pytest.mark.parametrize(
('name', 'name_pattern_generator', 'unit'),
[