Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ Authors
* Enno Gotthold - https://github.com/SchoolGuy
* Thomas B. Brunner - https://github.com/thomasbbrunner
* Hugo van Kemenade - https://github.com/hugovk
* Aarni Koskela - https://github.com/akx
26 changes: 26 additions & 0 deletions docs/comparing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,32 @@ Example::

pytest-benchmark compare 0001 0002

Comparing between source files
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When comparing benchmarks from multiple files (e.g. a ``main`` branch run vs. a feature branch run),
the default output shows all benchmarks in a single flat table.
The ``--compare-between`` flag pivots the table so that each row is a unique benchmark,
with columns showing the metric value from each source file and the relative change::

pytest-benchmark compare --compare-between 0001 0002

Example output::

-------------------------- benchmark: 9 tests, 2 sources ---------------------------
Name (time in ns) 0001_f41c0c7(*) Min 0002_8e68892 Min ΔMin
-------------------------------------------------------------------------------------
test_getattr_thread_critical 790.93 245.80 -68.9%
test_setattr_thread_critical 899.99 254.15 -71.8%
...

The first source is the reference, marked with ``(*)``.
Each subsequent source is followed by a ``Δ`` column showing the percentage change.

You can control which metrics are shown per source with ``--columns`` and the sort order with ``--sort``::

pytest-benchmark compare --compare-between --sort=mean --columns=min,mean 0001 0002

Plotting
--------

Expand Down
18 changes: 16 additions & 2 deletions src/pytest_benchmark/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from .plugin import add_display_options
from .plugin import add_global_options
from .plugin import add_histogram_options
from .table import CompareBetweenResults
from .table import TableResults
from .utils import NAME_FORMATTERS
from .utils import first_or_value
Expand Down Expand Up @@ -108,6 +109,11 @@ def make_parser():
)
add_display_options(compare_command.add_argument, prefix='')
add_histogram_options(compare_command.add_argument, prefix='')
compare_command.add_argument(
'--compare-between',
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should use the prefix.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The prefix seems to be forced to '' for all compare subcommand options (and there's no prefix available in this function as far as I can see..?)?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah oops I kinda forgot what this function was for. What I wanted is --compare-between to be --between instead (cause the command already has "compare" - pointless to just repeat "compare" all over).

action='store_true',
help='Compare same-named benchmarks across different source files.',
)
add_glob_or_file(compare_command.add_argument)
add_csv_options(compare_command.add_argument, prefix='')

Expand Down Expand Up @@ -148,10 +154,18 @@ def main():
for file in storage.query():
print(file)
elif args.command == 'compare':
results_table = TableResults(
histogram = first_or_value(args.histogram, False)
if args.compare_between:
if histogram:
parser.error('--compare-between is not compatible with --histogram')
results_table_cls = CompareBetweenResults
else:
results_table_cls = TableResults

results_table = results_table_cls(
columns=args.columns,
sort=args.sort,
histogram=first_or_value(args.histogram, False),
histogram=histogram,
name_format=NAME_FORMATTERS[args.name],
logger=logger,
scale_unit=partial(
Expand Down
192 changes: 164 additions & 28 deletions src/pytest_benchmark/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,22 @@

NUMBER_FMT = '{0:,.4f}'
ALIGNED_NUMBER_FMT = '{0:>{1},.4f}{2:<{3}}'
# Statistical properties compared across benchmarks. 'ops' is the only one
# where a larger value is better; all others are time-based (smaller is better).
STAT_PROPS = ('min', 'max', 'mean', 'median', 'iqr', 'stddev', 'ops')
DELTA = '\N{GREEK CAPITAL LETTER DELTA}'
REF_LABEL = '(*)'


def compute_best_worst(benchmarks, progress_reporter, tr, line):
    """Compute the best and worst value of every stat in STAT_PROPS.

    :param benchmarks: sequence of benchmark stat mappings (indexable by the
                       names in ``STAT_PROPS``).
    :param progress_reporter: wraps an iterable and reports progress to ``tr``,
                              yielding ``(progress_line, item)`` pairs.
    :param tr: terminal reporter the progress is written to.
    :param line: progress-line prefix for the report messages.
    :return: ``(best, worst)`` — two dicts mapping each stat name to its
             best/worst value across *benchmarks*.
    """
    worst = {}
    best = {}
    for line1, prop in progress_reporter(STAT_PROPS, tr, '{line}: {value}', line=line):
        # For 'ops', higher is better; for time-based metrics, lower is better.
        best_fn, worst_fn = (max, min) if prop == 'ops' else (min, max)
        # Single pass per metric: collect the values once, then take both
        # extremes (the previous version iterated the progress reporter twice).
        values = [bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)]
        best[prop] = best_fn(values)
        worst[prop] = worst_fn(values)
    return best, worst


class TableResults:
Expand All @@ -22,6 +38,24 @@ def __init__(self, columns, sort, histogram, name_format, logger, scale_unit):
self.logger = logger
self.scale_unit = scale_unit

def compute_scale(self, benchmarks, best, worst):
    """Determine display units/scales and the column header labels.

    :param benchmarks: benchmark stat mappings being displayed.
    :param best: per-stat best values (as produced by ``compute_best_worst``).
    :param worst: per-stat worst values.
    :return: ``(unit, adjustment, ops_adjustment, labels)`` where *unit* is the
             chosen time unit name, *adjustment*/*ops_adjustment* are the
             multipliers applied to raw time/ops values before formatting, and
             *labels* maps column keys to their header text.
    """
    # self.scale_unit picks a unit/multiplier based on the value spread and sort column.
    unit, adjustment = self.scale_unit(unit='seconds', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
    ops_unit, ops_adjustment = self.scale_unit(unit='operations', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
    labels = {
        'name': f'Name (time in {unit}s)',
        'min': 'Min',
        'max': 'Max',
        'mean': 'Mean',
        'stddev': 'StdDev',
        'rounds': 'Rounds',
        'iterations': 'Iterations',
        'iqr': 'IQR',
        'median': 'Median',
        'outliers': 'Outliers',
        # Only show a scale prefix for OPS when one was chosen (e.g. 'K').
        'ops': f'OPS ({ops_unit}ops/s)' if ops_unit else 'OPS',
    }
    return unit, adjustment, ops_adjustment, labels

def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line('')
report_online_progress(progress_reporter, tr, 'Computing stats ...')
Expand All @@ -30,47 +64,24 @@ def display(self, tr, groups, progress_reporter=report_progress):
for bench in benchmarks:
bench['name'] = self.name_format(bench)

worst = {}
best = {}
solo = len(benchmarks) == 1
for line1, prop in progress_reporter(
('min', 'max', 'mean', 'median', 'iqr', 'stddev', 'ops'), tr, '{line}: {value}', line=line
):
if prop == 'ops':
worst[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
else:
worst[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best, worst = compute_best_worst(benchmarks, progress_reporter, tr, line)
for line1, prop in progress_reporter(('outliers', 'rounds', 'iterations'), tr, '{line}: {value}', line=line):
worst[prop] = max(
benchmark[prop] for _, benchmark in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)
)

unit, adjustment = self.scale_unit(unit='seconds', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
ops_unit, ops_adjustment = self.scale_unit(unit='operations', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
labels = {
'name': f'Name (time in {unit}s)',
'min': 'Min',
'max': 'Max',
'mean': 'Mean',
'stddev': 'StdDev',
'rounds': 'Rounds',
'iterations': 'Iterations',
'iqr': 'IQR',
'median': 'Median',
'outliers': 'Outliers',
'ops': f'OPS ({ops_unit}ops/s)' if ops_unit else 'OPS',
}
unit, adjustment, ops_adjustment, labels = self.compute_scale(benchmarks, best, worst)
widths = {
'name': 3 + max(len(labels['name']), max(len(benchmark['name']) for benchmark in benchmarks)),
'rounds': 2 + max(len(labels['rounds']), len(str(worst['rounds']))),
'iterations': 2 + max(len(labels['iterations']), len(str(worst['iterations']))),
'outliers': 2 + max(len(labels['outliers']), len(str(worst['outliers']))),
'ops': 2 + max(len(labels['ops']), len(NUMBER_FMT.format(best['ops'] * ops_adjustment))),
}
for prop in 'min', 'max', 'mean', 'stddev', 'median', 'iqr':
widths[prop] = 2 + max(len(labels[prop]), max(len(NUMBER_FMT.format(bench[prop] * adjustment)) for bench in benchmarks))
for prop in STAT_PROPS:
if prop not in widths:
widths[prop] = 2 + max(len(labels[prop]), max(len(NUMBER_FMT.format(bench[prop] * adjustment)) for bench in benchmarks))

rpadding = 0 if solo else 10
labels_line = labels['name'].ljust(widths['name']) + ''.join(
Expand Down Expand Up @@ -134,6 +145,131 @@ def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line(' OPS: Operations Per Second, computed as 1 / Mean')


class CompareBetweenResults(TableResults):
    """Pivoted comparison table for the ``--compare-between`` CLI mode.

    Instead of one flat row per (benchmark, source) pair, each row is a unique
    benchmark and each source contributes a group of metric columns.  The first
    source is the reference (marked with ``(*)``); every later source also gets
    a percentage-change column per metric.
    """

    def display(self, tr, groups, progress_reporter=report_progress):
        """Render one pivoted table per benchmark group, then a legend."""
        tr.write_line('')

        for line, (group, benchmarks) in progress_reporter(groups, tr, 'Computing stats ... group {pos}/{total}'):
            self._display_single_between(line, group, benchmarks, tr=tr, progress_reporter=progress_reporter)

        tr.write_line('Legend:')
        tr.write_line(f' {REF_LABEL}: reference source for comparison. Cyan on terminal.')
        tr.write_line(f' {DELTA}: percentage change from reference source.')
        tr.write_line(' Green: improvement, Red: regression.')

    def _display_single_between(self, line, group, benchmarks, *, tr, progress_reporter):
        """Render the pivoted comparison table for a single group.

        :param line: progress-line prefix forwarded to *progress_reporter*.
        :param group: group key (``None`` when benchmarks are ungrouped).
        :param benchmarks: benchmark stat mappings; each must carry
                           ``'fullname'`` and may carry ``'source'``.
        :param tr: terminal reporter used for all output.
        :param progress_reporter: progress wrapper (see ``compute_best_worst``).
        """
        # Collect sources in order of first appearance and build fullname -> {source: bench} mapping
        sources = list(dict.fromkeys(bench.get('source', '') for bench in benchmarks))
        bench_map = {}
        for bench in benchmarks:
            bench_map.setdefault(bench['fullname'], {})[bench.get('source', '')] = bench

        # Pivoting is meaningless with a single source — report and bail out.
        if len(sources) < 2:
            tr.write_line(f'ERROR: --compare-between requires at least 2 source files, got {len(sources)}.', red=True)
            return

        # Only stat columns can be pivoted; fall back to the sort column, then 'min'.
        metrics = [c for c in self.columns if c in STAT_PROPS]
        if not metrics:
            metrics = [self.sort] if self.sort in STAT_PROPS else ['min']

        all_benches = list(benchmarks)
        best, worst = compute_best_worst(all_benches, progress_reporter, tr, line)
        _unit, adjustment, ops_adjustment, labels = self.compute_scale(all_benches, best, worst)
        # Per-metric display multiplier: 'ops' uses the ops scale, the rest the time scale.
        adjustments = {m: (ops_adjustment if m == 'ops' else adjustment) for m in metrics}

        # Sort benchmarks by the sort metric from the first source
        # (benchmarks missing from the reference source sort last via +inf).
        first_source = sources[0]
        sorted_names = sorted(
            bench_map,
            key=lambda name: bench_map[name].get(first_source, {}).get(self.sort, float('inf')),
        )

        # Format benchmark display names without source suffix
        display_names = {}
        for fullname in sorted_names:
            any_bench = next(iter(bench_map[fullname].values()))
            display_names[fullname] = self.name_format({**any_bench, 'source': ''})

        name_width = 3 + max(len(labels['name']), *(len(n) for n in display_names.values()))

        columns = []  # Each entry: (source_idx, metric, label, width, is_change)

        def _val_col_width(metric, src, label):
            """
            Helper to compute value column width for a given metric and source
            """
            adj = adjustments[metric]
            w = len(label)
            for fn in sorted_names:
                bench = bench_map[fn].get(src)
                if bench is not None:
                    w = max(w, len(NUMBER_FMT.format(bench[metric] * adj)))
            return w + 2

        # Format source labels (short)
        source_labels = []
        for src in sources:
            parts = src.split('/')
            # Use the basename (truncated to 12 chars) so storage paths stay readable.
            label = parts[-1][:12] if parts[-1] else src[:12]
            source_labels.append(label)

        # Reference source columns
        for metric in metrics:
            label = f'{source_labels[0]}{REF_LABEL} {labels[metric]}'
            columns.append((0, metric, label, _val_col_width(metric, sources[0], label), False))

        # Subsequent sources: value + diff pairs per metric
        for si in range(1, len(sources)):
            slabel = source_labels[si]
            for metric in metrics:
                val_label = f'{slabel} {labels[metric]}'
                columns.append((si, metric, val_label, _val_col_width(metric, sources[si], val_label), False))
                diff_label = f'{DELTA}{labels[metric]}'
                # Delta columns have a fixed minimum width of 10 for alignment.
                columns.append((si, metric, diff_label, max(len(diff_label) + 2, 10), True))

        header = ''.join(
            [
                labels['name'].ljust(name_width),
                *(label.rjust(width) for _si, _metric, label, width, _is_change in columns),
            ]
        )

        group_name = '' if group is None else f' {group!r}'
        tr.write_line(
            f' benchmark{group_name}: {len(sorted_names)} tests, {len(sources)} sources '.center(len(header), '-'),
            yellow=True,
        )
        tr.write_line(header)
        tr.write_line('-' * len(header), yellow=True)

        for fullname in sorted_names:
            tr.write(display_names[fullname].ljust(name_width))
            # Raw (unscaled) values seen so far in this row, keyed by
            # (source_idx, metric); used as the basis for the delta columns.
            row_values = {}
            for si, metric, _label, width, is_change in columns:
                if not is_change:
                    bench = bench_map[fullname].get(sources[si])
                    if bench is None:
                        tr.write('N/A'.rjust(width))
                    else:
                        row_values[(si, metric)] = bench[metric]
                        # Reference-source values are highlighted in cyan.
                        tr.write(NUMBER_FMT.format(bench[metric] * adjustments[metric]).rjust(width), cyan=(si == 0))
                else:
                    base_val = row_values.get((0, metric))
                    cmp_val = row_values.get((si, metric))
                    # No delta when either side is missing or the base is zero
                    # (division by zero).
                    if base_val is None or cmp_val is None or base_val == 0:
                        tr.write('N/A'.rjust(width))
                    else:
                        pct = (cmp_val - base_val) / abs(base_val) * 100
                        # For ops, higher is better; for time metrics, lower is better
                        is_improvement = (pct > 0) if metric == 'ops' else (pct < 0)
                        is_regression = (pct < 0) if metric == 'ops' else (pct > 0)
                        tr.write(f'{pct:+.1f}%'.rjust(width), green=is_improvement, red=is_regression, bold=True)
            tr.write('\n')

        tr.write_line('-' * len(header), yellow=True)
        tr.write_line('')


def compute_baseline_scale(baseline, value, width):
if not width:
return ''
Expand Down
61 changes: 60 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def test_help_compare(testdir, args):
' [--columns LABELS] [--name FORMAT]',
' [--time-unit COLUMN]',
' [--histogram [FILENAME-PREFIX]]',
' [--csv [FILENAME]]',
' [--compare-between] [--csv [FILENAME]]',
' [[]glob_or_file *[]]',
'',
'Compare saved runs.',
Expand Down Expand Up @@ -122,6 +122,8 @@ def test_help_compare(testdir, args):
' FILENAME-PREFIX-test_name.svg. If FILENAME-PREFIX',
" contains slashes ('/') then directories will be",
" created. Default: 'benchmark_*'",
' --compare-between Compare same-named benchmarks across different source',
' files.',
" --csv [FILENAME] Save a csv report. If FILENAME contains slashes ('/')",
' then directories will be created. Default:',
" 'benchmark_*'",
Expand Down Expand Up @@ -287,6 +289,63 @@ def test_compare(testdir, name, name_pattern_generator):
assert result.ret == 0


def test_compare_between(testdir):
    """End-to-end check of the pivoted ``--compare-between`` output."""
    base_args = (
        'py.test-benchmark',
        '--storage',
        STORAGE,
        'compare',
        '--compare-between',
        '--sort=min',
        '--columns=min,max',
        '--name=short',
    )

    # Two sources: reference (*), then interleaved value+delta columns.
    two_sources = testdir.run(*base_args, '0001', '0002')
    expected_two = [
        '---*--- benchmark: 1 tests, 2 sources ---*---',
        'Name (time in ns) *0001_b87b9aa(*) Min*0001_b87b9aa(*) Max*0002_b87b9aa Min*\u0394Min*0002_b87b9aa Max*\u0394Max',
        '---*---',
        'xfast_parametrized[[]0[]] *217.3145*11*447.3891*216.9028*-0.2%*7*739.2997*-32.4%',
        '---*---',
        '',
        'Legend:',
        ' (*): reference source for comparison.*',
    ]
    two_sources.stdout.fnmatch_lines(expected_two)
    assert two_sources.ret == 0

    # Three sources: each non-reference source is followed by its own deltas.
    three_sources = testdir.run(*base_args, '0001', '0002', '0003')
    expected_three = [
        '---*--- benchmark: 1 tests, 3 sources ---*---',
        'Name (time in ns) *0001_b87b9aa(*) Min*0001_b87b9aa(*) Max*0002_b87b9aa Min*\u0394Min*0002_b87b9aa Max*\u0394Max*0003_5b78858 Min*\u0394Min*0003_5b78858 Max*\u0394Max',
        '---*---',
        'xfast_parametrized[[]0[]] *217.3145*11*447.3891*216.9028*-0.2%*7*739.2997*-32.4%*215.6286*-0.8%*10*318.6159*-9.9%',
        '---*---',
    ]
    three_sources.stdout.fnmatch_lines(expected_three)
    assert three_sources.ret == 0


def test_compare_between_histogram_error(testdir):
    """``--compare-between`` must reject ``--histogram`` with a parser error."""
    cli_args = [
        'py.test-benchmark',
        '--storage',
        STORAGE,
        'compare',
        '--compare-between',
        '--histogram',
        'foobar',
        '0001',
        '0002',
    ]
    outcome = testdir.run(*cli_args)
    outcome.stderr.fnmatch_lines(['*error: --compare-between is not compatible with --histogram*'])
    assert outcome.ret != 0


@pytest.mark.parametrize(
('name', 'name_pattern_generator', 'unit'),
[
Expand Down
Loading