diff --git a/AUTHORS.rst b/AUTHORS.rst index 5744ea7..8267690 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -43,3 +43,4 @@ Authors * Enno Gotthold - https://github.com/SchoolGuy * Thomas B. Brunner - https://github.com/thomasbbrunner * Hugo van Kemenade - https://github.com/hugovk +* Aarni Koskela - https://github.com/akx diff --git a/docs/comparing.rst b/docs/comparing.rst index 1256dec..76a6cdb 100644 --- a/docs/comparing.rst +++ b/docs/comparing.rst @@ -38,6 +38,32 @@ Example:: pytest-benchmark compare 0001 0002 +Comparing between source files +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When comparing benchmarks from multiple files (e.g. a ``main`` branch run vs. a feature branch run), +the default output shows all benchmarks in a single flat table. +The ``--compare-between`` flag pivots the table so that each row is a unique benchmark, +with columns showing the metric value from each source file and the relative change:: + + pytest-benchmark compare --compare-between 0001 0002 + +Example output:: + + -------------------------- benchmark: 9 tests, 2 sources --------------------------- + Name (time in ns) 0001_f41c0c7(*) Min 0002_8e68892 Min ΔMin + ------------------------------------------------------------------------------------- + test_getattr_thread_critical 790.93 245.80 -68.9% + test_setattr_thread_critical 899.99 254.15 -71.8% + ... + +The first source is the reference, marked with ``(*)``. +For each subsequent source, every metric column is followed by a ``Δ`` column showing the percentage change relative to the reference. 
+ +You can control which metrics are shown per source with ``--columns`` and the sort order with ``--sort``:: + + pytest-benchmark compare --compare-between --sort=mean --columns=min,mean 0001 0002 + Plotting -------- diff --git a/src/pytest_benchmark/cli.py b/src/pytest_benchmark/cli.py index b206e7b..ac64829 100644 --- a/src/pytest_benchmark/cli.py +++ b/src/pytest_benchmark/cli.py @@ -19,6 +19,7 @@ from .plugin import add_display_options from .plugin import add_global_options from .plugin import add_histogram_options +from .table import CompareBetweenResults from .table import TableResults from .utils import NAME_FORMATTERS from .utils import first_or_value @@ -108,6 +109,11 @@ def make_parser(): ) add_display_options(compare_command.add_argument, prefix='') add_histogram_options(compare_command.add_argument, prefix='') + compare_command.add_argument( + '--compare-between', + action='store_true', + help='Compare same-named benchmarks across different source files.', + ) add_glob_or_file(compare_command.add_argument) add_csv_options(compare_command.add_argument, prefix='') @@ -148,10 +154,18 @@ def main(): for file in storage.query(): print(file) elif args.command == 'compare': - results_table = TableResults( + histogram = first_or_value(args.histogram, False) + if args.compare_between: + if histogram: + parser.error('--compare-between is not compatible with --histogram') + results_table_cls = CompareBetweenResults + else: + results_table_cls = TableResults + + results_table = results_table_cls( columns=args.columns, sort=args.sort, - histogram=first_or_value(args.histogram, False), + histogram=histogram, name_format=NAME_FORMATTERS[args.name], logger=logger, scale_unit=partial( diff --git a/src/pytest_benchmark/table.py b/src/pytest_benchmark/table.py index 3bfd450..4e7eb01 100644 --- a/src/pytest_benchmark/table.py +++ b/src/pytest_benchmark/table.py @@ -11,6 +11,22 @@ NUMBER_FMT = '{0:,.4f}' ALIGNED_NUMBER_FMT = '{0:>{1},.4f}{2:<{3}}' +STAT_PROPS = ('min', 
STAT_PROPS = ('min', 'max', 'mean', 'median', 'iqr', 'stddev', 'ops')
DELTA = '\N{GREEK CAPITAL LETTER DELTA}'
REF_LABEL = '(*)'


def compute_best_worst(benchmarks, progress_reporter, tr, line):
    """
    Find the best and worst value of every stat in ``STAT_PROPS``.

    ``progress_reporter`` is called as ``progress_reporter(iterable, tr, fmt, line=...)``
    and must yield ``(line, item)`` pairs. It is invoked once over ``STAT_PROPS`` and
    then twice over ``benchmarks`` for each stat (one pass for the best value, one for
    the worst).

    Returns a ``(best, worst)`` pair of dicts keyed by stat name.
    """
    per_bench_fmt = '{line} ({pos}/{total})'

    def _select(chooser, prop, status_line):
        # A fresh reporter pass per selection; the generator is consumed by min()/max().
        rows = progress_reporter(benchmarks, tr, per_bench_fmt, line=status_line)
        return chooser(row[prop] for _, row in rows)

    best, worst = {}, {}
    for status_line, prop in progress_reporter(STAT_PROPS, tr, '{line}: {value}', line=line):
        if prop == 'ops':
            # Throughput: a larger number is the better one.
            pick_best, pick_worst = max, min
        else:
            # Time-based stats: a smaller number is the better one.
            pick_best, pick_worst = min, max
        best[prop] = _select(pick_best, prop, status_line)
        worst[prop] = _select(pick_worst, prop, status_line)
    return best, worst
'stddev', 'ops'), tr, '{line}: {value}', line=line - ): - if prop == 'ops': - worst[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)) - best[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)) - else: - worst[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)) - best[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)) + best, worst = compute_best_worst(benchmarks, progress_reporter, tr, line) for line1, prop in progress_reporter(('outliers', 'rounds', 'iterations'), tr, '{line}: {value}', line=line): worst[prop] = max( benchmark[prop] for _, benchmark in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1) ) - unit, adjustment = self.scale_unit(unit='seconds', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort) - ops_unit, ops_adjustment = self.scale_unit(unit='operations', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort) - labels = { - 'name': f'Name (time in {unit}s)', - 'min': 'Min', - 'max': 'Max', - 'mean': 'Mean', - 'stddev': 'StdDev', - 'rounds': 'Rounds', - 'iterations': 'Iterations', - 'iqr': 'IQR', - 'median': 'Median', - 'outliers': 'Outliers', - 'ops': f'OPS ({ops_unit}ops/s)' if ops_unit else 'OPS', - } + unit, adjustment, ops_adjustment, labels = self.compute_scale(benchmarks, best, worst) widths = { 'name': 3 + max(len(labels['name']), max(len(benchmark['name']) for benchmark in benchmarks)), 'rounds': 2 + max(len(labels['rounds']), len(str(worst['rounds']))), @@ -69,8 +79,9 @@ def display(self, tr, groups, progress_reporter=report_progress): 'outliers': 2 + max(len(labels['outliers']), len(str(worst['outliers']))), 'ops': 2 + max(len(labels['ops']), len(NUMBER_FMT.format(best['ops'] * ops_adjustment))), } - for prop in 'min', 'max', 'mean', 'stddev', 'median', 
class CompareBetweenResults(TableResults):
    """
    Pivoted comparison table: one row per benchmark, one column group per source.

    The first source seen is the reference. Every later source gets, for each selected
    metric, a value column followed by a delta column holding the percentage change
    relative to the reference.
    """

    def display(self, tr, groups, progress_reporter=report_progress):
        """
        Write one pivoted table per ``(group, benchmarks)`` entry of ``groups``, then a legend.

        ``tr`` must provide ``write_line`` and ``write`` accepting markup keyword flags
        (``red``, ``green``, ``cyan``, ``yellow``, ``bold``).
        """
        tr.write_line('')

        for line, (group, benchmarks) in progress_reporter(groups, tr, 'Computing stats ... group {pos}/{total}'):
            self._display_single_between(line, group, benchmarks, tr=tr, progress_reporter=progress_reporter)

        tr.write_line('Legend:')
        tr.write_line(f' {REF_LABEL}: reference source for comparison. Cyan on terminal.')
        tr.write_line(f' {DELTA}: percentage change from reference source.')
        tr.write_line(' Green: improvement, Red: regression.')

    @staticmethod
    def _short_source_label(src):
        """Return at most 12 characters of the last ``/``-separated part of ``src`` (or of ``src`` itself if that part is empty)."""
        tail = src.split('/')[-1]
        return (tail or src)[:12]

    def _display_single_between(self, line, group, benchmarks, *, tr, progress_reporter):
        """
        Render the pivoted table for a single group.

        Writes an error line and returns early when the group holds fewer than two
        distinct sources. Benchmarks missing from a source are shown as ``N/A``.
        """
        # Collect sources in order of first appearance and build fullname -> {source: bench} mapping
        sources = list(dict.fromkeys(bench.get('source', '') for bench in benchmarks))
        bench_map = {}
        for bench in benchmarks:
            bench_map.setdefault(bench['fullname'], {})[bench.get('source', '')] = bench

        if len(sources) < 2:
            tr.write_line(f'ERROR: --compare-between requires at least 2 source files, got {len(sources)}.', red=True)
            return

        # Only stat columns can be pivoted; fall back to the sort metric, then to 'min'.
        metrics = [c for c in self.columns if c in STAT_PROPS]
        if not metrics:
            metrics = [self.sort] if self.sort in STAT_PROPS else ['min']

        all_benches = list(benchmarks)
        best, worst = compute_best_worst(all_benches, progress_reporter, tr, line)
        _unit, adjustment, ops_adjustment, labels = self.compute_scale(all_benches, best, worst)
        adjustments = {m: (ops_adjustment if m == 'ops' else adjustment) for m in metrics}

        # Sort benchmarks by the sort metric from the first source.
        first_source = sources[0]
        missing = object()

        def _sort_key(name):
            # Benchmarks absent from the reference source (or lacking the sort field) go last.
            # The leading flag decides the order on its own in that case, so a placeholder is
            # never compared against a real value. The previous float('inf') fallback raised
            # TypeError whenever the sort field held non-numeric values (self.sort is not
            # restricted to STAT_PROPS).
            bench = bench_map[name].get(first_source)
            value = missing if bench is None else bench.get(self.sort, missing)
            if value is missing:
                return (1, 0)
            return (0, value)

        sorted_names = sorted(bench_map, key=_sort_key)

        # Format benchmark display names without source suffix
        display_names = {}
        for fullname in sorted_names:
            any_bench = next(iter(bench_map[fullname].values()))
            display_names[fullname] = self.name_format({**any_bench, 'source': ''})

        name_width = 3 + max(len(labels['name']), *(len(n) for n in display_names.values()))

        columns = []  # Each entry: (source_idx, metric, label, width, is_change)

        def _val_col_width(metric, src, label):
            """
            Helper to compute value column width for a given metric and source
            """
            adj = adjustments[metric]
            w = len(label)
            for fn in sorted_names:
                bench = bench_map[fn].get(src)
                if bench is not None:
                    w = max(w, len(NUMBER_FMT.format(bench[metric] * adj)))
            return w + 2

        # Format source labels (short)
        source_labels = [self._short_source_label(src) for src in sources]

        # Reference source columns
        for metric in metrics:
            label = f'{source_labels[0]}{REF_LABEL} {labels[metric]}'
            columns.append((0, metric, label, _val_col_width(metric, sources[0], label), False))

        # Subsequent sources: value + diff pairs per metric
        for si in range(1, len(sources)):
            slabel = source_labels[si]
            for metric in metrics:
                val_label = f'{slabel} {labels[metric]}'
                columns.append((si, metric, val_label, _val_col_width(metric, sources[si], val_label), False))
                diff_label = f'{DELTA}{labels[metric]}'
                columns.append((si, metric, diff_label, max(len(diff_label) + 2, 10), True))

        header = ''.join(
            [
                labels['name'].ljust(name_width),
                *(label.rjust(width) for _si, _metric, label, width, _is_change in columns),
            ]
        )

        group_name = '' if group is None else f' {group!r}'
        tr.write_line(
            f' benchmark{group_name}: {len(sorted_names)} tests, {len(sources)} sources '.center(len(header), '-'),
            yellow=True,
        )
        tr.write_line(header)
        tr.write_line('-' * len(header), yellow=True)

        for fullname in sorted_names:
            tr.write(display_names[fullname].ljust(name_width))
            # Raw (unscaled) values seen so far in this row, keyed by (source_idx, metric).
            # Reference columns come first, so they are present by the time a delta is computed.
            row_values = {}
            for si, metric, _label, width, is_change in columns:
                if not is_change:
                    bench = bench_map[fullname].get(sources[si])
                    if bench is None:
                        tr.write('N/A'.rjust(width))
                    else:
                        row_values[(si, metric)] = bench[metric]
                        tr.write(NUMBER_FMT.format(bench[metric] * adjustments[metric]).rjust(width), cyan=(si == 0))
                else:
                    base_val = row_values.get((0, metric))
                    cmp_val = row_values.get((si, metric))
                    # No delta when either side is missing or the reference is zero (division by zero).
                    if base_val is None or cmp_val is None or base_val == 0:
                        tr.write('N/A'.rjust(width))
                    else:
                        pct = (cmp_val - base_val) / abs(base_val) * 100
                        # For ops, higher is better; for time metrics, lower is better
                        is_improvement = (pct > 0) if metric == 'ops' else (pct < 0)
                        is_regression = (pct < 0) if metric == 'ops' else (pct > 0)
                        tr.write(f'{pct:+.1f}%'.rjust(width), green=is_improvement, red=is_regression, bold=True)
            tr.write('\n')

        tr.write_line('-' * len(header), yellow=True)
        tr.write_line('')
def test_compare_between(testdir):
    """
    Run ``compare --compare-between`` against stored runs and check the pivoted table.

    Covers a 2-source and a 3-source invocation; both must exit with status 0.
    """
    # NOTE(review): the fnmatch patterns below are whitespace-sensitive -- confirm the
    # spacing against the real CLI output before relying on them.
    common_args = [
        'py.test-benchmark',
        '--storage',
        STORAGE,
        'compare',
        '--compare-between',
        '--sort=min',
        '--columns=min,max',
        '--name=short',
    ]
    # 2-source case: reference (*), interleaved value+delta columns
    result = testdir.run(*common_args, '0001', '0002')
    result.stdout.fnmatch_lines(
        [
            '---*--- benchmark: 1 tests, 2 sources ---*---',
            'Name (time in ns) *0001_b87b9aa(*) Min*0001_b87b9aa(*) Max*0002_b87b9aa Min*\u0394Min*0002_b87b9aa Max*\u0394Max',
            '---*---',
            'xfast_parametrized[[]0[]] *217.3145*11*447.3891*216.9028*-0.2%*7*739.2997*-32.4%',
            '---*---',
            '',
            'Legend:',
            ' (*): reference source for comparison.*',
        ]
    )
    assert result.ret == 0

    # 3-source case: each non-reference source followed by its deltas
    result = testdir.run(*common_args, '0001', '0002', '0003')
    result.stdout.fnmatch_lines(
        [
            '---*--- benchmark: 1 tests, 3 sources ---*---',
            'Name (time in ns) *0001_b87b9aa(*) Min*0001_b87b9aa(*) Max*0002_b87b9aa Min*\u0394Min*0002_b87b9aa Max*\u0394Max*0003_5b78858 Min*\u0394Min*0003_5b78858 Max*\u0394Max',
            '---*---',
            'xfast_parametrized[[]0[]] *217.3145*11*447.3891*216.9028*-0.2%*7*739.2997*-32.4%*215.6286*-0.8%*10*318.6159*-9.9%',
            '---*---',
        ]
    )
    assert result.ret == 0


def test_compare_between_histogram_error(testdir):
    """Combining ``--compare-between`` with ``--histogram`` must fail with an error on stderr."""
    result = testdir.run(
        'py.test-benchmark',
        '--storage',
        STORAGE,
        'compare',
        '--compare-between',
        '--histogram',
        'foobar',
        '0001',
        '0002',
    )
    result.stderr.fnmatch_lines(['*error: --compare-between is not compatible with --histogram*'])
    # Only a non-zero exit status is asserted; the exact code is not pinned.
    assert result.ret != 0
@pytest.mark.parametrize( ('name', 'name_pattern_generator', 'unit'), [