1 change: 1 addition & 0 deletions AUTHORS.rst
@@ -43,3 +43,4 @@ Authors
* Enno Gotthold - https://github.com/SchoolGuy
* Thomas B. Brunner - https://github.com/thomasbbrunner
* Hugo van Kemenade - https://github.com/hugovk
* Aarni Koskela - https://github.com/akx
23 changes: 23 additions & 0 deletions docs/comparing.rst
@@ -38,6 +38,29 @@ Example::

pytest-benchmark compare 0001 0002

Comparing between source files
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When comparing benchmarks from multiple files (e.g. a ``main`` branch run vs. a feature branch run),
the default output shows all benchmarks in a single flat table.
The ``--compare-between`` flag pivots the table so that each row is a unique benchmark,
with columns showing the metric value from each source file and the relative change::

pytest-benchmark compare --compare-between 0001 0002

Example output::

----------------------- benchmark: 9 tests, 2 sources ------------------------
Name (time in ns) 0001_f41c0c7 Min 0002_8e68892 Min Change
------------------------------------------------------------------------------
test_getattr_thread_critical 790.93 245.80 -68.9%
test_setattr_thread_critical 899.99 254.15 -71.8%
...
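
The ``Change`` column shows the percentage change relative to the first source, computed as ``(new - old) / old * 100``; for the first row above, ``(245.80 - 790.93) / 790.93 * 100`` is roughly ``-68.9%``. For time-based metrics a negative change is an improvement, while for ``ops`` (operations per second) higher values are better.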

You can control which metrics are shown per source with ``--columns`` and the sort order with ``--sort``::

pytest-benchmark compare --compare-between --sort=mean --columns=min,mean 0001 0002
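
With more than one metric, or more than two source files, the change columns are labelled per source and metric (for example ``Chg:0002_b/Min``) so that each comparison remains unambiguous.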

Plotting
--------

18 changes: 16 additions & 2 deletions src/pytest_benchmark/cli.py
@@ -19,6 +19,7 @@
from .plugin import add_display_options
from .plugin import add_global_options
from .plugin import add_histogram_options
from .table import CompareBetweenResults
from .table import TableResults
from .utils import NAME_FORMATTERS
from .utils import first_or_value
@@ -108,6 +109,11 @@ def make_parser():
)
add_display_options(compare_command.add_argument, prefix='')
add_histogram_options(compare_command.add_argument, prefix='')
compare_command.add_argument(
'--compare-between',
Owner:
This should use the prefix.

Author:
The prefix seems to be forced to '' for all compare subcommand options (and there's no prefix available in this function, as far as I can see?).

action='store_true',
help='Compare same-named benchmarks across different source files.',
)
add_glob_or_file(compare_command.add_argument)
add_csv_options(compare_command.add_argument, prefix='')

@@ -148,10 +154,18 @@ def main():
for file in storage.query():
print(file)
elif args.command == 'compare':
results_table = TableResults(
histogram = first_or_value(args.histogram, False)
if args.compare_between:
if histogram:
parser.error('--compare-between is not compatible with --histogram')
results_table_cls = CompareBetweenResults
else:
results_table_cls = TableResults

results_table = results_table_cls(
columns=args.columns,
sort=args.sort,
histogram=first_or_value(args.histogram, False),
histogram=histogram,
name_format=NAME_FORMATTERS[args.name],
logger=logger,
scale_unit=partial(
179 changes: 151 additions & 28 deletions src/pytest_benchmark/table.py
@@ -11,6 +11,20 @@

NUMBER_FMT = '{0:,.4f}'
ALIGNED_NUMBER_FMT = '{0:>{1},.4f}{2:<{3}}'
STAT_PROPS = ('min', 'max', 'mean', 'median', 'iqr', 'stddev', 'ops')


def compute_best_worst(benchmarks, progress_reporter, tr, line):
worst = {}
best = {}
for line1, prop in progress_reporter(STAT_PROPS, tr, '{line}: {value}', line=line):
# For 'ops', higher is better; for time-based metrics, lower is better
best_fn, worst_fn = (max, min) if prop == 'ops' else (min, max)
values = progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)
best[prop] = best_fn(bench[prop] for _, bench in values)
values = progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)
worst[prop] = worst_fn(bench[prop] for _, bench in values)
return best, worst


class TableResults:
@@ -22,6 +36,24 @@ def __init__(self, columns, sort, histogram, name_format, logger, scale_unit):
self.logger = logger
self.scale_unit = scale_unit

def compute_scale(self, benchmarks, best, worst):
unit, adjustment = self.scale_unit(unit='seconds', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
ops_unit, ops_adjustment = self.scale_unit(unit='operations', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
labels = {
'name': f'Name (time in {unit}s)',
'min': 'Min',
'max': 'Max',
'mean': 'Mean',
'stddev': 'StdDev',
'rounds': 'Rounds',
'iterations': 'Iterations',
'iqr': 'IQR',
'median': 'Median',
'outliers': 'Outliers',
'ops': f'OPS ({ops_unit}ops/s)' if ops_unit else 'OPS',
}
return unit, adjustment, ops_adjustment, labels

def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line('')
report_online_progress(progress_reporter, tr, 'Computing stats ...')
@@ -30,47 +62,24 @@ def display(self, tr, groups, progress_reporter=report_progress):
for bench in benchmarks:
bench['name'] = self.name_format(bench)

worst = {}
best = {}
solo = len(benchmarks) == 1
for line1, prop in progress_reporter(
('min', 'max', 'mean', 'median', 'iqr', 'stddev', 'ops'), tr, '{line}: {value}', line=line
):
if prop == 'ops':
worst[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
else:
worst[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best, worst = compute_best_worst(benchmarks, progress_reporter, tr, line)
for line1, prop in progress_reporter(('outliers', 'rounds', 'iterations'), tr, '{line}: {value}', line=line):
worst[prop] = max(
benchmark[prop] for _, benchmark in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)
)

unit, adjustment = self.scale_unit(unit='seconds', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
ops_unit, ops_adjustment = self.scale_unit(unit='operations', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
labels = {
'name': f'Name (time in {unit}s)',
'min': 'Min',
'max': 'Max',
'mean': 'Mean',
'stddev': 'StdDev',
'rounds': 'Rounds',
'iterations': 'Iterations',
'iqr': 'IQR',
'median': 'Median',
'outliers': 'Outliers',
'ops': f'OPS ({ops_unit}ops/s)' if ops_unit else 'OPS',
}
unit, adjustment, ops_adjustment, labels = self.compute_scale(benchmarks, best, worst)
widths = {
'name': 3 + max(len(labels['name']), max(len(benchmark['name']) for benchmark in benchmarks)),
'rounds': 2 + max(len(labels['rounds']), len(str(worst['rounds']))),
'iterations': 2 + max(len(labels['iterations']), len(str(worst['iterations']))),
'outliers': 2 + max(len(labels['outliers']), len(str(worst['outliers']))),
'ops': 2 + max(len(labels['ops']), len(NUMBER_FMT.format(best['ops'] * ops_adjustment))),
}
for prop in 'min', 'max', 'mean', 'stddev', 'median', 'iqr':
widths[prop] = 2 + max(len(labels[prop]), max(len(NUMBER_FMT.format(bench[prop] * adjustment)) for bench in benchmarks))
for prop in STAT_PROPS:
if prop not in widths:
widths[prop] = 2 + max(len(labels[prop]), max(len(NUMBER_FMT.format(bench[prop] * adjustment)) for bench in benchmarks))

rpadding = 0 if solo else 10
labels_line = labels['name'].ljust(widths['name']) + ''.join(
@@ -134,6 +143,120 @@ def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line(' OPS: Operations Per Second, computed as 1 / Mean')


class CompareBetweenResults(TableResults):
def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line('')
for line, (group, benchmarks) in progress_reporter(groups, tr, 'Computing stats ... group {pos}/{total}'):
# Collect sources in order of first appearance and build fullname -> {source: bench} mapping
sources = list(dict.fromkeys(bench.get('source', '') for bench in benchmarks))
bench_map = {}
for bench in benchmarks:
bench_map.setdefault(bench['fullname'], {})[bench.get('source', '')] = bench

if len(sources) < 2:
tr.write_line(f'ERROR: --compare-between requires at least 2 source files, got {len(sources)}.', red=True)
continue

# Format source labels (short)
source_labels = []
for src in sources:
parts = src.split('/')
label = parts[-1][:12] if parts[-1] else src[:12]
source_labels.append(label)

metrics = [c for c in self.columns if c in STAT_PROPS]
if not metrics:
metrics = [self.sort] if self.sort in STAT_PROPS else ['min']

all_benches = list(benchmarks)
best, worst = compute_best_worst(all_benches, progress_reporter, tr, line)
_unit, adjustment, ops_adjustment, labels = self.compute_scale(all_benches, best, worst)
adjustments = {m: (ops_adjustment if m == 'ops' else adjustment) for m in metrics}

# Sort benchmarks by the sort metric from the first source
first_source = sources[0]
sorted_names = sorted(
bench_map,
key=lambda name: bench_map[name].get(first_source, {}).get(self.sort, float('inf')),
)

# Format benchmark display names without source suffix
display_names = {}
for fullname in sorted_names:
any_bench = next(iter(bench_map[fullname].values()))
display_names[fullname] = self.name_format({**any_bench, 'source': ''})

name_width = 3 + max(len(labels['name']), max(len(n) for n in display_names.values()))

# Build column specs: (source_idx, metric, label, width) for each metric x source
col_specs = []
for metric in metrics:
adj = adjustments[metric]
for si, (src, slabel) in enumerate(zip(sources, source_labels)):
label = f'{slabel} {labels[metric]}'
max_val_width = len(label)
for fullname in sorted_names:
bench = bench_map[fullname].get(src)
if bench is not None:
max_val_width = max(max_val_width, len(NUMBER_FMT.format(bench[metric] * adj)))
col_specs.append((si, metric, label, max_val_width + 2))

# Build change column specs: (metric, source_idx, label, width) comparing first source to each subsequent
use_simple_label = len(metrics) == 1 and len(sources) == 2
change_specs = []
for metric in metrics:
for si in range(1, len(sources)):
label = 'Change' if use_simple_label else f'Chg:{source_labels[si][:6]}/{labels[metric]}'
change_specs.append((metric, si, label, max(len(label) + 2, 10)))

# Build and print header
header = labels['name'].ljust(name_width)
for _si, _metric, label, width in col_specs:
header += label.rjust(width)
for _metric, _si, label, width in change_specs:
header += label.rjust(width)

group_name = '' if group is None else f' {group!r}'
tr.write_line(
f' benchmark{group_name}: {len(sorted_names)} tests, {len(sources)} sources '.center(len(header), '-'),
yellow=True,
)
tr.write_line(header)
tr.write_line('-' * len(header), yellow=True)

# Print rows
for fullname in sorted_names:
tr.write(display_names[fullname].ljust(name_width))
row_values = {}
for si, metric, _label, width in col_specs:
bench = bench_map[fullname].get(sources[si])
if bench is None:
tr.write('N/A'.rjust(width))
else:
row_values[(si, metric)] = bench[metric]
tr.write(NUMBER_FMT.format(bench[metric] * adjustments[metric]).rjust(width))

for metric, si, _label, width in change_specs:
base_val = row_values.get((0, metric))
cmp_val = row_values.get((si, metric))
if base_val is None or cmp_val is None or base_val == 0:
tr.write('N/A'.rjust(width))
else:
pct = (cmp_val - base_val) / abs(base_val) * 100
# For ops, higher is better; for time metrics, lower is better
is_improvement = (pct > 0) if metric == 'ops' else (pct < 0)
is_regression = (pct < 0) if metric == 'ops' else (pct > 0)
tr.write(f'{pct:+.1f}%'.rjust(width), green=is_improvement, red=is_regression, bold=True)
tr.write('\n')

tr.write_line('-' * len(header), yellow=True)
tr.write_line('')

tr.write_line('Legend:')
tr.write_line(' Change: percentage change from first source to subsequent sources.')
tr.write_line(' Green: improvement, Red: regression.')


def compute_baseline_scale(baseline, value, width):
if not width:
return ''
45 changes: 44 additions & 1 deletion tests/test_cli.py
@@ -93,7 +93,7 @@ def test_help_compare(testdir, args):
' [--columns LABELS] [--name FORMAT]',
' [--time-unit COLUMN]',
' [--histogram [FILENAME-PREFIX]]',
' [--csv [FILENAME]]',
' [--compare-between] [--csv [FILENAME]]',
' [[]glob_or_file *[]]',
'',
'Compare saved runs.',
@@ -122,6 +122,8 @@ def test_help_compare(testdir, args):
' FILENAME-PREFIX-test_name.svg. If FILENAME-PREFIX',
" contains slashes ('/') then directories will be",
" created. Default: 'benchmark_*'",
' --compare-between Compare same-named benchmarks across different source',
' files.',
" --csv [FILENAME] Save a csv report. If FILENAME contains slashes ('/')",
' then directories will be created. Default:',
" 'benchmark_*'",
@@ -287,6 +289,47 @@ def test_compare(testdir, name, name_pattern_generator):
assert result.ret == 0


def test_compare_between(testdir):
common_args = [
'py.test-benchmark',
'--storage',
STORAGE,
'compare',
'--compare-between',
'--sort=min',
'--columns=min,max',
'--name=short',
]
# 2-source case: simple "Change" header
result = testdir.run(*common_args, '0001', '0002')
result.stdout.fnmatch_lines(
[
'---*--- benchmark: 1 tests, 2 sources ---*---',
'Name (time in ns) *0001_b87b9aa Min*0002_b87b9aa Min*0001_b87b9aa Max*0002_b87b9aa Max*Chg:0002_b/Min*Chg:0002_b/Max',
'---*---',
'xfast_parametrized[[]0[]] *217.3145*216.9028*11*447.3891*7*739.2997*-0.2%*-32.4%',
'---*---',
'',
'Legend:',
' Change: percentage change from first source to subsequent sources.',
]
)
assert result.ret == 0

# 3-source case: disambiguated change headers per source/metric
result = testdir.run(*common_args, '0001', '0002', '0003')
result.stdout.fnmatch_lines(
[
'---*--- benchmark: 1 tests, 3 sources ---*---',
'Name (time in ns) *Chg:0002_b/Min*Chg:0003_5/Min*Chg:0002_b/Max*Chg:0003_5/Max',
'---*---',
'xfast_parametrized[[]0[]] *217.3145*216.9028*215.6286*11*447.3891*-0.2%*-0.8%*-32.4%*-9.9%',
'---*---',
]
)
assert result.ret == 0


@pytest.mark.parametrize(
('name', 'name_pattern_generator', 'unit'),
[