Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ Authors
* Enno Gotthold - https://github.com/SchoolGuy
* Thomas B. Brunner - https://github.com/thomasbbrunner
* Hugo van Kemenade - https://github.com/hugovk
* Aarni Koskela - https://github.com/akx
26 changes: 26 additions & 0 deletions docs/comparing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,32 @@ Example::

pytest-benchmark compare 0001 0002

Comparing between source files
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When comparing benchmarks from multiple files (e.g. a ``main`` branch run vs. a feature branch run),
the default output shows all benchmarks in a single flat table.
The ``--compare-between`` flag pivots the table so that each row is a unique benchmark,
with columns showing the metric value from each source file and the relative change::

pytest-benchmark compare --compare-between 0001 0002

Example output::

-------------------------- benchmark: 9 tests, 2 sources ---------------------------
Name (time in ns) 0001_f41c0c7(*) Min 0002_8e68892 Min ΔMin
-------------------------------------------------------------------------------------
test_getattr_thread_critical 790.93 245.80 -68.9%
test_setattr_thread_critical 899.99 254.15 -71.8%
...

The first source is the reference, marked with ``(*)``.
Each subsequent source is followed by a ``Δ`` column showing the percentage change.

You can control which metrics are shown per source with ``--columns`` and the sort order with ``--sort``::

pytest-benchmark compare --compare-between --sort=mean --columns=min,mean 0001 0002

Plotting
--------

Expand Down
18 changes: 16 additions & 2 deletions src/pytest_benchmark/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from .plugin import add_display_options
from .plugin import add_global_options
from .plugin import add_histogram_options
from .table import CompareBetweenResults
from .table import TableResults
from .utils import NAME_FORMATTERS
from .utils import first_or_value
Expand Down Expand Up @@ -108,6 +109,11 @@ def make_parser():
)
add_display_options(compare_command.add_argument, prefix='')
add_histogram_options(compare_command.add_argument, prefix='')
compare_command.add_argument(
'--compare-between',
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should use the prefix.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The prefix seems to be forced to '' for all compare subcommand options (and there's no prefix available in this function as far as I can see..?)?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah oops I kinda forgot what this function was for. What I wanted is --compare-between to be --between instead (cause the command already has "compare" - pointless to just repeat "compare" all over).

action='store_true',
help='Compare same-named benchmarks across different source files.',
)
add_glob_or_file(compare_command.add_argument)
add_csv_options(compare_command.add_argument, prefix='')

Expand Down Expand Up @@ -148,10 +154,18 @@ def main():
for file in storage.query():
print(file)
elif args.command == 'compare':
results_table = TableResults(
histogram = first_or_value(args.histogram, False)
if args.compare_between:
if histogram:
parser.error('--compare-between is not compatible with --histogram')
results_table_cls = CompareBetweenResults
else:
results_table_cls = TableResults

results_table = results_table_cls(
columns=args.columns,
sort=args.sort,
histogram=first_or_value(args.histogram, False),
histogram=histogram,
name_format=NAME_FORMATTERS[args.name],
logger=logger,
scale_unit=partial(
Expand Down
192 changes: 164 additions & 28 deletions src/pytest_benchmark/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,22 @@

NUMBER_FMT = '{0:,.4f}'
ALIGNED_NUMBER_FMT = '{0:>{1},.4f}{2:<{3}}'
# Statistical properties compared across benchmarks. 'ops' is the only one
# where a larger value is better; all others are time-based (smaller is better).
STAT_PROPS = ('min', 'max', 'mean', 'median', 'iqr', 'stddev', 'ops')
DELTA = '\N{GREEK CAPITAL LETTER DELTA}'
REF_LABEL = '(*)'


def compute_best_worst(benchmarks, progress_reporter, tr, line):
    """Compute the best and worst value of every stat in STAT_PROPS.

    :param benchmarks: sequence of benchmark stat mappings (indexable by the
                       names in ``STAT_PROPS``).
    :param progress_reporter: wraps an iterable and reports progress to ``tr``,
                              yielding ``(progress_line, item)`` pairs.
    :param tr: terminal reporter the progress is written to.
    :param line: progress-line prefix for the report messages.
    :return: ``(best, worst)`` — two dicts mapping each stat name to its
             best/worst value across *benchmarks*.
    """
    worst = {}
    best = {}
    for line1, prop in progress_reporter(STAT_PROPS, tr, '{line}: {value}', line=line):
        # For 'ops', higher is better; for time-based metrics, lower is better.
        best_fn, worst_fn = (max, min) if prop == 'ops' else (min, max)
        # Single pass per metric: collect the values once, then take both
        # extremes (the previous version iterated the progress reporter twice).
        values = [bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)]
        best[prop] = best_fn(values)
        worst[prop] = worst_fn(values)
    return best, worst


class TableResults:
Expand All @@ -22,6 +38,24 @@ def __init__(self, columns, sort, histogram, name_format, logger, scale_unit):
self.logger = logger
self.scale_unit = scale_unit

def compute_scale(self, benchmarks, best, worst):
    """Determine display units/scales and the column header labels.

    :param benchmarks: benchmark stat mappings being displayed.
    :param best: per-stat best values (as produced by ``compute_best_worst``).
    :param worst: per-stat worst values.
    :return: ``(unit, adjustment, ops_adjustment, labels)`` where *unit* is the
             chosen time unit name, *adjustment*/*ops_adjustment* are the
             multipliers applied to raw time/ops values before formatting, and
             *labels* maps column keys to their header text.
    """
    # self.scale_unit picks a unit/multiplier based on the value spread and sort column.
    unit, adjustment = self.scale_unit(unit='seconds', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
    ops_unit, ops_adjustment = self.scale_unit(unit='operations', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
    labels = {
        'name': f'Name (time in {unit}s)',
        'min': 'Min',
        'max': 'Max',
        'mean': 'Mean',
        'stddev': 'StdDev',
        'rounds': 'Rounds',
        'iterations': 'Iterations',
        'iqr': 'IQR',
        'median': 'Median',
        'outliers': 'Outliers',
        # Only show a scale prefix for OPS when one was chosen (e.g. 'K').
        'ops': f'OPS ({ops_unit}ops/s)' if ops_unit else 'OPS',
    }
    return unit, adjustment, ops_adjustment, labels

def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line('')
report_online_progress(progress_reporter, tr, 'Computing stats ...')
Expand All @@ -30,47 +64,24 @@ def display(self, tr, groups, progress_reporter=report_progress):
for bench in benchmarks:
bench['name'] = self.name_format(bench)

worst = {}
best = {}
solo = len(benchmarks) == 1
for line1, prop in progress_reporter(
('min', 'max', 'mean', 'median', 'iqr', 'stddev', 'ops'), tr, '{line}: {value}', line=line
):
if prop == 'ops':
worst[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
else:
worst[prop] = max(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best[prop] = min(bench[prop] for _, bench in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1))
best, worst = compute_best_worst(benchmarks, progress_reporter, tr, line)
for line1, prop in progress_reporter(('outliers', 'rounds', 'iterations'), tr, '{line}: {value}', line=line):
worst[prop] = max(
benchmark[prop] for _, benchmark in progress_reporter(benchmarks, tr, '{line} ({pos}/{total})', line=line1)
)

unit, adjustment = self.scale_unit(unit='seconds', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
ops_unit, ops_adjustment = self.scale_unit(unit='operations', benchmarks=benchmarks, best=best, worst=worst, sort=self.sort)
labels = {
'name': f'Name (time in {unit}s)',
'min': 'Min',
'max': 'Max',
'mean': 'Mean',
'stddev': 'StdDev',
'rounds': 'Rounds',
'iterations': 'Iterations',
'iqr': 'IQR',
'median': 'Median',
'outliers': 'Outliers',
'ops': f'OPS ({ops_unit}ops/s)' if ops_unit else 'OPS',
}
unit, adjustment, ops_adjustment, labels = self.compute_scale(benchmarks, best, worst)
widths = {
'name': 3 + max(len(labels['name']), max(len(benchmark['name']) for benchmark in benchmarks)),
'rounds': 2 + max(len(labels['rounds']), len(str(worst['rounds']))),
'iterations': 2 + max(len(labels['iterations']), len(str(worst['iterations']))),
'outliers': 2 + max(len(labels['outliers']), len(str(worst['outliers']))),
'ops': 2 + max(len(labels['ops']), len(NUMBER_FMT.format(best['ops'] * ops_adjustment))),
}
for prop in 'min', 'max', 'mean', 'stddev', 'median', 'iqr':
widths[prop] = 2 + max(len(labels[prop]), max(len(NUMBER_FMT.format(bench[prop] * adjustment)) for bench in benchmarks))
for prop in STAT_PROPS:
if prop not in widths:
widths[prop] = 2 + max(len(labels[prop]), max(len(NUMBER_FMT.format(bench[prop] * adjustment)) for bench in benchmarks))

rpadding = 0 if solo else 10
labels_line = labels['name'].ljust(widths['name']) + ''.join(
Expand Down Expand Up @@ -134,6 +145,131 @@ def display(self, tr, groups, progress_reporter=report_progress):
tr.write_line(' OPS: Operations Per Second, computed as 1 / Mean')


class CompareBetweenResults(TableResults):
    """Pivoted comparison table for the ``--compare-between`` CLI mode.

    Instead of one flat row per (benchmark, source) pair, each row is a unique
    benchmark and each source contributes a group of metric columns.  The first
    source is the reference (marked with ``(*)``); every later source also gets
    a percentage-change column per metric.
    """

    def display(self, tr, groups, progress_reporter=report_progress):
        """Render one pivoted table per benchmark group, then a legend."""
        tr.write_line('')

        for line, (group, benchmarks) in progress_reporter(groups, tr, 'Computing stats ... group {pos}/{total}'):
            self._display_single_between(line, group, benchmarks, tr=tr, progress_reporter=progress_reporter)

        tr.write_line('Legend:')
        tr.write_line(f' {REF_LABEL}: reference source for comparison. Cyan on terminal.')
        tr.write_line(f' {DELTA}: percentage change from reference source.')
        tr.write_line(' Green: improvement, Red: regression.')

    def _display_single_between(self, line, group, benchmarks, *, tr, progress_reporter):
        """Render the pivoted comparison table for a single group.

        :param line: progress-line prefix forwarded to *progress_reporter*.
        :param group: group key (``None`` when benchmarks are ungrouped).
        :param benchmarks: benchmark stat mappings; each must carry
                           ``'fullname'`` and may carry ``'source'``.
        :param tr: terminal reporter used for all output.
        :param progress_reporter: progress wrapper (see ``compute_best_worst``).
        """
        # Collect sources in order of first appearance and build fullname -> {source: bench} mapping
        sources = list(dict.fromkeys(bench.get('source', '') for bench in benchmarks))
        bench_map = {}
        for bench in benchmarks:
            bench_map.setdefault(bench['fullname'], {})[bench.get('source', '')] = bench

        # Pivoting is meaningless with a single source — report and bail out.
        if len(sources) < 2:
            tr.write_line(f'ERROR: --compare-between requires at least 2 source files, got {len(sources)}.', red=True)
            return

        # Only stat columns can be pivoted; fall back to the sort column, then 'min'.
        metrics = [c for c in self.columns if c in STAT_PROPS]
        if not metrics:
            metrics = [self.sort] if self.sort in STAT_PROPS else ['min']

        all_benches = list(benchmarks)
        best, worst = compute_best_worst(all_benches, progress_reporter, tr, line)
        _unit, adjustment, ops_adjustment, labels = self.compute_scale(all_benches, best, worst)
        # Per-metric display multiplier: 'ops' uses the ops scale, the rest the time scale.
        adjustments = {m: (ops_adjustment if m == 'ops' else adjustment) for m in metrics}

        # Sort benchmarks by the sort metric from the first source
        # (benchmarks missing from the reference source sort last via +inf).
        first_source = sources[0]
        sorted_names = sorted(
            bench_map,
            key=lambda name: bench_map[name].get(first_source, {}).get(self.sort, float('inf')),
        )

        # Format benchmark display names without source suffix
        display_names = {}
        for fullname in sorted_names:
            any_bench = next(iter(bench_map[fullname].values()))
            display_names[fullname] = self.name_format({**any_bench, 'source': ''})

        name_width = 3 + max(len(labels['name']), *(len(n) for n in display_names.values()))

        columns = []  # Each entry: (source_idx, metric, label, width, is_change)

        def _val_col_width(metric, src, label):
            """
            Helper to compute value column width for a given metric and source
            """
            adj = adjustments[metric]
            w = len(label)
            for fn in sorted_names:
                bench = bench_map[fn].get(src)
                if bench is not None:
                    w = max(w, len(NUMBER_FMT.format(bench[metric] * adj)))
            return w + 2

        # Format source labels (short)
        source_labels = []
        for src in sources:
            parts = src.split('/')
            # Use the basename (truncated to 12 chars) so storage paths stay readable.
            label = parts[-1][:12] if parts[-1] else src[:12]
            source_labels.append(label)

        # Reference source columns
        for metric in metrics:
            label = f'{source_labels[0]}{REF_LABEL} {labels[metric]}'
            columns.append((0, metric, label, _val_col_width(metric, sources[0], label), False))

        # Subsequent sources: value + diff pairs per metric
        for si in range(1, len(sources)):
            slabel = source_labels[si]
            for metric in metrics:
                val_label = f'{slabel} {labels[metric]}'
                columns.append((si, metric, val_label, _val_col_width(metric, sources[si], val_label), False))
                diff_label = f'{DELTA}{labels[metric]}'
                # Delta columns have a fixed minimum width of 10 for alignment.
                columns.append((si, metric, diff_label, max(len(diff_label) + 2, 10), True))

        header = ''.join(
            [
                labels['name'].ljust(name_width),
                *(label.rjust(width) for _si, _metric, label, width, _is_change in columns),
            ]
        )

        group_name = '' if group is None else f' {group!r}'
        tr.write_line(
            f' benchmark{group_name}: {len(sorted_names)} tests, {len(sources)} sources '.center(len(header), '-'),
            yellow=True,
        )
        tr.write_line(header)
        tr.write_line('-' * len(header), yellow=True)

        for fullname in sorted_names:
            tr.write(display_names[fullname].ljust(name_width))
            # Raw (unscaled) values seen so far in this row, keyed by
            # (source_idx, metric); used as the basis for the delta columns.
            row_values = {}
            for si, metric, _label, width, is_change in columns:
                if not is_change:
                    bench = bench_map[fullname].get(sources[si])
                    if bench is None:
                        tr.write('N/A'.rjust(width))
                    else:
                        row_values[(si, metric)] = bench[metric]
                        # Reference-source values are highlighted in cyan.
                        tr.write(NUMBER_FMT.format(bench[metric] * adjustments[metric]).rjust(width), cyan=(si == 0))
                else:
                    base_val = row_values.get((0, metric))
                    cmp_val = row_values.get((si, metric))
                    # No delta when either side is missing or the base is zero
                    # (division by zero).
                    if base_val is None or cmp_val is None or base_val == 0:
                        tr.write('N/A'.rjust(width))
                    else:
                        pct = (cmp_val - base_val) / abs(base_val) * 100
                        # For ops, higher is better; for time metrics, lower is better
                        is_improvement = (pct > 0) if metric == 'ops' else (pct < 0)
                        is_regression = (pct < 0) if metric == 'ops' else (pct > 0)
                        tr.write(f'{pct:+.1f}%'.rjust(width), green=is_improvement, red=is_regression, bold=True)
            tr.write('\n')

        tr.write_line('-' * len(header), yellow=True)
        tr.write_line('')


def compute_baseline_scale(baseline, value, width):
if not width:
return ''
Expand Down
61 changes: 60 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def test_help_compare(testdir, args):
' [--columns LABELS] [--name FORMAT]',
' [--time-unit COLUMN]',
' [--histogram [FILENAME-PREFIX]]',
' [--csv [FILENAME]]',
' [--compare-between] [--csv [FILENAME]]',
' [[]glob_or_file *[]]',
'',
'Compare saved runs.',
Expand Down Expand Up @@ -122,6 +122,8 @@ def test_help_compare(testdir, args):
' FILENAME-PREFIX-test_name.svg. If FILENAME-PREFIX',
" contains slashes ('/') then directories will be",
" created. Default: 'benchmark_*'",
' --compare-between Compare same-named benchmarks across different source',
' files.',
" --csv [FILENAME] Save a csv report. If FILENAME contains slashes ('/')",
' then directories will be created. Default:',
" 'benchmark_*'",
Expand Down Expand Up @@ -287,6 +289,63 @@ def test_compare(testdir, name, name_pattern_generator):
assert result.ret == 0


def test_compare_between(testdir):
    """End-to-end check of the pivoted ``--compare-between`` output."""
    base_args = (
        'py.test-benchmark',
        '--storage',
        STORAGE,
        'compare',
        '--compare-between',
        '--sort=min',
        '--columns=min,max',
        '--name=short',
    )

    # Two sources: reference (*), then interleaved value+delta columns.
    two_sources = testdir.run(*base_args, '0001', '0002')
    expected_two = [
        '---*--- benchmark: 1 tests, 2 sources ---*---',
        'Name (time in ns) *0001_b87b9aa(*) Min*0001_b87b9aa(*) Max*0002_b87b9aa Min*\u0394Min*0002_b87b9aa Max*\u0394Max',
        '---*---',
        'xfast_parametrized[[]0[]] *217.3145*11*447.3891*216.9028*-0.2%*7*739.2997*-32.4%',
        '---*---',
        '',
        'Legend:',
        ' (*): reference source for comparison.*',
    ]
    two_sources.stdout.fnmatch_lines(expected_two)
    assert two_sources.ret == 0

    # Three sources: each non-reference source is followed by its own deltas.
    three_sources = testdir.run(*base_args, '0001', '0002', '0003')
    expected_three = [
        '---*--- benchmark: 1 tests, 3 sources ---*---',
        'Name (time in ns) *0001_b87b9aa(*) Min*0001_b87b9aa(*) Max*0002_b87b9aa Min*\u0394Min*0002_b87b9aa Max*\u0394Max*0003_5b78858 Min*\u0394Min*0003_5b78858 Max*\u0394Max',
        '---*---',
        'xfast_parametrized[[]0[]] *217.3145*11*447.3891*216.9028*-0.2%*7*739.2997*-32.4%*215.6286*-0.8%*10*318.6159*-9.9%',
        '---*---',
    ]
    three_sources.stdout.fnmatch_lines(expected_three)
    assert three_sources.ret == 0


def test_compare_between_histogram_error(testdir):
    """``--compare-between`` must reject ``--histogram`` with a parser error."""
    cli_args = [
        'py.test-benchmark',
        '--storage',
        STORAGE,
        'compare',
        '--compare-between',
        '--histogram',
        'foobar',
        '0001',
        '0002',
    ]
    outcome = testdir.run(*cli_args)
    outcome.stderr.fnmatch_lines(['*error: --compare-between is not compatible with --histogram*'])
    assert outcome.ret != 0


@pytest.mark.parametrize(
('name', 'name_pattern_generator', 'unit'),
[
Expand Down
Loading