Skip to content

Commit 0acfdbd

Browse files
committed
[libc++] Improve historical benchmark visualization
- Use LOWESS instead of OLS trendlines; it tends to fit the data better
- Plot using the commit date instead of the arbitrary revlist order
- Fix progress bar reporting when we prefetch Git commit data
- Allow adding a subtitle to charts, which is helpful to stay organized
- Ensure that series are always presented in the same (alphabetical) order
1 parent eb85899 commit 0acfdbd

File tree

2 files changed

+29
-13
lines changed

2 files changed

+29
-13
lines changed

libcxx/utils/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
GitPython
12
numpy
23
pandas
34
plotly

libcxx/utils/visualize-historical

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22

33
import argparse
4+
import datetime
45
import functools
56
import os
67
import pathlib
@@ -10,6 +11,7 @@ import subprocess
1011
import sys
1112
import tempfile
1213

14+
import git
1315
import pandas
1416
import plotly
1517
import plotly.express
@@ -74,13 +76,22 @@ class Commit:
7476
"""
7577
return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', self._sha], text=True).strip()
7678

79+
@functools.cached_property
80+
def commit_date(self):
81+
"""
82+
Return the date of the commit as a `datetime.datetime` object.
83+
"""
84+
repo = git.Repo(self._git_repo)
85+
return datetime.datetime.fromtimestamp(repo.commit(self._sha).committed_date)
86+
7787
def prefetch(self):
7888
"""
7989
Prefetch cached properties associated to this commit object.
8090
8191
This makes it possible to control when time is spent recovering that information from Git for
8292
e.g. better reporting to the user.
8393
"""
94+
self.commit_date
8495
self.fullrev
8596
self.shortrev
8697
self.show()
@@ -101,20 +112,21 @@ def truncate_lines(string, n, marker=None):
101112
assert len(truncated) <= n, "broken post-condition"
102113
return '\n'.join(truncated)
103114

104-
def create_plot(data, metric):
115+
def create_plot(data, metric, subtitle=None):
105116
"""
106117
Create a plot object showing the evolution of each benchmark throughout the given commits for
107118
the given metric.
108119
"""
109-
data = data.sort_values(by='revlist_order')
120+
data = data.sort_values(by=['date', 'benchmark'])
110121
revlist = pandas.unique(data['commit']) # list of all commits in chronological order
111122
hover_info = {c: truncate_lines(c.show(), 30, marker='...').replace('\n', '<br>') for c in revlist}
112123
figure = plotly.express.scatter(data, title=f"{revlist[0].shortrev} to {revlist[-1].shortrev}",
113-
x='revlist_order', y=metric,
124+
subtitle=subtitle,
125+
x='date', y=metric,
114126
symbol='benchmark',
115127
color='benchmark',
116128
hover_name=[hover_info[c] for c in data['commit']],
117-
trendline="ols")
129+
trendline="lowess")
118130
return figure
119131

120132
def directory_path(string):
@@ -184,7 +196,7 @@ def main(argv):
184196
description='Visualize historical data in LNT format. This program generates a HTML file that embeds an '
185197
'interactive plot with the provided data. The HTML file can then be opened in a browser to '
186198
'visualize the data as a chart.',
187-
epilog='This script depends on the `plotly` and the `tqdm` Python modules.')
199+
epilog='This script depends on the modules listed in `libcxx/utils/requirements.txt`.')
188200
parser.add_argument('directory', type=directory_path,
189201
help='Path to a valid directory containing benchmark data in LNT format, each file being named <commit>.lnt. '
190202
'This is also the format generated by the `benchmark-historical` utility.')
@@ -208,6 +220,8 @@ def main(argv):
208220
'floating point number, e.g. 0.25 will detect points that differ by more than 25%% from their previous '
209221
'result. This option respects --filter, i.e. only benchmarks that match the filter will be analyzed for '
210222
'outliers.')
223+
parser.add_argument('--subtitle', type=str, required=False,
224+
help='Optional subtitle for the chart. This can be used to help identify the contents of the chart.')
211225
parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()),
212226
help='Path to the git repository to use for ordering commits in time. '
213227
'By default, the current working directory is used.')
@@ -217,26 +231,27 @@ def main(argv):
217231
args = parser.parse_args(argv)
218232

219233
# Extract benchmark data from the directory.
220-
data = []
234+
data = {}
221235
files = [f for f in args.directory.glob('*.lnt')]
222236
for file in tqdm.tqdm(files, desc='Parsing LNT files'):
237+
rows = parse_lnt(file.read_text().splitlines())
223238
(commit, _) = os.path.splitext(os.path.basename(file))
224239
commit = Commit(args.git_repo, commit)
225-
with open(file, 'r') as f:
226-
rows = parse_lnt(f.readlines())
227-
data.extend((commit, row) for row in rows)
240+
data[commit] = rows
228241

229242
# Obtain commit information which is then cached throughout the program. Do this
230243
# eagerly so we can provide a progress bar.
231-
for (commit, _) in tqdm.tqdm(data, desc='Prefetching Git information'):
244+
for commit in tqdm.tqdm(data.keys(), desc='Prefetching Git information'):
232245
commit.prefetch()
233246

234247
# Create a dataframe from the raw data and add some columns to it:
235248
# - 'commit' represents the Commit object associated to the results in that row
236249
# - `revlist_order` represents the order of the commit within the Git repository.
237-
data = pandas.DataFrame([row | {'commit': commit} for (commit, row) in data])
238-
revlist = sorted_revlist(args.git_repo, [c.fullrev for c in set(data['commit'])])
250+
# - `date` represents the commit date
251+
revlist = sorted_revlist(args.git_repo, [c.fullrev for c in data.keys()])
252+
data = pandas.DataFrame([row | {'commit': c} for (c, rows) in data.items() for row in rows])
239253
data = data.join(pandas.DataFrame([{'revlist_order': revlist.index(c.fullrev)} for c in data['commit']]))
254+
data = data.join(pandas.DataFrame([{'date': c.commit_date} for c in data['commit']]))
240255

241256
# Filter the benchmarks if needed.
242257
if args.filter is not None:
@@ -254,7 +269,7 @@ def main(argv):
254269
return
255270

256271
# Plot the data for all the required benchmarks.
257-
figure = create_plot(data, args.metric)
272+
figure = create_plot(data, args.metric, subtitle=args.subtitle)
258273
do_open = args.output is None or args.open
259274
output = args.output if args.output is not None else tempfile.NamedTemporaryFile(suffix='.html').name
260275
plotly.io.write_html(figure, file=output, auto_open=do_open)

0 commit comments

Comments
 (0)