Skip to content

Commit a9b8dfe

Browse files
committed
[libc++] Add a script to find outliers and re-run candidates in LNT results
This allows selectively re-running benchmarks that are suspected to contain a lot of noise.
1 parent 91c35d6 commit a9b8dfe

File tree

2 files changed

+242
-17
lines changed

2 files changed

+242
-17
lines changed

libcxx/utils/find-rerun-candidates

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import datetime
5+
import functools
6+
import os
7+
import pathlib
8+
import re
9+
import statistics
10+
import subprocess
11+
import sys
12+
13+
import git
14+
import pandas
15+
import tqdm
16+
17+
@functools.total_ordering
class Commit:
    """
    This class represents a commit inside a given Git repository.
    """

    def __init__(self, git_repo, sha):
        self._git_repo = git_repo
        self._sha = sha
        # Per-instance cache for `show()`. Decorating the method with
        # `functools.cache` would key a process-global cache on `self`,
        # keeping every Commit object alive for the lifetime of the
        # program (flake8-bugbear B019).
        self._show_cache = {}

    def __eq__(self, other):
        """
        Return whether two commits refer to the same commit.

        This doesn't take into account the content of the Git tree at those commits, only the
        'identity' of the commits themselves.
        """
        return self.fullrev == other.fullrev

    def __lt__(self, other):
        """
        Return whether a commit is an ancestor of another commit in the Git repository.
        """
        # Is self._sha an ancestor of other._sha?
        # `git merge-base --is-ancestor` exits with 0 (yes) or 1 (no); any
        # other exit code means the query itself failed (e.g. unknown SHA).
        res = subprocess.run(['git', '-C', self._git_repo, 'merge-base', '--is-ancestor', self._sha, other._sha])
        if res.returncode not in (0, 1):
            raise RuntimeError(f'Error when trying to obtain the commit order for {self._sha} and {other._sha}')
        return res.returncode == 0

    def __hash__(self):
        """
        Return a hash based on the full revision of this commit, so that equal
        commits (same full revision) hash identically.
        """
        return hash(self.fullrev)

    def show(self, include_diff=False):
        """
        Return the commit information equivalent to `git show` associated to this commit.

        The result is cached per (instance, include_diff) pair since the output
        for a given commit is immutable.
        """
        if include_diff not in self._show_cache:
            cmd = ['git', '-C', self._git_repo, 'show', self._sha]
            if not include_diff:
                cmd.append('--no-patch')
            self._show_cache[include_diff] = subprocess.check_output(cmd, text=True)
        return self._show_cache[include_diff]

    @functools.cached_property
    def shortrev(self):
        """
        Return the shortened version of the given SHA.
        """
        return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', '--short', self._sha], text=True).strip()

    @functools.cached_property
    def fullrev(self):
        """
        Return the full SHA associated to this commit.
        """
        return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', self._sha], text=True).strip()

    @functools.cached_property
    def commit_date(self):
        """
        Return the date of the commit as a `datetime.datetime` object.
        """
        repo = git.Repo(self._git_repo)
        return datetime.datetime.fromtimestamp(repo.commit(self._sha).committed_date)

    def prefetch(self):
        """
        Prefetch cached properties associated to this commit object.

        This makes it possible to control when time is spent recovering that information from Git for
        e.g. better reporting to the user.
        """
        self.commit_date
        self.fullrev
        self.shortrev
        self.show()

    def __str__(self):
        return self._sha
98+
99+
def directory_path(string):
    """
    Argparse-friendly type callback: return `string` as a `pathlib.Path` if it
    names an existing directory, and raise `NotADirectoryError` otherwise.
    """
    if not os.path.isdir(string):
        raise NotADirectoryError(string)
    return pathlib.Path(string)
104+
105+
def parse_lnt(lines, aggregate=statistics.median):
    """
    Parse lines in LNT format and return a list of dictionaries of the form:

    [
        {
            'benchmark': <benchmark1>,
            <metric1>: float,
            <metric2>: float,
            'data_points': int,
            ...
        },
        {
            'benchmark': <benchmark2>,
            <metric1>: float,
            <metric2>: float,
            'data_points': int,
            ...
        },
        ...
    ]

    If a metric has multiple values associated to it, they are aggregated into a single
    value using the provided aggregation function. Empty (or whitespace-only) lines
    are ignored.
    """
    results = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Split on arbitrary whitespace (not a single space) so tabs and runs of
        # spaces between the identifier and the value are tolerated, and split
        # the metric off the *right* of the identifier so that benchmark names
        # may themselves contain dots.
        (identifier, value) = line.split(None, 1)
        (benchmark, metric) = identifier.rsplit('.', 1)
        entry = results.setdefault(benchmark, {'benchmark': benchmark})
        entry.setdefault(metric, []).append(float(value))

    for entry in results.values():
        metrics = [key for key in entry if isinstance(entry[key], list)]
        # A benchmark's sample count is that of its least-sampled metric.
        min_data_points = min(len(entry[metric]) for metric in metrics)
        for metric in metrics:
            entry[metric] = aggregate(entry[metric])
        entry['data_points'] = min_data_points

    return list(results.values())
154+
155+
def sorted_revlist(git_repo, commits):
    """
    Return the list of commits sorted by their chronological order (from oldest to newest) in the
    provided Git repository. Items earlier in the list are older than items later in the list.
    """
    cmd = ['git', '-C', git_repo, 'rev-list', '--no-walk', *commits]
    output = subprocess.check_output(cmd, text=True)
    # `git rev-list` emits commits newest-first; flip so the oldest comes first.
    return list(reversed(output.strip().splitlines()))
163+
164+
def main(argv):
    """
    Entry point: analyze per-commit benchmark results stored in LNT format and
    print the commits whose results look like outliers, i.e. good candidates
    for re-running to reduce noise.

    `argv` is the list of command-line arguments, excluding the program name.
    """
    parser = argparse.ArgumentParser(
        prog='find-rerun-candidates',
        description='Find benchmarking data points that are good candidates for additional runs, to reduce noise.')
    parser.add_argument('directory', type=directory_path,
        help='Path to a valid directory containing benchmark data in LNT format, each file being named <commit>.lnt. '
             'This is also the format generated by the `benchmark-historical` utility.')
    parser.add_argument('--metric', type=str, default='execution_time',
        help='The metric to analyze. LNT data may contain multiple metrics (e.g. code size, execution time, etc) -- '
             'this option allows selecting which metric is analyzed for rerun candidates. The default is "execution_time".')
    parser.add_argument('--filter', type=str, required=False,
        help='An optional regular expression used to filter the benchmarks included in the analysis. '
             'Only benchmarks whose names match the regular expression will be analyzed.')
    parser.add_argument('--outlier-threshold', metavar='FLOAT', type=float, default=0.1,
        help='Relative difference from the previous points for considering a data point as an outlier. This threshold is '
             'expressed as a floating point number, e.g. 0.25 will detect points that differ by more than 25%% from their '
             'previous result.')
    parser.add_argument('--data-points-threshold', type=int, required=False,
        help='Number of data points above which an outlier is not considered an outlier. If an outlier has more than '
             'that number of data points yet its relative difference is above the threshold, it is not considered an '
             'outlier. This can be used to re-run noisy data points until we have at least N samples, at which point '
             'we consider the data to be accurate, even if the result is beyond the threshold. By default, there is '
             'no limit on the number of data points.')
    parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()),
        help='Path to the git repository to use for ordering commits in time. '
             'By default, the current working directory is used.')
    args = parser.parse_args(argv)

    # Extract benchmark data from the directory.
    data = {}
    files = [f for f in args.directory.glob('*.lnt')]
    for file in tqdm.tqdm(files, desc='Parsing LNT files'):
        rows = parse_lnt(file.read_text().splitlines())
        # The commit SHA is encoded in the file name, i.e. <sha>.lnt.
        (commit, _) = os.path.splitext(os.path.basename(file))
        commit = Commit(args.git_repo, commit)
        data[commit] = rows

    # Obtain commit information which is then cached throughout the program. Do this
    # eagerly so we can provide a progress bar.
    for commit in tqdm.tqdm(data.keys(), desc='Prefetching Git information'):
        commit.prefetch()

    # Create a dataframe from the raw data and add some columns to it:
    # - 'commit' represents the Commit object associated to the results in that row
    # - `revlist_order` represents the order of the commit within the Git repository.
    revlist = sorted_revlist(args.git_repo, [c.fullrev for c in data.keys()])
    data = pandas.DataFrame([row | {'commit': c} for (c, rows) in data.items() for row in rows])
    data = data.join(pandas.DataFrame([{'revlist_order': revlist.index(c.fullrev)} for c in data['commit']]))

    # Filter the benchmarks if needed.
    if args.filter is not None:
        keeplist = [b for b in data['benchmark'] if re.search(args.filter, b) is not None]
        data = data[data['benchmark'].isin(keeplist)]

    # Detect outliers by selecting all benchmarks whose change percentage is beyond the threshold.
    # If we have a max number of points, also take that into account.
    if args.data_points_threshold is not None:
        print(f'Generating outliers with more than {args.outlier_threshold * 100}% relative difference and less than {args.data_points_threshold} data points')
    else:
        print(f'Generating outliers with more than {args.outlier_threshold * 100}% relative difference')

    overall = set()
    for (benchmark, series) in data.sort_values(by='revlist_order').groupby('benchmark'):
        # pct_change() compares each row with the chronologically previous one,
        # since the series was sorted by revlist_order above.
        pct_change = series[args.metric].pct_change()
        outliers = series[pct_change.abs() > args.outlier_threshold]
        if args.data_points_threshold is not None:
            # An outlier with enough samples is considered trustworthy and kept.
            outliers = outliers[outliers['data_points'] < args.data_points_threshold]
        outliers = set(outliers['commit'])
        overall |= outliers
        if len(outliers) > 0:
            print(f'{benchmark}: {" ".join(c.shortrev for c in outliers)}')

    if len(overall) > 0:
        print(f'Summary: {" ".join(c.shortrev for c in overall)}')
    else:
        print(f'No outliers')

if __name__ == '__main__':
    main(sys.argv[1:])

libcxx/utils/visualize-historical

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -213,13 +213,6 @@ def main(argv):
213213
'Since the chart is interactive, it generally makes most sense to include all the benchmarks '
214214
'and to then filter them in the browser, but in some cases producing a chart with a reduced '
215215
'number of data series is useful.')
216-
parser.add_argument('--find-outliers', metavar='FLOAT', type=float, required=False,
217-
help='Instead of building a chart, detect commits that show a large spike (more than the given relative threshold) '
218-
'with the previous result and print those to standard output. This can be used to generate a list of '
219-
'potential outliers that we might want to re-generate the data for. The threshold is expressed as a '
220-
'floating point number, e.g. 0.25 will detect points that differ by more than 25%% from their previous '
221-
'result. This option respects --filter, i.e. only benchmarks that match the filter will be analyzed for '
222-
'outliers.')
223216
parser.add_argument('--subtitle', type=str, required=False,
224217
help='Optional subtitle for the chart. This can be used to help identify the contents of the chart.')
225218
parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()),
@@ -258,16 +251,6 @@ def main(argv):
258251
keeplist = [b for b in data['benchmark'] if re.search(args.filter, b) is not None]
259252
data = data[data['benchmark'].isin(keeplist)]
260253

261-
# If requested, perform a basic pass to detect outliers.
262-
# Note that we consider a commit to be an outlier if any of the benchmarks for that commit is an outlier.
263-
if args.find_outliers is not None:
264-
threshold = args.find_outliers
265-
outliers = set()
266-
for (benchmark, series) in data.sort_values(by='revlist_order').groupby('benchmark'):
267-
outliers |= set(series[series[args.metric].pct_change() > threshold]['commit'])
268-
print(f'Outliers (more than {threshold * 100}%): {" ".join(c.shortrev for c in outliers)}')
269-
return
270-
271254
# Plot the data for all the required benchmarks.
272255
figure = create_plot(data, args.metric, subtitle=args.subtitle)
273256
do_open = args.output is None or args.open

0 commit comments

Comments
 (0)