Commit d854f0f

[benchmark] test_performance with BenchmarkDriver
Refactored the `test_performance` function to use the existing BenchmarkDriver and TestComparator. This replaces the hand-rolled parser and comparison logic with library functions that already have full unit test coverage.
1 parent: cd4886a
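In outline, the commit drops the script's own CSV regex and min-value comparison in favor of the compare_perf_tests pipeline. The sketch below is illustrative only and not part of the commit: the class names and attributes (results_from_string, decreased, increased, name) are taken from the diff that follows, while the benchmark log lines and the 0.05 threshold are invented.

    from compare_perf_tests import LogParser, TestComparator, create_report

    # Invented sample output in the benchmark log format; real runs produce
    # '#,TEST,SAMPLES,MIN(us),...' result lines shaped like these.
    old_log = '1,Ackermann,3,715,728,721,5,721\n2,AngryPhonebook,3,2550,2570,2560,8,2560'
    new_log = '1,Ackermann,3,710,722,716,5,716\n2,AngryPhonebook,3,3100,3130,3115,9,3115'

    old_results = LogParser.results_from_string(old_log)
    new_results = LogParser.results_from_string(new_log)

    # TestComparator buckets tests by relative change against a threshold,
    # replacing the removed score_re parsing and within_threshold check.
    comparator = TestComparator(old_results, new_results, 0.05)
    changed = comparator.decreased + comparator.increased
    print([test.name for test in changed])

    # create_report renders the comparison the same way report_results does.
    print(create_report(old_results, new_results, 0.05, 'git'))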

benchmark/scripts/run_smoke_bench

Lines changed: 54 additions & 82 deletions
@@ -26,11 +26,10 @@ from __future__ import print_function
 import argparse
 import glob
 import os
-import re
 import subprocess
 import sys

-from compare_perf_tests import LogParser, create_report
+from compare_perf_tests import LogParser, TestComparator, create_report

 from imp import load_source
 # import Benchmark_Driver # doesn't work because it misses '.py' extension
@@ -149,89 +148,61 @@ def test_opt_levels(args):
     return 0


-def test_performance(opt_level, old_dir, new_dir, threshold, num_samples,
-                     output_file):
-    num_results_dont_differ = 0
-    iter = 1
-    to_test = None
-    prev_num_tests = None
+def measure(driver, tests, i):
+    """Log and measure samples of the tests with the given driver.

-    old_lines = ""
-    new_lines = ""
+    Collect an increasing number of samples, depending on the iteration.
+    """
+    num_samples = min(i + 3, 10)
+    msg = ' Iteration {0} for {1}: num samples = {2}, '.format(
+        i, driver.args.tests, num_samples)
+    msg += ('running all tests' if driver.all_tests == tests else
+            're-testing {0} tests'.format(len(tests)))
+    log(msg)
+    driver.tests = tests
+    return driver.run(num_samples=num_samples, sample_time=0.0025)

-    # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),PEAK_MEMORY(B)
-    score_re = re.compile(r"(\d+),([\w.\-]+),\d+,(\d+)")
-
-    while to_test is None or len(to_test) > 0:
-        tested_benchmarks = set()
-
-        # (benchmark_name, benchmark_directory) -> (min_value, result_line)
-        values = {}
-
-        # Run the benchmarks and store the results in 'values'.
-        for bench_dir in (old_dir, new_dir):
-            log(' Iteration ' + str(iter) + ' for ' + bench_dir +
-                ': num samples = ' + str(num_samples) +
-                (', running all tests' if to_test is None
-                 else ', re-testing ' + str(len(to_test)) + ' tests'))
-
-            result = get_results(bench_dir, opt_level, num_samples, to_test)
-            for line in result.splitlines():
-                m = score_re.match(line)
-                if m:
-                    testname = m.group(2)
-                    val = int(m.group(3))
-                    values[(testname, bench_dir)] = (val, line)
-                    tested_benchmarks.add(testname)
-
-        # Some local utility functions
-
-        def bench_in(bench, bench_dir):
-            return (bench, bench_dir) in values
-
-        def within_threshold(bench):
-            old_val = values[(bench, old_dir)][0]
-            new_val = values[(bench, new_dir)][0]
-            if not new_val:
-                return True
-            f = float(old_val) / float(new_val)
-            return f >= 1.0 - threshold and f <= 1.0 + threshold
-
-        def result_line(bench, bench_dir):
-            result_line = values[(bench, bench_dir)][1]
-            return result_line + '\n'
-
-        # Check which benchmarks are added/removed and which need to be re-run
-        to_test = []
-        for bench in sorted(tested_benchmarks):
-            if bench_in(bench, old_dir) and not bench_in(bench, new_dir):
-                old_lines += result_line(bench, old_dir)
-            elif bench_in(bench, new_dir) and not bench_in(bench, old_dir):
-                new_lines += result_line(bench, new_dir)
-            elif within_threshold(bench) or num_results_dont_differ >= 4:
-                old_lines += result_line(bench, old_dir)
-                new_lines += result_line(bench, new_dir)
-            else:
-                to_test.append(bench)
-                if VERBOSE:
-                    log(' test again ' + bench)
-
-        # Track how many times we could not reduce the number of benchmarks
-        if prev_num_tests == len(to_test):
-            num_results_dont_differ += 1
-        else:
-            num_results_dont_differ = 0
-        prev_num_tests = len(to_test)

-        # Increase the number of samples for benchmarks which re-run
-        if num_samples < 10:
-            num_samples += 1
+def merge(results, other_results):
+    """Merge the other PerformanceTestResults into the first dictionary."""
+    for test, result in other_results.items():
+        results[test].merge(result)
+    return results

-        iter += 1
+
+def test_performance(opt_level, old_dir, new_dir, threshold, num_samples,
+                     output_file):
+    """Detect performance changes in benchmarks.
+
+    Start fast with few samples per benchmark and gradually spend more time
+    gathering more precise measurements of the change candidates.
+    """
+
+    i, unchanged_length_count = 0, 0
+    old, new = [BenchmarkDriver(DriverArgs(dir, optimization=opt_level))
+                for dir in [old_dir, new_dir]]
+    results = [measure(driver, driver.tests, i) for driver in [old, new]]
+    tests = TestComparator(results[0], results[1], threshold)
+    changed = tests.decreased + tests.increased
+
+    while len(changed) > 0 and unchanged_length_count < 5:
+        i += 1
+        if VERBOSE:
+            log(' test again: ' + str([test.name for test in changed]))
+        results = [merge(the_results,
+                         measure(driver, [test.name for test in changed], i))
+                   for the_results, driver in zip(results, [old, new])]
+        tests = TestComparator(results[0], results[1], threshold)
+        changed = tests.decreased + tests.increased
+
+        if len(old.tests) == len(changed):
+            unchanged_length_count += 1
+        else:
+            unchanged_length_count = 0

     log('')
-    return report_results("Performance: -" + opt_level,
-                          old_lines, new_lines, threshold * 1.4, output_file)
+    return report_results("Performance: -" + opt_level, None, None,
+                          threshold * 1.4, output_file, *results)


 def get_results(bench_dir, opt_level, num_samples, to_test):
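The retest loop above starts cheap and buys precision only where it matters: measure() takes three samples per benchmark on the first pass and one more per retest iteration, capped at ten, and test_performance keeps re-measuring only the suspected changes until the changed set empties or fails to shrink for five consecutive iterations. Below is a standalone sketch of that control flow under stated assumptions: fake_measure and fake_compare are made-up stand-ins for BenchmarkDriver.run and TestComparator, and the benchmark names, timings, and threshold are invented.

    import random

    def fake_measure(tests, i):
        """Stand-in for measure(): same sampling schedule, random 'timings'."""
        num_samples = min(i + 3, 10)  # 3 samples first, +1 per retest, max 10
        print('iteration {0}: {1} tests, {2} samples each'.format(
            i, len(tests), num_samples))
        return {t: random.gauss(100, 5.0 / num_samples) for t in tests}

    def fake_compare(old, new, threshold):
        """Stand-in for TestComparator: flag ratios outside the threshold."""
        return [t for t in old if abs(old[t] / new[t] - 1.0) > threshold]

    all_tests = ['Ackermann', 'AngryPhonebook', 'ArrayAppend']  # invented names
    i, unchanged_length_count = 0, 0
    old = fake_measure(all_tests, i)
    new = fake_measure(all_tests, i)
    changed = fake_compare(old, new, 0.05)

    while changed and unchanged_length_count < 5:
        i += 1
        # Re-measure only the change candidates and fold the new samples into
        # the prior results, mirroring merge() plus measure() in the diff.
        old.update(fake_measure(changed, i))
        new.update(fake_measure(changed, i))
        previously_changed = len(changed)
        changed = fake_compare(old, new, 0.05)
        # Count iterations where the changed set failed to shrink; the script
        # compares len(old.tests), which at that point holds the names it just
        # re-tested, against len(changed) for the same purpose.
        unchanged_length_count = (unchanged_length_count + 1
                                  if len(changed) == previously_changed else 0)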
@@ -294,9 +265,10 @@ def get_codesize(filename):
     return int(data_line.split('\t')[0])


-def report_results(title, old_lines, new_lines, threshold, output_file):
-    old_results = LogParser.results_from_string(old_lines)
-    new_results = LogParser.results_from_string(new_lines)
+def report_results(title, old_lines, new_lines, threshold, output_file,
+                   old_results=None, new_results=None):
+    old_results = old_results or LogParser.results_from_string(old_lines)
+    new_results = new_results or LogParser.results_from_string(new_lines)

     print("------- " + title + " -------")
     print(create_report(old_results, new_results, threshold, 'git'))
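The widened report_results signature keeps both call styles working: callers holding raw benchmark log text still go through the LogParser fallback, while test_performance hands over already-parsed result dictionaries so the or expressions skip re-parsing. A minimal sketch of the two call paths, assuming invented log text, assuming None is an acceptable "no output file" value, and modeling nothing beyond the printing shown above:

    # Invented single-benchmark logs in the script's CSV result format.
    old_log = '1,Ackermann,3,715,728,721,5,721'
    new_log = '1,Ackermann,3,710,722,716,5,716'

    # Old-style call: only text is given, so both 'or' fallbacks parse it.
    report_results("Performance: -O", old_log, new_log, 0.07, None)

    # New-style call, as test_performance now does: pre-parsed results are
    # passed, the *_lines arguments are ignored, and nothing is re-parsed.
    old_results = LogParser.results_from_string(old_log)
    new_results = LogParser.results_from_string(new_log)
    report_results("Performance: -O", None, None, 0.07, None,
                   old_results, new_results)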
