"checkup" mode for benchmarks that checks GC activity and compilations

steve-s · steve-s · commit fa2b47b99311 · 2021-08-31T10:43:17.000+02:00
This mode adds the necessary flags to the VM, doubles the iteration count,
and then analyzes the output for excessive GC activity (>10% of iteration
time), compilations finishing after the original iteration count, and
warmup detected after the original iteration count.
diff --git a/mx.graalpython/mx_graalpython.py b/mx.graalpython/mx_graalpython.py
@@ -27,15 +27,14 @@
 import contextlib
 import datetime
 import glob
+import itertools
 import json
 import os
 import pathlib
 import platform
 import re
 import shlex
 import shutil
-
-import itertools
 import sys
 
 HPY_IMPORT_ORPHAN_BRANCH_NAME = "hpy-import"
diff --git a/mx.graalpython/mx_graalpython_benchmark.py b/mx.graalpython/mx_graalpython_benchmark.py
@@ -476,6 +476,7 @@ def get_extra_polyglot_args(self):
 class PythonBaseBenchmarkSuite(VmBenchmarkSuite, AveragingBenchmarkMixin):
     def __init__(self, name, benchmarks):
         super(PythonBaseBenchmarkSuite, self).__init__()
+        self._checkup = 'GRAALPYTHON_BENCHMARKS_CHECKUP' in os.environ
         self._name = name
         self._benchmarks = benchmarks
         self._graph_dump_time = None
@@ -680,9 +681,15 @@ def _replace_host_vm(key):
         _replace_host_vm('graalvm-ee')
         self.post_run_graph(benchmarks[0], dims['host-vm-config'], dims['guest-vm-config'])
 
+        if self._checkup:
+            self.checkup(out)
+
         return ret_code, out, dims
 
     def run(self, benchmarks, bm_suite_args):
+        if '--checkup' in bm_suite_args:
+            self._checkup = True
+            bm_suite_args.remove('--checkup')
         results = super(PythonBaseBenchmarkSuite, self).run(benchmarks, bm_suite_args)
         self.addAverageAcrossLatestResults(results)
         return results
@@ -708,6 +715,10 @@ def postprocess_run_args(self, run_args):
             if arg.startswith("-i"):
                 if len(run_args) >= i and run_args[i + 1] == "-1":
                     pass
+                elif self._checkup and len(run_args) >= i:
+                    iterations = int(run_args[i + 1]) * 2
+                    remaining = ["-i", str(iterations)] + run_args[i+2:]
+                    break
                 else:
                     remaining = run_args[i:]
                     break
@@ -717,13 +728,78 @@ def postprocess_run_args(self, run_args):
 
         if not (remaining and "-i" in remaining):
             iterations = DEFAULT_ITERATIONS + self.getExtraIterationCount(DEFAULT_ITERATIONS)
+            if self._checkup:
+                iterations *= 2
             remaining = ["-i", str(iterations)] + (remaining if remaining else [])
 
+        if self._checkup:
+            vm_options += ['--engine.TraceCompilation', '-XX:+PrintGC']
+
         return vm_options, remaining
 
     def createCommandLineArgs(self, benchmarks, bmSuiteArgs):
         return self.createVmCommandLineArgs(benchmarks, bmSuiteArgs)
 
+    def checkup(self, out):
+        """Scan the benchmark output text `out` and report suspicious runs through mx.warn:
+        GC time above 10% of an iteration's duration (from the detected warmup iteration on),
+        warmup detected after half of the iterations, and '[engine] opt done' lines seen in
+        the second half of the iterations. Returns nothing; results are only logged."""
+        benchmark_name = None
+        current_iteration = -1
+        iterations_count = -1
+        iteration_times = []
+        gc_times = []
+        late_compilation = None  # None rather than False/0, so that iteration 0 is not lost
+        for line in out.splitlines():
+            # this marks the beginning of an output of a benchmark
+            benchmark_info = re.search(r"### (.*), \d+ warmup iterations, (\d+) bench iterations", line)
+            if benchmark_name is None and benchmark_info:
+                benchmark_name = benchmark_info.group(1)
+                iterations_count = int(benchmark_info.group(2))
+                mx.log(f"Checking benchmark {benchmark_name} with {iterations_count} iterations")
+                continue
+
+            if benchmark_name is None:
+                continue
+
+            # this marks the end of the processing of a single benchmark
+            warmup_match = re.search(r"### WARMUP detected at iteration: (\d+)", line)
+            if warmup_match:
+                warmup = int(warmup_match.group(1))
+                for idx in range(warmup, len(iteration_times)):
+                    if gc_times[idx] > iteration_times[idx] / 10:
+                        mx.warn(f"Benchmark checkup: {benchmark_name}: excessive GC pause of {gc_times[idx]} (on {idx} iteration)")
+                if warmup > iterations_count / 2:
+                    mx.warn(f"Benchmark checkup: {benchmark_name}: warmup detected too late (on {warmup} iteration)")
+                if late_compilation is not None:
+                    mx.warn(f"Benchmark checkup: {benchmark_name}: compilation detected too late (on {late_compilation} iteration)")
+                iteration_times = []
+                gc_times = []
+                current_iteration = -1
+                benchmark_name = None
+                late_compilation = None
+                continue
+
+            # following is done only when we are inside benchmark output:
+            iteration_info = re.search(r"### iteration=(\d+), name=.*, duration=([0-9.]*)", line)
+            if iteration_info:
+                current_iteration += 1
+                iteration_times.append(float(iteration_info.group(2)))
+                gc_times.append(0.0)
+                continue
+
+            if current_iteration == -1:
+                continue
+
+            # GC time may use '.' or ',' as the decimal separator; at least one digit is required
+            gc_log = re.search(r"\[GC .* ([0-9]+(?:[.,][0-9]+)?) secs]", line)
+            if gc_log:
+                gc_times[-1] += float(gc_log.group(1).replace(',', '.'))
+
+            if current_iteration >= iterations_count / 2 and "[engine] opt done" in line:
+                late_compilation = current_iteration
+
 
 class PythonBenchmarkSuite(PythonBaseBenchmarkSuite):
     def __init__(self, name, bench_path, benchmarks, python_path=None):