@@ -82,20 +82,84 @@ def __call__(self, results):
 
 
 class dump_markdown_results_table:
-    """Print a Markdown-formatted table displaying benchmark results
+    """Print Markdown-formatted tables displaying benchmark results
+
+    For each metric, this visualization prints out a table of benchmarks,
+    showing the value of the metric for each variant.
 
     The 'out_file' key is mandatory; specify '-' to print to stdout.
 
+    'extra_columns' can be an empty dict. The sample configuration below
+    assumes that each benchmark result has a 'success' and 'runtime' metric
+    for both variants, 'variant_1' and 'variant_2'. It adds a 'ratio' column
+    to the table for the 'runtime' metric, and a 'change' column to the table
+    for the 'success' metric. Each 'text' lambda is called once per benchmark.
+    It accepts a single argument (a dict that maps variant names to that
+    variant's value for the current metric) and returns a string that is
+    rendered in the benchmark's row in the new column. This allows you to
+    emit arbitrary text or Markdown formatting in response to particular
+    combinations of values for different variants, such as regressions or
+    performance improvements.
+
     Sample configuration:
 
+    ```
     visualize:
     - type: dump_markdown_results_table
-      out_file: '-'
+      out_file: "-"
+      extra_columns:
+        runtime:
+        - column_name: ratio
+          text: >
+            lambda b: str(b["variant_2"]/b["variant_1"])
+            if b["variant_2"] < (1.5 * b["variant_1"])
+            else "**" + str(b["variant_2"]/b["variant_1"]) + "**"
+        success:
+        - column_name: change
+          text: >
+            lambda b: "" if b["variant_2"] == b["variant_1"]
+            else "newly passing" if b["variant_2"]
+            else "regressed"
+    ```
+
+    Example output:
+
+    ```
+    ## runtime
+
+    | Benchmark | variant_1 | variant_2 | ratio |
+    | --- | --- | --- | --- |
+    | bench_1 | 5 | 10 | **2.0** |
+    | bench_2 | 10 | 5 | 0.5 |
+
+    ## success
+
+    | Benchmark | variant_1 | variant_2 | change |
+    | --- | --- | --- | --- |
+    | bench_1 | True | True | |
+    | bench_2 | True | False | regressed |
+    | bench_3 | False | True | newly passing |
+    ```
94143 """
95144
96145
-    def __init__(self, out_file):
+    def __init__(self, out_file, extra_columns=None):
         self.get_out_file = benchcomp.Outfile(out_file)
+        self.extra_columns = self._eval_column_text(extra_columns or {})
+
+
+    @staticmethod
+    def _eval_column_text(column_spec):
+        # Turn each column's 'text' string into a callable so it can be
+        # invoked once per benchmark when rendering the table
+        for columns in column_spec.values():
+            for column in columns:
+                try:
+                    column["text"] = eval(column["text"])
+                except SyntaxError:
+                    logging.error(
+                        "This column text is not a valid python program: '%s'",
+                        column["text"])
+                    sys.exit(1)
+        return column_spec
 
 
     @staticmethod
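For context on the hunk above: `_eval_column_text` relies on `eval` turning each column's YAML `text` string into a lambda. A minimal sketch of that step, using an illustrative column spec (the dict literal below is an assumption, not data from this commit):

```
# The string -> callable step that _eval_column_text performs; a string
# that fails to parse raises SyntaxError, which the method reports before
# exiting.
spec = {"runtime": [{"column_name": "ratio",
                     "text": 'lambda b: str(b["variant_2"]/b["variant_1"])'}]}
fn = eval(spec["runtime"][0]["text"])  # the YAML string becomes a lambda
assert fn({"variant_1": 5, "variant_2": 10}) == "2.0"
```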
@@ -104,10 +168,10 @@ def _get_template():
 {% for metric, benchmarks in d["metrics"].items() %}
 ## {{ metric }}
 
-| Benchmark | {% for variant in d["variants"] %} {{ variant }} |{% endfor %}
-| --- | {% for variant in d["variants"] %}--- |{% endfor -%}
+| Benchmark | {% for variant in d["variants"][metric] %} {{ variant }} |{% endfor %}
+| --- |{% for variant in d["variants"][metric] %} --- |{% endfor -%}
 {% for bench_name, bench_variants in benchmarks.items () %}
-| {{ bench_name }} {% for variant in d["variants"] -%}
+| {{ bench_name }} {% for variant in d["variants"][metric] -%}
 | {{ bench_variants[variant] }} {% endfor %}|
 {%- endfor %}
 {% endfor -%}
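The template change above switches `d["variants"]` from a single list to a per-metric mapping, so each metric's table can carry its own column set once extra columns are appended. A hedged sketch of rendering a simplified version of the header row, with an assumed data shape:

```
import jinja2

# Assumed data shape: after extra columns are added, the "runtime" table
# has a "ratio" column that the "success" table does not.
data = {
    "metrics": {
        "runtime": {"bench_1": {"variant_1": 5, "variant_2": 10, "ratio": "**2.0**"}},
        "success": {"bench_1": {"variant_1": True, "variant_2": True, "change": ""}},
    },
    "variants": {
        "runtime": ["variant_1", "variant_2", "ratio"],
        "success": ["variant_1", "variant_2", "change"],
    },
}
template = jinja2.Template(
    '{% for metric, benchmarks in d["metrics"].items() %}## {{ metric }}\n'
    '| Benchmark |{% for v in d["variants"][metric] %} {{ v }} |{% endfor %}\n'
    '{% endfor %}')
print(template.render(d=data))
# ## runtime
# | Benchmark | variant_1 | variant_2 | ratio |
# ## success
# | Benchmark | variant_1 | variant_2 | change |
```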
@@ -134,10 +198,35 @@ def _organize_results_into_metrics(results):
         return ret
 
 
+    def _add_extra_columns(self, metrics):
+        for metric, benches in metrics.items():
+            try:
+                columns = self.extra_columns[metric]
+            except KeyError:
+                continue
+            for bench, variants in benches.items():
+                # Each lambda sees a copy of the original variant values,
+                # so columns computed earlier in the loop stay invisible
+                # to later lambdas
+                tmp_variants = dict(variants)
+                for column in columns:
+                    variants[column["column_name"]] = column["text"](tmp_variants)
+
+
+    @staticmethod
+    def _get_variants(metrics):
+        # Read each metric's column list from its first benchmark, which
+        # assumes all benchmarks for a metric share the same columns
+        ret = {}
+        for metric, benches in metrics.items():
+            for bench, variants in benches.items():
+                ret[metric] = list(variants.keys())
+                break
+        return ret
+
+
     def __call__(self, results):
+        metrics = self._organize_results_into_metrics(results)
+        self._add_extra_columns(metrics)
+
         data = {
-            "metrics": self._organize_results_into_metrics(results),
-            "variants": list(results["benchmarks"].values())[0]["variants"],
+            "metrics": metrics,
+            "variants": self._get_variants(metrics),
         }
 
         env = jinja2.Environment(
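Taken together, `__call__` now organizes results per metric, appends the computed columns, and derives a per-metric variant list for the template. A runnable sketch of that flow under an assumed one-benchmark result set, reusing the 'ratio' lambda from the docstring's sample configuration:

```
# Assumed metrics shape; the lambda is the docstring's sample 'ratio' column.
ratio = eval('lambda b: str(b["variant_2"]/b["variant_1"]) '
             'if b["variant_2"] < (1.5 * b["variant_1"]) '
             'else "**" + str(b["variant_2"]/b["variant_1"]) + "**"')

metrics = {"runtime": {"bench_1": {"variant_1": 5, "variant_2": 10}}}
for variants in metrics["runtime"].values():  # as _add_extra_columns does
    variants["ratio"] = ratio(dict(variants))

print(metrics)
# {'runtime': {'bench_1': {'variant_1': 5, 'variant_2': 10, 'ratio': '**2.0**'}}}

# _get_variants then reads each metric's columns from its first benchmark:
print({m: list(next(iter(b.values())).keys()) for m, b in metrics.items()})
# {'runtime': ['variant_1', 'variant_2', 'ratio']}
```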