Commit e2f47a1

SonSang and duburcqa authored

[MISC] Format benchmark params as table, add legend, and support download as csv. (#1965)
Co-authored-by: Alexis Duburcq <[email protected]>

1 parent d9c2a44 · commit e2f47a1
File tree: 1 file changed (+97, -21 lines)


.github/workflows/alarm.yml

Lines changed: 97 additions & 21 deletions
@@ -55,21 +55,24 @@ jobs:

           # --- Parameters ---
           MAX_VALID_REVISIONS: 5
-          MAX_FETCH_REVISIONS: 100
+          MAX_FETCH_REVISIONS: 50
           RUNTIME_REGRESSION_TOLERANCE_PCT: 10
           COMPILE_REGRESSION_TOLERANCE_PCT: 10

           # Input/Output paths
           ARTIFACTS_DIR: ${{ steps.dl.outputs.download-path }}
           PR_COMMENT_PATH: pr_comment.md
           CHECK_BODY_PATH: check_output.md
+          CSV_RUNTIME_PATH: runtime_fps.csv
+          CSV_COMPILE_PATH: compile_time.csv
           EXIT_CODE_REGRESSION: 42
         run: |
           { python - << 'PY'; EXIT_CODE=$?; } || true

           import os, sys, json, re, math, statistics
           import wandb
           from pathlib import Path
+          import csv

           # ----- arguments -----

@@ -86,6 +89,11 @@ jobs:
           pr_comment_path = Path(os.environ["PR_COMMENT_PATH"]).expanduser()
           check_body_path = Path(os.environ["CHECK_BODY_PATH"]).expanduser()

+          csv_files = {
+              "runtime_fps": Path(os.environ["CSV_RUNTIME_PATH"]).expanduser().resolve(),
+              "compile_time": Path(os.environ["CSV_COMPILE_PATH"]).expanduser().resolve(),
+          }
+
           # ---------- helpers ----------

           METRIC_KEYS = ("compile_time", "runtime_fps", "realtime_factor")
@@ -97,6 +105,16 @@ jobs:
               kv = dict(map(str.strip, token.split("=", 1)) for token in bid.split("-"))
               return _normalize_kv_id(kv)

+          def parse_norm_id(nbid: str) -> dict:
+              kv = {}
+              if nbid:
+                  for token in nbid.split("-"):
+                      token = token.strip()
+                      if token and "=" in token:
+                          k, v = token.split("=", 1)
+                          kv[k.strip()] = v.strip()
+              return kv
+
           def artifacts_parse_csv_summary(current_txt_path):
               out = {}
               for line in current_txt_path.read_text().splitlines():
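
For context, `parse_norm_id` is the inverse of the ID flattening performed by the pre-existing `_normalize_kv_id` helper: it recovers the key-value pairs encoded in a normalized benchmark ID. A minimal sketch of the parsing, using a made-up ID (the keys below are illustrative, not taken from the actual benchmark suite):

    # Hypothetical normalized benchmark ID; real ones come out of _normalize_kv_id.
    nbid = "solver=rigid-n_envs=4096"

    kv = {}
    for token in nbid.split("-"):
        token = token.strip()
        if token and "=" in token:
            k, v = token.split("=", 1)
            kv[k.strip()] = v.strip()

    assert kv == {"solver": "rigid", "n_envs": "4096"}

Since the parser splits on every `-`, it assumes neither keys nor values embed the separator; a value like `cuda-0` would silently lose its `-0` suffix.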
@@ -139,24 +157,19 @@ jobs:
           api = wandb.Api()
           runs_iter = api.runs(f"{ENTITY}/{PROJECT}", order="-created_at")

-          is_complete = False
+          revs = set()
           records_by_rev = {}
           for i, run in enumerate(runs_iter):
               # Abort if still not complete after checking enough runs.
               # This would happen if a new benchmark has been added, and not enough past data is available yet.
-              if i == MAX_FETCH_REVISIONS:
+              if len(revs) == MAX_FETCH_REVISIONS:
                   break

               # Early return if enough complete records have been collected
               records_is_complete = [bids_set.issubset(record.keys()) for record in records_by_rev.values()]
-              is_complete = sum(records_is_complete) == MAX_VALID_REVISIONS
-              if is_complete:
+              if sum(records_is_complete) == MAX_VALID_REVISIONS:
                   break

-              # Skip runs that did not finish for some reason
-              if run.state != "finished":
-                  continue
-
               # Load config and summary, with support of legacy runs
               config, summary = run.config, run.summary
               if isinstance(config, str):
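
Note the semantic change in the stop condition above: MAX_FETCH_REVISIONS used to cap the number of wandb runs iterated, whereas it now caps the number of distinct revisions encountered, so repeated runs of the same commit no longer eat into the fetch budget (which presumably also allows the budget to drop from 100 to 50). A toy sketch of the difference, with made-up revision hashes standing in for fetched runs:

    # Each entry stands for the revision of one fetched run (hypothetical data).
    fetched = ["abc", "abc", "abc", "def", "def", "ghi"]
    MAX_FETCH_REVISIONS = 2

    revs = set()
    for i, rev in enumerate(fetched):
        # Old condition `i == MAX_FETCH_REVISIONS` would stop after two runs,
        # having seen only revision "abc"; the new one scans until a second
        # distinct revision has been collected.
        if len(revs) == MAX_FETCH_REVISIONS:
            break
        revs.add(rev)

    assert revs == {"abc", "def"}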
@@ -167,13 +180,18 @@ jobs:
               # Extract revision commit and branch
               try:
                   rev, branch = config["revision"].split("@", 1)
+                  revs.add(rev)
               except ValueError:
                   # Ignore this run if the revision has been corrupted for some unknown reason
                   continue
               # Ignore runs associated with a commit that is not part of the official repository
               if not branch.startswith('Genesis-Embodied-AI/'):
                   continue

+              # Skip runs that did not finish for some reason
+              if run.state != "finished":
+                  continue
+
               # Do not store new records if the desired number of revisions is already reached
               if len(records_by_rev) == MAX_VALID_REVISIONS and rev not in records_by_rev:
                   continue
@@ -205,50 +223,87 @@ jobs:

           # ----- build TWO tables -----

+          # Parse benchmark IDs into key-value dicts
+          id2kv = {bid: parse_norm_id(bid) for bid in current_bm.keys()}
+          params_name = sorted({key for kv in id2kv.values() for key in kv})
+
           reg_found = False
           tables = {}
+          rows_for_csv = {"runtime_fps": [], "compile_time": []}
+          info = {}
           for metric, alias in (("runtime_fps", "FPS"), ("compile_time", "compile")):
-              rows = []
+              rows_md = []
+
+              header_cells = (
+                  "status",
+                  *params_name,
+                  f"current {alias}",
+                  f"baseline {alias} [last (mean ± std)] (*1)",
+                  f"Δ {alias} (*2)"
+              )
+              header = "| " + " | ".join(header_cells) + " |"
+              align = "|:------:|" + "|".join([":---" for _ in params_name]) + "|---:|---:|---:|"
+
               for bid in sorted(current_bm.keys()):
                   value_cur = current_bm[bid][metric]
                   is_int = isinstance(value_cur, int) or value_cur.is_integer()
                   value_repr = fmt_num(value_cur, is_int)

+                  kv = id2kv[bid]
+                  info = {
+                      **{k: kv.get(k, "-") for k in params_name},
+                      "current": value_cur,
+                      "baseline_last": None,
+                      "baseline_min": None,
+                      "baseline_max": None,
+                  }
+
                   values_prev = [
                       record[bid][metric]
                       for record in records_by_rev.values()
                       if bid in record
                   ]
                   if values_prev:
+                      value_last = values_prev[0]
                       value_ref = statistics.fmean(values_prev)
-                      delta = (value_cur - value_ref) / value_ref * 100.0
+                      delta = (value_cur - value_last) / value_last * 100.0
+
+                      info["baseline_last"] = int(value_last) if is_int else float(value_last)

-                      stats_repr = f"{fmt_num(values_prev[0], is_int)}"
+                      stats_repr = f"{fmt_num(value_last, is_int)}"
                       delta_repr = f"{delta:+.1f}%"
                       if len(values_prev) == MAX_VALID_REVISIONS:
+                          info["baseline_mean"] = int(value_ref) if is_int else float(value_ref)
+                          info["baseline_min"] = int(min(values_prev)) if is_int else float(min(values_prev))
+                          info["baseline_max"] = int(max(values_prev)) if is_int else float(max(values_prev))
+
                           value_std = statistics.stdev(values_prev)
                           stats_repr += f" ({fmt_num(value_ref, is_int)} ± {fmt_num(value_std, is_int)})"
-                          if abs(delta) > METRICS_TOL[metrics]:
+                          if abs(delta) > METRICS_TOL[metric]:
+                              info["status"] = "alert"
+
                               delta_repr = f"**{delta_repr}**"
                               picto = "🔴"
                               reg_found = True
                           else:
+                              info["status"] = "ok"
+
                               picto = "✅"
                       else:
+                          info["status"] = "n/a"
+
                           picto = "ℹ️"
                   else:
                       picto, stats_repr, delta_repr = "ℹ️", "---", "---"

-                  rows.append([picto, f"`{bid}`", value_repr, stats_repr, delta_repr])
+                  key_cells = [kv.get(k, "-") for k in params_name]

-              header = [
-                  f"| status | benchmark ID | current {alias} | baseline {alias} [last (mean ± std)] | Δ {alias} |",
-                  "|:------:|:-------------|-----------:|-------------:|------:|",
-              ]
-              tables[metric] = header + ["| " + " | ".join(r) + " |" for r in rows]
+                  rows_md.append("| " + " | ".join([picto] + key_cells + [value_repr, stats_repr, delta_repr]) + " |")
+                  rows_for_csv[metric].append(info)

-          # ----- baseline commit list -----
+              tables[metric] = [header, align] + rows_md

+          # ----- baseline commit list (MD) -----
           blist = [f"- Commit {i}: {sha}" for i, sha in enumerate(records_by_rev.keys(), 1)]
           baseline_block = ["**Baselines considered:** " + f"**{len(records_by_rev)}** commits"] + blist

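
Because the columns are now derived from the parsed parameter names, the header and alignment rows vary with the benchmark set. A quick sketch of what they evaluate to for two hypothetical parameters:

    # Illustrative inputs; the real values come from id2kv / params_name above.
    params_name = ["n_envs", "solver"]
    alias = "FPS"

    header_cells = (
        "status",
        *params_name,
        f"current {alias}",
        f"baseline {alias} [last (mean ± std)] (*1)",
        f"Δ {alias} (*2)",
    )
    header = "| " + " | ".join(header_cells) + " |"
    align = "|:------:|" + "|".join([":---" for _ in params_name]) + "|---:|---:|---:|"

    print(header)
    # | status | n_envs | solver | current FPS | baseline FPS [last (mean ± std)] (*1) | Δ FPS (*2) |
    print(align)
    # |:------:|:---|:---|---:|---:|---:|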
@@ -270,6 +325,9 @@ jobs:
                   "",
                   "### Compile Time",
                   *tables["compile_time"],
+                  "",
+                  f"- (*1) last: last commit on main; mean/std: stats over the previous {MAX_VALID_REVISIONS} commits if available.",
+                  "- (*2) Δ: relative difference between PR and last commit on main, i.e. (PR - main) / main * 100%.",
               ]
           )

@@ -280,7 +338,15 @@ jobs:
           else:
               comment_body = ""

-          # Write files
+          # CSV files
+          for metric in ("runtime_fps", "compile_time"):
+              with csv_files[metric].open("w", newline="", encoding="utf-8") as f:
+                  w = csv.DictWriter(f, fieldnames=list(dict.fromkeys(k for rec in rows_for_csv[metric] for k in rec)), restval="")
+                  w.writeheader()
+                  for rec in rows_for_csv[metric]:
+                      w.writerow(rec)
+
+          # Write MD results
           check_body_path.write_text(check_body + "\n", encoding="utf-8")
           pr_comment_path.write_text(comment_body + "\n", encoding="utf-8")

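
The CSVs are plain `csv.DictWriter` output, so they can be loaded back with nothing but the stdlib. A small sketch, assuming a `runtime_fps.csv` downloaded into the working directory:

    import csv
    from pathlib import Path

    with Path("runtime_fps.csv").open(newline="", encoding="utf-8") as f:
        rows = list(csv.DictReader(f))

    # DictReader yields strings only; numeric columns need explicit conversion.
    alerts = [row for row in rows if row.get("status") == "alert"]
    print(f"{len(alerts)} regressed benchmark(s) out of {len(rows)}")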
@@ -369,3 +435,13 @@ jobs:
                 text: process.env.CHECK_BODY || undefined
               }
             });
+
+      - name: Upload benchmark comparisons in CSV
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-comparison-tables
+          path: |
+            runtime_fps.csv
+            compile_time.csv
+          if-no-files-found: warn
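
For completeness: GitHub serves uploaded artifacts as zip archives, so once the artifact has been downloaded the tables are one extraction away; a sketch assuming it was saved as `benchmark-comparison-tables.zip`:

    import zipfile

    # Hypothetical local filename; Actions artifacts download as .zip.
    with zipfile.ZipFile("benchmark-comparison-tables.zip") as zf:
        zf.extractall(".")  # extracts runtime_fps.csv and compile_time.csv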
