From 979bc2938718c5dd0602173f676baf0b23dfb01b Mon Sep 17 00:00:00 2001 From: jazullo Date: Mon, 24 Mar 2025 09:04:02 +0000 Subject: [PATCH 1/6] Add new plotting scripts and an explainer --- .../criterionmethodology.py | 81 +++++++++++++++++++ .../criterion-drop-in-replacement/readme | 16 ++++ .../sweep_seq.py | 50 ++++++++++++ .../c-sorting-benchmarks/readme | 0 .../sort_insertion_out.csv | 0 .../sort_merge_seq_out.csv | 0 .../c-sorting-benchmarks/sort_quick_out.csv | 0 .../scripts/{ => old-criterion}/plot.py | 0 .../plot_relative_speedup.py | 0 benchmarks/scripts/{ => old-criterion}/readme | 0 .../scripts/{ => old-criterion}/sweep_seq.py | 0 11 files changed, 147 insertions(+) create mode 100644 benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py create mode 100644 benchmarks/scripts/criterion-drop-in-replacement/readme create mode 100644 benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py rename benchmarks/scripts/{ => old-criterion}/c-sorting-benchmarks/readme (100%) rename benchmarks/scripts/{ => old-criterion}/c-sorting-benchmarks/sort_insertion_out.csv (100%) rename benchmarks/scripts/{ => old-criterion}/c-sorting-benchmarks/sort_merge_seq_out.csv (100%) rename benchmarks/scripts/{ => old-criterion}/c-sorting-benchmarks/sort_quick_out.csv (100%) rename benchmarks/scripts/{ => old-criterion}/plot.py (100%) rename benchmarks/scripts/{ => old-criterion}/plot_relative_speedup.py (100%) rename benchmarks/scripts/{ => old-criterion}/readme (100%) rename benchmarks/scripts/{ => old-criterion}/sweep_seq.py (100%) diff --git a/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py b/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py new file mode 100644 index 0000000..d1e790b --- /dev/null +++ b/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +import numpy as np +from sys import argv +import subprocess +from time import time +import math + +from matplotlib import pyplot as plt + +MAKE_PLOT = False + +def linear_regression_with_std(x, y): + x = np.array(x) + y = np.array(y) + x_mean = np.mean(x) + y_mean = np.mean(y) + numerator = np.sum((x - x_mean) * (y - y_mean)) + denominator = np.sum((x - x_mean) ** 2) + slope = numerator / denominator + intercept = y_mean - slope * x_mean + y_pred = slope * x + intercept + residuals = y - y_pred + std_dev = np.std(residuals) + return slope, intercept, std_dev + +def do_bench(cliargs, iters): + print([cliargs[1], str(iters)] + cliargs[2:]) + out = str(subprocess.check_output([cliargs[1], str(iters)] + cliargs[2:])) + s1 = out[out.find("SELFTIMED")+11:] + s2 = float(s1[:s1.find("\n")-4]) + selftimed = s2 + + b1 = out[out.find("BATCHTIME")+11:] + b2 = float(b1[:b1.find("SELFTIMED")-2]) + batchtime = b2 + + print(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}") + return batchtime + +def converge(cliargs): + xs = [] + ys = [] + iters = 1 + t = time() + while len(xs) == 0: + st = do_bench(cliargs, iters) + if st * iters < 0.65: + iters *= 2 + continue + xs.append(iters) + ys.append(st) + for _ in range(2): + if time() - t < 3.5: + iters = int(math.trunc(float(iters) * 1.2) + 1) + else: + iters += 1 + iters // 20 + st = do_bench(cliargs, iters) + xs.append(iters) + ys.append(st) + while time() - t < 3.5: + if time() - t < 3.5: + iters = int(math.trunc(float(iters) * 1.2) + 1) + else: + iters += 1 + iters // 20 + st = do_bench(cliargs, iters) + xs.append(iters) + ys.append(st) + m, b, sigma = 
linear_regression_with_std(xs, ys)
+    print(f"Slope (Mean): {m}, Intercept (Overhead): {b}, Stdev: {sigma}")
+    p, lnc, lngsd = linear_regression_with_std([math.log(x) for x in xs], [math.log(y) for y in ys])
+    c, gsd = math.exp(lnc), math.exp(lngsd)
+    print(f"Power (Distortion): {p}, Factor (Geomean) {c}, GeoStdev {gsd}")
+    if MAKE_PLOT:
+        plt.plot(xs, ys, 'rx')
+        plt.plot([xs[0], xs[-1]], [m*xs[0]+b, m*xs[-1]+b], color="blue")
+        plt.plot(xs, [c*x**p for x in xs], color="green")
+        plt.savefig("plot.png")
+    return m, sigma, c, gsd
+
+if __name__ == "__main__":
+    print(converge(argv))
diff --git a/benchmarks/scripts/criterion-drop-in-replacement/readme b/benchmarks/scripts/criterion-drop-in-replacement/readme
new file mode 100644
index 0000000..30145df
--- /dev/null
+++ b/benchmarks/scripts/criterion-drop-in-replacement/readme
@@ -0,0 +1,16 @@
+The script `criterionmethodology.py` is my implementation of a benchrunner-runner that uses the criterion methodology.
+We take as input some program which takes `iters` as a command-line argument, times a function of interest in a tight loop which repeats `iters` many times, and then prints to stdout the batchtime (total loop time) and selftimed (total loop time divided by iters).
+The essence of criterion is then to sweep `iters` and perform a linear regression against iters and batchtime.
+The slope is the mean and the y-intercept represents some notion of shared overhead, insensitive to `iters`.
+Ultimately, criterion serves as a way to benchmark tasks with very short execution times, as startup overhead can be ignored.
+
+Since we have relatively precise timing over loops, I also implemented the criterion methodology *geometrically*.
+I take the logarithm of all the x and y values, compute the linear regression over that, then exponentiate the y-intercept; this represents the geomean.
+The other dependent portion, which is the slope, becomes a power (the equation is y = e^b x^m), which represents *geometric overhead*, e.g. how much overhead is being added per iteration.
+This may do well to model any slowdowns arising from pre-allocating arrays.
+Additionally, since performance data is non-negative and judged multiplicatively (twice as good means numbers are half, twice as bad means numbers are doubled; these are all *factors*), the geomean and geo-standard-deviation may make more sense theoretically.
+However, from my testing, the geomean seems to vary wildly for programs with fleeting execution times, even between repeated runs with the same parameters.
+
+The scripts `criterionmethodology.py` and `sweep_seq.py` can both be run directly.
+The first takes command-line arguments, e.g. `criterionmethodology benchrunner Quicksort Seq 2000` will call `benchrunner iters Quicksort Seq 2000` for various `iters`.
+`sweep_seq` performs a logarithmic sweep over different array sizes, invoking the criterion methodology at each point.
diff --git a/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py b/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py new file mode 100644 index 0000000..9327ba6 --- /dev/null +++ b/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +import os +import numpy as np +from criterionmethodology import converge +import sys + +# names = ["Optsort", "Insertionsort", "Mergesort", "Quicksort"] +# names = ["CopyArray", "Quicksort", "Insertionsort", "Mergesort"] +names = ["Insertionsort"] + +# DENSITY = 4 +DENSITY = 12 +def bounds(name): + match name: + case "Insertionsort": + lo = 3 # 2**n ... + hi = 16 + case "Quicksort": + lo = 3 + hi = 22 + case "Mergesort": + # lo = 12 + lo = 3 + hi = 24 + case "Cilksort": + # lo = 12 + lo = 3 + hi = 16#24 + case "Optsort": + lo = 3 + hi = 16#24 + case _: + lo = 3 + hi = 20 + return lo, hi, (hi-lo)*DENSITY+1 + +def dotrial(name, size): + return converge([sys.argv[0], "benchrunner", name, "Seq", str(int(size))]) + +if __name__ == "__main__": + for name in names: + lo, hi, pts = bounds(name) + with open("%s_out3.csv" % name, "w") as f: + f.write("# size\tmean\tstddev\tgeomean\tgeostdev\n") + for i in np.unique(np.logspace(lo, hi, pts, base=2).astype(int)): + with open("%s_out3.csv" % name, "a") as f: + try: + f.write("%d" % int(i) + "\t%f\t%f\t%f\t%f\n" % dotrial(name, i)) + except: + pass diff --git a/benchmarks/scripts/c-sorting-benchmarks/readme b/benchmarks/scripts/old-criterion/c-sorting-benchmarks/readme similarity index 100% rename from benchmarks/scripts/c-sorting-benchmarks/readme rename to benchmarks/scripts/old-criterion/c-sorting-benchmarks/readme diff --git a/benchmarks/scripts/c-sorting-benchmarks/sort_insertion_out.csv b/benchmarks/scripts/old-criterion/c-sorting-benchmarks/sort_insertion_out.csv similarity index 100% rename from benchmarks/scripts/c-sorting-benchmarks/sort_insertion_out.csv rename to benchmarks/scripts/old-criterion/c-sorting-benchmarks/sort_insertion_out.csv diff --git a/benchmarks/scripts/c-sorting-benchmarks/sort_merge_seq_out.csv b/benchmarks/scripts/old-criterion/c-sorting-benchmarks/sort_merge_seq_out.csv similarity index 100% rename from benchmarks/scripts/c-sorting-benchmarks/sort_merge_seq_out.csv rename to benchmarks/scripts/old-criterion/c-sorting-benchmarks/sort_merge_seq_out.csv diff --git a/benchmarks/scripts/c-sorting-benchmarks/sort_quick_out.csv b/benchmarks/scripts/old-criterion/c-sorting-benchmarks/sort_quick_out.csv similarity index 100% rename from benchmarks/scripts/c-sorting-benchmarks/sort_quick_out.csv rename to benchmarks/scripts/old-criterion/c-sorting-benchmarks/sort_quick_out.csv diff --git a/benchmarks/scripts/plot.py b/benchmarks/scripts/old-criterion/plot.py similarity index 100% rename from benchmarks/scripts/plot.py rename to benchmarks/scripts/old-criterion/plot.py diff --git a/benchmarks/scripts/plot_relative_speedup.py b/benchmarks/scripts/old-criterion/plot_relative_speedup.py similarity index 100% rename from benchmarks/scripts/plot_relative_speedup.py rename to benchmarks/scripts/old-criterion/plot_relative_speedup.py diff --git a/benchmarks/scripts/readme b/benchmarks/scripts/old-criterion/readme similarity index 100% rename from benchmarks/scripts/readme rename to benchmarks/scripts/old-criterion/readme diff --git a/benchmarks/scripts/sweep_seq.py b/benchmarks/scripts/old-criterion/sweep_seq.py similarity index 100% rename from benchmarks/scripts/sweep_seq.py rename to benchmarks/scripts/old-criterion/sweep_seq.py From 
942636fa56a61a311d6a3a687d96cb712ee3e4de Mon Sep 17 00:00:00 2001
From: Artem Pelenitsyn
Date: Tue, 19 Aug 2025 13:55:28 -0400
Subject: [PATCH 2/6] polish README

---
 .../criterion-drop-in-replacement/README.md   | 44 +++++++++++++++++++
 .../criterion-drop-in-replacement/readme      | 16 -------
 2 files changed, 44 insertions(+), 16 deletions(-)
 create mode 100644 benchmarks/scripts/criterion-drop-in-replacement/README.md
 delete mode 100644 benchmarks/scripts/criterion-drop-in-replacement/readme

diff --git a/benchmarks/scripts/criterion-drop-in-replacement/README.md b/benchmarks/scripts/criterion-drop-in-replacement/README.md
new file mode 100644
index 0000000..62c4b0f
--- /dev/null
+++ b/benchmarks/scripts/criterion-drop-in-replacement/README.md
@@ -0,0 +1,44 @@
+## Purpose
+
+This directory contains a Python re-implementation of the Haskell Criterion methodology to run executables (instead of Haskell functions, like Criterion normally does).
+One could call it "benchrunner-runner" because the purpose is to run `benchrunner` many times and calculate the appropriate run time statistics.
+
+We take as input some program `prog` with the following interface:
+
+- `prog` takes `iters` as a command-line argument,
+- `prog` measures run time of a function of interest in a tight loop that repeats `iters` many times, and finally
+- `prog` prints to stdout the batchtime (total loop time) and selftimed (total loop time divided by `iters`).
+
+The ultimate goal is then to sweep `iters` and perform a linear regression against `iters` and `batchtime`.
+The slope is the mean and the y-intercept represents some notion of shared overhead, insensitive to `iters`.
+
+## Run
+
+This package contains two scripts:
+
+- `sweep_seq.py` (top level)
+- `criterionmethodology.py` (called by `sweep_seq.py`)
+
+Both can be run directly, e.g.:
+
+```shellsession
+criterionmethodology benchrunner Quicksort Seq 2000
+```
+
+will call `benchrunner iters Quicksort Seq 2000` for various `iters`.
+
+`sweep_seq` performs a logarithmic sweep over different array sizes, invoking `criterionmethodology.py` at each point.
+
+## Arightmetic vs geometric mean
+
+Since performance data is non-negative and judged multiplicatively (twice as good means numbers are half, twice as bad means numbers are doubled; these are all *factors*), the geomean and geo-standard-deviation may make more sense theoretically.
+However, from some testing, the geomean seems to vary wildly for programs with fleeting execution times, even between repeated runs with the same parameters.
+
+In particular, to compute the geomean, we:
+
+- take the logarithm of all the `x` and `y` values,
+- compute linear regression over that, then
+- exponentiate the y-intercept.
+
+The other dependent portion, which is the slope, becomes a power (the equation is `y = e^b x^m`), which represents *geometric overhead*, e.g. how much overhead is being added per iteration.
+This may do well to model any slowdowns, e.g. ones arising from pre-allocating arrays.
diff --git a/benchmarks/scripts/criterion-drop-in-replacement/readme b/benchmarks/scripts/criterion-drop-in-replacement/readme
deleted file mode 100644
index 30145df..0000000
--- a/benchmarks/scripts/criterion-drop-in-replacement/readme
+++ /dev/null
@@ -1,16 +0,0 @@
-The script `criterionmethodology.py` is my implementation of a benchrunner-runner that uses the criterion methodology.
-We take as input some program which takes `iters` as a command-line argument, times a function of interest in a tight loop which repeats `iters` many times, and then prints to stdout the batchtime (total loop time) and selftimed (total loop time divided by iters).
-The essence of criterion is then to sweep `iters` and perform a linear regression against iters and batchtime.
-The slope is the mean and the y-intercept represents some notion of shared overhead, insensitive to `iters`.
-Ultimately, criterion serves as a way to benchmark tasks with very short execution times, as startup overhead can be ignored.
-
-Since we have relatively precise timing over loops, I also implemented the criterion methodology *geometrically*.
-I take the logarithm of all the x and y values, compute the linear regression over that, then exponentiate the y-intercept; this represents the geomean.
-The other dependent portion, which is the slope, becomes a power (the equation is y = e^b x^m), which represents *geometric overhead*, e.g. how much overhead is being added per iteration.
-This may do well to model any slowdowns arising from pre-allocating arrays.
-Additionally, since performance data is non-negative and judged multiplicatively (twice as good means numbers are half, twice as bad means numbers are doubled; these are all *factors*), the geomean and geo-standard-deviation may make more sense theoretically.
-However, from my testing, the geomean seems to vary wildly for programs with fleeting execution times, even between repeated runs with the same parameters.
-
-The scripts `criterionmethodology.py` and `sweep_seq.py` can both be run directly.
-The first takes command-line arguments, e.g. `criterionmethodology benchrunner Quicksort Seq 2000` will call `benchrunner iters Quicksort Seq 2000` for various `iters`.
-`sweep_seq` performs a logarithmic sweep over different array sizes, invoking the criterion methodology at each point.
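As a concrete illustration of the regression the README above describes, the following minimal sketch fits `batchtime` against `iters` with ordinary least squares: the slope estimates the mean time per iteration, and the y-intercept estimates the shared per-invocation overhead. The data points are made up for illustration, and `np.polyfit` stands in for the `linear_regression_with_std` helper in `criterionmethodology.py`:

```python
import numpy as np

# Hypothetical (iters, batchtime) pairs, as a sweep over a benchrunner-style
# executable would produce.
iters = np.array([1, 2, 4, 8, 16, 32], dtype=float)
batchtime = 5e-4 * iters + 2e-3  # pretend: 0.5 ms per iteration plus 2 ms fixed overhead

# Ordinary least squares fit of batchtime = slope * iters + intercept:
# the slope is the per-iteration mean, the intercept is the overhead
# that is insensitive to `iters`.
slope, intercept = np.polyfit(iters, batchtime, 1)
print(f"mean per-iteration time: {slope:.2e} s, overhead: {intercept:.2e} s")
```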
From f473b2091a083cbb462d859d18188d33f5cb937d Mon Sep 17 00:00:00 2001 From: Artem Pelenitsyn Date: Tue, 19 Aug 2025 13:55:53 -0400 Subject: [PATCH 3/6] fix Windows line endings in one .py --- .../criterionmethodology.py | 162 +++++++++--------- 1 file changed, 81 insertions(+), 81 deletions(-) mode change 100644 => 100755 benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py diff --git a/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py b/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py old mode 100644 new mode 100755 index d1e790b..927376a --- a/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py +++ b/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py @@ -1,81 +1,81 @@ -#!/usr/bin/env python3 -import numpy as np -from sys import argv -import subprocess -from time import time -import math - -from matplotlib import pyplot as plt - -MAKE_PLOT = False - -def linear_regression_with_std(x, y): - x = np.array(x) - y = np.array(y) - x_mean = np.mean(x) - y_mean = np.mean(y) - numerator = np.sum((x - x_mean) * (y - y_mean)) - denominator = np.sum((x - x_mean) ** 2) - slope = numerator / denominator - intercept = y_mean - slope * x_mean - y_pred = slope * x + intercept - residuals = y - y_pred - std_dev = np.std(residuals) - return slope, intercept, std_dev - -def do_bench(cliargs, iters): - print([cliargs[1], str(iters)] + cliargs[2:]) - out = str(subprocess.check_output([cliargs[1], str(iters)] + cliargs[2:])) - s1 = out[out.find("SELFTIMED")+11:] - s2 = float(s1[:s1.find("\n")-4]) - selftimed = s2 - - b1 = out[out.find("BATCHTIME")+11:] - b2 = float(b1[:b1.find("SELFTIMED")-2]) - batchtime = b2 - - print(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}") - return batchtime - -def converge(cliargs): - xs = [] - ys = [] - iters = 1 - t = time() - while len(xs) == 0: - st = do_bench(cliargs, iters) - if st * iters < 0.65: - iters *= 2 - continue - xs.append(iters) - ys.append(st) - for _ in range(2): - if time() - t < 3.5: - iters = int(math.trunc(float(iters) * 1.2) + 1) - else: - iters += 1 + iters // 20 - st = do_bench(cliargs, iters) - xs.append(iters) - ys.append(st) - while time() - t < 3.5: - if time() - t < 3.5: - iters = int(math.trunc(float(iters) * 1.2) + 1) - else: - iters += 1 + iters // 20 - st = do_bench(cliargs, iters) - xs.append(iters) - ys.append(st) - m, b, sigma = linear_regression_with_std(xs, ys) - print(f"Slope (Mean): {m}, Intercept (Overhead): {b}, Stdev: {sigma}") - p, lnc, lngsd = linear_regression_with_std([math.log(x) for x in xs], [math.log(y) for y in ys]) - c, gsd = math.exp(lnc), math.exp(lngsd) - print(f"Power (Distortion): {p}, Factor (Geomean) {c}, GeoStdev {gsd}") - if MAKE_PLOT: - plt.plot(xs, ys, 'rx') - plt.plot([xs[0], xs[-1]], [m*xs[0]+b, m*xs[-1]+b], color="blue") - plt.plot(xs, [c*x**p for x in xs], color="green") - plt.savefig("plot.png") - return m, sigma, c, gsd - -if __name__ == "__main__": - print(converge(argv)) +#!/usr/bin/env python +import numpy as np +from sys import argv +import subprocess +from time import time +import math + +from matplotlib import pyplot as plt + +MAKE_PLOT = False + +def linear_regression_with_std(x, y): + x = np.array(x) + y = np.array(y) + x_mean = np.mean(x) + y_mean = np.mean(y) + numerator = np.sum((x - x_mean) * (y - y_mean)) + denominator = np.sum((x - x_mean) ** 2) + slope = numerator / denominator + intercept = y_mean - slope * x_mean + y_pred = slope * x + intercept + residuals = y - 
y_pred + std_dev = np.std(residuals) + return slope, intercept, std_dev + +def do_bench(cliargs, iters): + print([cliargs[1], str(iters)] + cliargs[2:]) + out = str(subprocess.check_output([cliargs[1], str(iters)] + cliargs[2:])) + s1 = out[out.find("SELFTIMED")+11:] + s2 = float(s1[:s1.find("\n")-4]) + selftimed = s2 + + b1 = out[out.find("BATCHTIME")+11:] + b2 = float(b1[:b1.find("SELFTIMED")-2]) + batchtime = b2 + + print(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}") + return batchtime + +def converge(cliargs): + xs = [] + ys = [] + iters = 1 + t = time() + while len(xs) == 0: + st = do_bench(cliargs, iters) + if st * iters < 0.65: + iters *= 2 + continue + xs.append(iters) + ys.append(st) + for _ in range(2): + if time() - t < 3.5: + iters = int(math.trunc(float(iters) * 1.2) + 1) + else: + iters += 1 + iters // 20 + st = do_bench(cliargs, iters) + xs.append(iters) + ys.append(st) + while time() - t < 3.5: + if time() - t < 3.5: + iters = int(math.trunc(float(iters) * 1.2) + 1) + else: + iters += 1 + iters // 20 + st = do_bench(cliargs, iters) + xs.append(iters) + ys.append(st) + m, b, sigma = linear_regression_with_std(xs, ys) + print(f"Slope (Mean): {m}, Intercept (Overhead): {b}, Stdev: {sigma}") + p, lnc, lngsd = linear_regression_with_std([math.log(x) for x in xs], [math.log(y) for y in ys]) + c, gsd = math.exp(lnc), math.exp(lngsd) + print(f"Power (Distortion): {p}, Factor (Geomean) {c}, GeoStdev {gsd}") + if MAKE_PLOT: + plt.plot(xs, ys, 'rx') + plt.plot([xs[0], xs[-1]], [m*xs[0]+b, m*xs[-1]+b], color="blue") + plt.plot(xs, [c*x**p for x in xs], color="green") + plt.savefig("plot.png") + return m, sigma, c, gsd + +if __name__ == "__main__": + print(converge(argv)) From 79f4b3e0c7a063c20ddc85359c8f472b92656961 Mon Sep 17 00:00:00 2001 From: Artem Pelenitsyn Date: Tue, 19 Aug 2025 13:56:18 -0400 Subject: [PATCH 4/6] make executable another .py --- benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py diff --git a/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py b/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py old mode 100644 new mode 100755 From abaf99b44c52334a7dd3e9b109ac8f7115776129 Mon Sep 17 00:00:00 2001 From: Artem Pelenitsyn Date: Wed, 20 Aug 2025 16:16:41 -0400 Subject: [PATCH 5/6] polish criterionmethodology.py --- .../criterion-drop-in-replacement/README.md | 4 ++-- .../criterionmethodology.py | 18 +++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/benchmarks/scripts/criterion-drop-in-replacement/README.md b/benchmarks/scripts/criterion-drop-in-replacement/README.md index 62c4b0f..bcb1d84 100644 --- a/benchmarks/scripts/criterion-drop-in-replacement/README.md +++ b/benchmarks/scripts/criterion-drop-in-replacement/README.md @@ -3,7 +3,7 @@ This directory contains a Python re-implementation of the Haskell Criterion methodology to run executables (instead of Haskell functions, like Criterion normally does). One could call it "benchrunner-runner" because the purpose is to run `benchrunner` many times and calculate the appropriate run time statistics. 
-We take as input some program `prog` with the following interface:
+We take as input a path to some program `prog` (meant to be the `benchrunner`) with the following interface:
 
 - `prog` takes `iters` as a command-line argument,
 - `prog` measures run time of a function of interest in a tight loop that repeats `iters` many times, and finally
@@ -29,7 +29,7 @@ will call `benchrunner iters Quicksort Seq 2000` for various `iters`.
 
 `sweep_seq` performs a logarithmic sweep over different array sizes, invoking `criterionmethodology.py` at each point.
 
-## Arightmetic vs geometric mean
+## Arithmetic vs geometric mean
 
 Since performance data is non-negative and judged multiplicatively (twice as good means numbers are half, twice as bad means numbers are doubled; these are all *factors*), the geomean and geo-standard-deviation may make more sense theoretically.
 However, from some testing, the geomean seems to vary wildly for programs with fleeting execution times, even between repeated runs with the same parameters.
diff --git a/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py b/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py
index 927376a..4df7113 100755
--- a/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py
+++ b/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py
@@ -24,7 +24,8 @@ def linear_regression_with_std(x, y):
     return slope, intercept, std_dev
 
 def do_bench(cliargs, iters):
-    print([cliargs[1], str(iters)] + cliargs[2:])
+    bin = cliargs[1].rsplit('/', 1)[-1]
+    print([bin] + cliargs[2:] + [str(iters)])
     out = str(subprocess.check_output([cliargs[1], str(iters)] + cliargs[2:]))
     s1 = out[out.find("SELFTIMED")+11:]
     s2 = float(s1[:s1.find("\n")-4])
@@ -34,7 +35,7 @@ def do_bench(cliargs, iters):
     b2 = float(b1[:b1.find("SELFTIMED")-2])
     batchtime = b2
 
-    print(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}")
+    #print(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}")
     return batchtime
 
 def converge(cliargs):
@@ -65,17 +66,20 @@ def converge(cliargs):
         st = do_bench(cliargs, iters)
         xs.append(iters)
         ys.append(st)
-    m, b, sigma = linear_regression_with_std(xs, ys)
-    print(f"Slope (Mean): {m}, Intercept (Overhead): {b}, Stdev: {sigma}")
+
+    m, b, sig = linear_regression_with_std(xs, ys)
     p, lnc, lngsd = linear_regression_with_std([math.log(x) for x in xs], [math.log(y) for y in ys])
     c, gsd = math.exp(lnc), math.exp(lngsd)
-    print(f"Power (Distortion): {p}, Factor (Geomean) {c}, GeoStdev {gsd}")
+
+    print(f"Slope (Mean): {m:.2e}, Stdev: {sig:.2e}, Intercept (Overhead): {b:.2e}")
+    print(f"Factor (Geomean): {c:.2e}, GeoStdev: {gsd:.2e}, Power (Distortion): {p:.2e}")
+
     if MAKE_PLOT:
         plt.plot(xs, ys, 'rx')
         plt.plot([xs[0], xs[-1]], [m*xs[0]+b, m*xs[-1]+b], color="blue")
         plt.plot(xs, [c*x**p for x in xs], color="green")
         plt.savefig("plot.png")
-    return m, sigma, c, gsd
+    return m, sig, c, gsd
 
 if __name__ == "__main__":
-    print(converge(argv))
+    converge(argv)

From ce3065f4b6717efe8a2debaad1647393f7d0a4bb Mon Sep 17 00:00:00 2001
From: Artem Pelenitsyn
Date: Thu, 21 Aug 2025 15:38:03 -0400
Subject: [PATCH 6/6] cooking the Python: it's getting there

---
 .../criterionmethodology.py                   | 114 +++++++++++++-----
 .../sweep_seq.py                              |  15 ++-
 2 files changed, 94 insertions(+), 35 deletions(-)

diff --git a/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py b/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py
index 4df7113..71a7197 100755
--- a/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py
+++ b/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py
@@ -1,4 +1,33 @@
 #!/usr/bin/env python
+
+#
+# The script determines the cost of one iteration of a function (in seconds) using an executable that
+#
+# - runs `iters` iterations of that function in a tight loop and
+# - prints out the time it took to run them.
+#
+# Example call:
+#
+# ./criterionmethodology.py $(cabal list-bin benchrunner) Quicksort Seq 2000
+#
+# In particular, we
+#
+# - run the given executable (the first and only relevant argument) with the 'iters' argument varied from 1 to N;
+#   N and the step size are dynamically determined based on the time it takes to run the binary;
+# - fetch timing results from the binary's stdout and do linear regression over them;
+# - plot the regression (see the `plot` function) in `plot.png`.
+#
+# Growing the `iters` parameter is the main ingenuity of the script. It follows the Criterion methodology:
+# running the given binary for a small number of iterations, doubling that number every time, and upon reaching
+# a certain threshold (FIRST_ITER_THRESHOLD), increasing it linearly until the overall execution time
+# reaches another threshold (TOTAL_TIME_THRESHOLD) seconds.
+#
+# - The `converge` function runs the whole process, starting with a small number of iterations.
+# - The `iter` function encodes the methodology for increasing 'iters'.
+# - The `do_bench` function runs the binary and scrapes the output, so the expected binary's interface is encoded in it.
+#
+
+import os
 import numpy as np
 from sys import argv
 import subprocess
@@ -7,7 +36,16 @@
 
 from matplotlib import pyplot as plt
 
+LOG=True
 MAKE_PLOT = False
+FIRST_ITER_THRESHOLD = 3e-6 # 0.65
+TOTAL_TIME_THRESHOLD = 1    # 3.5
+  # ^^ Joseph's original values, but they are too high for my machine.
+
+# Poor man's logging
+def log(format, **xs):
+    if LOG:
+        print(format, **xs)
 
 def linear_regression_with_std(x, y):
     x = np.array(x)
@@ -23,10 +61,9 @@ def linear_regression_with_std(x, y):
     std_dev = np.std(residuals)
     return slope, intercept, std_dev
 
+# Do one trial: run the binary with given arguments, including the given `iters`, and return the batch time.
 def do_bench(cliargs, iters):
-    bin = cliargs[1].rsplit('/', 1)[-1]
-    print([bin] + cliargs[2:] + [str(iters)])
-    out = str(subprocess.check_output([cliargs[1], str(iters)] + cliargs[2:]))
+    out = str(subprocess.check_output([cliargs[0], str(iters)] + cliargs[1:]))
     s1 = out[out.find("SELFTIMED")+11:]
     s2 = float(s1[:s1.find("\n")-4])
     selftimed = s2
@@ -35,51 +72,74 @@ def do_bench(cliargs, iters):
     b2 = float(b1[:b1.find("SELFTIMED")-2])
     batchtime = b2
 
-    #print(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}")
+    #log(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}")
     return batchtime
 
+# Increase 'iters' and do one trial with that. Store results in xs and ys. Return new iters.
+def iter(iters, cliargs, start_time, xs, ys):
+    if time() - start_time < TOTAL_TIME_THRESHOLD:
+        iters = int(math.trunc(float(iters) * 1.2) + 1)
+    else:
+        iters += 1 + iters // 20
+    log(str(iters) + " ", end="", flush=True)
+    st = do_bench(cliargs, iters)
+    xs.append(iters)
+    ys.append(st)
+    return iters
+
+def plot(xs, ys, b, c, m, p):
+    plotfile = "plot.png"
+    os.remove(plotfile) if os.path.exists(plotfile) else None
+    plt.plot(xs, ys, 'rx')
+    plt.plot([xs[0], xs[-1]], [m*xs[0]+b, m*xs[-1]+b], color="blue")
+    plt.plot(xs, [c*x**p for x in xs], color="green")
+    plt.savefig(plotfile)
+
+# Main function to run the iteration experiment.
+# - cliargs is a list of command line arguments WITHOUT the current script's name (argv[0]), in particular:
+#   - the first argument is the path to the binary, and
+#   - the rest is simply the arguments to pass to the binary.
 def converge(cliargs):
+    bin = cliargs[0].rsplit('/', 1)[-1] # Get the binary name from the path
+    log("Converge on: " + str([bin] + cliargs[1:]))
+    log("iters: ", end="")
     xs = []
     ys = []
     iters = 1
     t = time()
+
+    # First find a starting point for `iters` where the time is at least FIRST_ITER_THRESHOLD seconds
     while len(xs) == 0:
+        log(str(iters) + " ", end="", flush=True)
         st = do_bench(cliargs, iters)
-        if st * iters < 0.65:
+        if st < FIRST_ITER_THRESHOLD: # Artem: Joseph had `st * iters < ...` here but I think it's a typo
            iters *= 2
            continue
         xs.append(iters)
         ys.append(st)
+
+    log(" | ", end="", flush=True)
+    # Do two more trials increasing iters regardless of time
     for _ in range(2):
-        if time() - t < 3.5:
-            iters = int(math.trunc(float(iters) * 1.2) + 1)
-        else:
-            iters += 1 + iters // 20
-        st = do_bench(cliargs, iters)
-        xs.append(iters)
-        ys.append(st)
-    while time() - t < 3.5:
-        if time() - t < 3.5:
-            iters = int(math.trunc(float(iters) * 1.2) + 1)
-        else:
-            iters += 1 + iters // 20
-        st = do_bench(cliargs, iters)
-        xs.append(iters)
-        ys.append(st)
+        iters = iter(iters, cliargs, t, xs, ys)
+
+    log(" | ", end="", flush=True)
+    # Keep increasing iters until we reach TOTAL_TIME_THRESHOLD seconds of execution in total
+    while time() - t < TOTAL_TIME_THRESHOLD:
+        iters = iter(iters, cliargs, t, xs, ys)
+    log("done!")
 
     m, b, sig = linear_regression_with_std(xs, ys)
     p, lnc, lngsd = linear_regression_with_std([math.log(x) for x in xs], [math.log(y) for y in ys])
     c, gsd = math.exp(lnc), math.exp(lngsd)
 
-    print(f"Slope (Mean): {m:.2e}, Stdev: {sig:.2e}, Intercept (Overhead): {b:.2e}")
-    print(f"Factor (Geomean): {c:.2e}, GeoStdev: {gsd:.2e}, Power (Distortion): {p:.2e}")
+    log(f"Slope (Mean): {m:.2e}, Stdev: {sig:.2e}, Intercept (Overhead): {b:.2e}")
+    log(f"Factor (Geomean): {c:.2e}, GeoStdev: {gsd:.2e}, Power (Distortion): {p:.2e}")
 
     if MAKE_PLOT:
-        plt.plot(xs, ys, 'rx')
-        plt.plot([xs[0], xs[-1]], [m*xs[0]+b, m*xs[-1]+b], color="blue")
-        plt.plot(xs, [c*x**p for x in xs], color="green")
-        plt.savefig("plot.png")
+        plot(xs, ys, b, c, m, p)
 
     return m, sig, c, gsd
 
 if __name__ == "__main__":
-    converge(argv)
+    converge(argv[1:])
diff --git a/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py b/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py
index 9327ba6..0014800 100755
--- a/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py
+++ b/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py
@@ -14,7 +14,7 @@ def bounds(name):
     match name:
         case "Insertionsort":
             lo = 3 # 2**n ...
- hi = 16 + hi = 12 # for local testing; initially: 16 case "Quicksort": lo = 3 hi = 22 @@ -34,17 +34,16 @@ def bounds(name): hi = 20 return lo, hi, (hi-lo)*DENSITY+1 -def dotrial(name, size): - return converge([sys.argv[0], "benchrunner", name, "Seq", str(int(size))]) +def dotrial(exe, name, size): + return converge([exe, name, "Seq", str(int(size))]) if __name__ == "__main__": + exe = sys.argv[1] + print("Running with executable:", exe) for name in names: lo, hi, pts = bounds(name) with open("%s_out3.csv" % name, "w") as f: f.write("# size\tmean\tstddev\tgeomean\tgeostdev\n") - for i in np.unique(np.logspace(lo, hi, pts, base=2).astype(int)): + for i in np.unique(np.logspace(lo, hi, pts, base=2).astype(int)): # Artem: I don't understand this and I must with open("%s_out3.csv" % name, "a") as f: - try: - f.write("%d" % int(i) + "\t%f\t%f\t%f\t%f\n" % dotrial(name, i)) - except: - pass + f.write("%d" % int(i) + "\t%f\t%f\t%f\t%f\n" % dotrial(exe, name, i))
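To illustrate the geometric variant that `converge` also computes, the same fit can be done in log-log space: with `ln y = p ln x + ln c`, exponentiating the intercept gives the factor (geomean) `c`, the slope becomes the power (distortion) `p`, and exponentiating the standard deviation of the log-residuals gives the geometric standard deviation. A minimal sketch with made-up data, mirroring the second `linear_regression_with_std` call in `converge`:

```python
import numpy as np

# The same kind of hypothetical sweep data that do_bench would return.
iters = np.array([1, 2, 4, 8, 16, 32], dtype=float)
batchtime = 5e-4 * iters + 2e-3

# Fit y = c * x**p by linear regression in log-log space: ln(y) = p*ln(x) + ln(c).
p, ln_c = np.polyfit(np.log(iters), np.log(batchtime), 1)
c = np.exp(ln_c)  # factor (geomean)
residuals = np.log(batchtime) - (p * np.log(iters) + ln_c)
gsd = np.exp(np.std(residuals))  # geometric standard deviation
print(f"factor (geomean): {c:.2e}, power (distortion): {p:.2f}, geo-stdev: {gsd:.2f}")
```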