@@ -0,0 +1,81 @@
#!/usr/bin/env python3
import numpy as np
from sys import argv
import subprocess
from time import time
import math

from matplotlib import pyplot as plt

MAKE_PLOT = False

def linear_regression_with_std(x, y):
    x = np.array(x)
    y = np.array(y)
    x_mean = np.mean(x)
    y_mean = np.mean(y)
    numerator = np.sum((x - x_mean) * (y - y_mean))
    denominator = np.sum((x - x_mean) ** 2)
    slope = numerator / denominator
    intercept = y_mean - slope * x_mean
    y_pred = slope * x + intercept
    residuals = y - y_pred
    std_dev = np.std(residuals)
    return slope, intercept, std_dev

def do_bench(cliargs, iters):
    # Run the benchmark binary once with the given iteration count and parse
    # the BATCHTIME/SELFTIMED figures it prints; `out` is the repr of the
    # captured bytes, hence the offset arithmetic below.
    print([cliargs[1], str(iters)] + cliargs[2:])
    out = str(subprocess.check_output([cliargs[1], str(iters)] + cliargs[2:]))
    s1 = out[out.find("SELFTIMED")+11:]
    s2 = float(s1[:s1.find("\n")-4])
    selftimed = s2

    b1 = out[out.find("BATCHTIME")+11:]
    b2 = float(b1[:b1.find("SELFTIMED")-2])
    batchtime = b2

    print(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}")
    return batchtime

def converge(cliargs):
    xs = []
    ys = []
    iters = 1
    t = time()
    # Double iters until the first measurement clears the 0.65 threshold,
    # then keep that point as the first sample.
    while len(xs) == 0:
        st = do_bench(cliargs, iters)
        if st * iters < 0.65:
            iters *= 2
            continue
        xs.append(iters)
        ys.append(st)
    for _ in range(2):
        if time() - t < 3.5:
            iters = int(math.trunc(float(iters) * 1.2) + 1)
        else:
            iters += 1 + iters // 20
        st = do_bench(cliargs, iters)
        xs.append(iters)
        ys.append(st)
    # Keep collecting (iters, batchtime) samples until the ~3.5 s budget runs out.
    while time() - t < 3.5:
        if time() - t < 3.5:
            iters = int(math.trunc(float(iters) * 1.2) + 1)
        else:
            iters += 1 + iters // 20
        st = do_bench(cliargs, iters)
        xs.append(iters)
        ys.append(st)
    # Arithmetic fit: slope = mean time per iteration, intercept = fixed overhead.
    m, b, sigma = linear_regression_with_std(xs, ys)
    print(f"Slope (Mean): {m}, Intercept (Overhead): {b}, Stdev: {sigma}")
    # Geometric fit in log-log space: slope = power, exp(intercept) = factor.
    p, lnc, lngsd = linear_regression_with_std([math.log(x) for x in xs], [math.log(y) for y in ys])
    c, gsd = math.exp(lnc), math.exp(lngsd)
    print(f"Power (Distortion): {p}, Factor (Geomean) {c}, GeoStdev {gsd}")
    if MAKE_PLOT:
        plt.plot(xs, ys, 'rx')
        plt.plot([xs[0], xs[-1]], [m*xs[0]+b, m*xs[-1]+b], color="blue")
        plt.plot(xs, [c*x**p for x in xs], color="green")
        plt.savefig("plot.png")
    return m, sigma, c, gsd

if __name__ == "__main__":
    print(converge(argv))
5 changes: 5 additions & 0 deletions benchmarks/scripts/criterion-drop-in-replacement/readme
@@ -0,0 +1,5 @@
The script `criterionmethodology.py` is my implementation of a benchrunner-runner that uses the criterion methodology. We take as input some program which takes `iters` as a command-line argument, times a function of interest in a tight loop that repeats `iters` times, and then prints to stdout the batchtime (total loop time) and selftimed (total loop time divided by `iters`). The essence of criterion is then to sweep `iters` and perform a linear regression of batchtime against `iters`. The slope is the mean per-iteration time and the y-intercept represents shared overhead that is insensitive to `iters`. Ultimately, criterion serves as a way to benchmark tasks with very short execution times, since constant startup overhead is separated out by the regression rather than contaminating the mean.
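
As a minimal sketch of that fit (separate from the script itself, using NumPy's polyfit on hypothetical numbers: a 2 ms per-iteration cost plus 50 ms of fixed overhead):

import numpy as np

iters = np.array([1, 2, 4, 8, 16, 32])
batchtime = 0.002 * iters + 0.050   # hypothetical total loop times, in seconds

slope, intercept = np.polyfit(iters, batchtime, 1)
print(slope)      # ~0.002 -> mean time per iteration
print(intercept)  # ~0.050 -> shared overhead, insensitive to iters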

Since we have relatively precise timing over loops, I also implemented the criterion methodology *geometrically*. I take the logarithm of all the x and y values, compute the linear regression over that, then exponentiate the y-intercept; this represents the geomean. The other dependent portion, the slope, becomes a power (the equation is y = e^b x^m), which represents *geometric overhead*, i.e. how much overhead is added per iteration. This may do well to model slowdowns arising from pre-allocating arrays. Additionally, since performance data is non-negative and judged multiplicatively (twice as good means the numbers are halved, twice as bad means they are doubled; these are all *factors*), the geomean and geo-standard-deviation may make more sense theoretically. However, from my testing, the geomean seems to vary wildly for programs with fleeting execution times, even between repeat runs with the same parameters.
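
A corresponding sketch of the log-log fit, again on hypothetical numbers generated from y = c * x^p with factor c = 0.002 and power p = 1.1 (a slight per-iteration distortion):

import numpy as np

iters = np.array([1, 2, 4, 8, 16, 32], dtype=float)
batchtime = 0.002 * iters ** 1.1    # hypothetical measurements

p, ln_c = np.polyfit(np.log(iters), np.log(batchtime), 1)
print(p)             # ~1.1   -> power, i.e. geometric overhead per iteration
print(np.exp(ln_c))  # ~0.002 -> factor, recovered by exponentiating the intercept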

The scripts `criterionmethodology.py` and `sweep_seq.py` can both be run directly. The first takes command-line arguments, e.g. `criterionmethodology benchrunner Quicksort Seq 2000` will call `benchrunner iters Quicksort Seq 2000` for various `iters`. `sweep_seq` performs a logarithmic sweep over different array sizes, invoking the criterion methodology at each point.
51 changes: 51 additions & 0 deletions benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
import os
import numpy as np
from criterionmethodology import converge
import sys

# names = ["Optsort", "Insertionsort", "Mergesort", "Quicksort"]
# names = ["CopyArray", "Quicksort", "Insertionsort", "Mergesort"]
names = ["Insertionsort"]

# DENSITY = 4
DENSITY = 12
def bounds(name):
    match name:
        case "Insertionsort":
            lo = 3  # 2**n ...
            hi = 16
        case "Quicksort":
            lo = 3
            hi = 22
        case "Mergesort":
            # lo = 12
            lo = 3
            hi = 24
        case "Cilksort":
            # lo = 12
            lo = 3
            hi = 16  # 24
        case "Optsort":
            lo = 3
            hi = 16  # 24
        case _:
            lo = 3
            hi = 20
    return lo, hi, (hi-lo)*DENSITY+1

def dotrial(name, size):
    return converge([sys.argv[0], "benchrunner", name, "Seq", str(int(size))])

if __name__ == "__main__":
    for name in names:
        lo, hi, pts = bounds(name)
        with open("%s_out3.csv" % name, "w") as f:
            f.write("# size\tmean\tstddev\tgeomean\tgeostdev\n")
        for i in np.unique(np.logspace(lo, hi, pts, base=2).astype(int)):
            with open("%s_out3.csv" % name, "a") as f:
                try:
                    f.write("%d" % int(i) + "\t%f\t%f\t%f\t%f\n" % dotrial(name, i))
                except Exception:
                    # Skip sizes where the benchmark run fails.
                    pass

File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions benchrunner/Benchrunner.hs
@@ -19,6 +19,7 @@ import qualified Insertion as I
import qualified QuickSort as Q
import qualified DpsMergeSort4 as DMS
import qualified DpsMergeSort4Par as DMSP
import qualified CilkSort as CSP
import qualified PiecewiseFallbackSort as PFS
import qualified PiecewiseFallbackSortPar as PFSP
import qualified Microbench as MB
@@ -41,6 +42,7 @@ getInput bench mb_size = case bench of
  Insertionsort -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 100)
  Quicksort -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 1000000)
  Mergesort -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 8000000)
  Cilksort -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 8000000)
  Optsort -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 8000000)
  _ -> error "getInput: Unexpected Input!"
  where
@@ -103,6 +105,7 @@ sortFn bench parorseq = case (bench,parorseq) of
  (Mergesort, Par) -> DMSP.msort
  (Optsort, Seq) -> PFS.pfsort
  (Optsort, Par) -> PFSP.pfsort
  (Cilksort, Par) -> CSP.cilkSort
  oth -> error $ "sortFn: unknown configuration: " ++ show oth

vectorSortFn :: SortAlgo -> ParOrSeq -> VecSort
1 change: 1 addition & 0 deletions benchrunner/Types.hs
@@ -11,6 +11,7 @@ data SortAlgo
  = Insertionsort
  | Mergesort
  | Quicksort
  | Cilksort
  | Optsort -- piecewise fallback
  deriving (Eq, Show, Read)

8 changes: 2 additions & 6 deletions lh-array-sort.cabal
@@ -67,13 +67,9 @@ library
    PiecewiseFallbackSort
    PiecewiseFallbackSortPar

    -- JZ: Add Parallel Cilksort
    -- Current Cilksort is entirely sequential
    QuickSortCilk
    CilkSort

    -- remove until ready:
    -- QuickSortNew
    -- the last not quite ready yet?
    -- CilkSort
    Linear.Common
  other-modules:
    Array.List
46 changes: 45 additions & 1 deletion src/Array.hs
@@ -1,6 +1,7 @@
{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE LiberalTypeSynonyms #-}

-- {-# LANGUAGE Strict #-}

@@ -15,6 +16,9 @@ module Array

  -- * Construction and querying
  , alloc, make, generate, generate_par, generate_par_m, makeArray
  , flattenCallback, makeCallback, biJoinAllocAffine, allocScratchAffine
  , biJoinAlloc, allocScratch

  , copy, copy_par, copy_par_m
  , size, get, set, slice, append
  , splitAt
@@ -95,9 +99,49 @@ makeArray = make
#endif

{-# INLINE free #-}
free :: HasPrim a => Array a -. ()
free :: Array a -. ()
free = Unsafe.toLinear (\_ -> ())

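-- The helpers below run an array computation against a freshly allocated
-- scratch array without the caller having to thread that scratch array
-- through its own types: `flattenCallback` turns a CPS-style transformer
-- into a direct `Array a -. Array b`, `makeCallback` converts a direct
-- transformer back into callback style, and the `biJoinAlloc*` /
-- `allocScratch*` variants allocate an `i`-element scratch array filled
-- with `a`, pass it together with the source array to `f`, and (in the
-- non-Affine versions) free whatever remains of the scratch array.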
{-# INLINE flattenCallback #-}
{-@ flattenCallback :: f:_ -> xs:_ ->
      ret:{ ys:(Array a) } @-}
flattenCallback :: (forall c. (Array b -. Ur c) -. Array a -. Ur c) -. Array a -. Array b
flattenCallback f arr = unur (f ur arr)

{-# INLINE makeCallback #-}
makeCallback :: (Array b -. Array a) -. (Array a -. Ur c) -. Array b -. Ur c
makeCallback direct k arr = k (direct arr)

{-# INLINE biJoinAllocAffine #-}
biJoinAllocAffine :: HasPrim tmps => Int -> tmps -> (Array tmps -. Array srcs -. Array dsts) -> Array srcs -. Array dsts
biJoinAllocAffine i a f = flattenCallback (\cont src -> alloc i a (\tmp -> makeCallback (f tmp) cont src))

-- efficient implementation of above
{-# INLINE allocScratchAffine #-}
allocScratchAffine :: HasPrim tmps => Int -> tmps -> (Array srcs -. Array tmps -. Array dsts) -> Array srcs -. Array dsts
allocScratchAffine i a f arr = f arr (makeArray i a)

{-# INLINE biJoinAlloc #-}
biJoinAlloc :: HasPrim tmps => Int -> tmps -> (Array tmps -. Array srcs -. (Array dsts, Array tmpdsts)) -> Array srcs -. Array dsts
biJoinAlloc i a f =
  let
    g tmp src =
      let
        !(dst, tmp') = f tmp src
      in
        case free tmp' of !() -> dst
  in
    flattenCallback (\cont src -> alloc i a (\tmp -> makeCallback (g tmp) cont src))

-- efficient implementation of above
{-# INLINE allocScratch #-}
allocScratch :: HasPrim tmps => Int -> tmps -> (Array srcs -. Array tmps -. (Array dsts, Array tmpdsts)) -> Array srcs -. Array dsts
allocScratch i a f arr =
  let
    !(dst, tmp) = f arr (makeArray i a)
  in case free tmp of !() -> dst


--------------------------------------------------------------------------------
-- Parallel operations
--------------------------------------------------------------------------------