michaelborkowski · jazullo · Mar 24, 2025 · Mar 24, 2025 · Mar 24, 2025 · Mar 24, 2025
diff --git a/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py b/benchmarks/scripts/criterion-drop-in-replacement/criterionmethodology.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+import numpy as np
+from sys import argv
+import subprocess
+from time import time
+import math
+
+from matplotlib import pyplot as plt
+
+# When True, converge() also saves a plot of the samples and both fitted curves to plot.png.
+MAKE_PLOT = False
+
+def linear_regression_with_std(x, y):
+    """Fit ``y = slope * x + intercept`` by ordinary least squares.
+
+    Args:
+        x: Sequence of sample positions.
+        y: Sequence of observed values, same length as ``x``.
+
+    Returns:
+        A ``(slope, intercept, std_dev)`` tuple, where ``std_dev`` is
+        ``np.std`` of the residuals of the fitted line.
+    """
+    xs = np.array(x)
+    ys = np.array(y)
+    x_center, y_center = np.mean(xs), np.mean(ys)
+    dx = xs - x_center
+    dy = ys - y_center
+    # Closed-form least-squares slope: sum of cross-deviations over the
+    # sum of squared x-deviations.
+    slope = np.sum(dx * dy) / np.sum(dx ** 2)
+    intercept = y_center - slope * x_center
+    residuals = ys - (slope * xs + intercept)
+    return slope, intercept, np.std(residuals)
+
+def do_bench(cliargs, iters):
+    """Run the benchmark program once and return its reported batch time.
+
+    The program is invoked as ``cliargs[1] <iters> <cliargs[2:]...>`` and is
+    expected to print ``BATCHTIME: <number>`` and ``SELFTIMED: <number>`` on
+    stdout.
+
+    Args:
+        cliargs: argv-style list; ``cliargs[0]`` is ignored.
+        iters: Iteration count, passed to the program as its first argument.
+
+    Returns:
+        The ``BATCHTIME`` value as a float.
+
+    Raises:
+        subprocess.CalledProcessError: If the program exits with a non-zero status.
+        ValueError: If either timing field cannot be found in the output.
+    """
+    import re  # local import so this function does not depend on a file-level change
+
+    command = [cliargs[1], str(iters)] + cliargs[2:]
+    print(command)
+    # Decode the output rather than taking str() of the bytes object: the
+    # repr form ("b'...\\n'") contains no real newline, so the previous
+    # fixed-offset slicing could cut digits off the parsed numbers.
+    out = subprocess.check_output(command).decode(errors="replace")
+
+    def field(label):
+        # First occurrence of "<label>: <float literal>".
+        found = re.search(
+            label + r":\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)", out
+        )
+        if found is None:
+            raise ValueError(f"no {label} value in benchmark output: {out!r}")
+        return float(found.group(1))
+
+    selftimed = field("SELFTIMED")
+    batchtime = field("BATCHTIME")
+
+    print(f"ITERS: {iters}, BATCHTIME: {batchtime}, SELFTIMED: {selftimed}")
+    return batchtime
+
+_TIME_BUDGET = 3.5  # seconds of wall-clock time during which iters grows by ~20% per step
+_MIN_FIRST_SAMPLE = 0.65  # threshold on (do_bench result * iters) for the first kept sample
+
+
+def _next_iters(iters, elapsed):
+    """Return the next iteration count: ~+20% inside the time budget, ~+5% after it."""
+    if elapsed < _TIME_BUDGET:
+        return int(math.trunc(float(iters) * 1.2) + 1)
+    return iters + 1 + iters // 20
+
+
+def converge(cliargs):
+    """Sweep the iteration count of a benchmark and fit its timings.
+
+    Args:
+        cliargs: argv-style list forwarded to ``do_bench`` (program name at
+            index 1, extra program arguments after it).
+
+    Returns:
+        ``(m, sigma, c, gsd)``: slope and residual standard deviation of the
+        linear fit of ``do_bench`` results against ``iters``, followed by
+        ``exp(intercept)`` and ``exp(residual std)`` of the same fit done on
+        the logarithms of both axes.
+
+    Raises:
+        ValueError: From ``math.log`` if any measured time is not positive.
+    """
+    xs = []
+    ys = []
+    iters = 1
+    t = time()
+
+    # Warm-up: keep doubling iters until the first sample passes the threshold.
+    # NOTE(review): st is the batch time returned by do_bench, so it is
+    # multiplied by iters a second time here -- confirm that is intended.
+    st = do_bench(cliargs, iters)
+    while st * iters < _MIN_FIRST_SAMPLE:
+        iters *= 2
+        st = do_bench(cliargs, iters)
+    xs.append(iters)
+    ys.append(st)
+
+    # Always take two more samples (three points in total), then keep
+    # sampling until the time budget is used up.
+    while len(xs) < 3 or time() - t < _TIME_BUDGET:
+        iters = _next_iters(iters, time() - t)
+        st = do_bench(cliargs, iters)
+        xs.append(iters)
+        ys.append(st)
+
+    m, b, sigma = linear_regression_with_std(xs, ys)
+    print(f"Slope (Mean): {m}, Intercept (Overhead): {b}, Stdev: {sigma}")
+    # Same regression in log-log space: y = c * x**p.
+    p, lnc, lngsd = linear_regression_with_std([math.log(x) for x in xs], [math.log(y) for y in ys])
+    c, gsd = math.exp(lnc), math.exp(lngsd)
+    print(f"Power (Distortion): {p}, Factor (Geomean) {c}, GeoStdev {gsd}")
+    if MAKE_PLOT:
+        plt.plot(xs, ys, 'rx')
+        plt.plot([xs[0], xs[-1]], [m*xs[0]+b, m*xs[-1]+b], color="blue")
+        plt.plot(xs, [c*x**p for x in xs], color="green")
+        plt.savefig("plot.png")
+    return m, sigma, c, gsd
+
+if __name__ == "__main__":
+    # Expected invocation: criterionmethodology.py PROGRAM [ARGS...]
+    # Without a program name, do_bench would fail on cliargs[1] with a bare
+    # IndexError, so stop early with a usage message instead.
+    if len(argv) < 2:
+        raise SystemExit("usage: criterionmethodology.py PROGRAM [ARGS...]")
+    print(converge(argv))
diff --git a/benchmarks/scripts/criterion-drop-in-replacement/readme b/benchmarks/scripts/criterion-drop-in-replacement/readme
@@ -0,0 +1,5 @@
+The script `criterionmethodology.py` is my implementation of a benchrunner-runner that uses the criterion methodology. We take as input some program which takes `iters` as a command-line argument, times a function of interest in a tight loop which repeats `iters` many times, and then prints to stdout the batchtime (total loop time) and selftimed (total loop time divided by iters). The essence of criterion is then to sweep `iters` and perform a linear regression against iters and batchtime. The slope is the mean and the y-intercept represents some notion of shared overhead, insensitive to `iters`. Ultimately, criterion serves as a way to benchmark tasks with very short execution times, as startup overhead can be ignored. 
+
+Since we have relatively precise timing over loops, I also implemented the criterion methodology *geometrically*. I take the logarithm of all the x and y values, compute the linear regression over that, then exponentiate the y-intercept - this represents the geomean. The other dependent portion, which is the slope, becomes a power (the equation is y = e^b x^m), which represents *geometric overhead*, e.g. how much overhead is being added per iteration. This may do well to model any slowdowns arising from pre-allocating arrays. Additionally, since performance data is non-negative and judged multiplicatively (twice as good means numbers are halved, twice as bad means numbers are doubled; these are all *factors*), the geomean and geo-standard-deviation may make more sense theoretically. However, from my testing, the geomean seems to vary wildly for programs with fleeting execution times, even between repeat runs with the same parameters. 
+
+The scripts `criterionmethodology.py` and `sweep_seq.py` can both be run directly. The first takes command-line arguments, e.g. `criterionmethodology benchrunner Quicksort Seq 2000` will call `benchrunner iters Quicksort Seq 2000` for various `iters`. `sweep_seq` performs a logarithmic sweep over different array sizes, invoking the criterion methodology at each point. 
diff --git a/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py b/benchmarks/scripts/criterion-drop-in-replacement/sweep_seq.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+import os
+import numpy as np
+from criterionmethodology import converge
+import sys
+
+# Benchmarks to sweep. Each name is passed to benchrunner as the benchmark
+# argument and is also used as the prefix of the output CSV file.
+# names = ["Optsort", "Insertionsort", "Mergesort", "Quicksort"]
+# names = ["CopyArray", "Quicksort", "Insertionsort", "Mergesort"]
+names = ["Insertionsort"]
+
+# Number of sample points per power of two in the size sweep
+# (an alternative setting of 4 is kept here for reference).
+# DENSITY = 4
+DENSITY = 12
+
+
+def bounds(name):
+    """Return ``(lo, hi, points)`` describing the size sweep for ``name``.
+
+    ``lo`` and ``hi`` are exponents, i.e. sizes run from ``2**lo`` to
+    ``2**hi``; ``points`` is the number of samples across that range
+    (``DENSITY`` per power of two, plus one for the final endpoint).
+    Names without an entry fall back to the range ``(3, 20)``.
+    """
+    exponent_ranges = {
+        "Insertionsort": (3, 16),
+        "Quicksort": (3, 22),
+        "Mergesort": (3, 24),
+        "Cilksort": (3, 16),
+        "Optsort": (3, 16),
+    }
+    lo, hi = exponent_ranges.get(name, (3, 20))
+    return lo, hi, (hi - lo) * DENSITY + 1
+
+def dotrial(name, size):
+    """Run one criterion measurement of ``name`` in ``Seq`` mode at ``size``.
+
+    Builds the argv-style list ``[<this script>, "benchrunner", name, "Seq",
+    <size as an integer string>]`` and returns the result of ``converge``
+    on it.
+    """
+    cliargs = [sys.argv[0], "benchrunner", name, "Seq"]
+    cliargs.append(str(int(size)))
+    return converge(cliargs)
+
+if __name__ == "__main__":
+    for name in names:
+        lo, hi, pts = bounds(name)
+        out_path = "%s_out3.csv" % name
+        # Start a fresh file holding only the header row.
+        with open(out_path, "w") as f:
+            f.write("# size\tmean\tstddev\tgeomean\tgeostdev\n")
+        # Log-spaced integer sizes; np.unique removes duplicates created by
+        # truncating neighbouring points to the same integer.
+        for i in np.unique(np.logspace(lo, hi, pts, base=2).astype(int)):
+            try:
+                result = dotrial(name, i)
+            except Exception as err:
+                # Best effort: a failed size is still skipped, but it is
+                # reported instead of silently dropped, and Ctrl-C is no
+                # longer swallowed by a bare except.
+                print(f"{name} size {int(i)} failed: {err!r}", file=sys.stderr)
+                continue
+            # The file is re-opened and closed for every row, so each finished
+            # row is written out before the next trial starts.
+            with open(out_path, "a") as f:
+                f.write("%d\t%f\t%f\t%f\t%f\n" % (int(i), *result))
+
diff --git a/...marks/scripts/c-sorting-benchmarks/readme → ...old-criterion/c-sorting-benchmarks/readme b/...marks/scripts/c-sorting-benchmarks/readme → ...old-criterion/c-sorting-benchmarks/readme
diff --git a/...sorting-benchmarks/sort_insertion_out.csv → ...sorting-benchmarks/sort_insertion_out.csv b/...sorting-benchmarks/sort_insertion_out.csv → ...sorting-benchmarks/sort_insertion_out.csv
diff --git a/...sorting-benchmarks/sort_merge_seq_out.csv → ...sorting-benchmarks/sort_merge_seq_out.csv b/...sorting-benchmarks/sort_merge_seq_out.csv → ...sorting-benchmarks/sort_merge_seq_out.csv
diff --git a/...s/c-sorting-benchmarks/sort_quick_out.csv → ...n/c-sorting-benchmarks/sort_quick_out.csv b/...s/c-sorting-benchmarks/sort_quick_out.csv → ...n/c-sorting-benchmarks/sort_quick_out.csv
diff --git a/benchmarks/scripts/plot.py → benchmarks/scripts/old-criterion/plot.py b/benchmarks/scripts/plot.py → benchmarks/scripts/old-criterion/plot.py
diff --git a/benchmarks/scripts/plot_relative_speedup.py → ...ts/old-criterion/plot_relative_speedup.py b/benchmarks/scripts/plot_relative_speedup.py → ...ts/old-criterion/plot_relative_speedup.py
diff --git a/benchmarks/scripts/readme → benchmarks/scripts/old-criterion/readme b/benchmarks/scripts/readme → benchmarks/scripts/old-criterion/readme
diff --git a/benchmarks/scripts/sweep_seq.py → ...hmarks/scripts/old-criterion/sweep_seq.py b/benchmarks/scripts/sweep_seq.py → ...hmarks/scripts/old-criterion/sweep_seq.py
diff --git a/benchrunner/Benchrunner.hs b/benchrunner/Benchrunner.hs
@@ -19,6 +19,7 @@ import qualified Insertion as I
 import qualified QuickSort as Q
 import qualified DpsMergeSort4 as DMS
 import qualified DpsMergeSort4Par as DMSP
+import qualified CilkSort as CSP
 import qualified PiecewiseFallbackSort as PFS
 import qualified PiecewiseFallbackSortPar as PFSP
 import qualified Microbench as MB
@@ -41,6 +42,7 @@ getInput bench mb_size = case bench of
     Insertionsort -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 100)
     Quicksort     -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 1000000)
     Mergesort     -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 8000000)
+    Cilksort       -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 8000000)
     Optsort       -> ArrayIn <$> randArray (Proxy :: Proxy Int64) (mb 8000000)
   _ -> error "getInput: Unexpected Input!"
   where
@@ -103,6 +105,7 @@ sortFn bench parorseq = case (bench,parorseq) of
   (Mergesort, Par) -> DMSP.msort
   (Optsort,   Seq) -> PFS.pfsort
   (Optsort,   Par) -> PFSP.pfsort
+  (Cilksort,  Par) -> CSP.cilkSort
   oth -> error $ "sortFn: unknown configuration: " ++ show oth
 
 vectorSortFn :: SortAlgo -> ParOrSeq -> VecSort

diff --git a/benchrunner/Types.hs b/benchrunner/Types.hs
@@ -11,6 +11,7 @@ data SortAlgo
   = Insertionsort
   | Mergesort
   | Quicksort
+  | Cilksort
   | Optsort -- piecewise fallback
   deriving (Eq, Show, Read)
 

diff --git a/lh-array-sort.cabal b/lh-array-sort.cabal
@@ -67,13 +67,9 @@ library
       PiecewiseFallbackSort
       PiecewiseFallbackSortPar
 
-      -- JZ: Add Parallel Cilksort
-      -- Current Cilksort is entirely sequential
+      QuickSortCilk
+      CilkSort
 
--- remove until ready:
---      QuickSortNew
---  the last not quite ready yet?
---      CilkSort
       Linear.Common
   other-modules:
       Array.List

diff --git a/src/Array.hs b/src/Array.hs
@@ -1,6 +1,7 @@
 {-# LANGUAGE CPP           #-}
 {-# LANGUAGE BangPatterns  #-}
 {-# LANGUAGE DeriveFunctor #-}
+{-# LANGUAGE LiberalTypeSynonyms #-}
 
 -- {-# LANGUAGE Strict        #-}
 
@@ -15,6 +16,9 @@ module Array
 
     -- * Construction and querying
   , alloc, make, generate, generate_par, generate_par_m, makeArray
+  , flattenCallback, makeCallback, biJoinAllocAffine, allocScratchAffine
+  , biJoinAlloc, allocScratch
+
   , copy, copy_par, copy_par_m
   , size, get, set, slice, append
   , splitAt
@@ -95,9 +99,47 @@ makeArray = make
 #endif
 
+-- Linearly consume an array and return unit. The argument is simply ignored
+-- (via Unsafe.toLinear); no element constraint is needed for that.
 {-# INLINE free #-}
-free :: HasPrim a => Array a -. ()
+free :: Array a -. ()
 free = Unsafe.toLinear (\_ -> ())
 
+-- Turn a continuation-passing array transformer into a direct one: the
+-- transformer is run with `ur` as its continuation and the result is
+-- unwrapped again with `unur`.
+{-# INLINE flattenCallback #-}
+flattenCallback :: (forall c. (Array b -. Ur c) -. Array a -. Ur c) -. Array a -. Array b
+flattenCallback f arr = unur (f ur arr)
+
+-- Turn a direct array transformer into continuation-passing form: apply
+-- `direct` to the input array and hand the result to the continuation `k`.
+{-# INLINE makeCallback #-}
+makeCallback :: (Array b -. Array a) -. (Array a -. Ur c) -. Array b -. Ur c
+makeCallback direct k arr = k (direct arr)
+
+-- Run `f` on a temporary array obtained from `alloc i a` together with the
+-- source array, and return the array `f` produces. `f` does not hand the
+-- temporary back, so it is never passed to `free` here.
+-- (Presumably `i` is the size and `a` the initial element of the temporary,
+-- as with `makeArray` -- confirm against `alloc`.)
+{-# INLINE biJoinAllocAffine #-}
+biJoinAllocAffine :: HasPrim tmps => Int -> tmps -> (Array tmps -. Array srcs -. Array dsts) -> Array srcs -. Array dsts
+biJoinAllocAffine i a f = flattenCallback (\cont src -> alloc i a (\tmp -> makeCallback (f tmp) cont src))
+
+-- efficient implementation of above
+-- Direct-style variant: the temporary is built with `makeArray i a` and `f`
+-- is called on it directly. Note that here `f` takes the source array first
+-- and the temporary second, the reverse of biJoinAllocAffine.
+{-# INLINE allocScratchAffine #-}
+allocScratchAffine :: HasPrim tmps => Int -> tmps -> (Array srcs -. Array tmps -. Array dsts) -> Array srcs -. Array dsts
+allocScratchAffine i a f arr = f arr (makeArray i a)
+
+-- Like biJoinAllocAffine, but `f` returns a pair (result, leftover temporary).
+-- The leftover temporary is consumed with `free` and only the result array
+-- is returned to the caller.
+{-# INLINE biJoinAlloc #-}
+biJoinAlloc :: HasPrim tmps => Int -> tmps -> (Array tmps -. Array srcs -. (Array dsts, Array tmpdsts)) -> Array srcs -. Array dsts
+biJoinAlloc i a f = 
+  let
+    g tmp src = 
+      let
+        !(dst, tmp') = f tmp src
+      in
+      case free tmp' of !() -> dst
+  in
+  flattenCallback (\cont src -> alloc i a (\tmp -> makeCallback (g tmp) cont src))
+
+-- efficient implementation of above
+-- Direct-style variant of biJoinAlloc: build a scratch array with
+-- `makeArray i a`, run `f` on the source array and the scratch array (in that
+-- order), consume the returned scratch array with `free`, and return the
+-- result array.
+{-# INLINE allocScratch #-}
+allocScratch :: HasPrim tmps => Int -> tmps -> (Array srcs -. Array tmps -. (Array dsts, Array tmpdsts)) -> Array srcs -. Array dsts
+allocScratch i a f arr = 
+  let
+    !(dst, tmp) = f arr (makeArray i a)
+  in case free tmp of !() -> dst
+
+
 --------------------------------------------------------------------------------
 -- Parallel operations
 --------------------------------------------------------------------------------

diff --git a/src/CilkSort.hs b/src/CilkSort.hs
@@ -6,24 +6,26 @@ module CilkSort where
 import qualified Language.Haskell.Liquid.Bag as B
 import           Language.Haskell.Liquid.ProofCombinators hiding ((?))
 import           ProofCombinators
-import           Array
+import           Array as A
 import           ArrayOperations
-import           DpsMerge
+import           DpsMergePar
+import qualified DpsMergeSort4 as Seq
 import Properties.Equivalence
 import Properties.Order
-import           Insertion
 import           QuickSortCilk
+import           Par
 
+import           Linear.Common
 #ifdef MUTABLE_ARRAYS
 import           Array.Mutable as A
+import           Control.DeepSeq ( NFData(..) )
 #else
 import           Array.List as A
 #endif
 
 #define KILO 1024
-#define MERGESIZE (2*KILO)
-#define QUICKSIZE (2*KILO)
-#define INSERTIONSIZE 20
+-- Length thresholds compared against the array size in cilkSortInplace:
+-- up to QUICKSIZE elements quickSort is used; up to SEQSIZE elements the
+-- sort stays sequential.
+#define QUICKSIZE (8*KILO)
+#define SEQSIZE   (8*KILO)
 
 -- DPS mergesort -- unfold twice, merge twice
 {-@ cilkSortInplace :: xs:Array a
@@ -35,51 +37,70 @@ import           Array.List as A
                                        left zs == left xs && right zs == right xs &&
                                        left ts == left ys && right ts == right ys }>
        / [A.size xs] @-}
-cilkSortInplace :: (Show a, Ord a) => A.Array a -> A.Array a -> (A.Array a, A.Array a)
+-- Sort src using tmp as scratch space and return the pair (src, tmp).
+--   * Arrays of at most SEQSIZE elements are sorted sequentially: with
+--     quickSort when the length is at most QUICKSIZE, otherwise with
+--     Seq.msortInplace.
+--     NOTE(review): SEQSIZE and QUICKSIZE are both defined as 8*KILO, so the
+--     Seq.msortInplace branch is currently unreachable.
+--   * Larger arrays are split into quarters (src and tmp alike). The quarters
+--     are sorted recursively and combined with (.||||.), the sorted quarters
+--     are merged pairwise with merge_par using the two halves of tmp, and the
+--     two merged halves are finally merged with merge_par using src.
+--     (Presumably the (.||||.) / (.||.) combinators from Par evaluate their
+--     operands in parallel -- confirm against Par.)
+#ifdef MUTABLE_ARRAYS
+cilkSortInplace :: (Show a, HasPrimOrd a, NFData a) =>
+#else
+cilkSortInplace :: (Show a, HasPrimOrd a) =>
+#endif
+  A.Array a -. A.Array a -. (A.Array a, A.Array a)
 cilkSortInplace src tmp =
-  let (len, src') = A.size2 src in
-  if len <= QUICKSIZE
-  then let src'' = quickSort src'
-        in (src'', tmp)
+  let !(Ur len, src') = A.size2 src in
+  if len <= SEQSIZE
+  then
+    if len <= QUICKSIZE
+    then let src'' = quickSort src'
+          in (src'', tmp)
+    else Seq.msortInplace src' tmp
   else
-    let (srcA, srcB)     = splitMid src'
-        (tmpA, tmpB)     = splitMid tmp
-        (src1, src2)     = splitMid srcA
-        (src3, src4)     = splitMid srcB
-        (tmp1, tmp2)     = splitMid tmpA
-        (tmp3, tmp4)     = splitMid tmpB
-        (src1', tmp1')   = cilkSortInplace src1 tmp1
-        (src2', tmp2')   = cilkSortInplace src2 tmp2
-        (src3', tmp3')   = cilkSortInplace src3 tmp3
-        (src4', tmp4')   = cilkSortInplace src4 tmp4
+    let !(srcA, srcB)     = splitMid src'
+        !(tmpA, tmpB)     = splitMid tmp
+        !(src1, src2)     = splitMid srcA
+        !(src3, src4)     = splitMid srcB
+        !(tmp1, tmp2)     = splitMid tmpA
+        !(tmp3, tmp4)     = splitMid tmpB
+        !(((src1', tmp1'), (src2', tmp2')), ((src3', tmp3'), (src4', tmp4')))
+                         = (.||||.) (cilkSortInplace src1 tmp1) (cilkSortInplace src2 tmp2)
+                                    (cilkSortInplace src3 tmp3) (cilkSortInplace src4 tmp4)
         tmpA'            = A.append tmp1' tmp2'
         tmpB'            = A.append tmp3' tmp4'
-        (srcA'', tmpA'') = merge src1' src2' tmpA'
-        (srcB'', tmpB'') = merge src3' src4' tmpB'
+        !((srcA'', tmpA''), (srcB'', tmpB''))
+                         = merge_par src1' src2' tmpA' .||. merge_par src3' src4' tmpB'
+--                         = tuple2 (merge_par src1' src2') tmpA' (merge_par src3' src4') tmpB'
         src''            = A.append srcA'' srcB''
-        (tmp''', src''') = merge tmpA'' tmpB'' src''
-    in (src''', tmp''')  ? lem_toBag_splitMid src
+        !(tmp''', src''') = merge_par tmpA'' tmpB'' src''
+    in  (src''', tmp''') ? lem_toBag_splitMid src
                          ? lem_toBag_splitMid tmp
                          ? lem_toBag_splitMid srcA
                          ? lem_toBag_splitMid srcB
                          ? lem_toBag_splitMid tmpA
                          ? lem_toBag_splitMid tmpB
 
-{-@ cilkSort' :: { xs:(Array a) | A.size xs > 0 && left xs == 0 && right xs == size xs }
-              -> { y:a | y == A.get xs 0 }
-              -> { zs:(Array a) | toBag xs == toBag zs && isSorted' zs &&
-                                  A.size xs == A.size zs && token xs == token zs } @-}
-cilkSort' :: (Show a, Ord a) => A.Array a -> a -> A.Array a
-cilkSort' src anyVal =
-  let (len, src') = A.size2 src
-      (src'', _tmp) = cilkSortInplace src' (A.make len anyVal) in
-  _tmp `seq` src''
+-- Sort a non-empty array: A.allocScratch supplies a scratch array of the same
+-- length filled with anyVal, cilkSortInplace is run on source and scratch,
+-- and the sorted source array is returned. The fill value now comes first and
+-- the array second.
+{-@ cilkSort' :: y:a
+           -> { xs:(Array a) | A.size xs > 0 && left xs == 0 && right xs == size xs && y == A.get xs 0 }
+           -> { zs:(Array a) | toBag xs == toBag zs && isSorted' zs &&
+                               A.size xs == A.size zs && token xs == token zs } @-}
+#ifdef MUTABLE_ARRAYS
+cilkSort' :: (Show a, HasPrimOrd a, NFData a) =>
+#else
+cilkSort' :: (Show a, HasPrimOrd a) =>
+#endif
+  a -> A.Array a -. A.Array a
+cilkSort' anyVal src =
+  let !(Ur len, src') = A.size2 src
+      !src'' = A.allocScratch len anyVal cilkSortInplace src' in
+  src''
 
+-- Finally, the top-level sort function. An empty array is returned unchanged;
+-- otherwise element 0 is read and passed to cilkSort' as the fill value for
+-- its scratch array.
 {-@ cilkSort :: { xs:(A.Array a) | left xs == 0 && right xs == size xs }
                     -> { ys:_ | toBag xs == toBag ys && isSorted' ys &&
                                 A.size xs == A.size ys && token xs == token ys  } @-}
-cilkSort :: (Show a, Ord a) => A.Array a -> A.Array a
+#ifdef MUTABLE_ARRAYS
+cilkSort :: (Show a, HasPrimOrd a, NFData a) =>
+#else
+cilkSort :: (Show a, HasPrimOrd a) =>
+#endif
+  A.Array a -. A.Array a
 cilkSort src =
-  let (len, src') = A.size2 src in
+  let !(Ur len, src') = A.size2 src in
       if len == 0 then src'
-      else let (x0, src'') = A.get2 src' 0 in cilkSort' src'' x0
+      else let !(Ur x0, src'') = A.get2 0 src' in cilkSort' x0 src''