3 changes: 3 additions & 0 deletions .gitignore
@@ -25,3 +25,6 @@ Manifest.toml

# Local environment files
.vscode/settings.json

# Profile files
profile.pb.gz
8 changes: 2 additions & 6 deletions Project.toml
@@ -1,15 +1,13 @@
name = "AcceleratedKernels"
uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
authors = ["Andrei-Leonard Nicusan <[email protected]> and contributors"]
version = "0.3.4"
version = "0.4.0"

[deps]
ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"

[weakdeps]
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
@@ -25,7 +23,5 @@ GPUArraysCore = "0.2.0"
KernelAbstractions = "0.9.34"
Markdown = "1"
Metal = "1"
OhMyThreads = "0.7, 0.8"
Polyester = "0.7"
julia = "1.10"
oneAPI = "1, 2"
julia = "1.10"
44 changes: 43 additions & 1 deletion README.md
@@ -190,6 +190,44 @@ Julia v1.11
## 1. What's Different?
As far as I am aware, this is the first cross-architecture parallel standard library *from a unified codebase* - that is, the code is written as [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) backend-agnostic kernels, which are then **transpiled** to each GPU backend; that means we benefit from all the optimisations available on the native platforms and official compiler stacks. For example, unlike open standards like OpenCL, which require GPU vendors to implement the API for their hardware, we target the existing official compilers. And while performance-portability libraries like [Kokkos](https://github.com/kokkos/kokkos) and [RAJA](https://github.com/LLNL/RAJA) are powerful for large C++ codebases, they require US National Lab-level development and maintenance efforts to forward calls from a single API to separately-developed OpenMP, CUDA Thrust, ROCm rocThrust and oneAPI DPC++ libraries.
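To make "backend-agnostic kernel" concrete, here is a minimal [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) sketch (an editorial illustration, not code from this repository): the same kernel source compiles for the CPU and, with the corresponding package loaded, for CUDA / ROCm / oneAPI / Metal devices.

```julia
using KernelAbstractions

# One kernel source, compiled natively for whichever backend owns the arrays
@kernel function copy_kernel!(dst, @Const(src))
    i = @index(Global)
    @inbounds dst[i] = src[i]
end

src = rand(Float32, 1000)
dst = similar(src)
backend = get_backend(src)                          # CPU() for a Base Array
copy_kernel!(backend, 64)(dst, src; ndrange=length(dst))
KernelAbstractions.synchronize(backend)
@assert dst == src
```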

As a simple example, this is how a normal Julia `for`-loop can be converted to an accelerated kernel - for both multithreaded CPUs and Nvidia / AMD / Intel / Apple GPUs, **with native performance** - by changing a single line:

<table>
<tr>
<td> CPU Code </td> <td> Multithreaded / GPU code </td>
</tr>

<tr>
<td>

```julia
# Copy kernel testing throughput

function cpu_copy!(dst, src)
    for i in eachindex(src)
        dst[i] = src[i]
    end
end
```

</td>
<td>

```julia
import AcceleratedKernels as AK

function ak_copy!(dst, src)
    AK.foreachindex(src) do i
        dst[i] = src[i]
    end
end
```

</td>
</tr>
</table>
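As a short usage sketch (editorial, not from the repository - the Metal lines assume an Apple GPU with [Metal.jl](https://github.com/JuliaGPU/Metal.jl) installed):

```julia
import AcceleratedKernels as AK

src = rand(Float32, 1_000_000)
dst = similar(src)
ak_copy!(dst, src)              # multithreaded on the host CPU (given julia --threads=N)
@assert dst == src

# With a GPU backend loaded, the very same function runs on the device:
# using Metal
# src_d = MtlArray(src)
# dst_d = similar(src_d)
# ak_copy!(dst_d, src_d)        # compiled to a native Metal kernel
```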


Again, this is only possible because of the unique Julia compilation model, the [JuliaGPU](https://juliagpu.org/) organisation's work on reusable GPU backend infrastructure, and especially the [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) backend-agnostic kernel language. Thank you.


@@ -299,6 +337,11 @@ Leave out to test the CPU backend:
$> julia -e 'import Pkg; Pkg.test("AcceleratedKernels")'
```

Start Julia with multiple threads to run the tests on a multithreaded CPU backend:
```bash
$> julia --threads=4 -e 'import Pkg; Pkg.test("AcceleratedKernels")'
```


## 8. Issues and Debugging
As the compilation pipeline of GPU kernels is different from that of base Julia, error messages also look different - for example, where Julia would throw an exception when a variable name is not defined (e.g. we had a typo), a GPU kernel throwing exceptions cannot be compiled at all; instead you'll see cascading errors like `"[...] compiling [...] resulted in invalid LLVM IR"` caused by `"Reason: unsupported use of an undefined name"`, resulting in `"Reason: unsupported dynamic function invocation"`, etc.
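As an illustration (an editorial sketch, not code from the repository), a kernel like the following throws a plain `UndefVarError` when run on the CPU, but fails GPU compilation with exactly this kind of cascading `invalid LLVM IR` message:

```julia
import AcceleratedKernels as AK

function buggy_scale!(dst, src, factor)
    AK.foreachindex(src) do i
        # `factr` is a deliberate typo for `factor`: the CPU path raises a
        # clear UndefVarError at runtime, while GPU backends report
        # "unsupported use of an undefined name" while compiling the kernel
        dst[i] = factr * src[i]
    end
end
```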
@@ -322,7 +365,6 @@ Help is very welcome for any of the below:
switch_below=(1, 10, 100, 1000, 10000)
end
```
- We need multithreaded implementations of `sort`, N-dimensional `mapreduce` (in `OhMyThreads.tmapreduce`) and `accumulate` (again, probably in `OhMyThreads`).
- Any way to expose the warp-size from the backends? Would be useful in reductions.
- Add a performance regressions runner.
- **Other ideas?** Post an issue, or open a discussion on the Julia Discourse.
4 changes: 1 addition & 3 deletions docs/src/api/using_backends.md
@@ -30,6 +30,4 @@ v = Vector(-1000:1000) # Normal CPU array
AK.reduce(+, v, max_tasks=Threads.nthreads())
```

Note the `reduce` and `mapreduce` CPU implementations forward arguments to [OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl), an excellent package for multithreading. The focus of AcceleratedKernels.jl is to provide a unified interface to high-performance implementations of common algorithmic kernels, for both CPUs and GPUs - if you need fine-grained control over threads, scheduling, communication for specialised algorithms (e.g. with highly unequal workloads), consider using [OhMyThreads.jl](https://github.com/JuliaFolds2/OhMyThreads.jl) or [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) directly.

There is ongoing work on multithreaded CPU `sort` and `accumulate` implementations - at the moment, they fall back to single-threaded algorithms; the rest of the library is fully parallelised for both CPUs and GPUs.
By default all algorithms use the number of threads Julia was started with.
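A hedged sketch of the CPU thread-control keywords (editorial; `init` is the operator's neutral element, and `max_tasks` / `min_elems` are the keywords shown above):

```julia
import AcceleratedKernels as AK

v = rand(10_000_000)

# Cap parallelism at 4 tasks, and only spawn extra tasks once there are at
# least 100_000 elements each - small inputs stay single-threaded, avoiding
# task-spawning overhead
s = AK.reduce(+, v; init=0.0, max_tasks=4, min_elems=100_000)
```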
16 changes: 16 additions & 0 deletions ext/AcceleratedKernelsMetalExt.jl
@@ -14,6 +14,10 @@ function AK.accumulate!(
    dims::Union{Nothing, Int}=nothing,
    inclusive::Bool=true,

    # CPU settings - not used
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,

    # Algorithm choice
    alg::AK.AccumulateAlgorithm=AK.ScanPrefixes(),

@@ -39,6 +43,10 @@ function AK.accumulate!(
    dims::Union{Nothing, Int}=nothing,
    inclusive::Bool=true,

    # CPU settings - not used
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,

    # Algorithm choice
    alg::AK.AccumulateAlgorithm=AK.ScanPrefixes(),

@@ -63,6 +71,10 @@ function AK.cumsum(
    neutral=zero(eltype(src)),
    dims::Union{Nothing, Int}=nothing,

    # CPU settings - not used
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,

    # Algorithm choice
    alg::AK.AccumulateAlgorithm=AK.ScanPrefixes(),

@@ -93,6 +105,10 @@ function AK.cumprod(
    neutral=one(eltype(src)),
    dims::Union{Nothing, Int}=nothing,

    # CPU settings - not used
    max_tasks::Int=Threads.nthreads(),
    min_elems::Int=1,

    # Algorithm choice
    alg::AK.AccumulateAlgorithm=AK.ScanPrefixes(),

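These otherwise-unused CPU keywords let generic code pass the same settings regardless of backend. A hedged sketch (editorial; it assumes the `init` keyword of the base `AK.accumulate!` API, and `scan_rows!` is a hypothetical helper name):

```julia
import AcceleratedKernels as AK

# The same call works for a CPU Array and a Metal MtlArray, because the
# Metal extension accepts - and simply ignores - the CPU-only keywords
function scan_rows!(v)
    AK.accumulate!(+, v; init=zero(eltype(v)), dims=2,
                   max_tasks=Threads.nthreads(), min_elems=1_000)
end
```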
3 changes: 3 additions & 0 deletions prototype/parallel_accumulate/Project.toml
@@ -0,0 +1,3 @@
[deps]
AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
30 changes: 30 additions & 0 deletions prototype/parallel_accumulate/benchmark.jl
@@ -0,0 +1,30 @@

import AcceleratedKernels as AK
using BenchmarkTools



v = rand(1_000_000)
init = eltype(v)(0)

r1 = Base.accumulate(+, v; init=init)
r2 = AK.accumulate(+, v; init=init)

@assert r1 == r2


v = rand(1_000_000)
init = eltype(v)(0)

println("1D Benchmark - Base vs. AK")
# Interpolate globals with $ so BenchmarkTools times the call, not global access
display(@benchmark Base.accumulate(+, $v; init=$init))
display(@benchmark AK.accumulate(+, $v; init=$init))


v = rand(100, 100, 100)
init = eltype(v)(0)

println("3D Benchmark - Base vs. AK")
display(@benchmark Base.accumulate(+, $v; init=$init, dims=2))
display(@benchmark AK.accumulate(+, $v; init=$init, dims=2))

3 changes: 3 additions & 0 deletions prototype/parallel_mapreduce/Project.toml
@@ -0,0 +1,3 @@
[deps]
AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
24 changes: 24 additions & 0 deletions prototype/parallel_mapreduce/benchmark.jl
@@ -0,0 +1,24 @@

import AcceleratedKernels as AK
using BenchmarkTools


v = rand(1_000_000)
f(x) = x^2
op(x, y) = x + y
init = eltype(v)(0)

println("1D Benchmark - Base vs. AK")
# Interpolate globals with $ so BenchmarkTools times the call, not global access
display(@benchmark Base.mapreduce(f, op, $v; init=$init))
display(@benchmark AK.mapreduce(f, op, $v; init=$init, neutral=$init))


v = rand(100, 100, 100)
f(x) = x^2
op(x, y) = x + y
init = eltype(v)(0)

println("3D Benchmark - Base vs. AK")
display(@benchmark Base.mapreduce(f, op, $v; init=$init, dims=2))
display(@benchmark AK.mapreduce(f, op, $v; init=$init, neutral=$init, dims=2))

38 changes: 38 additions & 0 deletions prototype/parallel_mapreduce/mapreduce_vs_omt.jl
@@ -0,0 +1,38 @@

import AcceleratedKernels as AK
import OhMyThreads as OMT
using BenchmarkTools


# Turns out we have the same performance as tmapreduce with just AK base threading
function mapreduce_omt(f, op, v; init)
    # MapReduce using OhMyThreads
    return OMT.tmapreduce(f, op, v; init=init)
end


function mapreduce_ak(f, op, v; init, max_tasks=Threads.nthreads())
    # MapReduce using AcceleratedKernels
    if max_tasks == 1
        return Base.mapreduce(f, op, v; init=init)
    end

    shared = Vector{typeof(init)}(undef, max_tasks)
    AK.itask_partition(length(v), max_tasks) do itask, irange
        @inbounds begin
            shared[itask] = Base.mapreduce(f, op, @view(v[irange]); init=init)
        end
    end
    return Base.reduce(op, shared; init=init)
end


v = rand(1_000_000)
f(x) = x^2
op(x, y) = x + y
init = eltype(v)(0)

@assert mapreduce_omt(f, op, v; init=init) == mapreduce_ak(f, op, v; init=init)

# Interpolate globals with $ so BenchmarkTools times the call, not global access
display(@benchmark mapreduce_omt(f, op, $v; init=$init))
display(@benchmark mapreduce_ak(f, op, $v; init=$init))
10 changes: 10 additions & 0 deletions prototype/parallel_sample_sort/ak_test.jl
@@ -10,3 +10,13 @@ AK.sort!(v)
v = rand(1:100, 1_000_000)
ix = AK.sortperm(v)
@assert issorted(v[ix])


for _ in 1:1000
    num_elems = rand(1:100_000)
    v = array_from_host(rand(Int32, num_elems))
    AK.sample_sort!(v)
    vh = Array(v)
    @assert issorted(vh)
end

45 changes: 45 additions & 0 deletions prototype/parallel_sample_sort/benchmark.jl
@@ -0,0 +1,45 @@

import AcceleratedKernels as AK
using BenchmarkTools

using Profile
using PProf

using Random
Random.seed!(0)


# Compile
v = rand(1_000_000)
AK.sort!(v)


# Collect a profile
Profile.clear()
# v = rand(1_000_000)
# @profile AK.sort!(v)

v = rand(UInt32(0):UInt32(1_000_000), 1_000_000)
ix = Vector{Int}(undef, 1_000_000)
@profile AK.sortperm!(ix, v)
pprof()


println("\nBase vs AK sort (Int):")
display(@benchmark Base.sort!(v) setup=(v = rand(1:1_000_000, 1_000_000)))
display(@benchmark AK.sort!(v) setup=(v = rand(1:1_000_000, 1_000_000)))


println("\nBase vs AK sort (Float64):")
display(@benchmark Base.sort!(v) setup=(v = rand(Float64, 1_000_000)))
display(@benchmark AK.sort!(v) setup=(v = rand(Float64, 1_000_000)))


println("\nBase vs AK sortperm (UInt32):")
display(@benchmark Base.sortperm!(ix, v) setup=(v = rand(UInt32(0):UInt32(1_000_000), 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
display(@benchmark AK.sortperm!(ix, v) setup=(v = rand(UInt32(0):UInt32(1_000_000), 1_000_000); ix = Vector{Int}(undef, 1_000_000)))


println("\nBase vs AK sortperm (Float64):")
display(@benchmark Base.sortperm!(ix, v) setup=(v = rand(Float64, 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
display(@benchmark AK.sortperm!(ix, v) setup=(v = rand(Float64, 1_000_000); ix = Vector{Int}(undef, 1_000_000)))
4 changes: 1 addition & 3 deletions src/AcceleratedKernels.jl
@@ -12,10 +12,8 @@ module AcceleratedKernels

# Internal dependencies
using ArgCheck: @argcheck
using GPUArraysCore: AbstractGPUVector, AbstractGPUArray, @allowscalar
using GPUArraysCore: AbstractGPUArray, @allowscalar
using KernelAbstractions
using Polyester: @batch
import OhMyThreads as OMT


# Exposed functions from upstream packages