JuliaGPU · anicusan · Jul 15, 2025 · Jun 15, 2025 · Jul 12, 2025 · Jun 7, 2025
diff --git a/.github/workflows/CI-CPU.yml b/.github/workflows/CI-CPU.yml
@@ -63,6 +63,46 @@ jobs:
       - uses: julia-actions/julia-runtest@v1
         env:
           JULIA_NUM_THREADS: ${{ matrix.env.JULIA_NUM_THREADS }}
+  OpenCL:
+    name: OpenCL
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
+      actions: write
+      contents: read
+    strategy:
+      fail-fast: true
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: 1
+          arch: x64
+      - uses: julia-actions/cache@v2
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+        with:
+          test_args: '--OpenCL'
+  # cpuKA:
+  #   name: KA CPU Backend
+  #   runs-on: ubuntu-latest
+  #   timeout-minutes: 60
+  #   permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
+  #     actions: write
+  #     contents: read
+  #   strategy:
+  #     fail-fast: true
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - uses: julia-actions/setup-julia@v2
+  #       with:
+  #         version: 1
+  #         arch: x64
+  #     - uses: julia-actions/cache@v2
+  #     - uses: julia-actions/julia-buildpkg@v1
+  #     - uses: julia-actions/julia-runtest@v1
+  #       with:
+  #         test_args: '--cpuKA'
   docs:
     name: Documentation
     runs-on: ubuntu-latest

diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
@@ -12,7 +12,7 @@ module AcceleratedKernels
 
 # Internal dependencies
 using ArgCheck: @argcheck
-using GPUArraysCore: AbstractGPUArray, @allowscalar
+using GPUArraysCore: AnyGPUArray, @allowscalar
 using KernelAbstractions
 
 

diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl
@@ -167,24 +167,34 @@ function _accumulate_impl!(
     # CPU settings
     max_tasks::Int=Threads.nthreads(),
     min_elems::Int=2,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
     temp::Union{Nothing, AbstractArray}=nothing,
     temp_flags::Union{Nothing, AbstractArray}=nothing,
 )
     if isnothing(dims)
-        return accumulate_1d!(
-            op, v, backend, alg;
-            init, neutral, inclusive,
-            max_tasks, min_elems,
-            block_size, temp, temp_flags,
-        )
+        return if use_KA_algo(v, prefer_threads)
+            accumulate_1d_gpu!(
+                op, v, backend, alg;
+                init, neutral, inclusive,
+                max_tasks, min_elems,
+                block_size, temp, temp_flags,
+            )
+        else
+            accumulate_1d_cpu!(
+                op, v, backend, alg;
+                init, neutral, inclusive,
+                max_tasks, min_elems,
+                block_size, temp, temp_flags,
+            )
+        end
     else
         return accumulate_nd!(
             op, v, backend;
             init, neutral, dims, inclusive,
-            max_tasks, min_elems,
+            max_tasks, min_elems, prefer_threads,
             block_size,
         )
     end

diff --git a/src/accumulate/accumulate_1d_cpu.jl b/src/accumulate/accumulate_1d_cpu.jl
@@ -1,5 +1,5 @@
-function accumulate_1d!(
-    op, v::AbstractArray, backend::CPU, alg;
+function accumulate_1d_cpu!(
+    op, v::AbstractArray, backend::Backend, alg;
     init,
     neutral,
     inclusive::Bool,

diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl
@@ -248,8 +248,8 @@ end
 
 
 # DecoupledLookback algorithm
-function accumulate_1d!(
-    op, v::AbstractArray, backend::GPU, ::DecoupledLookback;
+function accumulate_1d_gpu!(
+    op, v::AbstractArray, backend::Backend, ::DecoupledLookback;
     init,
     neutral,
     inclusive::Bool,
@@ -307,8 +307,8 @@ end
 
 
 # ScanPrefixes algorithm
-function accumulate_1d!(
-    op, v::AbstractArray, backend::GPU, ::ScanPrefixes;
+function accumulate_1d_gpu!(
+    op, v::AbstractArray, backend::Backend, ::ScanPrefixes;
     init,
     neutral,
     inclusive::Bool,

diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl
@@ -8,6 +8,7 @@ function accumulate_nd!(
     # CPU settings
     max_tasks::Int,
     min_elems::Int,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int,
@@ -34,7 +35,7 @@ function accumulate_nd!(
 
     # Degenerate cases end
 
-    if backend isa CPU
+    if !use_KA_algo(v, prefer_threads)
         _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
     else
         # On GPUs we have two parallelisation approaches, based on which dimension has more elements:

diff --git a/src/foreachindex.jl b/src/foreachindex.jl
@@ -15,7 +15,7 @@ end
 function _forindices_gpu(
     f,
     indices,
-    backend::GPU;
+    backend::Backend;
 
     block_size::Int=256,
 )
@@ -125,11 +125,12 @@ function foreachindex(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size=256,
 )
-    if backend isa GPU
+    if use_KA_algo(itr, prefer_threads)
         _forindices_gpu(f, eachindex(itr), backend; block_size)
     else
         _forindices_threads(f, eachindex(itr); max_tasks, min_elems)
@@ -218,6 +219,7 @@ function foraxes(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size=256,
@@ -226,11 +228,11 @@ function foraxes(
         return foreachindex(
             f, itr, backend;
             max_tasks, min_elems,
-            block_size,
+            prefer_threads, block_size,
         )
     end
 
-    if backend isa GPU
+    if use_KA_algo(itr, prefer_threads)
         _forindices_gpu(f, axes(itr, dims), backend; block_size)
     else
         _forindices_threads(f, axes(itr, dims); max_tasks, min_elems)

diff --git a/src/map.jl b/src/map.jl
@@ -33,19 +33,12 @@ end
 """
 function map!(
     f, dst::AbstractArray, src::AbstractArray, backend::Backend=get_backend(src);
-
-    # CPU settings
-    max_tasks=Threads.nthreads(),
-    min_elems=1,
-
-    # GPU settings
-    block_size=256,
+    kwargs...
 )
     @argcheck length(dst) == length(src)
     foreachindex(
         src, backend;
-        max_tasks, min_elems,
-        block_size,
+        kwargs...
     ) do idx
         dst[idx] = f(src[idx])
     end

diff --git a/src/predicates.jl b/src/predicates.jl
@@ -114,11 +114,12 @@ function _any_impl(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         @argcheck block_size > 0
 
         # Some platforms crash when multiple threads write to the same memory location in a global
@@ -137,7 +138,8 @@ function _any_impl(
                 backend;
                 init=false,
                 neutral=false,
-                block_size=block_size,
+                prefer_threads=false,
+                block_size,
                 temp=alg.temp,
                 switch_below=alg.switch_below,
             )
@@ -246,11 +248,12 @@ function _all_impl(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         @argcheck block_size > 0
 
         # Some platforms crash when multiple threads write to the same memory location in a global
@@ -269,7 +272,8 @@ function _all_impl(
                 backend;
                 init=true,
                 neutral=true,
-                block_size=block_size,
+                prefer_threads=false,
+                block_size,
                 temp=alg.temp,
                 switch_below=alg.switch_below,
             )

diff --git a/src/reduce/mapreduce_1d_cpu.jl b/src/reduce/mapreduce_1d_cpu.jl
@@ -1,5 +1,5 @@
-function mapreduce_1d(
-    f, op, src::AbstractArray, backend::CPU;
+function mapreduce_1d_cpu(
+    f, op, src::AbstractArray, backend::Backend;
     init,
     neutral,
 

diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl
@@ -99,8 +99,8 @@
 end
 
 
-function mapreduce_1d(
-    f, op, src::AbstractArray, backend::GPU;
+function mapreduce_1d_gpu(
+    f, op, src::AbstractArray, backend::Backend;
     init,
     neutral,
 

diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl
@@ -7,6 +7,7 @@ function mapreduce_nd(
     # CPU settings - ignored here
     max_tasks::Int,
     min_elems::Int,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int,
@@ -113,7 +114,7 @@ function mapreduce_nd(
     end
     dst_size = length(dst)
 
-    if backend isa CPU
+    if !use_KA_algo(src, prefer_threads)
         _mapreduce_nd_cpu_sections!(
             f, op, dst, src;
             init,

diff --git a/src/reduce/reduce.jl b/src/reduce/reduce.jl
@@ -175,25 +175,36 @@ function _mapreduce_impl(
     # CPU settings
     max_tasks::Int=Threads.nthreads(),
     min_elems::Int=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
     temp::Union{Nothing, AbstractArray}=nothing,
     switch_below::Int=0,
 )
     if isnothing(dims)
-        return mapreduce_1d(
-            f, op, src, backend;
-            init, neutral,
-            max_tasks, min_elems,
-            block_size, temp,
-            switch_below
-        )
+        return if use_KA_algo(src, prefer_threads)
+            mapreduce_1d_gpu(
+                f, op, src, backend;
+                init, neutral,
+                max_tasks, min_elems,
+                block_size, temp,
+                switch_below
+            )
+        else
+            mapreduce_1d_cpu(
+                f, op, src, backend;
+                init, neutral,
+                max_tasks, min_elems,
+                block_size, temp,
+                switch_below
+            )
+        end
     else
         return mapreduce_nd(
             f, op, src, backend;
-            init, neutral,
-            dims, max_tasks=max_tasks,
+            init, neutral, dims,
+            max_tasks, prefer_threads,
             min_elems, block_size,
             temp,
         )

diff --git a/src/sort/merge_sort.jl b/src/sort/merge_sort.jl
@@ -125,27 +125,27 @@ end
 
 """
     merge_sort!(
-        v::AbstractGPUArray, backend::Backend=get_backend(v);
+        v::AbstractArray, backend::Backend=get_backend(v);
 
         lt=isless,
         by=identity,
         rev::Union{Nothing, Bool}=nothing,
         order::Base.Order.Ordering=Base.Order.Forward,
 
         block_size::Int=256,
-        temp::Union{Nothing, AbstractGPUArray}=nothing,
+        temp::Union{Nothing, AbstractArray}=nothing,
     )
 """
 function merge_sort!(
-    v::AbstractGPUArray, backend::Backend=get_backend(v);
+    v::AbstractArray, backend::Backend=get_backend(v);
 
     lt=isless,
     by=identity,
     rev::Union{Nothing, Bool}=nothing,
     order::Base.Order.Ordering=Base.Order.Forward,
 
     block_size::Int=256,
-    temp::Union{Nothing, AbstractGPUArray}=nothing,
+    temp::Union{Nothing, AbstractArray}=nothing,
 )
     # Simple sanity checks
     @argcheck block_size > 0
@@ -195,19 +195,19 @@ end
 
 """
     merge_sort(
-        v::AbstractGPUArray, backend::Backend=get_backend(v);
+        v::AbstractArray, backend::Backend=get_backend(v);
 
         lt=isless,
         by=identity,
         rev::Union{Nothing, Bool}=nothing,
         order::Base.Order.Ordering=Base.Order.Forward,
 
         block_size::Int=256,
-        temp::Union{Nothing, AbstractGPUArray}=nothing,
+        temp::Union{Nothing, AbstractArray}=nothing,
     )
 """
 function merge_sort(
-    v::AbstractGPUArray, backend::Backend=get_backend(v);
+    v::AbstractArray, backend::Backend=get_backend(v);
     kwargs...
 )
     v_copy = copy(v)