diff --git a/.github/workflows/CI-CPU.yml b/.github/workflows/CI-CPU.yml
index a9848a3..298d02c 100644
--- a/.github/workflows/CI-CPU.yml
+++ b/.github/workflows/CI-CPU.yml
@@ -63,6 +63,46 @@ jobs:
       - uses: julia-actions/julia-runtest@v1
         env:
           JULIA_NUM_THREADS: ${{ matrix.env.JULIA_NUM_THREADS }}
+  OpenCL:
+    name: OpenCL
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
+      actions: write
+      contents: read
+    strategy:
+      fail-fast: true
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: 1
+          arch: x64
+      - uses: julia-actions/cache@v2
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+        with:
+          test_args: '--OpenCL'
+  # cpuKA:
+  #   name: KA CPU Backend
+  #   runs-on: ubuntu-latest
+  #   timeout-minutes: 60
+  #   permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
+  #     actions: write
+  #     contents: read
+  #   strategy:
+  #     fail-fast: true
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - uses: julia-actions/setup-julia@v2
+  #       with:
+  #         version: 1
+  #         arch: x64
+  #     - uses: julia-actions/cache@v2
+  #     - uses: julia-actions/julia-buildpkg@v1
+  #     - uses: julia-actions/julia-runtest@v1
+  #       with:
+  #         test_args: '--cpuKA'
   docs:
     name: Documentation
     runs-on: ubuntu-latest
diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index f97c9d8..a9de68e 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -12,7 +12,7 @@ module AcceleratedKernels
 
 # Internal dependencies
 using ArgCheck: @argcheck
-using GPUArraysCore: AbstractGPUArray, @allowscalar
+using GPUArraysCore: AnyGPUArray, @allowscalar
 using KernelAbstractions
 
 
diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl
index 3179b2a..0aff1bf 100644
--- a/src/accumulate/accumulate.jl
+++ b/src/accumulate/accumulate.jl
@@ -167,6 +167,7 @@ function _accumulate_impl!(
     # CPU settings
     max_tasks::Int=Threads.nthreads(),
     min_elems::Int=2,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
@@ -174,17 +175,26 @@ function _accumulate_impl!(
     temp_flags::Union{Nothing, AbstractArray}=nothing,
 )
     if isnothing(dims)
-        return accumulate_1d!(
-            op, v, backend, alg;
-            init, neutral, inclusive,
-            max_tasks, min_elems,
-            block_size, temp, temp_flags,
-        )
+        return if use_KA_algo(v, prefer_threads)
+            accumulate_1d_gpu!(
+                op, v, backend, alg;
+                init, neutral, inclusive,
+                max_tasks, min_elems,
+                block_size, temp, temp_flags,
+            )
+        else
+            accumulate_1d_cpu!(
+                op, v, backend, alg;
+                init, neutral, inclusive,
+                max_tasks, min_elems,
+                block_size, temp, temp_flags,
+            )
+        end
     else
         return accumulate_nd!(
             op, v, backend;
             init, neutral, dims, inclusive,
-            max_tasks, min_elems,
+            max_tasks, min_elems, prefer_threads,
             block_size,
         )
     end
diff --git a/src/accumulate/accumulate_1d_cpu.jl b/src/accumulate/accumulate_1d_cpu.jl
index ce03c0c..9f45ada 100644
--- a/src/accumulate/accumulate_1d_cpu.jl
+++ b/src/accumulate/accumulate_1d_cpu.jl
@@ -1,5 +1,5 @@
-function accumulate_1d!(
-    op, v::AbstractArray, backend::CPU, alg;
+function accumulate_1d_cpu!(
+    op, v::AbstractArray, backend::Backend, alg;
     init,
     neutral,
     inclusive::Bool,
diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl
index be3ee59..f0ca135 100644
--- a/src/accumulate/accumulate_1d_gpu.jl
+++ b/src/accumulate/accumulate_1d_gpu.jl
@@ -248,8 +248,8 @@ end
 
 
 # DecoupledLookback algorithm
-function accumulate_1d!(
-    op, v::AbstractArray, backend::GPU, ::DecoupledLookback;
+function accumulate_1d_gpu!(
+    op, v::AbstractArray, backend::Backend, ::DecoupledLookback;
     init,
     neutral,
     inclusive::Bool,
@@ -307,8 +307,8 @@ end
 
 
 # ScanPrefixes algorithm
-function accumulate_1d!(
-    op, v::AbstractArray, backend::GPU, ::ScanPrefixes;
+function accumulate_1d_gpu!(
+    op, v::AbstractArray, backend::Backend, ::ScanPrefixes;
     init,
     neutral,
     inclusive::Bool,
diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl
index aeb08ca..5e213c2 100644
--- a/src/accumulate/accumulate_nd.jl
+++ b/src/accumulate/accumulate_nd.jl
@@ -8,6 +8,7 @@ function accumulate_nd!(
     # CPU settings
     max_tasks::Int,
     min_elems::Int,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int,
@@ -34,7 +35,7 @@ function accumulate_nd!(
 
     # Degenerate cases end
 
-    if backend isa CPU
+    if !use_KA_algo(v, prefer_threads)
         _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
     else
         # On GPUs we have two parallelisation approaches, based on which dimension has more elements:
diff --git a/src/foreachindex.jl b/src/foreachindex.jl
index 2cf68a6..24b9d78 100644
--- a/src/foreachindex.jl
+++ b/src/foreachindex.jl
@@ -15,7 +15,7 @@ end
 function _forindices_gpu(
     f,
     indices,
-    backend::GPU;
+    backend::Backend;
 
     block_size::Int=256,
 )
@@ -125,11 +125,12 @@ function foreachindex(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size=256,
 )
-    if backend isa GPU
+    if use_KA_algo(itr, prefer_threads)
         _forindices_gpu(f, eachindex(itr), backend; block_size)
     else
         _forindices_threads(f, eachindex(itr); max_tasks, min_elems)
@@ -218,6 +219,7 @@ function foraxes(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size=256,
@@ -226,11 +228,11 @@ function foraxes(
         return foreachindex(
             f, itr, backend;
             max_tasks, min_elems,
-            block_size,
+            prefer_threads, block_size,
         )
     end
 
-    if backend isa GPU
+    if use_KA_algo(itr, prefer_threads)
         _forindices_gpu(f, axes(itr, dims), backend; block_size)
     else
         _forindices_threads(f, axes(itr, dims); max_tasks, min_elems)
diff --git a/src/map.jl b/src/map.jl
index 6219f0a..10d690d 100644
--- a/src/map.jl
+++ b/src/map.jl
@@ -33,19 +33,12 @@ end
 """
 function map!(
     f, dst::AbstractArray, src::AbstractArray, backend::Backend=get_backend(src);
-
-    # CPU settings
-    max_tasks=Threads.nthreads(),
-    min_elems=1,
-
-    # GPU settings
-    block_size=256,
+    kwargs...
 )
     @argcheck length(dst) == length(src)
     foreachindex(
         src, backend;
-        max_tasks, min_elems,
-        block_size,
+        kwargs...
     ) do idx
         dst[idx] = f(src[idx])
     end
diff --git a/src/predicates.jl b/src/predicates.jl
index 19c8e5c..1f0a0dd 100644
--- a/src/predicates.jl
+++ b/src/predicates.jl
@@ -114,11 +114,12 @@ function _any_impl(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         @argcheck block_size > 0
 
         # Some platforms crash when multiple threads write to the same memory location in a global
@@ -137,7 +138,8 @@ function _any_impl(
                 backend;
                 init=false,
                 neutral=false,
-                block_size=block_size,
+                prefer_threads=false,  # already on the KA path here; keep the reduction on it
+                block_size,
                 temp=alg.temp,
                 switch_below=alg.switch_below,
             )
@@ -246,11 +248,12 @@ function _all_impl(
     # CPU settings
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         @argcheck block_size > 0
 
         # Some platforms crash when multiple threads write to the same memory location in a global
@@ -269,7 +272,8 @@ function _all_impl(
                 backend;
                 init=true,
                 neutral=true,
-                block_size=block_size,
+                prefer_threads=false,
+                block_size,
                 temp=alg.temp,
                 switch_below=alg.switch_below,
             )
diff --git a/src/reduce/mapreduce_1d_cpu.jl b/src/reduce/mapreduce_1d_cpu.jl
index 544c3a7..95a93f2 100644
--- a/src/reduce/mapreduce_1d_cpu.jl
+++ b/src/reduce/mapreduce_1d_cpu.jl
@@ -1,5 +1,5 @@
-function mapreduce_1d(
-    f, op, src::AbstractArray, backend::CPU;
+function mapreduce_1d_cpu(
+    f, op, src::AbstractArray, backend::Backend;
     init,
     neutral,
 
diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl
index b2b9566..c1e31cc 100644
--- a/src/reduce/mapreduce_1d_gpu.jl
+++ b/src/reduce/mapreduce_1d_gpu.jl
@@ -99,8 +99,8 @@
 end
 
 
-function mapreduce_1d(
-    f, op, src::AbstractArray, backend::GPU;
+function mapreduce_1d_gpu(
+    f, op, src::AbstractArray, backend::Backend;
     init,
     neutral,
 
diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl
index a373a71..cf7d825 100644
--- a/src/reduce/mapreduce_nd.jl
+++ b/src/reduce/mapreduce_nd.jl
@@ -7,6 +7,7 @@ function mapreduce_nd(
     # CPU settings - ignored here
     max_tasks::Int,
     min_elems::Int,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int,
@@ -113,7 +114,7 @@ function mapreduce_nd(
     end
     dst_size = length(dst)
 
-    if backend isa CPU
+    if !use_KA_algo(src, prefer_threads)
         _mapreduce_nd_cpu_sections!(
             f, op, dst, src;
             init,
diff --git a/src/reduce/reduce.jl b/src/reduce/reduce.jl
index 0a5781d..0332531 100644
--- a/src/reduce/reduce.jl
+++ b/src/reduce/reduce.jl
@@ -175,6 +175,7 @@ function _mapreduce_impl(
     # CPU settings
     max_tasks::Int=Threads.nthreads(),
     min_elems::Int=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
@@ -182,18 +183,28 @@ function _mapreduce_impl(
     switch_below::Int=0,
 )
     if isnothing(dims)
-        return mapreduce_1d(
-            f, op, src, backend;
-            init, neutral,
-            max_tasks, min_elems,
-            block_size, temp,
-            switch_below
-        )
+        if use_KA_algo(src, prefer_threads)
+            mapreduce_1d_gpu(
+                f, op, src, backend;
+                init, neutral,
+                max_tasks, min_elems,
+                block_size, temp,
+                switch_below
+            )
+        else
+            mapreduce_1d_cpu(
+                f, op, src, backend;
+                init, neutral,
+                max_tasks, min_elems,
+                block_size, temp,
+                switch_below
+            )
+        end
     else
         return mapreduce_nd(
             f, op, src, backend;
-            init, neutral,
-            dims, max_tasks=max_tasks,
+            init, neutral, dims,
+            max_tasks, prefer_threads,
             min_elems, block_size,
             temp,
         )
diff --git a/src/sort/merge_sort.jl b/src/sort/merge_sort.jl
index 00ea8fc..5fb7b20 100644
--- a/src/sort/merge_sort.jl
+++ b/src/sort/merge_sort.jl
@@ -125,7 +125,7 @@ end
 
 """
     merge_sort!(
-        v::AbstractGPUArray, backend::Backend=get_backend(v);
+        v::AbstractArray, backend::Backend=get_backend(v);
 
         lt=isless,
         by=identity,
@@ -133,11 +133,11 @@ end
         order::Base.Order.Ordering=Base.Order.Forward,
 
         block_size::Int=256,
-        temp::Union{Nothing, AbstractGPUArray}=nothing,
+        temp::Union{Nothing, AbstractArray}=nothing,
     )
 """
 function merge_sort!(
-    v::AbstractGPUArray, backend::Backend=get_backend(v);
+    v::AbstractArray, backend::Backend=get_backend(v);
 
     lt=isless,
     by=identity,
@@ -145,7 +145,7 @@ function merge_sort!(
     order::Base.Order.Ordering=Base.Order.Forward,
 
     block_size::Int=256,
-    temp::Union{Nothing, AbstractGPUArray}=nothing,
+    temp::Union{Nothing, AbstractArray}=nothing,
 )
     # Simple sanity checks
     @argcheck block_size > 0
@@ -195,7 +195,7 @@ end
 
 """
     merge_sort(
-        v::AbstractGPUArray, backend::Backend=get_backend(v);
+        v::AbstractArray, backend::Backend=get_backend(v);
 
         lt=isless,
         by=identity,
@@ -203,11 +203,11 @@ end
         order::Base.Order.Ordering=Base.Order.Forward,
 
         block_size::Int=256,
-        temp::Union{Nothing, AbstractGPUArray}=nothing,
+        temp::Union{Nothing, AbstractArray}=nothing,
     )
 """
 function merge_sort(
-    v::AbstractGPUArray, backend::Backend=get_backend(v);
+    v::AbstractArray, backend::Backend=get_backend(v);
     kwargs...
 )
     v_copy = copy(v)
diff --git a/src/sort/merge_sort_by_key.jl b/src/sort/merge_sort_by_key.jl
index 8690910..f6de5f3 100644
--- a/src/sort/merge_sort_by_key.jl
+++ b/src/sort/merge_sort_by_key.jl
@@ -241,8 +241,8 @@ end
 
 """
     merge_sort_by_key(
-        keys::AbstractGPUArray,
-        values::AbstractGPUArray,
+        keys::AbstractArray,
+        values::AbstractArray,
         backend::Backend=get_backend(keys);
 
         lt=isless,
@@ -251,13 +251,13 @@ end
         order::Base.Order.Ordering=Base.Order.Forward,
 
         block_size::Int=256,
-        temp_keys::Union{Nothing, AbstractGPUArray}=nothing,
-        temp_values::Union{Nothing, AbstractGPUArray}=nothing,
+        temp_keys::Union{Nothing, AbstractArray}=nothing,
+        temp_values::Union{Nothing, AbstractArray}=nothing,
     )
 """
 function merge_sort_by_key(
-    keys::AbstractGPUArray,
-    values::AbstractGPUArray,
+    keys::AbstractArray,
+    values::AbstractArray,
     backend::Backend=get_backend(keys);
     kwargs...
 )
diff --git a/src/sort/merge_sortperm.jl b/src/sort/merge_sortperm.jl
index 3266cc9..6b97061 100644
--- a/src/sort/merge_sortperm.jl
+++ b/src/sort/merge_sortperm.jl
@@ -1,7 +1,7 @@
 """
     merge_sortperm!(
-        ix::AbstractGPUArray,
-        v::AbstractGPUArray,
+        ix::AbstractArray,
+        v::AbstractArray,
         backend::Backend=get_backend(v);
 
         lt=(<),
@@ -11,13 +11,13 @@
 
         inplace::Bool=false,
         block_size::Int=256,
-        temp_ix::Union{Nothing, AbstractGPUArray}=nothing,
-        temp_v::Union{Nothing, AbstractGPUArray}=nothing,
+        temp_ix::Union{Nothing, AbstractArray}=nothing,
+        temp_v::Union{Nothing, AbstractArray}=nothing,
     )
 """
 function merge_sortperm!(
-    ix::AbstractGPUArray,
-    v::AbstractGPUArray,
+    ix::AbstractArray,
+    v::AbstractArray,
     backend::Backend=get_backend(v);
 
     lt=(<),
@@ -27,8 +27,8 @@ function merge_sortperm!(
 
     inplace::Bool=false,
     block_size::Int=256,
-    temp_ix::Union{Nothing, AbstractGPUArray}=nothing,
-    temp_v::Union{Nothing, AbstractGPUArray}=nothing,
+    temp_ix::Union{Nothing, AbstractArray}=nothing,
+    temp_v::Union{Nothing, AbstractArray}=nothing,
 )
     # Simple sanity checks
     @argcheck block_size > 0
@@ -61,7 +61,7 @@ end
 
 """
     merge_sortperm(
-        v::AbstractGPUArray, backend::Backend=get_backend(v);
+        v::AbstractArray, backend::Backend=get_backend(v);
 
         lt=(<),
         by=identity,
@@ -70,12 +70,12 @@ end
 
         inplace::Bool=false,
         block_size::Int=256,
-        temp_ix::Union{Nothing, AbstractGPUArray}=nothing,
-        temp_v::Union{Nothing, AbstractGPUArray}=nothing,
+        temp_ix::Union{Nothing, AbstractArray}=nothing,
+        temp_v::Union{Nothing, AbstractArray}=nothing,
     )
 """
 function merge_sortperm(
-    v::AbstractGPUArray, backend::Backend=get_backend(v);
+    v::AbstractArray, backend::Backend=get_backend(v);
     kwargs...
 )
     ix = similar(v, Int)
@@ -88,8 +88,8 @@ end
 
 """
     merge_sortperm_lowmem!(
-        ix::AbstractGPUArray,
-        v::AbstractGPUArray,
+        ix::AbstractArray,
+        v::AbstractArray,
         backend::Backend=get_backend(v);
 
         lt=(<),
@@ -98,12 +98,12 @@ end
         order::Base.Order.Ordering=Base.Order.Forward,
 
         block_size::Int=256,
-        temp::Union{Nothing, AbstractGPUArray}=nothing,
+        temp::Union{Nothing, AbstractArray}=nothing,
     )
 """
 function merge_sortperm_lowmem!(
-    ix::AbstractGPUArray,
-    v::AbstractGPUArray,
+    ix::AbstractArray,
+    v::AbstractArray,
     backend::Backend=get_backend(v);
 
     lt=(<),
@@ -112,7 +112,7 @@ function merge_sortperm_lowmem!(
     order::Base.Order.Ordering=Base.Order.Forward,
 
     block_size::Int=256,
-    temp::Union{Nothing, AbstractGPUArray}=nothing,
+    temp::Union{Nothing, AbstractArray}=nothing,
 )
     # Simple sanity checks
     @argcheck block_size > 0
@@ -168,7 +168,7 @@ end
 
 """
     merge_sortperm_lowmem(
-        v::AbstractGPUArray, backend::Backend=get_backend(v);
+        v::AbstractArray, backend::Backend=get_backend(v);
 
         lt=(<),
         by=identity,
@@ -176,11 +176,11 @@ end
         order::Base.Order.Ordering=Base.Order.Forward,
 
         block_size::Int=256,
-        temp::Union{Nothing, AbstractGPUArray}=nothing,
+        temp::Union{Nothing, AbstractArray}=nothing,
     )
 """
 function merge_sortperm_lowmem(
-    v::AbstractGPUArray, backend::Backend=get_backend(v);
+    v::AbstractArray, backend::Backend=get_backend(v);
     kwargs...
 )
     ix = similar(v, Int)
diff --git a/src/sort/sort.jl b/src/sort/sort.jl
index 5fbce2d..8e55e3a 100644
--- a/src/sort/sort.jl
+++ b/src/sort/sort.jl
@@ -88,6 +88,7 @@ function _sort_impl!(
 
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
@@ -95,7 +96,7 @@ function _sort_impl!(
     # Temporary buffer, same size as `v`
     temp::Union{Nothing, AbstractArray}=nothing,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         merge_sort!(
             v, backend;
             lt, by, rev, order,
@@ -198,6 +199,7 @@ function _sortperm_impl!(
 
     max_tasks=Threads.nthreads(),
     min_elems=1,
+    prefer_threads::Bool=true,
 
     # GPU settings
     block_size::Int=256,
@@ -205,7 +207,7 @@ function _sortperm_impl!(
     # Temporary buffer, same size as `v`
     temp::Union{Nothing, AbstractArray}=nothing,
 )
-    if backend isa GPU
+    if use_KA_algo(v, prefer_threads)
         merge_sortperm_lowmem!(
             ix, v, backend;
             lt, by, rev, order,
diff --git a/src/utils.jl b/src/utils.jl
index 31b6246..f601b44 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -2,6 +2,10 @@ function ispow2(x)
     count_ones(x) == 1
 end
 
+# Helper: true when the KernelAbstractions (KA) kernel path should be used, not CPU threads
+@inline function use_KA_algo(output_array, prefer_threads)
+    return output_array isa AnyGPUArray || !prefer_threads
+end
 
 """
     struct TypeWrap{T} end
diff --git a/test/accumulate.jl b/test/accumulate.jl
index 759277d..5022e38 100644
--- a/test/accumulate.jl
+++ b/test/accumulate.jl
@@ -10,7 +10,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()]
     for num_elems in 1:256
         x = array_from_host(ones(Int32, num_elems))
         y = copy(x)
-        AK.accumulate!(+, y; init=0, inclusive=false, block_size=128, alg)
+        AK.accumulate!(+, y; prefer_threads, init=0, inclusive=false, block_size=128, alg)
         yh = Array(y)
         @test all(yh .== 0:length(yh) - 1)
     end
@@ -19,7 +19,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()]
     for num_elems in 1:256
         x = array_from_host(rand(1:1000, num_elems), Int32)
         y = copy(x)
-        AK.accumulate!(+, y; init=0, block_size=128, alg)
+        AK.accumulate!(+, y; prefer_threads, init=0, block_size=128, alg)
         @test all(Array(y) .== accumulate(+, Array(x)))
     end
 
@@ -28,7 +28,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()]
         num_elems = rand(1:100_000)
         x = array_from_host(ones(Int32, num_elems))
         y = copy(x)
-        AK.accumulate!(+, y; init=0, inclusive=false, alg)
+        AK.accumulate!(+, y; prefer_threads, init=0, inclusive=false, alg)
         yh = Array(y)
         @test all(yh .== 0:length(yh) - 1)
     end
@@ -38,7 +38,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()]
         num_elems = rand(1:100_000)
         x = array_from_host(rand(1:1000, num_elems), Int32)
         y = copy(x)
-        AK.accumulate!(+, y; init=0, alg)
+        AK.accumulate!(+, y; prefer_threads, init=0, alg)
         @test all(Array(y) .== accumulate(+, Array(x)))
     end
 
@@ -47,7 +47,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()]
         num_elems = rand(1:100_000)
         x = array_from_host(rand(1:1000, num_elems), Int32)
         y = copy(x)
-        AK.accumulate!(+, y; init=0, block_size=16, alg)
+        AK.accumulate!(+, y; prefer_threads, init=0, block_size=16, alg)
         @test all(Array(y) .== accumulate(+, Array(x)))
     end
 
@@ -58,7 +58,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()]
         n3 = rand(1:100)
         vh = rand(Float32, n1, n2, n3)
         v = array_from_host(vh)
-        AK.accumulate!(+, v; init=0, alg)
+        AK.accumulate!(+, v; prefer_threads, init=0, alg)
         @test all(Array(v) .≈ accumulate(+, vh))
     end
 
@@ -68,33 +68,33 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()]
         x = array_from_host(rand(1:1000, num_elems), Int32)
         y = similar(x)
         init = rand(-1000:1000)
-        AK.accumulate!(+, y, x; init=Int32(init), alg)
+        AK.accumulate!(+, y, x; prefer_threads, init=Int32(init), alg)
         @test all(Array(y) .== accumulate(+, Array(x); init))
     end
 
     # Exclusive scan
     x = array_from_host(ones(Int32, 10))
     y = copy(x)
-    AK.accumulate!(+, y; init=0, inclusive=false, alg)
+    AK.accumulate!(+, y; prefer_threads, init=0, inclusive=false, alg)
     @test all(Array(y) .== 0:9)
 
     # Test init value is respected with exclusive scan too
     x = array_from_host(ones(Int32, 10))
     y = copy(x)
     init = 10
-    AK.accumulate!(+, y; init=Int32(init), inclusive=false, alg)
+    AK.accumulate!(+, y; prefer_threads, init=Int32(init), inclusive=false, alg)
     @test all(Array(y) .== 10:19)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.accumulate(+, y; init=10, dims=2, inclusive=false, bad=:kwarg)
+    @test_throws MethodError AK.accumulate(+, y; prefer_threads, init=10, dims=2, inclusive=false, bad=:kwarg)
 
     # Testing different settings
     AK.accumulate!(+, array_from_host(ones(Int32, 1000)); init=0, inclusive=false,
-                block_size=128, alg,
+                prefer_threads, block_size=128, alg,
                 temp=array_from_host(zeros(Int32, 1000)),
                 temp_flags=array_from_host(zeros(Int8, 1000)))
     AK.accumulate(+, array_from_host(ones(Int32, 1000)); init=0, inclusive=false,
-                block_size=128, alg,
+                prefer_threads, block_size=128, alg,
                 temp=array_from_host(zeros(Int64, 1000)),
                 temp_flags=array_from_host(zeros(Int8, 1000)))
 end
@@ -110,7 +110,7 @@ end
                 for ksize in 0:3
                     sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
                     s = array_from_host(sh)
-                    d = AK.accumulate(+, s; init=Int32(0), dims)
+                    d = AK.accumulate(+, s; prefer_threads, init=Int32(0), dims)
 
                     dh = Array(d)
                     dhres = accumulate(+, sh; init=Int32(0), dims)
@@ -130,7 +130,7 @@ end
             vh = rand(Int32(1):Int32(100), n1, n2, n3)
             v = array_from_host(vh)
 
-            s = AK.accumulate(+, v; init=Int32(0), dims)
+            s = AK.accumulate(+, v; prefer_threads, init=Int32(0), dims)
             sh = Array(s)
             @test sh == accumulate(+, vh; init=Int32(0), dims)
         end
@@ -144,7 +144,7 @@ end
             vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
             v = array_from_host(vh)
 
-            s = AK.accumulate(+, v; init=UInt32(0), dims)
+            s = AK.accumulate(+, v; prefer_threads, init=UInt32(0), dims)
             sh = Array(s)
             @test sh == accumulate(+, vh; init=UInt32(0), dims)
         end
@@ -158,7 +158,7 @@ end
             vh = rand(Float32, n1, n2, n3)
             v = array_from_host(vh)
 
-            s = AK.accumulate(+, v; init=Float32(0), dims)
+            s = AK.accumulate(+, v; prefer_threads, init=Float32(0), dims)
             sh = Array(s)
             @test all(sh .≈ accumulate(+, vh; init=Float32(0), dims))
         end
@@ -173,7 +173,7 @@ end
             vh = rand(Float32, n1, n2, n3)
             v = array_from_host(vh)
             init = rand(-1000:1000)
-            s = AK.accumulate(+, v; init=Float32(init), dims)
+            s = AK.accumulate(+, v; prefer_threads, init=Float32(init), dims)
             sh = Array(s)
             @test all(sh .≈ accumulate(+, vh; init=Float32(init), dims))
         end
@@ -182,19 +182,19 @@ end
     # Exclusive scan
     vh = ones(Int32, 10, 10)
     v = array_from_host(vh)
-    s = AK.accumulate(+, v; init=0, dims=2, inclusive=false)
+    s = AK.accumulate(+, v; prefer_threads, init=0, dims=2, inclusive=false)
     sh = Array(s)
     @test all([sh[i, :] == 0:9 for i in 1:10])
 
     # Test init value is respected with exclusive scan too
     vh = ones(Int32, 10, 10)
     v = array_from_host(vh)
-    s = AK.accumulate(+, v; init=10, dims=2, inclusive=false)
+    s = AK.accumulate(+, v; prefer_threads, init=10, dims=2, inclusive=false)
     sh = Array(s)
     @test all([sh[i, :] == 10:19 for i in 1:10])
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.accumulate(+, v; init=10, dims=2, inclusive=false, bad=:kwarg)
+    @test_throws MethodError AK.accumulate(+, v; prefer_threads, init=10, dims=2, inclusive=false, bad=:kwarg)
 
     # Test all options with bigger matrices
     for D in [(1_000_000,3), (3,1_000_000)], dims in [1,2]
@@ -210,7 +210,8 @@ end
     # Testing different settings
     AK.accumulate(
         (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
         init=Int32(0),
         neutral=Int32(0),
         dims=2,
@@ -219,7 +220,8 @@ end
     )
     AK.accumulate(
         (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
         init=Int32(0),
         neutral=Int32(0),
         dims=3,
@@ -234,14 +236,14 @@ end
     # Simple correctness tests
     v = array_from_host(1:100)
     vh = Array(v)
-    @test Array(AK.cumsum(v)) == cumsum(vh)
+    @test Array(AK.cumsum(v; prefer_threads)) == cumsum(vh)
 
     # Fuzzy testing
     for _ in 1:100
         num_elems = rand(1:100_000)
         vh = rand(Float32, num_elems)
         v = array_from_host(vh)
-        @test all(Array(AK.cumsum(v)) .≈ cumsum(vh))
+        @test all(Array(AK.cumsum(v; prefer_threads)) .≈ cumsum(vh))
     end
 
     for _ in 1:100
@@ -253,10 +255,10 @@ end
             v = array_from_host(vh)
 
             # Indexing into array as if linear; not supported in Base
-            # @test all(Array(AK.cumsum(v)) .== cumsum(vh))
+            # @test all(Array(AK.cumsum(v; prefer_threads)) .== cumsum(vh))
 
             # Along dimensions
-            r = Array(AK.cumsum(v; dims))
+            r = Array(AK.cumsum(v; prefer_threads, dims))
             rh = cumsum(vh; dims)
 
             @test r == rh
@@ -266,14 +268,14 @@ end
     # Test promotion to op-dictated type
     xh = rand(Bool, 16)
     x = array_from_host(xh)
-    @test Array(AK.cumsum(x)) == cumsum(xh)
+    @test Array(AK.cumsum(x; prefer_threads)) == cumsum(xh)
 
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
-    AK.cumsum(v, block_size=64)
+    AK.cumsum(v; prefer_threads, block_size=64)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.cumsum(v; init=10, bad=:kwarg)
+    @test_throws MethodError AK.cumsum(v; prefer_threads, init=10, bad=:kwarg)
 
     # The other settings are stress-tested in reduce
 end
@@ -286,11 +288,11 @@ end
     # Simple correctness tests
     v = array_from_host(1:100)
     vh = Array(v)
-    @test Array(AK.cumprod(v)) == cumprod(vh)
+    @test Array(AK.cumprod(v; prefer_threads)) == cumprod(vh)
 
     vh = ones(Float32, 100_000)
     v = array_from_host(vh)
-    @test Array(AK.cumprod(v)) == vh
+    @test Array(AK.cumprod(v; prefer_threads)) == vh
 
     # Fuzzy testing
     for _ in 1:100
@@ -302,10 +304,10 @@ end
             v = array_from_host(vh)
 
             # Indexing into array as if linear; not supported in Base
-            # @test all(Array(AK.cumprod(v)) .== cumprod(vh))
+            # @test all(Array(AK.cumprod(v; prefer_threads)) .== cumprod(vh))
 
             # Along dimensions
-            r = Array(AK.cumprod(v; dims))
+            r = Array(AK.cumprod(v; prefer_threads, dims))
             rh = cumprod(vh; dims)
 
             @test r == rh
@@ -314,10 +316,10 @@ end
 
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
-    AK.cumprod(v, block_size=64)
+    AK.cumprod(v; prefer_threads, block_size=64)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.cumprod(v; init=10, bad=:kwarg)
+    @test_throws MethodError AK.cumprod(v; prefer_threads, init=10, bad=:kwarg)
 
     # The other settings are stress-tested in reduce
 end
diff --git a/test/binarysearch.jl b/test/binarysearch.jl
index b988cde..0f11c4c 100644
--- a/test/binarysearch.jl
+++ b/test/binarysearch.jl
@@ -11,11 +11,11 @@
         v = array_from_host(sort(rand(Int32, num_elems_v)))
         x = array_from_host(rand(Int32, num_elems_x))
         ix = similar(x, Int32)
-        AK.searchsortedfirst!(ix, v, x)
+        AK.searchsortedfirst!(ix, v, x; prefer_threads)
 
         vh = Array(v)
         xh = Array(x)
-        ixh = AK.searchsortedfirst(vh, xh)
+        ixh = AK.searchsortedfirst(vh, xh; prefer_threads)
         ixh_base = [searchsortedfirst(vh, e) for e in xh]
 
         @test all(Array(ix) .== ixh .== ixh_base)
@@ -24,11 +24,11 @@
         v = array_from_host(sort(rand(Float32, num_elems_v)))
         x = array_from_host(rand(Float32, num_elems_x))
         ix = similar(x, Int32)
-        AK.searchsortedfirst!(ix, v, x)
+        AK.searchsortedfirst!(ix, v, x; prefer_threads)
 
         vh = Array(v)
         xh = Array(x)
-        ixh = AK.searchsortedfirst(vh, xh)
+        ixh = AK.searchsortedfirst(vh, xh; prefer_threads)
         ixh_base = [searchsortedfirst(vh, e) for e in xh]
 
         @test all(Array(ix) .== ixh .== ixh_base)
@@ -43,11 +43,11 @@
         v = array_from_host(sort(rand(Int32, num_elems_v)))
         x = array_from_host(rand(Int32, num_elems_x))
         ix = similar(x, Int32)
-        AK.searchsortedlast!(ix, v, x)
+        AK.searchsortedlast!(ix, v, x; prefer_threads)
 
         vh = Array(v)
         xh = Array(x)
-        ixh = AK.searchsortedlast(vh, xh)
+        ixh = AK.searchsortedlast(vh, xh; prefer_threads)
         ixh_base = [searchsortedlast(vh, e) for e in xh]
 
         @test all(Array(ix) .== ixh .== ixh_base)
@@ -56,11 +56,11 @@
         v = array_from_host(sort(rand(Float32, num_elems_v)))
         x = array_from_host(rand(Float32, num_elems_x))
         ix = similar(x, Int32)
-        AK.searchsortedlast!(ix, v, x)
+        AK.searchsortedlast!(ix, v, x; prefer_threads)
 
         vh = Array(v)
         xh = Array(x)
-        ixh = AK.searchsortedlast(vh, xh)
+        ixh = AK.searchsortedlast(vh, xh; prefer_threads)
         ixh_base = [searchsortedlast(vh, e) for e in xh]
 
         @test all(Array(ix) .== ixh .== ixh_base)
@@ -71,23 +71,23 @@
     x = array_from_host(rand(Int32, 10_000))
     ix = similar(x, Int32)
 
-    AK.searchsortedfirst!(ix, v, x, by=abs, lt=(>), rev=true, block_size=64)
-    AK.searchsortedfirst(v, x, by=abs, lt=(>), rev=true, block_size=64)
-    AK.searchsortedlast!(ix, v, x, by=abs, lt=(>), rev=true, block_size=64)
-    AK.searchsortedlast(v, x, by=abs, lt=(>), rev=true, block_size=64)
+    AK.searchsortedfirst!(ix, v, x; prefer_threads, by=abs, lt=(>), rev=true, block_size=64)
+    AK.searchsortedfirst(v, x; prefer_threads, by=abs, lt=(>), rev=true, block_size=64)
+    AK.searchsortedlast!(ix, v, x; prefer_threads, by=abs, lt=(>), rev=true, block_size=64)
+    AK.searchsortedlast(v, x; prefer_threads, by=abs, lt=(>), rev=true, block_size=64)
 
     vh = Array(v)
     xh = Array(x)
     ixh = similar(xh, Int32)
 
-    AK.searchsortedfirst!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
-    AK.searchsortedfirst(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
-    AK.searchsortedlast!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
-    AK.searchsortedlast(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
+    AK.searchsortedfirst!(ixh, vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
+    AK.searchsortedfirst(vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
+    AK.searchsortedlast!(ixh, vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
+    AK.searchsortedlast(vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.searchsortedfirst!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg)
-    @test_throws MethodError AK.searchsortedfirst(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg)
-    @test_throws MethodError AK.searchsortedlast!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg)
-    @test_throws MethodError AK.searchsortedlast(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg)
+    @test_throws MethodError AK.searchsortedfirst!(ixh, vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg)
+    @test_throws MethodError AK.searchsortedfirst(vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg)
+    @test_throws MethodError AK.searchsortedlast!(ixh, vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg)
+    @test_throws MethodError AK.searchsortedlast(vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg)
 end
diff --git a/test/looping.jl b/test/looping.jl
index 746fe8c..55944d6 100644
--- a/test/looping.jl
+++ b/test/looping.jl
@@ -3,27 +3,27 @@
     Random.seed!(0)
 
-    # CPU
+    # CPU backend with prefer_threads enabled
-    if BACKEND == CPU()
+    if IS_CPU_BACKEND && prefer_threads
         x = zeros(Int, 1000)
-        AK.foreachindex(x) do i
+        AK.foreachindex(x; prefer_threads) do i
             x[i] = i
         end
         @test all(x .== 1:length(x))
 
         x = zeros(Int, 1000)
-        AK.foreachindex(x, max_tasks=1, min_elems=1) do i
+        AK.foreachindex(x; prefer_threads, max_tasks=1, min_elems=1) do i
             x[i] = i
         end
         @test all(x .== 1:length(x))
 
         x = zeros(Int, 1000)
-        AK.foreachindex(x, max_tasks=10, min_elems=1) do i
+        AK.foreachindex(x; prefer_threads, max_tasks=10, min_elems=1) do i
             x[i] = i
         end
         @test all(x .== 1:length(x))
 
         x = zeros(Int, 1000)
-        AK.foreachindex(x, max_tasks=10, min_elems=10) do i
+        AK.foreachindex(x; prefer_threads, max_tasks=10, min_elems=10) do i
             x[i] = i
         end
         @test all(x .== 1:length(x))
@@ -31,7 +31,7 @@
-    # GPU
+    # GPU backends, or the CPU backend when --cpuKA turns prefer_threads off
     else
         x = array_from_host(zeros(Int, 10_000))
-        f1(x) = AK.foreachindex(x) do i     # This must be inside a function to have a known type!
+        f1(x) = AK.foreachindex(x; prefer_threads) do i     # This must be inside a function to have a known type!
             x[i] = i
         end
         f1(x)
@@ -39,7 +39,7 @@
         @test all(xh .== 1:length(xh))
 
         x = array_from_host(zeros(Int, 10_000))
-        f2(x) = AK.foreachindex(x, block_size=64) do i
+        f2(x) = AK.foreachindex(x; prefer_threads, block_size=64) do i
             x[i] = i
         end
         f2(x)
@@ -59,12 +59,12 @@ end
     end
 
     x = array_from_host(zeros(Int, 10, 1000))
-    f1(x)
+    f1(x; prefer_threads)
     xh = Array(x)
     @test all(xh .== (1:10) .+ (1:1000)')
 
     x = array_from_host(zeros(UInt32, 10, 1000))
-    f1(x, max_tasks=2, min_elems=100, block_size=64)
+    f1(x; prefer_threads, max_tasks=2, min_elems=100, block_size=64)
     xh = Array(x)
     @test all(xh .== (1:10) .+ (1:1000)')
 
@@ -75,12 +75,12 @@ end
     end
 
     x = array_from_host(zeros(Int, 10, 1000))
-    f2(x)
+    f2(x; prefer_threads)
     xh = Array(x)
     @test all(xh .== (1:10) .+ (1:1000)')
 
     x = array_from_host(zeros(UInt32, 10, 1000))
-    f2(x, max_tasks=2, min_elems=100, block_size=64)
+    f2(x; prefer_threads, max_tasks=2, min_elems=100, block_size=64)
     xh = Array(x)
     @test all(xh .== (1:10) .+ (1:1000)')
 
@@ -90,7 +90,7 @@ end
     end
 
     x = array_from_host(zeros(Int, 10, 1000))
-    f3(x)
+    f3(x; prefer_threads)
     xh = Array(x)
     @test all(xh[:] .== 1:length(x))
 end
diff --git a/test/map.jl b/test/map.jl
index 01c55bd..24fa8bd 100644
--- a/test/map.jl
+++ b/test/map.jl
@@ -2,56 +2,56 @@
     Random.seed!(0)
 
-    # CPU
+    # CPU backend with prefer_threads enabled
-    if BACKEND == CPU()
+    if IS_CPU_BACKEND && prefer_threads
         x = Array(1:1000)
-        y = AK.map(x) do i
+        y = AK.map(x; prefer_threads) do i
             i^2
         end
         @test y == map(i -> i^2, x)
 
         x = Array(1:1000)
         y = zeros(Int, 1000)
-        AK.map!(y, x) do i
+        AK.map!(y, x; prefer_threads) do i
             i^2
         end
         @test y == map(i -> i^2, x)
 
         x = rand(Float32, 1000)
-        y = AK.map(x, max_tasks=2, min_elems=100) do i
+        y = AK.map(x; prefer_threads, max_tasks=2, min_elems=100) do i
             i > 0.5 ? i : 0
         end
         @test y == map(i -> i > 0.5 ? i : 0, x)
 
         x = rand(Float32, 1000)
-        y = AK.map(x, max_tasks=4, min_elems=500) do i
+        y = AK.map(x; prefer_threads, max_tasks=4, min_elems=500) do i
             i > 0.5 ? i : 0
         end
         @test y == map(i -> i > 0.5 ? i : 0, x)
 
         # Test that undefined kwargs are not accepted
-        @test_throws MethodError AK.map(x -> x^2, x; bad=:kwarg)
+        @test_throws MethodError AK.map(x -> x^2, x; prefer_threads, bad=:kwarg)
-    # GPU
+    # GPU backends, or the CPU backend when --cpuKA turns prefer_threads off
     else
         x = array_from_host(1:1000)
-        y = AK.map(x) do i
+        y = AK.map(x; prefer_threads) do i
             i^2
         end
         @test Array(y) == map(i -> i^2, 1:1000)
 
         x = array_from_host(1:1000)
         y = array_from_host(zeros(Int, 1000))
-        AK.map!(y, x) do i
+        AK.map!(y, x; prefer_threads) do i
             i^2
         end
         @test Array(y) == map(i -> i^2, 1:1000)
 
         x = array_from_host(rand(Float32, 1000))
-        y = AK.map(x, block_size=64) do i
+        y = AK.map(x; prefer_threads, block_size=64) do i
             i > 0.5 ? i : 0
         end
         @test Array(y) == map(i -> i > 0.5 ? i : 0, Array(x))
 
         # Test that undefined kwargs are not accepted
-        @test_throws MethodError AK.map(x -> x^2, x; bad=:kwarg)
+        @test_throws MethodError AK.map(x -> x^2, x; prefer_threads, bad=:kwarg)
     end
 end
diff --git a/test/predicates.jl b/test/predicates.jl
index cd589e1..cc455bc 100644
--- a/test/predicates.jl
+++ b/test/predicates.jl
@@ -5,28 +5,28 @@
     # Simple correctness tests
     v = array_from_host(1:100)
 
-    @test AK.any(x->x<0, v) === false
-    @test AK.any(x->x>99, v) === true
+    @test AK.any(x->x<0, v; prefer_threads) === false
+    @test AK.any(x->x>99, v; prefer_threads) === true
 
-    @test AK.all(x->x>0, v) === true
-    @test AK.all(x->x<100, v) === false
+    @test AK.all(x->x>0, v; prefer_threads) === true
+    @test AK.all(x->x<100, v; prefer_threads) === false
 
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
-        @test AK.any(x->x<0, v) === false
-        @test AK.any(x->x<1, v) === true
-        @test AK.all(x->x<1, v) === true
-        @test AK.all(x->x<0, v) === false
+        @test AK.any(x->x<0, v; prefer_threads) === false
+        @test AK.any(x->x<1, v; prefer_threads) === true
+        @test AK.all(x->x<1, v; prefer_threads) === true
+        @test AK.all(x->x<0, v; prefer_threads) === false
     end
 
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
-        @test AK.any(x->x<0, v) === false
-        @test AK.any(x->x<1, v) === true
-        @test AK.all(x->x<1, v) === true
-        @test AK.all(x->x<0, v) === false
+        @test AK.any(x->x<0, v; prefer_threads) === false
+        @test AK.any(x->x<1, v; prefer_threads) === true
+        @test AK.all(x->x<1, v; prefer_threads) === true
+        @test AK.all(x->x<0, v; prefer_threads) === false
     end
 
     # Test the MapReduce algorithm which works on all platforms
@@ -34,14 +34,14 @@
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
         alg=AK.MapReduce(temp=similar(v, Bool), switch_below=100)
-        @test AK.any(x->x<0, v; alg) === false
-        @test AK.any(x->x<1, v; alg) === true
-        @test AK.all(x->x<1, v; alg) === true
-        @test AK.all(x->x<0, v; alg) === false
+        @test AK.any(x->x<0, v; prefer_threads, alg) === false
+        @test AK.any(x->x<1, v; prefer_threads, alg) === true
+        @test AK.all(x->x<1, v; prefer_threads, alg) === true
+        @test AK.all(x->x<0, v; prefer_threads, alg) === false
     end
 
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
-    AK.any(x->x<5, v, max_tasks=2, min_elems=100, block_size=64)
-    AK.all(x->x<5, v, max_tasks=2, min_elems=100, block_size=64)
+    AK.any(x->x<5, v; prefer_threads, max_tasks=2, min_elems=100, block_size=64)
+    AK.all(x->x<5, v; prefer_threads, max_tasks=2, min_elems=100, block_size=64)
 end
diff --git a/test/reduce.jl b/test/reduce.jl
index 7e9e4a6..9fe8b5c 100644
--- a/test/reduce.jl
+++ b/test/reduce.jl
@@ -13,6 +13,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
         AK.reduce(
             (x, y) -> x < y ? x : y,
             s;
+            prefer_threads,
             init=typemax(eltype(s)),
             neutral=typemax(eltype(s)),
         )
@@ -48,6 +49,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
         AK.reduce(
             (x, y) -> x + y,
             s;
+            prefer_threads,
             init=zero(eltype(s)),
             neutral=zero(eltype(s)),
         )
@@ -93,7 +95,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Int32(1):Int32(100), num_elems))
-        s = AK.reduce(+, v; init=Int32(10))
+        s = AK.reduce(+, v; prefer_threads, init=Int32(10))
         vh = Array(v)
         @test s == sum(vh) + 10
     end
@@ -104,7 +106,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
         v = array_from_host(rand(1:100, num_elems), Int32)
         switch_below = rand(1:100)
         init = rand(1:100)
-        s = AK.reduce(+, v; switch_below=switch_below, init=Int32(init))
+        s = AK.reduce(+, v; prefer_threads, switch_below=switch_below, init=Int32(init))
         vh = Array(v)
         @test s == reduce(+, vh; init)
     end
@@ -113,7 +115,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     for _ in 1:100
         num_elems = rand(1:1000)
         v = 1:num_elems
-        s = AK.reduce(+, v, BACKEND; init=Int32(0))
+        s = AK.reduce(+, v, BACKEND; prefer_threads, init=Int32(0))
         vh = Array(v)
         @test s == reduce(+, vh)
     end
@@ -124,7 +126,8 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     # Testing different settings
     AK.reduce(
         (x, y) -> x + 1,
-        array_from_host(rand(Int32, 10_000)),
+        array_from_host(rand(Int32, 10_000));
+        prefer_threads,
         init=Int32(0),
         neutral=Int64(0),
         block_size=64,
@@ -135,7 +138,8 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     )
     AK.reduce(
         (x, y) -> x + 1,
-        rand(Int32, 10_000),
+        rand(Int32, 10_000);
+        prefer_threads,
         init=Int32(0),
         neutral=Int64(0),
         max_tasks=16,
@@ -154,7 +158,7 @@ end
                 for ksize in 0:3
                     sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
                     s = array_from_host(sh)
-                    d = AK.reduce(+, s; init=Int32(10), dims)
+                    d = AK.reduce(+, s; prefer_threads, init=Int32(10), dims)
                     dh = Array(d)
                     @test dh == sum(sh; init=Int32(10), dims)
                     @test eltype(dh) == eltype(sum(sh; init=Int32(10), dims))
@@ -171,7 +175,7 @@ end
             n3 = rand(1:100)
             vh = rand(Int32(1):Int32(100), n1, n2, n3)
             v = array_from_host(vh)
-            s = AK.reduce(+, v; init=Int32(0), dims)
+            s = AK.reduce(+, v; prefer_threads, init=Int32(0), dims)
             sh = Array(s)
             @test sh == sum(vh; dims)
         end
@@ -184,7 +188,7 @@ end
             n3 = rand(1:100)
             vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
             v = array_from_host(vh)
-            s = AK.reduce(+, v; init=UInt32(0), dims)
+            s = AK.reduce(+, v; prefer_threads, init=UInt32(0), dims)
             sh = Array(s)
             @test sh == sum(vh; dims)
         end
@@ -197,7 +201,7 @@ end
             n3 = rand(1:100)
             vh = rand(Float32, n1, n2, n3)
             v = array_from_host(vh)
-            s = AK.reduce(+, v; init=Float32(0), dims)
+            s = AK.reduce(+, v; prefer_threads, init=Float32(0), dims)
             sh = Array(s)
             @test sh ≈ sum(vh; dims)
         end
@@ -212,19 +216,20 @@ end
             vh = rand(Int32(1):Int32(100), n1, n2, n3)
             v = array_from_host(vh)
             init = rand(1:100)
-            s = AK.reduce(+, v; init=Int32(init), dims)
+            s = AK.reduce(+, v; prefer_threads, init=Int32(init), dims)
             sh = Array(s)
             @test sh == reduce(+, vh; dims, init)
         end
     end
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.reduce(+, array_from_host(rand(Int32, 10, 10)); init=10, bad=:kwarg)
+    @test_throws MethodError AK.reduce(+, array_from_host(rand(Int32, 10, 10)); prefer_threads, init=10, bad=:kwarg)
 
     # Testing different settings
     AK.reduce(
         (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
         init=Int32(0),
         neutral=Int32(0),
         dims=2,
@@ -236,7 +241,8 @@ end
     )
     AK.reduce(
         (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
         init=Int32(0),
         neutral=Int32(0),
         dims=3,
@@ -258,6 +264,7 @@ end
             p -> (p.x, p.y),
             (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
             s;
+            prefer_threads,
             init=(typemax(Float32), typemax(Float32)),
             neutral=(typemax(Float32), typemax(Float32)),
         )
@@ -310,7 +317,7 @@ end
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Int32(1):Int32(100), num_elems))
-        s = AK.mapreduce(abs, +, v; init=Int32(10))
+        s = AK.mapreduce(abs, +, v; prefer_threads, init=Int32(10))
         vh = Array(v)
         @test s == sum(vh) + 10
     end
@@ -321,7 +328,7 @@ end
         v = array_from_host(rand(-100:-1, num_elems), Int32)
         switch_below = rand(1:100)
         init = rand(1:100)
-        s = AK.mapreduce(abs, +, v; switch_below=switch_below, init=Int32(init))
+        s = AK.mapreduce(abs, +, v; prefer_threads, switch_below=switch_below, init=Int32(init))
         vh = Array(v)
         @test s == mapreduce(abs, +, vh; init)
     end
@@ -330,7 +337,7 @@ end
     for _ in 1:100
         num_elems = rand(1:1000)
         v = 1:num_elems
-        s = AK.mapreduce(abs, +, v, BACKEND; init=Int32(0))
+        s = AK.mapreduce(abs, +, v, BACKEND; prefer_threads, init=Int32(0))
         vh = Array(v)
         @test s == mapreduce(abs, +, vh)
     end
@@ -339,7 +346,8 @@ end
     f(s, temp) = AK.mapreduce(
         p -> (p.x, p.y),
         (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
-        s,
+        s;
+        prefer_threads,
         init=(typemax(Float32), typemax(Float32)),
         neutral=(typemax(Float32), typemax(Float32)),
         block_size=64,
@@ -353,7 +361,7 @@ end
     f(v, temp)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.mapreduce(-, +, v; init=10, bad=:kwarg)
+    @test_throws MethodError AK.mapreduce(-, +, v; prefer_threads, init=10, bad=:kwarg)
 end
 
 
@@ -367,7 +375,7 @@ end
                 for ksize in 0:3
                     sh = rand(Int32(-100):Int32(100), isize, jsize, ksize)
                     s = array_from_host(sh)
-                    d = AK.mapreduce(-, +, s; init=Int32(-10), dims)
+                    d = AK.mapreduce(-, +, s; prefer_threads, init=Int32(-10), dims)
                     dh = Array(d)
                     @test dh == mapreduce(-, +, sh; init=Int32(-10), dims)
                     @test eltype(dh) == eltype(mapreduce(-, +, sh; init=Int32(-10), dims))
@@ -384,7 +392,7 @@ end
             n3 = rand(1:100)
             vh = rand(Int32(1):Int32(100), n1, n2, n3)
             v = array_from_host(vh)
-            s = AK.mapreduce(-, +, v; init=Int32(0), dims)
+            s = AK.mapreduce(-, +, v; prefer_threads, init=Int32(0), dims)
             sh = Array(s)
             @test sh == mapreduce(-, +, vh; init=Int32(0), dims)
         end
@@ -396,6 +404,7 @@ end
             p -> (p.x, p.y),
             (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
             s;
+            prefer_threads,
             init=(typemax(Float32), typemax(Float32)),
             neutral=(typemax(Float32), typemax(Float32)),
             dims,
@@ -443,20 +452,21 @@ end
             vh = rand(Int32(-100):Int32(100), n1, n2, n3)
             v = array_from_host(vh)
             init = rand(1:100)
-            s = AK.mapreduce(-, +, v; init=Int32(init), dims)
+            s = AK.mapreduce(-, +, v; prefer_threads, init=Int32(init), dims)
             sh = Array(s)
             @test sh == mapreduce(-, +, vh; dims, init)
         end
     end
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.mapreduce(-, +, array_from_host(rand(Int32, 3, 4, 5)); init=10, bad=:kwarg)
+    @test_throws MethodError AK.mapreduce(-, +, array_from_host(rand(Int32, 3, 4, 5)); prefer_threads, init=10, bad=:kwarg)
 
     # Testing different settings
     AK.mapreduce(
         -,
         (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
         init=Int32(0),
         neutral=Int32(0),
         dims=2,
@@ -469,7 +479,8 @@ end
     AK.mapreduce(
         -,
         (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
+        array_from_host(rand(Int32, 3, 4, 5));
+        prefer_threads,
         init=Int32(0),
         neutral=Int32(0),
         dims=3,
@@ -486,13 +497,13 @@ end
 
     # Simple correctness tests
     v = array_from_host(1:100)
-    @test AK.sum(v) == sum(Array(v))
+    @test AK.sum(v; prefer_threads) == sum(Array(v))
 
     # Fuzzy testing
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
-        @test AK.sum(v) ≈ sum(Array(v))
+        @test AK.sum(v; prefer_threads) ≈ sum(Array(v))
     end
 
     for _ in 1:100
@@ -504,10 +515,10 @@ end
             v = array_from_host(vh)
 
             # Indexing into array as if linear
-            @test AK.sum(v) == sum(vh)
+            @test AK.sum(v; prefer_threads) == sum(vh)
 
             # Along dimensions
-            r = Array(AK.sum(v; dims))
+            r = Array(AK.sum(v; prefer_threads, dims))
             rh = sum(vh; dims)
 
             @test r == rh
@@ -516,10 +527,10 @@ end
 
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
-    AK.sum(v, block_size=64)
+    AK.sum(v; prefer_threads, block_size=64)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.sum(v; bad=:kwarg)
+    @test_throws MethodError AK.sum(v; prefer_threads, bad=:kwarg)
 
     # The other settings are stress-tested in reduce
 end
@@ -531,13 +542,13 @@ end
 
     # Simple correctness tests
     v = array_from_host(1:100)
-    @test AK.prod(v) == prod(Array(v))
+    @test AK.prod(v; prefer_threads) == prod(Array(v))
 
     # Fuzzy testing
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
-        @test AK.prod(v) ≈ prod(Array(v))
+        @test AK.prod(v; prefer_threads) ≈ prod(Array(v))
     end
 
     for _ in 1:100
@@ -549,10 +560,10 @@ end
             v = array_from_host(vh)
 
             # Indexing into array as if linear
-            @test AK.sum(v) == sum(vh)
+            @test AK.sum(v; prefer_threads) == sum(vh)
 
             # Along dimensions
-            r = Array(AK.sum(v; dims))
+            r = Array(AK.sum(v; prefer_threads, dims))
             rh = sum(vh; dims)
 
             @test r == rh
@@ -561,10 +572,10 @@ end
 
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
-    AK.prod(v, block_size=64)
+    AK.prod(v; prefer_threads, block_size=64)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.prod(v; bad=:kwarg)
+    @test_throws MethodError AK.prod(v; prefer_threads, bad=:kwarg)
 
     # The other settings are stress-tested in reduce
 end
@@ -576,13 +587,13 @@ end
 
     # Simple correctness tests
     v = array_from_host(1:100)
-    @test AK.minimum(v) == minimum(Array(v))
+    @test AK.minimum(v; prefer_threads) == minimum(Array(v))
 
     # Fuzzy testing
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
-        @test AK.minimum(v) == minimum(Array(v))
+        @test AK.minimum(v; prefer_threads) == minimum(Array(v))
     end
 
     for _ in 1:100
@@ -594,10 +605,10 @@ end
             v = array_from_host(vh)
 
             # Indexing into array as if linear
-            @test AK.minimum(v) == minimum(vh)
+            @test AK.minimum(v; prefer_threads) == minimum(vh)
 
             # Along dimensions
-            r = Array(AK.minimum(v; dims))
+            r = Array(AK.minimum(v; prefer_threads, dims))
             rh = minimum(vh; dims)
 
             @test r == rh
@@ -606,10 +617,10 @@ end
 
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
-    AK.minimum(v, block_size=64)
+    AK.minimum(v; prefer_threads, block_size=64)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.minimum(v; bad=:kwarg)
+    @test_throws MethodError AK.minimum(v; prefer_threads, bad=:kwarg)
 
     # The other settings are stress-tested in reduce
 end
@@ -621,13 +632,13 @@ end
 
     # Simple correctness tests
     v = array_from_host(1:100)
-    @test AK.maximum(v) == maximum(Array(v))
+    @test AK.maximum(v; prefer_threads) == maximum(Array(v))
 
     # Fuzzy testing
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
-        @test AK.maximum(v) == maximum(Array(v))
+        @test AK.maximum(v; prefer_threads) == maximum(Array(v))
     end
 
     for _ in 1:100
@@ -639,10 +650,10 @@ end
             v = array_from_host(vh)
 
             # Indexing into array as if linear
-            @test AK.maximum(v) == maximum(vh)
+            @test AK.maximum(v; prefer_threads) == maximum(vh)
 
             # Along dimensions
-            r = Array(AK.maximum(v; dims))
+            r = Array(AK.maximum(v; prefer_threads, dims))
             rh = maximum(vh; dims)
 
             @test r == rh
@@ -651,10 +662,10 @@ end
 
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
-    AK.maximum(v, block_size=64)
+    AK.maximum(v; prefer_threads, block_size=64)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.maximum(v; bad=:kwarg)
+    @test_throws MethodError AK.maximum(v; prefer_threads, bad=:kwarg)
 
     # The other settings are stress-tested in reduce
 end
@@ -666,13 +677,13 @@ end
 
     # Simple correctness tests
     v = array_from_host(1:100)
-    @test AK.count(x->x>50, v) == count(x->x>50, Array(v))
+    @test AK.count(x->x>50, v; prefer_threads) == count(x->x>50, Array(v))
 
     # Fuzzy testing
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
-        @test AK.count(x->x>0.5, v) == count(x->x>0.5, Array(v))
+        @test AK.count(x->x>0.5, v; prefer_threads) == count(x->x>0.5, Array(v))
     end
 
     for _ in 1:100
@@ -684,10 +695,10 @@ end
             v = array_from_host(vh)
 
             # Indexing into array as if linear
-            @test AK.count(x->x>0.5, v) == count(x->x>0.5, vh)
+            @test AK.count(x->x>0.5, v; prefer_threads) == count(x->x>0.5, vh)
 
             # Along dimensions
-            r = Array(AK.count(x->x>0.5, v; dims))
+            r = Array(AK.count(x->x>0.5, v; prefer_threads, dims))
             rh = count(x->x>0.5, vh; dims)
 
             @test r == rh
@@ -698,15 +709,15 @@ end
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Bool, num_elems))
-        @test AK.count(v) == count(Array(v))
+        @test AK.count(v; prefer_threads) == count(Array(v))
     end
 
     # Testing different settings
     v = array_from_host(rand(-5:5, 100_000))
-    AK.count(x->x>0, v, block_size=64)
+    AK.count(x->x>0, v; prefer_threads, block_size=64)
 
     # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.count(v; bad=:kwarg)
+    @test_throws MethodError AK.count(v; prefer_threads, bad=:kwarg)
 
     # The other settings are stress-tested in reduce
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 8db1c11..dcb4896 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -35,19 +35,22 @@ elseif "--Metal" in ARGS
     const BACKEND = MetalBackend()
 elseif "--OpenCL" in ARGS
     Pkg.add(name="OpenCL", rev="master")
+    Pkg.add(name="SPIRVIntrinsics", rev="master")
     Pkg.add("pocl_jll")
     using pocl_jll
     using OpenCL
     OpenCL.versioninfo()
     const BACKEND = OpenCLBackend()
-    TEST_DL[] = true
 elseif !@isdefined(BACKEND)
     # Otherwise do CPU tests
     using InteractiveUtils
     InteractiveUtils.versioninfo()
-    const BACKEND = CPU()
+    const BACKEND = get_backend([])
 end
 
+const IS_CPU_BACKEND = BACKEND == get_backend([])
+
+global prefer_threads::Bool = !(IS_CPU_BACKEND && "--cpuKA" in ARGS)
 
 array_from_host(h_arr::AbstractArray, dtype=nothing) = array_from_host(BACKEND, h_arr, dtype)
 function array_from_host(backend, h_arr::AbstractArray, dtype=nothing)
diff --git a/test/sort.jl b/test/sort.jl
index 59e9505..ee48c00 100644
--- a/test/sort.jl
+++ b/test/sort.jl
@@ -1,4 +1,4 @@
-if BACKEND != CPU()
+if !IS_CPU_BACKEND || !prefer_threads
 @testset "merge_sort" begin
     Random.seed!(0)
 
@@ -98,7 +98,7 @@ end
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Int32, num_elems))
-        AK.sort!(v)
+        AK.sort!(v; prefer_threads)
         vh = Array(v)
         @test issorted(vh)
     end
@@ -106,7 +106,7 @@ end
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(UInt32, num_elems))
-        AK.sort!(v)
+        AK.sort!(v; prefer_threads)
         vh = Array(v)
         @test issorted(vh)
     end
@@ -114,39 +114,39 @@ end
     for _ in 1:100
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
-        AK.sort!(v)
+        AK.sort!(v; prefer_threads)
         vh = Array(v)
         @test issorted(vh)
     end
 
     # Testing different settings
     v = array_from_host(rand(1:100_000, 10_000), Float32)
-    AK.sort!(v, lt=(>), by=abs, rev=true,
+    AK.sort!(v; prefer_threads, lt=(>), by=abs, rev=true,
             max_tasks=64, min_elems=8, block_size=64,
             temp=array_from_host(1:10_000, Float32))
     @test issorted(Array(v))
 
     v = array_from_host(rand(1:100_000, 10_000), Int32)
-    AK.sort!(v, lt=(>), rev=true,
+    AK.sort!(v; prefer_threads, lt=(>), rev=true,
             max_tasks=64, min_elems=8, block_size=64,
             temp=array_from_host(1:10_000, Int32))
     @test issorted(Array(v))
 
     v = array_from_host(rand(1:100_000, 10_000), Float32)
-    v = AK.sort(v, lt=(>), by=abs, rev=true,
+    v = AK.sort(v; prefer_threads, lt=(>), by=abs, rev=true,
                 max_tasks=64, min_elems=8, block_size=64,
                 temp=array_from_host(1:10_000, Float32))
     @test issorted(Array(v))
 
     v = array_from_host(rand(1:100_000, 10_000), Int32)
-    v = AK.sort(v, lt=(>), by=abs, rev=true,
+    v = AK.sort(v; prefer_threads, lt=(>), by=abs, rev=true,
                 max_tasks=64, min_elems=8, block_size=64,
                 temp=array_from_host(1:10_000, Int32))
     @test issorted(Array(v))
 end
 
 
-if BACKEND != CPU()
+if !IS_CPU_BACKEND || !prefer_threads
 @testset "merge_sort_by_key" begin
     Random.seed!(0)
 
@@ -228,7 +228,7 @@ end
 end
 
 
-if BACKEND != CPU()
+if !IS_CPU_BACKEND || !prefer_threads
 @testset "merge_sortperm" begin
     Random.seed!(0)
 
@@ -337,7 +337,7 @@ end
 end
 
 
-if BACKEND != CPU()
+if !IS_CPU_BACKEND || !prefer_threads
 @testset "merge_sortperm_lowmem" begin
     Random.seed!(0)
 
@@ -404,7 +404,7 @@ end
         num_elems = rand(1:100_000)
         ix = array_from_host(zeros(Int32, num_elems))
         v = array_from_host(rand(Int32, num_elems))
-        AK.sortperm!(ix, v)
+        AK.sortperm!(ix, v; prefer_threads)
         ixh = Array(ix)
         vh = Array(v)
         @test issorted(vh[ixh])
@@ -414,7 +414,7 @@ end
         num_elems = rand(1:100_000)
         ix = array_from_host(zeros(Int32, num_elems))
         v = array_from_host(rand(UInt32, num_elems))
-        AK.sortperm!(ix, v)
+        AK.sortperm!(ix, v; prefer_threads)
         ixh = Array(ix)
         vh = Array(v)
         @test issorted(vh[ixh])
@@ -424,7 +424,7 @@ end
         num_elems = rand(1:100_000)
         ix = array_from_host(zeros(Int32, num_elems))
         v = array_from_host(rand(Float32, num_elems))
-        AK.sortperm!(ix, v)
+        AK.sortperm!(ix, v; prefer_threads)
         ixh = Array(ix)
         vh = Array(v)
         @test issorted(vh[ixh])
@@ -434,7 +434,8 @@ end
     ix = array_from_host(1:10_000, Int32)
     v = array_from_host(1:10_000, Float32)
     AK.sortperm!(ix,
-                v,
+                v;
+                prefer_threads,
                 lt=(>), by=abs, rev=true,
                 block_size=64,
                 temp=array_from_host(1:10_000, Int32))
@@ -443,7 +444,8 @@ end
     @test issorted(vh[ixh])
 
     v = array_from_host(1:10_000, Float32)
-    ix = AK.sortperm(v,
+    ix = AK.sortperm(v;
+                    prefer_threads,
                     lt=(>), by=abs, rev=true,
                     block_size=64,
                     temp=array_from_host(1:10_000, Int))