diff --git a/test/accumulate.jl b/test/accumulate.jl
new file mode 100644
index 0000000..a8b425e
--- /dev/null
+++ b/test/accumulate.jl
@@ -0,0 +1,296 @@
+@testset "accumulate_1d" begin
+    # Scans over flat inputs, checked against Base.accumulate or a closed-form range.
+    Random.seed!(0)
+
+    # Single block exclusive scan (each block processes two elements); ones must give 0:n-1
+    for num_elems in 1:256
+        x = array_from_host(ones(Int32, num_elems))
+        y = copy(x)
+        AK.accumulate!(+, y; init=0, inclusive=false, block_size=128)
+        yh = Array(y)
+        @test all(yh .== 0:length(yh) - 1)
+    end
+
+    # Single block inclusive scan, compared element-wise with Base.accumulate
+    for num_elems in 1:256
+        x = array_from_host(rand(1:1000, num_elems), Int32)
+        y = copy(x)
+        AK.accumulate!(+, y; init=0, block_size=128)
+        @test all(Array(y) .== accumulate(+, Array(x)))
+    end
+
+    # Large exclusive scan: random lengths up to 100_000, block_size not specified
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        x = array_from_host(ones(Int32, num_elems))
+        y = copy(x)
+        AK.accumulate!(+, y; init=0, inclusive=false)
+        yh = Array(y)
+        @test all(yh .== 0:length(yh) - 1)
+    end
+
+    # Large inclusive scan: random lengths up to 100_000, block_size not specified
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        x = array_from_host(rand(1:1000, num_elems), Int32)
+        y = copy(x)
+        AK.accumulate!(+, y; init=0)
+        @test all(Array(y) .== accumulate(+, Array(x)))
+    end
+
+    # Stress-testing small block sizes (block_size=16) -> many blocks
+    for _ in 1:100
+        num_elems = rand(1:100_000)
+        x = array_from_host(rand(1:1000, num_elems), Int32)
+        y = copy(x)
+        AK.accumulate!(+, y; init=0, block_size=16)
+        @test all(Array(y) .== accumulate(+, Array(x)))
+    end
+
+    # Allowing N-dimensional arrays, still accumulated as 1D (no dims keyword is passed)
+    for _ in 1:100
+        n1 = rand(1:100)
+        n2 = rand(1:100)
+        n3 = rand(1:100)
+        vh = rand(Float32, n1, n2, n3)
+        v = array_from_host(vh)
+        AK.accumulate!(+, v; init=0)
+        @test all(Array(v) .≈ accumulate(+, vh))
+    end
+
+    # Ensuring the init value is respected (three-argument form with a separate destination y)
+    for _ in 1:100
+        num_elems = rand(1:100_000)
+        x = array_from_host(rand(1:1000, num_elems), Int32)
+        y = similar(x)
+        init = rand(-1000:1000)
+        AK.accumulate!(+, y, x; init=Int32(init))
+        @test all(Array(y) .== accumulate(+, Array(x), init=init))
+    end
+
+    # Exclusive scan on a fixed input: ten ones with init=0 must give 0:9
+    x = array_from_host(ones(Int32, 10))
+    y = copy(x)
+    AK.accumulate!(+, y; init=0, inclusive=false)
+    @test all(Array(y) .== 0:9)
+
+    # Test init value is respected with exclusive scan too: ten ones with init=10 must give 10:19
+    x = array_from_host(ones(Int32, 10))
+    y = copy(x)
+    init = 10
+    AK.accumulate!(+, y; init=Int32(init), inclusive=false)
+    @test all(Array(y) .== 10:19)
+
+    # Testing different settings; smoke test only, the results are not asserted
+    AK.accumulate!(+, array_from_host(ones(Int32, 1000)), init=0, inclusive=false,
+                block_size=128,
+                temp=array_from_host(zeros(Int32, 1000)),
+                temp_flags=array_from_host(zeros(Int8, 1000)))
+    AK.accumulate(+, array_from_host(ones(Int32, 1000)), init=0, inclusive=false,
+                block_size=128,
+                temp=array_from_host(zeros(Int64, 1000)),
+                temp_flags=array_from_host(zeros(Int8, 1000)))
+end
+
+
+@testset "accumulate_nd" begin
+    Random.seed!(0)  # fixed seed so the random shapes and values below are reproducible
+
+    # Test all possible corner cases against Base.accumulate (axis sizes 0:3, dims 1:4 on 3D input)
+    for dims in 1:4
+        for isize in 0:3
+            for jsize in 0:3
+                for ksize in 0:3
+                    sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
+                    s = array_from_host(sh)
+                    d = AK.accumulate(+, s; init=Int32(0), dims=dims)
+
+                    dh = Array(d)
+                    dhres = accumulate(+, sh, init=Int32(0), dims=dims)
+                    @test dh == dhres
+                    @test eltype(dh) == eltype(dhres)
+                end
+            end
+        end
+    end
+
+    # Fuzzy correctness testing on random 3D shapes: Int32 first, then UInt32 and Float32 below
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+
+            s = AK.accumulate(+, v; init=Int32(0), dims=dims)
+            sh = Array(s)
+            @test sh == accumulate(+, vh, init=Int32(0), dims=dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
+            v = array_from_host(vh)
+
+            s = AK.accumulate(+, v; init=UInt32(0), dims=dims)
+            sh = Array(s)
+            @test sh == accumulate(+, vh, init=UInt32(0), dims=dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Float32, n1, n2, n3)
+            v = array_from_host(vh)
+
+            s = AK.accumulate(+, v; init=Float32(0), dims=dims)
+            sh = Array(s)
+            @test all(sh .≈ accumulate(+, vh, init=Float32(0), dims=dims))
+        end
+    end
+
+    # Ensure the init value is respected (random integer init converted to Float32)
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Float32, n1, n2, n3)
+            v = array_from_host(vh)
+            init = rand(-1000:1000)
+            s = AK.accumulate(+, v; init=Float32(init), dims=dims)
+            sh = Array(s)
+            @test all(sh .≈ accumulate(+, vh, init=Float32(init), dims=dims))
+        end
+    end
+
+    # Exclusive scan along dims=2: each row of the 10x10 ones matrix must equal 0:9
+    vh = ones(Int32, 10, 10)
+    v = array_from_host(vh)
+    s = AK.accumulate(+, v; init=0, dims=2, inclusive=false)
+    sh = Array(s)
+    @test all([sh[i, :] == 0:9 for i in 1:10])
+
+    # Test init value is respected with exclusive scan too: each row must equal 10:19
+    vh = ones(Int32, 10, 10)
+    v = array_from_host(vh)
+    s = AK.accumulate(+, v; init=10, dims=2, inclusive=false)
+    sh = Array(s)
+    @test all([sh[i, :] == 10:19 for i in 1:10])
+
+    # Testing different settings; smoke test only, the results are not asserted
+    AK.accumulate(
+        (x, y) -> x + 1,
+        array_from_host(rand(Int32, 3, 4, 5)),
+        init=Int32(0),
+        neutral=Int32(0),
+        dims=2,
+        block_size=64,
+        temp=array_from_host(zeros(Int32, 3, 1, 5)),
+    )
+    AK.accumulate(
+        (x, y) -> x + 1,
+        array_from_host(rand(Int32, 3, 4, 5)),
+        init=Int32(0),
+        neutral=Int32(0),
+        dims=3,
+        block_size=64,
+        temp=array_from_host(zeros(Int32, 3, 4, 1)),
+    )
+end
+@testset "cumsum" begin
+    # AK.cumsum checked against Base.cumsum on vectors and along each dim of 3D arrays.
+    Random.seed!(0)
+
+    # Simple correctness test on the integers 1:100
+    v = array_from_host(1:100)
+    vh = Array(v)
+    @test Array(AK.cumsum(v)) == cumsum(vh)
+
+    # Fuzzy testing: random-length Float32 vectors, compared approximately
+    for _ in 1:100
+        num_elems = rand(1:100_000)
+        vh = rand(Float32, num_elems)
+        v = array_from_host(vh)
+        @test all(Array(AK.cumsum(v)) .≈ cumsum(vh))
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:10)
+            n2 = rand(1:10)
+            n3 = rand(1:10)
+            vh = rand(Int32(-5):Int32(5), n1, n2, n3)
+            v = array_from_host(vh)
+
+            # Indexing into array as if linear; not supported in Base, so the check stays disabled
+            # @test all(Array(AK.cumsum(v)) .== cumsum(vh))
+
+            # Along dimensions: exact comparison with Base.cumsum for the same dims
+            r = Array(AK.cumsum(v, dims=dims))
+            rh = cumsum(vh, dims=dims)
+
+            @test r == rh
+        end
+    end
+
+    # Test promotion to op-dictated type (Bool input must match Base.cumsum's result)
+    xh = rand(Bool, 16)
+    x = array_from_host(xh)
+    @test Array(AK.cumsum(x)) == cumsum(xh)
+
+    # Testing different settings; smoke test only, the result is not asserted
+    v = array_from_host(rand(-5:5, 100_000))
+    AK.cumsum(v, block_size=64)
+
+    # The other settings are stress-tested in reduce
+end
+
+
+@testset "cumprod" begin
+    # AK.cumprod checked against Base.cumprod on vectors and along each dim of 3D arrays.
+    Random.seed!(0)
+
+    # Simple correctness test on the integers 1:100
+    v = array_from_host(1:100)
+    vh = Array(v)
+    @test Array(AK.cumprod(v)) == cumprod(vh)
+
+    vh = ones(Float32, 100_000)  # cumulative product of ones must reproduce the input exactly
+    v = array_from_host(vh)
+    @test Array(AK.cumprod(v)) == vh
+
+    # Fuzzy testing along each dim of small random 3D Int32 arrays
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:10)
+            n2 = rand(1:10)
+            n3 = rand(1:10)
+            vh = rand(Int32(-5):Int32(5), n1, n2, n3)
+            v = array_from_host(vh)
+
+            # Indexing into array as if linear; not supported in Base, so the check stays disabled
+            # @test all(Array(AK.cumprod(v)) .== cumprod(vh))
+
+            # Along dimensions: exact comparison with Base.cumprod for the same dims
+            r = Array(AK.cumprod(v, dims=dims))
+            rh = cumprod(vh, dims=dims)
+
+            @test r == rh
+        end
+    end
+
+    # Testing different settings; smoke test only, the result is not asserted
+    v = array_from_host(rand(-5:5, 100_000))
+    AK.cumprod(v, block_size=64)
+
+    # The other settings are stress-tested in reduce
+end
diff --git a/test/binarysearch.jl b/test/binarysearch.jl
new file mode 100644
index 0000000..806df71
--- /dev/null
+++ b/test/binarysearch.jl
@@ -0,0 +1,87 @@
+@testset "searchsorted" begin
+    # AK.searchsortedfirst/last (in-place and allocating) checked against Base element by element.
+    Random.seed!(0)
+
+    # Fuzzy correctness testing of searchsortedfirst: array_from_host vs AK on Arrays vs Base
+    for _ in 1:100
+        num_elems_v = rand(1:100_000)
+        num_elems_x = rand(1:100_000)
+
+        # Ints: sorted haystack v, unsorted needles x, Int32 output indices
+        v = array_from_host(sort(rand(Int32, num_elems_v)))
+        x = array_from_host(rand(Int32, num_elems_x))
+        ix = similar(x, Int32)
+        AK.searchsortedfirst!(ix, v, x)
+
+        vh = Array(v)
+        xh = Array(x)
+        ixh = AK.searchsortedfirst(vh, xh)
+        ixh_base = [searchsortedfirst(vh, e) for e in xh]
+
+        @test all(Array(ix) .== ixh .== ixh_base)
+
+        # Floats: same procedure with Float32 data
+        v = array_from_host(sort(rand(Float32, num_elems_v)))
+        x = array_from_host(rand(Float32, num_elems_x))
+        ix = similar(x, Int32)
+        AK.searchsortedfirst!(ix, v, x)
+
+        vh = Array(v)
+        xh = Array(x)
+        ixh = AK.searchsortedfirst(vh, xh)
+        ixh_base = [searchsortedfirst(vh, e) for e in xh]
+
+        @test all(Array(ix) .== ixh .== ixh_base)
+    end
+
+    # Fuzzy correctness testing of searchsortedlast: array_from_host vs AK on Arrays vs Base
+    for _ in 1:100
+        num_elems_v = rand(1:100_000)
+        num_elems_x = rand(1:100_000)
+
+        # Ints: sorted haystack v, unsorted needles x, Int32 output indices
+        v = array_from_host(sort(rand(Int32, num_elems_v)))
+        x = array_from_host(rand(Int32, num_elems_x))
+        ix = similar(x, Int32)
+        AK.searchsortedlast!(ix, v, x)
+
+        vh = Array(v)
+        xh = Array(x)
+        ixh = AK.searchsortedlast(vh, xh)
+        ixh_base = [searchsortedlast(vh, e) for e in xh]
+
+        @test all(Array(ix) .== ixh .== ixh_base)
+
+        # Floats: same procedure with Float32 data
+        v = array_from_host(sort(rand(Float32, num_elems_v)))
+        x = array_from_host(rand(Float32, num_elems_x))
+        ix = similar(x, Int32)
+        AK.searchsortedlast!(ix, v, x)
+
+        vh = Array(v)
+        xh = Array(x)
+        ixh = AK.searchsortedlast(vh, xh)
+        ixh_base = [searchsortedlast(vh, e) for e in xh]
+
+        @test all(Array(ix) .== ixh .== ixh_base)
+    end
+
+    # Testing different settings on array_from_host inputs; smoke test only, nothing is asserted
+    v = array_from_host(sort(rand(Int32, 100_000)))
+    x = array_from_host(rand(Int32, 10_000))
+    ix = similar(x, Int32)
+
+    AK.searchsortedfirst!(ix, v, x, by=abs, lt=(>), rev=true, block_size=64)
+    AK.searchsortedfirst(v, x, by=abs, lt=(>), rev=true, block_size=64)
+    AK.searchsortedlast!(ix, v, x, by=abs, lt=(>), rev=true, block_size=64)
+    AK.searchsortedlast(v, x, by=abs, lt=(>), rev=true, block_size=64)
+
+    vh = Array(v)  # same four calls again on plain Arrays, with max_tasks/min_elems instead of block_size
+    xh = Array(x)
+    ixh = similar(xh, Int32)
+
+    AK.searchsortedfirst!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
+    AK.searchsortedfirst(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
+    AK.searchsortedlast!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
+    AK.searchsortedlast(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
+end
diff --git a/test/looping.jl b/test/looping.jl
new file mode 100644
index 0000000..41a9872
--- /dev/null
+++ b/test/looping.jl
@@ -0,0 +1,112 @@
+
+@testset "foreachindex" begin
+    Random.seed!(0)  # NOTE(review): no rand calls follow in this testset
+
+    # CPU: plain Arrays; default call, then max_tasks/min_elems and both scheduler symbols
+    if BACKEND == CPU()
+        x = zeros(Int, 1000)
+        AK.foreachindex(x) do i
+            x[i] = i
+        end
+        @test all(x .== 1:length(x))
+
+        x = zeros(Int, 1000)
+        AK.foreachindex(x, max_tasks=1, min_elems=1) do i
+            x[i] = i
+        end
+        @test all(x .== 1:length(x))
+
+        x = zeros(Int, 1000)
+        AK.foreachindex(x, max_tasks=10, min_elems=1) do i
+            x[i] = i
+        end
+        @test all(x .== 1:length(x))
+
+        x = zeros(Int, 1000)
+        AK.foreachindex(x, max_tasks=10, min_elems=10, scheduler=:threads) do i
+            x[i] = i
+        end
+        @test all(x .== 1:length(x))
+
+        x = zeros(Int, 1000)
+        AK.foreachindex(x, max_tasks=10, min_elems=10, scheduler=:polyester) do i
+            x[i] = i
+        end
+        @test all(x .== 1:length(x))
+
+    # GPU (any BACKEND other than CPU()): default call, then an explicit block_size
+    else
+        x = array_from_host(zeros(Int, 10_000))
+        f1(x) = AK.foreachindex(x) do i     # This must be inside a function to have a known type!
+            x[i] = i
+        end
+        f1(x)
+        xh = Array(x)
+        @test all(xh .== 1:length(xh))
+
+        x = array_from_host(zeros(Int, 10_000))
+        f2(x) = AK.foreachindex(x, block_size=64) do i
+            x[i] = i
+        end
+        f2(x)
+        xh = Array(x)
+        @test all(xh .== 1:length(xh))
+    end
+end
+
+
+@testset "foraxes" begin
+    Random.seed!(0)  # NOTE(review): no rand calls follow in this testset
+
+    f1(x; kwargs...) = AK.foraxes(x, 1; kwargs...) do i  # closure gets an index i along dim 1 and fills row i
+        for j in axes(x, 2)
+            x[i, j] = i + j
+        end
+    end
+
+    x = array_from_host(zeros(Int, 10, 1000))
+    f1(x)
+    xh = Array(x)
+    @test all(xh .== (1:10) .+ (1:1000)')
+
+    x = array_from_host(zeros(UInt32, 10, 1000))
+    f1(x, scheduler=:threads, max_tasks=2, min_elems=100, block_size=64)
+    xh = Array(x)
+    @test all(xh .== (1:10) .+ (1:1000)')
+
+    x = array_from_host(zeros(Float32, 10, 1000))
+    f1(x, scheduler=:polyester, max_tasks=4, min_elems=500, block_size=128)
+    xh = Array(x)
+    @test all(xh .≈ (1:10) .+ (1:1000)')
+
+    f2(x; kwargs...) = AK.foraxes(x, 2; kwargs...) do j  # closure gets an index j along dim 2 and fills column j
+        for i in axes(x, 1)
+            x[i, j] = i + j
+        end
+    end
+
+    x = array_from_host(zeros(Int, 10, 1000))
+    f2(x)
+    xh = Array(x)
+    @test all(xh .== (1:10) .+ (1:1000)')
+
+    x = array_from_host(zeros(UInt32, 10, 1000))
+    f2(x, scheduler=:threads, max_tasks=2, min_elems=100, block_size=64)
+    xh = Array(x)
+    @test all(xh .== (1:10) .+ (1:1000)')
+
+    x = array_from_host(zeros(Float32, 10, 1000))
+    f2(x, scheduler=:polyester, max_tasks=4, min_elems=500, block_size=128)
+    xh = Array(x)
+    @test all(xh .≈ (1:10) .+ (1:1000)')
+
+    # dims is nothing: behaves like foreachindex, i.e. the closure gets linear indices
+    f3(x; kwargs...) = AK.foraxes(x, nothing; kwargs...) do i
+        x[i] = i
+    end
+
+    x = array_from_host(zeros(Int, 10, 1000))
+    f3(x)
+    xh = Array(x)
+    @test all(xh[:] .== 1:length(x))
+end
diff --git a/test/map.jl b/test/map.jl
new file mode 100644
index 0000000..d5e4d0b
--- /dev/null
+++ b/test/map.jl
@@ -0,0 +1,52 @@
+@testset "map" begin
+    Random.seed!(0)  # fixed seed so the random Float32 inputs below are reproducible
+
+    # CPU: plain Arrays; AK.map / AK.map! compared exactly with Base.map, incl. scheduler keywords
+    if BACKEND == CPU()
+        x = Array(1:1000)
+        y = AK.map(x) do i
+            i^2
+        end
+        @test y == map(i -> i^2, x)
+
+        x = Array(1:1000)
+        y = zeros(Int, 1000)
+        AK.map!(y, x) do i
+            i^2
+        end
+        @test y == map(i -> i^2, x)
+
+        x = rand(Float32, 1000)
+        y = AK.map(x, scheduler=:threads, max_tasks=2, min_elems=100) do i
+            i > 0.5 ? i : 0
+        end
+        @test y == map(i -> i > 0.5 ? i : 0, x)
+
+        x = rand(Float32, 1000)
+        y = AK.map(x, scheduler=:polyester, max_tasks=4, min_elems=500) do i
+            i > 0.5 ? i : 0
+        end
+        @test y == map(i -> i > 0.5 ? i : 0, x)
+
+    # GPU (any BACKEND other than CPU()): results are copied back with Array before comparing
+    else
+        x = array_from_host(1:1000)
+        y = AK.map(x) do i
+            i^2
+        end
+        @test Array(y) == map(i -> i^2, 1:1000)
+
+        x = array_from_host(1:1000)
+        y = array_from_host(zeros(Int, 1000))
+        AK.map!(y, x) do i
+            i^2
+        end
+        @test Array(y) == map(i -> i^2, 1:1000)
+
+        x = array_from_host(rand(Float32, 1000))
+        y = AK.map(x, block_size=64) do i
+            i > 0.5 ? i : 0
+        end
+        @test Array(y) == map(i -> i > 0.5 ? i : 0, Array(x))
+    end
+end
diff --git a/test/partition.jl b/test/partition.jl
new file mode 100644
index 0000000..507abd7
--- /dev/null
+++ b/test/partition.jl
@@ -0,0 +1,74 @@
+@testset "TaskPartitioner" begin
+    # All tasks needed: (10, 4, 1) gives 4 tasks with chunk sizes 3, 3, 2, 2
+    tp = AK.TaskPartitioner(10, 4, 1)  # presumably (num_elems, max_tasks, min_elems) - TODO confirm
+    @test tp.num_tasks == 4
+    @test length(tp) == tp.num_tasks
+    @test tp[1] === 1:3
+    @test tp[2] === 4:6
+    @test tp[3] === 7:8
+    @test tp[4] === 9:10
+
+    # Not all tasks needed: (20, 6, 5) gives only 4 tasks of 5 elements each
+    tp = AK.TaskPartitioner(20, 6, 5)
+    @test tp.num_tasks == 4
+    @test length(tp) == tp.num_tasks
+    @test tp[1] === 1:5
+    @test tp[2] === 6:10
+    @test tp[3] === 11:15
+    @test tp[4] === 16:20
+end
+
+
+@testset "task_partition" begin
+    Random.seed!(0)  # NOTE(review): no rand calls follow in this testset
+
+    # Single-threaded: positional form (length, 1, 1); every index must be written exactly once
+    x = zeros(Int, 1000)
+    AK.task_partition(length(x), 1, 1) do irange
+        for i in irange
+            x[i] = i
+        end
+    end
+    @test all(x .== 1:length(x))
+
+    # Multi-threaded: a pre-built TaskPartitioner(length(x), 10, 1) is passed instead
+    x = zeros(Int, 1000)
+    tp = AK.TaskPartitioner(length(x), 10, 1)
+    AK.task_partition(tp) do irange
+        for i in irange
+            x[i] = i
+        end
+    end
+    @test all(x .== 1:length(x))
+end
+
+@testset "itask_partition" begin
+    Random.seed!(0)  # NOTE(review): no rand calls follow in this testset
+
+    # Single-threaded: positional form (length, 1, 1); every element must report task id 1
+    x = zeros(Int, 1000)
+    ix = zeros(Int, 1000)
+    AK.itask_partition(length(x), 1, 1) do itask, irange
+        for i in irange
+            x[i] = i
+            ix[i] = itask
+        end
+    end
+    @test all(x .== 1:length(x))
+    @test all(ix .== 1)
+
+    # Multi-threaded: the task id recorded for each element must match the range tp[i] it lies in
+    x = zeros(Int, 1000)
+    ix = zeros(Int, 1000)
+    tp = AK.TaskPartitioner(length(x), 10, 1)
+    AK.itask_partition(tp) do itask, irange
+        for i in irange
+            x[i] = i
+            ix[i] = itask
+        end
+    end
+    @test all(x .== 1:length(x))
+    for i in 1:tp.num_tasks
+        @test all(ix[tp[i]] .== i)
+    end
+end
diff --git a/test/predicates.jl b/test/predicates.jl
new file mode 100644
index 0000000..13dbab3
--- /dev/null
+++ b/test/predicates.jl
@@ -0,0 +1,47 @@
+@testset "truth" begin
+    # AK.any / AK.all with predicate closures; results must be exactly `true` / `false` (===).
+    Random.seed!(0)
+
+    # Simple correctness tests on the integers 1:100
+    v = array_from_host(1:100)
+
+    @test AK.any(x->x<0, v) === false
+    @test AK.any(x->x>99, v) === true
+
+    @test AK.all(x->x>0, v) === true
+    @test AK.all(x->x<100, v) === false
+
+    for _ in 1:100  # fuzz on rand(Float32) data: tests expect no element < 0 and every element < 1
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Float32, num_elems))
+        @test AK.any(x->x<0, v) === false
+        @test AK.any(x->x<1, v) === true
+        @test AK.all(x->x<1, v) === true
+        @test AK.all(x->x<0, v) === false
+    end
+
+    for _ in 1:100  # NOTE(review): body is identical to the loop above; only the random draws differ
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Float32, num_elems))
+        @test AK.any(x->x<0, v) === false
+        @test AK.any(x->x<1, v) === true
+        @test AK.all(x->x<1, v) === true
+        @test AK.all(x->x<0, v) === false
+    end
+
+    # Test the MapReduce algorithm which works on all platforms (passed explicitly via alg)
+    for _ in 1:100
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Float32, num_elems))
+        alg=AK.MapReduce(temp=similar(v, Bool), switch_below=100)
+        @test AK.any(x->x<0, v, alg=alg) === false
+        @test AK.any(x->x<1, v, alg=alg) === true
+        @test AK.all(x->x<1, v, alg=alg) === true
+        @test AK.all(x->x<0, v, alg=alg) === false
+    end
+
+    # Testing different settings; smoke test only, the results are not asserted
+    v = array_from_host(rand(-5:5, 100_000))
+    AK.any(x->x<5, v, max_tasks=2, min_elems=100, block_size=64)
+    AK.all(x->x<5, v, max_tasks=2, min_elems=100, block_size=64)
+end
diff --git a/test/reduce.jl b/test/reduce.jl
new file mode 100644
index 0000000..37e9c58
--- /dev/null
+++ b/test/reduce.jl
@@ -0,0 +1,692 @@
+struct Point  # immutable pair of Float32 coordinates; element type for the minbox mapreduce tests
+    x::Float32
+    y::Float32
+end
+# zero(Point) is the origin; only for backend-agnostic initialisation with KernelAbstractions.zero
+Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
+
+@testset "reduce_1d" begin
+    Random.seed!(0)  # fixed seed so the random sizes and values below are reproducible
+
+    function redmin(s)
+        # Reduction-based minimum finder; typemax of the element type is both init and neutral
+        AK.reduce(
+            (x, y) -> x < y ? x : y,
+            s;
+            init=typemax(eltype(s)),
+            neutral=typemax(eltype(s)),
+        )
+    end
+
+    # Fuzzy correctness testing of redmin against Base.minimum: Int32, UInt32, then Float32
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Int32, num_elems))
+        s = redmin(v)
+        vh = Array(v)
+        @test s == minimum(vh)
+    end
+
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(UInt32, num_elems))
+        s = redmin(v)
+        vh = Array(v)
+        @test s == minimum(vh)
+    end
+
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Float32, num_elems))
+        s = redmin(v)
+        vh = Array(v)
+        @test s == minimum(vh)
+    end
+
+    function redsum(s)
+        # Reduction-based summation; zero of the element type is both init and neutral
+        AK.reduce(
+            (x, y) -> x + y,
+            s;
+            init=zero(eltype(s)),
+            neutral=zero(eltype(s)),
+        )
+    end
+
+    # Fuzzy correctness testing of redsum against Base.sum (the Float32 case is compared with ≈)
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(1:100, num_elems), Int32)
+        s = redsum(v)
+        vh = Array(v)
+        @test s == sum(vh)
+    end
+
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(1:100, num_elems), UInt32)
+        s = redsum(v)
+        vh = Array(v)
+        @test s == sum(vh)
+    end
+
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Float32, num_elems))
+        s = redsum(v)
+        vh = Array(v)
+        @test s ≈ sum(vh)
+    end
+
+    # Allowing N-dimensional arrays, still reduced as 1D (no dims keyword is passed)
+    for _ in 1:100
+        n1 = rand(1:100)
+        n2 = rand(1:100)
+        n3 = rand(1:100)
+        vh = rand(Float32, n1, n2, n3)
+        v = array_from_host(vh)
+        s = redsum(v)
+        @test s ≈ sum(vh)
+    end
+
+    # Ensuring that the init value is respected (init=10 must be added on top of the plain sum)
+    for _ in 1:100
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Int32(1):Int32(100), num_elems))
+        s = AK.reduce(+, v; init=Int32(10))
+        vh = Array(v)
+        @test s == sum(vh) + 10
+    end
+
+    # Testing with switch_below (random value in 1:100) - i.e. finishing on the CPU
+    for _ in 1:100
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(1:100, num_elems), Int32)
+        switch_below = rand(1:100)
+        init = rand(1:100)
+        s = AK.reduce(+, v; switch_below=switch_below, init=Int32(init))
+        vh = Array(v)
+        @test s == reduce(+, vh, init=init)
+    end
+
+    # Test with unmaterialised ranges: the UnitRange is passed together with BACKEND
+    for _ in 1:100
+        num_elems = rand(1:1000)
+        v = 1:num_elems
+        s = AK.reduce(+, v, BACKEND; init=Int32(0))
+        vh = Array(v)
+        @test s == reduce(+, vh)
+    end
+
+    # Testing different settings; smoke test only (second call takes a plain Array), nothing asserted
+    AK.reduce(
+        (x, y) -> x + 1,
+        array_from_host(rand(Int32, 10_000)),
+        init=Int32(0),
+        neutral=Int64(0),
+        block_size=64,
+        temp=array_from_host(zeros(Int32, 10_000)),
+        switch_below=50,
+        scheduler=:dynamic,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.reduce(
+        (x, y) -> x + 1,
+        rand(Int32, 10_000),
+        init=Int32(0),
+        neutral=Int64(0),
+        scheduler=:greedy,
+        max_tasks=16,
+        min_elems=1000,
+    )
+end
+
+
+@testset "reduce_nd" begin
+    Random.seed!(0)  # fixed seed so the random shapes and values below are reproducible
+
+    # Test all possible corner cases against Base's sum with init (axis sizes 0:3, dims 1:4 on 3D input)
+    for dims in 1:4
+        for isize in 0:3
+            for jsize in 0:3
+                for ksize in 0:3
+                    sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
+                    s = array_from_host(sh)
+                    d = AK.reduce(+, s; init=Int32(10), dims=dims)
+                    dh = Array(d)
+                    @test dh == sum(sh, init=Int32(10), dims=dims)
+                    @test eltype(dh) == eltype(sum(sh, init=Int32(10), dims=dims))
+                end
+            end
+        end
+    end
+
+    # Fuzzy correctness testing on random 3D shapes: Int32 first, then UInt32 and Float32 below
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.reduce(+, v; init=Int32(0), dims=dims)
+            sh = Array(s)
+            @test sh == sum(vh, dims=dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.reduce(+, v; init=UInt32(0), dims=dims)
+            sh = Array(s)
+            @test sh == sum(vh, dims=dims)
+        end
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Float32, n1, n2, n3)
+            v = array_from_host(vh)
+            s = AK.reduce(+, v; init=Float32(0), dims=dims)
+            sh = Array(s)
+            @test sh ≈ sum(vh, dims=dims)
+        end
+    end
+
+    # Ensuring that the init value is respected (dims runs to 4, one past the 3 array dimensions)
+    for _ in 1:100
+        for dims in 1:4
+            n1 = rand(1:100)
+            n2 = rand(1:100)
+            n3 = rand(1:100)
+            vh = rand(Int32(1):Int32(100), n1, n2, n3)
+            v = array_from_host(vh)
+            init = rand(1:100)
+            s = AK.reduce(+, v; init=Int32(init), dims=dims)
+            sh = Array(s)
+            @test sh == reduce(+, vh, dims=dims, init=init)
+        end
+    end
+
+    # Testing different settings; smoke test only, the results are not asserted
+    AK.reduce(
+        (x, y) -> x + 1,
+        array_from_host(rand(Int32, 3, 4, 5)),
+        init=Int32(0),
+        neutral=Int32(0),
+        dims=2,
+        block_size=64,
+        temp=array_from_host(zeros(Int32, 3, 1, 5)),
+        switch_below=50,
+        scheduler=:dynamic,
+        max_tasks=10,
+        min_elems=100,
+    )
+    AK.reduce(
+        (x, y) -> x + 1,
+        array_from_host(rand(Int32, 3, 4, 5)),
+        init=Int32(0),
+        neutral=Int32(0),
+        dims=3,
+        block_size=64,
+        temp=array_from_host(zeros(Int32, 3, 4, 1)),
+        switch_below=50,
+        scheduler=:greedy,
+        max_tasks=16,
+        min_elems=1000,
+    )
+end
+
+
+@testset "mapreduce_1d" begin
+    Random.seed!(0)  # fixed seed so the random sizes and values below are reproducible
+
+    function minbox(s)
+        # Extract coordinates into tuple and reduce to find dimensionwise minima (via AK.mapreduce)
+        AK.mapreduce(
+            p -> (p.x, p.y),
+            (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
+            s;
+            init=(typemax(Float32), typemax(Float32)),
+            neutral=(typemax(Float32), typemax(Float32)),
+        )
+    end
+
+    function minbox_base(s)
+        # Same map and reduction through Base.mapreduce (no neutral keyword), used as the reference
+        Base.mapreduce(
+            p -> (p.x, p.y),
+            (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
+            s;
+            init=(typemax(Float32), typemax(Float32)),
+        )
+    end
+
+    # Fuzzy correctness testing: array_from_host result vs AK on a plain Array vs Base
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:num_elems])
+        mgpu = minbox(v)
+
+        vh = Array(v)
+        mcpu = minbox(vh)
+        mbase = minbox_base(vh)
+
+        @test typeof(mgpu) === typeof(mcpu) === typeof(mbase)
+        @test mgpu[1] ≈ mcpu[1] ≈ mbase[1]
+        @test mgpu[2] ≈ mcpu[2] ≈ mbase[2]
+    end
+
+    # Allowing N-dimensional arrays, still reduced as 1D (no dims keyword is passed)
+    for _ in 1:100
+        n1 = rand(1:100)
+        n2 = rand(1:100)
+        n3 = rand(1:100)
+
+        v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:n1, _ in 1:n2, _ in 1:n3])
+        mgpu = minbox(v)
+
+        vh = Array(v)
+        mcpu = minbox(vh)
+        mbase = minbox_base(vh)
+
+        @test typeof(mgpu) === typeof(mcpu) === typeof(mbase)
+        @test mgpu[1] ≈ mcpu[1] ≈ mbase[1]
+        @test mgpu[2] ≈ mcpu[2] ≈ mbase[2]
+    end
+
+    # Ensuring that the init value is respected (inputs are positive, so abs leaves the sum unchanged)
+    for _ in 1:100
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Int32(1):Int32(100), num_elems))
+        s = AK.mapreduce(abs, +, v; init=Int32(10))
+        vh = Array(v)
+        @test s == sum(vh) + 10
+    end
+
+    # Testing with switch_below (random value in 1:100) - i.e. finishing on the CPU
+    for _ in 1:100
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(-100:-1, num_elems), Int32)
+        switch_below = rand(1:100)
+        init = rand(1:100)
+        s = AK.mapreduce(abs, +, v; switch_below=switch_below, init=Int32(init))
+        vh = Array(v)
+        @test s == mapreduce(abs, +, vh, init=init)
+    end
+
+    # Test with unmaterialised ranges: the UnitRange is passed together with BACKEND
+    for _ in 1:100
+        num_elems = rand(1:1000)
+        v = 1:num_elems
+        s = AK.mapreduce(abs, +, v, BACKEND; init=Int32(0))
+        vh = Array(v)
+        @test s == mapreduce(abs, +, vh)
+    end
+
+    # Testing different settings, enforcing change of type between f and op (smoke test, no assert)
+    f(s, temp) = AK.mapreduce(
+        p -> (p.x, p.y),
+        (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
+        s,
+        init=(typemax(Float32), typemax(Float32)),
+        neutral=(typemax(Float32), typemax(Float32)),
+        block_size=64,
+        temp=temp,
+        switch_below=50,
+        scheduler=:dynamic,
+        max_tasks=10,
+        min_elems=100,
+    )
+    v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:10_042])
+    temp = similar(v, Tuple{Float32, Float32})  # temp has the mapped tuple eltype, not Point
+    f(v, temp)
+end
+
+
+@testset "mapreduce_nd" begin
+    Random.seed!(0)
+
+    # Exhaustive sweep over tiny shapes (every extent from 0 to 3, so empty
+    # arrays are included) and over reduction dimensions 1 to 4, the last one
+    # being past the rank of the array; values and eltype must match Base
+    for dims in 1:4, isize in 0:3, jsize in 0:3, ksize in 0:3
+        sh = rand(Int32(-100):Int32(100), isize, jsize, ksize)
+        s = array_from_host(sh)
+
+        dh = Array(AK.mapreduce(-, +, s; init=Int32(-10), dims=dims))
+        expected = mapreduce(-, +, sh, init=Int32(-10), dims=dims)
+
+        @test dh == expected
+        @test eltype(dh) == eltype(expected)
+    end
+
+    # Fuzzy correctness testing: 3D arrays with every extent drawn from 1:100,
+    # negated and summed along each of their three dimensions
+    for _ in 1:100, dims in 1:3
+        extents = (rand(1:100), rand(1:100), rand(1:100))
+        vh = rand(Int32(1):Int32(100), extents...)
+        v = array_from_host(vh)
+
+        sh = Array(AK.mapreduce(-, +, v; init=Int32(0), dims=dims))
+        @test sh == mapreduce(-, +, vh, init=Int32(0), dims=dims)
+    end
+
+    # Building blocks shared by the two bounding-box reductions below
+    coords(p) = (p.x, p.y)      # point -> coordinate tuple
+    lower(a, b) = (             # component-wise smaller of two coordinate tuples
+        a[1] < b[1] ? a[1] : b[1],
+        a[2] < b[2] ? a[2] : b[2],
+    )
+    upper = (typemax(Float32), typemax(Float32))
+
+    # Dimension-wise minima of the point coordinates, computed by AK
+    minbox(s, dims) = AK.mapreduce(
+        coords,
+        lower,
+        s;
+        init=upper,
+        neutral=upper,
+        dims=dims,
+    )
+
+    # The same reduction through Base.mapreduce, used as the reference
+    minbox_base(s, dims) = Base.mapreduce(
+        coords,
+        lower,
+        s;
+        init=upper,
+        dims=dims,
+    )
+
+    # Fuzzy correctness testing on arrays of points: the backend array, the
+    # same data as a host Array, and Base must all agree
+    for _ in 1:100, dims in 1:3
+        n1 = rand(1:100)
+        n2 = rand(1:100)
+        n3 = rand(1:100)
+        pts = [Point(rand(Float32), rand(Float32)) for _ in 1:n1, _ in 1:n2, _ in 1:n3]
+        v = array_from_host(pts)
+        mgpu = minbox(v, dims)
+
+        vh = Array(v)
+        mcpu = minbox(vh, dims)
+        mbase = minbox_base(vh, dims)
+
+        @test eltype(mgpu) === eltype(mcpu) === eltype(mbase)
+        @test all(
+            (g[1] ≈ mcpu[i][1] ≈ mbase[i][1]) && (g[2] ≈ mcpu[i][2] ≈ mbase[i][2])
+            for (i, g) in enumerate(Array(mgpu))
+        )
+    end
+
+    # Ensuring that the init value is respected; the Base reference receives
+    # the same number without the Int32 conversion
+    for _ in 1:100, dims in 1:4
+        extents = (rand(1:100), rand(1:100), rand(1:100))
+        vh = rand(Int32(-100):Int32(100), extents...)
+        v = array_from_host(vh)
+
+        start = rand(1:100)
+        sh = Array(AK.mapreduce(-, +, v; init=Int32(start), dims=dims))
+        @test sh == mapreduce(-, +, vh, dims=dims, init=start)
+    end
+
+    # Testing different settings; these calls only need to run, their results
+    # are not inspected
+    src_a = array_from_host(rand(Int32, 3, 4, 5))
+    AK.mapreduce(
+        -,
+        (x, y) -> x + 1,
+        src_a;
+        init=Int32(0),
+        neutral=Int32(0),
+        dims=2,
+        block_size=64,
+        temp=array_from_host(zeros(Int32, 3, 1, 5)),
+        switch_below=50,
+        scheduler=:dynamic,
+        max_tasks=10,
+        min_elems=100,
+    )
+
+    src_b = array_from_host(rand(Int32, 3, 4, 5))
+    AK.mapreduce(
+        -,
+        (x, y) -> x + 1,
+        src_b;
+        init=Int32(0),
+        neutral=Int32(0),
+        dims=3,
+        block_size=64,
+        temp=array_from_host(zeros(Int32, 3, 4, 1)),
+        switch_below=50,
+        scheduler=:greedy,
+        max_tasks=16,
+        min_elems=1000,
+    )
+end
+@testset "sum" begin
+
+    Random.seed!(0)
+
+    # Sanity check on a short integer range
+    small = array_from_host(1:100)
+    @test AK.sum(small) == sum(Array(small))
+
+    # Random-length Float32 vectors; floating-point totals are compared with ≈
+    for _ in 1:100
+        len = rand(1:100_000)
+        xs = array_from_host(rand(Float32, len))
+        @test AK.sum(xs) ≈ sum(Array(xs))
+    end
+
+    # Random 3D Int32 arrays with every extent drawn from 1:100 and every
+    # element from -5:5; 100 repetitions for each of the three dimensions
+    for _ in 1:100, dims in 1:3
+        extents = (rand(1:100), rand(1:100), rand(1:100))
+        host = rand(Int32(-5):Int32(5), extents...)
+        dev = array_from_host(host)
+
+        # Whole-array reduction, indexing the array as if it were linear
+        total = AK.sum(dev)
+        @test total == sum(host)
+
+        # Reduction along `dims`, copied back to the host before comparing
+        partial = Array(AK.sum(dev, dims=dims))
+        expected = sum(host, dims=dims)
+
+        @test partial == expected
+    end
+
+    # Non-default settings: only exercises the call with a custom block size,
+    # the returned value is not checked
+    ints = array_from_host(rand(-5:5, 100_000))
+    AK.sum(ints, block_size=64)
+
+    # The other settings are stress-tested in reduce
+end
+
+
+@testset "prod" begin
+
+    Random.seed!(0)
+
+    # Simple correctness test on a short integer range
+    v = array_from_host(1:100)
+    @test AK.prod(v) == prod(Array(v))
+
+    # Fuzzy testing on random-length Float32 vectors
+    for _ in 1:100
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Float32, num_elems))
+        @test AK.prod(v) ≈ prod(Array(v))
+    end
+
+    for _ in 1:100
+        for dims in 1:3
+            n1 = rand(1:10)
+            n2 = rand(1:10)
+            n3 = rand(1:10)
+            vh = rand(Int32(-5):Int32(5), n1, n2, n3)
+            v = array_from_host(vh)
+
+            # Indexing into array as if linear; the full product can overflow, so both
+            # sides are truncated to Int32 to not depend on either accumulator's width
+            @test AK.prod(v) % Int32 == prod(vh) % Int32
+
+            # Along dimensions: at most 10 factors of magnitude <= 5, so no overflow
+            r = Array(AK.prod(v, dims=dims))
+            rh = prod(vh, dims=dims)
+            @test r == rh
+        end
+    end
+
+    # Testing different settings; the result of this call is not checked
+    v = array_from_host(rand(-5:5, 100_000))
+    AK.prod(v, block_size=64)
+
+    # The other settings are stress-tested in reduce
+end
+
+
+@testset "minimum" begin
+
+    Random.seed!(0)
+
+    # Sanity check on a short integer range
+    small = array_from_host(1:100)
+    @test AK.minimum(small) == minimum(Array(small))
+
+    # Random-length Float32 vectors; the result must equal Base's exactly
+    for _ in 1:100
+        len = rand(1:100_000)
+        xs = array_from_host(rand(Float32, len))
+        @test AK.minimum(xs) == minimum(Array(xs))
+    end
+
+    # Random 3D arrays spanning the full Int32 range, with every extent drawn
+    # from 1:100; 100 repetitions for each of the three dimensions
+    for _ in 1:100, dims in 1:3
+        extents = (rand(1:100), rand(1:100), rand(1:100))
+        host = rand(Int32, extents...)
+        dev = array_from_host(host)
+
+        # Whole-array reduction, indexing the array as if it were linear
+        smallest = AK.minimum(dev)
+        @test smallest == minimum(host)
+
+        # Reduction along `dims`, copied back to the host before comparing
+        partial = Array(AK.minimum(dev, dims=dims))
+        expected = minimum(host, dims=dims)
+
+        @test partial == expected
+    end
+
+    # Non-default settings: only exercises the call with a custom block size,
+    # the returned value is not checked
+    ints = array_from_host(rand(-5:5, 100_000))
+    AK.minimum(ints, block_size=64)
+
+    # The other settings are stress-tested in reduce
+end
+
+
+@testset "maximum" begin
+
+    Random.seed!(0)
+
+    # Sanity check on a short integer range
+    small = array_from_host(1:100)
+    @test AK.maximum(small) == maximum(Array(small))
+
+    # Random-length Float32 vectors; the result must equal Base's exactly
+    for _ in 1:100
+        len = rand(1:100_000)
+        xs = array_from_host(rand(Float32, len))
+        @test AK.maximum(xs) == maximum(Array(xs))
+    end
+
+    # Random 3D arrays spanning the full Int32 range, with every extent drawn
+    # from 1:100; 100 repetitions for each of the three dimensions
+    for _ in 1:100, dims in 1:3
+        extents = (rand(1:100), rand(1:100), rand(1:100))
+        host = rand(Int32, extents...)
+        dev = array_from_host(host)
+
+        # Whole-array reduction, indexing the array as if it were linear
+        largest = AK.maximum(dev)
+        @test largest == maximum(host)
+
+        # Reduction along `dims`, copied back to the host before comparing
+        partial = Array(AK.maximum(dev, dims=dims))
+        expected = maximum(host, dims=dims)
+
+        @test partial == expected
+    end
+
+    # Non-default settings: only exercises the call with a custom block size,
+    # the returned value is not checked
+    ints = array_from_host(rand(-5:5, 100_000))
+    AK.maximum(ints, block_size=64)
+
+    # The other settings are stress-tested in reduce
+end
+
+
+@testset "count" begin
+
+    Random.seed!(0)
+
+    # Predicates shared by the checks below
+    above_fifty(x) = x > 50
+    above_half(x) = x > 0.5
+    positive(x) = x > 0
+
+    # Sanity check on a short integer range
+    small = array_from_host(1:100)
+    @test AK.count(above_fifty, small) == count(above_fifty, Array(small))
+
+    # Random-length Float32 vectors
+    for _ in 1:100
+        xs = array_from_host(rand(Float32, rand(1:100_000)))
+        @test AK.count(above_half, xs) == count(above_half, Array(xs))
+    end
+
+    # Random 3D Float32 arrays with every extent drawn from 1:100;
+    # 100 repetitions for each of the three dimensions
+    for _ in 1:100, dims in 1:3
+        extents = (rand(1:100), rand(1:100), rand(1:100))
+        host = rand(Float32, extents...)
+        dev = array_from_host(host)
+
+        # Whole-array count, indexing the array as if it were linear
+        @test AK.count(above_half, dev) == count(above_half, host)
+
+        # Count along `dims`, copied back to the host before comparing
+        partial = Array(AK.count(above_half, dev, dims=dims))
+        expected = count(above_half, host, dims=dims)
+        @test partial == expected
+    end
+
+    # Counting booleans directly, without a predicate
+    for _ in 1:100
+        flags = array_from_host(rand(Bool, rand(1:100_000)))
+        @test AK.count(flags) == count(Array(flags))
+    end
+
+    # Non-default settings: only exercises the call with a custom block size
+    ints = array_from_host(rand(-5:5, 100_000))
+    AK.count(positive, ints, block_size=64)
+
+    # The other settings are stress-tested in reduce
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 0b22dee..6578c74 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -12,1887 +12,54 @@ if "--CUDA" in ARGS
     Pkg.add("CUDA")
     using CUDA
     CUDA.versioninfo()
-    const backend = CUDABackend()
+    const BACKEND = CUDABackend()
 elseif "--oneAPI" in ARGS
     Pkg.add("oneAPI")
     using oneAPI
     oneAPI.versioninfo()
-    const backend = oneAPIBackend()
+    const BACKEND = oneAPIBackend()
 elseif "--AMDGPU" in ARGS
     Pkg.add("AMDGPU")
     using AMDGPU
     AMDGPU.versioninfo()
-    const backend = ROCBackend()
+    const BACKEND = ROCBackend()
 elseif "--Metal" in ARGS
     Pkg.add("Metal")
     using Metal
     Metal.versioninfo()
-    const backend = MetalBackend()
+    const BACKEND = MetalBackend()
 elseif "--OpenCL" in ARGS
     Pkg.add(name="OpenCL", rev="master")
     Pkg.add("pocl_jll")
     using pocl_jll
     using OpenCL
     OpenCL.versioninfo()
-    const backend = OpenCLBackend()
-elseif !@isdefined(backend)
+    const BACKEND = OpenCLBackend()
+elseif !@isdefined(BACKEND)
     # Otherwise do CPU tests
     using InteractiveUtils
     InteractiveUtils.versioninfo()
-    const backend = CPU()
+    const BACKEND = CPU()
 end
 
 
-function array_from_host(h_arr::AbstractArray, dtype=nothing)
+array_from_host(h_arr::AbstractArray, dtype=nothing) = array_from_host(BACKEND, h_arr, dtype)  # Convenience method: forwards to the global BACKEND
+function array_from_host(backend, h_arr::AbstractArray, dtype=nothing)  # Returns a copy of h_arr allocated on `backend`; dtype=nothing keeps eltype(h_arr)
     d_arr = KernelAbstractions.zeros(backend, isnothing(dtype) ? eltype(h_arr) : dtype, size(h_arr))
     copyto!(d_arr, h_arr isa Array ? h_arr : Array(h_arr))      # Allow unmaterialised types, e.g. ranges
     d_arr
 end
 
-
 @testset "Aqua" begin
     using Aqua
     Aqua.test_all(AK)
 end
 
-
-@testset "TaskPartitioner" begin
-    # All tasks needed
-    tp = AK.TaskPartitioner(10, 4, 1)
-    @test tp.num_tasks == 4
-    @test length(tp) == tp.num_tasks
-    @test tp[1] === 1:3
-    @test tp[2] === 4:6
-    @test tp[3] === 7:8
-    @test tp[4] === 9:10
-
-    # Not all tasks needed
-    tp = AK.TaskPartitioner(20, 6, 5)
-    @test tp.num_tasks == 4
-    @test length(tp) == tp.num_tasks
-    @test tp[1] === 1:5
-    @test tp[2] === 6:10
-    @test tp[3] === 11:15
-    @test tp[4] === 16:20
-end
-
-
-@testset "task_partition" begin
-    Random.seed!(0)
-
-    # Single-threaded
-    x = zeros(Int, 1000)
-    AK.task_partition(length(x), 1, 1) do irange
-        for i in irange
-            x[i] = i
-        end
-    end
-    @test all(x .== 1:length(x))
-
-    # Multi-threaded
-    x = zeros(Int, 1000)
-    tp = AK.TaskPartitioner(length(x), 10, 1)
-    AK.task_partition(tp) do irange
-        for i in irange
-            x[i] = i
-        end
-    end
-    @test all(x .== 1:length(x))
-end
-
-
-@testset "itask_partition" begin
-    Random.seed!(0)
-
-    # Single-threaded
-    x = zeros(Int, 1000)
-    ix = zeros(Int, 1000)
-    AK.itask_partition(length(x), 1, 1) do itask, irange
-        for i in irange
-            x[i] = i
-            ix[i] = itask
-        end
-    end
-    @test all(x .== 1:length(x))
-    @test all(ix .== 1)
-
-    # Multi-threaded
-    x = zeros(Int, 1000)
-    ix = zeros(Int, 1000)
-    tp = AK.TaskPartitioner(length(x), 10, 1)
-    AK.itask_partition(tp) do itask, irange
-        for i in irange
-            x[i] = i
-            ix[i] = itask
-        end
-    end
-    @test all(x .== 1:length(x))
-    for i in 1:tp.num_tasks
-        @test all(ix[tp[i]] .== i)
-    end
-end
-
-
-@testset "foreachindex" begin
-    Random.seed!(0)
-
-    # CPU
-    if backend == CPU()
-        x = zeros(Int, 1000)
-        AK.foreachindex(x) do i
-            x[i] = i
-        end
-        @test all(x .== 1:length(x))
-
-        x = zeros(Int, 1000)
-        AK.foreachindex(x, max_tasks=1, min_elems=1) do i
-            x[i] = i
-        end
-        @test all(x .== 1:length(x))
-
-        x = zeros(Int, 1000)
-        AK.foreachindex(x, max_tasks=10, min_elems=1) do i
-            x[i] = i
-        end
-        @test all(x .== 1:length(x))
-
-        x = zeros(Int, 1000)
-        AK.foreachindex(x, max_tasks=10, min_elems=10, scheduler=:threads) do i
-            x[i] = i
-        end
-        @test all(x .== 1:length(x))
-
-        x = zeros(Int, 1000)
-        AK.foreachindex(x, max_tasks=10, min_elems=10, scheduler=:polyester) do i
-            x[i] = i
-        end
-        @test all(x .== 1:length(x))
-
-    # GPU
-    else
-        x = array_from_host(zeros(Int, 10_000))
-        f1(x) = AK.foreachindex(x) do i     # This must be inside a function to have a known type!
-            x[i] = i
-        end
-        f1(x)
-        xh = Array(x)
-        @test all(xh .== 1:length(xh))
-
-        x = array_from_host(zeros(Int, 10_000))
-        f2(x) = AK.foreachindex(x, block_size=64) do i
-            x[i] = i
-        end
-        f2(x)
-        xh = Array(x)
-        @test all(xh .== 1:length(xh))
-    end
-end
-
-
-@testset "foraxes" begin
-    Random.seed!(0)
-
-    f1(x; kwargs...) = AK.foraxes(x, 1; kwargs...) do i
-        for j in axes(x, 2)
-            x[i, j] = i + j
-        end
-    end
-
-    x = array_from_host(zeros(Int, 10, 1000))
-    f1(x)
-    xh = Array(x)
-    @test all(xh .== (1:10) .+ (1:1000)')
-
-    x = array_from_host(zeros(UInt32, 10, 1000))
-    f1(x, scheduler=:threads, max_tasks=2, min_elems=100, block_size=64)
-    xh = Array(x)
-    @test all(xh .== (1:10) .+ (1:1000)')
-
-    x = array_from_host(zeros(Float32, 10, 1000))
-    f1(x, scheduler=:polyester, max_tasks=4, min_elems=500, block_size=128)
-    xh = Array(x)
-    @test all(xh .≈ (1:10) .+ (1:1000)')
-
-    f2(x; kwargs...) = AK.foraxes(x, 2; kwargs...) do j
-        for i in axes(x, 1)
-            x[i, j] = i + j
-        end
-    end
-
-    x = array_from_host(zeros(Int, 10, 1000))
-    f2(x)
-    xh = Array(x)
-    @test all(xh .== (1:10) .+ (1:1000)')
-
-    x = array_from_host(zeros(UInt32, 10, 1000))
-    f2(x, scheduler=:threads, max_tasks=2, min_elems=100, block_size=64)
-    xh = Array(x)
-    @test all(xh .== (1:10) .+ (1:1000)')
-
-    x = array_from_host(zeros(Float32, 10, 1000))
-    f2(x, scheduler=:polyester, max_tasks=4, min_elems=500, block_size=128)
-    xh = Array(x)
-    @test all(xh .≈ (1:10) .+ (1:1000)')
-
-    # dims are nothing, behaving like foreachindex
-    f3(x; kwargs...) = AK.foraxes(x, nothing; kwargs...) do i
-        x[i] = i
-    end
-
-    x = array_from_host(zeros(Int, 10, 1000))
-    f3(x)
-    xh = Array(x)
-    @test all(xh[:] .== 1:length(x))
-end
-
-
-@testset "map" begin
-    Random.seed!(0)
-
-    # CPU
-    if backend == CPU()
-        x = Array(1:1000)
-        y = AK.map(x) do i
-            i^2
-        end
-        @test y == map(i -> i^2, x)
-
-        x = Array(1:1000)
-        y = zeros(Int, 1000)
-        AK.map!(y, x) do i
-            i^2
-        end
-        @test y == map(i -> i^2, x)
-
-        x = rand(Float32, 1000)
-        y = AK.map(x, scheduler=:threads, max_tasks=2, min_elems=100) do i
-            i > 0.5 ? i : 0
-        end
-        @test y == map(i -> i > 0.5 ? i : 0, x)
-
-        x = rand(Float32, 1000)
-        y = AK.map(x, scheduler=:polyester, max_tasks=4, min_elems=500) do i
-            i > 0.5 ? i : 0
-        end
-        @test y == map(i -> i > 0.5 ? i : 0, x)
-
-    # GPU
-    else
-        x = array_from_host(1:1000)
-        y = AK.map(x) do i
-            i^2
-        end
-        @test Array(y) == map(i -> i^2, 1:1000)
-
-        x = array_from_host(1:1000)
-        y = array_from_host(zeros(Int, 1000))
-        AK.map!(y, x) do i
-            i^2
-        end
-        @test Array(y) == map(i -> i^2, 1:1000)
-
-        x = array_from_host(rand(Float32, 1000))
-        y = AK.map(x, block_size=64) do i
-            i > 0.5 ? i : 0
-        end
-        @test Array(y) == map(i -> i > 0.5 ? i : 0, Array(x))
-    end
-end
-
-
-if backend != CPU()
-@testset "merge_sort" begin
-    Random.seed!(0)
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Int32, num_elems))
-        AK.merge_sort!(v)
-        vh = Array(v)
-        @test issorted(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(UInt32, num_elems))
-        AK.merge_sort!(v)
-        vh = Array(v)
-        @test issorted(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        AK.merge_sort!(v)
-        vh = Array(v)
-        @test issorted(vh)
-    end
-
-    # Testing different settings
-    v = array_from_host(1:10_000, Float32)
-    AK.merge_sort!(v, lt=(>), by=abs, rev=true,
-                block_size=64, temp=array_from_host(1:10_000, Float32))
-    @test issorted(Array(v))
-
-    v = array_from_host(1:10_000, Int32)
-    AK.merge_sort!(v, lt=(>), rev=true,
-                block_size=64, temp=array_from_host(1:10_000, Int32))
-    @test issorted(Array(v))
-
-    v = array_from_host(1:10_000, Float32)
-    v = AK.merge_sort(v, lt=(>), by=abs, rev=true,
-                      block_size=64, temp=array_from_host(1:10_000, Float32))
-    @test issorted(Array(v))
-
-    v = array_from_host(1:10_000, Int32)
-    v = AK.merge_sort(v, lt=(>), by=abs, rev=true,
-                      block_size=64, temp=array_from_host(1:10_000, Int32))
-    @test issorted(Array(v))
-end
-
-else # CPU backend
-@testset "sample_sort" begin
-    Random.seed!(0)
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Int32, num_elems))
-        AK.sample_sort!(v)
-        vh = Array(v)
-        @test issorted(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(UInt32, num_elems))
-        AK.sample_sort!(v)
-        vh = Array(v)
-        @test issorted(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        AK.sample_sort!(v)
-        vh = Array(v)
-        @test issorted(vh)
-    end
-
-    # Testing different settings
-    v = array_from_host(rand(1:100_000, 10_000), Float32)
-    AK.sample_sort!(v, lt=(>), by=abs, rev=true,
-                    max_tasks=64, temp=array_from_host(1:10_000, Float32))
-    @test issorted(Array(v))
-
-    v = array_from_host(rand(1:100_000, 10_000), Int32)
-    AK.sample_sort!(v, lt=(>), rev=true,
-                    max_tasks=64, temp=array_from_host(1:10_000, Int32))
-    @test issorted(Array(v))
-end
-end
-
-
-@testset "sort" begin
-    Random.seed!(0)
-
-    # Fuzzy correctness testing
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Int32, num_elems))
-        AK.sort!(v)
-        vh = Array(v)
-        @test issorted(vh)
-    end
-
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(UInt32, num_elems))
-        AK.sort!(v)
-        vh = Array(v)
-        @test issorted(vh)
-    end
-
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        AK.sort!(v)
-        vh = Array(v)
-        @test issorted(vh)
-    end
-
-    # Testing different settings
-    v = array_from_host(rand(1:100_000, 10_000), Float32)
-    AK.sort!(v, lt=(>), by=abs, rev=true,
-             max_tasks=64, min_elems=8, block_size=64,
-             temp=array_from_host(1:10_000, Float32))
-    @test issorted(Array(v))
-
-    v = array_from_host(rand(1:100_000, 10_000), Int32)
-    AK.sort!(v, lt=(>), rev=true,
-             max_tasks=64, min_elems=8, block_size=64,
-             temp=array_from_host(1:10_000, Int32))
-    @test issorted(Array(v))
-
-    v = array_from_host(rand(1:100_000, 10_000), Float32)
-    v = AK.sort(v, lt=(>), by=abs, rev=true,
-                max_tasks=64, min_elems=8, block_size=64,
-                temp=array_from_host(1:10_000, Float32))
-    @test issorted(Array(v))
-
-    v = array_from_host(rand(1:100_000, 10_000), Int32)
-    v = AK.sort(v, lt=(>), by=abs, rev=true,
-                max_tasks=64, min_elems=8, block_size=64,
-                temp=array_from_host(1:10_000, Int32))
-    @test issorted(Array(v))
-end
-
-
-if backend != CPU()
-@testset "merge_sort_by_key" begin
-    Random.seed!(0)
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        k = array_from_host(rand(Int32, num_elems))
-        v = copy(k) .- 1
-        AK.merge_sort_by_key!(k, v)
-        kh = Array(k)
-        vh = Array(v)
-        @test issorted(kh)
-        @test issorted(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        k = array_from_host(rand(UInt32, num_elems))
-        v = copy(k) .- 1
-        AK.merge_sort_by_key!(k, v)
-        kh = Array(k)
-        vh = Array(v)
-        @test issorted(kh)
-        @test issorted(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        k = array_from_host(rand(Float32, num_elems))
-        v = copy(k) .- 1
-        AK.merge_sort_by_key!(k, v)
-        kh = Array(k)
-        vh = Array(v)
-        @test issorted(kh)
-        @test issorted(vh)
-    end
-
-    # Testing different settings
-    k = array_from_host(1:10_000, Float32)
-    v = array_from_host(1:10_000, Int32)
-    AK.merge_sort_by_key!(k, v,
-                        lt=(>), by=abs, rev=true,
-                        block_size=64,
-                        temp_keys=array_from_host(1:10_000, Float32),
-                        temp_values=array_from_host(1:10_000, Int32))
-    @test issorted(Array(k))
-    @test issorted(Array(v))
-
-    k = array_from_host(1:10_000, Int32)
-    v = array_from_host(1:10_000, Float32)
-    AK.merge_sort_by_key!(k, v,
-                        lt=(>), by=abs, rev=true,
-                        block_size=64,
-                        temp_keys=array_from_host(1:10_000, Int32),
-                        temp_values=array_from_host(1:10_000, Float32))
-    @test issorted(Array(k))
-    @test issorted(Array(v))
-
-    k = array_from_host(1:10_000, Float32)
-    v = array_from_host(1:10_000, Int32)
-    AK.merge_sort_by_key(k, v,
-                        lt=(>), by=abs, rev=true,
-                        block_size=64,
-                        temp_keys=array_from_host(1:10_000, Float32),
-                        temp_values=array_from_host(1:10_000, Int32))
-    @test issorted(Array(k))
-    @test issorted(Array(v))
-
-    k = array_from_host(1:10_000, Int32)
-    v = array_from_host(1:10_000, Float32)
-    AK.merge_sort_by_key(k, v,
-                        lt=(>), by=abs, rev=true,
-                        block_size=64,
-                        temp_keys=array_from_host(1:10_000, Int32),
-                        temp_values=array_from_host(1:10_000, Float32))
-    @test issorted(Array(k))
-    @test issorted(Array(v))
-end
-end
-
-
-if backend != CPU()
-@testset "merge_sortperm" begin
-    Random.seed!(0)
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(Int32, num_elems))
-        AK.merge_sortperm!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(UInt32, num_elems))
-        AK.merge_sortperm!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(Float32, num_elems))
-        AK.merge_sortperm!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    # Testing different settings
-    ix = array_from_host(1:10_000, Int32)
-    v = array_from_host(1:10_000, Float32)
-    AK.merge_sortperm!(ix,
-                    v,
-                    lt=(>), by=abs, rev=true,
-                    inplace=true, block_size=64,
-                    temp_ix=array_from_host(1:10_000, Int32),
-                    temp_v=array_from_host(1:10_000, Float32))
-    ixh = Array(ix)
-    vh = Array(v)
-    @test issorted(vh[ixh])
-
-    v = array_from_host(1:10_000, Float32)
-    ix = AK.merge_sortperm(v,
-                        lt=(>), by=abs, rev=true,
-                        inplace=true, block_size=64,
-                        temp_ix=array_from_host(1:10_000, Int),
-                        temp_v=array_from_host(1:10_000, Float32))
-    ixh = Array(ix)
-    vh = Array(v)
-    @test issorted(vh[ixh])
-end
-
-else # CPU backend
-@testset "sample_sortperm" begin
-    Random.seed!(0)
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(Int32, num_elems))
-        AK.sample_sortperm!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(UInt32, num_elems))
-        AK.sample_sortperm!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(Float32, num_elems))
-        AK.sample_sortperm!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    # Testing different settings
-    ix = array_from_host(1:10_000, Int32)
-    v = array_from_host(1:10_000, Float32)
-    AK.sample_sortperm!(ix,
-                    v,
-                    lt=(>), by=abs, rev=true,
-                    max_tasks=64,
-                    temp=array_from_host(1:10_000, Int32))
-    ixh = Array(ix)
-    vh = Array(v)
-    @test issorted(vh[ixh])
-end
-end
-
-
-if backend != CPU()
-@testset "merge_sortperm_lowmem" begin
-    Random.seed!(0)
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(Int32, num_elems))
-        AK.merge_sortperm_lowmem!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(UInt32, num_elems))
-        AK.merge_sortperm_lowmem!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(Float32, num_elems))
-        AK.merge_sortperm_lowmem!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    # Testing different settings
-    ix = array_from_host(1:10_000, Int32)
-    v = array_from_host(1:10_000, Float32)
-    AK.merge_sortperm_lowmem!(ix,
-                            v,
-                            lt=(>), by=abs, rev=true,
-                            block_size=64,
-                            temp=array_from_host(1:10_000, Int32))
-    ixh = Array(ix)
-    vh = Array(v)
-    @test issorted(vh[ixh])
-
-    v = array_from_host(1:10_000, Float32)
-    ix = AK.merge_sortperm_lowmem(v,
-                                lt=(>), by=abs, rev=true,
-                                block_size=64,
-                                temp=array_from_host(1:10_000, Int))
-    ixh = Array(ix)
-    vh = Array(v)
-    @test issorted(vh[ixh])
-end
-end
-
-
-@testset "sortperm" begin
-    Random.seed!(0)
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(Int32, num_elems))
-        AK.sortperm!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(UInt32, num_elems))
-        AK.sortperm!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        ix = array_from_host(zeros(Int32, num_elems))
-        v = array_from_host(rand(Float32, num_elems))
-        AK.sortperm!(ix, v)
-        ixh = Array(ix)
-        vh = Array(v)
-        @test issorted(vh[ixh])
-    end
-
-    # Testing different settings
-    ix = array_from_host(1:10_000, Int32)
-    v = array_from_host(1:10_000, Float32)
-    AK.sortperm!(ix,
-                 v,
-                 lt=(>), by=abs, rev=true,
-                 block_size=64,
-                 temp=array_from_host(1:10_000, Int32))
-    ixh = Array(ix)
-    vh = Array(v)
-    @test issorted(vh[ixh])
-
-    v = array_from_host(1:10_000, Float32)
-    ix = AK.sortperm(v,
-                     lt=(>), by=abs, rev=true,
-                     block_size=64,
-                     temp=array_from_host(1:10_000, Int))
-    ixh = Array(ix)
-    vh = Array(v)
-    @test issorted(vh[ixh])
-end
-
-
-@testset "reduce_1d" begin
-    Random.seed!(0)
-
-    function redmin(s)
-        # Reduction-based minimum finder
-        AK.reduce(
-            (x, y) -> x < y ? x : y,
-            s;
-            init=typemax(eltype(s)),
-            neutral=typemax(eltype(s)),
-        )
-    end
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Int32, num_elems))
-        s = redmin(v)
-        vh = Array(v)
-        @test s == minimum(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(UInt32, num_elems))
-        s = redmin(v)
-        vh = Array(v)
-        @test s == minimum(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        s = redmin(v)
-        vh = Array(v)
-        @test s == minimum(vh)
-    end
-
-    function redsum(s)
-        # Reduction-based summation
-        AK.reduce(
-            (x, y) -> x + y,
-            s;
-            init=zero(eltype(s)),
-            neutral=zero(eltype(s)),
-        )
-    end
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(1:100, num_elems), Int32)
-        s = redsum(v)
-        vh = Array(v)
-        @test s == sum(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(1:100, num_elems), UInt32)
-        s = redsum(v)
-        vh = Array(v)
-        @test s == sum(vh)
-    end
-
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        s = redsum(v)
-        vh = Array(v)
-        @test s ≈ sum(vh)
-    end
-
-    # Allowing N-dimensional arrays, still reduced as 1D
-    for _ in 1:100
-        n1 = rand(1:100)
-        n2 = rand(1:100)
-        n3 = rand(1:100)
-        vh = rand(Float32, n1, n2, n3)
-        v = array_from_host(vh)
-        s = redsum(v)
-        @test s ≈ sum(vh)
-    end
-
-    # Ensuring that the init value is respected
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Int32(1):Int32(100), num_elems))
-        s = AK.reduce(+, v; init=Int32(10))
-        vh = Array(v)
-        @test s == sum(vh) + 10
-    end
-
-    # Testing with switch_below - i.e. finishing on the CPU
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(1:100, num_elems), Int32)
-        switch_below = rand(1:100)
-        init = rand(1:100)
-        s = AK.reduce(+, v; switch_below=switch_below, init=Int32(init))
-        vh = Array(v)
-        @test s == reduce(+, vh, init=init)
-    end
-
-    # Test with unmaterialised ranges
-    for _ in 1:100
-        num_elems = rand(1:1000)
-        v = 1:num_elems
-        s = AK.reduce(+, v, backend; init=Int32(0))
-        vh = Array(v)
-        @test s == reduce(+, vh)
-    end
-
-    # Testing different settings
-    AK.reduce(
-        (x, y) -> x + 1,
-        array_from_host(rand(Int32, 10_000)),
-        init=Int32(0),
-        neutral=Int64(0),
-        block_size=64,
-        temp=array_from_host(zeros(Int32, 10_000)),
-        switch_below=50,
-        scheduler=:dynamic,
-        max_tasks=10,
-        min_elems=100,
-    )
-    AK.reduce(
-        (x, y) -> x + 1,
-        rand(Int32, 10_000),
-        init=Int32(0),
-        neutral=Int64(0),
-        scheduler=:greedy,
-        max_tasks=16,
-        min_elems=1000,
-    )
-end
-
-
-@testset "reduce_nd" begin
-    Random.seed!(0)
-
-    # Test all possible corner cases against Base.reduce
-    for dims in 1:4
-        for isize in 0:3
-            for jsize in 0:3
-                for ksize in 0:3
-                    sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
-                    s = array_from_host(sh)
-                    d = AK.reduce(+, s; init=Int32(10), dims=dims)
-                    dh = Array(d)
-                    @test dh == sum(sh, init=Int32(10), dims=dims)
-                    @test eltype(dh) == eltype(sum(sh, init=Int32(10), dims=dims))
-                end
-            end
-        end
-    end
-
-    # Fuzzy correctness testing
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Int32(1):Int32(100), n1, n2, n3)
-            v = array_from_host(vh)
-            s = AK.reduce(+, v; init=Int32(0), dims=dims)
-            sh = Array(s)
-            @test sh == sum(vh, dims=dims)
-        end
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
-            v = array_from_host(vh)
-            s = AK.reduce(+, v; init=UInt32(0), dims=dims)
-            sh = Array(s)
-            @test sh == sum(vh, dims=dims)
-        end
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Float32, n1, n2, n3)
-            v = array_from_host(vh)
-            s = AK.reduce(+, v; init=Float32(0), dims=dims)
-            sh = Array(s)
-            @test sh ≈ sum(vh, dims=dims)
-        end
-    end
-
-    # Ensuring that the init value is respected
-    for _ in 1:100
-        for dims in 1:4
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Int32(1):Int32(100), n1, n2, n3)
-            v = array_from_host(vh)
-            init = rand(1:100)
-            s = AK.reduce(+, v; init=Int32(init), dims=dims)
-            sh = Array(s)
-            @test sh == reduce(+, vh, dims=dims, init=init)
-        end
-    end
-
-    # Testing different settings
-    AK.reduce(
-        (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
-        init=Int32(0),
-        neutral=Int32(0),
-        dims=2,
-        block_size=64,
-        temp=array_from_host(zeros(Int32, 3, 1, 5)),
-        switch_below=50,
-        scheduler=:dynamic,
-        max_tasks=10,
-        min_elems=100,
-    )
-    AK.reduce(
-        (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
-        init=Int32(0),
-        neutral=Int32(0),
-        dims=3,
-        block_size=64,
-        temp=array_from_host(zeros(Int32, 3, 4, 1)),
-        switch_below=50,
-        scheduler=:greedy,
-        max_tasks=16,
-        min_elems=1000,
-    )
-end
-
-
-@testset "mapreduce_1d" begin
-    Random.seed!(0)
-
-    struct Point
-        x::Float32
-        y::Float32
-    end
-    # Only for backend-agnostic initialisation with KernelAbstractions.zero
-    Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
-
-    function minbox(s)
-        # Extract coordinates into tuple and reduce to find dimensionwise minima
-        AK.mapreduce(
-            p -> (p.x, p.y),
-            (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
-            s;
-            init=(typemax(Float32), typemax(Float32)),
-            neutral=(typemax(Float32), typemax(Float32)),
-        )
-    end
-
-    function minbox_base(s)
-        # Extract coordinates into tuple and reduce to find dimensionwise minima
-        Base.mapreduce(
-            p -> (p.x, p.y),
-            (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
-            s;
-            init=(typemax(Float32), typemax(Float32)),
-        )
-    end
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:num_elems])
-        mgpu = minbox(v)
-
-        vh = Array(v)
-        mcpu = minbox(vh)
-        mbase = minbox_base(vh)
-
-        @test typeof(mgpu) === typeof(mcpu) === typeof(mbase)
-        @test mgpu[1] ≈ mcpu[1] ≈ mbase[1]
-        @test mgpu[2] ≈ mcpu[2] ≈ mbase[2]
-    end
-
-    # Allowing N-dimensional arrays, still reduced as 1D
-    for _ in 1:100
-        n1 = rand(1:100)
-        n2 = rand(1:100)
-        n3 = rand(1:100)
-
-        v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:n1, _ in 1:n2, _ in 1:n3])
-        mgpu = minbox(v)
-
-        vh = Array(v)
-        mcpu = minbox(vh)
-        mbase = minbox_base(vh)
-
-        @test typeof(mgpu) === typeof(mcpu) === typeof(mbase)
-        @test mgpu[1] ≈ mcpu[1] ≈ mbase[1]
-        @test mgpu[2] ≈ mcpu[2] ≈ mbase[2]
-    end
-
-    # Ensuring that the init value is respected
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Int32(1):Int32(100), num_elems))
-        s = AK.mapreduce(abs, +, v; init=Int32(10))
-        vh = Array(v)
-        @test s == sum(vh) + 10
-    end
-
-    # Testing with switch_below - i.e. finishing on the CPU
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(-100:-1, num_elems), Int32)
-        switch_below = rand(1:100)
-        init = rand(1:100)
-        s = AK.mapreduce(abs, +, v; switch_below=switch_below, init=Int32(init))
-        vh = Array(v)
-        @test s == mapreduce(abs, +, vh, init=init)
-    end
-
-    # Test with unmaterialised ranges
-    for _ in 1:100
-        num_elems = rand(1:1000)
-        v = 1:num_elems
-        s = AK.mapreduce(abs, +, v, backend; init=Int32(0))
-        vh = Array(v)
-        @test s == mapreduce(abs, +, vh)
-    end
-
-    # Testing different settings, enforcing change of type between f and op
-    f(s, temp) = AK.mapreduce(
-        p -> (p.x, p.y),
-        (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
-        s,
-        init=(typemax(Float32), typemax(Float32)),
-        neutral=(typemax(Float32), typemax(Float32)),
-        block_size=64,
-        temp=temp,
-        switch_below=50,
-        scheduler=:dynamic,
-        max_tasks=10,
-        min_elems=100,
-    )
-    v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:10_042])
-    temp = similar(v, Tuple{Float32, Float32})
-    f(v, temp)
-end
-
-
-@testset "mapreduce_nd" begin
-    Random.seed!(0)
-
-    # Test all possible corner cases against Base.reduce
-    for dims in 1:4
-        for isize in 0:3
-            for jsize in 0:3
-                for ksize in 0:3
-                    sh = rand(Int32(-100):Int32(100), isize, jsize, ksize)
-                    s = array_from_host(sh)
-                    d = AK.mapreduce(-, +, s; init=Int32(-10), dims=dims)
-                    dh = Array(d)
-                    @test dh == mapreduce(-, +, sh, init=Int32(-10), dims=dims)
-                    @test eltype(dh) == eltype(mapreduce(-, +, sh, init=Int32(-10), dims=dims))
-                end
-            end
-        end
-    end
-
-    # Fuzzy correctness testing
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Int32(1):Int32(100), n1, n2, n3)
-            v = array_from_host(vh)
-            s = AK.mapreduce(-, +, v; init=Int32(0), dims=dims)
-            sh = Array(s)
-            @test sh == mapreduce(-, +, vh, init=Int32(0), dims=dims)
-        end
-    end
-
-    struct Point2
-        x::Float32
-        y::Float32
-    end
-
-    # Only for backend-agnostic initialisation with KernelAbstractions.zero
-    Base.zero(::Type{Point2}) = Point2(0.0f0, 0.0f0)
-
-    function minbox(s, dims)
-        # Extract coordinates into tuple and reduce to find dimensionwise minima
-        AK.mapreduce(
-            p -> (p.x, p.y),
-            (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
-            s;
-            init=(typemax(Float32), typemax(Float32)),
-            neutral=(typemax(Float32), typemax(Float32)),
-            dims=dims,
-        )
-    end
-
-    function minbox_base(s, dims)
-        # Extract coordinates into tuple and reduce to find dimensionwise minima
-        Base.mapreduce(
-            p -> (p.x, p.y),
-            (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
-            s;
-            init=(typemax(Float32), typemax(Float32)),
-            dims=dims,
-        )
-    end
-
-    # Fuzzy correctness testing
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            v = array_from_host([Point2(rand(Float32), rand(Float32)) for _ in 1:n1, _ in 1:n2, _ in 1:n3])
-            mgpu = minbox(v, dims)
-
-            vh = Array(v)
-            mcpu = minbox(vh, dims)
-            mbase = minbox_base(vh, dims)
-
-            @test eltype(mgpu) === eltype(mcpu) === eltype(mbase)
-            @test all([
-                (mgpu_red[1] ≈ mcpu[i][1] ≈ mbase[i][1]) && (mgpu_red[2] ≈ mcpu[i][2] ≈ mbase[i][2])
-                for (i, mgpu_red) in enumerate(Array(mgpu))
-            ])
-        end
-    end
-
-    # Ensuring that the init value is respected
-    for _ in 1:100
-        for dims in 1:4
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Int32(-100):Int32(100), n1, n2, n3)
-            v = array_from_host(vh)
-            init = rand(1:100)
-            s = AK.mapreduce(-, +, v; init=Int32(init), dims=dims)
-            sh = Array(s)
-            @test sh == mapreduce(-, +, vh, dims=dims, init=init)
-        end
-    end
-
-    # Testing different settings
-    AK.mapreduce(
-        -,
-        (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
-        init=Int32(0),
-        neutral=Int32(0),
-        dims=2,
-        block_size=64,
-        temp=array_from_host(zeros(Int32, 3, 1, 5)),
-        switch_below=50,
-        scheduler=:dynamic,
-        max_tasks=10,
-        min_elems=100,
-    )
-    AK.mapreduce(
-        -,
-        (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
-        init=Int32(0),
-        neutral=Int32(0),
-        dims=3,
-        block_size=64,
-        temp=array_from_host(zeros(Int32, 3, 4, 1)),
-        switch_below=50,
-        scheduler=:greedy,
-        max_tasks=16,
-        min_elems=1000,
-    )
-end
-
-
-@testset "accumulate_1d" begin
-
-    Random.seed!(0)
-
-    # Single block exlusive scan (each block processes two elements)
-    for num_elems in 1:256
-        x = array_from_host(ones(Int32, num_elems))
-        y = copy(x)
-        AK.accumulate!(+, y; init=0, inclusive=false, block_size=128)
-        yh = Array(y)
-        @test all(yh .== 0:length(yh) - 1)
-    end
-
-    # Single block inclusive scan
-    for num_elems in 1:256
-        x = array_from_host(rand(1:1000, num_elems), Int32)
-        y = copy(x)
-        AK.accumulate!(+, y; init=0, block_size=128)
-        @test all(Array(y) .== accumulate(+, Array(x)))
-    end
-
-    # Large exclusive scan
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        x = array_from_host(ones(Int32, num_elems))
-        y = copy(x)
-        AK.accumulate!(+, y; init=0, inclusive=false)
-        yh = Array(y)
-        @test all(yh .== 0:length(yh) - 1)
-    end
-
-    # Large inclusive scan
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        x = array_from_host(rand(1:1000, num_elems), Int32)
-        y = copy(x)
-        AK.accumulate!(+, y; init=0)
-        @test all(Array(y) .== accumulate(+, Array(x)))
-    end
-
-    # Stress-testing small block sizes -> many blocks
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        x = array_from_host(rand(1:1000, num_elems), Int32)
-        y = copy(x)
-        AK.accumulate!(+, y; init=0, block_size=16)
-        @test all(Array(y) .== accumulate(+, Array(x)))
-    end
-
-    # Allowing N-dimensional arrays, still reduced as 1D
-    for _ in 1:100
-        n1 = rand(1:100)
-        n2 = rand(1:100)
-        n3 = rand(1:100)
-        vh = rand(Float32, n1, n2, n3)
-        v = array_from_host(vh)
-        AK.accumulate!(+, v; init=0)
-        @test all(Array(v) .≈ accumulate(+, vh))
-    end
-
-    # Ensuring the init value is respected
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        x = array_from_host(rand(1:1000, num_elems), Int32)
-        y = similar(x)
-        init = rand(-1000:1000)
-        AK.accumulate!(+, y, x; init=Int32(init))
-        @test all(Array(y) .== accumulate(+, Array(x), init=init))
-    end
-
-    # Exclusive scan
-    x = array_from_host(ones(Int32, 10))
-    y = copy(x)
-    AK.accumulate!(+, y; init=0, inclusive=false)
-    @test all(Array(y) .== 0:9)
-
-    # Test init value is respected with exclusive scan too
-    x = array_from_host(ones(Int32, 10))
-    y = copy(x)
-    init = 10
-    AK.accumulate!(+, y; init=Int32(init), inclusive=false)
-    @test all(Array(y) .== 10:19)
-
-    # Testing different settings
-    AK.accumulate!(+, array_from_host(ones(Int32, 1000)), init=0, inclusive=false,
-                   block_size=128,
-                   temp=array_from_host(zeros(Int32, 1000)),
-                   temp_flags=array_from_host(zeros(Int8, 1000)))
-    AK.accumulate(+, array_from_host(ones(Int32, 1000)), init=0, inclusive=false,
-                  block_size=128,
-                  temp=array_from_host(zeros(Int64, 1000)),
-                  temp_flags=array_from_host(zeros(Int8, 1000)))
-end
-
-
-@testset "accumulate_nd" begin
-    Random.seed!(0)
-
-    # Test all possible corner cases against Base.accumulate
-    for dims in 1:4
-        for isize in 0:3
-            for jsize in 0:3
-                for ksize in 0:3
-                    sh = rand(Int32(1):Int32(100), isize, jsize, ksize)
-                    s = array_from_host(sh)
-                    d = AK.accumulate(+, s; init=Int32(0), dims=dims)
-
-                    dh = Array(d)
-                    dhres = accumulate(+, sh, init=Int32(0), dims=dims)
-                    @test dh == dhres
-                    @test eltype(dh) == eltype(dhres)
-                end
-            end
-        end
-    end
-
-    # Fuzzy correctness testing
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Int32(1):Int32(100), n1, n2, n3)
-            v = array_from_host(vh)
-
-            s = AK.accumulate(+, v; init=Int32(0), dims=dims)
-            sh = Array(s)
-            @test sh == accumulate(+, vh, init=Int32(0), dims=dims)
-        end
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(UInt32(1):UInt32(100), n1, n2, n3)
-            v = array_from_host(vh)
-
-            s = AK.accumulate(+, v; init=UInt32(0), dims=dims)
-            sh = Array(s)
-            @test sh == accumulate(+, vh, init=UInt32(0), dims=dims)
-        end
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Float32, n1, n2, n3)
-            v = array_from_host(vh)
-
-            s = AK.accumulate(+, v; init=Float32(0), dims=dims)
-            sh = Array(s)
-            @test all(sh .≈ accumulate(+, vh, init=Float32(0), dims=dims))
-        end
-    end
-
-    # Ensure the init value is respected
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Float32, n1, n2, n3)
-            v = array_from_host(vh)
-            init = rand(-1000:1000)
-            s = AK.accumulate(+, v; init=Float32(init), dims=dims)
-            sh = Array(s)
-            @test all(sh .≈ accumulate(+, vh, init=Float32(init), dims=dims))
-        end
-    end
-
-    # Exclusive scan
-    vh = ones(Int32, 10, 10)
-    v = array_from_host(vh)
-    s = AK.accumulate(+, v; init=0, dims=2, inclusive=false)
-    sh = Array(s)
-    @test all([sh[i, :] == 0:9 for i in 1:10])
-
-    # Test init value is respected with exclusive scan too
-    vh = ones(Int32, 10, 10)
-    v = array_from_host(vh)
-    s = AK.accumulate(+, v; init=10, dims=2, inclusive=false)
-    sh = Array(s)
-    @test all([sh[i, :] == 10:19 for i in 1:10])
-
-    # Testing different settings
-    AK.accumulate(
-        (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
-        init=Int32(0),
-        neutral=Int32(0),
-        dims=2,
-        block_size=64,
-        temp=array_from_host(zeros(Int32, 3, 1, 5)),
-    )
-    AK.accumulate(
-        (x, y) -> x + 1,
-        array_from_host(rand(Int32, 3, 4, 5)),
-        init=Int32(0),
-        neutral=Int32(0),
-        dims=3,
-        block_size=64,
-        temp=array_from_host(zeros(Int32, 3, 4, 1)),
-    )
-end
-
-
-@testset "searchsorted" begin
-
-    Random.seed!(0)
-
-    # Fuzzy correctness testing of searchsortedfirst
-    for _ in 1:100
-        num_elems_v = rand(1:100_000)
-        num_elems_x = rand(1:100_000)
-
-        # Ints
-        v = array_from_host(sort(rand(Int32, num_elems_v)))
-        x = array_from_host(rand(Int32, num_elems_x))
-        ix = similar(x, Int32)
-        AK.searchsortedfirst!(ix, v, x)
-
-        vh = Array(v)
-        xh = Array(x)
-        ixh = AK.searchsortedfirst(vh, xh)
-        ixh_base = [searchsortedfirst(vh, e) for e in xh]
-
-        @test all(Array(ix) .== ixh .== ixh_base)
-
-        # Floats
-        v = array_from_host(sort(rand(Float32, num_elems_v)))
-        x = array_from_host(rand(Float32, num_elems_x))
-        ix = similar(x, Int32)
-        AK.searchsortedfirst!(ix, v, x)
-
-        vh = Array(v)
-        xh = Array(x)
-        ixh = AK.searchsortedfirst(vh, xh)
-        ixh_base = [searchsortedfirst(vh, e) for e in xh]
-
-        @test all(Array(ix) .== ixh .== ixh_base)
-    end
-
-    # Fuzzy correctness testing of searchsortedlast
-    for _ in 1:100
-        num_elems_v = rand(1:100_000)
-        num_elems_x = rand(1:100_000)
-
-        # Ints
-        v = array_from_host(sort(rand(Int32, num_elems_v)))
-        x = array_from_host(rand(Int32, num_elems_x))
-        ix = similar(x, Int32)
-        AK.searchsortedlast!(ix, v, x)
-
-        vh = Array(v)
-        xh = Array(x)
-        ixh = AK.searchsortedlast(vh, xh)
-        ixh_base = [searchsortedlast(vh, e) for e in xh]
-
-        @test all(Array(ix) .== ixh .== ixh_base)
-
-        # Floats
-        v = array_from_host(sort(rand(Float32, num_elems_v)))
-        x = array_from_host(rand(Float32, num_elems_x))
-        ix = similar(x, Int32)
-        AK.searchsortedlast!(ix, v, x)
-
-        vh = Array(v)
-        xh = Array(x)
-        ixh = AK.searchsortedlast(vh, xh)
-        ixh_base = [searchsortedlast(vh, e) for e in xh]
-
-        @test all(Array(ix) .== ixh .== ixh_base)
-    end
-
-    # Testing different settings
-    v = array_from_host(sort(rand(Int32, 100_000)))
-    x = array_from_host(rand(Int32, 10_000))
-    ix = similar(x, Int32)
-
-    AK.searchsortedfirst!(ix, v, x, by=abs, lt=(>), rev=true, block_size=64)
-    AK.searchsortedfirst(v, x, by=abs, lt=(>), rev=true, block_size=64)
-    AK.searchsortedlast!(ix, v, x, by=abs, lt=(>), rev=true, block_size=64)
-    AK.searchsortedlast(v, x, by=abs, lt=(>), rev=true, block_size=64)
-
-    vh = Array(v)
-    xh = Array(x)
-    ixh = similar(xh, Int32)
-
-    AK.searchsortedfirst!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
-    AK.searchsortedfirst(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
-    AK.searchsortedlast!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
-    AK.searchsortedlast(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100)
-end
-
-
-@testset "truth" begin
-
-    Random.seed!(0)
-
-    # Simple correctness tests
-    v = array_from_host(1:100)
-
-    @test AK.any(x->x<0, v) === false
-    @test AK.any(x->x>99, v) === true
-
-    @test AK.all(x->x>0, v) === true
-    @test AK.all(x->x<100, v) === false
-
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        @test AK.any(x->x<0, v) === false
-        @test AK.any(x->x<1, v) === true
-        @test AK.all(x->x<1, v) === true
-        @test AK.all(x->x<0, v) === false
-    end
-
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        @test AK.any(x->x<0, v) === false
-        @test AK.any(x->x<1, v) === true
-        @test AK.all(x->x<1, v) === true
-        @test AK.all(x->x<0, v) === false
-    end
-
-    # Test the MapReduce algorithm which works on all platforms
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        alg=AK.MapReduce(temp=similar(v, Bool), switch_below=100)
-        @test AK.any(x->x<0, v, alg=alg) === false
-        @test AK.any(x->x<1, v, alg=alg) === true
-        @test AK.all(x->x<1, v, alg=alg) === true
-        @test AK.all(x->x<0, v, alg=alg) === false
-    end
-
-    # Testing different settings
-    v = array_from_host(rand(-5:5, 100_000))
-    AK.any(x->x<5, v, max_tasks=2, min_elems=100, block_size=64)
-    AK.all(x->x<5, v, max_tasks=2, min_elems=100, block_size=64)
-end
-
-
-@testset "sum" begin
-
-    Random.seed!(0)
-
-    # Simple correctness tests
-    v = array_from_host(1:100)
-    @test AK.sum(v) == sum(Array(v))
-
-    # Fuzzy testing
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        @test AK.sum(v) ≈ sum(Array(v))
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Int32(-5):Int32(5), n1, n2, n3)
-            v = array_from_host(vh)
-
-            # Indexing into array as if linear
-            @test AK.sum(v) == sum(vh)
-
-            # Along dimensions
-            r = Array(AK.sum(v, dims=dims))
-            rh = sum(vh, dims=dims)
-
-            @test r == rh
-        end
-    end
-
-    # Testing different settings
-    v = array_from_host(rand(-5:5, 100_000))
-    AK.sum(v, block_size=64)
-
-    # The other settings are stress-tested in reduce
-end
-
-
-@testset "prod" begin
-
-    Random.seed!(0)
-
-    # Simple correctness tests
-    v = array_from_host(1:100)
-    @test AK.prod(v) == prod(Array(v))
-
-    # Fuzzy testing
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        @test AK.prod(v) ≈ prod(Array(v))
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:10)
-            n2 = rand(1:10)
-            n3 = rand(1:10)
-            vh = rand(Int32(-5):Int32(5), n1, n2, n3)
-            v = array_from_host(vh)
-
-            # Indexing into array as if linear
-            @test AK.sum(v) == sum(vh)
-
-            # Along dimensions
-            r = Array(AK.sum(v, dims=dims))
-            rh = sum(vh, dims=dims)
-
-            @test r == rh
-        end
-    end
-
-    # Testing different settings
-    v = array_from_host(rand(-5:5, 100_000))
-    AK.prod(v, block_size=64)
-
-    # The other settings are stress-tested in reduce
-end
-
-
-@testset "minimum" begin
-
-    Random.seed!(0)
-
-    # Simple correctness tests
-    v = array_from_host(1:100)
-    @test AK.minimum(v) == minimum(Array(v))
-
-    # Fuzzy testing
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        @test AK.minimum(v) == minimum(Array(v))
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Int32, n1, n2, n3)
-            v = array_from_host(vh)
-
-            # Indexing into array as if linear
-            @test AK.minimum(v) == minimum(vh)
-
-            # Along dimensions
-            r = Array(AK.minimum(v, dims=dims))
-            rh = minimum(vh, dims=dims)
-
-            @test r == rh
-        end
-    end
-
-    # Testing different settings
-    v = array_from_host(rand(-5:5, 100_000))
-    AK.minimum(v, block_size=64)
-
-    # The other settings are stress-tested in reduce
-end
-
-
-@testset "maximum" begin
-
-    Random.seed!(0)
-
-    # Simple correctness tests
-    v = array_from_host(1:100)
-    @test AK.maximum(v) == maximum(Array(v))
-
-    # Fuzzy testing
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        @test AK.maximum(v) == maximum(Array(v))
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Int32, n1, n2, n3)
-            v = array_from_host(vh)
-
-            # Indexing into array as if linear
-            @test AK.maximum(v) == maximum(vh)
-
-            # Along dimensions
-            r = Array(AK.maximum(v, dims=dims))
-            rh = maximum(vh, dims=dims)
-
-            @test r == rh
-        end
-    end
-
-    # Testing different settings
-    v = array_from_host(rand(-5:5, 100_000))
-    AK.maximum(v, block_size=64)
-
-    # The other settings are stress-tested in reduce
-end
-
-
-@testset "count" begin
-
-    Random.seed!(0)
-
-    # Simple correctness tests
-    v = array_from_host(1:100)
-    @test AK.count(x->x>50, v) == count(x->x>50, Array(v))
-
-    # Fuzzy testing
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Float32, num_elems))
-        @test AK.count(x->x>0.5, v) == count(x->x>0.5, Array(v))
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:100)
-            n2 = rand(1:100)
-            n3 = rand(1:100)
-            vh = rand(Float32, n1, n2, n3)
-            v = array_from_host(vh)
-
-            # Indexing into array as if linear
-            @test AK.count(x->x>0.5, v) == count(x->x>0.5, vh)
-
-            # Along dimensions
-            r = Array(AK.count(x->x>0.5, v, dims=dims))
-            rh = count(x->x>0.5, vh, dims=dims)
-
-            @test r == rh
-        end
-    end
-
-    # Counting booleans directly
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Bool, num_elems))
-        @test AK.count(v) == count(Array(v))
-    end
-
-    # Testing different settings
-    v = array_from_host(rand(-5:5, 100_000))
-    AK.count(x->x>0, v, block_size=64)
-
-    # The other settings are stress-tested in reduce
-end
-
-
-@testset "cumsum" begin
-
-    Random.seed!(0)
-
-    # Simple correctness tests
-    v = array_from_host(1:100)
-    vh = Array(v)
-    @test Array(AK.cumsum(v)) == cumsum(vh)
-
-    # Fuzzy testing
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        vh = rand(Float32, num_elems)
-        v = array_from_host(vh)
-        @test all(Array(AK.cumsum(v)) .≈ cumsum(vh))
-    end
-
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:10)
-            n2 = rand(1:10)
-            n3 = rand(1:10)
-            vh = rand(Int32(-5):Int32(5), n1, n2, n3)
-            v = array_from_host(vh)
-
-            # Indexing into array as if linear; not supported in Base
-            # @test all(Array(AK.cumsum(v)) .== cumsum(vh))
-
-            # Along dimensions
-            r = Array(AK.cumsum(v, dims=dims))
-            rh = cumsum(vh, dims=dims)
-
-            @test r == rh
-        end
-    end
-
-    # Test promotion to op-dictated type
-    xh = rand(Bool, 16)
-    x = array_from_host(xh)
-    @test Array(AK.cumsum(x)) == cumsum(xh)
-
-    # Testing different settings
-    v = array_from_host(rand(-5:5, 100_000))
-    AK.cumsum(v, block_size=64)
-
-    # The other settings are stress-tested in reduce
-end
-
-
-@testset "cumprod" begin
-
-    Random.seed!(0)
-
-    # Simple correctness tests
-    v = array_from_host(1:100)
-    vh = Array(v)
-    @test Array(AK.cumprod(v)) == cumprod(vh)
-
-    vh = ones(Float32, 100_000)
-    v = array_from_host(vh)
-    @test Array(AK.cumprod(v)) == vh
-
-    # Fuzzy testing
-    for _ in 1:100
-        for dims in 1:3
-            n1 = rand(1:10)
-            n2 = rand(1:10)
-            n3 = rand(1:10)
-            vh = rand(Int32(-5):Int32(5), n1, n2, n3)
-            v = array_from_host(vh)
-
-            # Indexing into array as if linear; not supported in Base
-            # @test all(Array(AK.cumprod(v)) .== cumprod(vh))
-
-            # Along dimensions
-            r = Array(AK.cumprod(v, dims=dims))
-            rh = cumprod(vh, dims=dims)
-
-            @test r == rh
-        end
-    end
-
-    # Testing different settings
-    v = array_from_host(rand(-5:5, 100_000))
-    AK.cumprod(v, block_size=64)
-
-    # The other settings are stress-tested in reduce
-end
+include("partition.jl")
+include("looping.jl")
+include("map.jl")
+include("sort.jl")
+include("reduce.jl")
+include("accumulate.jl")
+include("predicates.jl")
+include("binarysearch.jl")
diff --git a/test/sort.jl b/test/sort.jl
new file mode 100644
index 0000000..59e9505
--- /dev/null
+++ b/test/sort.jl
@@ -0,0 +1,453 @@
+if BACKEND != CPU()
+@testset "merge_sort" begin
+    Random.seed!(0)
+
+    # Fuzz test: sort random-length arrays in place, then check the order on the host
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Int32, num_elems))
+        AK.merge_sort!(v)
+        vh = Array(v)
+        @test issorted(vh)
+    end
+
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(UInt32, num_elems))
+        AK.merge_sort!(v)
+        vh = Array(v)
+        @test issorted(vh)
+    end
+
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        v = array_from_host(rand(Float32, num_elems))
+        AK.merge_sort!(v)
+        vh = Array(v)
+        @test issorted(vh)
+    end
+
+    # Non-default settings, fed unsorted data so that a no-op sort cannot pass
+    v = array_from_host(rand(1:100_000, 10_000), Float32)
+    AK.merge_sort!(v, lt=(>), by=abs, rev=true,
+                block_size=64, temp=array_from_host(1:10_000, Float32))
+    @test issorted(Array(v))  # ascending is expected: `lt=(>)` is paired with `rev=true`
+
+    v = array_from_host(rand(1:100_000, 10_000), Int32)
+    AK.merge_sort!(v, lt=(>), rev=true,
+                block_size=64, temp=array_from_host(1:10_000, Int32))
+    @test issorted(Array(v))
+
+    v = array_from_host(rand(1:100_000, 10_000), Float32)
+    v = AK.merge_sort(v, lt=(>), by=abs, rev=true,  # non-mutating: check the return value
+                block_size=64, temp=array_from_host(1:10_000, Float32))
+    @test issorted(Array(v))
+
+    v = array_from_host(rand(1:100_000, 10_000), Int32)
+    v = AK.merge_sort(v, lt=(>), by=abs, rev=true,
+                block_size=64, temp=array_from_host(1:10_000, Int32))
+    @test issorted(Array(v))
+end
+else # CPU backend
+@testset "sample_sort" begin
+    Random.seed!(0)
+
+    # Fuzz test: per element type, 1000 random-length arrays are sorted in place and
+    # checked on the host. All trials of one type finish before the next type starts.
+    for T in (Int32, UInt32, Float32)
+        for _ in 1:1000
+            n = rand(1:100_000)
+            data = array_from_host(rand(T, n))
+            AK.sample_sort!(data)
+            @test issorted(Array(data))
+        end
+    end
+
+    # Every keyword spelled out, Float32 data, caller-provided scratch buffer
+    floats = array_from_host(rand(1:100_000, 10_000), Float32)
+    float_scratch = array_from_host(1:10_000, Float32)
+    AK.sample_sort!(
+        floats;
+        lt=(>),
+        by=abs,
+        rev=true,
+        max_tasks=64,
+        temp=float_scratch,
+    )
+    @test issorted(Array(floats))
+
+    # Same for Int32 data, without the `by` transform
+    ints = array_from_host(rand(1:100_000, 10_000), Int32)
+    int_scratch = array_from_host(1:10_000, Int32)
+    AK.sample_sort!(
+        ints;
+        lt=(>),
+        rev=true,
+        max_tasks=64,
+        temp=int_scratch,
+    )
+    @test issorted(Array(ints))
+end
+end
+
+
+@testset "sort" begin
+    Random.seed!(0)
+
+    # Fuzz test: per element type, 100 random-length arrays are sorted in place
+    for T in (Int32, UInt32, Float32)
+        for _ in 1:100
+            n = rand(1:100_000)
+            data = array_from_host(rand(T, n))
+            AK.sort!(data)
+            @test issorted(Array(data))
+        end
+    end
+
+    # Mutating form, Float32 data, every keyword spelled out
+    floats = array_from_host(rand(1:100_000, 10_000), Float32)
+    float_scratch = array_from_host(1:10_000, Float32)
+    AK.sort!(floats;
+        lt=(>), by=abs, rev=true,
+        max_tasks=64, min_elems=8, block_size=64,
+        temp=float_scratch,
+    )
+    @test issorted(Array(floats))
+
+    # Mutating form, Int32 data, without the `by` transform
+    ints = array_from_host(rand(1:100_000, 10_000), Int32)
+    int_scratch = array_from_host(1:10_000, Int32)
+    AK.sort!(ints;
+        lt=(>), rev=true,
+        max_tasks=64, min_elems=8, block_size=64,
+        temp=int_scratch,
+    )
+    @test issorted(Array(ints))
+
+    # Non-mutating form, Float32 data: the assertion is made on the return value
+    floats = array_from_host(rand(1:100_000, 10_000), Float32)
+    float_scratch = array_from_host(1:10_000, Float32)
+    sorted_floats = AK.sort(floats;
+        lt=(>), by=abs, rev=true,
+        max_tasks=64, min_elems=8, block_size=64,
+        temp=float_scratch,
+    )
+    @test issorted(Array(sorted_floats))
+
+    # Non-mutating form, Int32 data
+    ints = array_from_host(rand(1:100_000, 10_000), Int32)
+    int_scratch = array_from_host(1:10_000, Int32)
+    sorted_ints = AK.sort(ints;
+        lt=(>), by=abs, rev=true,
+        max_tasks=64, min_elems=8, block_size=64,
+        temp=int_scratch,
+    )
+    @test issorted(Array(sorted_ints))
+end
+
+
+if BACKEND != CPU()
+@testset "merge_sort_by_key" begin
+    Random.seed!(0)
+
+    # Fuzz test: values are derived from the keys, so both must come out ordered
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        k = array_from_host(rand(Int32, num_elems))
+        v = copy(k) .- 1
+        AK.merge_sort_by_key!(k, v)
+        kh = Array(k)
+        vh = Array(v)
+        @test issorted(kh)
+        @test issorted(vh)
+    end
+
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        k = array_from_host(rand(UInt32, num_elems))
+        v = copy(k) .- 1
+        AK.merge_sort_by_key!(k, v)
+        kh = Array(k)
+        vh = Array(v)
+        @test issorted(kh)
+        @test issorted(vh)
+    end
+
+    for _ in 1:1000
+        num_elems = rand(1:100_000)
+        k = array_from_host(rand(Float32, num_elems))
+        v = copy(k) .- 1
+        AK.merge_sort_by_key!(k, v)
+        kh = Array(k)
+        vh = Array(v)
+        @test issorted(kh)
+        @test issorted(vh)
+    end
+
+    # Non-default settings on descending input, so that a no-op sort cannot pass
+    k = array_from_host(collect(10_000:-1:1), Float32)
+    v = array_from_host(collect(10_000:-1:1), Int32)
+    AK.merge_sort_by_key!(k, v,
+                        lt=(>), by=abs, rev=true,
+                        block_size=64,
+                        temp_keys=array_from_host(1:10_000, Float32),
+                        temp_values=array_from_host(1:10_000, Int32))
+    @test issorted(Array(k))  # ascending is expected: `lt=(>)` is paired with `rev=true`
+    @test issorted(Array(v))
+
+    k = array_from_host(collect(10_000:-1:1), Int32)
+    v = array_from_host(collect(10_000:-1:1), Float32)
+    AK.merge_sort_by_key!(k, v,
+                        lt=(>), by=abs, rev=true,
+                        block_size=64,
+                        temp_keys=array_from_host(1:10_000, Int32),
+                        temp_values=array_from_host(1:10_000, Float32))
+    @test issorted(Array(k))
+    @test issorted(Array(v))
+
+    k = array_from_host(collect(10_000:-1:1), Float32)
+    v = array_from_host(collect(10_000:-1:1), Int32)
+    k, v = AK.merge_sort_by_key(k, v,  # assert on the returned arrays, not the inputs
+                        lt=(>), by=abs, rev=true,
+                        block_size=64,
+                        temp_keys=array_from_host(1:10_000, Float32),
+                        temp_values=array_from_host(1:10_000, Int32))
+    @test issorted(Array(k))  # NOTE(review): assumes a (keys, values) return; confirm
+    @test issorted(Array(v))
+
+    k = array_from_host(collect(10_000:-1:1), Int32)
+    v = array_from_host(collect(10_000:-1:1), Float32)
+    k, v = AK.merge_sort_by_key(k, v,  # assert on the returned arrays, not the inputs
+                        lt=(>), by=abs, rev=true,
+                        block_size=64,
+                        temp_keys=array_from_host(1:10_000, Int32),
+                        temp_values=array_from_host(1:10_000, Float32))
+    @test issorted(Array(k))
+    @test issorted(Array(v))
+end
+end
+
+
+if BACKEND != CPU()
+@testset "merge_sortperm" begin
+    Random.seed!(0)
+
+    # Copy the index vector and the data back to the host and report whether the
+    # data, read through those indices, is in ascending order.
+    function ordered_through(indices, data)
+        indices_host = Array(indices)
+        data_host = Array(data)
+        return issorted(data_host[indices_host])
+    end
+
+    # Fuzz test: for each element type, 1000 random-length arrays are index-sorted
+    # into a zero-initialised Int32 index buffer of the same length.
+    for T in (Int32, UInt32, Float32)
+        for _ in 1:1000
+            n = rand(1:100_000)
+            perm = array_from_host(zeros(Int32, n))
+            data = array_from_host(rand(T, n))
+            AK.merge_sortperm!(perm, data)
+            @test ordered_through(perm, data)
+        end
+    end
+
+    # Mutating form with every keyword spelled out and caller-provided scratch buffers
+    ix = array_from_host(1:10_000, Int32)
+    v = array_from_host(1:10_000, Float32)
+    scratch_ix = array_from_host(1:10_000, Int32)
+    scratch_v = array_from_host(1:10_000, Float32)
+    AK.merge_sortperm!(
+        ix,
+        v;
+        lt=(>),
+        by=abs,
+        rev=true,
+        inplace=true,
+        block_size=64,
+        temp_ix=scratch_ix,
+        temp_v=scratch_v,
+    )
+    @test ordered_through(ix, v)
+
+    # Non-mutating form: the index vector comes back as the return value
+    v = array_from_host(1:10_000, Float32)
+    scratch_ix = array_from_host(1:10_000, Int)
+    scratch_v = array_from_host(1:10_000, Float32)
+    ix = AK.merge_sortperm(
+        v;
+        lt=(>),
+        by=abs,
+        rev=true,
+        inplace=true,
+        block_size=64,
+        temp_ix=scratch_ix,
+        temp_v=scratch_v,
+    )
+    @test ordered_through(ix, v)
+end
+
+else # CPU backend
+    @testset "sample_sortperm" begin
+    Random.seed!(0)
+
+    # Host-side check: is the data, read through the index vector, in ascending order?
+    function ordered_through(indices, data)
+        indices_host = Array(indices)
+        data_host = Array(data)
+        return issorted(data_host[indices_host])
+    end
+
+    # Fuzz test on random-length inputs, one loop per element type
+    for _ in 1:1000
+        n = rand(1:100_000)
+        perm = array_from_host(zeros(Int32, n))
+        data = array_from_host(rand(Int32, n))
+        AK.sample_sortperm!(perm, data)
+        @test ordered_through(perm, data)
+    end
+
+    for _ in 1:1000
+        n = rand(1:100_000)
+        perm = array_from_host(zeros(Int32, n))
+        data = array_from_host(rand(UInt32, n))
+        AK.sample_sortperm!(perm, data)
+        @test ordered_through(perm, data)
+    end
+
+    for _ in 1:1000
+        n = rand(1:100_000)
+        perm = array_from_host(zeros(Int32, n))
+        data = array_from_host(rand(Float32, n))
+        AK.sample_sortperm!(perm, data)
+        @test ordered_through(perm, data)
+    end
+
+    # Non-default keywords with a caller-provided scratch buffer
+    ix = array_from_host(1:10_000, Int32)
+    v = array_from_host(1:10_000, Float32)
+    scratch = array_from_host(1:10_000, Int32)
+    AK.sample_sortperm!(
+        ix, v;
+        lt=(>), by=abs, rev=true,
+        max_tasks=64, temp=scratch,
+    )
+    @test ordered_through(ix, v)
+end
+end
+
+
+if BACKEND != CPU()
+@testset "merge_sortperm_lowmem" begin
+    Random.seed!(0)
+
+    # Copy the index vector and the data back to the host and report whether the
+    # data, read through those indices, is in ascending order.
+    function ordered_through(indices, data)
+        indices_host = Array(indices)
+        data_host = Array(data)
+        return issorted(data_host[indices_host])
+    end
+
+    # Fuzz test on random-length inputs, one loop per element type
+    for _ in 1:1000
+        n = rand(1:100_000)
+        perm = array_from_host(zeros(Int32, n))
+        data = array_from_host(rand(Int32, n))
+        AK.merge_sortperm_lowmem!(perm, data)
+        @test ordered_through(perm, data)
+    end
+
+    for _ in 1:1000
+        n = rand(1:100_000)
+        perm = array_from_host(zeros(Int32, n))
+        data = array_from_host(rand(UInt32, n))
+        AK.merge_sortperm_lowmem!(perm, data)
+        @test ordered_through(perm, data)
+    end
+
+    for _ in 1:1000
+        n = rand(1:100_000)
+        perm = array_from_host(zeros(Int32, n))
+        data = array_from_host(rand(Float32, n))
+        AK.merge_sortperm_lowmem!(perm, data)
+        @test ordered_through(perm, data)
+    end
+
+    # Mutating form with non-default keywords and a caller-provided scratch buffer
+    ix = array_from_host(1:10_000, Int32)
+    v = array_from_host(1:10_000, Float32)
+    scratch = array_from_host(1:10_000, Int32)
+    AK.merge_sortperm_lowmem!(ix, v;
+        lt=(>), by=abs, rev=true,
+        block_size=64, temp=scratch,
+    )
+    @test ordered_through(ix, v)
+
+    # Non-mutating form: the index vector comes back as the return value
+    v = array_from_host(1:10_000, Float32)
+    scratch = array_from_host(1:10_000, Int)
+    ix = AK.merge_sortperm_lowmem(v;
+        lt=(>), by=abs, rev=true,
+        block_size=64, temp=scratch,
+    )
+    @test ordered_through(ix, v)
+end
+end
+
+
+@testset "sortperm" begin
+    Random.seed!(0)
+
+    # Copy the index vector and the data back to the host and report whether the
+    # data, read through those indices, is in ascending order.
+    function ordered_through(indices, data)
+        indices_host = Array(indices)
+        data_host = Array(data)
+        return issorted(data_host[indices_host])
+    end
+
+    # Fuzz test on random-length inputs, one loop per element type
+    for _ in 1:1000
+        n = rand(1:100_000)
+        perm = array_from_host(zeros(Int32, n))
+        data = array_from_host(rand(Int32, n))
+        AK.sortperm!(perm, data)
+        @test ordered_through(perm, data)
+    end
+
+    for _ in 1:1000
+        n = rand(1:100_000)
+        perm = array_from_host(zeros(Int32, n))
+        data = array_from_host(rand(UInt32, n))
+        AK.sortperm!(perm, data)
+        @test ordered_through(perm, data)
+    end
+
+    for _ in 1:1000
+        n = rand(1:100_000)
+        perm = array_from_host(zeros(Int32, n))
+        data = array_from_host(rand(Float32, n))
+        AK.sortperm!(perm, data)
+        @test ordered_through(perm, data)
+    end
+
+    # Mutating form with non-default keywords and a caller-provided scratch buffer
+    ix = array_from_host(1:10_000, Int32)
+    v = array_from_host(1:10_000, Float32)
+    scratch = array_from_host(1:10_000, Int32)
+    AK.sortperm!(ix, v;
+        lt=(>), by=abs, rev=true,
+        block_size=64, temp=scratch,
+    )
+    @test ordered_through(ix, v)
+
+    # Non-mutating form: the index vector comes back as the return value
+    v = array_from_host(1:10_000, Float32)
+    scratch = array_from_host(1:10_000, Int)
+    ix = AK.sortperm(v;
+        lt=(>), by=abs, rev=true,
+        block_size=64, temp=scratch,
+    )
+    @test ordered_through(ix, v)
+end