diff --git a/.github/workflows/CI-CPU.yml b/.github/workflows/CI-CPU.yml index a9848a3..298d02c 100644 --- a/.github/workflows/CI-CPU.yml +++ b/.github/workflows/CI-CPU.yml @@ -63,6 +63,46 @@ jobs: - uses: julia-actions/julia-runtest@v1 env: JULIA_NUM_THREADS: ${{ matrix.env.JULIA_NUM_THREADS }} + OpenCL: + name: OpenCL + runs-on: ubuntu-latest + timeout-minutes: 60 + permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created + actions: write + contents: read + strategy: + fail-fast: true + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 + with: + version: 1 + arch: x64 + - uses: julia-actions/cache@v2 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + with: + test_args: '--OpenCL' + # cpuKA: + # name: KA CPU Backend + # runs-on: ubuntu-latest + # timeout-minutes: 60 + # permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created + # actions: write + # contents: read + # strategy: + # fail-fast: true + # steps: + # - uses: actions/checkout@v4 + # - uses: julia-actions/setup-julia@v2 + # with: + # version: 1 + # arch: x64 + # - uses: julia-actions/cache@v2 + # - uses: julia-actions/julia-buildpkg@v1 + # - uses: julia-actions/julia-runtest@v1 + # with: + # test_args: '--cpuKA' docs: name: Documentation runs-on: ubuntu-latest diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl index f97c9d8..a9de68e 100644 --- a/src/AcceleratedKernels.jl +++ b/src/AcceleratedKernels.jl @@ -12,7 +12,7 @@ module AcceleratedKernels # Internal dependencies using ArgCheck: @argcheck -using GPUArraysCore: AbstractGPUArray, @allowscalar +using GPUArraysCore: AnyGPUArray, @allowscalar using KernelAbstractions diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl index 3179b2a..0aff1bf 100644 --- a/src/accumulate/accumulate.jl +++ b/src/accumulate/accumulate.jl @@ -167,6 +167,7 @@ function _accumulate_impl!( # 
CPU settings max_tasks::Int=Threads.nthreads(), min_elems::Int=2, + prefer_threads::Bool=true, # GPU settings block_size::Int=256, @@ -174,17 +175,26 @@ function _accumulate_impl!( temp_flags::Union{Nothing, AbstractArray}=nothing, ) if isnothing(dims) - return accumulate_1d!( - op, v, backend, alg; - init, neutral, inclusive, - max_tasks, min_elems, - block_size, temp, temp_flags, - ) + return if use_KA_algo(v, prefer_threads) + accumulate_1d_gpu!( + op, v, backend, alg; + init, neutral, inclusive, + max_tasks, min_elems, + block_size, temp, temp_flags, + ) + else + accumulate_1d_cpu!( + op, v, backend, alg; + init, neutral, inclusive, + max_tasks, min_elems, + block_size, temp, temp_flags, + ) + end else return accumulate_nd!( op, v, backend; init, neutral, dims, inclusive, - max_tasks, min_elems, + max_tasks, min_elems, prefer_threads, block_size, ) end diff --git a/src/accumulate/accumulate_1d_cpu.jl b/src/accumulate/accumulate_1d_cpu.jl index ce03c0c..9f45ada 100644 --- a/src/accumulate/accumulate_1d_cpu.jl +++ b/src/accumulate/accumulate_1d_cpu.jl @@ -1,5 +1,5 @@ -function accumulate_1d!( - op, v::AbstractArray, backend::CPU, alg; +function accumulate_1d_cpu!( + op, v::AbstractArray, backend::Backend, alg; init, neutral, inclusive::Bool, diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl index be3ee59..f0ca135 100644 --- a/src/accumulate/accumulate_1d_gpu.jl +++ b/src/accumulate/accumulate_1d_gpu.jl @@ -248,8 +248,8 @@ end # DecoupledLookback algorithm -function accumulate_1d!( - op, v::AbstractArray, backend::GPU, ::DecoupledLookback; +function accumulate_1d_gpu!( + op, v::AbstractArray, backend::Backend, ::DecoupledLookback; init, neutral, inclusive::Bool, @@ -307,8 +307,8 @@ end # ScanPrefixes algorithm -function accumulate_1d!( - op, v::AbstractArray, backend::GPU, ::ScanPrefixes; +function accumulate_1d_gpu!( + op, v::AbstractArray, backend, ::ScanPrefixes; init, neutral, inclusive::Bool, diff --git 
a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl index aeb08ca..5e213c2 100644 --- a/src/accumulate/accumulate_nd.jl +++ b/src/accumulate/accumulate_nd.jl @@ -8,6 +8,7 @@ function accumulate_nd!( # CPU settings max_tasks::Int, min_elems::Int, + prefer_threads::Bool=true, # GPU settings block_size::Int, @@ -34,7 +35,7 @@ function accumulate_nd!( # Degenerate cases end - if backend isa CPU + if !use_KA_algo(v, prefer_threads) _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems) else # On GPUs we have two parallelisation approaches, based on which dimension has more elements: diff --git a/src/foreachindex.jl b/src/foreachindex.jl index 2cf68a6..24b9d78 100644 --- a/src/foreachindex.jl +++ b/src/foreachindex.jl @@ -15,7 +15,7 @@ end function _forindices_gpu( f, indices, - backend::GPU; + backend::Backend; block_size::Int=256, ) @@ -125,11 +125,12 @@ function foreachindex( # CPU settings max_tasks=Threads.nthreads(), min_elems=1, + prefer_threads::Bool=true, # GPU settings block_size=256, ) - if backend isa GPU + if use_KA_algo(itr, prefer_threads) _forindices_gpu(f, eachindex(itr), backend; block_size) else _forindices_threads(f, eachindex(itr); max_tasks, min_elems) @@ -218,6 +219,7 @@ function foraxes( # CPU settings max_tasks=Threads.nthreads(), min_elems=1, + prefer_threads::Bool=true, # GPU settings block_size=256, @@ -226,11 +228,11 @@ function foraxes( return foreachindex( f, itr, backend; max_tasks, min_elems, - block_size, + prefer_threads, block_size, ) end - if backend isa GPU + if use_KA_algo(itr, prefer_threads) _forindices_gpu(f, axes(itr, dims), backend; block_size) else _forindices_threads(f, axes(itr, dims); max_tasks, min_elems) diff --git a/src/map.jl b/src/map.jl index 6219f0a..10d690d 100644 --- a/src/map.jl +++ b/src/map.jl @@ -33,19 +33,12 @@ end """ function map!( f, dst::AbstractArray, src::AbstractArray, backend::Backend=get_backend(src); - - # CPU settings - max_tasks=Threads.nthreads(), - 
min_elems=1, - - # GPU settings - block_size=256, + kwargs... ) @argcheck length(dst) == length(src) foreachindex( src, backend; - max_tasks, min_elems, - block_size, + kwargs... ) do idx dst[idx] = f(src[idx]) end diff --git a/src/predicates.jl b/src/predicates.jl index 19c8e5c..1f0a0dd 100644 --- a/src/predicates.jl +++ b/src/predicates.jl @@ -114,11 +114,12 @@ function _any_impl( # CPU settings max_tasks=Threads.nthreads(), min_elems=1, + prefer_threads::Bool=true, # GPU settings block_size::Int=256, ) - if backend isa GPU + if use_KA_algo(v, prefer_threads) @argcheck block_size > 0 # Some platforms crash when multiple threads write to the same memory location in a global @@ -137,7 +138,8 @@ function _any_impl( backend; init=false, neutral=false, - block_size=block_size, + prefer_threads=true, + block_size, temp=alg.temp, switch_below=alg.switch_below, ) @@ -246,11 +248,12 @@ function _all_impl( # CPU settings max_tasks=Threads.nthreads(), min_elems=1, + prefer_threads::Bool=true, # GPU settings block_size::Int=256, ) - if backend isa GPU + if use_KA_algo(v, prefer_threads) @argcheck block_size > 0 # Some platforms crash when multiple threads write to the same memory location in a global @@ -269,7 +272,8 @@ function _all_impl( backend; init=true, neutral=true, - block_size=block_size, + prefer_threads=false, + block_size, temp=alg.temp, switch_below=alg.switch_below, ) diff --git a/src/reduce/mapreduce_1d_cpu.jl b/src/reduce/mapreduce_1d_cpu.jl index 544c3a7..95a93f2 100644 --- a/src/reduce/mapreduce_1d_cpu.jl +++ b/src/reduce/mapreduce_1d_cpu.jl @@ -1,5 +1,5 @@ -function mapreduce_1d( - f, op, src::AbstractArray, backend::CPU; +function mapreduce_1d_cpu( + f, op, src::AbstractArray, backend::Backend; init, neutral, diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl index b2b9566..c1e31cc 100644 --- a/src/reduce/mapreduce_1d_gpu.jl +++ b/src/reduce/mapreduce_1d_gpu.jl @@ -99,8 +99,8 @@ end -function mapreduce_1d( - f, op, 
src::AbstractArray, backend::GPU; +function mapreduce_1d_gpu( + f, op, src::AbstractArray, backend::Backend; init, neutral, diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl index a373a71..cf7d825 100644 --- a/src/reduce/mapreduce_nd.jl +++ b/src/reduce/mapreduce_nd.jl @@ -7,6 +7,7 @@ function mapreduce_nd( # CPU settings - ignored here max_tasks::Int, min_elems::Int, + prefer_threads::Bool=true, # GPU settings block_size::Int, @@ -113,7 +114,7 @@ function mapreduce_nd( end dst_size = length(dst) - if backend isa CPU + if !use_KA_algo(src, prefer_threads) _mapreduce_nd_cpu_sections!( f, op, dst, src; init, diff --git a/src/reduce/reduce.jl b/src/reduce/reduce.jl index 0a5781d..0332531 100644 --- a/src/reduce/reduce.jl +++ b/src/reduce/reduce.jl @@ -175,6 +175,7 @@ function _mapreduce_impl( # CPU settings max_tasks::Int=Threads.nthreads(), min_elems::Int=1, + prefer_threads::Bool=true, # GPU settings block_size::Int=256, @@ -182,18 +183,28 @@ function _mapreduce_impl( switch_below::Int=0, ) if isnothing(dims) - return mapreduce_1d( - f, op, src, backend; - init, neutral, - max_tasks, min_elems, - block_size, temp, - switch_below - ) + if use_KA_algo(src, prefer_threads) + mapreduce_1d_gpu( + f, op, src, backend; + init, neutral, + max_tasks, min_elems, + block_size, temp, + switch_below + ) + else + mapreduce_1d_cpu( + f, op, src, backend; + init, neutral, + max_tasks, min_elems, + block_size, temp, + switch_below + ) + end else return mapreduce_nd( f, op, src, backend; - init, neutral, - dims, max_tasks=max_tasks, + init, neutral, dims, + max_tasks, prefer_threads, min_elems, block_size, temp, ) diff --git a/src/sort/merge_sort.jl b/src/sort/merge_sort.jl index 00ea8fc..5fb7b20 100644 --- a/src/sort/merge_sort.jl +++ b/src/sort/merge_sort.jl @@ -125,7 +125,7 @@ end """ merge_sort!( - v::AbstractGPUArray, backend::Backend=get_backend(v); + v::AbstractArray, backend::Backend=get_backend(v); lt=isless, by=identity, @@ -133,11 +133,11 @@ end 
order::Base.Order.Ordering=Base.Order.Forward, block_size::Int=256, - temp::Union{Nothing, AbstractGPUArray}=nothing, + temp::Union{Nothing, AbstractArray}=nothing, ) """ function merge_sort!( - v::AbstractGPUArray, backend::Backend=get_backend(v); + v::AbstractArray, backend::Backend=get_backend(v); lt=isless, by=identity, @@ -145,7 +145,7 @@ function merge_sort!( order::Base.Order.Ordering=Base.Order.Forward, block_size::Int=256, - temp::Union{Nothing, AbstractGPUArray}=nothing, + temp::Union{Nothing, AbstractArray}=nothing, ) # Simple sanity checks @argcheck block_size > 0 @@ -195,7 +195,7 @@ end """ merge_sort( - v::AbstractGPUArray, backend::Backend=get_backend(v); + v::AbstractArray, backend::Backend=get_backend(v); lt=isless, by=identity, @@ -203,11 +203,11 @@ end order::Base.Order.Ordering=Base.Order.Forward, block_size::Int=256, - temp::Union{Nothing, AbstractGPUArray}=nothing, + temp::Union{Nothing, AbstractArray}=nothing, ) """ function merge_sort( - v::AbstractGPUArray, backend::Backend=get_backend(v); + v::AbstractArray, backend::Backend=get_backend(v); kwargs... ) v_copy = copy(v) diff --git a/src/sort/merge_sort_by_key.jl b/src/sort/merge_sort_by_key.jl index 8690910..f6de5f3 100644 --- a/src/sort/merge_sort_by_key.jl +++ b/src/sort/merge_sort_by_key.jl @@ -241,8 +241,8 @@ end """ merge_sort_by_key( - keys::AbstractGPUArray, - values::AbstractGPUArray, + keys::AbstractArray, + values::AbstractArray, backend::Backend=get_backend(keys); lt=isless, @@ -251,13 +251,13 @@ end order::Base.Order.Ordering=Base.Order.Forward, block_size::Int=256, - temp_keys::Union{Nothing, AbstractGPUArray}=nothing, - temp_values::Union{Nothing, AbstractGPUArray}=nothing, + temp_keys::Union{Nothing, AbstractArray}=nothing, + temp_values::Union{Nothing, AbstractArray}=nothing, ) """ function merge_sort_by_key( - keys::AbstractGPUArray, - values::AbstractGPUArray, + keys::AbstractArray, + values::AbstractArray, backend::Backend=get_backend(keys); kwargs... 
) diff --git a/src/sort/merge_sortperm.jl b/src/sort/merge_sortperm.jl index 3266cc9..6b97061 100644 --- a/src/sort/merge_sortperm.jl +++ b/src/sort/merge_sortperm.jl @@ -1,7 +1,7 @@ """ merge_sortperm!( - ix::AbstractGPUArray, - v::AbstractGPUArray, + ix::AbstractArray, + v::AbstractArray, backend::Backend=get_backend(v); lt=(<), @@ -11,13 +11,13 @@ inplace::Bool=false, block_size::Int=256, - temp_ix::Union{Nothing, AbstractGPUArray}=nothing, - temp_v::Union{Nothing, AbstractGPUArray}=nothing, + temp_ix::Union{Nothing, AbstractArray}=nothing, + temp_v::Union{Nothing, AbstractArray}=nothing, ) """ function merge_sortperm!( - ix::AbstractGPUArray, - v::AbstractGPUArray, + ix::AbstractArray, + v::AbstractArray, backend::Backend=get_backend(v); lt=(<), @@ -27,8 +27,8 @@ function merge_sortperm!( inplace::Bool=false, block_size::Int=256, - temp_ix::Union{Nothing, AbstractGPUArray}=nothing, - temp_v::Union{Nothing, AbstractGPUArray}=nothing, + temp_ix::Union{Nothing, AbstractArray}=nothing, + temp_v::Union{Nothing, AbstractArray}=nothing, ) # Simple sanity checks @argcheck block_size > 0 @@ -61,7 +61,7 @@ end """ merge_sortperm( - v::AbstractGPUArray, backend::Backend=get_backend(v); + v::AbstractArray, backend::Backend=get_backend(v); lt=(<), by=identity, @@ -70,12 +70,12 @@ end inplace::Bool=false, block_size::Int=256, - temp_ix::Union{Nothing, AbstractGPUArray}=nothing, - temp_v::Union{Nothing, AbstractGPUArray}=nothing, + temp_ix::Union{Nothing, AbstractArray}=nothing, + temp_v::Union{Nothing, AbstractArray}=nothing, ) """ function merge_sortperm( - v::AbstractGPUArray, backend::Backend=get_backend(v); + v::AbstractArray, backend::Backend=get_backend(v); kwargs... 
) ix = similar(v, Int) @@ -88,8 +88,8 @@ end """ merge_sortperm_lowmem!( - ix::AbstractGPUArray, - v::AbstractGPUArray, + ix::AbstractArray, + v::AbstractArray, backend::Backend=get_backend(v); lt=(<), @@ -98,12 +98,12 @@ end order::Base.Order.Ordering=Base.Order.Forward, block_size::Int=256, - temp::Union{Nothing, AbstractGPUArray}=nothing, + temp::Union{Nothing, AbstractArray}=nothing, ) """ function merge_sortperm_lowmem!( - ix::AbstractGPUArray, - v::AbstractGPUArray, + ix::AbstractArray, + v::AbstractArray, backend::Backend=get_backend(v); lt=(<), @@ -112,7 +112,7 @@ function merge_sortperm_lowmem!( order::Base.Order.Ordering=Base.Order.Forward, block_size::Int=256, - temp::Union{Nothing, AbstractGPUArray}=nothing, + temp::Union{Nothing, AbstractArray}=nothing, ) # Simple sanity checks @argcheck block_size > 0 @@ -168,7 +168,7 @@ end """ merge_sortperm_lowmem( - v::AbstractGPUArray, backend::Backend=get_backend(v); + v::AbstractArray, backend::Backend=get_backend(v); lt=(<), by=identity, @@ -176,11 +176,11 @@ end order::Base.Order.Ordering=Base.Order.Forward, block_size::Int=256, - temp::Union{Nothing, AbstractGPUArray}=nothing, + temp::Union{Nothing, AbstractArray}=nothing, ) """ function merge_sortperm_lowmem( - v::AbstractGPUArray, backend::Backend=get_backend(v); + v::AbstractArray, backend::Backend=get_backend(v); kwargs... 
) ix = similar(v, Int) diff --git a/src/sort/sort.jl b/src/sort/sort.jl index 5fbce2d..8e55e3a 100644 --- a/src/sort/sort.jl +++ b/src/sort/sort.jl @@ -88,6 +88,7 @@ function _sort_impl!( max_tasks=Threads.nthreads(), min_elems=1, + prefer_threads::Bool=true, # GPU settings block_size::Int=256, @@ -95,7 +96,7 @@ function _sort_impl!( # Temporary buffer, same size as `v` temp::Union{Nothing, AbstractArray}=nothing, ) - if backend isa GPU + if use_KA_algo(v, prefer_threads) merge_sort!( v, backend; lt, by, rev, order, @@ -198,6 +199,7 @@ function _sortperm_impl!( max_tasks=Threads.nthreads(), min_elems=1, + prefer_threads::Bool=true, # GPU settings block_size::Int=256, @@ -205,7 +207,7 @@ function _sortperm_impl!( # Temporary buffer, same size as `v` temp::Union{Nothing, AbstractArray}=nothing, ) - if backend isa GPU + if use_KA_algo(v, prefer_threads) merge_sortperm_lowmem!( ix, v, backend; lt, by, rev, order, diff --git a/src/utils.jl b/src/utils.jl index 31b6246..f601b44 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -2,6 +2,10 @@ function ispow2(x) count_ones(x) == 1 end +# Helper to decide whether the KernelAbstractions (GPU-style) implementation of an algorithm should be used instead of the CPU threads one: true for GPU arrays, or when `prefer_threads` is false +@inline function use_KA_algo(output_array, prefer_threads) + return output_array isa AnyGPUArray || !prefer_threads +end """ struct TypeWrap{T} end diff --git a/test/accumulate.jl b/test/accumulate.jl index 759277d..5022e38 100644 --- a/test/accumulate.jl +++ b/test/accumulate.jl @@ -10,7 +10,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()] for num_elems in 1:256 x = array_from_host(ones(Int32, num_elems)) y = copy(x) - AK.accumulate!(+, y; init=0, inclusive=false, block_size=128, alg) + AK.accumulate!(+, y; prefer_threads, init=0, inclusive=false, block_size=128, alg) yh = Array(y) @test all(yh .== 0:length(yh) - 1) end @@ -19,7 +19,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()] for num_elems in 1:256 x = array_from_host(rand(1:1000, num_elems), Int32) y = copy(x) - AK.accumulate!(+, y; init=0, block_size=128, alg) + AK.accumulate!(+, 
y; init=0, block_size=128, alg) + AK.accumulate!(+, y; prefer_threads, init=0, block_size=128, alg) @test all(Array(y) .== accumulate(+, Array(x))) end @@ -28,7 +28,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()] num_elems = rand(1:100_000) x = array_from_host(ones(Int32, num_elems)) y = copy(x) - AK.accumulate!(+, y; init=0, inclusive=false, alg) + AK.accumulate!(+, y; prefer_threads, init=0, inclusive=false, alg) yh = Array(y) @test all(yh .== 0:length(yh) - 1) end @@ -38,7 +38,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()] num_elems = rand(1:100_000) x = array_from_host(rand(1:1000, num_elems), Int32) y = copy(x) - AK.accumulate!(+, y; init=0, alg) + AK.accumulate!(+, y; prefer_threads, init=0, alg) @test all(Array(y) .== accumulate(+, Array(x))) end @@ -47,7 +47,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()] num_elems = rand(1:100_000) x = array_from_host(rand(1:1000, num_elems), Int32) y = copy(x) - AK.accumulate!(+, y; init=0, block_size=16, alg) + AK.accumulate!(+, y; prefer_threads, init=0, block_size=16, alg) @test all(Array(y) .== accumulate(+, Array(x))) end @@ -58,7 +58,7 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()] n3 = rand(1:100) vh = rand(Float32, n1, n2, n3) v = array_from_host(vh) - AK.accumulate!(+, v; init=0, alg) + AK.accumulate!(+, v; prefer_threads, init=0, alg) @test all(Array(v) .≈ accumulate(+, vh)) end @@ -68,33 +68,33 @@ ALGS = AK.AccumulateAlgorithm[AK.ScanPrefixes()] x = array_from_host(rand(1:1000, num_elems), Int32) y = similar(x) init = rand(-1000:1000) - AK.accumulate!(+, y, x; init=Int32(init), alg) + AK.accumulate!(+, y, x; prefer_threads, init=Int32(init), alg) @test all(Array(y) .== accumulate(+, Array(x); init)) end # Exclusive scan x = array_from_host(ones(Int32, 10)) y = copy(x) - AK.accumulate!(+, y; init=0, inclusive=false, alg) + AK.accumulate!(+, y; prefer_threads, init=0, inclusive=false, alg) @test all(Array(y) .== 0:9) # Test init value is respected with exclusive scan too x = 
array_from_host(ones(Int32, 10)) y = copy(x) init = 10 - AK.accumulate!(+, y; init=Int32(init), inclusive=false, alg) + AK.accumulate!(+, y; prefer_threads, init=Int32(init), inclusive=false, alg) @test all(Array(y) .== 10:19) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.accumulate(+, y; init=10, dims=2, inclusive=false, bad=:kwarg) + @test_throws MethodError AK.accumulate(+, y; prefer_threads, init=10, dims=2, inclusive=false, bad=:kwarg) # Testing different settings AK.accumulate!(+, array_from_host(ones(Int32, 1000)); init=0, inclusive=false, - block_size=128, alg, + prefer_threads, block_size=128, alg, temp=array_from_host(zeros(Int32, 1000)), temp_flags=array_from_host(zeros(Int8, 1000))) AK.accumulate(+, array_from_host(ones(Int32, 1000)); init=0, inclusive=false, - block_size=128, alg, + prefer_threads, block_size=128, alg, temp=array_from_host(zeros(Int64, 1000)), temp_flags=array_from_host(zeros(Int8, 1000))) end @@ -110,7 +110,7 @@ end for ksize in 0:3 sh = rand(Int32(1):Int32(100), isize, jsize, ksize) s = array_from_host(sh) - d = AK.accumulate(+, s; init=Int32(0), dims) + d = AK.accumulate(+, s; prefer_threads, init=Int32(0), dims) dh = Array(d) dhres = accumulate(+, sh; init=Int32(0), dims) @@ -130,7 +130,7 @@ end vh = rand(Int32(1):Int32(100), n1, n2, n3) v = array_from_host(vh) - s = AK.accumulate(+, v; init=Int32(0), dims) + s = AK.accumulate(+, v; prefer_threads, init=Int32(0), dims) sh = Array(s) @test sh == accumulate(+, vh; init=Int32(0), dims) end @@ -144,7 +144,7 @@ end vh = rand(UInt32(1):UInt32(100), n1, n2, n3) v = array_from_host(vh) - s = AK.accumulate(+, v; init=UInt32(0), dims) + s = AK.accumulate(+, v; prefer_threads, init=UInt32(0), dims) sh = Array(s) @test sh == accumulate(+, vh; init=UInt32(0), dims) end @@ -158,7 +158,7 @@ end vh = rand(Float32, n1, n2, n3) v = array_from_host(vh) - s = AK.accumulate(+, v; init=Float32(0), dims) + s = AK.accumulate(+, v; prefer_threads, init=Float32(0), dims) sh = 
Array(s) @test all(sh .≈ accumulate(+, vh; init=Float32(0), dims)) end @@ -173,7 +173,7 @@ end vh = rand(Float32, n1, n2, n3) v = array_from_host(vh) init = rand(-1000:1000) - s = AK.accumulate(+, v; init=Float32(init), dims) + s = AK.accumulate(+, v; prefer_threads, init=Float32(init), dims) sh = Array(s) @test all(sh .≈ accumulate(+, vh; init=Float32(init), dims)) end @@ -182,19 +182,19 @@ end # Exclusive scan vh = ones(Int32, 10, 10) v = array_from_host(vh) - s = AK.accumulate(+, v; init=0, dims=2, inclusive=false) + s = AK.accumulate(+, v; prefer_threads, init=0, dims=2, inclusive=false) sh = Array(s) @test all([sh[i, :] == 0:9 for i in 1:10]) # Test init value is respected with exclusive scan too vh = ones(Int32, 10, 10) v = array_from_host(vh) - s = AK.accumulate(+, v; init=10, dims=2, inclusive=false) + s = AK.accumulate(+, v; prefer_threads, init=10, dims=2, inclusive=false) sh = Array(s) @test all([sh[i, :] == 10:19 for i in 1:10]) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.accumulate(+, v; init=10, dims=2, inclusive=false, bad=:kwarg) + @test_throws MethodError AK.accumulate(+, v; prefer_threads, init=10, dims=2, inclusive=false, bad=:kwarg) # Test all options with bigger matrices for D in [(1_000_000,3), (3,1_000_000)], dims in [1,2] @@ -210,7 +210,8 @@ end # Testing different settings AK.accumulate( (x, y) -> x + 1, - array_from_host(rand(Int32, 3, 4, 5)), + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, init=Int32(0), neutral=Int32(0), dims=2, @@ -219,7 +220,8 @@ end ) AK.accumulate( (x, y) -> x + 1, - array_from_host(rand(Int32, 3, 4, 5)), + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, init=Int32(0), neutral=Int32(0), dims=3, @@ -234,14 +236,14 @@ end # Simple correctness tests v = array_from_host(1:100) vh = Array(v) - @test Array(AK.cumsum(v)) == cumsum(vh) + @test Array(AK.cumsum(v; prefer_threads)) == cumsum(vh) # Fuzzy testing for _ in 1:100 num_elems = rand(1:100_000) vh = rand(Float32, 
num_elems) v = array_from_host(vh) - @test all(Array(AK.cumsum(v)) .≈ cumsum(vh)) + @test all(Array(AK.cumsum(v; prefer_threads)) .≈ cumsum(vh)) end for _ in 1:100 @@ -253,10 +255,10 @@ end v = array_from_host(vh) # Indexing into array as if linear; not supported in Base - # @test all(Array(AK.cumsum(v)) .== cumsum(vh)) + # @test all(Array(AK.cumsum(v; prefer_threads)) .== cumsum(vh)) # Along dimensions - r = Array(AK.cumsum(v; dims)) + r = Array(AK.cumsum(v; prefer_threads, dims)) rh = cumsum(vh; dims) @test r == rh @@ -266,14 +268,14 @@ end # Test promotion to op-dictated type xh = rand(Bool, 16) x = array_from_host(xh) - @test Array(AK.cumsum(x)) == cumsum(xh) + @test Array(AK.cumsum(x; prefer_threads)) == cumsum(xh) # Testing different settings v = array_from_host(rand(-5:5, 100_000)) - AK.cumsum(v, block_size=64) + AK.cumsum(v; prefer_threads, block_size=64) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.cumsum(v; init=10, bad=:kwarg) + @test_throws MethodError AK.cumsum(v; prefer_threads, init=10, bad=:kwarg) # The other settings are stress-tested in reduce end @@ -286,11 +288,11 @@ end # Simple correctness tests v = array_from_host(1:100) vh = Array(v) - @test Array(AK.cumprod(v)) == cumprod(vh) + @test Array(AK.cumprod(v; prefer_threads)) == cumprod(vh) vh = ones(Float32, 100_000) v = array_from_host(vh) - @test Array(AK.cumprod(v)) == vh + @test Array(AK.cumprod(v; prefer_threads)) == vh # Fuzzy testing for _ in 1:100 @@ -302,10 +304,10 @@ end v = array_from_host(vh) # Indexing into array as if linear; not supported in Base - # @test all(Array(AK.cumprod(v)) .== cumprod(vh)) + # @test all(Array(AK.cumprod(v; prefer_threads)) .== cumprod(vh)) # Along dimensions - r = Array(AK.cumprod(v; dims)) + r = Array(AK.cumprod(v; prefer_threads, dims)) rh = cumprod(vh; dims) @test r == rh @@ -314,10 +316,10 @@ end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) - AK.cumprod(v, block_size=64) + AK.cumprod(v; 
prefer_threads, block_size=64) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.cumprod(v; init=10, bad=:kwarg) + @test_throws MethodError AK.cumprod(v; prefer_threads, init=10, bad=:kwarg) # The other settings are stress-tested in reduce end diff --git a/test/binarysearch.jl b/test/binarysearch.jl index b988cde..0f11c4c 100644 --- a/test/binarysearch.jl +++ b/test/binarysearch.jl @@ -11,11 +11,11 @@ v = array_from_host(sort(rand(Int32, num_elems_v))) x = array_from_host(rand(Int32, num_elems_x)) ix = similar(x, Int32) - AK.searchsortedfirst!(ix, v, x) + AK.searchsortedfirst!(ix, v, x; prefer_threads) vh = Array(v) xh = Array(x) - ixh = AK.searchsortedfirst(vh, xh) + ixh = AK.searchsortedfirst(vh, xh; prefer_threads) ixh_base = [searchsortedfirst(vh, e) for e in xh] @test all(Array(ix) .== ixh .== ixh_base) @@ -24,11 +24,11 @@ v = array_from_host(sort(rand(Float32, num_elems_v))) x = array_from_host(rand(Float32, num_elems_x)) ix = similar(x, Int32) - AK.searchsortedfirst!(ix, v, x) + AK.searchsortedfirst!(ix, v, x; prefer_threads) vh = Array(v) xh = Array(x) - ixh = AK.searchsortedfirst(vh, xh) + ixh = AK.searchsortedfirst(vh, xh; prefer_threads) ixh_base = [searchsortedfirst(vh, e) for e in xh] @test all(Array(ix) .== ixh .== ixh_base) @@ -43,11 +43,11 @@ v = array_from_host(sort(rand(Int32, num_elems_v))) x = array_from_host(rand(Int32, num_elems_x)) ix = similar(x, Int32) - AK.searchsortedlast!(ix, v, x) + AK.searchsortedlast!(ix, v, x; prefer_threads) vh = Array(v) xh = Array(x) - ixh = AK.searchsortedlast(vh, xh) + ixh = AK.searchsortedlast(vh, xh; prefer_threads) ixh_base = [searchsortedlast(vh, e) for e in xh] @test all(Array(ix) .== ixh .== ixh_base) @@ -56,11 +56,11 @@ v = array_from_host(sort(rand(Float32, num_elems_v))) x = array_from_host(rand(Float32, num_elems_x)) ix = similar(x, Int32) - AK.searchsortedlast!(ix, v, x) + AK.searchsortedlast!(ix, v, x; prefer_threads) vh = Array(v) xh = Array(x) - ixh = 
AK.searchsortedlast(vh, xh) + ixh = AK.searchsortedlast(vh, xh; prefer_threads) ixh_base = [searchsortedlast(vh, e) for e in xh] @test all(Array(ix) .== ixh .== ixh_base) @@ -71,23 +71,23 @@ x = array_from_host(rand(Int32, 10_000)) ix = similar(x, Int32) - AK.searchsortedfirst!(ix, v, x, by=abs, lt=(>), rev=true, block_size=64) - AK.searchsortedfirst(v, x, by=abs, lt=(>), rev=true, block_size=64) - AK.searchsortedlast!(ix, v, x, by=abs, lt=(>), rev=true, block_size=64) - AK.searchsortedlast(v, x, by=abs, lt=(>), rev=true, block_size=64) + AK.searchsortedfirst!(ix, v, x; prefer_threads, by=abs, lt=(>), rev=true, block_size=64) + AK.searchsortedfirst(v, x; prefer_threads, by=abs, lt=(>), rev=true, block_size=64) + AK.searchsortedlast!(ix, v, x; prefer_threads, by=abs, lt=(>), rev=true, block_size=64) + AK.searchsortedlast(v, x; prefer_threads, by=abs, lt=(>), rev=true, block_size=64) vh = Array(v) xh = Array(x) ixh = similar(xh, Int32) - AK.searchsortedfirst!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100) - AK.searchsortedfirst(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100) - AK.searchsortedlast!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100) - AK.searchsortedlast(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100) + AK.searchsortedfirst!(ixh, vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100) + AK.searchsortedfirst(vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100) + AK.searchsortedlast!(ixh, vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100) + AK.searchsortedlast(vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.searchsortedfirst!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg) - @test_throws MethodError AK.searchsortedfirst(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, 
min_elems=100, bad=:kwarg) - @test_throws MethodError AK.searchsortedlast!(ixh, vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg) - @test_throws MethodError AK.searchsortedlast(vh, xh, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg) + @test_throws MethodError AK.searchsortedfirst!(ixh, vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg) + @test_throws MethodError AK.searchsortedfirst(vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg) + @test_throws MethodError AK.searchsortedlast!(ixh, vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg) + @test_throws MethodError AK.searchsortedlast(vh, xh; prefer_threads, by=abs, lt=(>), rev=true, max_tasks=10, min_elems=100, bad=:kwarg) end diff --git a/test/looping.jl b/test/looping.jl index 746fe8c..55944d6 100644 --- a/test/looping.jl +++ b/test/looping.jl @@ -3,27 +3,27 @@ Random.seed!(0) # CPU - if BACKEND == CPU() + if IS_CPU_BACKEND && prefer_threads x = zeros(Int, 1000) - AK.foreachindex(x) do i + AK.foreachindex(x; prefer_threads) do i x[i] = i end @test all(x .== 1:length(x)) x = zeros(Int, 1000) - AK.foreachindex(x, max_tasks=1, min_elems=1) do i + AK.foreachindex(x; prefer_threads, max_tasks=1, min_elems=1) do i x[i] = i end @test all(x .== 1:length(x)) x = zeros(Int, 1000) - AK.foreachindex(x, max_tasks=10, min_elems=1) do i + AK.foreachindex(x; prefer_threads, max_tasks=10, min_elems=1) do i x[i] = i end @test all(x .== 1:length(x)) x = zeros(Int, 1000) - AK.foreachindex(x, max_tasks=10, min_elems=10) do i + AK.foreachindex(x; prefer_threads, max_tasks=10, min_elems=10) do i x[i] = i end @test all(x .== 1:length(x)) @@ -31,7 +31,7 @@ # GPU else x = array_from_host(zeros(Int, 10_000)) - f1(x) = AK.foreachindex(x) do i # This must be inside a function to have a known type! 
+ f1(x) = AK.foreachindex(x; prefer_threads) do i # This must be inside a function to have a known type! x[i] = i end f1(x) @@ -39,7 +39,7 @@ @test all(xh .== 1:length(xh)) x = array_from_host(zeros(Int, 10_000)) - f2(x) = AK.foreachindex(x, block_size=64) do i + f2(x) = AK.foreachindex(x; prefer_threads, block_size=64) do i x[i] = i end f2(x) @@ -59,12 +59,12 @@ end end x = array_from_host(zeros(Int, 10, 1000)) - f1(x) + f1(x; prefer_threads) xh = Array(x) @test all(xh .== (1:10) .+ (1:1000)') x = array_from_host(zeros(UInt32, 10, 1000)) - f1(x, max_tasks=2, min_elems=100, block_size=64) + f1(x; prefer_threads, max_tasks=2, min_elems=100, block_size=64) xh = Array(x) @test all(xh .== (1:10) .+ (1:1000)') @@ -75,12 +75,12 @@ end end x = array_from_host(zeros(Int, 10, 1000)) - f2(x) + f2(x; prefer_threads) xh = Array(x) @test all(xh .== (1:10) .+ (1:1000)') x = array_from_host(zeros(UInt32, 10, 1000)) - f2(x, max_tasks=2, min_elems=100, block_size=64) + f2(x; prefer_threads, max_tasks=2, min_elems=100, block_size=64) xh = Array(x) @test all(xh .== (1:10) .+ (1:1000)') @@ -90,7 +90,7 @@ end end x = array_from_host(zeros(Int, 10, 1000)) - f3(x) + f3(x; prefer_threads) xh = Array(x) @test all(xh[:] .== 1:length(x)) end diff --git a/test/map.jl b/test/map.jl index 01c55bd..24fa8bd 100644 --- a/test/map.jl +++ b/test/map.jl @@ -2,56 +2,56 @@ Random.seed!(0) # CPU - if BACKEND == CPU() + if IS_CPU_BACKEND && prefer_threads x = Array(1:1000) - y = AK.map(x) do i + y = AK.map(x; prefer_threads) do i i^2 end @test y == map(i -> i^2, x) x = Array(1:1000) y = zeros(Int, 1000) - AK.map!(y, x) do i + AK.map!(y, x; prefer_threads) do i i^2 end @test y == map(i -> i^2, x) x = rand(Float32, 1000) - y = AK.map(x, max_tasks=2, min_elems=100) do i + y = AK.map(x; prefer_threads, max_tasks=2, min_elems=100) do i i > 0.5 ? i : 0 end @test y == map(i -> i > 0.5 ? 
i : 0, x) x = rand(Float32, 1000) - y = AK.map(x, max_tasks=4, min_elems=500) do i + y = AK.map(x; prefer_threads, max_tasks=4, min_elems=500) do i i > 0.5 ? i : 0 end @test y == map(i -> i > 0.5 ? i : 0, x) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.map(x -> x^2, x; bad=:kwarg) + @test_throws MethodError AK.map(x -> x^2, x; prefer_threads, bad=:kwarg) # GPU else x = array_from_host(1:1000) - y = AK.map(x) do i + y = AK.map(x; prefer_threads) do i i^2 end @test Array(y) == map(i -> i^2, 1:1000) x = array_from_host(1:1000) y = array_from_host(zeros(Int, 1000)) - AK.map!(y, x) do i + AK.map!(y, x; prefer_threads) do i i^2 end @test Array(y) == map(i -> i^2, 1:1000) x = array_from_host(rand(Float32, 1000)) - y = AK.map(x, block_size=64) do i + y = AK.map(x; prefer_threads, block_size=64) do i i > 0.5 ? i : 0 end @test Array(y) == map(i -> i > 0.5 ? i : 0, Array(x)) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.map(x -> x^2, x; bad=:kwarg) + @test_throws MethodError AK.map(x -> x^2, x; prefer_threads, bad=:kwarg) end end diff --git a/test/predicates.jl b/test/predicates.jl index cd589e1..cc455bc 100644 --- a/test/predicates.jl +++ b/test/predicates.jl @@ -5,28 +5,28 @@ # Simple correctness tests v = array_from_host(1:100) - @test AK.any(x->x<0, v) === false - @test AK.any(x->x>99, v) === true + @test AK.any(x->x<0, v; prefer_threads) === false + @test AK.any(x->x>99, v; prefer_threads) === true - @test AK.all(x->x>0, v) === true - @test AK.all(x->x<100, v) === false + @test AK.all(x->x>0, v; prefer_threads) === true + @test AK.all(x->x<100, v; prefer_threads) === false for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) - @test AK.any(x->x<0, v) === false - @test AK.any(x->x<1, v) === true - @test AK.all(x->x<1, v) === true - @test AK.all(x->x<0, v) === false + @test AK.any(x->x<0, v; prefer_threads) === false + @test AK.any(x->x<1, v; prefer_threads) === true + @test 
AK.all(x->x<1, v; prefer_threads) === true + @test AK.all(x->x<0, v; prefer_threads) === false end for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) - @test AK.any(x->x<0, v) === false - @test AK.any(x->x<1, v) === true - @test AK.all(x->x<1, v) === true - @test AK.all(x->x<0, v) === false + @test AK.any(x->x<0, v; prefer_threads) === false + @test AK.any(x->x<1, v; prefer_threads) === true + @test AK.all(x->x<1, v; prefer_threads) === true + @test AK.all(x->x<0, v; prefer_threads) === false end # Test the MapReduce algorithm which works on all platforms @@ -34,14 +34,14 @@ num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) alg=AK.MapReduce(temp=similar(v, Bool), switch_below=100) - @test AK.any(x->x<0, v; alg) === false - @test AK.any(x->x<1, v; alg) === true - @test AK.all(x->x<1, v; alg) === true - @test AK.all(x->x<0, v; alg) === false + @test AK.any(x->x<0, v; prefer_threads, alg) === false + @test AK.any(x->x<1, v; prefer_threads, alg) === true + @test AK.all(x->x<1, v; prefer_threads, alg) === true + @test AK.all(x->x<0, v; prefer_threads, alg) === false end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) - AK.any(x->x<5, v, max_tasks=2, min_elems=100, block_size=64) - AK.all(x->x<5, v, max_tasks=2, min_elems=100, block_size=64) + AK.any(x->x<5, v; prefer_threads, max_tasks=2, min_elems=100, block_size=64) + AK.all(x->x<5, v; prefer_threads, max_tasks=2, min_elems=100, block_size=64) end diff --git a/test/reduce.jl b/test/reduce.jl index 7e9e4a6..9fe8b5c 100644 --- a/test/reduce.jl +++ b/test/reduce.jl @@ -13,6 +13,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) AK.reduce( (x, y) -> x < y ? 
x : y, s; + prefer_threads, init=typemax(eltype(s)), neutral=typemax(eltype(s)), ) @@ -48,6 +49,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) AK.reduce( (x, y) -> x + y, s; + prefer_threads, init=zero(eltype(s)), neutral=zero(eltype(s)), ) @@ -93,7 +95,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Int32(1):Int32(100), num_elems)) - s = AK.reduce(+, v; init=Int32(10)) + s = AK.reduce(+, v; prefer_threads, init=Int32(10)) vh = Array(v) @test s == sum(vh) + 10 end @@ -104,7 +106,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) v = array_from_host(rand(1:100, num_elems), Int32) switch_below = rand(1:100) init = rand(1:100) - s = AK.reduce(+, v; switch_below=switch_below, init=Int32(init)) + s = AK.reduce(+, v; prefer_threads, switch_below=switch_below, init=Int32(init)) vh = Array(v) @test s == reduce(+, vh; init) end @@ -113,7 +115,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) for _ in 1:100 num_elems = rand(1:1000) v = 1:num_elems - s = AK.reduce(+, v, BACKEND; init=Int32(0)) + s = AK.reduce(+, v, BACKEND; prefer_threads, init=Int32(0)) vh = Array(v) @test s == reduce(+, vh) end @@ -124,7 +126,8 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) # Testing different settings AK.reduce( (x, y) -> x + 1, - array_from_host(rand(Int32, 10_000)), + array_from_host(rand(Int32, 10_000)); + prefer_threads, init=Int32(0), neutral=Int64(0), block_size=64, @@ -135,7 +138,8 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) ) AK.reduce( (x, y) -> x + 1, - rand(Int32, 10_000), + rand(Int32, 10_000); + prefer_threads, init=Int32(0), neutral=Int64(0), max_tasks=16, @@ -154,7 +158,7 @@ end for ksize in 0:3 sh = rand(Int32(1):Int32(100), isize, jsize, ksize) s = array_from_host(sh) - d = AK.reduce(+, s; init=Int32(10), dims) + d = AK.reduce(+, s; prefer_threads, init=Int32(10), dims) dh = Array(d) @test dh == sum(sh; init=Int32(10), dims) @test eltype(dh) == eltype(sum(sh; init=Int32(10), dims)) @@ 
-171,7 +175,7 @@ end n3 = rand(1:100) vh = rand(Int32(1):Int32(100), n1, n2, n3) v = array_from_host(vh) - s = AK.reduce(+, v; init=Int32(0), dims) + s = AK.reduce(+, v; prefer_threads, init=Int32(0), dims) sh = Array(s) @test sh == sum(vh; dims) end @@ -184,7 +188,7 @@ end n3 = rand(1:100) vh = rand(UInt32(1):UInt32(100), n1, n2, n3) v = array_from_host(vh) - s = AK.reduce(+, v; init=UInt32(0), dims) + s = AK.reduce(+, v; prefer_threads, init=UInt32(0), dims) sh = Array(s) @test sh == sum(vh; dims) end @@ -197,7 +201,7 @@ end n3 = rand(1:100) vh = rand(Float32, n1, n2, n3) v = array_from_host(vh) - s = AK.reduce(+, v; init=Float32(0), dims) + s = AK.reduce(+, v; prefer_threads, init=Float32(0), dims) sh = Array(s) @test sh ≈ sum(vh; dims) end @@ -212,19 +216,20 @@ end vh = rand(Int32(1):Int32(100), n1, n2, n3) v = array_from_host(vh) init = rand(1:100) - s = AK.reduce(+, v; init=Int32(init), dims) + s = AK.reduce(+, v; prefer_threads, init=Int32(init), dims) sh = Array(s) @test sh == reduce(+, vh; dims, init) end end # Test that undefined kwargs are not accepted - @test_throws MethodError AK.reduce(+, array_from_host(rand(Int32, 10, 10)); init=10, bad=:kwarg) + @test_throws MethodError AK.reduce(+, array_from_host(rand(Int32, 10, 10)); prefer_threads, init=10, bad=:kwarg) # Testing different settings AK.reduce( (x, y) -> x + 1, - array_from_host(rand(Int32, 3, 4, 5)), + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, init=Int32(0), neutral=Int32(0), dims=2, @@ -236,7 +241,8 @@ end ) AK.reduce( (x, y) -> x + 1, - array_from_host(rand(Int32, 3, 4, 5)), + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, init=Int32(0), neutral=Int32(0), dims=3, @@ -258,6 +264,7 @@ end p -> (p.x, p.y), (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? 
a[2] : b[2]), s; + prefer_threads, init=(typemax(Float32), typemax(Float32)), neutral=(typemax(Float32), typemax(Float32)), ) @@ -310,7 +317,7 @@ end for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Int32(1):Int32(100), num_elems)) - s = AK.mapreduce(abs, +, v; init=Int32(10)) + s = AK.mapreduce(abs, +, v; prefer_threads, init=Int32(10)) vh = Array(v) @test s == sum(vh) + 10 end @@ -321,7 +328,7 @@ end v = array_from_host(rand(-100:-1, num_elems), Int32) switch_below = rand(1:100) init = rand(1:100) - s = AK.mapreduce(abs, +, v; switch_below=switch_below, init=Int32(init)) + s = AK.mapreduce(abs, +, v; prefer_threads, switch_below=switch_below, init=Int32(init)) vh = Array(v) @test s == mapreduce(abs, +, vh; init) end @@ -330,7 +337,7 @@ end for _ in 1:100 num_elems = rand(1:1000) v = 1:num_elems - s = AK.mapreduce(abs, +, v, BACKEND; init=Int32(0)) + s = AK.mapreduce(abs, +, v, BACKEND; prefer_threads, init=Int32(0)) vh = Array(v) @test s == mapreduce(abs, +, vh) end @@ -339,7 +346,8 @@ end f(s, temp) = AK.mapreduce( p -> (p.x, p.y), (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? 
a[2] : b[2]), - s, + s; + prefer_threads, init=(typemax(Float32), typemax(Float32)), neutral=(typemax(Float32), typemax(Float32)), block_size=64, @@ -353,7 +361,7 @@ end f(v, temp) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.mapreduce(-, +, v; init=10, bad=:kwarg) + @test_throws MethodError AK.mapreduce(-, +, v; prefer_threads, init=10, bad=:kwarg) end @@ -367,7 +375,7 @@ end for ksize in 0:3 sh = rand(Int32(-100):Int32(100), isize, jsize, ksize) s = array_from_host(sh) - d = AK.mapreduce(-, +, s; init=Int32(-10), dims) + d = AK.mapreduce(-, +, s; prefer_threads, init=Int32(-10), dims) dh = Array(d) @test dh == mapreduce(-, +, sh; init=Int32(-10), dims) @test eltype(dh) == eltype(mapreduce(-, +, sh; init=Int32(-10), dims)) @@ -384,7 +392,7 @@ end n3 = rand(1:100) vh = rand(Int32(1):Int32(100), n1, n2, n3) v = array_from_host(vh) - s = AK.mapreduce(-, +, v; init=Int32(0), dims) + s = AK.mapreduce(-, +, v; prefer_threads, init=Int32(0), dims) sh = Array(s) @test sh == mapreduce(-, +, vh; init=Int32(0), dims) end @@ -396,6 +404,7 @@ end p -> (p.x, p.y), (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? 
a[2] : b[2]), s; + prefer_threads, init=(typemax(Float32), typemax(Float32)), neutral=(typemax(Float32), typemax(Float32)), dims, @@ -443,20 +452,21 @@ end vh = rand(Int32(-100):Int32(100), n1, n2, n3) v = array_from_host(vh) init = rand(1:100) - s = AK.mapreduce(-, +, v; init=Int32(init), dims) + s = AK.mapreduce(-, +, v; prefer_threads, init=Int32(init), dims) sh = Array(s) @test sh == mapreduce(-, +, vh; dims, init) end end # Test that undefined kwargs are not accepted - @test_throws MethodError AK.mapreduce(-, +, array_from_host(rand(Int32, 3, 4, 5)); init=10, bad=:kwarg) + @test_throws MethodError AK.mapreduce(-, +, array_from_host(rand(Int32, 3, 4, 5)); prefer_threads, init=10, bad=:kwarg) # Testing different settings AK.mapreduce( -, (x, y) -> x + 1, - array_from_host(rand(Int32, 3, 4, 5)), + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, init=Int32(0), neutral=Int32(0), dims=2, @@ -469,7 +479,8 @@ end AK.mapreduce( -, (x, y) -> x + 1, - array_from_host(rand(Int32, 3, 4, 5)), + array_from_host(rand(Int32, 3, 4, 5)); + prefer_threads, init=Int32(0), neutral=Int32(0), dims=3, @@ -486,13 +497,13 @@ end # Simple correctness tests v = array_from_host(1:100) - @test AK.sum(v) == sum(Array(v)) + @test AK.sum(v; prefer_threads) == sum(Array(v)) # Fuzzy testing for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) - @test AK.sum(v) ≈ sum(Array(v)) + @test AK.sum(v; prefer_threads) ≈ sum(Array(v)) end for _ in 1:100 @@ -504,10 +515,10 @@ end v = array_from_host(vh) # Indexing into array as if linear - @test AK.sum(v) == sum(vh) + @test AK.sum(v; prefer_threads) == sum(vh) # Along dimensions - r = Array(AK.sum(v; dims)) + r = Array(AK.sum(v; prefer_threads, dims)) rh = sum(vh; dims) @test r == rh @@ -516,10 +527,10 @@ end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) - AK.sum(v, block_size=64) + AK.sum(v; prefer_threads, block_size=64) # Test that undefined kwargs are not accepted - @test_throws 
MethodError AK.sum(v; bad=:kwarg) + @test_throws MethodError AK.sum(v; prefer_threads, bad=:kwarg) # The other settings are stress-tested in reduce end @@ -531,13 +542,13 @@ end # Simple correctness tests v = array_from_host(1:100) - @test AK.prod(v) == prod(Array(v)) + @test AK.prod(v; prefer_threads) == prod(Array(v)) # Fuzzy testing for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) - @test AK.prod(v) ≈ prod(Array(v)) + @test AK.prod(v; prefer_threads) ≈ prod(Array(v)) end for _ in 1:100 @@ -549,10 +560,10 @@ end v = array_from_host(vh) # Indexing into array as if linear - @test AK.sum(v) == sum(vh) + @test AK.sum(v; prefer_threads) == sum(vh) # Along dimensions - r = Array(AK.sum(v; dims)) + r = Array(AK.sum(v; prefer_threads, dims)) rh = sum(vh; dims) @test r == rh @@ -561,10 +572,10 @@ end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) - AK.prod(v, block_size=64) + AK.prod(v; prefer_threads, block_size=64) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.prod(v; bad=:kwarg) + @test_throws MethodError AK.prod(v; prefer_threads, bad=:kwarg) # The other settings are stress-tested in reduce end @@ -576,13 +587,13 @@ end # Simple correctness tests v = array_from_host(1:100) - @test AK.minimum(v) == minimum(Array(v)) + @test AK.minimum(v; prefer_threads) == minimum(Array(v)) # Fuzzy testing for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) - @test AK.minimum(v) == minimum(Array(v)) + @test AK.minimum(v; prefer_threads) == minimum(Array(v)) end for _ in 1:100 @@ -594,10 +605,10 @@ end v = array_from_host(vh) # Indexing into array as if linear - @test AK.minimum(v) == minimum(vh) + @test AK.minimum(v; prefer_threads) == minimum(vh) # Along dimensions - r = Array(AK.minimum(v; dims)) + r = Array(AK.minimum(v; prefer_threads, dims)) rh = minimum(vh; dims) @test r == rh @@ -606,10 +617,10 @@ end # Testing different settings v = 
array_from_host(rand(-5:5, 100_000)) - AK.minimum(v, block_size=64) + AK.minimum(v; prefer_threads, block_size=64) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.minimum(v; bad=:kwarg) + @test_throws MethodError AK.minimum(v; prefer_threads, bad=:kwarg) # The other settings are stress-tested in reduce end @@ -621,13 +632,13 @@ end # Simple correctness tests v = array_from_host(1:100) - @test AK.maximum(v) == maximum(Array(v)) + @test AK.maximum(v; prefer_threads) == maximum(Array(v)) # Fuzzy testing for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) - @test AK.maximum(v) == maximum(Array(v)) + @test AK.maximum(v; prefer_threads) == maximum(Array(v)) end for _ in 1:100 @@ -639,10 +650,10 @@ end v = array_from_host(vh) # Indexing into array as if linear - @test AK.maximum(v) == maximum(vh) + @test AK.maximum(v; prefer_threads) == maximum(vh) # Along dimensions - r = Array(AK.maximum(v; dims)) + r = Array(AK.maximum(v; prefer_threads, dims)) rh = maximum(vh; dims) @test r == rh @@ -651,10 +662,10 @@ end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) - AK.maximum(v, block_size=64) + AK.maximum(v; prefer_threads, block_size=64) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.maximum(v; bad=:kwarg) + @test_throws MethodError AK.maximum(v; prefer_threads, bad=:kwarg) # The other settings are stress-tested in reduce end @@ -666,13 +677,13 @@ end # Simple correctness tests v = array_from_host(1:100) - @test AK.count(x->x>50, v) == count(x->x>50, Array(v)) + @test AK.count(x->x>50, v; prefer_threads) == count(x->x>50, Array(v)) # Fuzzy testing for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) - @test AK.count(x->x>0.5, v) == count(x->x>0.5, Array(v)) + @test AK.count(x->x>0.5, v; prefer_threads) == count(x->x>0.5, Array(v)) end for _ in 1:100 @@ -684,10 +695,10 @@ end v = array_from_host(vh) # Indexing into array as if 
linear - @test AK.count(x->x>0.5, v) == count(x->x>0.5, vh) + @test AK.count(x->x>0.5, v; prefer_threads) == count(x->x>0.5, vh) # Along dimensions - r = Array(AK.count(x->x>0.5, v; dims)) + r = Array(AK.count(x->x>0.5, v; prefer_threads, dims)) rh = count(x->x>0.5, vh; dims) @test r == rh @@ -698,15 +709,15 @@ end for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Bool, num_elems)) - @test AK.count(v) == count(Array(v)) + @test AK.count(v; prefer_threads) == count(Array(v)) end # Testing different settings v = array_from_host(rand(-5:5, 100_000)) - AK.count(x->x>0, v, block_size=64) + AK.count(x->x>0, v; prefer_threads, block_size=64) # Test that undefined kwargs are not accepted - @test_throws MethodError AK.count(v; bad=:kwarg) + @test_throws MethodError AK.count(v; prefer_threads, bad=:kwarg) # The other settings are stress-tested in reduce end diff --git a/test/runtests.jl b/test/runtests.jl index 8db1c11..dcb4896 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -35,19 +35,22 @@ elseif "--Metal" in ARGS const BACKEND = MetalBackend() elseif "--OpenCL" in ARGS Pkg.add(name="OpenCL", rev="master") + Pkg.add(name="SPIRVIntrinsics", rev="master") Pkg.add("pocl_jll") using pocl_jll using OpenCL OpenCL.versioninfo() const BACKEND = OpenCLBackend() - TEST_DL[] = true elseif !@isdefined(BACKEND) # Otherwise do CPU tests using InteractiveUtils InteractiveUtils.versioninfo() - const BACKEND = CPU() + const BACKEND = get_backend([]) end +const IS_CPU_BACKEND = BACKEND == get_backend([]) + +global prefer_threads::Bool = !(IS_CPU_BACKEND && "--cpuKA" in ARGS) array_from_host(h_arr::AbstractArray, dtype=nothing) = array_from_host(BACKEND, h_arr, dtype) function array_from_host(backend, h_arr::AbstractArray, dtype=nothing) diff --git a/test/sort.jl b/test/sort.jl index 59e9505..ee48c00 100644 --- a/test/sort.jl +++ b/test/sort.jl @@ -1,4 +1,4 @@ -if BACKEND != CPU() +if !IS_CPU_BACKEND || !prefer_threads @testset "merge_sort" begin Random.seed!(0) @@ 
-98,7 +98,7 @@ end for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Int32, num_elems)) - AK.sort!(v) + AK.sort!(v; prefer_threads) vh = Array(v) @test issorted(vh) end @@ -106,7 +106,7 @@ end for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(UInt32, num_elems)) - AK.sort!(v) + AK.sort!(v; prefer_threads) vh = Array(v) @test issorted(vh) end @@ -114,39 +114,39 @@ end for _ in 1:100 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) - AK.sort!(v) + AK.sort!(v; prefer_threads) vh = Array(v) @test issorted(vh) end # Testing different settings v = array_from_host(rand(1:100_000, 10_000), Float32) - AK.sort!(v, lt=(>), by=abs, rev=true, + AK.sort!(v; prefer_threads, lt=(>), by=abs, rev=true, max_tasks=64, min_elems=8, block_size=64, temp=array_from_host(1:10_000, Float32)) @test issorted(Array(v)) v = array_from_host(rand(1:100_000, 10_000), Int32) - AK.sort!(v, lt=(>), rev=true, + AK.sort!(v; prefer_threads, lt=(>), rev=true, max_tasks=64, min_elems=8, block_size=64, temp=array_from_host(1:10_000, Int32)) @test issorted(Array(v)) v = array_from_host(rand(1:100_000, 10_000), Float32) - v = AK.sort(v, lt=(>), by=abs, rev=true, + v = AK.sort(v; prefer_threads, lt=(>), by=abs, rev=true, max_tasks=64, min_elems=8, block_size=64, temp=array_from_host(1:10_000, Float32)) @test issorted(Array(v)) v = array_from_host(rand(1:100_000, 10_000), Int32) - v = AK.sort(v, lt=(>), by=abs, rev=true, + v = AK.sort(v; prefer_threads, lt=(>), by=abs, rev=true, max_tasks=64, min_elems=8, block_size=64, temp=array_from_host(1:10_000, Int32)) @test issorted(Array(v)) end -if BACKEND != CPU() +if !IS_CPU_BACKEND || !prefer_threads @testset "merge_sort_by_key" begin Random.seed!(0) @@ -228,7 +228,7 @@ end end -if BACKEND != CPU() +if !IS_CPU_BACKEND || !prefer_threads @testset "merge_sortperm" begin Random.seed!(0) @@ -337,7 +337,7 @@ end end -if BACKEND != CPU() +if !IS_CPU_BACKEND || !prefer_threads @testset "merge_sortperm_lowmem" 
begin Random.seed!(0) @@ -404,7 +404,7 @@ end num_elems = rand(1:100_000) ix = array_from_host(zeros(Int32, num_elems)) v = array_from_host(rand(Int32, num_elems)) - AK.sortperm!(ix, v) + AK.sortperm!(ix, v; prefer_threads) ixh = Array(ix) vh = Array(v) @test issorted(vh[ixh]) @@ -414,7 +414,7 @@ end num_elems = rand(1:100_000) ix = array_from_host(zeros(Int32, num_elems)) v = array_from_host(rand(UInt32, num_elems)) - AK.sortperm!(ix, v) + AK.sortperm!(ix, v; prefer_threads) ixh = Array(ix) vh = Array(v) @test issorted(vh[ixh]) @@ -424,7 +424,7 @@ end num_elems = rand(1:100_000) ix = array_from_host(zeros(Int32, num_elems)) v = array_from_host(rand(Float32, num_elems)) - AK.sortperm!(ix, v) + AK.sortperm!(ix, v; prefer_threads) ixh = Array(ix) vh = Array(v) @test issorted(vh[ixh]) @@ -434,7 +434,8 @@ end ix = array_from_host(1:10_000, Int32) v = array_from_host(1:10_000, Float32) AK.sortperm!(ix, - v, + v; + prefer_threads, lt=(>), by=abs, rev=true, block_size=64, temp=array_from_host(1:10_000, Int32)) @@ -443,7 +444,8 @@ end @test issorted(vh[ixh]) v = array_from_host(1:10_000, Float32) - ix = AK.sortperm(v, + ix = AK.sortperm(v; + prefer_threads, lt=(>), by=abs, rev=true, block_size=64, temp=array_from_host(1:10_000, Int))