diff --git a/Project.toml b/Project.toml
index 0051650a..c7e78589 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,8 +3,10 @@ uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 version = "11.2.3"
 
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+GPUToolbox = "096a3bc2-3ced-46d0-87f4-dd12716f4bfc"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -22,8 +24,10 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 JLD2Ext = "JLD2"
 
 [compat]
+AcceleratedKernels = "0.4"
 Adapt = "4.0"
 GPUArraysCore = "= 0.2.0"
+GPUToolbox = "0.2, 0.3"
 JLD2 = "0.4, 0.5"
 KernelAbstractions = "0.9.28"
 LLVM = "3.9, 4, 5, 6, 7, 8, 9"
diff --git a/src/GPUArrays.jl b/src/GPUArrays.jl
index 8c1fc14e..ad4cad6d 100644
--- a/src/GPUArrays.jl
+++ b/src/GPUArrays.jl
@@ -1,5 +1,6 @@
 module GPUArrays
 
+using GPUToolbox
 using KernelAbstractions
 using Serialization
 using Random
@@ -16,6 +17,7 @@ using Reexport
 @reexport using GPUArraysCore
 
 using KernelAbstractions
+import AcceleratedKernels as AK
 
 # device functionality
 include("device/abstractarray.jl")
@@ -26,6 +28,7 @@ include("host/construction.jl")
 ## integrations and specialized methods
 include("host/base.jl")
 include("host/indexing.jl")
+include("host/reverse.jl")
 include("host/broadcast.jl")
 include("host/mapreduce.jl")
 include("host/linalg.jl")
diff --git a/src/host/reverse.jl b/src/host/reverse.jl
new file mode 100644
index 00000000..f0b1caf7
--- /dev/null
+++ b/src/host/reverse.jl
@@ -0,0 +1,148 @@
+# reversing
+
+# the kernel works by treating the array as 1d. after reversing by dimension x an element at
+# pos [i1, i2, i3, ... , i{x}, ..., i{n}] will be at
+# pos [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}] where d{x} is the size of dimension x
+
+# out-of-place version, copying a single value per thread from input to output
+function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
+                  dims=1:ndims(input)) where {T, N}
+    @assert size(input) == size(output)
+    rev_dims = ntuple((d)-> d in dims && size(input, d) > 1, N)
+    ref = size(input) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(input)
+    # converts a linear index in a reduced array to an ND-index, but using the reduced size
+    nd_idx = CartesianIndices(input)
+
+    ## COV_EXCL_START
+    @kernel unsafe_indices=true function kernel(input, output)
+        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
+        index_in = offset_in + @index(Local, Linear)
+
+        @inbounds if index_in <= length(input)
+            idx = Tuple(nd_idx[index_in])
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+            output[index_out] = input[index_in]
+        end
+    end
+    ## COV_EXCL_STOP
+
+    nthreads = 256
+
+    kernel(get_backend(input))(input, output; workgroupsize=nthreads, ndrange=length(input))
+end
+
+# in-place version, swapping elements on half the number of threads
+function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
+    rev_dims = ntuple((d)-> d in dims && size(data, d) > 1, N)
+    half_dim = findlast(rev_dims)
+    if isnothing(half_dim)
+        # no reverse operation needed at all in this case.
+        return
+    end
+    ref = size(data) .+ 1
+    # converts an ND-index in the data array to the linear index
+    lin_idx = LinearIndices(data)
+    reduced_size = ntuple((d)->ifelse(d==half_dim, cld(size(data,d),2), size(data,d)), N)
+    reduced_length = prod(reduced_size)
+    # converts a linear index in a reduced array to an ND-index, but using the reduced size
+    nd_idx = CartesianIndices(reduced_size)
+
+    ## COV_EXCL_START
+    @kernel unsafe_indices=true function kernel(data)
+        offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
+        index_in = offset_in + @index(Local, Linear)
+
+        @inbounds if index_in <= reduced_length
+            idx = Tuple(nd_idx[index_in])
+            index_in = lin_idx[idx...]
+            idx = ifelse.(rev_dims, ref .- idx, idx)
+            index_out = lin_idx[idx...]
+
+            if index_in < index_out
+                temp = data[index_out]
+                data[index_out] = data[index_in]
+                data[index_in] = temp
+            end
+        end
+    end
+    ## COV_EXCL_STOP
+
+    # NOTE: we launch slightly more than half the number of elements in the array as threads.
+    # The last non-singleton dimension along which to reverse is used to define how the array is split.
+    # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by
+    # ignoring the threads that cross the mid-point
+
+    nthreads = 256
+
+    kernel(get_backend(data))(data; workgroupsize=nthreads, ndrange=length(data))
+end
+
+
+# n-dimensional API
+
+function Base.reverse!(data::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(data)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not an iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(data))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))"))
+    end
+
+    _reverse!(data; dims=dims)
+
+    return data
+end
+
+# out-of-place
+function Base.reverse(input::AnyGPUArray{T, N}; dims=:) where {T, N}
+    if isa(dims, Colon)
+        dims = 1:ndims(input)
+    end
+    if !applicable(iterate, dims)
+        throw(ArgumentError("dimension $dims is not an iterable"))
+    end
+    if !all(1 .≤ dims .≤ ndims(input))
+        throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))"))
+    end
+
+    if all(size(input)[[dims...]].==1)
+        # no reverse operation needed at all in this case.
+        return copy(input)
+    else
+        output = similar(input)
+        _reverse(input, output; dims=dims)
+        return output
+    end
+end
+
+
+# 1-dimensional API
+
+# in-place
+Base.@propagate_inbounds function Base.reverse!(data::AnyGPUVector{T}, start::Integer,
+                                                stop::Integer=length(data)) where {T}
+    _reverse!(view(data, start:stop))
+    return data
+end
+
+Base.reverse!(data::AnyGPUVector{T}) where {T} = @inbounds reverse!(data, 1, length(data))
+
+# out-of-place
+Base.@propagate_inbounds function Base.reverse(input::AnyGPUVector{T}, start::Integer,
+                                               stop::Integer=length(input)) where {T}
+    output = similar(input)
+
+    start > 1 && copyto!(output, 1, input, 1, start-1)
+    _reverse(view(input, start:stop), view(output, start:stop))
+    stop < length(input) && copyto!(output, stop+1, input, stop+1)
+
+    return output
+end
+
+Base.reverse(data::AnyGPUVector{T}) where {T} = @inbounds reverse(data, 1, length(data))
diff --git a/test/testsuite/base.jl b/test/testsuite/base.jl
index 6bcfd5b4..55436126 100644
--- a/test/testsuite/base.jl
+++ b/test/testsuite/base.jl
@@ -381,6 +381,51 @@ end
         gA = reshape(AT(A),4)
     end
 
+    @testset "reverse" begin
+        # 1-d out-of-place
+        @test compare(x->reverse(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse(x, 10, 90), AT, rand(Float32, 1000))
+
+        # 1-d in-place
+        @test compare(x->reverse!(x), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 10), AT, rand(Float32, 1000))
+        @test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))
+
+        # n-d out-of-place
+        for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]),
+            dim in 1:length(shape)
+            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+            cpu = rand(Float32, shape...)
+            gpu = AT(cpu)
+            reverse!(gpu; dims=dim)
+            @test Array(gpu) == reverse(cpu; dims=dim)
+        end
+
+        # supports multidimensional reverse
+        for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]),
+            dim in ((1,2),(2,3),(1,3),:)
+            @test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))
+
+            cpu = rand(Float32, shape...)
+            gpu = AT(cpu)
+            reverse!(gpu; dims=dim)
+            @test Array(gpu) == reverse(cpu; dims=dim)
+        end
+
+        # wrapped array
+        @test compare(x->reverse(x), AT, reshape(rand(Float32, 2,2), 4))
+
+        # error throwing
+        cpu = rand(Float32, 1,2,3,4)
+        gpu = AT(cpu)
+        @test_throws ArgumentError reverse!(gpu, dims=5)
+        @test_throws ArgumentError reverse!(gpu, dims=0)
+        @test_throws ArgumentError reverse(gpu, dims=5)
+        @test_throws ArgumentError reverse(gpu, dims=0)
+    end
+
     @testset "reinterpret" begin
         A = Int32[-1,-2,-3]
         dA = AT(A)
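
The header comments of src/host/reverse.jl describe the out-of-place kernel purely as an index remapping: an element at Cartesian position (i1, ..., ix, ..., in) moves to (i1, ..., dx - ix + 1, ..., in) for every reversed dimension x. As a rough illustration, the same remapping can be exercised serially on the CPU; the sketch below is not part of the diff, and the name cpu_reverse_dims is made up for this example.

# serial CPU sketch of the out-of-place index remapping used by _reverse
function cpu_reverse_dims(input::AbstractArray{T,N}; dims=1:ndims(input)) where {T,N}
    output = similar(input)
    rev_dims = ntuple(d -> d in dims && size(input, d) > 1, N)  # which dims actually flip
    ref = size(input) .+ 1
    lin_idx = LinearIndices(input)
    nd_idx = CartesianIndices(input)
    for index_in in eachindex(input)              # one "thread" per element
        idx = Tuple(nd_idx[index_in])             # linear -> Cartesian index
        idx = ifelse.(rev_dims, ref .- idx, idx)  # mirror the reversed dimensions
        output[lin_idx[idx...]] = input[index_in] # scatter to the mirrored position
    end
    return output
end

A = reshape(collect(1:24), 2, 3, 4)
@assert cpu_reverse_dims(A; dims=(1, 3)) == reverse(reverse(A; dims=1); dims=3)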
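
The in-place _reverse! kernel instead launches roughly half as many work-items as elements: the last reversed non-singleton dimension is halved, each work-item swaps one element with its mirrored partner, and the index_in < index_out guard skips work-items that would hit or cross the midpoint of an odd-sized dimension. A minimal CPU sketch of that strategy, again with an illustrative name and the same index arithmetic as above:

# serial CPU sketch of the in-place "half the threads" swap strategy used by _reverse!
function cpu_reverse_dims!(data::AbstractArray{T,N}; dims=1:ndims(data)) where {T,N}
    rev_dims = ntuple(d -> d in dims && size(data, d) > 1, N)
    half_dim = findlast(rev_dims)
    isnothing(half_dim) && return data            # nothing to reverse
    ref = size(data) .+ 1
    lin_idx = LinearIndices(data)
    # halve the last reversed dimension; only this reduced index space gets "threads"
    reduced_size = ntuple(d -> d == half_dim ? cld(size(data, d), 2) : size(data, d), N)
    for I in CartesianIndices(reduced_size)
        idx = Tuple(I)
        index_in = lin_idx[idx...]
        idx = ifelse.(rev_dims, ref .- idx, idx)
        index_out = lin_idx[idx...]
        if index_in < index_out                   # skip the midpoint and double swaps
            data[index_in], data[index_out] = data[index_out], data[index_in]
        end
    end
    return data
end

B = collect(reshape(1.0:30.0, 5, 6))
@assert cpu_reverse_dims!(copy(B); dims=1) == reverse(B; dims=1)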