@@ -9,6 +9,16 @@ function scatter_kernel!(op, dst, src, idx)
9
9
return nothing
10
10
end
11
11
12
# CUDA kernel: scatter `src` into `dst` at Cartesian destinations.
#
# One thread handles one element of `idx`: it folds `src[i]` into the
# destination slot addressed by the Cartesian index `idx[i]` via `op`.
function scatter_kernel!(op, dst, src, idx::CUDA.CuDeviceArray{<:CartesianIndex})
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x

    @inbounds if i <= length(idx)
        # Convert the Cartesian destination index to a linear index so the
        # atomic read-modify-write targets a single memory location.
        lin = Base._to_linear_index(dst, Tuple(idx[i])...)
        # Atomic update: several threads may scatter into the same slot.
        CUDA.@atomic dst[lin] = op(dst[lin], src[i])
    end
    return nothing
end
12
22
function scatter_kernel! (op, dst, src, idx, max_idx, max_dims_idx, dims_size)
13
23
index = threadIdx (). x + (blockIdx (). x - 1 ) * blockDim (). x
14
24
@@ -20,6 +30,18 @@ function scatter_kernel!(op, dst, src, idx, max_idx, max_dims_idx, dims_size)
20
30
return nothing
21
31
end
22
32
33
# CUDA kernel: scatter `src` into `dst` at Cartesian destinations, where each
# `idx` entry addresses a trailing slice of `dst` of shape `dims_size`.
#
# One thread handles one element of `src` (there are `max_idx` of them,
# `max_dims_idx == prod(dims_size)` per `idx` entry).
function scatter_kernel!(op, dst, src, idx::CUDA.CuDeviceArray{<:CartesianIndex}, max_idx, max_dims_idx, dims_size)
    index = threadIdx().x + (blockIdx().x - 1) * blockDim().x

    @inbounds if index <= max_idx
        # Decompose the flat thread index into the `idx` entry (j) and the
        # position within the leading `dims_size` block (k). `fldmod1` yields
        # 1-based components directly, matching ∇scatter_src_kernel!
        # (previously: `divrem(index - 1, max_dims_idx)` plus manual `+1`s).
        j, k = fldmod1(index, max_dims_idx)
        dims_i = CartesianIndices(dims_size)[k]
        # Linearize (leading dims..., scattered dims...) for the atomic update.
        li = Base._to_linear_index(dst, Tuple(dims_i)..., Tuple(idx[j])...)
        # Atomic: multiple threads may target the same destination slot.
        CUDA.@atomic dst[li] = op(dst[li], src[index])
    end
    return nothing
end
23
45
function NNlib. scatter! (op, dst:: AnyCuArray , src:: AnyCuArray , idx:: AnyCuArray )
24
46
dims = NNlib. scatter_dims (dst, src, idx)
25
47
args = if dims == 0
@@ -69,6 +91,25 @@ function ∇scatter_src_kernel!(op, Δsrc, src, idx, rev_idx, max_idx, T)
69
91
return nothing
70
92
end
71
93
94
# CUDA kernel: gradient of scatter w.r.t. `src` for multiplicative `op`.
#
# For each source position, multiply together every sibling value that
# scatters into the same destination, excluding the element itself, and
# combine that factor into `Δsrc` via `op`.
function ∇scatter_src_kernel!(op, Δsrc, src, idx::CUDA.CuDeviceArray{<:CartesianIndex}, rev_idx, max_idx, T)
    thread = threadIdx().x + (blockIdx().x - 1) * blockDim().x

    @inbounds if thread <= max_idx
        pos = CartesianIndices(idx)[thread]
        # All source positions aggregated into the same destination as `pos`
        # (the reverse index includes `pos` itself).
        siblings = rev_idx[Tuple(idx[pos])...]
        # Product over all siblings, then divide out this element's own value
        # so the factor excludes itself.
        acc = one(T)
        for s in siblings
            acc *= src[s]
        end
        acc /= src[pos]
        # Fold the factor into the accumulated gradient with `op`.
        Δsrc[pos] = op(Δsrc[pos], acc)
    end
    return nothing
end
72
113
function ∇scatter_src_kernel! (op, Δsrc, src, idx, rev_idx, pre_cart_idx, max_dims_idx, max_idx, T)
73
114
index = threadIdx (). x + (blockIdx (). x - 1 ) * blockDim (). x
74
115
@@ -91,6 +132,28 @@ function ∇scatter_src_kernel!(op, Δsrc, src, idx, rev_idx, pre_cart_idx, max_
91
132
return nothing
92
133
end
93
134
135
# CUDA kernel: gradient of scatter w.r.t. `src` for multiplicative `op`,
# dims variant — each `idx` entry addresses a slice over the leading
# dimensions enumerated by `pre_cart_idx` (`max_dims_idx` positions each).
function ∇scatter_src_kernel!(op, Δsrc, src, idx::CUDA.CuDeviceArray{<:CartesianIndex}, rev_idx, pre_cart_idx, max_dims_idx, max_idx, T)
    index = threadIdx().x + (blockIdx().x - 1) * blockDim().x

    @inbounds if index <= max_idx
        # Split the flat thread index into the `idx` entry (i) and the
        # position within the leading dimensions (j), both 1-based.
        i, j = fldmod1(index, max_dims_idx)
        cart_i = CartesianIndices(idx)[i]
        cart_j = pre_cart_idx[j]
        # Aggregating indices: all `idx` positions scattered into the same
        # destination, including this element's own index.
        inds = rev_idx[Tuple(idx[cart_i])...]
        # Multiply all values aggregated into the same destination (same
        # leading position `cart_j`), then divide out this element's own
        # value so the product excludes itself.
        x = one(T)
        for k in inds
            jk = Base._to_linear_index(src, Tuple(cart_j)..., Tuple(k)...)
            x *= src[jk]
        end
        x /= src[index]
        # Apply `op` to the existing gradient entry and the computed factor.
        Δsrc[index] = op(Δsrc[index], x)
    end
    return nothing
end
94
157
function NNlib. ∇scatter_src (op:: Union{typeof(*),typeof(/)} , Δ, dst,
95
158
src:: AnyCuArray{Tsrc,Nsrc} ,
96
159
idx:: AnyCuArray{Tidx,Nidx} ) where {Tsrc,Tidx,Nsrc,Nidx}
0 commit comments