Merge pull request #31 from pxl-th/master

CarloLucibello · web-flow · commit 9cc8a3c2cbb2 · 2021-11-15T21:07:32.000+01:00
Add CUDA kernels for grid sampling
diff --git a/ext/NNlibCUDA/Project.toml b/ext/NNlibCUDA/Project.toml
@@ -11,7 +11,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
 CUDA = "3.3.1"
-NNlib = "0.7.25"
+NNlib = "0.7.31"
 julia = "1.6"
 
 [extras]
diff --git a/ext/NNlibCUDA/src/NNlibCUDA.jl b/ext/NNlibCUDA/src/NNlibCUDA.jl
@@ -7,6 +7,7 @@ using Random, Statistics
 const IntOrIntTuple = Union{Integer, NTuple{N,<:Integer} where N}
 
 include("upsample.jl")
+include("sampling.jl")
 include("activations.jl")
 include("batchedmul.jl")
 include("scatter.jl")
diff --git a/ext/NNlibCUDA/src/cudnn/batchnorm.jl b/ext/NNlibCUDA/src/cudnn/batchnorm.jl
@@ -121,4 +121,4 @@ function cudnnBNBackward!(dg::DenseCuArray{T}, g::DenseCuArray{T}, db::DenseCuAr
     db .= vec(sum(dy, dims=rdims))
   end
 end
-  
+  
diff --git a/ext/NNlibCUDA/src/sampling.jl b/ext/NNlibCUDA/src/sampling.jl
@@ -0,0 +1,61 @@
+# Method of `NNlib._safe_add!` for 4-dimensional `CuDeviceArray`s: adds `value` to
+# `dx[ix, iy, c, n]` through `CUDA.@atomic`, with bounds checking disabled by `@inbounds`.
+# NOTE(review): presumably atomic because several GPU threads may accumulate into the
+# same element of `dx` during the backward pass — confirm against the NNlib kernel.
+@inline function NNlib._safe_add!(dx::CuDeviceArray{T, 4}, value, ix, iy, c, n) where T
+    @inbounds CUDA.@atomic dx[ix, iy, c, n] += value
+end
+
+"""
+    grid_sample_kernel!(n_elem, output, input, grid, padding_mode)
+
+CUDA kernel body for the forward pass of grid sampling.
+
+Each thread derives a zero-based global index from its thread and block position.
+Threads whose index is not below `n_elem` do nothing. Every other thread splits the
+index into one-based `(w, h, n)` coordinates, using the second and third dimensions
+of `grid` as the extents of `w` (fastest varying) and `h`, and forwards them together
+with the first three dimensions of `input` to `NNlib._grid_sample_kernel!`.
+
+Always returns `nothing`.
+"""
+function grid_sample_kernel!(n_elem, output, input, grid, padding_mode)
+    tid = (blockIdx().x - 1) * blockDim().x + (threadIdx().x - 1)
+    # Threads at or beyond `n_elem` have no grid location to process.
+    tid < n_elem || return nothing
+
+    in_w, in_h, in_c, _ = size(input)
+    _, grid_w, grid_h, _ = size(grid)
+
+    # Unflatten the linear index: `w` varies fastest, then `h`, then `n`.
+    w = rem(tid, grid_w) + 1
+    h = rem(div(tid, grid_w), grid_h) + 1
+    n = div(tid, grid_w * grid_h) + 1
+    NNlib._grid_sample_kernel!(output, input, grid, padding_mode, w, h, n, in_w, in_h, in_c)
+    return nothing
+end
+
+"""
+    ∇grid_sample_kernel!(n_elem, dx, dgrid, Δ, input, grid, padding_mode)
+
+CUDA kernel body for the backward pass of grid sampling.
+
+Each thread derives a zero-based global index from its thread and block position.
+Threads whose index is not below `n_elem` do nothing. Every other thread splits the
+index into one-based `(w, h, n)` coordinates, using the second and third dimensions
+of `grid` as the extents of `w` (fastest varying) and `h`, and forwards them together
+with `dx`, `dgrid`, `Δ` and the first three dimensions of `input` to
+`NNlib._∇grid_sample_kernel!`.
+
+Always returns `nothing`.
+"""
+function ∇grid_sample_kernel!(n_elem, dx, dgrid, Δ, input, grid, padding_mode)
+    tid = (blockIdx().x - 1) * blockDim().x + (threadIdx().x - 1)
+    # Threads at or beyond `n_elem` have no grid location to process.
+    tid < n_elem || return nothing
+
+    in_w, in_h, in_c, _ = size(input)
+    _, grid_w, grid_h, _ = size(grid)
+
+    # Unflatten the linear index: `w` varies fastest, then `h`, then `n`.
+    w = rem(tid, grid_w) + 1
+    h = rem(div(tid, grid_w), grid_h) + 1
+    n = div(tid, grid_w * grid_h) + 1
+    NNlib._∇grid_sample_kernel!(
+        dx, dgrid, Δ, input, grid, padding_mode, w, h, n, in_w, in_h, in_c)
+    return nothing
+end
+
+"""
+    NNlib.grid_sample(x::CuArray{T, 4}, grid::CuArray{V, 4}; padding_mode = :zeros)
+
+GPU method of `NNlib.grid_sample`: launches `grid_sample_kernel!` with one thread per
+`(w, h, n)` location of `grid`.
+
+# Arguments
+- `x`: 4-dimensional input; its third and fourth dimensions (`xC`, `xN`) become the
+  third and fourth dimensions of the result.
+- `grid`: 4-dimensional array; its second and third dimensions (`gW`, `gH`) become
+  the first and second dimensions of the result.
+
+# Keywords
+- `padding_mode`: wrapped in `Val` and passed on to the kernel. Defaults to `:zeros`.
+
+# Returns
+A new array obtained from `similar(x, T, (gW, gH, xC, xN))`. When `gW * gH * xN` is
+zero the (necessarily empty) array is returned without launching a kernel.
+"""
+function NNlib.grid_sample(x::CuArray{T, 4}, grid::CuArray{V, 4}; padding_mode = :zeros) where {T, V}
+    pad = Val(padding_mode)
+    _, _, xC, xN = size(x)
+    _, gW, gH, _ = size(grid)
+    n_elem = gW * gH * xN
+    y = similar(x, T, (gW, gH, xC, xN))
+    # With nothing to sample the launch below would request zero threads and
+    # `cld(n_elem, 0)` would throw a `DivideError`; `y` is empty in that case.
+    n_elem == 0 && return y
+
+    kernel = @cuda launch=false grid_sample_kernel!(n_elem, y, x, grid, pad)
+    config = launch_configuration(kernel.fun; max_threads=256)
+    # Never ask for more threads per block than there are elements to process.
+    threads = min(n_elem, config.threads)
+    blocks = cld(n_elem, threads)
+    kernel(n_elem, y, x, grid, pad; threads=threads, blocks=blocks)
+    y
+end
+
+"""
+    NNlib.∇grid_sample(Δ::CuArray{T, 4}, x::CuArray{T, 4}, grid::CuArray{V, 4}; padding_mode = :zeros)
+
+GPU method of `NNlib.∇grid_sample`: launches `∇grid_sample_kernel!` with one thread
+per `(w, h, n)` location of `grid`.
+
+# Arguments
+- `Δ`: 4-dimensional array passed to the kernel as the incoming gradient.
+- `x`: 4-dimensional input; `size(x, 4)` is used as the batch count.
+- `grid`: 4-dimensional array; its second and third dimensions (`gW`, `gH`) give the
+  number of locations per batch element.
+
+# Keywords
+- `padding_mode`: wrapped in `Val` and passed on to the kernel. Defaults to `:zeros`.
+
+# Returns
+A tuple `(dx, dgrid)`. `dx` is created with `CUDA.zeros(T, size(x))` and `dgrid` with
+`similar(grid)` before both are handed to the kernel. When `gW * gH * size(x, 4)` is
+zero no kernel is launched and `dgrid` is zero-filled instead.
+"""
+function NNlib.∇grid_sample(Δ::CuArray{T, 4}, x::CuArray{T, 4}, grid::CuArray{V, 4}; padding_mode = :zeros) where {T, V}
+    pad = Val(padding_mode)
+    xN = size(x, 4)
+    _, gW, gH, _ = size(grid)
+    n_elem = gW * gH * xN
+    dx, dgrid = CUDA.zeros(T, size(x)), similar(grid)
+    if n_elem == 0
+        # With nothing to differentiate the launch below would request zero threads
+        # and `cld(n_elem, 0)` would throw a `DivideError`. `dgrid` comes from
+        # `similar`, so zero it rather than return uninitialized memory in case
+        # `grid` itself is not empty.
+        return dx, fill!(dgrid, zero(V))
+    end
+
+    kernel = @cuda launch=false ∇grid_sample_kernel!(n_elem, dx, dgrid, Δ, x, grid, pad)
+    config = launch_configuration(kernel.fun; max_threads=256)
+    # Never ask for more threads per block than there are elements to process.
+    threads = min(n_elem, config.threads)
+    blocks = cld(n_elem, threads)
+    kernel(n_elem, dx, dgrid, Δ, x, grid, pad; threads=threads, blocks=blocks)
+    dx, dgrid
+end
diff --git a/ext/NNlibCUDA/test/runtests.jl b/ext/NNlibCUDA/test/runtests.jl
@@ -18,4 +18,5 @@ include("softmax.jl")
 include("batchnorm.jl")
 include("scatter.jl")
 include("gather.jl")
+include("sampling.jl")
 end
diff --git a/ext/NNlibCUDA/test/sampling.jl b/ext/NNlibCUDA/test/sampling.jl
@@ -0,0 +1,53 @@
+# Checks the GPU forward and backward grid sampling against hand-written expected
+# values, for both element types and both padding modes.
+@testset "Grid Sampling" begin
+    for T in (Float32, Float64)
+        # 2×2×1×1 input filled with ones.
+        x = ones(T, (2, 2, 1, 1))
+        # 2×2×2×1 grid: the coordinate pairs (-1, -1), (1, -1), (-1, 1), (1, 1)
+        # stored along the first dimension.
+        grid = reshape(T[-1, -1, 1, -1, -1, 1, 1, 1], 2, 2, 2, 1)
+
+        # Expected gradient with respect to `grid` for each padding mode.
+        ∇grid_zeros = reshape(T[0, 0, -0.5, 0, 0, -0.5, -0.5, -0.5], 2, 2, 2, 1)
+        ∇grid_border = zeros(T, size(grid))
+        expectations = ((:zeros, ∇grid_zeros), (:border, ∇grid_border))
+
+        x_gpu, grid_gpu = CuArray(x), CuArray(grid)
+
+        for (padding_mode, ∇grid_true) in expectations
+            sampled = grid_sample(x_gpu, grid_gpu; padding_mode=padding_mode)
+            @test x == collect(sampled)
+            @test eltype(sampled) == T
+
+            # Upstream gradient of ones with the shape of the sampled output.
+            external_grad = CUDA.ones(T, size(sampled))
+            ∇input, ∇grid = ∇grid_sample(
+                external_grad, x_gpu, grid_gpu; padding_mode=padding_mode)
+            @test x == collect(∇input)
+            @test ∇grid_true == collect(∇grid)
+            @test eltype(∇input) == T
+            @test eltype(∇grid) == T
+        end
+    end
+end
+
+# Runs `grid_sample` through `gputest` on a random input and a regular grid, once per
+# padding mode.
+@testset "Compare grid sampling with NNlib" begin
+    width, height, channels, batch = 16, 16, 2, 4
+    input = rand(Float64, width, height, channels, batch)
+    grid = zeros(Float64, 2, width, height, batch)
+    for b in 1:batch, row in 1:height, col in 1:width
+        # First coordinate follows the column index (plus a 0.01 offset), second
+        # coordinate follows the row index.
+        grid[1, col, row, b] = (col / width) * 2.0 - 1.0 + 0.01
+        grid[2, col, row, b] = (row / height) * 2.0 - 1.0
+    end
+    for padding_mode in (:zeros, :border)
+        gputest(grid_sample, input, grid; atol=1e-6, padding_mode=padding_mode)
+    end
+end
diff --git a/ext/NNlibCUDA/test/test_utils.jl b/ext/NNlibCUDA/test/test_utils.jl
@@ -5,10 +5,10 @@ function gputest(f, xs...; checkgrad=true, atol=1e-10, kws...)
     cpu_out = f(cpu_in...; kws...)
     gpu_out = f(gpu_in...; kws...)
     @test collect(cpu_out) ≈ collect(gpu_out)
-    
+
     if checkgrad
-        cpu_grad = gradient((x...) -> sum(f(x...)), cpu_in...)
-        gpu_grad = gradient((x...) -> sum(f(x...)), gpu_in...)
+        cpu_grad = gradient((x...) -> sum(f(x...; kws...)), cpu_in...)
+        gpu_grad = gradient((x...) -> sum(f(x...; kws...)), gpu_in...)
         for (cpu_g, gpu_g) in zip(cpu_grad, gpu_grad)
             if cpu_g === nothing
                 @test gpu_g === nothing

-Original file line number
+Diff line change
     db .= vec(sum(dy, dims=rdims))
   end
 end
+-
++