Commit a42c7e3

Merge pull request #59 from nikopj/nikopj-fold
added fold/unfold and gpu tests
2 parents d6e8939 + cd6b61f commit a42c7e3

File tree

5 files changed: 150 additions & 1 deletion

ext/NNlibCUDA/Project.toml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 [compat]
 Adapt = "3.3"
 CUDA = "3.11"
-NNlib = "0.8.9"
+NNlib = "0.8.11"
 julia = "1.6"

 [extras]

ext/NNlibCUDA/src/NNlibCUDA.jl

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ include("activations.jl")
 include("batchedadjtrans.jl")
 include("batchedmul.jl")
 include("ctc.jl")
+include("fold.jl")
 include("scatter.jl")
 include("gather.jl")
 include("utils.jl")

ext/NNlibCUDA/src/fold.jl

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+
+function unfold_kernel!(col::AbstractArray{T}, x, col_size, input_size, output_size, kernel_size, flipkernel, stride, pad_lo, dilation, max_idx) where {T}
+    index = threadIdx().x + (blockIdx().x - 1) * blockDim().x
+
+    @inbounds if index <= max_idx
+        i, kw, kh, kd, c, b = CartesianIndices(col_size)[index].I # col indices
+        w, h, d = CartesianIndices(output_size)[i].I # x indices
+
+        # project
+        w, h, d = @. ((w, h, d) - 1)*stride - pad_lo + 1 + ((kw, kh, kd) - 1)*dilation
+
+        if !flipkernel
+            kw, kh, kd = kernel_size .- (kw, kh, kd) .+ 1
+        end
+
+        # check out of bounds
+        if !all(checkindex.(Bool, UnitRange.(1, input_size), (w, h, d)))
+            col[i, kw, kh, kd, c, b] = T(0)
+            return nothing
+        end
+
+        xval::T = x[w, h, d, c, b]
+        col[i, kw, kh, kd, c, b] = xval
+    end
+
+    return nothing
+end
+
+function fold_kernel!(x::AbstractArray{T}, col, col_size, input_size, output_size, kernel_size, flipkernel, stride, pad_lo, dilation, max_idx) where {T}
+    index = threadIdx().x + (blockIdx().x - 1) * blockDim().x
+
+    @inbounds if index <= max_idx
+        i, kw, kh, kd, c, b = CartesianIndices(col_size)[index].I # col indices
+        w, h, d = CartesianIndices(output_size)[i].I # x indices
+
+        # project
+        w, h, d = @. ((w, h, d) - 1)*stride - pad_lo + 1 + ((kw, kh, kd) - 1)*dilation
+
+        # check out of bounds
+        if !all(checkindex.(Bool, UnitRange.(1, input_size), (w, h, d)))
+            return nothing
+        end
+
+        if !flipkernel
+            kw, kh, kd = kernel_size .- (kw, kh, kd) .+ 1
+        end
+
+        cval::T = col[i, kw, kh, kd, c, b]
+        CUDA.@atomic x[w, h, d, c, b] += cval
+    end
+
+    return nothing
+end
+
+function NNlib.unfold!(col::AnyCuArray{cT,3}, x::AnyCuArray{xT,5}, cdims::NNlib.DenseConvDims) where {cT, xT}
+    if NNlib.spatial_dims(cdims) != 3
+        throw(DimensionMismatch("unfold!() only accepts 3d convolutional inputs"))
+    end
+
+    input_size = NNlib.input_size(cdims)
+    C_in = NNlib.channels_in(cdims)
+    kernel_size = NNlib.kernel_size(cdims)
+    pad_w_lo, pad_w_hi, pad_h_lo, pad_h_hi, pad_d_lo, pad_d_hi = NNlib.padding(cdims)
+    pad_lo = (pad_w_lo, pad_h_lo, pad_d_lo)
+    dilation = NNlib.dilation(cdims)
+    stride = NNlib.stride(cdims)
+    output_size = NNlib.output_size(cdims)
+    flipkernel = NNlib.flipkernel(cdims)
+
+    col_reshaped = reshape(col, (prod(output_size), kernel_size..., C_in, :))
+
+    max_idx = prod(size(col))
+    args = col_reshaped, x, size(col_reshaped), input_size, output_size, kernel_size, flipkernel, stride, pad_lo, dilation, max_idx
+    kernel = @cuda launch=false unfold_kernel!(args...)
+    config = launch_configuration(kernel.fun; max_threads=256)
+    threads = min(max_idx, config.threads)
+    blocks = cld(max_idx, threads)
+    kernel(args...; threads=threads, blocks=blocks)
+    return col
+end
+
+function NNlib.fold!(x::AnyCuArray{xT,5}, col::AnyCuArray{cT,3}, cdims::NNlib.DenseConvDims) where {xT, cT}
+    if NNlib.spatial_dims(cdims) != 3
+        throw(DimensionMismatch("fold!() only accepts 3d convolutional inputs"))
+    end
+
+    # going to accumulate into x
+    fill!(x, xT(0))
+
+    input_size = NNlib.input_size(cdims)
+    C_in = NNlib.channels_in(cdims)
+    kernel_size = NNlib.kernel_size(cdims)
+    pad_w_lo, pad_w_hi, pad_h_lo, pad_h_hi, pad_d_lo, pad_d_hi = NNlib.padding(cdims)
+    pad_lo = (pad_w_lo, pad_h_lo, pad_d_lo)
+    dilation = NNlib.dilation(cdims)
+    stride = NNlib.stride(cdims)
+    output_size = NNlib.output_size(cdims)
+    flipkernel = NNlib.flipkernel(cdims)
+
+    col_reshaped = reshape(col, (prod(output_size), kernel_size..., C_in, :))
+
+    max_idx = prod(size(col))
+    args = x, col_reshaped, size(col_reshaped), input_size, output_size, kernel_size, flipkernel, stride, pad_lo, dilation, max_idx
+    kernel = @cuda launch=false fold_kernel!(args...)
+    config = launch_configuration(kernel.fun; max_threads=256)
+    threads = min(max_idx, config.threads)
+    blocks = cld(max_idx, threads)
+    kernel(args...; threads=threads, blocks=blocks)
+    return x
+end
+
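For orientation (an editorial aside, not part of the commit): a minimal usage sketch of the path these methods enable. It assumes CUDA.jl, NNlib, and NNlibCUDA are loaded and a CUDA device is available; the sizes are illustrative and mirror the tests below.

using CUDA, NNlib, NNlibCUDA

# A 3d example: 5d input (W, H, D, C_in, batch) and 5d kernel (kw, kh, kd, C_in, C_out).
x = CUDA.rand(Float32, 8, 8, 8, 3, 1)
w = CUDA.rand(Float32, 2, 2, 2, 3, 4)
cdims = DenseConvDims(x, w)

# unfold gathers sliding kernel windows into a 3d "col" array of size
# (prod(output_size), prod(kernel_size) * C_in, batch); for CuArray inputs
# it ends up in the unfold_kernel! launch above.
col = NNlib.unfold(x, cdims)

# fold scatters the patches back into an array of size(x), accumulating
# overlapping contributions (the CUDA.@atomic add in fold_kernel!).
x2 = NNlib.fold(col, size(x), cdims)

Note the launch strategy: one thread per element of col, with the block size chosen by launch_configuration(kernel.fun; max_threads=256).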

ext/NNlibCUDA/test/fold.jl

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+
+@testset "fold" begin
+    # Test for agreement between CPU/GPU versions, across a variety of kwargs
+    options = Dict{Any, Any}.((
+        (), (:dilation => 2), (:flipkernel => true), (:stride => 2),
+        (:padding => 1),
+        (:padding => (1,0)),
+        (:padding => (0,1)),
+        (:padding => (2,3)),
+    ))
+
+    C_in = 3
+    C_out = 4
+    batch_size = 1
+
+    @testset "spatial_rank=$spatial_rank" for spatial_rank in (1, 2, 3)
+        for opts in options
+            if :padding in keys(opts)
+                padding = opts[:padding]
+                if 1 < length(padding) && length(padding) != 2spatial_rank
+                    opts[:padding] = ntuple(i -> padding[mod1(i,2)] .+ 2div(i-1,2), 2spatial_rank)
+                end
+            end
+
+            x = rand(Float64, fill(8, spatial_rank)..., C_in, batch_size)
+            w = rand(Float64, fill(2, spatial_rank)..., C_in, C_out)
+            cdims = DenseConvDims(x, w; opts...)
+            y = NNlib.unfold(x, cdims)
+
+            # test equivalence of fold/unfold across GPU/CPU
+            gputest(x -> NNlib.unfold(x, cdims), x)
+            gputest(y -> NNlib.fold(y, size(x), cdims), y)
+        end
+    end
+end
+
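One note on the padding handling above (an aside, not part of the diff): when a length-2 padding option meets spatial_rank > 1, the test expands it to one (lo, hi) pair per spatial dimension, offsetting later dimensions by 2, presumably so each dimension gets a distinct pad. Evaluated by hand for one case:

# Expansion used in the test, written out for padding = (1, 0), spatial_rank = 2.
padding = (1, 0)
spatial_rank = 2
ntuple(i -> padding[mod1(i, 2)] .+ 2div(i - 1, 2), 2spatial_rank)
# -> (1, 0, 3, 2), i.e. per-dimension (lo, hi) pairs as destructured in fold.jl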

ext/NNlibCUDA/test/runtests.jl

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ include("batchedmul.jl")
 include("upsample.jl")
 include("conv.jl")
 include("ctc.jl")
+include("fold.jl")
 include("pooling.jl")
 include("softmax.jl")
 include("batchnorm.jl")
