Skip to content

Commit 24cd95d

Browse files
Merge pull request #49 from maxfreu/deflate-upsampling
get rid of duplicate code in upsampling code by using dispatch
2 parents 06619ac + 1a1dc5c commit 24cd95d

File tree

2 files changed

+43
-107
lines changed

2 files changed

+43
-107
lines changed

ext/NNlibCUDA/Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
1111

1212
[compat]
1313
CUDA = "3.3.1"
14-
NNlib = "0.8.3"
14+
NNlib = "0.8.6"
1515
julia = "1.6"
1616

1717
[extras]

ext/NNlibCUDA/src/upsample.jl

Lines changed: 42 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242

4343
# Forward and backward pass have been tested to produce the same output
4444
# as pytorch with align_corners=True - it works modulo bit noise.
45+
# pytorch's default is align_corners=False, because otherwise the gradients depend on the
46+
# image size, which should be avoided -> this should be considered here as well
4547

4648
@inline function compute_source_index(ratio::T, dst_index, align_corners) where T
4749
if align_corners
@@ -52,11 +54,45 @@
5254
end
5355
end
5456

57+
# Generic launcher for the N-dimensional linear upsampling CUDA kernels.
# Multiple dispatch on the rank of the device arrays (CuDeviceArray{<:Any,3/4/5})
# selects the linear, bilinear or trilinear device kernel respectively.
#
# Arguments:
#   y: preallocated output array, layout (spatial dims..., channels, batch)
#   x: input array with the same layout
#   align_corners: if true, corner pixels of input and output are aligned
#     (matches pytorch's align_corners=True)
# Returns y (filled in place).
function NNlib.upsample_linear_kernel!(y::CuArray{T,N}, x::CuArray{T,N}; align_corners=true) where {T,N}
    # Total number of spatial output elements (all dims except channel & batch);
    # one CUDA thread handles one spatial position.
    out_size = prod(size(y)[1:N-2])

    # Per-dimension scale factors mapping output to input coordinates.
    # Guard the align_corners branch against a size-1 output dimension, where
    # (in-1)/(out-1) would divide by zero and poison the kernel with Inf/NaN.
    # A ratio of 0 maps every such output index to source index 0, matching
    # pytorch's area_pixel_compute_scale behavior for output_size <= 1.
    if align_corners
        ratios = ntuple(i -> size(y, i) > 1 ? T((size(x, i) - 1) / (size(y, i) - 1)) : zero(T), N - 2)
    else
        ratios = ntuple(i -> T(size(x, i) / size(y, i)), N - 2)
    end

    # Compile without launching to query a good thread count from the
    # occupancy API, then cover all elements with a 1D grid.
    kernel = @cuda launch=false upsample_linear_cuda_kernel!(out_size, ratios..., x, y, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    threads = Base.min(out_size, config.threads)
    blocks = cld(out_size, threads)
    kernel(out_size, ratios..., x, y, align_corners; threads=threads, blocks=blocks)
    return y
end
73+
74+
# Generic launcher for the backward pass of N-dimensional linear upsampling.
# Dispatches (via array rank) to the gradient device kernels.
#
# Arguments:
#   dx: preallocated gradient w.r.t. the input, layout (spatial..., C, N)
#   Δ:  gradient backpropagated from downstream layers (same layout as the
#       forward output)
#   align_corners: must match the value used in the forward pass
# Returns dx (filled in place).
function NNlib.∇upsample_linear_kernel!(dx::CuArray{T,N}, Δ::CuArray{T,N}; align_corners=true) where {T,N}
    # One CUDA thread per spatial element of the incoming gradient Δ.
    in_size = prod(size(Δ)[1:N-2])

    # Scale factors are reversed compared to the forward pass: they map
    # Δ (forward-output) coordinates back to dx (forward-input) coordinates.
    # Guard against a size-1 Δ dimension, where (dx-1)/(Δ-1) would divide by
    # zero; a ratio of 0 routes the whole gradient to source index 0, which
    # mirrors the forward mapping for size-1 outputs.
    if align_corners
        ratios = ntuple(i -> size(Δ, i) > 1 ? T((size(dx, i) - 1) / (size(Δ, i) - 1)) : zero(T), N - 2)
    else
        ratios = ntuple(i -> T(size(dx, i) / size(Δ, i)), N - 2)
    end

    # Occupancy-guided 1D launch configuration, identical to the forward pass.
    kernel = @cuda launch=false ∇upsample_linear_cuda_kernel!(in_size, ratios..., Δ, dx, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    threads = Base.min(in_size, config.threads)
    blocks = cld(in_size, threads)
    kernel(in_size, ratios..., Δ, dx, align_corners; threads=threads, blocks=blocks)
    return dx
end
90+
5591

5692
###########
5793
# linear
5894
###########
59-
function upsample_linear_wcn_kernel!(n_elem, rwidth, x, y, align_corners)
95+
function upsample_linear_cuda_kernel!(n_elem, rwidth, x::CuDeviceArray{<:Any, 3}, y::CuDeviceArray{<:Any, 3}, align_corners)
6096
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
6197

6298
if index < n_elem
@@ -86,7 +122,7 @@ function upsample_linear_wcn_kernel!(n_elem, rwidth, x, y, align_corners)
86122
end
87123

88124
# Δ is the gradient backpropagated from downstream layers
89-
function upsample_linear_wcn_kernel!(n_elem, rwidth, Δ, dx, align_corners)
125+
function upsample_linear_cuda_kernel!(n_elem, rwidth, Δ::CuDeviceArray{<:Any, 3}, dx::CuDeviceArray{<:Any, 3}, align_corners)
90126
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
91127

92128
if index < n_elem
@@ -115,44 +151,11 @@ function ∇upsample_linear_wcn_kernel!(n_elem, rwidth, Δ, dx, align_corners)
115151
return nothing
116152
end
117153

118-
function NNlib.upsample_linear_wcn!(y::CuArray{T,3}, x::CuArray{T,3}; align_corners=true) where T
119-
out_size = size(y)[1] # w
120-
121-
if align_corners
122-
ratios = ntuple(i -> T((size(x,i)-1) / (size(y,i)-1)), 1)
123-
else
124-
ratios = ntuple(i -> T(size(x,i) / size(y,i)), 1)
125-
end
126-
127-
kernel = @cuda launch=false upsample_linear_wcn_kernel!(out_size, ratios..., x, y, align_corners)
128-
config = launch_configuration(kernel.fun; max_threads=256)
129-
threads = Base.min(out_size, config.threads)
130-
blocks = cld(out_size, threads)
131-
kernel(out_size, ratios..., x, y, align_corners; threads=threads, blocks=blocks)
132-
return y
133-
end
134-
135-
function NNlib.∇upsample_linear_wcn!(dx::CuArray{T,3}, Δ::CuArray{T,3}; align_corners=true) where T
136-
in_size = size(Δ)[1]
137-
if align_corners
138-
ratios = ntuple(i -> T((size(dx,i)-1) / (size(Δ,i)-1)), 1) # reversed compared to forward pass
139-
else
140-
ratios = ntuple(i -> T(size(dx,i) / size(Δ,i)), 1)
141-
end
142-
143-
kernel = @cuda launch=false ∇upsample_linear_wcn_kernel!(in_size, ratios..., Δ, dx, align_corners)
144-
config = launch_configuration(kernel.fun; max_threads=256)
145-
threads = Base.min(in_size, config.threads)
146-
blocks = cld(in_size, threads)
147-
kernel(in_size, ratios..., Δ, dx, align_corners; threads=threads, blocks=blocks)
148-
return dx
149-
end
150-
151154

152155
###########
153156
# bilinear
154157
###########
155-
function upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, x, y, align_corners)
158+
function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, x::CuDeviceArray{<:Any, 4}, y::CuDeviceArray{<:Any, 4}, align_corners)
156159
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
157160

158161
if index < n_elem
@@ -194,7 +197,7 @@ function upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, x, y, align_cor
194197
end
195198

196199
# Δ is the gradient backpropagated from downstream layers
197-
function upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, Δ, dx, align_corners)
200+
function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, Δ::CuDeviceArray{<:Any, 4}, dx::CuDeviceArray{<:Any, 4}, align_corners)
198201
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
199202

200203
if index < n_elem
@@ -237,44 +240,11 @@ function ∇upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, Δ, dx, alig
237240
return nothing
238241
end
239242

240-
function NNlib.upsample_bilinear_whcn!(y::CuArray{T,4}, x::CuArray{T,4}; align_corners=true) where T
241-
out_size = prod(size(y)[1:2]) # w*h
242-
243-
if align_corners
244-
ratios = ntuple(i -> T((size(x,i)-1) / (size(y,i)-1)), 2)
245-
else
246-
ratios = ntuple(i -> T(size(x,i) / size(y,i)), 2)
247-
end
248-
249-
kernel = @cuda launch=false upsample_bilinear_whcn_kernel!(out_size, ratios..., x, y, align_corners)
250-
config = launch_configuration(kernel.fun; max_threads=256)
251-
threads = Base.min(out_size, config.threads)
252-
blocks = cld(out_size, threads)
253-
kernel(out_size, ratios..., x, y, align_corners; threads=threads, blocks=blocks)
254-
return y
255-
end
256-
257-
function NNlib.∇upsample_bilinear_whcn!(dx::CuArray{T,4}, Δ::CuArray{T,4}; align_corners=true) where T
258-
in_size = prod(size(Δ)[1:2])
259-
if align_corners
260-
ratios = ntuple(i -> T((size(dx,i)-1) / (size(Δ,i)-1)), 2) # reversed compared to forward pass
261-
else
262-
ratios = ntuple(i -> T(size(dx,i) / size(Δ,i)), 2)
263-
end
264-
265-
kernel = @cuda launch=false ∇upsample_bilinear_whcn_kernel!(in_size, ratios..., Δ, dx, align_corners)
266-
config = launch_configuration(kernel.fun; max_threads=256)
267-
threads = Base.min(in_size, config.threads)
268-
blocks = cld(in_size, threads)
269-
kernel(in_size, ratios..., Δ, dx, align_corners; threads=threads, blocks=blocks)
270-
return dx
271-
end
272-
273243

274244
###########
275245
# trilinear
276246
###########
277-
function upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, x, y, align_corners)
247+
function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, rdepth, x::CuDeviceArray{<:Any, 5}, y::CuDeviceArray{<:Any, 5}, align_corners)
278248
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
279249

280250
if index < n_elem
@@ -337,7 +307,7 @@ function upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, x, y,
337307
end
338308

339309
# Δ is the gradient backpropagated from downstream layers
340-
function upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, Δ, dx, align_corners)
310+
function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, rdepth, Δ::CuDeviceArray{<:Any, 5}, dx::CuDeviceArray{<:Any, 5}, align_corners)
341311
index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
342312

343313
if index < n_elem
@@ -389,37 +359,3 @@ function ∇upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, Δ
389359
end # if
390360
return nothing
391361
end
392-
393-
function NNlib.upsample_trilinear_whdcn!(y::CuArray{T,5}, x::CuArray{T,5}; align_corners=true) where T
394-
out_size = prod(size(y)[1:3]) # w*h*d
395-
396-
if align_corners
397-
ratios = ntuple(i -> T((size(x,i)-1) / (size(y,i)-1)), 3)
398-
else
399-
ratios = ntuple(i -> T(size(x,i) / size(y,i)), 3)
400-
end
401-
402-
kernel = @cuda launch=false upsample_trilinear_whdcn_kernel!(out_size, ratios..., x, y, align_corners)
403-
config = launch_configuration(kernel.fun; max_threads=256)
404-
threads = Base.min(out_size, config.threads)
405-
blocks = cld(out_size, threads)
406-
kernel(out_size, ratios..., x, y, align_corners; threads=threads, blocks=blocks)
407-
return y
408-
end
409-
410-
function NNlib.∇upsample_trilinear_whdcn!(dx::CuArray{T,5}, Δ::CuArray{T,5}; align_corners=true) where T
411-
in_size = prod(size(Δ)[1:3])
412-
413-
if align_corners
414-
ratios = ntuple(i -> T((size(dx,i)-1) / (size(Δ,i)-1)), 3) # reversed compared to forward pass
415-
else
416-
ratios = ntuple(i -> T(size(dx,i) / size(Δ,i)), 3)
417-
end
418-
419-
kernel = @cuda launch=false ∇upsample_trilinear_whdcn_kernel!(in_size, ratios..., Δ, dx, align_corners)
420-
config = launch_configuration(kernel.fun; max_threads=256)
421-
threads = Base.min(in_size, config.threads)
422-
blocks = cld(in_size, threads)
423-
kernel(in_size, ratios..., Δ, dx, align_corners; threads=threads, blocks=blocks)
424-
return dx
425-
end

0 commit comments

Comments
 (0)