|
42 | 42 |
|
43 | 43 | # Forward and backward pass have been tested to produce the same output
|
44 | 44 | # as pytorch with align_corners=True - it works modulo bit noise.
|
| 45 | +# PyTorch's default is align_corners=False, because with align_corners=True the |
| 46 | +# gradients depend on the image size, which is usually undesirable — consider matching that default here. |
45 | 47 |
|
46 | 48 | @inline function compute_source_index(ratio::T, dst_index, align_corners) where T
|
47 | 49 | if align_corners
|
|
52 | 54 | end
|
53 | 55 | end
|
54 | 56 |
|
function NNlib.upsample_linear_kernel!(y::CuArray{T,N}, x::CuArray{T,N}; align_corners=true) where {T,N}
    # Upsample `x` into the pre-allocated `y` by (multi-)linear interpolation.
    # Layout is assumed to be (spatial dims..., channel, batch): the first N-2
    # dims are spatial, so one GPU thread handles one spatial output position.
    out_size = prod(size(y)[1:N-2])

    # Per-dimension source/destination scale factors, matching the convention
    # consumed by `compute_source_index`. With align_corners the corner pixels
    # of input and output are mapped onto each other, so the scale is
    # (in-1)/(out-1); guard against out == 1, where that denominator is zero
    # and would produce Inf/NaN indices — PyTorch uses a scale of 0 in that
    # case (every output sample reads source index 0).
    if align_corners
        ratios = ntuple(N - 2) do i
            size(y, i) > 1 ? T((size(x, i) - 1) / (size(y, i) - 1)) : zero(T)
        end
    else
        ratios = ntuple(i -> T(size(x, i) / size(y, i)), N - 2)
    end

    # Compile without launching, then size the launch from the occupancy API.
    kernel = @cuda launch=false upsample_linear_cuda_kernel!(out_size, ratios..., x, y, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    threads = Base.min(out_size, config.threads)
    blocks = cld(out_size, threads)
    kernel(out_size, ratios..., x, y, align_corners; threads=threads, blocks=blocks)
    return y
end
| 73 | + |
function NNlib.∇upsample_linear_kernel!(dx::CuArray{T,N}, Δ::CuArray{T,N}; align_corners=true) where {T,N}
    # Backward pass of linear upsampling: scatter the incoming gradient `Δ`
    # into the pre-allocated input-gradient buffer `dx`. One GPU thread per
    # spatial element of `Δ` (first N-2 dims; trailing dims are channel/batch).
    in_size = prod(size(Δ)[1:N-2])

    # Scale factors are reversed compared to the forward pass (dx plays the
    # role of the source grid). Same align_corners convention as the forward
    # kernel, including the guard for a size-1 gradient dimension, where
    # (out-1) would be zero and the ratio Inf/NaN — use 0 instead, as
    # PyTorch does.
    if align_corners
        ratios = ntuple(N - 2) do i
            size(Δ, i) > 1 ? T((size(dx, i) - 1) / (size(Δ, i) - 1)) : zero(T)
        end
    else
        ratios = ntuple(i -> T(size(dx, i) / size(Δ, i)), N - 2)
    end

    # Compile without launching, then size the launch from the occupancy API.
    kernel = @cuda launch=false ∇upsample_linear_cuda_kernel!(in_size, ratios..., Δ, dx, align_corners)
    config = launch_configuration(kernel.fun; max_threads=256)
    threads = Base.min(in_size, config.threads)
    blocks = cld(in_size, threads)
    kernel(in_size, ratios..., Δ, dx, align_corners; threads=threads, blocks=blocks)
    return dx
end
| 90 | + |
55 | 91 |
|
56 | 92 | ###########
|
57 | 93 | # linear
|
58 | 94 | ###########
|
59 |
| -function upsample_linear_wcn_kernel!(n_elem, rwidth, x, y, align_corners) |
| 95 | +function upsample_linear_cuda_kernel!(n_elem, rwidth, x::CuDeviceArray{<:Any, 3}, y::CuDeviceArray{<:Any, 3}, align_corners) |
60 | 96 | index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
|
61 | 97 |
|
62 | 98 | if index < n_elem
|
@@ -86,7 +122,7 @@ function upsample_linear_wcn_kernel!(n_elem, rwidth, x, y, align_corners)
|
86 | 122 | end
|
87 | 123 |
|
88 | 124 | # Δ is the gradient backpropagated from downstream layers
|
89 |
| -function ∇upsample_linear_wcn_kernel!(n_elem, rwidth, Δ, dx, align_corners) |
| 125 | +function ∇upsample_linear_cuda_kernel!(n_elem, rwidth, Δ::CuDeviceArray{<:Any, 3}, dx::CuDeviceArray{<:Any, 3}, align_corners) |
90 | 126 | index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
|
91 | 127 |
|
92 | 128 | if index < n_elem
|
@@ -115,44 +151,11 @@ function ∇upsample_linear_wcn_kernel!(n_elem, rwidth, Δ, dx, align_corners)
|
115 | 151 | return nothing
|
116 | 152 | end
|
117 | 153 |
|
118 |
| -function NNlib.upsample_linear_wcn!(y::CuArray{T,3}, x::CuArray{T,3}; align_corners=true) where T |
119 |
| - out_size = size(y)[1] # w |
120 |
| - |
121 |
| - if align_corners |
122 |
| - ratios = ntuple(i -> T((size(x,i)-1) / (size(y,i)-1)), 1) |
123 |
| - else |
124 |
| - ratios = ntuple(i -> T(size(x,i) / size(y,i)), 1) |
125 |
| - end |
126 |
| - |
127 |
| - kernel = @cuda launch=false upsample_linear_wcn_kernel!(out_size, ratios..., x, y, align_corners) |
128 |
| - config = launch_configuration(kernel.fun; max_threads=256) |
129 |
| - threads = Base.min(out_size, config.threads) |
130 |
| - blocks = cld(out_size, threads) |
131 |
| - kernel(out_size, ratios..., x, y, align_corners; threads=threads, blocks=blocks) |
132 |
| - return y |
133 |
| -end |
134 |
| - |
135 |
| -function NNlib.∇upsample_linear_wcn!(dx::CuArray{T,3}, Δ::CuArray{T,3}; align_corners=true) where T |
136 |
| - in_size = size(Δ)[1] |
137 |
| - if align_corners |
138 |
| - ratios = ntuple(i -> T((size(dx,i)-1) / (size(Δ,i)-1)), 1) # reversed compared to forward pass |
139 |
| - else |
140 |
| - ratios = ntuple(i -> T(size(dx,i) / size(Δ,i)), 1) |
141 |
| - end |
142 |
| - |
143 |
| - kernel = @cuda launch=false ∇upsample_linear_wcn_kernel!(in_size, ratios..., Δ, dx, align_corners) |
144 |
| - config = launch_configuration(kernel.fun; max_threads=256) |
145 |
| - threads = Base.min(in_size, config.threads) |
146 |
| - blocks = cld(in_size, threads) |
147 |
| - kernel(in_size, ratios..., Δ, dx, align_corners; threads=threads, blocks=blocks) |
148 |
| - return dx |
149 |
| -end |
150 |
| - |
151 | 154 |
|
152 | 155 | ###########
|
153 | 156 | # bilinear
|
154 | 157 | ###########
|
155 |
| -function upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, x, y, align_corners) |
| 158 | +function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, x::CuDeviceArray{<:Any, 4}, y::CuDeviceArray{<:Any, 4}, align_corners) |
156 | 159 | index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
|
157 | 160 |
|
158 | 161 | if index < n_elem
|
@@ -194,7 +197,7 @@ function upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, x, y, align_cor
|
194 | 197 | end
|
195 | 198 |
|
196 | 199 | # Δ is the gradient backpropagated from downstream layers
|
197 |
| -function ∇upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, Δ, dx, align_corners) |
| 200 | +function ∇upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, Δ::CuDeviceArray{<:Any, 4}, dx::CuDeviceArray{<:Any, 4}, align_corners) |
198 | 201 | index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
|
199 | 202 |
|
200 | 203 | if index < n_elem
|
@@ -237,44 +240,11 @@ function ∇upsample_bilinear_whcn_kernel!(n_elem, rwidth, rheight, Δ, dx, alig
|
237 | 240 | return nothing
|
238 | 241 | end
|
239 | 242 |
|
240 |
| -function NNlib.upsample_bilinear_whcn!(y::CuArray{T,4}, x::CuArray{T,4}; align_corners=true) where T |
241 |
| - out_size = prod(size(y)[1:2]) # w*h |
242 |
| - |
243 |
| - if align_corners |
244 |
| - ratios = ntuple(i -> T((size(x,i)-1) / (size(y,i)-1)), 2) |
245 |
| - else |
246 |
| - ratios = ntuple(i -> T(size(x,i) / size(y,i)), 2) |
247 |
| - end |
248 |
| - |
249 |
| - kernel = @cuda launch=false upsample_bilinear_whcn_kernel!(out_size, ratios..., x, y, align_corners) |
250 |
| - config = launch_configuration(kernel.fun; max_threads=256) |
251 |
| - threads = Base.min(out_size, config.threads) |
252 |
| - blocks = cld(out_size, threads) |
253 |
| - kernel(out_size, ratios..., x, y, align_corners; threads=threads, blocks=blocks) |
254 |
| - return y |
255 |
| -end |
256 |
| - |
257 |
| -function NNlib.∇upsample_bilinear_whcn!(dx::CuArray{T,4}, Δ::CuArray{T,4}; align_corners=true) where T |
258 |
| - in_size = prod(size(Δ)[1:2]) |
259 |
| - if align_corners |
260 |
| - ratios = ntuple(i -> T((size(dx,i)-1) / (size(Δ,i)-1)), 2) # reversed compared to forward pass |
261 |
| - else |
262 |
| - ratios = ntuple(i -> T(size(dx,i) / size(Δ,i)), 2) |
263 |
| - end |
264 |
| - |
265 |
| - kernel = @cuda launch=false ∇upsample_bilinear_whcn_kernel!(in_size, ratios..., Δ, dx, align_corners) |
266 |
| - config = launch_configuration(kernel.fun; max_threads=256) |
267 |
| - threads = Base.min(in_size, config.threads) |
268 |
| - blocks = cld(in_size, threads) |
269 |
| - kernel(in_size, ratios..., Δ, dx, align_corners; threads=threads, blocks=blocks) |
270 |
| - return dx |
271 |
| -end |
272 |
| - |
273 | 243 |
|
274 | 244 | ###########
|
275 | 245 | # trilinear
|
276 | 246 | ###########
|
277 |
| -function upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, x, y, align_corners) |
| 247 | +function upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, rdepth, x::CuDeviceArray{<:Any, 5}, y::CuDeviceArray{<:Any, 5}, align_corners) |
278 | 248 | index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
|
279 | 249 |
|
280 | 250 | if index < n_elem
|
@@ -337,7 +307,7 @@ function upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, x, y,
|
337 | 307 | end
|
338 | 308 |
|
339 | 309 | # Δ is the gradient backpropagated from downstream layers
|
340 |
| -function ∇upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, Δ, dx, align_corners) |
| 310 | +function ∇upsample_linear_cuda_kernel!(n_elem, rwidth, rheight, rdepth, Δ::CuDeviceArray{<:Any, 5}, dx::CuDeviceArray{<:Any, 5}, align_corners) |
341 | 311 | index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
|
342 | 312 |
|
343 | 313 | if index < n_elem
|
@@ -389,37 +359,3 @@ function ∇upsample_trilinear_whdcn_kernel!(n_elem, rwidth, rheight, rdepth, Δ
|
389 | 359 | end # if
|
390 | 360 | return nothing
|
391 | 361 | end
|
392 |
| - |
393 |
| -function NNlib.upsample_trilinear_whdcn!(y::CuArray{T,5}, x::CuArray{T,5}; align_corners=true) where T |
394 |
| - out_size = prod(size(y)[1:3]) # w*h*d |
395 |
| - |
396 |
| - if align_corners |
397 |
| - ratios = ntuple(i -> T((size(x,i)-1) / (size(y,i)-1)), 3) |
398 |
| - else |
399 |
| - ratios = ntuple(i -> T(size(x,i) / size(y,i)), 3) |
400 |
| - end |
401 |
| - |
402 |
| - kernel = @cuda launch=false upsample_trilinear_whdcn_kernel!(out_size, ratios..., x, y, align_corners) |
403 |
| - config = launch_configuration(kernel.fun; max_threads=256) |
404 |
| - threads = Base.min(out_size, config.threads) |
405 |
| - blocks = cld(out_size, threads) |
406 |
| - kernel(out_size, ratios..., x, y, align_corners; threads=threads, blocks=blocks) |
407 |
| - return y |
408 |
| -end |
409 |
| - |
410 |
| -function NNlib.∇upsample_trilinear_whdcn!(dx::CuArray{T,5}, Δ::CuArray{T,5}; align_corners=true) where T |
411 |
| - in_size = prod(size(Δ)[1:3]) |
412 |
| - |
413 |
| - if align_corners |
414 |
| - ratios = ntuple(i -> T((size(dx,i)-1) / (size(Δ,i)-1)), 3) # reversed compared to forward pass |
415 |
| - else |
416 |
| - ratios = ntuple(i -> T(size(dx,i) / size(Δ,i)), 3) |
417 |
| - end |
418 |
| - |
419 |
| - kernel = @cuda launch=false ∇upsample_trilinear_whdcn_kernel!(in_size, ratios..., Δ, dx, align_corners) |
420 |
| - config = launch_configuration(kernel.fun; max_threads=256) |
421 |
| - threads = Base.min(in_size, config.threads) |
422 |
| - blocks = cld(in_size, threads) |
423 |
| - kernel(in_size, ratios..., Δ, dx, align_corners; threads=threads, blocks=blocks) |
424 |
| - return dx |
425 |
| -end |
0 commit comments