
Commit 746caa5

Handle ConvTranspose correctly & refactor

1 parent 0a9daf7

File tree: 5 files changed (+53 / -28 lines)

ext/AMDGPUExt/AMDGPUExt.jl
Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ import ChainRulesCore
 import ChainRulesCore: NoTangent
 import Flux
 import Flux: FluxCPUAdaptor, FluxAMDAdaptor, _amd, _isleaf, adapt_storage, fmap
-import Flux: DenseConvDims, Conv, conv, conv_reshape_bias
+import Flux: DenseConvDims, Conv, ConvTranspose, conv, conv_reshape_bias
 import NNlib

 using AMDGPU

ext/AMDGPUExt/conv.jl
Lines changed: 18 additions & 6 deletions

@@ -1,9 +1,21 @@
-function (c::Conv)(x::T) where T <: ROCArray
-    Flux._size_check(c, x, ndims(x) - 1 => Flux._channels_in(c))
-    σ = NNlib.fast_act(c.σ, x)
-    cdims = DenseConvDims(
+function Flux.conv_dims(c::Conv, x::T) where T <: ROCArray
+    DenseConvDims(
         x, c.weight; stride=c.stride, padding=c.pad,
         dilation=c.dilation, groups=c.groups, flipkernel=true)
-    xT = Flux._match_eltype(c, x)
-    σ.(conv(xT, c.weight, cdims) .+ conv_reshape_bias(c))
+end
+
+function Flux.conv_transpose_dims(c::ConvTranspose, x::T) where T <: ROCArray
+    # Calculate size of "input", from ∇conv_data()'s perspective...
+    combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end])
+    I = (size(x)[1:end - 2] .- 1) .* c.stride .+ 1 .+
+        (size(c.weight)[1:end - 2] .- 1) .* c.dilation .- combined_pad
+    C_in = size(c.weight)[end - 1] * c.groups
+    batch_size = size(x)[end]
+
+    # Create DenseConvDims() that looks like the corresponding conv().
+    w_size = size(c.weight)
+    DenseConvDims(
+        (I..., C_in, batch_size), w_size;
+        stride=c.stride, padding=c.pad, dilation=c.dilation,
+        groups=c.groups, flipkernel=true)
 end
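
Note on conv_transpose_dims: it reconstructs the spatial size of the input that the corresponding forward conv() would have seen, which is exactly the output size of the transposed convolution. Below is a minimal sketch of that size formula in isolation; transpose_output_size is a hypothetical name used only for illustration and is not part of this commit.

    # Output length of a transposed convolution along one spatial dimension,
    # mirroring the `I = ...` computation above.
    transpose_output_size(x_len, w_len; stride=1, dilation=1, pad=(0, 0)) =
        (x_len - 1) * stride + 1 + (w_len - 1) * dilation - sum(pad)

    transpose_output_size(10, 2)            # 11: kernel 2, stride 1, no padding
    transpose_output_size(10, 2; stride=2)  # 20: stride 2 upsamples the 10-element input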

ext/AMDGPUExt/functor.jl
Lines changed: 31 additions & 11 deletions

@@ -44,9 +44,12 @@ end

 # CPU -> GPU

-function adapt_storage(to::FluxAMDAdaptor, m::Flux.Conv)
+_conv_basetype(c::Type{C}) where C <: Conv = Conv
+_conv_basetype(c::Type{C}) where C <: ConvTranspose = ConvTranspose
+
+function adapt_storage(to::FluxAMDAdaptor, m::C) where C <: Union{Conv, ConvTranspose}
     flipped_weight = reverse(m.weight; dims=ntuple(i -> i, ndims(m.weight) - 2))
-    Flux.Conv(
+    _conv_basetype(C)(
         Adapt.adapt(to, m.σ),
         Adapt.adapt(to, flipped_weight),
         Adapt.adapt(to, m.bias),
@@ -55,26 +58,43 @@ end

 # Don't adapt again.
 function adapt_storage(
-    to::FluxAMDAdaptor, m::Flux.Conv{N, M, F, A, V},
+    to::FluxAMDAdaptor, m::Conv{N, M, F, A, V},
 ) where {N, M, F, A <: ROCArray, V}
     return m
 end

-_amd(m::Flux.Conv) = adapt_storage(FluxAMDAdaptor(), m)
+function adapt_storage(
+    to::FluxAMDAdaptor, m::ConvTranspose{N, M, F, A, V},
+) where {N, M, F, A <: ROCArray, V}
+    return m
+end
+
+_amd(m::Union{Conv, ConvTranspose}) = adapt_storage(FluxAMDAdaptor(), m)

 # GPU -> CPU

-function Flux.cpu(m::Flux.Conv{N, M, F, A, V}) where {N, M, F, A <: ROCArray, V}
+function Flux.cpu(m::Conv{N, M, F, A, V}) where {N, M, F, A <: ROCArray, V}
+    adapt_storage(FluxCPUAdaptor(), m)
+end
+
+function Flux.cpu(m::ConvTranspose{N, M, F, A, V}) where {N, M, F, A <: ROCArray, V}
     adapt_storage(FluxCPUAdaptor(), m)
 end

 function adapt_storage(
-    to::FluxCPUAdaptor, m::Flux.Conv{N, M, F, A, V},
+    to::FluxCPUAdaptor, m::Conv{N, M, F, A, V},
 ) where {N, M, F, A <: ROCArray, V}
     dims = ntuple(i -> i, ndims(m.weight) - 2)
-    Flux.Conv(
-        Adapt.adapt(to, m.σ),
-        reverse(Adapt.adapt(to, m.weight); dims),
-        Adapt.adapt(to, m.bias),
-        m.stride, m.pad, m.dilation, m.groups)
+    Conv(
+        Adapt.adapt(to, m.σ), reverse(Adapt.adapt(to, m.weight); dims),
+        Adapt.adapt(to, m.bias), m.stride, m.pad, m.dilation, m.groups)
+end
+
+function adapt_storage(
+    to::FluxCPUAdaptor, m::ConvTranspose{N, M, F, A, V},
+) where {N, M, F, A <: ROCArray, V}
+    dims = ntuple(i -> i, ndims(m.weight) - 2)
+    ConvTranspose(
+        Adapt.adapt(to, m.σ), reverse(Adapt.adapt(to, m.weight); dims),
+        Adapt.adapt(to, m.bias), m.stride, m.pad, m.dilation, m.groups)
 end
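
Note on the weight handling above: adapting a Conv or ConvTranspose to the GPU reverses the kernel's spatial dimensions (matching the flipkernel=true DenseConvDims built in conv.jl), and moving it back to the CPU reverses them again, so a CPU -> GPU -> CPU round trip should leave the weights unchanged. Below is a CPU-only sketch of that invariant; the weight shape is illustrative and no AMDGPU device is needed.

    w = rand(Float32, 3, 3, 2, 4)        # (spatial..., C_in, C_out), shape chosen for illustration
    dims = ntuple(i -> i, ndims(w) - 2)  # spatial dimensions only: (1, 2)
    flipped = reverse(w; dims)           # what the CPU -> GPU adaptor does to the weight
    @assert reverse(flipped; dims) == w  # and what the GPU -> CPU path undoes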

test/amd/basic.jl
Lines changed: 2 additions & 9 deletions

@@ -25,8 +25,8 @@ end
 end

 @testset "Convolution" begin
-    for nd in 1:3
-        m = Conv(tuple(fill(2, nd)...), 3 => 4) |> f32
+    for conv_type in (Conv, ConvTranspose), nd in 1:3
+        m = conv_type(tuple(fill(2, nd)...), 3 => 4) |> f32
         x = rand(Float32, fill(10, nd)..., 3, 5)

         # Ensure outputs are the same.
@@ -85,10 +85,3 @@ end
         amdgputest(bn, x; atol=1f-3, allow_nothing=true)
     end
 end
-
-# FIXME scalar indexing. Needs NNlib.scatter?
-# @testset "Flux.onehot gpu" begin
-#     y = Flux.onehotbatch(ones(3), 1:2) |> Flux.gpu
-#     x = rand(3, 2) |> Flux.gpu
-#     @test gradient(x -> sum(x * y), x)[1] isa ROCArray
-# end
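
The widened loop now exercises both layer types for 1-, 2- and 3-dimensional kernels. For illustration only, a sketch of the layers it constructs (the real test additionally compares CPU and GPU outputs via amdgputest):

    using Flux

    Conv((2,), 3 => 4)               # nd = 1
    ConvTranspose((2, 2), 3 => 4)    # nd = 2
    ConvTranspose((2, 2, 2), 3 => 4) # nd = 3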

test/amd/utils.jl
Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ amd_check_grad(
 amd_check_grad(
     g_gpu::ROCArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill;
     atol, allow_nothing
-) = @test collect(g_cpu) ≈ collect(g_gpu) atol=atol
+) = @test g_cpu ≈ collect(g_gpu) atol=atol

 function amd_check_grad(g_gpu::Tuple, g_cpu::Tuple; atol, allow_nothing)
     for (v1, v2) in zip(g_gpu, g_cpu)
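
Context for this one-line change: a Zygote.FillArrays.AbstractFill is already an AbstractArray, so it can be compared with ≈ against the collected GPU gradient without an extra collect on the CPU side. A small CPU-only sketch with illustrative values:

    using Zygote

    g_cpu = Zygote.FillArrays.Fill(1f0, 3)  # stands in for a constant gradient from Zygote
    g_gpu_collected = ones(Float32, 3)      # stands in for collect(g_gpu)

    @assert g_cpu isa Zygote.FillArrays.AbstractFill
    @assert g_cpu ≈ g_gpu_collected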
