Fix batchnorm & handle regular convolutions

pxl-th · pxl-th · commit cf6dd42d8ec7 · 2023-02-17T15:45:48.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -59,4 +59,4 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Documenter", "IterTools", "LinearAlgebra", "FillArrays", "ComponentArrays"]
+test = ["AMDGPU", "Test", "Documenter", "IterTools", "LinearAlgebra", "FillArrays", "ComponentArrays"]
diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl
@@ -4,6 +4,8 @@ import ChainRulesCore
 import ChainRulesCore: NoTangent
 import Flux
 import Flux: FluxCPUAdaptor, FluxAMDAdaptor, _amd, _isleaf, adapt_storage, fmap
+import Flux: DenseConvDims, Conv, conv, conv_reshape_bias
+import NNlib
 
 using AMDGPU
 using Adapt
@@ -32,6 +34,8 @@ end
 ChainRulesCore.@non_differentiable check_use_amdgpu()
 
 include("functor.jl")
+include("batchnorm.jl")
+include("conv.jl")
 
 function __init__()
     Flux.AMDGPU_LOADED[] = true
diff --git a/ext/AMDGPUExt/batchnorm.jl b/ext/AMDGPUExt/batchnorm.jl
@@ -1,19 +1,23 @@
 function (b::Flux.BatchNorm)(x::ROCArray{T}) where T <: MIOPENFloat
-    bλ.(_amd_batchnorm(x, b.γ, b.β; μ=b.μ, σ²=b.σ², ϵ=b.ϵ))
+    b.λ.(_amd_batchnorm(
+        x, b.γ, b.β; μ=b.μ, σ²=b.σ², ϵ=b.ϵ,
+        within_grad=NNlib.within_gradient(x)))
 end
 
-function _amd_batchnorm(x, γ, β; μ, σ², ϵ)
-    if NNlib.within_gradient(x)
-        return AMDGPU.MIOpen.batchnorm_training(x, γ, β, μ, σ²; ϵ, iteration=0) # TODO iteration
+function _amd_batchnorm(x, γ, β; μ, σ², ϵ, within_grad::Bool)
+    if within_grad
+        return AMDGPU.MIOpen.batchnorm_training(x, γ, β, μ, σ²; ϵ=Float64(ϵ), iteration=0) # TODO iteration
     else
-        return AMDGPU.MIOpen.batchnorm_inference(x, γ, β, μ, σ²; ϵ)
+        return AMDGPU.MIOpen.batchnorm_inference(x, γ, β, μ, σ²; ϵ=Float64(ϵ))
     end
 end
 
-function ChainRulesCore.rrule(::typeof(_amd_batchnorm), x, γ, β; μ, σ², ϵ)
-    y, μ_saved, ν_saved = _amd_batchnorm(x, γ, β; μ, σ², ϵ)
+function ChainRulesCore.rrule(
+    ::typeof(_amd_batchnorm), x, γ, β; μ, σ², ϵ, within_grad::Bool,
+)
+    y, μ_saved, ν_saved = _amd_batchnorm(x, γ, β; μ, σ², ϵ, within_grad)
     function _batchnorm_pullback(Δ)
-        dx, dγ, dβ = MIOpen.∇batchnorm(Δ, x, γ, β, μ_saved, ν_saved)
+        dx, dγ, dβ = AMDGPU.MIOpen.∇batchnorm(Δ, x, γ, β, μ_saved, ν_saved)
         (NoTangent(), dx, dγ, dβ)
     end
     y, _batchnorm_pullback
diff --git a/ext/AMDGPUExt/conv.jl b/ext/AMDGPUExt/conv.jl
@@ -0,0 +1,9 @@
+function (c::Conv)(x::T) where T <: ROCArray
+    Flux._size_check(c, x, ndims(x) - 1 => Flux._channels_in(c))
+    σ = NNlib.fast_act(c.σ, x)
+    cdims = DenseConvDims(
+        x, c.weight; stride=c.stride, padding=c.pad,
+        dilation=c.dilation, groups=c.groups, flipkernel=true)
+    xT = Flux._match_eltype(c, x)
+    σ.(conv(xT, c.weight, cdims) .+ conv_reshape_bias(c))
+end
diff --git a/src/Flux.jl b/src/Flux.jl
@@ -73,22 +73,4 @@ include("deprecations.jl")
 
 include("cuda/cuda.jl")
 
-const GPU_BACKENDS = Dict(
-    "CUDA" => FluxCUDAAdaptor(),
-    "AMD" => FluxAMDAdaptor())
-
-const GPU_BACKEND = Ref{Union{FluxCUDAAdaptor, FluxAMDAdaptor}}(
-    GPU_BACKENDS[@load_preference("gpu_backend", "CUDA")])
-
-function gpu_backend!(backend::String)
-    backend in keys(GPU_BACKENDS) || throw(ArgumentError("""
-    Unsupported GPU backend: $backend.
-    Supported backends are: $(keys(GPU_BACKENDS)).
-    """))
-
-    @set_preferences!("gpu_backend" => backend)
-    GPU_BACKEND[] = GPU_BACKENDS[@load_preference("gpu_backend")]
-    return
-end
-
 end # module
diff --git a/src/functor.jl b/src/functor.jl
@@ -177,6 +177,30 @@ _isbitsarray(x) = false
 _isleaf(::AbstractRNG) = true
 _isleaf(x) = _isbitsarray(x) || Functors.isleaf(x)
 
+const GPU_BACKENDS = ("CUDA", "AMD")
+const GPU_BACKEND = @load_preference("gpu_backend", "CUDA")
+
+function gpu_backend!(backend::String)
+    if backend == GPU_BACKEND
+        @info """
+        GPU backend is already set to: $backend.
+        No need to do anything else.
+        """
+        return
+    end
+
+    backend in GPU_BACKENDS || throw(ArgumentError("""
+    Unsupported GPU backend: $backend.
+    Supported backends are: $GPU_BACKENDS.
+    """))
+
+    @set_preferences!("gpu_backend" => backend)
+    @info """
+    New GPU backend set: $backend.
+    Restart your Julia session for this change to take effect!
+    """
+end
+
 """
     gpu(x)
 
@@ -209,7 +233,16 @@ CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
 ```
 """
 function gpu(x)
-    gpu(GPU_BACKEND[], x)
+    @static if GPU_BACKEND == "CUDA"
+        gpu(FluxCUDAAdaptor(), x)
+    elseif GPU_BACKEND == "AMD"
+        gpu(FluxAMDAdaptor(), x)
+    else
+        error("""
+        Unsupported GPU backend: $GPU_BACKEND.
+        Supported backends are: $GPU_BACKENDS.
+        """)
+    end
 end
 
 function gpu(::FluxCUDAAdaptor, x)
diff --git a/test/amd/basic.jl b/test/amd/basic.jl
@@ -81,8 +81,8 @@ end
 @testset "Batchnorm" begin
     bn = BatchNorm(3, σ)
     for nd in 1:3
-        x = rand(Float32, fill(16, nd - 1)..., 3, 4)
-        amdgputest(bn, x; atol=1f-3)
+        x = rand(Float32, fill(2, nd - 1)..., 3, 4)
+        amdgputest(bn, x; atol=1f-3, allow_nothing=true)
     end
 end
 
diff --git a/test/amd/utils.jl b/test/amd/utils.jl
@@ -1,4 +1,6 @@
-function amdgputest(model, xs...; checkgrad=true, atol=1e-6)
+function amdgputest(
+    model, xs...; checkgrad=true, atol=1e-6, allow_nothing::Bool = false,
+)
     cpu_model = model
     gpu_model = Flux.gpu(model)
 
@@ -12,36 +14,40 @@ function amdgputest(model, xs...; checkgrad=true, atol=1e-6)
     if checkgrad
         cpu_grad = gradient(m -> sum(m(cpu_in...)), cpu_model)
         gpu_grad = gradient(m -> sum(m(gpu_in...)), gpu_model)
-        amd_check_grad(gpu_grad, cpu_grad; atol)
+        amd_check_grad(gpu_grad, cpu_grad; atol, allow_nothing)
     end
 end
 
-function amd_check_grad(g_gpu, g_cpu; atol)
-  @show g_gpu g_cpu
-  @test false
+function amd_check_grad(g_gpu, g_cpu; atol, allow_nothing)
+    allow_nothing && return
+    @show g_gpu g_cpu
+    @test false
 end
 
-amd_check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol) =
-    amd_check_grad(g_gpu[], g_cpu[]; atol)
-amd_check_grad(g_gpu::Nothing, g_cpu::Nothing; atol) =
+amd_check_grad(g_gpu::Base.RefValue, g_cpu::Base.RefValue, atol, allow_nothing) =
+    amd_check_grad(g_gpu[], g_cpu[]; atol, allow_nothing)
+amd_check_grad(g_gpu::Nothing, g_cpu::Nothing; atol, allow_nothing) =
     @test true
-amd_check_grad(g_gpu::Float32, g_cpu::Float32; atol) =
+amd_check_grad(g_gpu::Float32, g_cpu::Float32; atol, allow_nothing) =
     @test g_cpu ≈ g_gpu atol=atol
-amd_check_grad(g_gpu::ROCArray{Float32}, g_cpu::Array{Float32}; atol) =
-    @test g_cpu ≈ collect(g_gpu) atol=atol
 amd_check_grad(
-    g_gpu::ROCArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill; atol,
+    g_gpu::ROCArray{Float32}, g_cpu::Array{Float32};
+    atol, allow_nothing,
+) = @test g_cpu ≈ collect(g_gpu) atol=atol
+amd_check_grad(
+    g_gpu::ROCArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill;
+    atol, allow_nothing
 ) = @test collect(g_cpu) ≈ collect(g_gpu) atol=atol
 
-function amd_check_grad(g_gpu::Tuple, g_cpu::Tuple; atol)
+function amd_check_grad(g_gpu::Tuple, g_cpu::Tuple; atol, allow_nothing)
     for (v1, v2) in zip(g_gpu, g_cpu)
-        amd_check_grad(v1, v2; atol)
+        amd_check_grad(v1, v2; atol, allow_nothing)
     end
 end
 
-function amd_check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple; atol)
+function amd_check_grad(g_gpu::NamedTuple, g_cpu::NamedTuple; atol, allow_nothing)
     for ((k1, v1), (k2, v2)) in zip(pairs(g_gpu), pairs(g_cpu))
         @test k1 == k2
-        amd_check_grad(v1, v2; atol)
+        amd_check_grad(v1, v2; atol, allow_nothing)
     end
 end