
Commit cc1ae6c

Merge pull request #2189 from pxl-th/pxl-th/amdgpu
Add AMDGPU extension
2 parents dd6318f + 621829b commit cc1ae6c

17 files changed: +513 −81 lines changed

.gitignore

Lines changed: 1 addition & 1 deletion

@@ -7,4 +7,4 @@ docs/site/
 deps
 .vscode
 Manifest.toml
-
+LocalPreferences.toml

NEWS.md

Lines changed: 3 additions & 0 deletions

@@ -2,6 +2,9 @@

 ## v0.13.13
 * Added `f16` which changes precision to `Float16`, recursively.
+* Initial support for AMDGPU via extension mechanism.
+* Add `gpu_backend` preference to select GPU backend using `LocalPreferences.toml`.
+* Add `Flux.gpu_backend!` method to switch between GPU backends.

 ## v0.13.12
 * CUDA.jl 4.0 compatibility.

Project.toml

Lines changed: 10 additions & 1 deletion

@@ -14,6 +14,7 @@ NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
 OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
+Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 ProgressLogging = "33c8b6b6-d38a-422a-b730-caa89a2f386c"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
@@ -23,14 +24,21 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

+[weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+
+[extensions]
+AMDGPUExt = "AMDGPU"
+
 [compat]
+AMDGPU = "0.4.8"
 Adapt = "3.0"
 CUDA = "3, 4"
 ChainRulesCore = "1.12"
 Functors = "0.3, 0.4"
 MLUtils = "0.2, 0.3.1, 0.4"
 MacroTools = "0.5"
-NNlib = "0.8.15"
+NNlib = "0.8.19"
 NNlibCUDA = "0.2.6"
 OneHotArrays = "0.1, 0.2"
 Optimisers = "0.2.12"
@@ -42,6 +50,7 @@ Zygote = "0.6.49"
 julia = "1.6"

 [extras]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
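
The `[weakdeps]` and `[extensions]` entries above register `AMDGPUExt` as a package extension: on Julia 1.9+ it is loaded automatically once both Flux and AMDGPU are present in the same environment, and skipped otherwise. A minimal sketch of what that looks like from a user project (assuming both packages are already added; nothing here is specific to this commit beyond the extension name):

```julia
using Pkg
Pkg.add(["Flux", "AMDGPU"])   # AMDGPU is only a weak dependency of Flux itself

using Flux                    # plain Flux, no AMD-specific code loaded yet
using AMDGPU                  # triggers loading of the AMDGPUExt extension declared above
```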

docs/src/gpu.md

Lines changed: 36 additions & 0 deletions

@@ -2,6 +2,8 @@

 NVIDIA GPU support should work out of the box on systems with CUDA and CUDNN installed. For more details see the [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) readme.

+AMD GPU support is available since Julia 1.9 on systems with ROCm and MIOpen installed. For more details refer to the [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) repository.
+
 ## Checking GPU Availability

 By default, Flux will run the checks on your system to see if it can support GPU functionality. You can check if Flux identified a valid GPU setup by typing the following:
@@ -13,6 +15,40 @@ julia> CUDA.functional()
 true
 ```

+For AMD GPU:
+
+```julia
+julia> using AMDGPU
+
+julia> AMDGPU.functional()
+true
+
+julia> AMDGPU.functional(:MIOpen)
+true
+```
+
+## Selecting GPU backend
+
+Available GPU backends are: `CUDA`, `AMD`.
+
+Flux relies on [Preferences.jl](https://github.com/JuliaPackaging/Preferences.jl) for selecting the default GPU backend to use.
+
+There are two ways you can specify it:
+
+- From the REPL/code in your project, call `Flux.gpu_backend!("AMD")` and restart the Julia session (if needed) for the change to take effect.
+- In the `LocalPreferences.toml` file in your project directory specify:
+```toml
+[Flux]
+gpu_backend = "AMD"
+```
+
+The current GPU backend can be fetched from the `Flux.GPU_BACKEND` variable:
+
+```julia
+julia> Flux.GPU_BACKEND
+"CUDA"
+```
+
 ## GPU Usage

 Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CUDA](https://github.com/JuliaGPU/CUDA.jl). Flux is agnostic to array types, so we simply need to move model weights and data to the GPU and Flux will handle it.
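
Putting the documented pieces together, a minimal end-to-end sketch; the layer and input sizes are arbitrary, and a working ROCm/MIOpen installation is assumed:

```julia
using Flux

Flux.gpu_backend!("AMD")        # writes gpu_backend = "AMD" to LocalPreferences.toml

# Restart Julia for the preference to take effect, then:
using Flux, AMDGPU

m = Dense(4 => 2) |> gpu        # parameters are now ROCArrays
x = gpu(rand(Float32, 4, 8))
y = m(x)                        # forward pass runs on the AMD GPU
```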

ext/AMDGPUExt/AMDGPUExt.jl

Lines changed: 48 additions & 0 deletions (new file)

@@ -0,0 +1,48 @@
+module AMDGPUExt
+
+import ChainRulesCore
+import ChainRulesCore: NoTangent
+import Flux
+import Flux: FluxCPUAdaptor, FluxAMDAdaptor, _amd, _isleaf, adapt_storage, fmap
+import Flux: DenseConvDims, Conv, ConvTranspose, conv, conv_reshape_bias
+import NNlib
+
+using AMDGPU
+using Adapt
+using Random
+using Zygote
+
+const MIOPENFloat = AMDGPU.MIOpen.MIOPENFloat
+const USE_AMDGPU = Ref{Union{Nothing, Bool}}(nothing)
+
+function check_use_amdgpu()
+    isnothing(USE_AMDGPU[]) || return
+
+    USE_AMDGPU[] = AMDGPU.functional()
+    if USE_AMDGPU[]
+        if !AMDGPU.functional(:MIOpen)
+            @warn "MIOpen is not functional in AMDGPU.jl, some functionality will not be available."
+        end
+    else
+        @info """
+        The AMDGPU function is being called but the AMDGPU is not functional.
+        Defaulting back to the CPU. (No action is required if you want to run on the CPU).
+        """ maxlog=1
+    end
+    return
+end
+ChainRulesCore.@non_differentiable check_use_amdgpu()
+
+include("functor.jl")
+include("batchnorm.jl")
+include("conv.jl")
+
+function __init__()
+    Flux.AMDGPU_LOADED[] = true
+end
+
+# TODO
+# fail early if input to the model is not on the device (e.g. on the host)
+# otherwise we get very cryptic errors & segfaults at the rocBLAS level
+
+end
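
One consequence of the lazy `USE_AMDGPU` check above: if AMDGPU.jl loads but no usable ROCm stack is found, `gpu` degrades to a no-op instead of erroring. A rough sketch of that behaviour, assuming the `AMD` backend has been selected as described in `docs/src/gpu.md`:

```julia
using Flux, AMDGPU    # the extension loads even without a working AMD GPU

x = rand(Float32, 3)
y = gpu(x)            # first call logs the "Defaulting back to the CPU" info once
@assert y === x       # the data stays on the host untouched
```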

ext/AMDGPUExt/batchnorm.jl

Lines changed: 24 additions & 0 deletions (new file)

@@ -0,0 +1,24 @@
+function (b::Flux.BatchNorm)(x::ROCArray{T}) where T <: MIOPENFloat
+    b.λ.(_amd_batchnorm(
+        x, b.γ, b.β; μ=b.μ, σ²=b.σ², ϵ=b.ϵ,
+        within_grad=NNlib.within_gradient(x)))
+end
+
+function _amd_batchnorm(x, γ, β; μ, σ², ϵ, within_grad::Bool)
+    if within_grad
+        return AMDGPU.MIOpen.batchnorm_training(x, γ, β, μ, σ²; ϵ=Float64(ϵ), iteration=0) # TODO iteration
+    else
+        return AMDGPU.MIOpen.batchnorm_inference(x, γ, β, μ, σ²; ϵ=Float64(ϵ))
+    end
+end
+
+function ChainRulesCore.rrule(
+    ::typeof(_amd_batchnorm), x, γ, β; μ, σ², ϵ, within_grad::Bool,
+)
+    y, μ_saved, ν_saved = _amd_batchnorm(x, γ, β; μ, σ², ϵ, within_grad)
+    function _batchnorm_pullback(Δ)
+        dx, dγ, dβ = AMDGPU.MIOpen.∇batchnorm(Δ, x, γ, β, μ_saved, ν_saved)
+        (NoTangent(), dx, dγ, dβ)
+    end
+    y, _batchnorm_pullback
+end
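
For context, a small sketch of how this `BatchNorm` method is reached; the layer width, input shape and the use of `ROCArray` here are illustrative assumptions, and a functional MIOpen setup is required:

```julia
using Flux, AMDGPU

bn = BatchNorm(4) |> gpu                   # parameters moved to ROCArrays
x  = ROCArray(rand(Float32, 8, 8, 4, 2))   # WHCN batch on the device
y  = bn(x)                                 # hits the ROCArray method above and
                                           # calls the MIOpen batchnorm kernels
```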

ext/AMDGPUExt/conv.jl

Lines changed: 21 additions & 0 deletions (new file)

@@ -0,0 +1,21 @@
+function Flux.conv_dims(c::Conv, x::T) where T <: ROCArray
+    DenseConvDims(
+        x, c.weight; stride=c.stride, padding=c.pad,
+        dilation=c.dilation, groups=c.groups, flipkernel=true)
+end
+
+function Flux.conv_transpose_dims(c::ConvTranspose, x::T) where T <: ROCArray
+    # Calculate size of "input", from ∇conv_data()'s perspective...
+    combined_pad = (c.pad[1:2:end] .+ c.pad[2:2:end])
+    I = (size(x)[1:end - 2] .- 1) .* c.stride .+ 1 .+
+        (size(c.weight)[1:end - 2] .- 1) .* c.dilation .- combined_pad
+    C_in = size(c.weight)[end - 1] * c.groups
+    batch_size = size(x)[end]
+
+    # Create DenseConvDims() that looks like the corresponding conv().
+    w_size = size(c.weight)
+    DenseConvDims(
+        (I..., C_in, batch_size), w_size;
+        stride=c.stride, padding=c.pad, dilation=c.dilation,
+        groups=c.groups, flipkernel=true)
+end
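
To make the size arithmetic in `conv_transpose_dims` concrete, here is a worked example with arbitrary illustrative values for kernel size, stride, padding and dilation:

```julia
# Assumed ConvTranspose: 3×3 kernel, stride 2, pad 1, dilation 1,
# applied to x with spatial size (16, 16):
#   combined_pad = (1 + 1, 1 + 1)                 = (2, 2)
#   I = (16 - 1) .* 2 .+ 1 .+ (3 - 1) .* 1 .- 2   = (31, 31)
# so ∇conv_data sees an "input" of spatial size 31×31.
```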

ext/AMDGPUExt/functor.jl

Lines changed: 95 additions & 0 deletions (new file)

@@ -0,0 +1,95 @@
+# Convert Float64 to Float32, but preserve Float16.
+adapt_storage(::FluxAMDAdaptor, x::T) where T <: AbstractArray =
+    isbits(x) ? x : ROCArray(x)
+adapt_storage(::FluxAMDAdaptor, x::AbstractArray{T, N}) where {T <: AbstractFloat, N} =
+    isbits(x) ? x : ROCArray{Float32, N}(x)
+adapt_storage(::FluxAMDAdaptor, x::AbstractArray{Float16, N}) where N =
+    isbits(x) ? x : ROCArray{Float16, N}(x)
+
+adapt_storage(::FluxAMDAdaptor, x::Zygote.FillArrays.AbstractFill) =
+    ROCArray(collect(x))
+adapt_storage(::FluxAMDAdaptor, x::Zygote.OneElement) = ROCArray(collect(x))
+adapt_storage(::FluxAMDAdaptor, x::Random.TaskLocalRNG) =
+    AMDGPU.rocRAND.default_rng()
+adapt_storage(::FluxAMDAdaptor, x::AMDGPU.rocRAND.RNG) = x
+adapt_storage(::FluxAMDAdaptor, x::AbstractRNG) = error("""
+    Cannot map RNG of type $(typeof(x)) to AMDGPU.
+    AMDGPU execution only supports Random.default_rng().""")
+
+adapt_storage(::FluxCPUAdaptor, x::AMDGPU.rocRAND.RNG) = Random.default_rng()
+
+function ChainRulesCore.rrule(
+    ::typeof(Adapt.adapt_storage), to::FluxCPUAdaptor, x::AMDGPU.AnyROCArray,
+)
+    adapt_storage(to, x), dx -> (
+        NoTangent(), NoTangent(),
+        adapt_storage(FluxAMDAdaptor(), unthunk(dx)))
+end
+
+function _amd(x)
+    check_use_amdgpu()
+    USE_AMDGPU[] || return x
+    fmap(x -> Adapt.adapt(FluxAMDAdaptor(), x), x; exclude=_isleaf)
+end
+
+# Since MIOpen supports only cross-correlation as convolution,
+# for the actual convolution, we flip horizontally and vertically the weights.
+# Same for CPU -> GPU & GPU -> CPU movements.
+# Note, that gradients are also flipped.
+
+# CPU -> GPU
+
+_conv_basetype(c::Type{C}) where C <: Conv = Conv
+_conv_basetype(c::Type{C}) where C <: ConvTranspose = ConvTranspose
+
+function adapt_storage(to::FluxAMDAdaptor, m::C) where C <: Union{Conv, ConvTranspose}
+    flipped_weight = reverse(m.weight; dims=ntuple(i -> i, ndims(m.weight) - 2))
+    _conv_basetype(C)(
+        Adapt.adapt(to, m.σ),
+        Adapt.adapt(to, flipped_weight),
+        Adapt.adapt(to, m.bias),
+        m.stride, m.pad, m.dilation, m.groups)
+end
+
+# Don't adapt again.
+function adapt_storage(
+    to::FluxAMDAdaptor, m::Conv{N, M, F, A, V},
+) where {N, M, F, A <: ROCArray, V}
+    return m
+end
+
+function adapt_storage(
+    to::FluxAMDAdaptor, m::ConvTranspose{N, M, F, A, V},
+) where {N, M, F, A <: ROCArray, V}
+    return m
+end
+
+_amd(m::Union{Conv, ConvTranspose}) = adapt_storage(FluxAMDAdaptor(), m)
+
+# GPU -> CPU
+
+function Flux.cpu(m::Conv{N, M, F, A, V}) where {N, M, F, A <: ROCArray, V}
+    adapt_storage(FluxCPUAdaptor(), m)
+end
+
+function Flux.cpu(m::ConvTranspose{N, M, F, A, V}) where {N, M, F, A <: ROCArray, V}
+    adapt_storage(FluxCPUAdaptor(), m)
+end
+
+function adapt_storage(
+    to::FluxCPUAdaptor, m::Conv{N, M, F, A, V},
+) where {N, M, F, A <: ROCArray, V}
+    dims = ntuple(i -> i, ndims(m.weight) - 2)
+    Conv(
+        Adapt.adapt(to, m.σ), reverse(Adapt.adapt(to, m.weight); dims),
+        Adapt.adapt(to, m.bias), m.stride, m.pad, m.dilation, m.groups)
+end
+
+function adapt_storage(
+    to::FluxCPUAdaptor, m::ConvTranspose{N, M, F, A, V},
+) where {N, M, F, A <: ROCArray, V}
+    dims = ntuple(i -> i, ndims(m.weight) - 2)
+    ConvTranspose(
+        Adapt.adapt(to, m.σ), reverse(Adapt.adapt(to, m.weight); dims),
+        Adapt.adapt(to, m.bias), m.stride, m.pad, m.dilation, m.groups)
+end
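
Because the flipping in `adapt_storage` is applied symmetrically in both directions, a CPU → GPU → CPU round trip leaves a convolution's weights unchanged. A minimal sketch of that invariant (assuming a functional AMD GPU and the `AMD` backend selected):

```julia
using Flux, AMDGPU

c     = Conv((3, 3), 1 => 1)
c_gpu = gpu(c)                   # weights reversed along the spatial dims, stored as ROCArray
c_cpu = cpu(c_gpu)               # reversed back while copying to the host
@assert c_cpu.weight ≈ c.weight  # the round trip preserves the original weights
```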

src/Flux.jl

Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 module Flux

 using Base: tail
+using Preferences
 using LinearAlgebra, Statistics, Random # standard lib
 using MacroTools, Reexport, ProgressLogging, SpecialFunctions
 using MacroTools: @forward

src/functor.jl

Lines changed: 57 additions & 2 deletions

@@ -177,14 +177,38 @@ _isbitsarray(x) = false
 _isleaf(::AbstractRNG) = true
 _isleaf(x) = _isbitsarray(x) || Functors.isleaf(x)

+const GPU_BACKENDS = ("CUDA", "AMD")
+const GPU_BACKEND = @load_preference("gpu_backend", "CUDA")
+
+function gpu_backend!(backend::String)
+    if backend == GPU_BACKEND
+        @info """
+        GPU backend is already set to: $backend.
+        No need to do anything else.
+        """
+        return
+    end
+
+    backend in GPU_BACKENDS || throw(ArgumentError("""
+        Unsupported GPU backend: $backend.
+        Supported backends are: $GPU_BACKENDS.
+        """))
+
+    @set_preferences!("gpu_backend" => backend)
+    @info """
+    New GPU backend set: $backend.
+    Restart your Julia session for this change to take effect!
+    """
+end
+
 """
     gpu(x)

-Copies `m` to the current GPU device, if one is available.
+Copies `m` to the current GPU device (using current GPU backend), if one is available.
 If no GPU is available, it does nothing (but prints a warning the first time).

 On arrays, this calls CUDA's `cu`, which also changes arrays
-with Float64 elements to Float32 while copying them to the device.
+with Float64 elements to Float32 while copying them to the device (same for AMDGPU).
 To act on arrays within a struct, the struct type must be marked with [`@functor`](@ref).

 Use [`cpu`](@ref) to copy back to ordinary `Array`s.
@@ -209,6 +233,19 @@ CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
 ```
 """
 function gpu(x)
+    @static if GPU_BACKEND == "CUDA"
+        gpu(FluxCUDAAdaptor(), x)
+    elseif GPU_BACKEND == "AMD"
+        gpu(FluxAMDAdaptor(), x)
+    else
+        error("""
+        Unsupported GPU backend: $GPU_BACKEND.
+        Supported backends are: $GPU_BACKENDS.
+        """)
+    end
+end
+
+function gpu(::FluxCUDAAdaptor, x)
   check_use_cuda()
   use_cuda[] ? fmap(x -> Adapt.adapt(FluxCUDAAdaptor(), x), x; exclude = _isleaf) : x
 end
@@ -280,3 +317,21 @@ f16(m) = _paramtype(Float16, m)
 @functor Cholesky
 trainable(c::Cholesky) = ()

+# AMDGPU extension.
+
+struct FluxAMDAdaptor end
+
+const AMDGPU_LOADED = Ref{Bool}(false)
+
+function gpu(::FluxAMDAdaptor, x)
+    if AMDGPU_LOADED[]
+        return _amd(x)
+    else
+        @info """
+        The AMDGPU functionality is being called via `Flux.amd` but
+        `AMDGPU` must be loaded to access it.
+        """ maxlog=1
+    end
+end
+
+function _amd end
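
Since `GPU_BACKEND` is baked in via `@load_preference` when Flux is loaded, switching backends is a two-step process. A rough REPL sketch; the logged text is paraphrased from the messages above:

```julia
julia> Flux.GPU_BACKEND          # default when no preference is set
"CUDA"

julia> Flux.gpu_backend!("AMD")
# [ Info: New GPU backend set: AMD.
#         Restart your Julia session for this change to take effect!

# After restarting Julia:
julia> Flux.GPU_BACKEND
"AMD"
```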
