
Commit 1b24915

Merge pull request #470 from pxl-th/amdgpu-extension
Add AMDGPU extension
2 parents b1226e8 + 44f7b3d commit 1b24915

File tree: 17 files changed, +422 −0 lines changed

.buildkite/pipeline.yml

Lines changed: 28 additions & 0 deletions

@@ -55,6 +55,34 @@ steps:
     if: build.pull_request.labels includes "benchmark"
     timeout_in_minutes: 30
 
+  - label: "AMDGPU - Julia 1.9 - No Artifacts"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: 1.9-nightly
+      - JuliaCI/julia-test#v1:
+      - JuliaCI/julia-coverage#v1:
+          codecov: true
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    command:
+      - julia -e """
+          using TOML;
+          conf = TOML.parse(read(\"Project.toml\", String));
+          push!(conf[\"targets\"][\"test\"], \"AMDGPU\");
+          open(io -> TOML.print(io, conf), \"Project.toml\", \"w\");
+          """
+    timeout_in_minutes: 30
+    env:
+      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
+      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
+      JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
+      NNLIB_TEST_AMDGPU: true
 
  # - label: "GPU julia nightly"
  #   plugins:
  #     - JuliaCI/julia#v1:
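For readability, the escaped one-liner in the `command:` block above is the following script: it appends AMDGPU to the package's test targets so that `Pkg.test` installs it on this CI queue. The `JULIA_AMDGPU_*_MUST_LOAD` flags turn missing ROCm libraries into hard errors, and `JULIA_AMDGPU_DISABLE_ARTIFACTS` makes AMDGPU use system ROCm rather than artifact binaries.

    using TOML
    conf = TOML.parse(read("Project.toml", String))
    push!(conf["targets"]["test"], "AMDGPU")   # add AMDGPU to the [targets] test list
    open(io -> TOML.print(io, conf), "Project.toml", "w")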

Project.toml

Lines changed: 8 additions & 0 deletions

@@ -11,13 +11,21 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
+[weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+
+[extensions]
+AMDGPUExt = "AMDGPU"
+
 [compat]
+AMDGPU = "0.4.7"
 Adapt = "2, 3.2"
 ChainRulesCore = "1.13"
 Requires = "0.5, 1.0"
 julia = "1.6"
 
 [extras]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
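With the [weakdeps]/[extensions] entries above, Julia 1.9+ loads AMDGPUExt automatically once AMDGPU and NNlib are both in the session; the duplicate [extras] entry keeps AMDGPU available to the test environment. A minimal check, assuming both packages are installed (Base.get_extension returns nothing when the extension has not loaded):

    using AMDGPU, NNlib
    Base.get_extension(NNlib, :AMDGPUExt) === nothing || @info "AMDGPUExt is active"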

docs/src/index.md

Lines changed: 1 addition & 0 deletions

@@ -5,3 +5,4 @@
 For use with automatic differentiation, this package defines gradients using [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl). These will be seen by various packages including [Zygote.jl](https://github.com/FluxML/Zygote.jl).
 
 To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) you will need [NNlibCUDA.jl](https://github.com/FluxML/NNlibCUDA.jl) as well.
+For [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) you will need to load it and NNlib in the same Julia session.
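A minimal usage sketch of that sentence, assuming a working ROCm setup (array sizes are arbitrary; `flipped=true` requests the cross-correlation that MIOpen requires, see ext/AMDGPUExt/conv.jl below):

    using AMDGPU, NNlib             # loading both activates the extension

    x = AMDGPU.rand(Float32, 16, 16, 3, 4)    # WHCN batch on the GPU
    w = AMDGPU.rand(Float32, 3, 3, 3, 8)      # 3x3 kernel, 3 -> 8 channels
    y = NNlib.conv(x, w; flipped=true)        # runs via MIOpen
    a = NNlib.relu.(y)                        # broadcast intercepted by the extension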

ext/AMDGPUExt/AMDGPUExt.jl

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
+module AMDGPUExt
+
+using Adapt
+using AMDGPU
+using AMDGPU.MIOpen
+using ChainRulesCore
+using NNlib
+using NNlib: BatchedAdjoint, BatchedTranspose, BatchedAdjOrTrans
+using NNlib: DenseConvDims, PoolDims
+
+const MIOPENFloat = Union{Float16, Float32}
+
+const ROCBatchedAdjoint{T} = BatchedAdjoint{T, <: ROCArray{T}}
+const ROCBatchedTranspose{T} = BatchedTranspose{T, <: ROCArray{T}}
+const ROCBatchedAdjOrTrans{T} = Union{ROCBatchedAdjoint{T}, ROCBatchedTranspose{T}}
+const WrappedROCBatchedAdjOrTrans{T, N} = Adapt.WrappedArray{T, N, ROCBatchedAdjOrTrans{T}, ROCBatchedAdjOrTrans{T}}
+const AnyROCBatchedAdjOrTrans = Union{ROCBatchedAdjOrTrans, WrappedROCBatchedAdjOrTrans}
+
+function Base.convert(::Type{T}, b::AnyROCBatchedAdjOrTrans) where {T <: Array}
+    Base.convert(T, adapt(Array, b))
+end
+
+function Base.Array{T, N}(b::AnyROCBatchedAdjOrTrans) where {T, N}
+    Array{T, N}(adapt(Array, b))
+end
+
+Base.collect(b::AnyROCBatchedAdjOrTrans) = collect(adapt(Array, b))
+
+function Base.show(
+    io::IO, mime::MIME{Symbol("text/plain")}, x::AnyROCBatchedAdjOrTrans,
+)
+    show(io, mime, adapt(Array, x))
+end
+
+Base.show(io::IO, x::AnyROCBatchedAdjOrTrans) = show(io, adapt(Array, x))
+
+Base.display(x::AnyROCBatchedAdjOrTrans) = display(adapt(Array, x))
+
+function NNlib._batched_gemm!(
+    ::Type{<: ROCArray}, transA::Char, transB::Char, α, A, B, β, C,
+)
+    AMDGPU.rocBLAS.gemm_batched!(transA, transB, α, A, B, β, C)
+end
+
+function nnlib_padding(dims)
+    pd = NNlib.padding(dims)
+    if !all(pd[1:2:end] .== pd[2:2:end])
+        @warn """
+        MIOpen does not support asymmetric padding, defaulting to symmetric choice:
+        $pd -> $(pd[1:2:end]).
+        """ maxlog=1
+    end
+    pd[1:2:end]
+end
+
+include("conv.jl")
+include("pool.jl")
+include("softmax.jl")
+include("activations.jl")
+
+end
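A small illustration of the nnlib_padding fallback defined above; the shapes and padding values are invented for the example, while DenseConvDims and NNlib.padding are existing NNlib API:

    using NNlib

    x, w = rand(Float32, 8, 8, 1, 1), rand(Float32, 3, 3, 1, 1)
    cdims = DenseConvDims(x, w; padding=(1, 2, 1, 2), flipkernel=true)

    NNlib.padding(cdims)   # (1, 2, 1, 2) -- asymmetric (lo, hi) per spatial dim
    # nnlib_padding(cdims) warns once, then returns the low-side pads (1, 1)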

ext/AMDGPUExt/activations.jl

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+for (f, op) in [
+    NNlib.relu => MIOpen.relu,
+    NNlib.relu6 => x -> MIOpen.clippedrelu(x, 6),
+    NNlib.softplus => MIOpen.softrelu,
+    NNlib.σ => MIOpen.sigmoid,
+    Base.tanh => MIOpen.tanh,
+    # TODO define for leakyrelu, elu, etc.?
+]
+    @eval function Base.materialize(
+        bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{ROCArray{<:MIOPENFloat}}}
+    )
+        return $op(bc.args[1])
+    end
+end
+
+Base.broadcasted(::typeof(identity), x::ROCArray{T}) where {T<:MIOPENFloat} = x
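As a usage note, the Base.materialize overloads above mean the familiar broadcast syntax transparently hits fused MIOpen kernels; a hedged sketch, assuming a ROCm device is available:

    x = AMDGPU.rand(Float32, 1024)

    NNlib.relu.(x)       # materializes through MIOpen.relu
    NNlib.relu6.(x)      # MIOpen.clippedrelu(x, 6)
    identity.(x) === x   # true: the no-op broadcast shortcut returns x itself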

ext/AMDGPUExt/conv.jl

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+function NNlib.conv!(
+    y::ROCArray{T, N}, x::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims,
+) where {T <: MIOPENFloat, N}
+    NNlib.flipkernel(cdims) || throw(ArgumentError(
+        "MIOpen supports only cross-correlation as its convolution implementation."))
+
+    nd = max(0, 4 - N)
+    ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
+    MIOpen.convolution!(
+        NNlib.insert_singleton_spatial_dimension(y, nd),
+        NNlib.insert_singleton_spatial_dimension(x, nd),
+        NNlib.insert_singleton_spatial_dimension(w, nd);
+        padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
+        dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
+    return y
+end
+
+function NNlib.∇conv_data!(
+    dx::ROCArray{T, N}, dy::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims,
+) where {T <: MIOPENFloat, N}
+    NNlib.flipkernel(cdims) || throw(ArgumentError(
+        "MIOpen supports only cross-correlation as its convolution implementation."))
+
+    nd = max(0, 4 - N)
+    ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
+    MIOpen.∇convolution_data!(
+        NNlib.insert_singleton_spatial_dimension(dx, nd),
+        NNlib.insert_singleton_spatial_dimension(dy, nd),
+        NNlib.insert_singleton_spatial_dimension(w, nd);
+        padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
+        dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
+    return dx
+end
+
+function NNlib.∇conv_filter!(
+    dw::ROCArray{T, N}, x::ROCArray{T, N}, dy::ROCArray{T, N}, cdims::DenseConvDims,
+) where {T <: MIOPENFloat, N}
+    NNlib.flipkernel(cdims) || throw(ArgumentError(
+        "MIOpen supports only cross-correlation as its convolution implementation."))
+
+    nd = max(0, 4 - N)
+    ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
+    MIOpen.∇convolution_weight!(
+        NNlib.insert_singleton_spatial_dimension(dw, nd),
+        NNlib.insert_singleton_spatial_dimension(dy, nd),
+        NNlib.insert_singleton_spatial_dimension(x, nd);
+        padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
+        dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
+    return dw
+end
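The `nd = max(0, 4 - N)` bookkeeping above promotes 1D and 2D convolutions to the 4D layout MIOpen expects by inserting singleton spatial dimensions. A shape-only sketch using NNlib's internal helper:

    using NNlib

    x = rand(Float32, 8, 3, 2)      # one spatial dim: (W, C, N), so ndims == 3
    nd = max(0, 4 - ndims(x))       # one singleton dimension to insert
    size(NNlib.insert_singleton_spatial_dimension(x, nd))   # (8, 1, 3, 2): 4D for MIOpen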

ext/AMDGPUExt/pool.jl

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+for poolname in (:maxpool, :meanpool)
+    @eval function NNlib.$(poolname)(
+        x::ROCArray{T, N}, pdims::PoolDims,
+    ) where {T <: MIOPENFloat, N}
+        y = similar(x, NNlib.output_size(pdims)..., NNlib.channels_out(pdims), size(x, N))
+        nd = max(0, 4 - N)
+        npdims = NNlib.insert_singleton_spatial_dimension(pdims, nd)
+        MIOpen.$(Symbol("$(poolname)!"))(
+            NNlib.insert_singleton_spatial_dimension(y, nd),
+            NNlib.insert_singleton_spatial_dimension(x, nd);
+            dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims),
+            stride=NNlib.stride(npdims), do_backward=false)
+        return y
+    end
+
+    @eval function ChainRulesCore.rrule(
+        ::typeof(NNlib.$(poolname)), x::ROCArray{T, N}, pdims::PoolDims,
+    ) where {T <: MIOPENFloat, N}
+        y = similar(x, NNlib.output_size(pdims)..., NNlib.channels_out(pdims), size(x, N))
+        nd = max(0, 4 - N)
+        npdims = NNlib.insert_singleton_spatial_dimension(pdims, nd)
+
+        # `workspace` is used in the pullback.
+        _, workspace = MIOpen.$(Symbol("$(poolname)!"))(
+            NNlib.insert_singleton_spatial_dimension(y, nd),
+            NNlib.insert_singleton_spatial_dimension(x, nd);
+            dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims),
+            stride=NNlib.stride(npdims))
+
+        function _pooling_pullback(Δ)
+            dx = similar(x)
+            MIOpen.$(Symbol("∇$(poolname)!"))(
+                NNlib.insert_singleton_spatial_dimension(dx, nd),
+                NNlib.insert_singleton_spatial_dimension(unthunk(Δ), nd),
+                NNlib.insert_singleton_spatial_dimension(y, nd),
+                NNlib.insert_singleton_spatial_dimension(x, nd);
+                dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims),
+                stride=NNlib.stride(npdims), workspace)
+            return NoTangent(), dx, NoTangent()
+        end
+        y, _pooling_pullback
+    end
+end
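The rrule above keeps the MIOpen workspace from the forward pass and reuses it in the pullback. It can be exercised directly through ChainRulesCore; a minimal sketch (shapes arbitrary, ROCm assumed):

    using AMDGPU, ChainRulesCore, NNlib

    x = AMDGPU.rand(Float32, 8, 8, 3, 2)
    pdims = PoolDims(x, 2)                    # 2x2 window, stride 2

    y, pullback = ChainRulesCore.rrule(NNlib.maxpool, x, pdims)
    _, dx, _ = pullback(AMDGPU.ones(Float32, size(y)))   # gradient w.r.t. x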

ext/AMDGPUExt/softmax.jl

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+for fname in (:softmax, :logsoftmax)
+    @eval function NNlib.$(fname)(x::ROCArray{T}; dims = 1) where T <: MIOPENFloat
+        MIOpen.$(fname)(x; dims)
+    end
+
+    @eval function NNlib.$(Symbol("∇$(fname)"))(
+        dy::ROCArray{T, N}, x::ROCArray{T, N}, y::ROCArray{T, N}; dims = 1,
+    ) where {T <: MIOPENFloat, N}
+        MIOpen.$(Symbol("∇$(fname)!"))(dy, y; dims)
+    end
+end
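A quick sanity check of the forward path (ROCm assumed): softmax columns should sum to one.

    x = AMDGPU.rand(Float32, 10, 4)
    y = NNlib.softmax(x; dims=1)                     # routed to MIOpen.softmax
    collect(sum(y; dims=1)) ≈ ones(Float32, 1, 4)    # true up to rounding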

test/amd/activations.jl

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+@testset "Compare CPU & GPU" begin
+    for (T, atol) in ((Float16, 1f-2), (Float32, 1f-5))
+        x = randn(T, 16)
+        gputest(x -> NNlib.relu.(x), x; atol)
+        gputest(x -> NNlib.relu6.(x), x; atol)
+        gputest(x -> NNlib.softplus.(x), x; atol)
+        gputest(x -> tanh.(x), x; atol)
+        gputest(x -> identity.(x), x; atol)
+    end
+end
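The `gputest` helper is defined in the test utilities, which this diff excerpt does not include; a hypothetical sketch of what such a helper does:

    # Hypothetical reconstruction, not the actual helper from the PR.
    function gputest(f, xs...; atol=1f-5)
        cpu_out = f(xs...)               # reference result on the CPU
        gpu_out = f(ROCArray.(xs)...)    # same computation on ROCArrays
        @test collect(gpu_out) ≈ cpu_out atol=atol
    end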

test/amd/batched_mul.jl

Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+@testset "batched_mul" begin
+    A = rand(Float32, 3, 3, 2)
+    B = rand(Float32, 3, 3, 2)
+    dA, dB = ROCArray.((A, B))
+
+    C = batched_mul(A, B)
+    @test ROCArray(C) ≈ batched_mul(dA, dB)
+
+    Ct = batched_mul(batched_transpose(A), B)
+    @test ROCArray(Ct) ≈ batched_mul(batched_transpose(dA), dB)
+
+    Ca = batched_mul(A, batched_adjoint(B))
+    @test ROCArray(Ca) ≈ batched_mul(dA, batched_adjoint(dB))
+
+    # 5-arg batched_mul!
+    C .= pi
+    batched_mul!(C, A, B, 2f0, 3f0)
+    Cpi = ROCArray(similar(C)) .= pi
+    @test ROCArray(C) ≈ batched_mul!(Cpi, dA, dB, 2f0, 3f0)
+
+    # PermutedDimsArray
+    @test ROCArray(Ct) ≈ batched_mul(PermutedDimsArray(dA, (2, 1, 3)), dB)
+
+    # FIXME same but with (1, 3, 2) errors
+    D = permutedims(B, (2, 1, 3))
+    Cp = batched_mul(batched_adjoint(A), B)
+    @test ROCArray(Cp) ≈ batched_mul(
+        batched_adjoint(dA), PermutedDimsArray(ROCArray(D), (2, 1, 3)))
+
+    # Methods which reshape
+    M = randn(Float32, 3, 3)
+    Cm = batched_mul(A, M)
+    @test ROCArray(Cm) ≈ batched_mul(dA, ROCArray(M))
+end

0 commit comments
