diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index a047715b9..61fe79362 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -55,6 +55,34 @@ steps:
     if: build.pull_request.labels includes "benchmark"
     timeout_in_minutes: 30

+  - label: "AMDGPU - Julia 1.9 - No Artifacts"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: 1.9-nightly
+      - JuliaCI/julia-test#v1:
+      - JuliaCI/julia-coverage#v1:
+          codecov: true
+          dirs:
+            - src
+            - ext
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    command:
+      - julia -e """
+          using TOML;
+          conf = TOML.parse(read(\"Project.toml\", String));
+          push!(conf[\"targets\"][\"test\"], \"AMDGPU\");
+          open(io -> TOML.print(io, conf), \"Project.toml\", \"w\");
+        """
+    timeout_in_minutes: 30
+    env:
+      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
+      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
+      JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
+      NNLIB_TEST_AMDGPU: true
+
   # - label: "GPU julia nightly"
   #   plugins:
   #     - JuliaCI/julia#v1:
diff --git a/Project.toml b/Project.toml
index 753a0faf2..1e2d73edf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,13 +11,21 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

+[weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+
+[extensions]
+AMDGPUExt = "AMDGPU"
+
 [compat]
+AMDGPU = "0.4.7"
 Adapt = "2, 3.2"
 ChainRulesCore = "1.13"
 Requires = "0.5, 1.0"
 julia = "1.6"

 [extras]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
diff --git a/docs/src/index.md b/docs/src/index.md
index 0eea8ddbb..91adcee0c 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -5,3 +5,4 @@
 For use with automatic differentiation, this package defines gradients using [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl). These will be seen by various packages including [Zygote.jl](https://github.com/FluxML/Zygote.jl).

 To use these functions with [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) you will need [NNlibCUDA.jl](https://github.com/FluxML/NNlibCUDA.jl) as well.
+For [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) you will need to load it and NNlib in the same Julia session.
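Not part of the patch, but to illustrate the docs note above: a minimal usage sketch assuming Julia 1.9+, where loading AMDGPU.jl and NNlib in the same session activates the `AMDGPUExt` extension (the array sizes and the `flipkernel` choice here are illustrative only):

```julia
using AMDGPU, NNlib   # loading both together activates AMDGPUExt

x = ROCArray(rand(Float32, 16, 16, 3, 1))    # WHCN image batch on the GPU
w = ROCArray(rand(Float32, 3, 3, 3, 4))      # 3x3 kernel, 3 input => 4 output channels
cdims = DenseConvDims(x, w; flipkernel=true) # MIOpen supports cross-correlation only
y = NNlib.conv(x, w, cdims)                  # dispatches to MIOpen via the extension
```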
diff --git a/ext/AMDGPUExt/AMDGPUExt.jl b/ext/AMDGPUExt/AMDGPUExt.jl
new file mode 100644
index 000000000..f141c6f8a
--- /dev/null
+++ b/ext/AMDGPUExt/AMDGPUExt.jl
@@ -0,0 +1,61 @@
+module AMDGPUExt
+
+using Adapt
+using AMDGPU
+using AMDGPU.MIOpen
+using ChainRulesCore
+using NNlib
+using NNlib: BatchedAdjoint, BatchedTranspose, BatchedAdjOrTrans
+using NNlib: DenseConvDims, PoolDims
+
+const MIOPENFloat = Union{Float16, Float32}
+
+const ROCBatchedAdjoint{T} = BatchedAdjoint{T, <: ROCArray{T}}
+const ROCBatchedTranspose{T} = BatchedTranspose{T, <: ROCArray{T}}
+const ROCBatchedAdjOrTrans{T} = Union{ROCBatchedAdjoint{T}, ROCBatchedTranspose{T}}
+const WrappedROCBatchedAdjOrTrans{T, N} = Adapt.WrappedArray{T, N, ROCBatchedAdjOrTrans{T}, ROCBatchedAdjOrTrans{T}}
+const AnyROCBatchedAdjOrTrans = Union{ROCBatchedAdjOrTrans, WrappedROCBatchedAdjOrTrans}
+
+function Base.convert(::Type{T}, b::AnyROCBatchedAdjOrTrans) where {T <: Array}
+    Base.convert(T, adapt(Array, b))
+end
+
+function Base.Array{T, N}(b::AnyROCBatchedAdjOrTrans) where {T, N}
+    Array{T, N}(adapt(Array, b))
+end
+
+Base.collect(b::AnyROCBatchedAdjOrTrans) = collect(adapt(Array, b))
+
+function Base.show(
+    io::IO, mime::MIME{Symbol("text/plain")}, x::AnyROCBatchedAdjOrTrans,
+)
+    show(io, mime, adapt(Array, x))
+end
+
+Base.show(io::IO, x::AnyROCBatchedAdjOrTrans) = show(io, adapt(Array, x))
+
+Base.display(x::AnyROCBatchedAdjOrTrans) = display(adapt(Array, x))
+
+function NNlib._batched_gemm!(
+    ::Type{<: ROCArray}, transA::Char, transB::Char, α, A, B, β, C,
+)
+    AMDGPU.rocBLAS.gemm_batched!(transA, transB, α, A, B, β, C)
+end
+
+function nnlib_padding(dims)
+    pd = NNlib.padding(dims)
+    if !all(pd[1:2:end] .== pd[2:2:end])
+        @warn """
+        MIOpen does not support asymmetric padding, defaulting to symmetric choice:
+        $pd -> $(pd[1:2:end]).
+        """ maxlog=1
+    end
+    pd[1:2:end]
+end
+
+include("conv.jl")
+include("pool.jl")
+include("softmax.jl")
+include("activations.jl")
+
+end
diff --git a/ext/AMDGPUExt/activations.jl b/ext/AMDGPUExt/activations.jl
new file mode 100644
index 000000000..1563bb45e
--- /dev/null
+++ b/ext/AMDGPUExt/activations.jl
@@ -0,0 +1,16 @@
+for (f, op) in [
+    NNlib.relu => MIOpen.relu,
+    NNlib.relu6 => x -> MIOpen.clippedrelu(x, 6),
+    NNlib.softplus => MIOpen.softrelu,
+    NNlib.σ => MIOpen.sigmoid,
+    Base.tanh => MIOpen.tanh,
+    # TODO define for leakyrelu, elu, etc.?
+]
+    @eval function Base.materialize(
+        bc::Broadcast.Broadcasted{<:Any,<:Any,typeof($f),<:Tuple{ROCArray{<:MIOPENFloat}}}
+    )
+        return $op(bc.args[1])
+    end
+end
+
+Base.broadcasted(::typeof(identity), x::ROCArray{T}) where {T<:MIOPENFloat} = x
diff --git a/ext/AMDGPUExt/conv.jl b/ext/AMDGPUExt/conv.jl
new file mode 100644
index 000000000..b0cebff87
--- /dev/null
+++ b/ext/AMDGPUExt/conv.jl
@@ -0,0 +1,50 @@
+function NNlib.conv!(
+    y::ROCArray{T, N}, x::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims,
+) where {T <: MIOPENFloat, N}
+    NNlib.flipkernel(cdims) || throw(ArgumentError(
+        "MIOpen supports only cross-correlation as its convolution implementation."))
+
+    nd = max(0, 4 - N)
+    ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
+    MIOpen.convolution!(
+        NNlib.insert_singleton_spatial_dimension(y, nd),
+        NNlib.insert_singleton_spatial_dimension(x, nd),
+        NNlib.insert_singleton_spatial_dimension(w, nd);
+        padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
+        dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
+    return y
+end
+
+function NNlib.∇conv_data!(
+    dx::ROCArray{T, N}, dy::ROCArray{T, N}, w::ROCArray{T, N}, cdims::DenseConvDims,
+) where {T <: MIOPENFloat, N}
+    NNlib.flipkernel(cdims) || throw(ArgumentError(
+        "MIOpen supports only cross-correlation as its convolution implementation."))
+
+    nd = max(0, 4 - N)
+    ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
+    MIOpen.∇convolution_data!(
+        NNlib.insert_singleton_spatial_dimension(dx, nd),
+        NNlib.insert_singleton_spatial_dimension(dy, nd),
+        NNlib.insert_singleton_spatial_dimension(w, nd);
+        padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
+        dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
+    return dx
+end
+
+function NNlib.∇conv_filter!(
+    dw::ROCArray{T, N}, x::ROCArray{T, N}, dy::ROCArray{T, N}, cdims::DenseConvDims,
+) where {T <: MIOPENFloat, N}
+    NNlib.flipkernel(cdims) || throw(ArgumentError(
+        "MIOpen supports only cross-correlation as its convolution implementation."))
+
+    nd = max(0, 4 - N)
+    ncdims = NNlib.insert_singleton_spatial_dimension(cdims, nd)
+    MIOpen.∇convolution_weight!(
+        NNlib.insert_singleton_spatial_dimension(dw, nd),
+        NNlib.insert_singleton_spatial_dimension(dy, nd),
+        NNlib.insert_singleton_spatial_dimension(x, nd);
+        padding=nnlib_padding(ncdims), stride=NNlib.stride(ncdims),
+        dilation=NNlib.dilation(ncdims), groups=NNlib.groupcount(ncdims))
+    return dw
+end
diff --git a/ext/AMDGPUExt/pool.jl b/ext/AMDGPUExt/pool.jl
new file mode 100644
index 000000000..5549bab1c
--- /dev/null
+++ b/ext/AMDGPUExt/pool.jl
@@ -0,0 +1,43 @@
+for poolname in (:maxpool, :meanpool)
+    @eval function NNlib.$(poolname)(
+        x::ROCArray{T, N}, pdims::PoolDims,
+    ) where {T <: MIOPENFloat, N}
+        y = similar(x, NNlib.output_size(pdims)..., NNlib.channels_out(pdims), size(x, N))
+        nd = max(0, 4 - N)
+        npdims = NNlib.insert_singleton_spatial_dimension(pdims, nd)
+        MIOpen.$(Symbol("$(poolname)!"))(
+            NNlib.insert_singleton_spatial_dimension(y, nd),
+            NNlib.insert_singleton_spatial_dimension(x, nd);
+            dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims),
+            stride=NNlib.stride(npdims), do_backward=false)
+        return y
+    end
+
+    @eval function ChainRulesCore.rrule(
+        ::typeof(NNlib.$(poolname)), x::ROCArray{T, N}, pdims::PoolDims,
+    ) where {T <: MIOPENFloat, N}
+        y = similar(x, NNlib.output_size(pdims)..., NNlib.channels_out(pdims), size(x, N))
+        nd = max(0, 4 - N)
+        npdims = NNlib.insert_singleton_spatial_dimension(pdims, nd)
+
+        # `workspace` is used in the pullback.
+        _, workspace = MIOpen.$(Symbol("$(poolname)!"))(
+            NNlib.insert_singleton_spatial_dimension(y, nd),
+            NNlib.insert_singleton_spatial_dimension(x, nd);
+            dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims),
+            stride=NNlib.stride(npdims))
+
+        function _pooling_pullback(Δ)
+            dx = similar(x)
+            MIOpen.$(Symbol("∇$(poolname)!"))(
+                NNlib.insert_singleton_spatial_dimension(dx, nd),
+                NNlib.insert_singleton_spatial_dimension(unthunk(Δ), nd),
+                NNlib.insert_singleton_spatial_dimension(y, nd),
+                NNlib.insert_singleton_spatial_dimension(x, nd);
+                dims=NNlib.kernel_size(npdims), padding=nnlib_padding(npdims),
+                stride=NNlib.stride(npdims), workspace)
+            return NoTangent(), dx, NoTangent()
+        end
+        y, _pooling_pullback
+    end
+end
diff --git a/ext/AMDGPUExt/softmax.jl b/ext/AMDGPUExt/softmax.jl
new file mode 100644
index 000000000..de75f9748
--- /dev/null
+++ b/ext/AMDGPUExt/softmax.jl
@@ -0,0 +1,11 @@
+for fname in (:softmax, :logsoftmax)
+    @eval function NNlib.$(fname)(x::ROCArray{T}; dims = 1) where T <: MIOPENFloat
+        MIOpen.$(fname)(x; dims)
+    end
+
+    @eval function NNlib.$(Symbol("∇$(fname)"))(
+        dy::ROCArray{T, N}, x::ROCArray{T, N}, y::ROCArray{T, N}; dims = 1,
+    ) where {T <: MIOPENFloat, N}
+        MIOpen.$(Symbol("∇$(fname)!"))(dy, y; dims)
+    end
+end
diff --git a/test/amd/activations.jl b/test/amd/activations.jl
new file mode 100644
index 000000000..2abb0c272
--- /dev/null
+++ b/test/amd/activations.jl
@@ -0,0 +1,10 @@
+@testset "Compare CPU & GPU" begin
+    for (T, atol) in ((Float16, 1f-2), (Float32, 1f-5))
+        x = randn(T, 16)
+        gputest(x -> NNlib.relu.(x), x; atol)
+        gputest(x -> NNlib.relu6.(x), x; atol)
+        gputest(x -> NNlib.softplus.(x), x; atol)
+        gputest(x -> tanh.(x), x; atol)
+        gputest(x -> identity.(x), x; atol)
+    end
+end
diff --git a/test/amd/batched_mul.jl b/test/amd/batched_mul.jl
new file mode 100644
index 000000000..bc9dae899
--- /dev/null
+++ b/test/amd/batched_mul.jl
@@ -0,0 +1,34 @@
+@testset "batched_mul" begin
+    A = rand(Float32, 3, 3, 2)
+    B = rand(Float32, 3, 3, 2)
+    dA, dB = ROCArray.((A, B))
+
+    C = batched_mul(A, B)
+    @test ROCArray(C) ≈ batched_mul(dA, dB)
+
+    Ct = batched_mul(batched_transpose(A), B)
+    @test ROCArray(Ct) ≈ batched_mul(batched_transpose(dA), dB)
+
+    Ca = batched_mul(A, batched_adjoint(B))
+    @test ROCArray(Ca) ≈ batched_mul(dA, batched_adjoint(dB))
+
+    # 5-arg batched_mul!
+    C .= pi
+    batched_mul!(C, A, B, 2f0, 3f0)
+    Cpi = ROCArray(similar(C)) .= pi
+    @test ROCArray(C) ≈ batched_mul!(Cpi, dA, dB, 2f0, 3f0)
+
+    # PermutedDimsArray
+    @test ROCArray(Ct) ≈ batched_mul(PermutedDimsArray(dA, (2, 1, 3)), dB)
+
+    # FIXME same but with (1, 3, 2) errors
+    D = permutedims(B, (2, 1, 3))
+    Cp = batched_mul(batched_adjoint(A), B)
+    @test ROCArray(Cp) ≈ batched_mul(
+        batched_adjoint(dA), PermutedDimsArray(ROCArray(D), (2, 1, 3)))
+
+    # Methods which reshape
+    M = randn(Float32, 3, 3)
+    Cm = batched_mul(A, M)
+    @test ROCArray(Cm) ≈ batched_mul(dA, ROCArray(M))
+end
diff --git a/test/amd/batched_repr.jl b/test/amd/batched_repr.jl
new file mode 100644
index 000000000..dfdbc558b
--- /dev/null
+++ b/test/amd/batched_repr.jl
@@ -0,0 +1,43 @@
+function print_array_strs(x)
+    str = sprint((io, x)->show(io, MIME"text/plain"(), x), x)
+    return @view split(str, '\n')[2:end]
+end
+
+@testset "BatchedAdjOrTrans" begin
+    x = rand(Float32, 3, 4, 2)
+    y = ROCArray(x)
+
+    bax = batched_adjoint(x)
+    btx = batched_transpose(x)
+    bay = batched_adjoint(y)
+    bty = batched_transpose(y)
+
+    @test sprint(show, bax) == sprint(show, bay)
+    @test sprint(show, btx) == sprint(show, bty)
+
+    @test print_array_strs(bax) == print_array_strs(bay)
+    @test print_array_strs(btx) == print_array_strs(bty)
+
+    @test Array(bax) == Array(bay)
+    @test collect(bax) == collect(bay)
+    @test Array(btx) == Array(bty)
+    @test collect(btx) == collect(bty)
+
+    for shape in (:, (12, 2))
+        rbax = reshape(bax, shape)
+        rbtx = reshape(btx, shape)
+        rbay = reshape(bay, shape)
+        rbty = reshape(bty, shape)
+
+        @test sprint(show, rbax) == sprint(show, rbay)
+        @test sprint(show, rbtx) == sprint(show, rbty)
+
+        @test print_array_strs(rbax) == print_array_strs(rbay)
+        @test print_array_strs(rbtx) == print_array_strs(rbty)
+
+        @test Array(rbax) == Array(rbay)
+        @test collect(rbax) == collect(rbay)
+        @test Array(rbtx) == Array(rbty)
+        @test collect(rbtx) == collect(rbty)
+    end
+end
diff --git a/test/amd/conv.jl b/test/amd/conv.jl
new file mode 100644
index 000000000..b6be3fd39
--- /dev/null
+++ b/test/amd/conv.jl
@@ -0,0 +1,9 @@
+@testset "Compare CPU & GPU" begin
+    channels, batch = 3, 2
+    for T in (Float16, Float32), nd in (1, 2, 3)
+        x = rand(Float32, fill(4, nd)..., 3, 1)
+        w = rand(Float32, fill(2, nd)..., channels, 4)
+        cdims = DenseConvDims(x, w, flipkernel=true)
+        gputest((x, w) -> NNlib.conv(x, w, cdims), x, w; atol=1e-4)
+    end
+end
diff --git a/test/amd/pool.jl b/test/amd/pool.jl
new file mode 100644
index 000000000..c32f67298
--- /dev/null
+++ b/test/amd/pool.jl
@@ -0,0 +1,11 @@
+@testset "Compare CPU & GPU" begin
+    channels, batch = 3, 2
+    for T in (Float16, Float32), nd in (1, 2, 3)
+        x = rand(T, fill(8, nd)..., channels, batch)
+        pdims = PoolDims(x, 2)
+        # NOTE: Disable grad check for maxpool as *sometimes*
+        # it does not *completely* agree with CPU :/
+        gputest(x -> NNlib.maxpool(x, pdims), x; checkgrad=false)
+        gputest(x -> NNlib.meanpool(x, pdims), x)
+    end
+end
diff --git a/test/amd/runtests.jl b/test/amd/runtests.jl
new file mode 100644
index 000000000..fd15e6274
--- /dev/null
+++ b/test/amd/runtests.jl
@@ -0,0 +1,54 @@
+using NNlib: batched_adjoint, batched_mul, batched_mul!, batched_transpose
+using NNlib: is_strided, storage_type
+using LinearAlgebra
+
+AMDGPU.allowscalar(false)
+
+function gputest(f, xs...; checkgrad=true, atol=1e-6, kws...)
+    cpu_in = xs
+    gpu_in = ROCArray.(xs)
+
+    cpu_out = f(cpu_in...; kws...)
+    gpu_out = f(gpu_in...; kws...)
+    @test collect(cpu_out) ≈ collect(gpu_out)
+
+    if checkgrad
+        cpu_grad = gradient((x...) -> sum(f(x...; kws...)), cpu_in...)
+        gpu_grad = gradient((x...) -> sum(f(x...; kws...)), gpu_in...)
+        for (cpu_g, gpu_g) in zip(cpu_grad, gpu_grad)
+            if cpu_g === nothing
+                @test gpu_g === nothing
+            else
+                @test collect(cpu_g) ≈ collect(gpu_g) atol=atol
+            end
+        end
+    end
+end
+
+@testset "Storage types" begin
+    include("storage_type.jl")
+end
+
+@testset "Batched repr" begin
+    include("batched_repr.jl")
+end
+
+@testset "Batched multiplication" begin
+    include("batched_mul.jl")
+end
+
+@testset "Convolution" begin
+    include("conv.jl")
+end
+
+@testset "Pooling" begin
+    include("pool.jl")
+end
+
+@testset "Softmax" begin
+    include("softmax.jl")
+end
+
+@testset "Activations" begin
+    include("activations.jl")
+end
diff --git a/test/amd/softmax.jl b/test/amd/softmax.jl
new file mode 100644
index 000000000..cd8545223
--- /dev/null
+++ b/test/amd/softmax.jl
@@ -0,0 +1,17 @@
+@testset "Compare CPU & GPU" begin
+    for (T, atol) in ((Float16, 1f-2), (Float32, 1f-5))
+        for (sz, dims) in [
+            ((5,), :), ((5,), 1),
+            ((5, 5), :), ((5, 5), 1), ((5, 5), 2),
+            ((5, 5, 5, 5), (2, 3)), ((5, 5, 5, 5), (2, 4)),
+        ]
+            if T == Float16
+                x = ones(T, sz) # Really low precision.
+            else
+                x = randn(T, sz)
+            end
+            gputest(NNlib.softmax, x; atol)
+            gputest(NNlib.logsoftmax, x; atol)
+        end
+    end
+end
diff --git a/test/amd/storage_type.jl b/test/amd/storage_type.jl
new file mode 100644
index 000000000..d884ddd7f
--- /dev/null
+++ b/test/amd/storage_type.jl
@@ -0,0 +1,13 @@
+@testset "NNlib storage type" begin
+    x = ROCArray(ones(Float32, 10, 10))
+    @test storage_type(x) <: ROCArray{Float32, 2}
+    @test storage_type(reshape(view(x, 1:2:10, :), 10, :)) <: ROCArray{Float32, 2}
+
+    @test is_strided(x)
+    @test is_strided(view(x, 1:2:5, :))
+    @test is_strided(PermutedDimsArray(x, (2, 1)))
+
+    @test !is_strided(reshape(view(x, 1:2:10, :), 10, :))
+    @test !is_strided((x .+ im)')
+    @test !is_strided(Diagonal(ROCArray(ones(3))))
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 357b96f4a..e4f2f518a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -26,6 +26,19 @@ include("test_utils.jl")
         @info "Insufficient version or CUDA not found; Skipping CUDA tests"
     end
 
+    if get(ENV, "NNLIB_TEST_AMDGPU", "false") == "true"
+        using AMDGPU
+        if AMDGPU.functional() && AMDGPU.functional(:MIOpen)
+            @testset "AMDGPU" begin
+                include("amd/runtests.jl")
+            end
+        else
+            @info "AMDGPU.jl package is not functional. Skipping AMDGPU tests."
+        end
+    else
+        @info "Skipping AMDGPU tests, set NNLIB_TEST_AMDGPU=true to run them."
+    end
+
     if VERSION < v"1.6"
         @info "skipping doctests, on Julia $VERSION"
     else
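Not part of the patch: a sketch of how the new test gate could be exercised locally, mirroring the Buildkite step above (the same `Project.toml` edit that adds AMDGPU to the test target, then running the tests with `NNLIB_TEST_AMDGPU` set so `test/runtests.jl` includes `test/amd/runtests.jl`). The `withenv`/`Pkg.test` invocation is an assumed workflow, not something the patch specifies:

```julia
using TOML

# Add AMDGPU to the [targets] test list, as the Buildkite `command` does.
conf = TOML.parse(read("Project.toml", String))
"AMDGPU" in conf["targets"]["test"] || push!(conf["targets"]["test"], "AMDGPU")
open(io -> TOML.print(io, conf), "Project.toml", "w")

# Run the package tests with the env var that enables the AMDGPU test set.
using Pkg
withenv("NNLIB_TEST_AMDGPU" => "true") do
    Pkg.test("NNlib")
end
```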