Skip to content
This repository was archived by the owner on Sep 28, 2024. It is now read-only.

Commit 0c7ac83

Browse files
committed
Work around the AMDGPU issue
1 parent c09513c commit 0c7ac83

File tree

7 files changed

+33
-8
lines changed

7 files changed

+33
-8
lines changed

.buildkite/pipeline.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ steps:
77
test_args: "--quickfail"
88
- JuliaCI/julia-coverage#v1:
99
codecov: true
10+
dirs:
11+
- src
12+
- ext
1013
agents:
1114
queue: "juliagpu"
1215
cuda: "*"
@@ -27,6 +30,9 @@ steps:
2730
test_args: "--quickfail"
2831
- JuliaCI/julia-coverage#v1:
2932
codecov: true
33+
dirs:
34+
- src
35+
- ext
3036
env:
3137
JULIA_AMDGPU_CORE_MUST_LOAD: "1"
3238
JULIA_AMDGPU_HIP_MUST_LOAD: "1"

.github/workflows/CI.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ jobs:
4141
RETESTITEMS_NWORKERS: 4
4242
RETESTITEMS_NWORKER_THREADS: 2
4343
- uses: julia-actions/julia-processcoverage@v1
44+
with:
45+
directories: src,ext
4446
- uses: codecov/codecov-action@v4
4547
with:
4648
files: lcov.info

Project.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,14 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1616
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
1717
WeightInitializers = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d"
1818

19+
[weakdeps]
20+
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
21+
22+
[extensions]
23+
LuxNeuralOperatorsAMDGPUExt = "AMDGPU"
24+
1925
[compat]
26+
AMDGPU = "0.9.5"
2027
Aqua = "0.8.7"
2128
ArgCheck = "2.3.0"
2229
ChainRulesCore = "1.24.0"

ext/LuxNeuralOperatorsAMDGPUExt.jl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
module LuxNeuralOperatorsAMDGPUExt
2+
3+
using AMDGPU: AnyROCArray
4+
using LuxNeuralOperators: LuxNeuralOperators
5+
6+
# This should be upstreamed to NNlib before we release this package
7+
@inline function LuxNeuralOperators.__batched_mul(
8+
x::AnyROCArray{<:Union{ComplexF16, ComplexF32, ComplexF64}, 3},
9+
y::AnyROCArray{<:Union{ComplexF16, ComplexF32, ComplexF64}, 3})
10+
# FIXME: This is not good for performance but that is okay for now
11+
return stack(*, eachslice(x; dims=3), eachslice(y; dims=3))
12+
end
13+
14+
end

src/LuxNeuralOperators.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ using PrecompileTools: @recompile_invalidations
99
using FFTW: FFTW, irfft, rfft
1010
using Lux
1111
using LuxCore: LuxCore, AbstractExplicitLayer
12-
using NNlib: NNlib, batched_transpose, ⊠
12+
using NNlib: NNlib, ⊠
1313
using Random: Random, AbstractRNG
1414
using Reexport: @reexport
1515
end
@@ -21,6 +21,7 @@ const CRC = ChainRulesCore
2121
const True = Val(true)
2222
const False = Val(false)
2323

24+
include("utils.jl")
2425
include("transform.jl")
2526

2627
include("functional.jl")

src/functional.jl

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,6 @@ end
1717
return reshape(x_weighted, x_size[1:(N - 2)]..., size(x_weighted)[2:3]...)
1818
end
1919

20-
@inline function __apply_pattern_batched_mul(
21-
x::AbstractArray{T1, 3}, y::AbstractArray{T2, 3}) where {T1, T2}
22-
x_ = batched_transpose(x) # i x b x m
23-
res = y ⊠ x_ # o x b x m
24-
return batched_transpose(res) # m x o x b
25-
end
26-
2720
@inline __pad_modes(x, dims::Integer...) = __pad_modes(x, dims)
2821
@inline __pad_modes(x, dims::NTuple) = __pad_modes!(similar(x, dims), x)
2922

src/utils.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Temporarily capture certain calls like AMDGPU for ComplexFloats
2+
@inline __batched_mul(x, y) = x ⊠ y

0 commit comments

Comments (0)