
Commit 75d6546

Merge branch 'main' into refactor-cuda-offload-factorizations
2 parents 9c67b43 + a26b9c8

7 files changed: +153, −17 lines

Project.toml

Lines changed: 5 additions & 2 deletions
@@ -28,9 +28,9 @@ StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 
 [weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
 BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
-blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
 CUSOLVERRF = "a8cc9031-bad2-4722-94f5-40deabb4245c"
@@ -48,8 +48,10 @@ Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
 RecursiveFactorization = "f2c3362d-daeb-58d1-803e-2bc74f2840b4"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Sparspak = "e56a9233-b9d6-4f03-8d0f-1825330902ac"
+blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32"
 
 [extensions]
+LinearSolveAMDGPUExt = "AMDGPU"
 LinearSolveBLISExt = ["blis_jll", "LAPACK_jll"]
 LinearSolveBandedMatricesExt = "BandedMatrices"
 LinearSolveBlockDiagonalsExt = "BlockDiagonals"
@@ -71,12 +73,12 @@ LinearSolveSparseArraysExt = "SparseArrays"
 LinearSolveSparspakExt = ["SparseArrays", "Sparspak"]
 
 [compat]
+AMDGPU = "1"
 AllocCheck = "0.2"
 Aqua = "0.8"
 ArrayInterface = "7.7"
 BandedMatrices = "1.5"
 BlockDiagonals = "0.2"
-blis_jll = "0.9.0"
 CUDA = "5"
 CUDSS = "0.4"
 CUSOLVERRF = "0.2.6"
@@ -126,6 +128,7 @@ StaticArraysCore = "1.4.2"
 Test = "1"
 UnPack = "1"
 Zygote = "0.7"
+blis_jll = "0.9.0"
 julia = "1.10"
 
 [extras]
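The `[weakdeps]`/`[extensions]` wiring above makes AMDGPU.jl a weak dependency: `LinearSolveAMDGPUExt` is compiled and loaded only once both LinearSolve.jl and AMDGPU.jl are in the session. A minimal sketch of observing this from the REPL (assuming Julia ≥ 1.9, where package extensions and `Base.get_extension` are available):

```julia
using LinearSolve

# Before AMDGPU is loaded, the extension module does not exist yet.
Base.get_extension(LinearSolve, :LinearSolveAMDGPUExt)  # returns nothing

using AMDGPU  # triggers loading of LinearSolveAMDGPUExt

# Now the extension module is present and its solver methods are registered.
Base.get_extension(LinearSolve, :LinearSolveAMDGPUExt)  # returns the extension Module
```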

docs/src/solvers/solvers.md

Lines changed: 13 additions & 0 deletions
@@ -241,6 +241,19 @@ CudaOffloadLUFactorization
 CudaOffloadQRFactorization
 ```
 
+### AMDGPU.jl
+
+The following are GPU factorization routines for AMD GPUs using the ROCm stack.
+
+!!! note
+
+    Using these solvers requires adding the package AMDGPU.jl, i.e. `using AMDGPU`
+
+```@docs
+AMDGPUOffloadLUFactorization
+AMDGPUOffloadQRFactorization
+```
+
 ### CUSOLVERRF.jl
 
 !!! note

docs/src/tutorials/gpu.md

Lines changed: 17 additions & 6 deletions
@@ -39,18 +39,29 @@ sol.u
 This computation can be moved to the GPU by the following:
 
 ```julia
-using CUDA # Add the GPU library
+using CUDA # Add the GPU library for NVIDIA GPUs
 sol = LS.solve(prob, LS.CudaOffloadLUFactorization())
+# or
+sol = LS.solve(prob, LS.CudaOffloadQRFactorization())
 sol.u
 ```
 
-LinearSolve.jl provides two GPU offloading algorithms:
-- `CudaOffloadLUFactorization()` - Uses LU factorization (generally faster for well-conditioned problems)
-- `CudaOffloadQRFactorization()` - Uses QR factorization (more stable for ill-conditioned problems)
+For AMD GPUs, you can use the AMDGPU.jl package:
 
-!!! warning
-    The old `CudaOffloadFactorization()` is deprecated. Use `CudaOffloadLUFactorization()` or `CudaOffloadQRFactorization()` instead.
+```julia
+using AMDGPU # Add the GPU library for AMD GPUs
+sol = LS.solve(prob, LS.AMDGPUOffloadLUFactorization()) # LU factorization
+# or
+sol = LS.solve(prob, LS.AMDGPUOffloadQRFactorization()) # QR factorization
+sol.u
+```
 
+LinearSolve.jl provides multiple GPU offloading algorithms:
+- `CudaOffloadLUFactorization()` - Uses LU factorization on NVIDIA GPUs (generally faster for well-conditioned problems)
+- `CudaOffloadQRFactorization()` - Uses QR factorization on NVIDIA GPUs (more stable for ill-conditioned problems)
+- `AMDGPUOffloadLUFactorization()` - Uses LU factorization on AMD GPUs (generally faster for well-conditioned problems)
+- `AMDGPUOffloadQRFactorization()` - Uses QR factorization on AMD GPUs (more stable for ill-conditioned problems)
+-
 ## GPUArray Interface
 
 For more manual control over the factorization setup, you can use the
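Pulling the tutorial snippets above together, a minimal end-to-end sketch for the AMD path (assuming a ROCm-capable GPU and the `LS` alias for LinearSolve used earlier in the tutorial; the problem size and element type are illustrative):

```julia
import LinearSolve as LS
using AMDGPU

n = 1024
A = rand(Float32, n, n)  # dense CPU matrix; offloading pays off for large A
b = rand(Float32, n)
prob = LS.LinearProblem(A, b)

sol = LS.solve(prob, LS.AMDGPUOffloadLUFactorization())
sol.u  # solution vector, computed on the GPU and copied back to the CPU
```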

docs/src/tutorials/linear.md

Lines changed: 7 additions & 9 deletions
@@ -84,11 +84,10 @@ LinearSolve.jl specifically tests with the following cases:
 
 !!! note
 
-
-Choosing the most specific matrix structure that matches your specific system will give you the most performance.
-Thus if your matrix is symmetric, specifically building with `Symmetric(A)` will be faster than simply using `A`,
-and will generally lead to better automatic linear solver choices. Note that you can also choose the type for `b`,
-but generally a dense vector will be the fastest here and many solvers will not support a sparse `b`.
+    Choosing the most specific matrix structure that matches your specific system will give you the most performance.
+    Thus if your matrix is symmetric, specifically building with `Symmetric(A)` will be faster than simply using `A`,
+    and will generally lead to better automatic linear solver choices. Note that you can also choose the type for `b`,
+    but generally a dense vector will be the fastest here and many solvers will not support a sparse `b`.
 
 ## Using Matrix-Free Operators via SciMLOperators.jl
 
@@ -160,7 +159,6 @@ mfopA * sol.u - b
 
 !!! note
 
-
-Note that not all methods can use a matrix-free operator. For example, `LS.LUFactorization()` requires a matrix. If you use an
-invalid method, you will get an error. The methods particularly from KrylovJL are the ones preferred for these cases
-(and are defaulted to).
+    Note that not all methods can use a matrix-free operator. For example, `LS.LUFactorization()` requires a matrix. If you use an
+    invalid method, you will get an error. The methods particularly from KrylovJL are the ones preferred for these cases
+    (and are defaulted to).

ext/LinearSolveAMDGPUExt.jl

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+module LinearSolveAMDGPUExt
+
+using AMDGPU
+using LinearSolve: LinearSolve, LinearCache, AMDGPUOffloadLUFactorization,
+                   AMDGPUOffloadQRFactorization, init_cacheval, OperatorAssumptions
+using LinearSolve.LinearAlgebra, LinearSolve.SciMLBase
+
+# LU Factorization
+function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::AMDGPUOffloadLUFactorization;
+        kwargs...)
+    if cache.isfresh
+        fact = AMDGPU.rocSOLVER.getrf!(AMDGPU.ROCArray(cache.A))
+        cache.cacheval = fact
+        cache.isfresh = false
+    end
+
+    A_gpu, ipiv = cache.cacheval
+    b_gpu = AMDGPU.ROCArray(cache.b)
+
+    AMDGPU.rocSOLVER.getrs!('N', A_gpu, ipiv, b_gpu)
+
+    y = Array(b_gpu)
+    cache.u .= y
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+function LinearSolve.init_cacheval(alg::AMDGPUOffloadLUFactorization, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    AMDGPU.rocSOLVER.getrf!(AMDGPU.ROCArray(A))
+end
+
+# QR Factorization
+function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::AMDGPUOffloadQRFactorization;
+        kwargs...)
+    if cache.isfresh
+        A_gpu = AMDGPU.ROCArray(cache.A)
+        tau = AMDGPU.ROCVector{eltype(A_gpu)}(undef, min(size(A_gpu)...))
+        AMDGPU.rocSOLVER.geqrf!(A_gpu, tau)
+        cache.cacheval = (A_gpu, tau)
+        cache.isfresh = false
+    end
+
+    A_gpu, tau = cache.cacheval
+    b_gpu = AMDGPU.ROCArray(cache.b)
+
+    # Apply Q^T to b
+    AMDGPU.rocSOLVER.ormqr!('L', 'T', A_gpu, tau, b_gpu)
+
+    # Solve the upper triangular system
+    m, n = size(A_gpu)
+    AMDGPU.rocBLAS.trsv!('U', 'N', 'N', n, A_gpu, b_gpu)
+
+    y = Array(b_gpu[1:n])
+    cache.u .= y
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+function LinearSolve.init_cacheval(alg::AMDGPUOffloadQRFactorization, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    A_gpu = AMDGPU.ROCArray(A)
+    tau = AMDGPU.ROCVector{eltype(A_gpu)}(undef, min(size(A_gpu)...))
+    AMDGPU.rocSOLVER.geqrf!(A_gpu, tau)
+    (A_gpu, tau)
+end
+
+end
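For intuition, here is a standalone sketch of the LU offload pattern the extension implements, stripped of LinearSolve's cache machinery. It assumes a working ROCm setup and uses only the rocSOLVER calls that appear in the extension above:

```julia
using AMDGPU, LinearAlgebra

A = rand(100, 100)
b = rand(100)

# getrf! computes the pivoted LU factorization in place on the GPU and
# returns the factored matrix together with the pivot vector.
A_gpu, ipiv = AMDGPU.rocSOLVER.getrf!(AMDGPU.ROCArray(A))

# getrs! solves in place against the right-hand side using the stored factors.
b_gpu = AMDGPU.ROCArray(b)
AMDGPU.rocSOLVER.getrs!('N', A_gpu, ipiv, b_gpu)

# Copy the solution back to the CPU and sanity-check the residual.
x = Array(b_gpu)
@assert A * x ≈ b
```

The `solve!` methods above follow the same flow, with the factorization stashed in `cache.cacheval` so that repeated solves against new right-hand sides skip the refactorization.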

src/LinearSolve.jl

Lines changed: 1 addition & 0 deletions
@@ -257,6 +257,7 @@ export HYPREAlgorithm
 export CudaOffloadFactorization
 export CudaOffloadLUFactorization
 export CudaOffloadQRFactorization
+export AMDGPUOffloadLUFactorization, AMDGPUOffloadQRFactorization
 export MKLPardisoFactorize, MKLPardisoIterate
 export PanuaPardisoFactorize, PanuaPardisoIterate
 export PardisoJL

src/extension_algs.jl

Lines changed: 42 additions & 0 deletions
@@ -129,6 +129,48 @@ struct CudaOffloadFactorization <: AbstractFactorization
     end
 end
 
+"""
+`AMDGPUOffloadLUFactorization()`
+
+An offloading technique using LU factorization to GPU-accelerate CPU-based computations on AMD GPUs.
+Requires a sufficiently large `A` to overcome the data transfer costs.
+
+!!! note
+
+    Using this solver requires adding the package AMDGPU.jl, i.e. `using AMDGPU`
+"""
+struct AMDGPUOffloadLUFactorization <: LinearSolve.AbstractFactorization
+    function AMDGPUOffloadLUFactorization()
+        ext = Base.get_extension(@__MODULE__, :LinearSolveAMDGPUExt)
+        if ext === nothing
+            error("AMDGPUOffloadLUFactorization requires that AMDGPU is loaded, i.e. `using AMDGPU`")
+        else
+            return new{}()
+        end
+    end
+end
+
+"""
+`AMDGPUOffloadQRFactorization()`
+
+An offloading technique using QR factorization to GPU-accelerate CPU-based computations on AMD GPUs.
+Requires a sufficiently large `A` to overcome the data transfer costs.
+
+!!! note
+
+    Using this solver requires adding the package AMDGPU.jl, i.e. `using AMDGPU`
+"""
+struct AMDGPUOffloadQRFactorization <: LinearSolve.AbstractFactorization
+    function AMDGPUOffloadQRFactorization()
+        ext = Base.get_extension(@__MODULE__, :LinearSolveAMDGPUExt)
+        if ext === nothing
+            error("AMDGPUOffloadQRFactorization requires that AMDGPU is loaded, i.e. `using AMDGPU`")
+        else
+            return new{}()
+        end
+    end
+end
+
 ## RFLUFactorization
 
 """
