diff --git a/Project.toml b/Project.toml index dccba4d74..d6f63a783 100644 --- a/Project.toml +++ b/Project.toml @@ -28,9 +28,9 @@ StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" [weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" BandedMatrices = "aae01518-5342-5314-be14-df237901396f" BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0" -blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e" CUSOLVERRF = "a8cc9031-bad2-4722-94f5-40deabb4245c" @@ -48,8 +48,10 @@ Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2" RecursiveFactorization = "f2c3362d-daeb-58d1-803e-2bc74f2840b4" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Sparspak = "e56a9233-b9d6-4f03-8d0f-1825330902ac" +blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32" [extensions] +LinearSolveAMDGPUExt = "AMDGPU" LinearSolveBLISExt = ["blis_jll", "LAPACK_jll"] LinearSolveBandedMatricesExt = "BandedMatrices" LinearSolveBlockDiagonalsExt = "BlockDiagonals" @@ -71,12 +73,12 @@ LinearSolveSparseArraysExt = "SparseArrays" LinearSolveSparspakExt = ["SparseArrays", "Sparspak"] [compat] +AMDGPU = "1" AllocCheck = "0.2" Aqua = "0.8" ArrayInterface = "7.7" BandedMatrices = "1.5" BlockDiagonals = "0.2" -blis_jll = "0.9.0" CUDA = "5" CUDSS = "0.4" CUSOLVERRF = "0.2.6" @@ -126,6 +128,7 @@ StaticArraysCore = "1.4.2" Test = "1" UnPack = "1" Zygote = "0.7" +blis_jll = "0.9.0" julia = "1.10" [extras] diff --git a/ext/LinearSolveAMDGPUExt.jl b/ext/LinearSolveAMDGPUExt.jl new file mode 100644 index 000000000..df1ee0453 --- /dev/null +++ b/ext/LinearSolveAMDGPUExt.jl @@ -0,0 +1,68 @@ +module LinearSolveAMDGPUExt + +using AMDGPU +using LinearSolve: LinearSolve, LinearCache, AMDGPUOffloadLUFactorization, + AMDGPUOffloadQRFactorization, init_cacheval, OperatorAssumptions +using LinearSolve.LinearAlgebra, LinearSolve.SciMLBase + +# LU Factorization +function 
SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::AMDGPUOffloadLUFactorization; + kwargs...) + if cache.isfresh + fact = AMDGPU.rocSOLVER.getrf!(AMDGPU.ROCArray(cache.A)) + cache.cacheval = fact + cache.isfresh = false + end + + A_gpu, ipiv = cache.cacheval + b_gpu = AMDGPU.ROCArray(cache.b) + + AMDGPU.rocSOLVER.getrs!('N', A_gpu, ipiv, b_gpu) + + y = Array(b_gpu) + cache.u .= y + SciMLBase.build_linear_solution(alg, y, nothing, cache) +end + +function LinearSolve.init_cacheval(alg::AMDGPUOffloadLUFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::Bool, + assumptions::OperatorAssumptions) + AMDGPU.rocSOLVER.getrf!(AMDGPU.ROCArray(A)) +end + +# QR Factorization +function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::AMDGPUOffloadQRFactorization; + kwargs...) + if cache.isfresh + A_gpu = AMDGPU.ROCArray(cache.A) + tau = AMDGPU.ROCVector{eltype(A_gpu)}(undef, min(size(A_gpu)...)) + AMDGPU.rocSOLVER.geqrf!(A_gpu, tau) + cache.cacheval = (A_gpu, tau) + cache.isfresh = false + end + + A_gpu, tau = cache.cacheval + b_gpu = AMDGPU.ROCArray(cache.b) + + # Apply Q^T to b + AMDGPU.rocSOLVER.ormqr!('L', 'T', A_gpu, tau, b_gpu) + + # Solve the upper triangular system + m, n = size(A_gpu) + AMDGPU.rocBLAS.trsv!('U', 'N', 'N', view(A_gpu, 1:n, 1:n), view(b_gpu, 1:n)) + + y = Array(b_gpu[1:n]) + cache.u .= y + SciMLBase.build_linear_solution(alg, y, nothing, cache) +end + +function LinearSolve.init_cacheval(alg::AMDGPUOffloadQRFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::Bool, + assumptions::OperatorAssumptions) + A_gpu = AMDGPU.ROCArray(A) + tau = AMDGPU.ROCVector{eltype(A_gpu)}(undef, min(size(A_gpu)...)) + AMDGPU.rocSOLVER.geqrf!(A_gpu, tau) + (A_gpu, tau) +end + +end diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl index bf24093b4..53f854997 100644 --- a/src/LinearSolve.jl +++ b/src/LinearSolve.jl @@ -254,7 +254,7 @@ export KrylovJL, KrylovJL_CG, KrylovJL_MINRES, KrylovJL_GMRES, export SimpleGMRES export 
HYPREAlgorithm -export CudaOffloadFactorization +export CudaOffloadFactorization, AMDGPUOffloadLUFactorization, AMDGPUOffloadQRFactorization export MKLPardisoFactorize, MKLPardisoIterate export PanuaPardisoFactorize, PanuaPardisoIterate export PardisoJL diff --git a/src/extension_algs.jl b/src/extension_algs.jl index 29d1be8fa..0d4e18696 100644 --- a/src/extension_algs.jl +++ b/src/extension_algs.jl @@ -82,6 +82,48 @@ struct CudaOffloadFactorization <: LinearSolve.AbstractFactorization end end +""" +`AMDGPUOffloadLUFactorization()` + +An offloading technique using LU factorization to GPU-accelerate CPU-based computations on AMD GPUs. +Requires a sufficiently large `A` to overcome the data transfer costs. + +!!! note + + Using this solver requires adding the package AMDGPU.jl, i.e. `using AMDGPU` +""" +struct AMDGPUOffloadLUFactorization <: LinearSolve.AbstractFactorization + function AMDGPUOffloadLUFactorization() + ext = Base.get_extension(@__MODULE__, :LinearSolveAMDGPUExt) + if ext === nothing + error("AMDGPUOffloadLUFactorization requires that AMDGPU is loaded, i.e. `using AMDGPU`") + else + return new{}() + end + end +end + +""" +`AMDGPUOffloadQRFactorization()` + +An offloading technique using QR factorization to GPU-accelerate CPU-based computations on AMD GPUs. +Requires a sufficiently large `A` to overcome the data transfer costs. + +!!! note + + Using this solver requires adding the package AMDGPU.jl, i.e. `using AMDGPU` +""" +struct AMDGPUOffloadQRFactorization <: LinearSolve.AbstractFactorization + function AMDGPUOffloadQRFactorization() + ext = Base.get_extension(@__MODULE__, :LinearSolveAMDGPUExt) + if ext === nothing + error("AMDGPUOffloadQRFactorization requires that AMDGPU is loaded, i.e. `using AMDGPU`") + else + return new{}() + end + end +end + ## RFLUFactorization """