diff --git a/docs/src/release_notes.md b/docs/src/release_notes.md
index fcb5336bf..6aa85f6ef 100644
--- a/docs/src/release_notes.md
+++ b/docs/src/release_notes.md
@@ -1,5 +1,12 @@
 # Release Notes
 
+## Upcoming Changes
+
+  - `CudaOffloadFactorization` has been split into two algorithms:
+    - `CudaOffloadLUFactorization` - Uses LU factorization for better performance
+    - `CudaOffloadQRFactorization` - Uses QR factorization for better numerical stability
+  - `CudaOffloadFactorization` is now deprecated and will show a warning suggesting to use one of the new algorithms
+
 ## v2.0
 
   - `LinearCache` changed from immutable to mutable. With this, the out of place interfaces like
diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md
index 286a5ead2..ed155d8b3 100644
--- a/docs/src/solvers/solvers.md
+++ b/docs/src/solvers/solvers.md
@@ -23,12 +23,14 @@ use your base system BLAS which can be fast or slow depending on the hardware co
 
 For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
 on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000
-matrices (and only supports Float32). `CudaOffloadFactorization` can be more efficient at a
-much smaller cutoff, possibly around size 1,000 x 1,000 matrices, though this is highly dependent
-on the chosen GPU hardware. `CudaOffloadFactorization` requires a CUDA-compatible NVIDIA GPU.
+matrices (and only supports Float32). `CudaOffloadLUFactorization` and `CudaOffloadQRFactorization`
+can be more efficient at a much smaller cutoff, possibly around size 1,000 x 1,000 matrices, though
+this is highly dependent on the chosen GPU hardware. These algorithms require a CUDA-compatible NVIDIA GPU.
 CUDA offload supports Float64 but most consumer GPU hardware will be much faster on Float32 (many
 are >32x faster for Float32 operations than Float64 operations) and thus for most hardware
-this is only recommended for Float32 matrices.
+this is only recommended for Float32 matrices. Choose `CudaOffloadLUFactorization` for better
+performance on well-conditioned problems, or `CudaOffloadQRFactorization` for better numerical
+stability on ill-conditioned problems.
 
 !!! note
 
@@ -232,10 +234,11 @@ The following are non-standard GPU factorization routines.
 
 !!! note
 
-    Using this solver requires adding the package CUDA.jl, i.e. `using CUDA`
+    Using these solvers requires adding the package CUDA.jl, i.e. `using CUDA`
 
 ```@docs
-CudaOffloadFactorization
+CudaOffloadLUFactorization
+CudaOffloadQRFactorization
 ```
 
 ### AMDGPU.jl
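For illustration, a minimal sketch of how the LU/QR split described in the solver guide plays out in user code. It assumes LinearSolve.jl and CUDA.jl are installed and a CUDA-capable NVIDIA GPU is present; the matrix size and values are placeholders:

```julia
using LinearSolve, CUDA

# Float32 is the recommended element type on most consumer GPUs (see the guidance above)
A = rand(Float32, 2000, 2000)
b = rand(Float32, 2000)
prob = LinearProblem(A, b)

# LU offload: generally the faster choice on well-conditioned matrices
sol_lu = solve(prob, CudaOffloadLUFactorization())

# QR offload: trades some speed for extra numerical stability
sol_qr = solve(prob, CudaOffloadQRFactorization())

sol_lu.u ≈ sol_qr.u  # both return the solution vector in sol.u
```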
diff --git a/docs/src/tutorials/gpu.md b/docs/src/tutorials/gpu.md
index f348c647a..0df5c0677 100644
--- a/docs/src/tutorials/gpu.md
+++ b/docs/src/tutorials/gpu.md
@@ -41,6 +41,8 @@ This computation can be moved to the GPU by the following:
 ```julia
 using CUDA # Add the GPU library for NVIDIA GPUs
 sol = LS.solve(prob, LS.CudaOffloadLUFactorization())
+# or
+sol = LS.solve(prob, LS.CudaOffloadQRFactorization())
 sol.u
 ```
 
@@ -54,6 +56,12 @@ sol = LS.solve(prob, LS.AMDGPUOffloadQRFactorization()) # QR factorization
 sol.u
 ```
 
+LinearSolve.jl provides multiple GPU offloading algorithms:
+
+  - `CudaOffloadLUFactorization()` - Uses LU factorization on NVIDIA GPUs (generally faster for well-conditioned problems)
+  - `CudaOffloadQRFactorization()` - Uses QR factorization on NVIDIA GPUs (more stable for ill-conditioned problems)
+  - `AMDGPUOffloadLUFactorization()` - Uses LU factorization on AMD GPUs (generally faster for well-conditioned problems)
+  - `AMDGPUOffloadQRFactorization()` - Uses QR factorization on AMD GPUs (more stable for ill-conditioned problems)
 ## GPUArray Interface
 
 For more manual control over the factorization setup, you can use the
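The GPUArray interface that the tutorial's trailing context introduces works roughly as in the following sketch: the arrays are moved to the device by hand with CUDA.jl's `cu`, and the problem is then solved directly on the GPU arrays. This is an assumption-based illustration of that manual approach, not a quote of the tutorial text:

```julia
using LinearSolve, CUDA

# Move the data to the GPU manually instead of using an offload algorithm
A = cu(rand(Float32, 100, 100))
b = cu(rand(Float32, 100))

prob = LinearProblem(A, b)
sol = solve(prob)   # the default algorithm selection works on the GPU arrays
Array(sol.u)        # copy the solution back to the host if needed
```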
diff --git a/ext/LinearSolveCUDAExt.jl b/ext/LinearSolveCUDAExt.jl
index aaedcf21a..4e0b25796 100644
--- a/ext/LinearSolveCUDAExt.jl
+++ b/ext/LinearSolveCUDAExt.jl
@@ -5,7 +5,7 @@ using LinearSolve: LinearSolve, is_cusparse, defaultalg, cudss_loaded, DefaultLi
                    DefaultAlgorithmChoice, ALREADY_WARNED_CUDSS, LinearCache,
                    needs_concrete_A,
                    error_no_cudss_lu, init_cacheval, OperatorAssumptions,
-                   CudaOffloadFactorization,
+                   CudaOffloadFactorization, CudaOffloadLUFactorization, CudaOffloadQRFactorization,
                    SparspakFactorization, KLUFactorization, UMFPACKFactorization
 using LinearSolve.LinearAlgebra, LinearSolve.SciMLBase, LinearSolve.ArrayInterface
 using SciMLBase: AbstractSciMLOperator
@@ -35,6 +35,48 @@ function LinearSolve.error_no_cudss_lu(A::CUDA.CUSPARSE.CuSparseMatrixCSR)
     nothing
 end
 
+function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadLUFactorization;
+        kwargs...)
+    if cache.isfresh
+        fact = lu(CUDA.CuArray(cache.A))
+        cache.cacheval = fact
+        cache.isfresh = false
+    end
+    y = Array(ldiv!(CUDA.CuArray(cache.u), cache.cacheval, CUDA.CuArray(cache.b)))
+    cache.u .= y
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+function LinearSolve.init_cacheval(alg::CudaOffloadLUFactorization, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    T = eltype(A)
+    noUnitT = typeof(zero(T))
+    luT = LinearAlgebra.lutype(noUnitT)
+    ipiv = CuVector{Int32}(undef, 0)
+    info = zero(LinearAlgebra.BlasInt)
+    return LU{luT}(CuMatrix{Float64}(undef, 0, 0), ipiv, info)
+end
+
+function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadQRFactorization;
+        kwargs...)
+    if cache.isfresh
+        fact = qr(CUDA.CuArray(cache.A))
+        cache.cacheval = fact
+        cache.isfresh = false
+    end
+    y = Array(ldiv!(CUDA.CuArray(cache.u), cache.cacheval, CUDA.CuArray(cache.b)))
+    cache.u .= y
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+function LinearSolve.init_cacheval(alg::CudaOffloadQRFactorization, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    qr(CUDA.CuArray(A))
+end
+
+# Keep the deprecated CudaOffloadFactorization working by forwarding to QR
 function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadFactorization;
         kwargs...)
     if cache.isfresh
         fact = qr(CUDA.CuArray(cache.A))
         cache.cacheval = fact
         cache.isfresh = false
     end
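The `isfresh`/`cacheval` handling in the new `solve!` methods above is what the caching interface relies on: the GPU factorization is computed on the first solve and reused for later right-hand sides. A minimal sketch of that workflow with the new algorithm (placeholder sizes; assumes CUDA.jl is loaded and a CUDA-capable GPU is present):

```julia
using LinearSolve, CUDA

A = rand(Float32, 1000, 1000)
b1 = rand(Float32, 1000)
b2 = rand(Float32, 1000)

# init/solve! caches the LU factors computed on the GPU
cache = init(LinearProblem(A, b1), CudaOffloadLUFactorization())
sol1 = solve!(cache)

# Updating b reuses the cached factorization instead of refactorizing A
cache.b = b2
sol2 = solve!(cache)
```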
diff --git a/lib/LinearSolveAutotune/src/algorithms.jl b/lib/LinearSolveAutotune/src/algorithms.jl
index 35855ff8b..360f4bd40 100644
--- a/lib/LinearSolveAutotune/src/algorithms.jl
+++ b/lib/LinearSolveAutotune/src/algorithms.jl
@@ -84,10 +84,10 @@ function get_gpu_algorithms(; skip_missing_algs::Bool = false)
     # CUDA algorithms
     if is_cuda_available()
         try
-            push!(gpu_algs, CudaOffloadFactorization())
-            push!(gpu_names, "CudaOffloadFactorization")
+            push!(gpu_algs, CudaOffloadLUFactorization())
+            push!(gpu_names, "CudaOffloadLUFactorization")
         catch e
-            msg = "CUDA hardware detected but CudaOffloadFactorization not available: $e. Load CUDA.jl package."
+            msg = "CUDA hardware detected but CudaOffloadLUFactorization not available: $e. Load CUDA.jl package."
             if skip_missing_algs
                 @warn msg
             else
diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl
index 53f854997..ac2b2d148 100644
--- a/src/LinearSolve.jl
+++ b/src/LinearSolve.jl
@@ -254,7 +254,10 @@ export KrylovJL, KrylovJL_CG, KrylovJL_MINRES, KrylovJL_GMRES,
 export SimpleGMRES
 
 export HYPREAlgorithm
-export CudaOffloadFactorization, AMDGPUOffloadLUFactorization, AMDGPUOffloadQRFactorization
+export CudaOffloadFactorization
+export CudaOffloadLUFactorization
+export CudaOffloadQRFactorization
+export AMDGPUOffloadLUFactorization, AMDGPUOffloadQRFactorization
 export MKLPardisoFactorize, MKLPardisoIterate
 export PanuaPardisoFactorize, PanuaPardisoIterate
 export PardisoJL
diff --git a/src/extension_algs.jl b/src/extension_algs.jl
index 0d4e18696..3224ddfa0 100644
--- a/src/extension_algs.jl
+++ b/src/extension_algs.jl
@@ -61,9 +61,55 @@ struct HYPREAlgorithm <: SciMLLinearSolveAlgorithm
     end
 end
 
+"""
+`CudaOffloadLUFactorization()`
+
+An offloading technique used to GPU-accelerate CPU-based computations using LU factorization.
+Requires a sufficiently large `A` to overcome the data transfer costs.
+
+!!! note
+
+    Using this solver requires adding the package CUDA.jl, i.e. `using CUDA`
+"""
+struct CudaOffloadLUFactorization <: AbstractFactorization
+    function CudaOffloadLUFactorization()
+        ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt)
+        if ext === nothing
+            error("CudaOffloadLUFactorization requires that CUDA is loaded, i.e. `using CUDA`")
+        else
+            return new()
+        end
+    end
+end
+
+"""
+`CudaOffloadQRFactorization()`
+
+An offloading technique used to GPU-accelerate CPU-based computations using QR factorization.
+Requires a sufficiently large `A` to overcome the data transfer costs.
+
+!!! note
+
+    Using this solver requires adding the package CUDA.jl, i.e. `using CUDA`
+"""
+struct CudaOffloadQRFactorization <: AbstractFactorization
+    function CudaOffloadQRFactorization()
+        ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt)
+        if ext === nothing
+            error("CudaOffloadQRFactorization requires that CUDA is loaded, i.e. `using CUDA`")
+        else
+            return new()
+        end
+    end
+end
+
 """
 `CudaOffloadFactorization()`
 
+!!! warning
+
+    This algorithm is deprecated. Use `CudaOffloadLUFactorization` or `CudaOffloadQRFactorization` instead.
+
 An offloading technique used to GPU-accelerate CPU-based computations.
 Requires a sufficiently large `A` to overcome the data transfer costs.
 
@@ -71,13 +117,14 @@ Requires a sufficiently large `A` to overcome the data transfer costs.
 
     Using this solver requires adding the package CUDA.jl, i.e. `using CUDA`
 """
-struct CudaOffloadFactorization <: LinearSolve.AbstractFactorization
+struct CudaOffloadFactorization <: AbstractFactorization
     function CudaOffloadFactorization()
+        Base.depwarn("`CudaOffloadFactorization` is deprecated, use `CudaOffloadLUFactorization` or `CudaOffloadQRFactorization` instead.", :CudaOffloadFactorization)
         ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt)
         if ext === nothing
             error("CudaOffloadFactorization requires that CUDA is loaded, i.e. `using CUDA`")
         else
-            return new{}()
+            return new()
         end
     end
 end
diff --git a/test/gpu/cuda.jl b/test/gpu/cuda.jl
index 74ee1ab8f..da954e20a 100644
--- a/test/gpu/cuda.jl
+++ b/test/gpu/cuda.jl
@@ -45,7 +45,7 @@ function test_interface(alg, prob1, prob2)
     return
 end
 
-@testset "$alg" for alg in (CudaOffloadFactorization(), NormalCholeskyFactorization())
+@testset "$alg" for alg in (CudaOffloadLUFactorization(), CudaOffloadQRFactorization(), NormalCholeskyFactorization())
     test_interface(alg, prob1, prob2)
 end
diff --git a/test/resolve.jl b/test/resolve.jl
index 9efcc6921..3fe0a2fea 100644
--- a/test/resolve.jl
+++ b/test/resolve.jl
@@ -11,6 +11,8 @@ for alg in vcat(InteractiveUtils.subtypes(AbstractDenseFactorization),
     if !(alg in [
         DiagonalFactorization,
         CudaOffloadFactorization,
+        CudaOffloadLUFactorization,
+        CudaOffloadQRFactorization,
         CUSOLVERRFFactorization,
         AppleAccelerateLUFactorization,
         MetalLUFactorization