diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md index 1290a8fa2..71a12edc8 100644 --- a/docs/src/solvers/solvers.md +++ b/docs/src/solvers/solvers.md @@ -32,6 +32,24 @@ this is only recommended for Float32 matrices. Choose `CudaOffloadLUFactorizatio performance on well-conditioned problems, or `CudaOffloadQRFactorization` for better numerical stability on ill-conditioned problems. +#### Mixed Precision Methods + +For large well-conditioned problems where memory bandwidth is the bottleneck, mixed precision +methods can provide significant speedups (up to 2x) by performing the factorization in Float32 +while maintaining Float64 interfaces. These methods are particularly effective for: +- Large dense matrices (> 1000x1000) +- Well-conditioned problems (condition number < 10^4) +- Hardware with good Float32 performance + +Available mixed precision solvers: +- `MKL32MixedLUFactorization` - CPUs with MKL +- `AppleAccelerate32MixedLUFactorization` - Apple CPUs with Accelerate +- `CUDAOffload32MixedLUFactorization` - NVIDIA GPUs with CUDA +- `MetalOffload32MixedLUFactorization` - Apple GPUs with Metal + +These methods automatically handle the precision conversion, making them easy drop-in replacements +when reduced precision is acceptable for the factorization step. + !!! note Performance details for dense LU-factorizations can be highly dependent on the hardware configuration. @@ -205,6 +223,7 @@ KrylovJL ```@docs MKLLUFactorization +MKL32MixedLUFactorization ``` ### AppleAccelerate.jl @@ -215,6 +234,7 @@ MKLLUFactorization ```@docs AppleAccelerateLUFactorization +AppleAccelerate32MixedLUFactorization ``` ### Metal.jl @@ -225,6 +245,7 @@ AppleAccelerateLUFactorization ```@docs MetalLUFactorization +MetalOffload32MixedLUFactorization ``` ### Pardiso.jl @@ -251,6 +272,7 @@ The following are non-standard GPU factorization routines. ```@docs CudaOffloadLUFactorization CudaOffloadQRFactorization +CUDAOffload32MixedLUFactorization ``` ### AMDGPU.jl diff --git a/ext/LinearSolveCUDAExt.jl b/ext/LinearSolveCUDAExt.jl index eab6ee401..503ee71fb 100644 --- a/ext/LinearSolveCUDAExt.jl +++ b/ext/LinearSolveCUDAExt.jl @@ -7,6 +7,7 @@ using LinearSolve: LinearSolve, is_cusparse, defaultalg, cudss_loaded, DefaultLi needs_concrete_A, error_no_cudss_lu, init_cacheval, OperatorAssumptions, CudaOffloadFactorization, CudaOffloadLUFactorization, CudaOffloadQRFactorization, + CUDAOffload32MixedLUFactorization, SparspakFactorization, KLUFactorization, UMFPACKFactorization, LinearVerbosity using LinearSolve.LinearAlgebra, LinearSolve.SciMLBase, LinearSolve.ArrayInterface @@ -118,4 +119,40 @@ function LinearSolve.init_cacheval( nothing end +# Mixed precision CUDA LU implementation +function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CUDAOffload32MixedLUFactorization; + kwargs...) 
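+    # Mixed-precision path: factor A in Float32 on the GPU (reused while the cache
+    # is not fresh), solve in Float32, then cast the result back to eltype(cache.u).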
+ if cache.isfresh + cacheval = LinearSolve.@get_cacheval(cache, :CUDAOffload32MixedLUFactorization) + # Convert to Float32 for factorization + A_f32 = Float32.(cache.A) + fact = lu(CUDA.CuArray(A_f32)) + cache.cacheval = fact + cache.isfresh = false + end + fact = LinearSolve.@get_cacheval(cache, :CUDAOffload32MixedLUFactorization) + # Convert b to Float32, solve, then convert back to original precision + b_f32 = Float32.(cache.b) + u_f32 = CUDA.CuArray(b_f32) + y_f32 = ldiv!(u_f32, fact, CUDA.CuArray(b_f32)) + # Convert back to original precision + y = Array(y_f32) + T = eltype(cache.u) + cache.u .= T.(y) + SciMLBase.build_linear_solution(alg, cache.u, nothing, cache) +end + +function LinearSolve.init_cacheval(alg::CUDAOffload32MixedLUFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::LinearVerbosity, + assumptions::OperatorAssumptions) + # Pre-allocate with Float32 arrays + A_f32 = Float32.(A) + T = eltype(A_f32) + noUnitT = typeof(zero(T)) + luT = LinearAlgebra.lutype(noUnitT) + ipiv = CuVector{Int32}(undef, 0) + info = zero(LinearAlgebra.BlasInt) + return LU{luT}(CuMatrix{Float32}(undef, 0, 0), ipiv, info) +end + end diff --git a/ext/LinearSolveMetalExt.jl b/ext/LinearSolveMetalExt.jl index 507ae0142..0207b1425 100644 --- a/ext/LinearSolveMetalExt.jl +++ b/ext/LinearSolveMetalExt.jl @@ -3,7 +3,8 @@ module LinearSolveMetalExt using Metal, LinearSolve using LinearAlgebra, SciMLBase using SciMLBase: AbstractSciMLOperator -using LinearSolve: ArrayInterface, MKLLUFactorization, @get_cacheval, LinearCache, SciMLBase +using LinearSolve: ArrayInterface, MKLLUFactorization, MetalOffload32MixedLUFactorization, + @get_cacheval, LinearCache, SciMLBase, OperatorAssumptions, LinearVerbosity default_alias_A(::MetalLUFactorization, ::Any, ::Any) = false default_alias_b(::MetalLUFactorization, ::Any, ::Any) = false @@ -28,4 +29,45 @@ function SciMLBase.solve!(cache::LinearCache, alg::MetalLUFactorization; SciMLBase.build_linear_solution(alg, y, nothing, cache) end +# Mixed precision Metal LU implementation +default_alias_A(::MetalOffload32MixedLUFactorization, ::Any, ::Any) = false +default_alias_b(::MetalOffload32MixedLUFactorization, ::Any, ::Any) = false + +function LinearSolve.init_cacheval(alg::MetalOffload32MixedLUFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::LinearVerbosity, + assumptions::OperatorAssumptions) + # Pre-allocate with Float32 arrays + A_f32 = Float32.(convert(AbstractMatrix, A)) + ArrayInterface.lu_instance(A_f32) +end + +function SciMLBase.solve!(cache::LinearCache, alg::MetalOffload32MixedLUFactorization; + kwargs...) 
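+    # Mixed-precision path: factor in Float32 on the Metal GPU, keep the factors on
+    # the CPU, solve in Float32, and convert the solution back to eltype(cache.u).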
+ A = cache.A + A = convert(AbstractMatrix, A) + if cache.isfresh + cacheval = @get_cacheval(cache, :MetalOffload32MixedLUFactorization) + # Convert to Float32 for factorization + A_f32 = Float32.(A) + res = lu(MtlArray(A_f32)) + # Store factorization on CPU with converted types + cache.cacheval = LU(Array(res.factors), Array{Int}(res.ipiv), res.info) + cache.isfresh = false + end + + fact = @get_cacheval(cache, :MetalOffload32MixedLUFactorization) + # Convert b to Float32 for solving + b_f32 = Float32.(cache.b) + u_f32 = similar(b_f32) + + # Create a temporary Float32 LU factorization for solving + fact_f32 = LU(Float32.(fact.factors), fact.ipiv, fact.info) + ldiv!(u_f32, fact_f32, b_f32) + + # Convert back to original precision + T = eltype(cache.u) + cache.u .= T.(u_f32) + SciMLBase.build_linear_solution(alg, cache.u, nothing, cache) +end + end diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl index e8bcc5b3e..eca4f0692 100644 --- a/src/LinearSolve.jl +++ b/src/LinearSolve.jl @@ -456,13 +456,17 @@ export HYPREAlgorithm export CudaOffloadFactorization export CudaOffloadLUFactorization export CudaOffloadQRFactorization +export CUDAOffload32MixedLUFactorization export AMDGPUOffloadLUFactorization, AMDGPUOffloadQRFactorization export MKLPardisoFactorize, MKLPardisoIterate export PanuaPardisoFactorize, PanuaPardisoIterate export PardisoJL export MKLLUFactorization +export MKL32MixedLUFactorization export AppleAccelerateLUFactorization +export AppleAccelerate32MixedLUFactorization export MetalLUFactorization +export MetalOffload32MixedLUFactorization export OperatorAssumptions, OperatorCondition diff --git a/src/appleaccelerate.jl b/src/appleaccelerate.jl index 3d0dcbf55..b3d454f5d 100644 --- a/src/appleaccelerate.jl +++ b/src/appleaccelerate.jl @@ -14,6 +14,7 @@ to avoid allocations and does not require libblastrampoline. """ struct AppleAccelerateLUFactorization <: AbstractFactorization end + @static if !Sys.isapple() __appleaccelerate_isavailable() = false else @@ -284,3 +285,84 @@ function SciMLBase.solve!(cache::LinearCache, alg::AppleAccelerateLUFactorizatio SciMLBase.build_linear_solution( alg, cache.u, nothing, cache; retcode = ReturnCode.Success) end + +# Mixed precision AppleAccelerate implementation +default_alias_A(::AppleAccelerate32MixedLUFactorization, ::Any, ::Any) = false +default_alias_b(::AppleAccelerate32MixedLUFactorization, ::Any, ::Any) = false + +const PREALLOCATED_APPLE32_LU = begin + A = rand(Float32, 0, 0) + luinst = ArrayInterface.lu_instance(A) + LU(luinst.factors, similar(A, Cint, 0), luinst.info), Ref{Cint}() +end + +function LinearSolve.init_cacheval(alg::AppleAccelerate32MixedLUFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::LinearVerbosity, + assumptions::OperatorAssumptions) + # Pre-allocate appropriate 32-bit arrays based on input type + if eltype(A) <: Complex + A_32 = rand(ComplexF32, 0, 0) + else + A_32 = rand(Float32, 0, 0) + end + luinst = ArrayInterface.lu_instance(A_32) + LU(luinst.factors, similar(A_32, Cint, 0), luinst.info), Ref{Cint}() +end + +function SciMLBase.solve!(cache::LinearCache, alg::AppleAccelerate32MixedLUFactorization; + kwargs...) + __appleaccelerate_isavailable() || + error("Error, AppleAccelerate binary is missing but solve is being called. 
Report this issue") + A = cache.A + A = convert(AbstractMatrix, A) + + # Check if we have complex numbers + iscomplex = eltype(A) <: Complex + + if cache.isfresh + cacheval = @get_cacheval(cache, :AppleAccelerate32MixedLUFactorization) + # Convert to appropriate 32-bit type for factorization + if iscomplex + A_f32 = ComplexF32.(A) + else + A_f32 = Float32.(A) + end + res = aa_getrf!(A_f32; ipiv = cacheval[1].ipiv, info = cacheval[2]) + fact = LU(res[1:3]...), res[4] + cache.cacheval = fact + + if !LinearAlgebra.issuccess(fact[1]) + return SciMLBase.build_linear_solution( + alg, cache.u, nothing, cache; retcode = ReturnCode.Failure) + end + cache.isfresh = false + end + + A_lu, info = @get_cacheval(cache, :AppleAccelerate32MixedLUFactorization) + require_one_based_indexing(cache.u, cache.b) + m, n = size(A_lu, 1), size(A_lu, 2) + + # Convert b to appropriate 32-bit type for solving + if iscomplex + b_f32 = ComplexF32.(cache.b) + else + b_f32 = Float32.(cache.b) + end + + if m > n + Bc = copy(b_f32) + aa_getrs!('N', A_lu.factors, A_lu.ipiv, Bc; info) + # Convert back to original precision + T = eltype(cache.u) + cache.u .= T.(Bc[1:n]) + else + u_f32 = copy(b_f32) + aa_getrs!('N', A_lu.factors, A_lu.ipiv, u_f32; info) + # Convert back to original precision + T = eltype(cache.u) + cache.u .= T.(u_f32) + end + + SciMLBase.build_linear_solution( + alg, cache.u, nothing, cache; retcode = ReturnCode.Success) +end diff --git a/src/extension_algs.jl b/src/extension_algs.jl index f515ff855..f44a6d4c6 100644 --- a/src/extension_algs.jl +++ b/src/extension_algs.jl @@ -83,6 +83,35 @@ struct CudaOffloadLUFactorization <: AbstractFactorization end end +""" +`CUDAOffload32MixedLUFactorization()` + +A mixed precision GPU-accelerated LU factorization that converts matrices to Float32 +before offloading to CUDA GPU for factorization, then converts back for the solve. +This can provide speedups when the reduced precision is acceptable and memory +bandwidth is a bottleneck. + +## Performance Notes +- Converts Float64 matrices to Float32 for GPU factorization +- Can be significantly faster for large matrices where memory bandwidth is limiting +- May have reduced accuracy compared to full precision methods +- Most beneficial when the condition number of the matrix is moderate + +!!! note + + Using this solver requires adding the package CUDA.jl, i.e. `using CUDA` +""" +struct CUDAOffload32MixedLUFactorization <: AbstractFactorization + function CUDAOffload32MixedLUFactorization(; throwerror = true) + ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt) + if ext === nothing && throwerror + error("CUDAOffload32MixedLUFactorization requires that CUDA is loaded, i.e. `using CUDA`") + else + return new() + end + end +end + """ `CudaOffloadQRFactorization()` @@ -650,6 +679,48 @@ struct MetalLUFactorization <: AbstractFactorization end end +""" + MetalOffload32MixedLUFactorization() + +A mixed precision Metal GPU-accelerated LU factorization that converts matrices to Float32 +before offloading to Metal GPU for factorization, then converts back for the solve. +This can provide speedups on Apple Silicon when reduced precision is acceptable. 
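+Both the factorization and the triangular solves are performed in Float32; the
+solution is converted back to the original element type afterwards.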
+ +## Performance Notes +- Converts Float64 matrices to Float32 for GPU factorization +- Can be significantly faster for large matrices where memory bandwidth is limiting +- Particularly effective on Apple Silicon Macs with unified memory architecture +- May have reduced accuracy compared to full precision methods + +## Requirements +Using this solver requires that Metal.jl is loaded: `using Metal` + +## Example +```julia +using Metal +alg = MetalOffload32MixedLUFactorization() +sol = solve(prob, alg) +``` +""" +struct MetalOffload32MixedLUFactorization <: AbstractFactorization + function MetalOffload32MixedLUFactorization(; throwerror = true) + @static if !Sys.isapple() + if throwerror + error("MetalOffload32MixedLUFactorization is only available on Apple platforms") + else + return new() + end + else + ext = Base.get_extension(@__MODULE__, :LinearSolveMetalExt) + if ext === nothing && throwerror + error("MetalOffload32MixedLUFactorization requires that Metal.jl is loaded, i.e. `using Metal`") + else + return new() + end + end + end +end + """ BLISLUFactorization() @@ -715,3 +786,51 @@ struct CUSOLVERRFFactorization <: AbstractSparseFactorization end end end + +""" + MKL32MixedLUFactorization() + +A mixed precision LU factorization using Intel MKL that performs factorization in Float32 +precision while maintaining Float64 interface. This can provide significant speedups +for large matrices when reduced precision is acceptable. + +## Performance Notes +- Converts Float64 matrices to Float32 for factorization +- Uses optimized MKL routines for the factorization +- Can be 2x faster than full precision for memory-bandwidth limited problems +- May have reduced accuracy compared to full Float64 precision + +## Requirements +This solver requires MKL to be available through MKL_jll. + +## Example +```julia +alg = MKL32MixedLUFactorization() +sol = solve(prob, alg) +``` +""" +struct MKL32MixedLUFactorization <: AbstractFactorization end + +""" + AppleAccelerate32MixedLUFactorization() + +A mixed precision LU factorization using Apple's Accelerate framework that performs +factorization in Float32 precision while maintaining Float64 interface. This can +provide significant speedups on Apple hardware when reduced precision is acceptable. + +## Performance Notes +- Converts Float64 matrices to Float32 for factorization +- Uses optimized Accelerate routines for the factorization +- Particularly effective on Apple Silicon with unified memory +- May have reduced accuracy compared to full Float64 precision + +## Requirements +This solver is only available on Apple platforms and requires the Accelerate framework. + +## Example +```julia +alg = AppleAccelerate32MixedLUFactorization() +sol = solve(prob, alg) +``` +""" +struct AppleAccelerate32MixedLUFactorization <: AbstractFactorization end diff --git a/src/mkl.jl b/src/mkl.jl index 02a8bdba6..0b216c119 100644 --- a/src/mkl.jl +++ b/src/mkl.jl @@ -8,6 +8,7 @@ to avoid allocations and does not require libblastrampoline. 
""" struct MKLLUFactorization <: AbstractFactorization end + function getrf!(A::AbstractMatrix{<:ComplexF64}; ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))), info = Ref{BlasInt}(), @@ -241,3 +242,79 @@ function SciMLBase.solve!(cache::LinearCache, alg::MKLLUFactorization; SciMLBase.build_linear_solution(alg, cache.u, nothing, cache; retcode = ReturnCode.Success) end + +# Mixed precision MKL implementation +default_alias_A(::MKL32MixedLUFactorization, ::Any, ::Any) = false +default_alias_b(::MKL32MixedLUFactorization, ::Any, ::Any) = false + +const PREALLOCATED_MKL32_LU = begin + A = rand(Float32, 0, 0) + luinst = ArrayInterface.lu_instance(A), Ref{BlasInt}() +end + +function LinearSolve.init_cacheval(alg::MKL32MixedLUFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::LinearVerbosity, + assumptions::OperatorAssumptions) + # Pre-allocate appropriate 32-bit arrays based on input type + if eltype(A) <: Complex + A_32 = rand(ComplexF32, 0, 0) + else + A_32 = rand(Float32, 0, 0) + end + ArrayInterface.lu_instance(A_32), Ref{BlasInt}() +end + +function SciMLBase.solve!(cache::LinearCache, alg::MKL32MixedLUFactorization; + kwargs...) + A = cache.A + A = convert(AbstractMatrix, A) + + # Check if we have complex numbers + iscomplex = eltype(A) <: Complex + + if cache.isfresh + cacheval = @get_cacheval(cache, :MKL32MixedLUFactorization) + # Convert to appropriate 32-bit type for factorization + if iscomplex + A_f32 = ComplexF32.(A) + else + A_f32 = Float32.(A) + end + res = getrf!(A_f32; ipiv = cacheval[1].ipiv, info = cacheval[2]) + fact = LU(res[1:3]...), res[4] + cache.cacheval = fact + + if !LinearAlgebra.issuccess(fact[1]) + return SciMLBase.build_linear_solution( + alg, cache.u, nothing, cache; retcode = ReturnCode.Failure) + end + cache.isfresh = false + end + + A_lu, info = @get_cacheval(cache, :MKL32MixedLUFactorization) + require_one_based_indexing(cache.u, cache.b) + m, n = size(A_lu, 1), size(A_lu, 2) + + # Convert b to appropriate 32-bit type for solving + if iscomplex + b_f32 = ComplexF32.(cache.b) + else + b_f32 = Float32.(cache.b) + end + + if m > n + Bc = copy(b_f32) + getrs!('N', A_lu.factors, A_lu.ipiv, Bc; info) + # Convert back to original precision + T = eltype(cache.u) + cache.u .= T.(Bc[1:n]) + else + u_f32 = copy(b_f32) + getrs!('N', A_lu.factors, A_lu.ipiv, u_f32; info) + # Convert back to original precision + T = eltype(cache.u) + cache.u .= T.(u_f32) + end + + SciMLBase.build_linear_solution(alg, cache.u, nothing, cache; retcode = ReturnCode.Success) +end diff --git a/test/runtests.jl b/test/runtests.jl index 048cdf61e..558f17c7b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,6 +19,7 @@ if GROUP == "All" || GROUP == "Core" @time @safetestset "Traits" include("traits.jl") @time @safetestset "Verbosity" include("verbosity.jl") @time @safetestset "BandedMatrices" include("banded.jl") + @time @safetestset "Mixed Precision" include("test_mixed_precision.jl") end # Don't run Enzyme tests on prerelease diff --git a/test/test_mixed_precision.jl b/test/test_mixed_precision.jl new file mode 100644 index 000000000..47a34e4af --- /dev/null +++ b/test/test_mixed_precision.jl @@ -0,0 +1,77 @@ +using Test +using LinearSolve +using LinearAlgebra +using Random + +Random.seed!(123) + +@testset "Mixed Precision LU Factorizations" begin + n = 100 + A = rand(Float64, n, n) + b = rand(Float64, n) + + # Make A better conditioned to avoid excessive precision loss + A = A + 5.0 * I + + prob = LinearProblem(A, b) + + # Reference solution with full 
precision + sol_ref = solve(prob, LUFactorization()) + + @testset "MKL32MixedLUFactorization" begin + if LinearSolve.usemkl + sol_mixed = solve(prob, MKL32MixedLUFactorization()) + @test sol_mixed.retcode == ReturnCode.Success + # Check that solution is reasonably close (allowing for reduced precision) + @test norm(sol_mixed.u - sol_ref.u) / norm(sol_ref.u) < 1e-5 + # Verify it actually solves the system + @test norm(A * sol_mixed.u - b) / norm(b) < 1e-5 + else + @test_skip "MKL not available" + end + end + + @testset "AppleAccelerate32MixedLUFactorization" begin + if Sys.isapple() + sol_mixed = solve(prob, AppleAccelerate32MixedLUFactorization()) + @test sol_mixed.retcode == ReturnCode.Success + # Check that solution is reasonably close (allowing for reduced precision) + @test norm(sol_mixed.u - sol_ref.u) / norm(sol_ref.u) < 1e-5 + # Verify it actually solves the system + @test norm(A * sol_mixed.u - b) / norm(b) < 1e-5 + else + @test_skip "Apple Accelerate not available" + end + end + + @testset "Complex matrices" begin + # Test with complex matrices + A_complex = rand(ComplexF64, n, n) + 5.0 * I + b_complex = rand(ComplexF64, n) + prob_complex = LinearProblem(A_complex, b_complex) + sol_ref_complex = solve(prob_complex, LUFactorization()) + + if LinearSolve.usemkl + sol_mixed = solve(prob_complex, MKL32MixedLUFactorization()) + @test sol_mixed.retcode == ReturnCode.Success + @test norm(sol_mixed.u - sol_ref_complex.u) / norm(sol_ref_complex.u) < 1e-5 + end + + if Sys.isapple() + sol_mixed = solve(prob_complex, AppleAccelerate32MixedLUFactorization()) + @test sol_mixed.retcode == ReturnCode.Success + @test norm(sol_mixed.u - sol_ref_complex.u) / norm(sol_ref_complex.u) < 1e-5 + end + end +end + +# Note: CUDA and Metal tests would require those packages to be loaded +# and appropriate hardware to be available +@testset "GPU Mixed Precision (Mocked)" begin + @test isdefined(LinearSolve, :CUDAOffload32MixedLUFactorization) + @test isdefined(LinearSolve, :MetalOffload32MixedLUFactorization) + + # These would error without the appropriate packages loaded, which is expected + @test_throws Exception CUDAOffload32MixedLUFactorization() + @test_throws Exception MetalOffload32MixedLUFactorization() +end \ No newline at end of file