22 changes: 22 additions & 0 deletions docs/src/solvers/solvers.md
@@ -32,6 +32,24 @@ this is only recommended for Float32 matrices. Choose `CudaOffloadLUFactorization` for better
performance on well-conditioned problems, or `CudaOffloadQRFactorization` for better numerical
stability on ill-conditioned problems.

#### Mixed Precision Methods

For large well-conditioned problems where memory bandwidth is the bottleneck, mixed precision
methods can provide significant speedups (up to 2x) by performing the factorization in Float32
while maintaining Float64 interfaces. These methods are particularly effective for:
- Large dense matrices (> 1000x1000)
- Well-conditioned problems (condition number < 10^4)
- Hardware with good Float32 performance

Available mixed precision solvers:
- `MKL32MixedLUFactorization` - Intel CPUs with MKL
- `AppleAccelerate32MixedLUFactorization` - Apple CPUs with Accelerate
- `CUDAOffload32MixedLUFactorization` - NVIDIA GPUs with CUDA
- `MetalOffload32MixedLUFactorization` - Apple GPUs with Metal

These methods automatically handle the precision conversion, making them easy drop-in replacements
when reduced precision is acceptable for the factorization step.
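
As a rough usage sketch (not part of this PR's diff), a mixed precision solver slots into the usual `LinearProblem`/`solve` workflow. The problem below is hypothetical and assumes an MKL-capable CPU; the matrix size and diagonal shift are only illustrative.

```julia
using LinearSolve, LinearAlgebra

# Hypothetical large, well-conditioned dense Float64 system
A = rand(2000, 2000) + 2000I   # strong diagonal keeps the condition number small
b = rand(2000)
prob = LinearProblem(A, b)

# Drop-in swap: the factorization runs in Float32, the returned solution is Float64
sol_mixed = solve(prob, MKL32MixedLUFactorization())

# Full-precision reference for comparison
sol_full = solve(prob, MKLLUFactorization())
maximum(abs, sol_mixed.u .- sol_full.u)  # expect roughly single-precision agreement
```

On Apple hardware the same pattern applies with `AppleAccelerate32MixedLUFactorization`, and on GPUs with the CUDA or Metal offload variants.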

!!! note

Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
@@ -205,6 +223,7 @@ KrylovJL

```@docs
MKLLUFactorization
MKL32MixedLUFactorization
```

### AppleAccelerate.jl
@@ -215,6 +234,7 @@ MKLLUFactorization

```@docs
AppleAccelerateLUFactorization
AppleAccelerate32MixedLUFactorization
```

### Metal.jl
@@ -225,6 +245,7 @@ AppleAccelerateLUFactorization

```@docs
MetalLUFactorization
MetalOffload32MixedLUFactorization
```

### Pardiso.jl
@@ -251,6 +272,7 @@ The following are non-standard GPU factorization routines.
```@docs
CudaOffloadLUFactorization
CudaOffloadQRFactorization
CUDAOffload32MixedLUFactorization
```

### AMDGPU.jl
37 changes: 37 additions & 0 deletions ext/LinearSolveCUDAExt.jl
@@ -7,6 +7,7 @@ using LinearSolve: LinearSolve, is_cusparse, defaultalg, cudss_loaded, DefaultLi
needs_concrete_A,
error_no_cudss_lu, init_cacheval, OperatorAssumptions,
CudaOffloadFactorization, CudaOffloadLUFactorization, CudaOffloadQRFactorization,
CUDAOffload32MixedLUFactorization,
SparspakFactorization, KLUFactorization, UMFPACKFactorization,
LinearVerbosity
using LinearSolve.LinearAlgebra, LinearSolve.SciMLBase, LinearSolve.ArrayInterface
@@ -118,4 +119,40 @@ function LinearSolve.init_cacheval(
nothing
end

# Mixed precision CUDA LU implementation
function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CUDAOffload32MixedLUFactorization;
kwargs...)
if cache.isfresh
cacheval = LinearSolve.@get_cacheval(cache, :CUDAOffload32MixedLUFactorization)
# Convert to Float32 for factorization
A_f32 = Float32.(cache.A)
fact = lu(CUDA.CuArray(A_f32))
cache.cacheval = fact
cache.isfresh = false
end
fact = LinearSolve.@get_cacheval(cache, :CUDAOffload32MixedLUFactorization)
# Move b to the GPU in Float32, solve, then convert back to the original precision
b_f32 = CUDA.CuArray(Float32.(cache.b))
u_f32 = similar(b_f32)
y_f32 = ldiv!(u_f32, fact, b_f32)
# Convert back to original precision
y = Array(y_f32)
T = eltype(cache.u)
cache.u .= T.(y)
SciMLBase.build_linear_solution(alg, cache.u, nothing, cache)
end

function LinearSolve.init_cacheval(alg::CUDAOffload32MixedLUFactorization, A, b, u, Pl, Pr,
maxiters::Int, abstol, reltol, verbose::LinearVerbosity,
assumptions::OperatorAssumptions)
# Pre-allocate with Float32 arrays
A_f32 = Float32.(A)
T = eltype(A_f32)
noUnitT = typeof(zero(T))
luT = LinearAlgebra.lutype(noUnitT)
ipiv = CuVector{Int32}(undef, 0)
info = zero(LinearAlgebra.BlasInt)
return LU{luT}(CuMatrix{Float32}(undef, 0, 0), ipiv, info)
end

end
44 changes: 43 additions & 1 deletion ext/LinearSolveMetalExt.jl
@@ -3,7 +3,8 @@ module LinearSolveMetalExt
using Metal, LinearSolve
using LinearAlgebra, SciMLBase
using SciMLBase: AbstractSciMLOperator
using LinearSolve: ArrayInterface, MKLLUFactorization, @get_cacheval, LinearCache, SciMLBase
using LinearSolve: ArrayInterface, MKLLUFactorization, MetalOffload32MixedLUFactorization,
@get_cacheval, LinearCache, SciMLBase, OperatorAssumptions, LinearVerbosity

default_alias_A(::MetalLUFactorization, ::Any, ::Any) = false
default_alias_b(::MetalLUFactorization, ::Any, ::Any) = false
@@ -28,4 +29,45 @@ function SciMLBase.solve!(cache::LinearCache, alg::MetalLUFactorization;
SciMLBase.build_linear_solution(alg, y, nothing, cache)
end

# Mixed precision Metal LU implementation
default_alias_A(::MetalOffload32MixedLUFactorization, ::Any, ::Any) = false
default_alias_b(::MetalOffload32MixedLUFactorization, ::Any, ::Any) = false

function LinearSolve.init_cacheval(alg::MetalOffload32MixedLUFactorization, A, b, u, Pl, Pr,
maxiters::Int, abstol, reltol, verbose::LinearVerbosity,
assumptions::OperatorAssumptions)
# Pre-allocate with Float32 arrays
A_f32 = Float32.(convert(AbstractMatrix, A))
ArrayInterface.lu_instance(A_f32)
end

function SciMLBase.solve!(cache::LinearCache, alg::MetalOffload32MixedLUFactorization;
kwargs...)
A = cache.A
A = convert(AbstractMatrix, A)
if cache.isfresh
cacheval = @get_cacheval(cache, :MetalOffload32MixedLUFactorization)
# Convert to Float32 for factorization
A_f32 = Float32.(A)
res = lu(MtlArray(A_f32))
# Store factorization on CPU with converted types
cache.cacheval = LU(Array(res.factors), Array{Int}(res.ipiv), res.info)
cache.isfresh = false
end

fact = @get_cacheval(cache, :MetalOffload32MixedLUFactorization)
# Convert b to Float32 for solving
b_f32 = Float32.(cache.b)
u_f32 = similar(b_f32)

# The cached factors are already Float32 on the CPU, so solve directly with them
ldiv!(u_f32, fact, b_f32)

# Convert back to original precision
T = eltype(cache.u)
cache.u .= T.(u_f32)
SciMLBase.build_linear_solution(alg, cache.u, nothing, cache)
end

end
4 changes: 4 additions & 0 deletions src/LinearSolve.jl
@@ -456,13 +456,17 @@ export HYPREAlgorithm
export CudaOffloadFactorization
export CudaOffloadLUFactorization
export CudaOffloadQRFactorization
export CUDAOffload32MixedLUFactorization
export AMDGPUOffloadLUFactorization, AMDGPUOffloadQRFactorization
export MKLPardisoFactorize, MKLPardisoIterate
export PanuaPardisoFactorize, PanuaPardisoIterate
export PardisoJL
export MKLLUFactorization
export MKL32MixedLUFactorization
export AppleAccelerateLUFactorization
export AppleAccelerate32MixedLUFactorization
export MetalLUFactorization
export MetalOffload32MixedLUFactorization

export OperatorAssumptions, OperatorCondition

82 changes: 82 additions & 0 deletions src/appleaccelerate.jl
@@ -14,6 +14,7 @@ to avoid allocations and does not require libblastrampoline.
"""
struct AppleAccelerateLUFactorization <: AbstractFactorization end


@static if !Sys.isapple()
__appleaccelerate_isavailable() = false
else
@@ -284,3 +285,84 @@ function SciMLBase.solve!(cache::LinearCache, alg::AppleAccelerateLUFactorization;
SciMLBase.build_linear_solution(
alg, cache.u, nothing, cache; retcode = ReturnCode.Success)
end

# Mixed precision AppleAccelerate implementation
default_alias_A(::AppleAccelerate32MixedLUFactorization, ::Any, ::Any) = false
default_alias_b(::AppleAccelerate32MixedLUFactorization, ::Any, ::Any) = false

const PREALLOCATED_APPLE32_LU = begin
A = rand(Float32, 0, 0)
luinst = ArrayInterface.lu_instance(A)
LU(luinst.factors, similar(A, Cint, 0), luinst.info), Ref{Cint}()
end

function LinearSolve.init_cacheval(alg::AppleAccelerate32MixedLUFactorization, A, b, u, Pl, Pr,
maxiters::Int, abstol, reltol, verbose::LinearVerbosity,
assumptions::OperatorAssumptions)
# Pre-allocate appropriate 32-bit arrays based on input type
if eltype(A) <: Complex
A_32 = rand(ComplexF32, 0, 0)
else
A_32 = rand(Float32, 0, 0)
end
luinst = ArrayInterface.lu_instance(A_32)
LU(luinst.factors, similar(A_32, Cint, 0), luinst.info), Ref{Cint}()
end

function SciMLBase.solve!(cache::LinearCache, alg::AppleAccelerate32MixedLUFactorization;
kwargs...)
__appleaccelerate_isavailable() ||
error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
A = cache.A
A = convert(AbstractMatrix, A)

# Check if we have complex numbers
iscomplex = eltype(A) <: Complex

if cache.isfresh
cacheval = @get_cacheval(cache, :AppleAccelerate32MixedLUFactorization)
# Convert to appropriate 32-bit type for factorization
if iscomplex
A_f32 = ComplexF32.(A)
else
A_f32 = Float32.(A)
end
res = aa_getrf!(A_f32; ipiv = cacheval[1].ipiv, info = cacheval[2])
fact = LU(res[1:3]...), res[4]
cache.cacheval = fact

if !LinearAlgebra.issuccess(fact[1])
return SciMLBase.build_linear_solution(
alg, cache.u, nothing, cache; retcode = ReturnCode.Failure)
end
cache.isfresh = false
end

A_lu, info = @get_cacheval(cache, :AppleAccelerate32MixedLUFactorization)
require_one_based_indexing(cache.u, cache.b)
m, n = size(A_lu, 1), size(A_lu, 2)

# Convert b to appropriate 32-bit type for solving
if iscomplex
b_f32 = ComplexF32.(cache.b)
else
b_f32 = Float32.(cache.b)
end

if m > n
Bc = copy(b_f32)
aa_getrs!('N', A_lu.factors, A_lu.ipiv, Bc; info)
# Convert back to original precision
T = eltype(cache.u)
cache.u .= T.(Bc[1:n])
else
u_f32 = copy(b_f32)
aa_getrs!('N', A_lu.factors, A_lu.ipiv, u_f32; info)
# Convert back to original precision
T = eltype(cache.u)
cache.u .= T.(u_f32)
end

SciMLBase.build_linear_solution(
alg, cache.u, nothing, cache; retcode = ReturnCode.Success)
end