Commit 7b29b4b
Add 32-bit mixed precision solvers for OpenBLAS and RecursiveFactorization
Adds two new mixed precision LU factorization algorithms that perform the factorization in Float32 precision while maintaining a Float64 interface for improved performance:

- OpenBLAS32MixedLUFactorization: mixed precision solver using OpenBLAS
- RF32MixedLUFactorization: mixed precision solver using RecursiveFactorization.jl

These solvers follow the same pattern as the existing MKL32MixedLUFactorization and AppleAccelerate32MixedLUFactorization implementations, providing:

- ~2x speedup for memory-bandwidth limited problems
- Support for both real and complex matrices
- Automatic precision conversion and management
- Comprehensive test coverage

RF32MixedLUFactorization also supports pivoting options for trading stability against performance.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent b77b167 commit 7b29b4b
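
For context, the docstrings added in this commit show the intended entry points. A minimal usage sketch based on those docstrings (assuming a standard `LinearProblem` setup; not part of the commit itself):

```julia
using LinearSolve, RecursiveFactorization

A = rand(1000, 1000)
b = rand(1000)
prob = LinearProblem(A, b)

# OpenBLAS-backed mixed precision LU (requires the OpenBLAS_jll binary)
sol1 = solve(prob, OpenBLAS32MixedLUFactorization())

# RecursiveFactorization-backed mixed precision LU; pivoting is on by default
sol2 = solve(prob, RF32MixedLUFactorization())
```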

File tree

5 files changed: +308 -11 lines changed

ext/LinearSolveRecursiveFactorizationExt.jl

Lines changed: 81 additions & 1 deletion
```diff
@@ -1,7 +1,8 @@
 module LinearSolveRecursiveFactorizationExt
 
 using LinearSolve: LinearSolve, userecursivefactorization, LinearCache, @get_cacheval,
-                   RFLUFactorization
+                   RFLUFactorization, RF32MixedLUFactorization, default_alias_A,
+                   default_alias_b
 using LinearSolve.LinearAlgebra, LinearSolve.ArrayInterface, RecursiveFactorization
 using SciMLBase: SciMLBase, ReturnCode
 
@@ -30,4 +31,83 @@ function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::RFLUFactorization
     SciMLBase.build_linear_solution(alg, y, nothing, cache; retcode = ReturnCode.Success)
 end
 
+# Mixed precision RecursiveFactorization implementation
+LinearSolve.default_alias_A(::RF32MixedLUFactorization, ::Any, ::Any) = false
+LinearSolve.default_alias_b(::RF32MixedLUFactorization, ::Any, ::Any) = false
+
+const PREALLOCATED_RF32_LU = begin
+    A = rand(Float32, 0, 0)
+    luinst = ArrayInterface.lu_instance(A)
+    (luinst, Vector{LinearAlgebra.BlasInt}(undef, 0))
+end
+
+function LinearSolve.init_cacheval(alg::RF32MixedLUFactorization{P, T}, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::LinearSolve.LinearVerbosity,
+        assumptions::LinearSolve.OperatorAssumptions) where {P, T}
+    # Pre-allocate appropriate 32-bit arrays based on input type
+    if eltype(A) <: Complex
+        A_32 = rand(ComplexF32, 0, 0)
+    else
+        A_32 = rand(Float32, 0, 0)
+    end
+    luinst = ArrayInterface.lu_instance(A_32)
+    (luinst, Vector{LinearAlgebra.BlasInt}(undef, min(size(A)...)))
+end
+
+function SciMLBase.solve!(
+        cache::LinearSolve.LinearCache, alg::RF32MixedLUFactorization{P, T};
+        kwargs...) where {P, T}
+    A = cache.A
+    A = convert(AbstractMatrix, A)
+
+    # Check if we have complex numbers
+    iscomplex = eltype(A) <: Complex
+
+    if cache.isfresh
+        fact, ipiv = LinearSolve.@get_cacheval(cache, :RF32MixedLUFactorization)
+
+        # Convert to appropriate 32-bit type for factorization
+        if iscomplex
+            A_f32 = ComplexF32.(A)
+        else
+            A_f32 = Float32.(A)
+        end
+
+        # Ensure ipiv is the right size
+        if length(ipiv) != min(size(A_f32)...)
+            ipiv = Vector{LinearAlgebra.BlasInt}(undef, min(size(A_f32)...))
+        end
+
+        fact = RecursiveFactorization.lu!(A_f32, ipiv, Val(P), Val(T), check = false)
+        cache.cacheval = (fact, ipiv)
+
+        if !LinearAlgebra.issuccess(fact)
+            return SciMLBase.build_linear_solution(
+                alg, cache.u, nothing, cache; retcode = ReturnCode.Failure)
+        end
+
+        cache.isfresh = false
+    end
+
+    fact, ipiv = LinearSolve.@get_cacheval(cache, :RF32MixedLUFactorization)
+
+    # Convert b to appropriate 32-bit type for solving
+    if iscomplex
+        b_f32 = ComplexF32.(cache.b)
+    else
+        b_f32 = Float32.(cache.b)
+    end
+
+    # Solve in 32-bit precision
+    u_f32 = similar(b_f32)
+    ldiv!(u_f32, fact, b_f32)
+
+    # Convert back to original precision
+    T_orig = eltype(cache.u)
+    cache.u .= T_orig.(u_f32)
+
+    SciMLBase.build_linear_solution(
+        alg, cache.u, nothing, cache; retcode = ReturnCode.Success)
+end
+
 end
```
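
The core of the new `solve!` above is a down-convert / factorize / solve / up-convert round trip. A standalone sketch of that pattern, substituting the stdlib `lu!` for `RecursiveFactorization.lu!` (the function name here is illustrative, not part of the commit):

```julia
using LinearAlgebra

# Factor and solve in Float32, return the result in Float64.
function mixed_precision_solve(A::Matrix{Float64}, b::Vector{Float64})
    A_f32 = Float32.(A)        # one-time O(n^2) conversion
    b_f32 = Float32.(b)
    F = lu!(A_f32)             # the O(n^3) factorization runs at Float32 speed
    u_f32 = F \ b_f32          # triangular solves, still in Float32
    return Float64.(u_f32)     # upcast back to the interface precision
end

A = rand(100, 100) + 10I       # keep the example well conditioned
b = rand(100)
u = mixed_precision_solve(A, b)
norm(A * u - b) / norm(b)      # residual near eps(Float32), not eps(Float64)
```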

src/LinearSolve.jl

Lines changed: 2 additions & 0 deletions
```diff
@@ -472,9 +472,11 @@ export PanuaPardisoFactorize, PanuaPardisoIterate
 export PardisoJL
 export MKLLUFactorization
 export OpenBLASLUFactorization
+export OpenBLAS32MixedLUFactorization
 export MKL32MixedLUFactorization
 export AppleAccelerateLUFactorization
 export AppleAccelerate32MixedLUFactorization
+export RF32MixedLUFactorization
 export MetalLUFactorization
 export MetalOffload32MixedLUFactorization
 
```

src/extension_algs.jl

Lines changed: 74 additions & 0 deletions
````diff
@@ -834,3 +834,77 @@ sol = solve(prob, alg)
 ```
 """
 struct AppleAccelerate32MixedLUFactorization <: AbstractFactorization end
+
+"""
+    OpenBLAS32MixedLUFactorization()
+
+A mixed precision LU factorization using OpenBLAS that performs the factorization in
+Float32 precision while maintaining a Float64 interface. This can provide significant
+speedups for large matrices when reduced precision is acceptable.
+
+## Performance Notes
+- Converts Float64 matrices to Float32 for factorization
+- Uses optimized OpenBLAS routines for the factorization
+- Can be 2x faster than full precision for memory-bandwidth limited problems
+- May have reduced accuracy compared to full Float64 precision
+
+## Requirements
+This solver requires OpenBLAS to be available through OpenBLAS_jll.
+
+## Example
+```julia
+alg = OpenBLAS32MixedLUFactorization()
+sol = solve(prob, alg)
+```
+"""
+struct OpenBLAS32MixedLUFactorization <: AbstractFactorization end
+
+"""
+    RF32MixedLUFactorization{P, T}(; pivot = Val(true), thread = Val(true))
+
+A mixed precision LU factorization using RecursiveFactorization.jl that performs the
+factorization in Float32 precision while maintaining a Float64 interface. This combines
+the speed benefits of RecursiveFactorization.jl with reduced precision computation
+for additional performance gains.
+
+## Type Parameters
+- `P`: Pivoting strategy as `Val{Bool}`. `Val{true}` enables partial pivoting for stability.
+- `T`: Threading strategy as `Val{Bool}`. `Val{true}` enables multi-threading for performance.
+
+## Constructor Arguments
+- `pivot = Val(true)`: Enable partial pivoting. Set to `Val(false)` to disable for speed
+  at the cost of numerical stability.
+- `thread = Val(true)`: Enable multi-threading. Set to `Val(false)` for single-threaded
+  execution.
+
+## Performance Notes
+- Converts Float64 matrices to Float32 for factorization
+- Leverages RecursiveFactorization.jl's optimized blocking strategies
+- Can provide significant speedups for small to medium matrices (< 500×500)
+- May have reduced accuracy compared to full Float64 precision
+
+## Requirements
+Using this solver requires that RecursiveFactorization.jl is loaded: `using RecursiveFactorization`
+
+## Example
+```julia
+using RecursiveFactorization
+# Fast mixed precision with pivoting
+alg1 = RF32MixedLUFactorization()
+# Fastest mixed precision (no pivoting), less stable
+alg2 = RF32MixedLUFactorization(pivot = Val(false))
+```
+"""
+struct RF32MixedLUFactorization{P, T} <: AbstractDenseFactorization
+    function RF32MixedLUFactorization(::Val{P}, ::Val{T}; throwerror = true) where {P, T}
+        if !userecursivefactorization(nothing)
+            throwerror &&
+                error("RF32MixedLUFactorization requires that RecursiveFactorization.jl is loaded, i.e. `using RecursiveFactorization`")
+        end
+        new{P, T}()
+    end
+end
+
+function RF32MixedLUFactorization(; pivot = Val(true), thread = Val(true), throwerror = true)
+    RF32MixedLUFactorization(pivot, thread; throwerror)
+end
````
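
Because pivoting and threading are encoded as the type parameters `P` and `T`, each flag combination is its own concrete type and gets a specialized method. A short sketch of how the keyword constructor maps onto those parameters (assuming RecursiveFactorization.jl is loaded):

```julia
using LinearSolve, RecursiveFactorization

alg_default = RF32MixedLUFactorization()                    # pivoted, threaded
alg_nopivot = RF32MixedLUFactorization(pivot = Val(false))  # faster, less stable
alg_serial  = RF32MixedLUFactorization(thread = Val(false)) # single-threaded

typeof(alg_nopivot)  # RF32MixedLUFactorization{false, true}
```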

src/openblas.jl

Lines changed: 88 additions & 9 deletions
```diff
@@ -44,7 +44,7 @@ function openblas_getrf!(A::AbstractMatrix{<:ComplexF64};
         ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
         info = Ref{BlasInt}(),
         check = false)
-    __openblas_isavailable() || 
+    __openblas_isavailable() ||
         error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A)
     check && chkfinite(A)
@@ -66,7 +66,7 @@ function openblas_getrf!(A::AbstractMatrix{<:ComplexF32};
         ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
         info = Ref{BlasInt}(),
         check = false)
-    __openblas_isavailable() || 
+    __openblas_isavailable() ||
         error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A)
     check && chkfinite(A)
@@ -88,7 +88,7 @@ function openblas_getrf!(A::AbstractMatrix{<:Float64};
         ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
         info = Ref{BlasInt}(),
         check = false)
-    __openblas_isavailable() || 
+    __openblas_isavailable() ||
         error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A)
     check && chkfinite(A)
@@ -110,7 +110,7 @@ function openblas_getrf!(A::AbstractMatrix{<:Float32};
         ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
         info = Ref{BlasInt}(),
         check = false)
-    __openblas_isavailable() || 
+    __openblas_isavailable() ||
         error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A)
     check && chkfinite(A)
@@ -133,7 +133,7 @@ function openblas_getrs!(trans::AbstractChar,
         ipiv::AbstractVector{BlasInt},
         B::AbstractVecOrMat{<:ComplexF64};
         info = Ref{BlasInt}())
-    __openblas_isavailable() || 
+    __openblas_isavailable() ||
         error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
@@ -160,7 +160,7 @@ function openblas_getrs!(trans::AbstractChar,
         ipiv::AbstractVector{BlasInt},
         B::AbstractVecOrMat{<:ComplexF32};
         info = Ref{BlasInt}())
-    __openblas_isavailable() || 
+    __openblas_isavailable() ||
         error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
@@ -187,7 +187,7 @@ function openblas_getrs!(trans::AbstractChar,
         ipiv::AbstractVector{BlasInt},
         B::AbstractVecOrMat{<:Float64};
         info = Ref{BlasInt}())
-    __openblas_isavailable() || 
+    __openblas_isavailable() ||
         error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
@@ -214,7 +214,7 @@ function openblas_getrs!(trans::AbstractChar,
         ipiv::AbstractVector{BlasInt},
         B::AbstractVecOrMat{<:Float32};
         info = Ref{BlasInt}())
-    __openblas_isavailable() || 
+    __openblas_isavailable() ||
         error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
@@ -260,7 +260,7 @@ end
 
 function SciMLBase.solve!(cache::LinearCache, alg::OpenBLASLUFactorization;
         kwargs...)
-    __openblas_isavailable() || 
+    __openblas_isavailable() ||
         error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
     A = cache.A
     A = convert(AbstractMatrix, A)
@@ -292,3 +292,82 @@ function SciMLBase.solve!(cache::LinearCache, alg::OpenBLASLUFactorization;
     SciMLBase.build_linear_solution(
         alg, cache.u, nothing, cache; retcode = ReturnCode.Success)
 end
+
+# Mixed precision OpenBLAS implementation
+default_alias_A(::OpenBLAS32MixedLUFactorization, ::Any, ::Any) = false
+default_alias_b(::OpenBLAS32MixedLUFactorization, ::Any, ::Any) = false
+
+const PREALLOCATED_OPENBLAS32_LU = begin
+    A = rand(Float32, 0, 0)
+    luinst = ArrayInterface.lu_instance(A), Ref{BlasInt}()
+end
+
+function LinearSolve.init_cacheval(alg::OpenBLAS32MixedLUFactorization, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::LinearVerbosity,
+        assumptions::OperatorAssumptions)
+    # Pre-allocate appropriate 32-bit arrays based on input type
+    if eltype(A) <: Complex
+        A_32 = rand(ComplexF32, 0, 0)
+    else
+        A_32 = rand(Float32, 0, 0)
+    end
+    ArrayInterface.lu_instance(A_32), Ref{BlasInt}()
+end
+
+function SciMLBase.solve!(cache::LinearCache, alg::OpenBLAS32MixedLUFactorization;
+        kwargs...)
+    __openblas_isavailable() ||
+        error("Error, OpenBLAS binary is missing but solve is being called. Report this issue")
+    A = cache.A
+    A = convert(AbstractMatrix, A)
+
+    # Check if we have complex numbers
+    iscomplex = eltype(A) <: Complex
+
+    if cache.isfresh
+        cacheval = @get_cacheval(cache, :OpenBLAS32MixedLUFactorization)
+        # Convert to appropriate 32-bit type for factorization
+        if iscomplex
+            A_f32 = ComplexF32.(A)
+        else
+            A_f32 = Float32.(A)
+        end
+        res = openblas_getrf!(A_f32; ipiv = cacheval[1].ipiv, info = cacheval[2])
+        fact = LU(res[1:3]...), res[4]
+        cache.cacheval = fact
+
+        if !LinearAlgebra.issuccess(fact[1])
+            return SciMLBase.build_linear_solution(
+                alg, cache.u, nothing, cache; retcode = ReturnCode.Failure)
+        end
+        cache.isfresh = false
+    end
+
+    A_lu, info = @get_cacheval(cache, :OpenBLAS32MixedLUFactorization)
+    require_one_based_indexing(cache.u, cache.b)
+    m, n = size(A_lu, 1), size(A_lu, 2)
+
+    # Convert b to appropriate 32-bit type for solving
+    if iscomplex
+        b_f32 = ComplexF32.(cache.b)
+    else
+        b_f32 = Float32.(cache.b)
+    end
+
+    if m > n
+        Bc = copy(b_f32)
+        openblas_getrs!('N', A_lu.factors, A_lu.ipiv, Bc; info)
+        # Convert back to original precision
+        T = eltype(cache.u)
+        cache.u .= T.(Bc[1:n])
+    else
+        u_f32 = copy(b_f32)
+        openblas_getrs!('N', A_lu.factors, A_lu.ipiv, u_f32; info)
+        # Convert back to original precision
+        T = eltype(cache.u)
+        cache.u .= T.(u_f32)
+    end
+
+    SciMLBase.build_linear_solution(
+        alg, cache.u, nothing, cache; retcode = ReturnCode.Success)
+end
```
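
Both docstrings carry the caveat that accuracy may be reduced relative to full Float64. The usual rule of thumb is a forward error on the order of `cond(A) * eps(Float32)` rather than `cond(A) * eps(Float64)`. A quick illustration on matrices with a controlled condition number (illustrative only, not part of the commit):

```julia
using LinearAlgebra

n = 200
Q1 = Matrix(qr(randn(n, n)).Q)
Q2 = Matrix(qr(randn(n, n)).Q)
for condA in (1e2, 1e6)
    # Singular values span [1/condA, 1], so cond(A) == condA
    A = Q1 * Diagonal(range(1.0, 1 / condA; length = n)) * Q2
    x = ones(n)
    b = A * x
    x32 = Float64.(Float32.(A) \ Float32.(b))   # mixed precision path
    @show condA, norm(x32 - x) / norm(x)        # error grows with condA * eps(Float32)
end
```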
