
Commit 6a69113

Resolve merge conflict in MWE script - keep interactive version
- Keep enhanced version with interactive token input from REPL
- Include multi-attempt authentication with GitHub API warmup
- Full reproduction of package's token handling workflow
- Provides comprehensive debugging of MethodError issue

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
2 parents 2ef6f28 + 3683343 commit 6a69113

File tree: 13 files changed, +282 −8 lines

Project.toml

Lines changed: 4 additions & 1 deletion

@@ -1,7 +1,7 @@
 name = "LinearSolve"
 uuid = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 authors = ["SciML"]
-version = "3.24.0"
+version = "3.25.0"

 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -33,6 +33,7 @@ BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
+CUSOLVERRF = "a8cc9031-bad2-4722-94f5-40deabb4245c"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 FastAlmostBandedMatrices = "9d29842c-ecb8-4973-b1e9-a27b1157504e"
 FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
@@ -54,6 +55,7 @@ LinearSolveBandedMatricesExt = "BandedMatrices"
 LinearSolveBlockDiagonalsExt = "BlockDiagonals"
 LinearSolveCUDAExt = "CUDA"
 LinearSolveCUDSSExt = "CUDSS"
+LinearSolveCUSOLVERRFExt = ["CUSOLVERRF", "SparseArrays"]
 LinearSolveEnzymeExt = "EnzymeCore"
 LinearSolveFastAlmostBandedMatricesExt = "FastAlmostBandedMatrices"
 LinearSolveFastLapackInterfaceExt = "FastLapackInterface"
@@ -77,6 +79,7 @@ BlockDiagonals = "0.1.42, 0.2"
 blis_jll = "0.9.0"
 CUDA = "5"
 CUDSS = "0.1, 0.2, 0.3, 0.4"
+CUSOLVERRF = "0.2.6"
 ChainRulesCore = "1.22"
 ConcreteStructs = "0.2.3"
 DocStringExtensions = "0.9.3"

docs/src/solvers/solvers.md

Lines changed: 19 additions & 1 deletion

@@ -43,6 +43,14 @@ For sparse LU-factorizations, `KLUFactorization` if there is less structure
 to the sparsity pattern and `UMFPACKFactorization` if there is more structure.
 Pardiso.jl's methods are also known to be very efficient sparse linear solvers.

+For GPU-accelerated sparse LU-factorizations, there are two high-performance options.
+When using CuSparseMatrixCSR arrays with CUDSS.jl loaded, `LUFactorization()` will
+automatically use NVIDIA's cuDSS library. Alternatively, `CUSOLVERRFFactorization`
+provides access to NVIDIA's cusolverRF library. Both offer significant performance
+improvements for sparse systems on CUDA-capable GPUs and are particularly effective
+for large sparse matrices that benefit from GPU parallelization. `CUDSS` is oriented
+more toward `Float32`, while `CUSOLVERRFFactorization` targets `Float64`.
+
 While these sparse factorizations are based on implementations in other languages,
 and therefore constrained to standard number types (`Float64`, `Float32` and
 their complex counterparts), `SparspakFactorization` is able to handle general
@@ -219,7 +227,7 @@ LinearSolve.PardisoJL

 ### CUDA.jl

-Note that `CuArrays` are supported by `GenericFactorization` in the normal way.
+Note that `CuArrays` are supported by `GenericFactorization` in the "normal" way.
 The following are non-standard GPU factorization routines.

 !!! note
@@ -230,6 +238,16 @@ The following are non-standard GPU factorization routines.
 CudaOffloadFactorization
 ```

+### CUSOLVERRF.jl
+
+!!! note
+
+    Using this solver requires adding the package CUSOLVERRF.jl, i.e. `using CUSOLVERRF`
+
+```@docs
+CUSOLVERRFFactorization
+```
+
 ### IterativeSolvers.jl

 !!! note
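As a quick orientation for the two options this passage introduces, here is a minimal sketch of selecting each one, assuming a CUDA-capable GPU and that CUDSS.jl and CUSOLVERRF.jl are installed (not code from this commit):

```julia
using LinearSolve, LinearAlgebra, SparseArrays
using CUDA, CUDA.CUSPARSE
using CUDSS          # activates the cuDSS path for LUFactorization
using CUSOLVERRF     # activates CUSOLVERRFFactorization

A = sprand(Float64, 1000, 1000, 0.01) + 10.0I   # well-conditioned sparse system
b = rand(1000)

# cuDSS path: GPU CSR matrix + the standard LUFactorization
prob_gpu = LinearProblem(CuSparseMatrixCSR(A), CuArray(b))
sol_cudss = solve(prob_gpu, LUFactorization())

# cusolverRF path: the extension also accepts CPU sparse input
prob = LinearProblem(A, b)
sol_rf = solve(prob, CUSOLVERRFFactorization())
```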

docs/src/tutorials/gpu.md

Lines changed: 24 additions & 0 deletions

@@ -121,6 +121,30 @@ sol = LS.solve(prob, LS.LUFactorization())

 For now, CUDSS only supports CuSparseMatrixCSR type matrices.

+For high-performance sparse LU factorization on GPUs, you can also use CUSOLVERRF.jl:
+
+```julia
+using CUSOLVERRF
+sol = LS.solve(prob, LS.CUSOLVERRFFactorization())
+```
+
+CUSOLVERRF provides access to NVIDIA's cusolverRF library, which offers significant
+performance improvements for sparse LU factorization on GPUs. It supports both
+`:RF` (default) and `:KLU` symbolic factorization methods, and can reuse symbolic
+factorization for matrices with the same sparsity pattern:
+
+```julia
+# Use KLU for symbolic factorization
+sol = LS.solve(prob, LS.CUSOLVERRFFactorization(symbolic = :KLU))
+
+# Reuse symbolic factorization for better performance
+sol = LS.solve(prob, LS.CUSOLVERRFFactorization(reuse_symbolic = true))
+```
+
+!!! note
+
+    CUSOLVERRF only supports `Float64` element types with `Int32` indices.
+
 Note that `KrylovJL` methods also work with sparse GPU arrays:

 ```julia
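The tutorial snippets above assume `prob` and the `LS` alias were defined earlier on the page; a self-contained setup might look like the following (the matrix and sizes here are illustrative, not from the diff):

```julia
import LinearSolve as LS
using LinearAlgebra, SparseArrays
using CUDA, CUDA.CUSPARSE

# Illustrative diagonally dominant sparse system
n = 100
A = spdiagm(-1 => ones(n - 1), 0 => 4.0 * ones(n), 1 => ones(n - 1))
b = rand(n)

# CuSparseMatrixCSR storage so the GPU sparse solvers apply
prob = LS.LinearProblem(CuSparseMatrixCSR(A), CuArray(b))

using CUSOLVERRF
sol = LS.solve(prob, LS.CUSOLVERRFFactorization())
```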

ext/LinearSolveCUSOLVERRFExt.jl

Lines changed: 89 additions & 0 deletions (new file)

@@ -0,0 +1,89 @@
+module LinearSolveCUSOLVERRFExt
+
+using LinearSolve: LinearSolve, @get_cacheval, pattern_changed, OperatorAssumptions
+using CUSOLVERRF: CUSOLVERRF, RFLU, CUDA
+using SparseArrays: SparseArrays, SparseMatrixCSC, nnz
+using CUSOLVERRF.CUDA.CUSPARSE: CuSparseMatrixCSR
+using LinearAlgebra: LinearAlgebra, Adjoint, ldiv!, lu!
+using SciMLBase: SciMLBase, LinearProblem, ReturnCode
+
+function LinearSolve.init_cacheval(alg::LinearSolve.CUSOLVERRFFactorization,
+        A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol,
+        verbose::Bool, assumptions::OperatorAssumptions)
+    nothing
+end
+
+function LinearSolve.init_cacheval(alg::LinearSolve.CUSOLVERRFFactorization,
+        A::Union{CuSparseMatrixCSR{Float64, Int32}, SparseMatrixCSC{Float64, <:Integer}},
+        b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol,
+        verbose::Bool, assumptions::OperatorAssumptions)
+    # Create initial factorization with appropriate options
+    nrhs = b isa AbstractMatrix ? size(b, 2) : 1
+    symbolic = alg.symbolic
+    # Convert to CuSparseMatrixCSR if needed
+    A_gpu = A isa CuSparseMatrixCSR ? A : CuSparseMatrixCSR(A)
+    RFLU(A_gpu; nrhs = nrhs, symbolic = symbolic)
+end
+
+function SciMLBase.solve!(cache::LinearSolve.LinearCache,
+        alg::LinearSolve.CUSOLVERRFFactorization; kwargs...)
+    A = cache.A
+
+    # Convert to appropriate GPU format if needed
+    if A isa SparseMatrixCSC
+        A_gpu = CuSparseMatrixCSR(A)
+    elseif A isa CuSparseMatrixCSR
+        A_gpu = A
+    else
+        error("CUSOLVERRFFactorization only supports SparseMatrixCSC or CuSparseMatrixCSR matrices")
+    end
+
+    if cache.isfresh
+        cacheval = @get_cacheval(cache, :CUSOLVERRFFactorization)
+        if cacheval === nothing
+            # Create new factorization
+            nrhs = cache.b isa AbstractMatrix ? size(cache.b, 2) : 1
+            fact = RFLU(A_gpu; nrhs = nrhs, symbolic = alg.symbolic)
+        else
+            # Reuse symbolic factorization if pattern hasn't changed
+            if alg.reuse_symbolic && !pattern_changed(cacheval, A_gpu)
+                fact = cacheval
+                lu!(fact, A_gpu)
+            else
+                # Create new factorization if pattern changed
+                nrhs = cache.b isa AbstractMatrix ? size(cache.b, 2) : 1
+                fact = RFLU(A_gpu; nrhs = nrhs, symbolic = alg.symbolic)
+            end
+        end
+        cache.cacheval = fact
+        cache.isfresh = false
+    end
+
+    F = @get_cacheval(cache, :CUSOLVERRFFactorization)
+
+    # Ensure b and u are on GPU
+    b_gpu = cache.b isa CUDA.CuArray ? cache.b : CUDA.CuArray(cache.b)
+    u_gpu = cache.u isa CUDA.CuArray ? cache.u : CUDA.CuArray(cache.u)
+
+    # Solve
+    copyto!(u_gpu, b_gpu)
+    ldiv!(F, u_gpu)
+
+    # Copy back to CPU if needed
+    if !(cache.u isa CUDA.CuArray)
+        copyto!(cache.u, u_gpu)
+    end
+
+    SciMLBase.build_linear_solution(alg, cache.u, nothing, cache; retcode = ReturnCode.Success)
+end
+
+# Helper function for pattern checking
+function LinearSolve.pattern_changed(rf::RFLU, A::CuSparseMatrixCSR)
+    # For CUSOLVERRF, we need to check if the sparsity pattern has changed.
+    # This is a simplified check - you might need a more sophisticated approach
+    size(rf) != size(A) || nnz(rf.M) != nnz(A)
+end
+
+end
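To exercise the `reuse_symbolic` branch implemented above, one cache is reused across solves whose matrices share a sparsity pattern. A hedged sketch using the standard SciML `init`/`solve!` workflow (values illustrative, not from this commit):

```julia
using LinearSolve, LinearAlgebra, SparseArrays
using CUSOLVERRF

A = sprand(Float64, 500, 500, 0.02) + 5.0I
b = rand(500)

cache = init(LinearProblem(A, b), CUSOLVERRFFactorization(reuse_symbolic = true))
sol1 = solve!(cache)   # first solve: symbolic + numeric factorization

# Same sparsity pattern, new values: assigning A marks the cache fresh,
# so the branch above refactorizes numerically via lu! without redoing
# the symbolic analysis.
cache.A = 2.0 .* A
sol2 = solve!(cache)
```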

mwe_api_call.jl

Lines changed: 0 additions & 1 deletion

@@ -58,7 +58,6 @@ function test_github_authentication(token::AbstractString)

     return nothing
 end
-
 function mwe_github_api_call()
     println("🧪 MWE: LinearSolveAutotune GitHub API Call")
     println("="^50)

src/LinearSolve.jl

Lines changed: 3 additions & 2 deletions

@@ -211,7 +211,7 @@ for alg in (:LUFactorization, :FastLUFactorization, :SVDFactorization,
     :RFLUFactorization, :UMFPACKFactorization, :KLUFactorization, :SparspakFactorization,
     :DiagonalFactorization, :CholeskyFactorization, :BunchKaufmanFactorization,
     :CHOLMODFactorization, :LDLtFactorization, :AppleAccelerateLUFactorization,
-    :MKLLUFactorization, :MetalLUFactorization)
+    :MKLLUFactorization, :MetalLUFactorization, :CUSOLVERRFFactorization)
     @eval needs_square_A(::$(alg)) = true
 end

@@ -240,7 +240,8 @@ export LUFactorization, SVDFactorization, QRFactorization, GenericFactorization,
     NormalCholeskyFactorization, NormalBunchKaufmanFactorization,
     UMFPACKFactorization, KLUFactorization, FastLUFactorization, FastQRFactorization,
     SparspakFactorization, DiagonalFactorization, CholeskyFactorization,
-    BunchKaufmanFactorization, CHOLMODFactorization, LDLtFactorization
+    BunchKaufmanFactorization, CHOLMODFactorization, LDLtFactorization,
+    CUSOLVERRFFactorization

 export LinearSolveFunction, DirectLdiv!
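The first hunk extends a metaprogramming loop that stamps out one `needs_square_A` trait method per algorithm symbol. A toy sketch of the same idiom (illustrative types, not the package's):

```julia
# Toy reproduction of the trait-stamping pattern extended above.
abstract type AbstractAlg end
struct AlgA <: AbstractAlg end
struct AlgB <: AbstractAlg end

needs_square_A(::AbstractAlg) = false   # generic fallback

for alg in (:AlgA, :AlgB)
    # Interpolating the symbol generates `needs_square_A(::AlgA) = true`, etc.
    @eval needs_square_A(::$(alg)) = true
end

@assert needs_square_A(AlgA()) && needs_square_A(AlgB())
```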

src/appleaccelerate.jl

Lines changed: 18 additions & 0 deletions

@@ -34,6 +34,8 @@ function aa_getrf!(A::AbstractMatrix{<:ComplexF64};
         ipiv = similar(A, Cint, min(size(A, 1), size(A, 2))),
         info = Ref{Cint}(),
         check = false)
+    __appleaccelerate_isavailable() ||
+        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A)
     check && chkfinite(A)
     chkstride1(A)
@@ -54,6 +56,8 @@ function aa_getrf!(A::AbstractMatrix{<:ComplexF32};
         ipiv = similar(A, Cint, min(size(A, 1), size(A, 2))),
         info = Ref{Cint}(),
         check = false)
+    __appleaccelerate_isavailable() ||
+        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A)
     check && chkfinite(A)
     chkstride1(A)
@@ -74,6 +78,8 @@ function aa_getrf!(A::AbstractMatrix{<:Float64};
         ipiv = similar(A, Cint, min(size(A, 1), size(A, 2))),
         info = Ref{Cint}(),
         check = false)
+    __appleaccelerate_isavailable() ||
+        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A)
     check && chkfinite(A)
     chkstride1(A)
@@ -94,6 +100,8 @@ function aa_getrf!(A::AbstractMatrix{<:Float32};
         ipiv = similar(A, Cint, min(size(A, 1), size(A, 2))),
         info = Ref{Cint}(),
         check = false)
+    __appleaccelerate_isavailable() ||
+        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A)
     check && chkfinite(A)
     chkstride1(A)
@@ -116,6 +124,8 @@ function aa_getrs!(trans::AbstractChar,
         ipiv::AbstractVector{Cint},
         B::AbstractVecOrMat{<:ComplexF64};
         info = Ref{Cint}())
+    __appleaccelerate_isavailable() ||
+        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
     chkstride1(A, B, ipiv)
@@ -140,6 +150,8 @@ function aa_getrs!(trans::AbstractChar,
         ipiv::AbstractVector{Cint},
         B::AbstractVecOrMat{<:ComplexF32};
         info = Ref{Cint}())
+    __appleaccelerate_isavailable() ||
+        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
     chkstride1(A, B, ipiv)
@@ -165,6 +177,8 @@ function aa_getrs!(trans::AbstractChar,
         ipiv::AbstractVector{Cint},
         B::AbstractVecOrMat{<:Float64};
         info = Ref{Cint}())
+    __appleaccelerate_isavailable() ||
+        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
     chkstride1(A, B, ipiv)
@@ -190,6 +204,8 @@ function aa_getrs!(trans::AbstractChar,
         ipiv::AbstractVector{Cint},
         B::AbstractVecOrMat{<:Float32};
         info = Ref{Cint}())
+    __appleaccelerate_isavailable() ||
+        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
     chkstride1(A, B, ipiv)
@@ -236,6 +252,8 @@ end

 function SciMLBase.solve!(cache::LinearCache, alg::AppleAccelerateLUFactorization;
         kwargs...)
+    __appleaccelerate_isavailable() ||
+        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
     A = cache.A
     A = convert(AbstractMatrix, A)
     if cache.isfresh
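All nine hunks insert the same two-line availability guard. Purely as an illustration (a hypothetical helper, not part of the commit), it could be consolidated as:

```julia
# Hypothetical consolidation of the repeated guard; behavior identical.
@inline function __assert_appleaccelerate_available()
    __appleaccelerate_isavailable() ||
        error("Error, AppleAccelerate binary is missing but solve is being called. Report this issue")
    nothing
end

# Each aa_getrf!/aa_getrs!/solve! body would then begin with:
# __assert_appleaccelerate_available()
```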

src/default.jl

Lines changed: 15 additions & 3 deletions

@@ -71,17 +71,29 @@ end

 function defaultalg(A::Tridiagonal, b, assump::OperatorAssumptions{Bool})
     if assump.issq
-        DefaultLinearSolver(DefaultAlgorithmChoice.LUFactorization)
+        @static if VERSION >= v"1.11"
+            DirectLdiv!()
+        else
+            DefaultLinearSolver(DefaultAlgorithmChoice.LUFactorization)
+        end
     else
         DefaultLinearSolver(DefaultAlgorithmChoice.QRFactorization)
     end
 end

 function defaultalg(A::SymTridiagonal, b, ::OperatorAssumptions{Bool})
-    DefaultLinearSolver(DefaultAlgorithmChoice.LDLtFactorization)
+    @static if VERSION >= v"1.11"
+        DirectLdiv!()
+    else
+        DefaultLinearSolver(DefaultAlgorithmChoice.LUFactorization)
+    end
 end
 function defaultalg(A::Bidiagonal, b, ::OperatorAssumptions{Bool})
-    DefaultLinearSolver(DefaultAlgorithmChoice.DirectLdiv!)
+    @static if VERSION >= v"1.11"
+        DirectLdiv!()
+    else
+        DefaultLinearSolver(DefaultAlgorithmChoice.LUFactorization)
+    end
 end
 function defaultalg(A::Factorization, b, ::OperatorAssumptions{Bool})
     DefaultLinearSolver(DefaultAlgorithmChoice.DirectLdiv!)
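For context on what this change buys: on Julia 1.11+, returning `DirectLdiv!()` leans on the fast specialized `\` methods that the structured matrix types provide. A minimal sketch (not from the diff):

```julia
using LinearAlgebra

# Tridiagonal (likewise SymTridiagonal/Bidiagonal) has a specialized
# O(n) solve, so no generic dense factorization is needed.
T = Tridiagonal(rand(4), rand(5) .+ 2.0, rand(4))
b = rand(5)
x = T \ b
@assert isapprox(T * x, b; rtol = 1e-8)
```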

src/extension_algs.jl

Lines changed: 33 additions & 0 deletions

@@ -441,3 +441,36 @@ to avoid allocations and automatically offloads to the GPU.
 struct MetalLUFactorization <: AbstractFactorization end

 struct BLISLUFactorization <: AbstractFactorization end
+
+"""
+`CUSOLVERRFFactorization(; symbolic = :RF, reuse_symbolic = true)`
+
+A GPU-accelerated sparse LU factorization using NVIDIA's cusolverRF library.
+This solver is specifically designed for sparse matrices on CUDA GPUs and
+provides high-performance factorization and solve capabilities.
+
+## Keyword Arguments
+
+  - `symbolic`: The symbolic factorization method to use. Options are:
+      - `:RF` (default): Use cusolverRF's built-in symbolic analysis
+      - `:KLU`: Use KLU for symbolic analysis
+  - `reuse_symbolic`: Whether to reuse the symbolic factorization when the
+    sparsity pattern doesn't change (default: `true`)
+
+!!! note
+    This solver requires CUSOLVERRF.jl to be loaded and only supports
+    `Float64` element types with `Int32` indices.
+"""
+struct CUSOLVERRFFactorization <: AbstractSparseFactorization
+    symbolic::Symbol
+    reuse_symbolic::Bool
+
+    function CUSOLVERRFFactorization(; symbolic::Symbol = :RF, reuse_symbolic::Bool = true)
+        ext = Base.get_extension(@__MODULE__, :LinearSolveCUSOLVERRFExt)
+        if ext === nothing
+            error("CUSOLVERRFFactorization requires that CUSOLVERRF.jl is loaded, i.e. `using CUSOLVERRF`")
+        else
+            return new{}(symbolic, reuse_symbolic)
+        end
+    end
+end
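A brief usage sketch of the guarded constructor defined above (the error text is the one from the diff):

```julia
using LinearSolve

# Without the extension loaded, construction itself throws:
#   julia> CUSOLVERRFFactorization()
#   ERROR: CUSOLVERRFFactorization requires that CUSOLVERRF.jl is loaded, ...

using CUSOLVERRF   # triggers loading of LinearSolveCUSOLVERRFExt
alg = CUSOLVERRFFactorization(symbolic = :KLU, reuse_symbolic = true)
```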

test/gpu/Project.toml

Lines changed: 2 additions & 0 deletions

@@ -2,7 +2,9 @@
 BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
+CUSOLVERRF = "a8cc9031-bad2-4722-94f5-40deabb4245c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
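These two new test dependencies suggest a GPU test along the following lines; this is a hypothetical sketch, not a file from this commit, and assumes a CUDA device is present:

```julia
using Test, LinearAlgebra, SparseArrays, StableRNGs
using LinearSolve, CUDA, CUSOLVERRF

@testset "CUSOLVERRFFactorization" begin
    rng = StableRNG(1)
    A = sprand(rng, Float64, 200, 200, 0.05) + 10.0I
    b = rand(rng, 200)
    sol = solve(LinearProblem(A, b), CUSOLVERRFFactorization())
    @test norm(A * sol.u - b) / norm(b) < 1e-8
end
```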
