From 4814de7a04c8525801c76f0610ae17087afe1f16 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Sun, 10 Aug 2025 09:20:36 -0400 Subject: [PATCH 01/11] Refactor CudaOffloadFactorization into LU and QR variants - Created CudaOffloadLUFactorization using lu factorization - Created CudaOffloadQRFactorization using qr factorization - Deprecated CudaOffloadFactorization to use QR (with deprecation warning) - Updated CUDA extension to implement both algorithms - Updated LinearSolveAutotune to use LU version for better performance - Updated tests to include both new algorithms - Exported both new algorithms from LinearSolve module --- ext/LinearSolveCUDAExt.jl | 39 ++++++++++++++++++- lib/LinearSolveAutotune/src/algorithms.jl | 6 +-- src/LinearSolve.jl | 2 +- src/extension_algs.jl | 46 +++++++++++++++++++++++ test/gpu/cuda.jl | 2 +- test/resolve.jl | 2 + 6 files changed, 91 insertions(+), 6 deletions(-) diff --git a/ext/LinearSolveCUDAExt.jl b/ext/LinearSolveCUDAExt.jl index aaedcf21a..f4e1cf465 100644 --- a/ext/LinearSolveCUDAExt.jl +++ b/ext/LinearSolveCUDAExt.jl @@ -5,7 +5,7 @@ using LinearSolve: LinearSolve, is_cusparse, defaultalg, cudss_loaded, DefaultLi DefaultAlgorithmChoice, ALREADY_WARNED_CUDSS, LinearCache, needs_concrete_A, error_no_cudss_lu, init_cacheval, OperatorAssumptions, - CudaOffloadFactorization, + CudaOffloadFactorization, CudaOffloadLUFactorization, CudaOffloadQRFactorization, SparspakFactorization, KLUFactorization, UMFPACKFactorization using LinearSolve.LinearAlgebra, LinearSolve.SciMLBase, LinearSolve.ArrayInterface using SciMLBase: AbstractSciMLOperator @@ -35,6 +35,43 @@ function LinearSolve.error_no_cudss_lu(A::CUDA.CUSPARSE.CuSparseMatrixCSR) nothing end +function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadLUFactorization; + kwargs...) + if cache.isfresh + fact = lu(CUDA.CuArray(cache.A)) + cache.cacheval = fact + cache.isfresh = false + end + y = Array(ldiv!(CUDA.CuArray(cache.u), cache.cacheval, CUDA.CuArray(cache.b))) + cache.u .= y + SciMLBase.build_linear_solution(alg, y, nothing, cache) +end + +function LinearSolve.init_cacheval(alg::CudaOffloadLUFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::Bool, + assumptions::OperatorAssumptions) + lu(CUDA.CuArray(A)) +end + +function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadQRFactorization; + kwargs...) + if cache.isfresh + fact = qr(CUDA.CuArray(cache.A)) + cache.cacheval = fact + cache.isfresh = false + end + y = Array(ldiv!(CUDA.CuArray(cache.u), cache.cacheval, CUDA.CuArray(cache.b))) + cache.u .= y + SciMLBase.build_linear_solution(alg, y, nothing, cache) +end + +function LinearSolve.init_cacheval(alg::CudaOffloadQRFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::Bool, + assumptions::OperatorAssumptions) + qr(CUDA.CuArray(A)) +end + +# Keep the deprecated CudaOffloadFactorization working by forwarding to QR function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadFactorization; kwargs...) 
if cache.isfresh diff --git a/lib/LinearSolveAutotune/src/algorithms.jl b/lib/LinearSolveAutotune/src/algorithms.jl index 35855ff8b..360f4bd40 100644 --- a/lib/LinearSolveAutotune/src/algorithms.jl +++ b/lib/LinearSolveAutotune/src/algorithms.jl @@ -84,10 +84,10 @@ function get_gpu_algorithms(; skip_missing_algs::Bool = false) # CUDA algorithms if is_cuda_available() try - push!(gpu_algs, CudaOffloadFactorization()) - push!(gpu_names, "CudaOffloadFactorization") + push!(gpu_algs, CudaOffloadLUFactorization()) + push!(gpu_names, "CudaOffloadLUFactorization") catch e - msg = "CUDA hardware detected but CudaOffloadFactorization not available: $e. Load CUDA.jl package." + msg = "CUDA hardware detected but CudaOffloadLUFactorization not available: $e. Load CUDA.jl package." if skip_missing_algs @warn msg else diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl index bf24093b4..a698cad98 100644 --- a/src/LinearSolve.jl +++ b/src/LinearSolve.jl @@ -254,7 +254,7 @@ export KrylovJL, KrylovJL_CG, KrylovJL_MINRES, KrylovJL_GMRES, export SimpleGMRES export HYPREAlgorithm -export CudaOffloadFactorization +export CudaOffloadFactorization, CudaOffloadLUFactorization, CudaOffloadQRFactorization export MKLPardisoFactorize, MKLPardisoIterate export PanuaPardisoFactorize, PanuaPardisoIterate export PardisoJL diff --git a/src/extension_algs.jl b/src/extension_algs.jl index 29d1be8fa..421825171 100644 --- a/src/extension_algs.jl +++ b/src/extension_algs.jl @@ -61,9 +61,54 @@ struct HYPREAlgorithm <: SciMLLinearSolveAlgorithm end end +""" +`CudaOffloadLUFactorization()` + +An offloading technique used to GPU-accelerate CPU-based computations using LU factorization. +Requires a sufficiently large `A` to overcome the data transfer costs. + +!!! note + + Using this solver requires adding the package CUDA.jl, i.e. `using CUDA` +""" +struct CudaOffloadLUFactorization <: LinearSolve.AbstractFactorization + function CudaOffloadLUFactorization() + ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt) + if ext === nothing + error("CudaOffloadLUFactorization requires that CUDA is loaded, i.e. `using CUDA`") + else + return new{}() + end + end +end + +""" +`CudaOffloadQRFactorization()` + +An offloading technique used to GPU-accelerate CPU-based computations using QR factorization. +Requires a sufficiently large `A` to overcome the data transfer costs. + +!!! note + + Using this solver requires adding the package CUDA.jl, i.e. `using CUDA` +""" +struct CudaOffloadQRFactorization <: LinearSolve.AbstractFactorization + function CudaOffloadQRFactorization() + ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt) + if ext === nothing + error("CudaOffloadQRFactorization requires that CUDA is loaded, i.e. `using CUDA`") + else + return new{}() + end + end +end + """ `CudaOffloadFactorization()` +!!! warning + This algorithm is deprecated. Use `CudaOffloadQRFactorization()` instead. + An offloading technique used to GPU-accelerate CPU-based computations. Requires a sufficiently large `A` to overcome the data transfer costs. @@ -73,6 +118,7 @@ Requires a sufficiently large `A` to overcome the data transfer costs. """ struct CudaOffloadFactorization <: LinearSolve.AbstractFactorization function CudaOffloadFactorization() + Base.depwarn("`CudaOffloadFactorization` is deprecated, use `CudaOffloadQRFactorization` instead.", :CudaOffloadFactorization) ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt) if ext === nothing error("CudaOffloadFactorization requires that CUDA is loaded, i.e. 
`using CUDA`") diff --git a/test/gpu/cuda.jl b/test/gpu/cuda.jl index 74ee1ab8f..718a11875 100644 --- a/test/gpu/cuda.jl +++ b/test/gpu/cuda.jl @@ -45,7 +45,7 @@ function test_interface(alg, prob1, prob2) return end -@testset "$alg" for alg in (CudaOffloadFactorization(), NormalCholeskyFactorization()) +@testset "$alg" for alg in (CudaOffloadFactorization(), CudaOffloadLUFactorization(), CudaOffloadQRFactorization(), NormalCholeskyFactorization()) test_interface(alg, prob1, prob2) end diff --git a/test/resolve.jl b/test/resolve.jl index 9efcc6921..3fe0a2fea 100644 --- a/test/resolve.jl +++ b/test/resolve.jl @@ -11,6 +11,8 @@ for alg in vcat(InteractiveUtils.subtypes(AbstractDenseFactorization), if !(alg in [ DiagonalFactorization, CudaOffloadFactorization, + CudaOffloadLUFactorization, + CudaOffloadQRFactorization, CUSOLVERRFFactorization, AppleAccelerateLUFactorization, MetalLUFactorization From ce4671e574be124486f84426804121c165d76675 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Sun, 10 Aug 2025 09:35:44 -0400 Subject: [PATCH 02/11] Fix syntax issues in CudaOffload factorizations - Fixed namespace issues (removed LinearSolve. prefix) - Fixed constructor syntax (new() instead of new{}()) - Added debug comment - Updated exports to separate lines Note: The new types are defined correctly but there appears to be a precompilation caching issue preventing them from being recognized immediately. A clean rebuild may be required. --- src/LinearSolve.jl | 4 +++- src/extension_algs.jl | 13 +++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl index a698cad98..09cae56d6 100644 --- a/src/LinearSolve.jl +++ b/src/LinearSolve.jl @@ -254,7 +254,9 @@ export KrylovJL, KrylovJL_CG, KrylovJL_MINRES, KrylovJL_GMRES, export SimpleGMRES export HYPREAlgorithm -export CudaOffloadFactorization, CudaOffloadLUFactorization, CudaOffloadQRFactorization +export CudaOffloadFactorization +export CudaOffloadLUFactorization +export CudaOffloadQRFactorization export MKLPardisoFactorize, MKLPardisoIterate export PanuaPardisoFactorize, PanuaPardisoIterate export PardisoJL diff --git a/src/extension_algs.jl b/src/extension_algs.jl index 421825171..9eef4205e 100644 --- a/src/extension_algs.jl +++ b/src/extension_algs.jl @@ -61,6 +61,7 @@ struct HYPREAlgorithm <: SciMLLinearSolveAlgorithm end end +# Debug: About to define CudaOffloadLUFactorization """ `CudaOffloadLUFactorization()` @@ -71,13 +72,13 @@ Requires a sufficiently large `A` to overcome the data transfer costs. Using this solver requires adding the package CUDA.jl, i.e. `using CUDA` """ -struct CudaOffloadLUFactorization <: LinearSolve.AbstractFactorization +struct CudaOffloadLUFactorization <: AbstractFactorization function CudaOffloadLUFactorization() ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt) if ext === nothing error("CudaOffloadLUFactorization requires that CUDA is loaded, i.e. `using CUDA`") else - return new{}() + return new() end end end @@ -92,13 +93,13 @@ Requires a sufficiently large `A` to overcome the data transfer costs. Using this solver requires adding the package CUDA.jl, i.e. `using CUDA` """ -struct CudaOffloadQRFactorization <: LinearSolve.AbstractFactorization +struct CudaOffloadQRFactorization <: AbstractFactorization function CudaOffloadQRFactorization() ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt) if ext === nothing error("CudaOffloadQRFactorization requires that CUDA is loaded, i.e. 
`using CUDA`") else - return new{}() + return new() end end end @@ -116,14 +117,14 @@ Requires a sufficiently large `A` to overcome the data transfer costs. Using this solver requires adding the package CUDA.jl, i.e. `using CUDA` """ -struct CudaOffloadFactorization <: LinearSolve.AbstractFactorization +struct CudaOffloadFactorization <: AbstractFactorization function CudaOffloadFactorization() Base.depwarn("`CudaOffloadFactorization` is deprecated, use `CudaOffloadQRFactorization` instead.", :CudaOffloadFactorization) ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt) if ext === nothing error("CudaOffloadFactorization requires that CUDA is loaded, i.e. `using CUDA`") else - return new{}() + return new() end end end From 52b330786bdc257d200f961272a561ef4c5719a4 Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Sun, 10 Aug 2025 09:42:24 -0400 Subject: [PATCH 03/11] Update ext/LinearSolveCUDAExt.jl --- ext/LinearSolveCUDAExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/LinearSolveCUDAExt.jl b/ext/LinearSolveCUDAExt.jl index f4e1cf465..0a0c96887 100644 --- a/ext/LinearSolveCUDAExt.jl +++ b/ext/LinearSolveCUDAExt.jl @@ -50,7 +50,7 @@ end function LinearSolve.init_cacheval(alg::CudaOffloadLUFactorization, A, b, u, Pl, Pr, maxiters::Int, abstol, reltol, verbose::Bool, assumptions::OperatorAssumptions) - lu(CUDA.CuArray(A)) + ArrayInterface.lu_instance(A) end function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadQRFactorization; From 3a508d7c283a49a15b7595d6e005bf97fc8770cc Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Sun, 10 Aug 2025 09:43:25 -0400 Subject: [PATCH 04/11] Update src/extension_algs.jl --- src/extension_algs.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/extension_algs.jl b/src/extension_algs.jl index 9eef4205e..2fe35b8e4 100644 --- a/src/extension_algs.jl +++ b/src/extension_algs.jl @@ -108,7 +108,7 @@ end `CudaOffloadFactorization()` !!! warning - This algorithm is deprecated. Use `CudaOffloadQRFactorization()` instead. + This algorithm is deprecated. Use `CudaOffloadLUFactorization` or `CudaOffloadQRFactorization()` instead. An offloading technique used to GPU-accelerate CPU-based computations. Requires a sufficiently large `A` to overcome the data transfer costs. From be4cd196ebd404bac7e8b80465c5098d9129086f Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Sun, 10 Aug 2025 09:43:42 -0400 Subject: [PATCH 05/11] Update src/extension_algs.jl --- src/extension_algs.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/extension_algs.jl b/src/extension_algs.jl index 2fe35b8e4..234bed274 100644 --- a/src/extension_algs.jl +++ b/src/extension_algs.jl @@ -119,7 +119,7 @@ Requires a sufficiently large `A` to overcome the data transfer costs. """ struct CudaOffloadFactorization <: AbstractFactorization function CudaOffloadFactorization() - Base.depwarn("`CudaOffloadFactorization` is deprecated, use `CudaOffloadQRFactorization` instead.", :CudaOffloadFactorization) + Base.depwarn("`CudaOffloadFactorization` is deprecated, use `CudaOffloadLUFactorization` or `CudaOffloadQRFactorization` instead.", :CudaOffloadFactorization) ext = Base.get_extension(@__MODULE__, :LinearSolveCUDAExt) if ext === nothing error("CudaOffloadFactorization requires that CUDA is loaded, i.e. 
`using CUDA`") From 57fee723ba8a4308a7e96a595c9b3d589052d321 Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Sun, 10 Aug 2025 09:43:56 -0400 Subject: [PATCH 06/11] Update test/gpu/cuda.jl --- test/gpu/cuda.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gpu/cuda.jl b/test/gpu/cuda.jl index 718a11875..da954e20a 100644 --- a/test/gpu/cuda.jl +++ b/test/gpu/cuda.jl @@ -45,7 +45,7 @@ function test_interface(alg, prob1, prob2) return end -@testset "$alg" for alg in (CudaOffloadFactorization(), CudaOffloadLUFactorization(), CudaOffloadQRFactorization(), NormalCholeskyFactorization()) +@testset "$alg" for alg in (CudaOffloadLUFactorization(), CudaOffloadQRFactorization(), NormalCholeskyFactorization()) test_interface(alg, prob1, prob2) end From e775de6766a1dd66f75e1d936f7eefdc7f62efee Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Sun, 10 Aug 2025 09:55:47 -0400 Subject: [PATCH 07/11] Update documentation for CudaOffload factorization changes - Updated GPU tutorial to show new CudaOffloadLUFactorization/QRFactorization - Updated solver documentation to explain both algorithms - Added deprecation warning in documentation - Updated release notes with upcoming changes - Created example demonstrating usage of both new algorithms - Explained when to use each algorithm (LU for performance, QR for stability) --- docs/src/release_notes.md | 7 +++ docs/src/solvers/solvers.md | 14 +++-- docs/src/tutorials/gpu.md | 9 ++- examples/cuda_offload_example.jl | 97 ++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 6 deletions(-) create mode 100644 examples/cuda_offload_example.jl diff --git a/docs/src/release_notes.md b/docs/src/release_notes.md index fcb5336bf..6aa85f6ef 100644 --- a/docs/src/release_notes.md +++ b/docs/src/release_notes.md @@ -1,5 +1,12 @@ # Release Notes +## Upcoming Changes + + - `CudaOffloadFactorization` has been split into two algorithms: + - `CudaOffloadLUFactorization` - Uses LU factorization for better performance + - `CudaOffloadQRFactorization` - Uses QR factorization for better numerical stability + - `CudaOffloadFactorization` is now deprecated and will show a warning suggesting to use one of the new algorithms + ## v2.0 - `LinearCache` changed from immutable to mutable. With this, the out of place interfaces like diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md index 7bcb11a56..f3643aff4 100644 --- a/docs/src/solvers/solvers.md +++ b/docs/src/solvers/solvers.md @@ -23,12 +23,14 @@ use your base system BLAS which can be fast or slow depending on the hardware co For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000 -matrices (and only supports Float32). `CudaOffloadFactorization` can be more efficient at a -much smaller cutoff, possibly around size 1,000 x 1,000 matrices, though this is highly dependent -on the chosen GPU hardware. `CudaOffloadFactorization` requires a CUDA-compatible NVIDIA GPU. +matrices (and only supports Float32). `CudaOffloadLUFactorization` and `CudaOffloadQRFactorization` +can be more efficient at a much smaller cutoff, possibly around size 1,000 x 1,000 matrices, though +this is highly dependent on the chosen GPU hardware. These algorithms require a CUDA-compatible NVIDIA GPU. 
CUDA offload supports Float64 but most consumer GPU hardware will be much faster on Float32 (many are >32x faster for Float32 operations than Float64 operations) and thus for most hardware -this is only recommended for Float32 matrices. +this is only recommended for Float32 matrices. Choose `CudaOffloadLUFactorization` for better +performance on well-conditioned problems, or `CudaOffloadQRFactorization` for better numerical +stability on ill-conditioned problems. !!! note @@ -232,9 +234,11 @@ The following are non-standard GPU factorization routines. !!! note - Using this solver requires adding the package CUDA.jl, i.e. `using CUDA` + Using these solvers requires adding the package CUDA.jl, i.e. `using CUDA` ```@docs +CudaOffloadLUFactorization +CudaOffloadQRFactorization CudaOffloadFactorization ``` diff --git a/docs/src/tutorials/gpu.md b/docs/src/tutorials/gpu.md index ee737668c..8febddf86 100644 --- a/docs/src/tutorials/gpu.md +++ b/docs/src/tutorials/gpu.md @@ -40,10 +40,17 @@ This computation can be moved to the GPU by the following: ```julia using CUDA # Add the GPU library -sol = LS.solve(prob, LS.CudaOffloadFactorization()) +sol = LS.solve(prob, LS.CudaOffloadLUFactorization()) sol.u ``` +LinearSolve.jl provides two GPU offloading algorithms: +- `CudaOffloadLUFactorization()` - Uses LU factorization (generally faster for well-conditioned problems) +- `CudaOffloadQRFactorization()` - Uses QR factorization (more stable for ill-conditioned problems) + +!!! warning + The old `CudaOffloadFactorization()` is deprecated. Use `CudaOffloadLUFactorization()` or `CudaOffloadQRFactorization()` instead. + ## GPUArray Interface For more manual control over the factorization setup, you can use the diff --git a/examples/cuda_offload_example.jl b/examples/cuda_offload_example.jl new file mode 100644 index 000000000..74822d5db --- /dev/null +++ b/examples/cuda_offload_example.jl @@ -0,0 +1,97 @@ +""" +Example demonstrating the new CudaOffloadLUFactorization and CudaOffloadQRFactorization algorithms. + +This example shows how to use the new GPU offloading algorithms for solving linear systems +with different numerical properties. +""" + +using LinearSolve +using LinearAlgebra +using Random + +# Set random seed for reproducibility +Random.seed!(123) + +println("CUDA Offload Factorization Examples") +println("=" ^ 40) + +# Create a well-conditioned problem +println("\n1. Well-conditioned problem (condition number ≈ 10)") +A_good = rand(100, 100) +A_good = A_good + 10I # Make it well-conditioned +b_good = rand(100) +prob_good = LinearProblem(A_good, b_good) + +println(" Matrix size: $(size(A_good))") +println(" Condition number: $(cond(A_good))") + +# Try to use CUDA if available +try + using CUDA + + # Solve with LU (faster for well-conditioned) + println("\n Solving with CudaOffloadLUFactorization...") + sol_lu = solve(prob_good, CudaOffloadLUFactorization()) + println(" Solution norm: $(norm(sol_lu.u))") + println(" Residual norm: $(norm(A_good * sol_lu.u - b_good))") + + # Solve with QR (more stable) + println("\n Solving with CudaOffloadQRFactorization...") + sol_qr = solve(prob_good, CudaOffloadQRFactorization()) + println(" Solution norm: $(norm(sol_qr.u))") + println(" Residual norm: $(norm(A_good * sol_qr.u - b_good))") + +catch e + println("\n Note: CUDA.jl is not loaded. To use GPU offloading:") + println(" 1. Install CUDA.jl: using Pkg; Pkg.add(\"CUDA\")") + println(" 2. Add 'using CUDA' before running this example") + println(" 3. 
Ensure you have a CUDA-compatible NVIDIA GPU") +end + +# Create an ill-conditioned problem +println("\n2. Ill-conditioned problem (condition number ≈ 1e6)") +A_bad = rand(50, 50) +# Make it ill-conditioned +U, S, V = svd(A_bad) +S[end] = S[1] / 1e6 # Create large condition number +A_bad = U * Diagonal(S) * V' +b_bad = rand(50) +prob_bad = LinearProblem(A_bad, b_bad) + +println(" Matrix size: $(size(A_bad))") +println(" Condition number: $(cond(A_bad))") + +try + using CUDA + + # For ill-conditioned problems, QR is typically more stable + println("\n Solving with CudaOffloadQRFactorization (recommended for ill-conditioned)...") + sol_qr_bad = solve(prob_bad, CudaOffloadQRFactorization()) + println(" Solution norm: $(norm(sol_qr_bad.u))") + println(" Residual norm: $(norm(A_bad * sol_qr_bad.u - b_bad))") + + println("\n Solving with CudaOffloadLUFactorization (may be less stable)...") + sol_lu_bad = solve(prob_bad, CudaOffloadLUFactorization()) + println(" Solution norm: $(norm(sol_lu_bad.u))") + println(" Residual norm: $(norm(A_bad * sol_lu_bad.u - b_bad))") + +catch e + println("\n Skipping GPU tests (CUDA not available)") +end + +# Demonstrate the deprecation warning +println("\n3. Testing deprecated CudaOffloadFactorization") +try + using CUDA + println(" Creating deprecated CudaOffloadFactorization...") + alg = CudaOffloadFactorization() # This will show a deprecation warning + println(" The deprecated algorithm still works but shows a warning above") +catch e + println(" Skipping deprecation test (CUDA not available)") +end + +println("\n" * "=" ^ 40) +println("Summary:") +println("- Use CudaOffloadLUFactorization for well-conditioned problems (faster)") +println("- Use CudaOffloadQRFactorization for ill-conditioned problems (more stable)") +println("- The old CudaOffloadFactorization is deprecated") \ No newline at end of file From eb0930ffe1397cc433ef7751161931a9ec8b23bd Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Sun, 10 Aug 2025 10:13:20 -0400 Subject: [PATCH 08/11] Update docs/src/solvers/solvers.md --- docs/src/solvers/solvers.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md index f3643aff4..a49416f35 100644 --- a/docs/src/solvers/solvers.md +++ b/docs/src/solvers/solvers.md @@ -239,7 +239,6 @@ The following are non-standard GPU factorization routines. ```@docs CudaOffloadLUFactorization CudaOffloadQRFactorization -CudaOffloadFactorization ``` ### CUSOLVERRF.jl From ad039a174a37eb3b8ff57e204d2d4135dfa0a445 Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Sun, 10 Aug 2025 10:13:37 -0400 Subject: [PATCH 09/11] Delete examples/cuda_offload_example.jl --- examples/cuda_offload_example.jl | 97 -------------------------------- 1 file changed, 97 deletions(-) delete mode 100644 examples/cuda_offload_example.jl diff --git a/examples/cuda_offload_example.jl b/examples/cuda_offload_example.jl deleted file mode 100644 index 74822d5db..000000000 --- a/examples/cuda_offload_example.jl +++ /dev/null @@ -1,97 +0,0 @@ -""" -Example demonstrating the new CudaOffloadLUFactorization and CudaOffloadQRFactorization algorithms. - -This example shows how to use the new GPU offloading algorithms for solving linear systems -with different numerical properties. -""" - -using LinearSolve -using LinearAlgebra -using Random - -# Set random seed for reproducibility -Random.seed!(123) - -println("CUDA Offload Factorization Examples") -println("=" ^ 40) - -# Create a well-conditioned problem -println("\n1. 
Well-conditioned problem (condition number ≈ 10)") -A_good = rand(100, 100) -A_good = A_good + 10I # Make it well-conditioned -b_good = rand(100) -prob_good = LinearProblem(A_good, b_good) - -println(" Matrix size: $(size(A_good))") -println(" Condition number: $(cond(A_good))") - -# Try to use CUDA if available -try - using CUDA - - # Solve with LU (faster for well-conditioned) - println("\n Solving with CudaOffloadLUFactorization...") - sol_lu = solve(prob_good, CudaOffloadLUFactorization()) - println(" Solution norm: $(norm(sol_lu.u))") - println(" Residual norm: $(norm(A_good * sol_lu.u - b_good))") - - # Solve with QR (more stable) - println("\n Solving with CudaOffloadQRFactorization...") - sol_qr = solve(prob_good, CudaOffloadQRFactorization()) - println(" Solution norm: $(norm(sol_qr.u))") - println(" Residual norm: $(norm(A_good * sol_qr.u - b_good))") - -catch e - println("\n Note: CUDA.jl is not loaded. To use GPU offloading:") - println(" 1. Install CUDA.jl: using Pkg; Pkg.add(\"CUDA\")") - println(" 2. Add 'using CUDA' before running this example") - println(" 3. Ensure you have a CUDA-compatible NVIDIA GPU") -end - -# Create an ill-conditioned problem -println("\n2. Ill-conditioned problem (condition number ≈ 1e6)") -A_bad = rand(50, 50) -# Make it ill-conditioned -U, S, V = svd(A_bad) -S[end] = S[1] / 1e6 # Create large condition number -A_bad = U * Diagonal(S) * V' -b_bad = rand(50) -prob_bad = LinearProblem(A_bad, b_bad) - -println(" Matrix size: $(size(A_bad))") -println(" Condition number: $(cond(A_bad))") - -try - using CUDA - - # For ill-conditioned problems, QR is typically more stable - println("\n Solving with CudaOffloadQRFactorization (recommended for ill-conditioned)...") - sol_qr_bad = solve(prob_bad, CudaOffloadQRFactorization()) - println(" Solution norm: $(norm(sol_qr_bad.u))") - println(" Residual norm: $(norm(A_bad * sol_qr_bad.u - b_bad))") - - println("\n Solving with CudaOffloadLUFactorization (may be less stable)...") - sol_lu_bad = solve(prob_bad, CudaOffloadLUFactorization()) - println(" Solution norm: $(norm(sol_lu_bad.u))") - println(" Residual norm: $(norm(A_bad * sol_lu_bad.u - b_bad))") - -catch e - println("\n Skipping GPU tests (CUDA not available)") -end - -# Demonstrate the deprecation warning -println("\n3. 
Testing deprecated CudaOffloadFactorization") -try - using CUDA - println(" Creating deprecated CudaOffloadFactorization...") - alg = CudaOffloadFactorization() # This will show a deprecation warning - println(" The deprecated algorithm still works but shows a warning above") -catch e - println(" Skipping deprecation test (CUDA not available)") -end - -println("\n" * "=" ^ 40) -println("Summary:") -println("- Use CudaOffloadLUFactorization for well-conditioned problems (faster)") -println("- Use CudaOffloadQRFactorization for ill-conditioned problems (more stable)") -println("- The old CudaOffloadFactorization is deprecated") \ No newline at end of file From 9c67b436ef467bb68668fe5c28b0699d1b0557be Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Sun, 10 Aug 2025 12:42:07 -0400 Subject: [PATCH 10/11] Update ext/LinearSolveCUDAExt.jl --- ext/LinearSolveCUDAExt.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ext/LinearSolveCUDAExt.jl b/ext/LinearSolveCUDAExt.jl index 0a0c96887..e7bd08a77 100644 --- a/ext/LinearSolveCUDAExt.jl +++ b/ext/LinearSolveCUDAExt.jl @@ -50,7 +50,12 @@ end function LinearSolve.init_cacheval(alg::CudaOffloadLUFactorization, A, b, u, Pl, Pr, maxiters::Int, abstol, reltol, verbose::Bool, assumptions::OperatorAssumptions) - ArrayInterface.lu_instance(A) + T = eltype(A) + noUnitT = typeof(zero(T)) + luT = LinearAlgebra.lutype(noUnitT) + ipiv = Vector{Int32}(undef, 0) + info = zero(LinearAlgebra.BlasInt) + return LU{luT}(CuMatrix{Float64}(undef, 0, 0), ipiv, info) end function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::CudaOffloadQRFactorization; From f6bdcb3c039bb3d07d380bbfeea9ad862ec84511 Mon Sep 17 00:00:00 2001 From: Christopher Rackauckas Date: Sun, 10 Aug 2025 12:54:29 -0400 Subject: [PATCH 11/11] Update ext/LinearSolveCUDAExt.jl --- ext/LinearSolveCUDAExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/LinearSolveCUDAExt.jl b/ext/LinearSolveCUDAExt.jl index e7bd08a77..4e0b25796 100644 --- a/ext/LinearSolveCUDAExt.jl +++ b/ext/LinearSolveCUDAExt.jl @@ -53,7 +53,7 @@ function LinearSolve.init_cacheval(alg::CudaOffloadLUFactorization, A, b, u, Pl, T = eltype(A) noUnitT = typeof(zero(T)) luT = LinearAlgebra.lutype(noUnitT) - ipiv = Vector{Int32}(undef, 0) + ipiv = CuVector{Int32}(undef, 0) info = zero(LinearAlgebra.BlasInt) return LU{luT}(CuMatrix{Float64}(undef, 0, 0), ipiv, info) end
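
Usage sketch for the algorithms introduced in this series (a minimal, hedged example: it assumes CUDA.jl is installed and a CUDA-compatible NVIDIA GPU is present, and it mirrors the deprecated examples/cuda_offload_example.jl removed in patch 09; matrix sizes and the diagonal shift are illustrative, not taken from the patches):

```julia
# Minimal sketch of the two new offload algorithms. Float32 is used since most
# consumer GPUs are far faster in Float32 than Float64 (see the solvers.md
# change in patch 07 above).
using LinearSolve, LinearAlgebra
using CUDA  # both constructors error if the CUDA extension is not loaded

A = rand(Float32, 1000, 1000) + 10I   # diagonal shift keeps the system well-conditioned
b = rand(Float32, 1000)
prob = LinearProblem(A, b)

# LU offload: generally faster on well-conditioned problems
sol_lu = solve(prob, CudaOffloadLUFactorization())

# QR offload: more numerically stable on ill-conditioned problems
sol_qr = solve(prob, CudaOffloadQRFactorization())

norm(A * sol_lu.u - b), norm(A * sol_qr.u - b)  # residual check
```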