
Commit 12e9941

feat: overload LinearAlgebra.lu (#1297)
* feat: overload LinearAlgebra.lu
* feat: some more overloads
* feat: implement batched LU with Ops.lu
* chore: bump jll
* feat: more overloads
* fix: batch overload
* test: unbatched LU
* fix: batch op implementation
* fix: batch ordering for lu
* fix: ambiguity in 1.10
1 parent a099af7 commit 12e9941
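
In short, `LinearAlgebra.lu` and the corresponding `\` solves now trace through Reactant, including trailing-dimension batches. A minimal usage sketch, mirroring the tests added below (`Reactant.to_rarray` and `@jit` are the standard Reactant entry points):

```julia
using LinearAlgebra, Reactant

solve_with_lu(A, b) = lu(A) \ b  # lowers to Ops.lu plus batched triangular solves

A = rand(Float32, 4, 4, 3, 2)  # trailing dims (3, 2) are batch dims
b = rand(Float32, 4, 3, 2)

A_ra = Reactant.to_rarray(A)
b_ra = Reactant.to_rarray(b)

x = @jit solve_with_lu(A_ra, b_ra)  # one compiled, batched LU solve
```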

File tree

4 files changed: +235 −29 lines changed


Project.toml

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ PythonCall = "0.9"
 Random = "1.10"
 Random123 = "1.7"
 ReactantCore = "0.1.9"
-Reactant_jll = "0.0.185"
+Reactant_jll = "0.0.186"
 ScopedValues = "1.3.0"
 Scratch = "1.2"
 Sockets = "1.10"

src/Ops.jl

Lines changed: 32 additions & 28 deletions
@@ -2797,6 +2797,7 @@ end
 
     # First we permute and make sure the batch dims are at the beginning
     batch_dims = Int64[i for i in 1:N if i ∉ dims]
+    batch_shape = [size(A, i) for i in batch_dims]
     permutation = zeros(Int64, N)
     for (i, d) in enumerate(batch_dims)
         permutation[i] = d
@@ -2805,51 +2806,54 @@ end
         permutation[i + length(batch_dims)] = d
     end
 
-    A = Ops.transpose(A, permutation; location)
+    res = only(batch(f, [Ops.transpose(A, permutation; location)], batch_shape; location))
+    if ndims(res) != length(permutation)
+        res = Ops.reshape(
+            res,
+            vcat(collect(Int64, size(res)), ones(Int64, length(permutation) - ndims(res))),
+        )
+    end
+    return Ops.transpose(res, invperm(permutation); location)
+end
 
-    sample_input = fill(T(0), [size(A, i) for i in (length(batch_dims) + 1):N]; location)
-    # TODO: detect and forbid internal mutations
+@noinline function batch(
+    f::F,
+    inputs::Vector{<:TracedRArray},
+    batch_shape::Vector{Int64};
+    location=mlir_stacktrace("batch", @__FILE__, @__LINE__),
+) where {F}
+    sample_inputs = [
+        fill(
+            unwrapped_eltype(input)(0),
+            [size(input, i) for i in (length(batch_shape) + 1):ndims(input)]...,
+        ) for input in inputs
+    ]
     mlir_fn_res = Reactant.TracedUtils.make_mlir_fn(
         f,
-        (sample_input,),
+        (sample_inputs...,),
         (),
         "unbatched_" * string(f),
         false;
         args_in_result=:none,
         do_transpose=false,
     )
-
     @assert !mlir_fn_res.fnwrapped "Currently we don't support batching closures."
 
     func = mlir_fn_res.f
     @assert MLIR.IR.nregions(func) == 1
 
-    result = only(mlir_fn_res.linear_results)
-    batch_shape = [size(A, i) for i in 1:length(batch_dims)]
-
-    if result isa TracedRArray
-        @assert ndims(result) == ndims(sample_input)
-        output_type = MLIR.IR.TensorType(
-            vcat(batch_shape, collect(Int64, size(result))),
-            MLIR.IR.Type(unwrapped_eltype(result)),
-        )
-    elseif result isa TracedRNumber
-        output_type = MLIR.IR.TensorType(
-            batch_shape, MLIR.IR.Type(unwrapped_eltype(result))
-        )
-    else
-        error("Unsupported result type $(typeof(result))")
-    end
-
-    batched_result = batch([A], [output_type], batch_shape; fn=func, location)[1]
-
-    if result isa TracedRNumber
-        batched_result = Ops.reshape(
-            batched_result, vcat(batch_shape, ones(Int64, ndims(sample_input))); location
+    output_types = MLIR.IR.Type[]
+    for result in mlir_fn_res.linear_results
+        push!(
+            output_types,
+            MLIR.IR.TensorType(
+                vcat(batch_shape, collect(Int64, size(result))),
+                MLIR.IR.Type(unwrapped_eltype(result)),
+            ),
         )
     end
 
-    return Ops.transpose(batched_result, invperm(permutation); location)
+    return batch(inputs, output_types, batch_shape; fn=func, location)
 end
 
 @noinline function batch(
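
The substantive change above: `batch` previously accepted a single `TracedRArray` and assumed exactly one result. The single-array method now just permutes the batch dims to the front and forwards to a new overload that takes a `Vector{<:TracedRArray}` plus an explicit `batch_shape`, traces `f` once on zero-filled sample inputs with the batch dims stripped, and emits one output type per linear result. A hedged sketch of calling the new overload from traced code (the shapes and the `solve_one` body are illustrative, not from this diff; the argument order matches the `Ops.batch` call added in src/stdlibs/LinearAlgebra.jl):

```julia
using LinearAlgebra

# Inputs with batch dims leading, e.g. a (3, 2) batch:
# factors :: (3, 2, 4, 4), B :: (3, 2, 4, 5), perm :: (3, 2, 4)
solve_one(F, X, p) = UpperTriangular(F) \ (UnitLowerTriangular(F) \ X[Int64.(p), :])

# f is traced once on zero-filled samples of the unbatched shapes
# (4, 4), (4, 5), and (4,), then applied across the (3, 2) batch.
outs = Ops.batch(solve_one, [factors, B, perm], Int64[3, 2])
res = only(outs)  # (3, 2, 4, 5): one triangular solve per batch element
```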

src/stdlibs/LinearAlgebra.jl

Lines changed: 150 additions & 0 deletions
@@ -539,4 +539,154 @@ function LinearAlgebra.generic_mattridiv!(
     return C
 end
 
+# Supports batched factorization
+abstract type GeneralizedFactorization{T} <: Factorization{T} end
+
+function LinearAlgebra.TransposeFactorization(f::GeneralizedFactorization)
+    return LinearAlgebra.TransposeFactorization{eltype(f),typeof(f)}(f)
+end
+
+function LinearAlgebra.AdjointFactorization(f::GeneralizedFactorization)
+    return LinearAlgebra.AdjointFactorization{eltype(f),typeof(f)}(f)
+end
+
+const GeneralizedTransposeFactorization{T} =
+    LinearAlgebra.TransposeFactorization{T,<:GeneralizedFactorization{T}} where {T}
+const GeneralizedAdjointFactorization{T} =
+    LinearAlgebra.AdjointFactorization{T,<:GeneralizedFactorization{T}} where {T}
+
+# LU Factorization
+struct GeneralizedLU{T,S<:AbstractArray,P<:AbstractArray,I<:Union{AbstractArray,Number}} <:
+       GeneralizedFactorization{T}
+    factors::S
+    ipiv::P
+    perm::P
+    info::I
+end
+
+Base.ndims(lu::GeneralizedLU) = ndims(lu.factors)
+
+function GeneralizedLU(factors::S, ipiv::P, perm::P, info::I) where {S,P,I}
+    @assert ndims(ipiv) == ndims(perm) == ndims(factors) - 1
+    @assert ndims(info) == ndims(factors) - 2
+    return GeneralizedLU{eltype(factors),S,P,I}(factors, ipiv, perm, info)
+end
+
+## allow > 2 dimensions as inputs
+function LinearAlgebra.lu(A::AnyTracedRArray{T,2}, ::RowMaximum; kwargs...) where {T}
+    return lu!(copy(A), RowMaximum(); kwargs...)
+end
+function LinearAlgebra.lu(
+    A::AnyTracedRArray{T,N}, ::RowMaximum=RowMaximum(); kwargs...
+) where {T,N}
+    return lu!(copy(A), RowMaximum(); kwargs...)
+end
+
+function LinearAlgebra.lu!(A::AnyTracedRArray{T,2}, ::RowMaximum; kwargs...) where {T}
+    return _lu_overload(A, RowMaximum(); kwargs...)
+end
+function LinearAlgebra.lu!(A::AnyTracedRArray{T,N}, ::RowMaximum; kwargs...) where {T,N}
+    return _lu_overload(A, RowMaximum(); kwargs...)
+end
+
+function _lu_overload(
+    A::AnyTracedRArray{T,N}, ::RowMaximum; check::Bool=false, allowsingular::Bool=false
+) where {T,N}
+    # TODO: don't ignore the check and allowsingular flags
+    # Batching here is in the last dimensions. `Ops.lu` expects the last dimensions
+    permdims = vcat(Int64[N - 1, N], collect(Int64, 1:(N - 2)))
+    A = Ops.transpose(materialize_traced_array(A), permdims)
+    factors, ipiv, perm, info = Reactant.Ops.lu(A)
+
+    # Permute back to the original dimensions
+    perm_perm = vcat(N - 1, collect(Int64, 1:(N - 2)))
+    factors = Ops.transpose(factors, invperm(permdims))
+    ipiv = Ops.transpose(ipiv, perm_perm)
+    perm = Ops.transpose(perm, perm_perm)
+    return GeneralizedLU(factors, ipiv, perm, info)
+end
+
+function LinearAlgebra.ldiv!(
+    lu::GeneralizedLU{T,<:AbstractArray{T,N},P,I}, B::AbstractArray{T,M}
+) where {T,P,I,N,M}
+    @assert N == M + 1
+    ldiv!(lu, reshape(B, size(B, 1), 1, size(B)[2:end]...))
+    return B
+end
+
+function LinearAlgebra.ldiv!(
+    lu::GeneralizedLU{T,<:AbstractArray{T,2},P,I}, B::AbstractArray{T,2}
+) where {T,P,I}
+    B .= _lu_solve_core(lu.factors, B, lu.perm)
+    return B
+end
+
+function LinearAlgebra.ldiv!(
+    lu::GeneralizedLU{T,<:AbstractArray{T,N},P,I}, B::AbstractArray{T,N}
+) where {T,P,I,N}
+    batch_shape = size(lu.factors)[3:end]
+    @assert batch_shape == size(B)[3:end]
+
+    permutation = vcat(collect(Int64, 3:N), 1, 2)
+
+    factors = Ops.transpose(materialize_traced_array(lu.factors), permutation)
+    B_permuted = Ops.transpose(materialize_traced_array(B), permutation)
+    perm = Ops.transpose(
+        materialize_traced_array(lu.perm), vcat(collect(Int64, 2:(N - 1)), 1)
+    )
+
+    res = Ops.transpose(
+        only(
+            Ops.batch(
+                _lu_solve_core, [factors, B_permuted, perm], collect(Int64, batch_shape)
+            ),
+        ),
+        invperm(permutation),
+    )
+    B .= res
+    return B
+end
+
+for f_wrapper in (LinearAlgebra.TransposeFactorization, LinearAlgebra.AdjointFactorization),
+    aType in (:AbstractVecOrMat, :AbstractArray)
+
+    @eval function LinearAlgebra.ldiv!(lu::$(f_wrapper){<:Any,<:GeneralizedLU}, B::$aType)
+        # TODO: implement this
+        error("`$(f_wrapper)` is not supported yet for LU.")
+        return nothing
+    end
+end
+
+function _lu_solve_core(factors::AbstractMatrix, B::AbstractMatrix, perm::AbstractVector)
+    permuted_B = B[Int64.(perm), :]
+    return UpperTriangular(factors) \ (UnitLowerTriangular(factors) \ permuted_B)
+end
+
+# Overload \ to support batched factorization
+for T in (
+        :GeneralizedFactorization,
+        :GeneralizedTransposeFactorization,
+        :GeneralizedAdjointFactorization,
+    ),
+    aType in (:AbstractVecOrMat, :AbstractArray)
+
+    @eval Base.:(\)(F::$T, B::$aType) = _overloaded_backslash(F, B)
+end
+
+function _overloaded_backslash(F::GeneralizedFactorization, B::AbstractArray)
+    return ldiv!(
+        F, LinearAlgebra.copy_similar(B, typeof(oneunit(eltype(F)) \ oneunit(eltype(B))))
+    )
+end
+
+function _overloaded_backslash(F::GeneralizedTransposeFactorization, B::AbstractArray)
+    return conj!(adjoint(F.parent) \ conj.(B))
+end
+
+function _overloaded_backslash(F::GeneralizedAdjointFactorization, B::AbstractArray)
+    return ldiv!(
+        F, LinearAlgebra.copy_similar(B, typeof(oneunit(eltype(F)) \ oneunit(eltype(B))))
+    )
+end
+
 end
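
For orientation, here is the shape convention `GeneralizedLU` encodes, as implied by the constructor asserts (`ndims(ipiv) == ndims(perm) == ndims(factors) - 1`, `ndims(info) == ndims(factors) - 2`). The concrete sizes below are a hypothetical illustration, not output captured from this PR:

```julia
using LinearAlgebra, Reactant

# A hypothetical (3, 2) batch of 4×4 systems, factored under tracing/@jit:
F = lu(Reactant.to_rarray(rand(Float32, 4, 4, 3, 2)))

size(F.factors)  # (4, 4, 3, 2): packed L/U factors, batch dims trailing
size(F.ipiv)     # (4, 3, 2):    pivot indices, one per row, per batch element
size(F.perm)     # (4, 3, 2):    materialized row permutation consumed by ldiv!
size(F.info)     # (3, 2):       one success/singularity flag per system
```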

test/integration/linear_algebra.jl

Lines changed: 52 additions & 0 deletions
@@ -319,3 +319,55 @@ end
         end
     end
 end
+
+solve_with_lu(A, b) = lu(A) \ b
+function solve_with_lu_batched(A::AbstractArray{T,N}, B::AbstractArray{T,N}) where {T,N}
+    A2 = reshape(A, size(A, 1), size(A, 2), prod(size(A)[3:end]))
+    B2 = reshape(B, size(B, 1), size(B, 2), prod(size(B)[3:end]))
+    @assert size(A2, 3) == size(B2, 3)
+    return reshape(
+        stack(lu(view(A2, :, :, i)) \ view(B2, :, :, i) for i in axes(A2, 3)),
+        size(A2, 1),
+        size(B2, 2),
+        size(A)[3:end]...,
+    )
+end
+function solve_with_lu_batched(A::AbstractArray{T,N}, b::AbstractArray{T,M}) where {T,N,M}
+    @assert N == M + 1
+    B = reshape(b, size(b, 1), 1, size(b)[2:end]...)
+    return dropdims(solve_with_lu_batched(A, B); dims=2)
+end
+
+@testset "LU Factorization" begin
+    @testset "Un-batched" begin
+        @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
+            A = rand(T, 4, 4)
+            A_ra = Reactant.to_rarray(A)
+
+            b = rand(T, 4)
+            b_ra = Reactant.to_rarray(b)
+
+            B = rand(T, 4, 3)
+            B_ra = Reactant.to_rarray(B)
+
+            @test @jit(solve_with_lu(A_ra, b_ra)) ≈ solve_with_lu(A, b)
+            @test @jit(solve_with_lu(A_ra, B_ra)) ≈ solve_with_lu(A, B)
+        end
+    end
+
+    @testset "Batched" begin
+        @testset for T in (Float32, Float64, ComplexF32, ComplexF64)
+            A = rand(T, 4, 4, 3, 2)
+            A_ra = Reactant.to_rarray(A)
+
+            b = rand(T, 4, 3, 2)
+            b_ra = Reactant.to_rarray(b)
+
+            B = rand(T, 4, 5, 3, 2)
+            B_ra = Reactant.to_rarray(B)
+
+            @test @jit(solve_with_lu(A_ra, b_ra)) ≈ solve_with_lu_batched(A, b)
+            @test @jit(solve_with_lu(A_ra, B_ra)) ≈ solve_with_lu_batched(A, B)
+        end
+    end
+end
