Rules for LU decomposition of StridedMatrixes (#354)

sethaxen · oxinabox · web-flow · commit e0b165a50450 · 2021-02-08T10:21:06.000-08:00
* Add frule for lu decomposition

* Eliminate allocation

* Add rrule for lu

* Test combinations of Zero and non-Zero

* Test check=false is passed

* Test getproperty LU

* Add rules for inverse of LU

* Increment version number

* Avoid ops new to 1.6

* Efficiency improvements

* Project tangents before use

* Add link to blog post

* Apply suggestions from code review

Co-authored-by: Lyndon White <oxinabox@ucc.asn.au>

* Use check_equal throughout

* Avoid reusing variable name

* Refactor to use cotangent of `factor`

* Add to_vec for LU

* Use frule_test and rrule_test

* Add additional comment

* Don't declare variable name again

* Correctly standardize factor cotangent

* Don't reuse type name

* Obviate explanation

* Don't re-allocate ∂A

Co-authored-by: Lyndon White <oxinabox@ucc.asn.au>
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "ChainRules"
 uuid = "082447d4-558c-5d27-93f4-14fc19e9eca2"
-version = "0.7.49"
+version = "0.7.50"
 
 [deps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
diff --git a/src/rulesets/LinearAlgebra/factorization.jl b/src/rulesets/LinearAlgebra/factorization.jl
@@ -1,6 +1,204 @@
 using LinearAlgebra: checksquare
 using LinearAlgebra.BLAS: gemv, gemv!, gemm!, trsm!, axpy!, ger!
 
+#####
+##### `lu`
+#####
+
+# These rules are necessary because the primals call LAPACK functions
+
+# frule for square matrix was introduced in Eq. 3.6 of
+# de Hoog, F.R., Anderssen, R.S. and Lukas, M.A. (2011)
+# Differentiation of matrix functionals using triangular factorization.
+# Mathematics of Computation, 80 (275). p. 1585.
+# doi: http://doi.org/10.1090/S0025-5718-2011-02451-8
+# for derivations for wide and tall matrices, see
+# https://sethaxen.com/blog/2021/02/differentiating-the-lu-decomposition/
+
+function frule(  # forward rule for in-place `lu!` of a strided matrix; returns (F, ∂F)
+    (_, ΔA), ::typeof(lu!), A::StridedMatrix, pivot::Union{Val{false},Val{true}}; kwargs...
+)
+    F = lu!(A, pivot; kwargs...)  # primal; keyword arguments are forwarded unchanged
+    ∂factors = pivot === Val(true) ? ΔA[F.p, :] : ΔA  # P * ΔA (copy); else ΔA itself, mutated below
+    m, n = size(∂factors)
+    q = min(m, n)
+    if m == n  # square A
+        # minimal allocation computation of
+        # ∂L = L * tril(L \ (P * ΔA) / U, -1)
+        # ∂U = triu(L \ (P * ΔA) / U) * U
+        # ∂factors = ∂L + ∂U
+        L = UnitLowerTriangular(F.factors)
+        U = UpperTriangular(F.factors)
+        rdiv!(∂factors, U)
+        ldiv!(L, ∂factors)  # ∂factors now holds L \ (P * ΔA) / U
+        ∂L = lmul!(L, tril(∂factors, -1))
+        ∂U = rmul!(triu(∂factors), U)
+        ∂factors .= ∂L .+ ∂U
+    elseif m < n  # wide A, system is [P*A1 P*A2] = [L*U1 L*U2]
+        L = UnitLowerTriangular(F.L)
+        U = F.U
+        ldiv!(L, ∂factors)  # L \ (P * ΔA), applied to both column blocks at once
+        @views begin
+            ∂factors1 = ∂factors[:, 1:q]  # first q columns (paired with U1)
+            ∂factors2 = ∂factors[:, (q + 1):end]  # remaining columns (paired with U2)
+            U1 = UpperTriangular(U[:, 1:q])
+            U2 = U[:, (q + 1):end]
+        end
+        rdiv!(∂factors1, U1)
+        ∂L = tril(∂factors1, -1)  # copy, taken before ∂factors1 is overwritten
+        mul!(∂factors2, ∂L, U2, -1, 1)  # ∂factors2 -= ∂L * U2
+        lmul!(L, ∂L)  # ∂L is now L * tril(..., -1)
+        rmul!(triu!(∂factors1), U1)  # ∂factors1 is now triu(...) * U1
+        ∂factors1 .+= ∂L
+    else  # tall A, system is [P1*A; P2*A] = [L1*U; L2*U]
+        L = F.L
+        U = UpperTriangular(F.U)
+        rdiv!(∂factors, U)  # (P * ΔA) / U, applied to both row blocks at once
+        @views begin
+            ∂factors1 = ∂factors[1:q, :]  # first q rows (paired with L1)
+            ∂factors2 = ∂factors[(q + 1):end, :]  # remaining rows (paired with L2)
+            L1 = UnitLowerTriangular(L[1:q, :])
+            L2 = L[(q + 1):end, :]
+        end
+        ldiv!(L1, ∂factors1)
+        ∂U = triu(∂factors1)  # copy, taken before ∂factors1 is overwritten
+        mul!(∂factors2, L2, ∂U, -1, 1)  # ∂factors2 -= L2 * ∂U
+        rmul!(∂U, U)  # ∂U is now triu(...) * U
+        lmul!(L1, tril!(∂factors1, -1))  # ∂factors1 is now L1 * tril(..., -1)
+        ∂factors1 .+= ∂U
+    end
+    ∂F = Composite{typeof(F)}(; factors=∂factors)  # tangent carries only the `factors` field
+    return F, ∂F
+end
+
+function rrule(  # reverse rule for `lu` of a strided matrix; returns (F, lu_pullback)
+    ::typeof(lu), A::StridedMatrix, pivot::Union{Val{false},Val{true}}; kwargs...
+)
+    F = lu(A, pivot; kwargs...)  # primal; keyword arguments are forwarded unchanged
+    function lu_pullback(ΔF::Composite)  # only the `factors` field of ΔF is read
+        Δfactors = ΔF.factors
+        Δfactors isa AbstractZero && return (NO_FIELDS, Δfactors, DoesNotExist())  # zero in, zero out
+        factors = F.factors
+        ∂factors = eltype(A) <: Real ? real(Δfactors) : Δfactors  # real A: keep the real part only
+        ∂A = similar(factors)  # output buffer, filled in place by the branch below
+        m, n = size(A)
+        q = min(m, n)
+        if m == n  # square A
+            # ∂A = P' * (L' \ (tril(L' * ∂L, -1) + triu(∂U * U')) / U')
+            L = UnitLowerTriangular(factors)
+            U = UpperTriangular(factors)
+            ∂U = UpperTriangular(∂factors)
+            tril!(copyto!(∂A, ∂factors), -1)  # ∂A starts as the strictly lower part of ∂factors
+            lmul!(L', ∂A)
+            copyto!(UpperTriangular(∂A), UpperTriangular(∂U * U'))  # overwrite upper triangle
+            rdiv!(∂A, U')
+            ldiv!(L', ∂A)
+        elseif m < n  # wide A, system is [P*A1 P*A2] = [L*U1 L*U2]
+            triu!(copyto!(∂A, ∂factors))  # ∂A starts as the upper part of ∂factors
+            @views begin
+                factors1 = factors[:, 1:q]
+                U2 = factors[:, (q + 1):end]
+                ∂A1 = ∂A[:, 1:q]
+                ∂A2 = ∂A[:, (q + 1):end]
+                ∂L = tril(∂factors[:, 1:q], -1)  # copy, reused as a scratch buffer below
+            end
+            L = UnitLowerTriangular(factors1)
+            U1 = UpperTriangular(factors1)
+            triu!(rmul!(∂A1, U1'))
+            ∂A1 .+= tril!(mul!(lmul!(L', ∂L), ∂A2, U2', -1, 1), -1)  # tril(L'∂L - ∂A2*U2', -1)
+            rdiv!(∂A1, U1')
+            ldiv!(L', ∂A)
+        else  # tall A, system is [P1*A; P2*A] = [L1*U; L2*U]
+            tril!(copyto!(∂A, ∂factors), -1)  # ∂A starts as the strictly lower part of ∂factors
+            @views begin
+                factors1 = factors[1:q, :]
+                L2 = factors[(q + 1):end, :]
+                ∂A1 = ∂A[1:q, :]
+                ∂A2 = ∂A[(q + 1):end, :]
+                ∂U = triu(∂factors[1:q, :])  # copy, reused as a scratch buffer below
+            end
+            U = UpperTriangular(factors1)
+            L1 = UnitLowerTriangular(factors1)
+            tril!(lmul!(L1', ∂A1), -1)
+            ∂A1 .+= triu!(mul!(rmul!(∂U, U'), L2', ∂A2, -1, 1))  # triu(∂U*U' - L2'*∂A2)
+            ldiv!(L1', ∂A1)
+            rdiv!(∂A, U')
+        end
+        if pivot === Val(true)
+            ∂A = ∂A[invperm(F.p), :]  # apply P': undo the row permutation
+        end
+        return NO_FIELDS, ∂A, DoesNotExist()  # third slot: nothing is propagated for `pivot`
+    end
+    return F, lu_pullback
+end
+
+#####
+##### functions of `LU`
+#####
+
+# this rrule is necessary because the primal mutates
+
+function rrule(::typeof(getproperty), F::TF, x::Symbol) where {T,TF<:LU{T,<:StridedMatrix{T}}}
+    function getproperty_LU_pullback(ΔY)  # maps ΔY to a cotangent for the `factors` field
+        # properties other than `L`, `U` and `factors` receive no derivative
+        x in (:L, :U, :factors) || return (NO_FIELDS, DoesNotExist(), DoesNotExist())
+        if x === :factors
+            Δfactors = Matrix(ΔY)
+        else
+            nrows, ncols = size(F.factors)
+            Δfactors = if x === :L
+                # append max(0, ncols - nrows) zero columns, keep the strict lower triangle
+                tril!(hcat(ΔY, zeros(eltype(ΔY), nrows, max(0, ncols - nrows))), -1)
+            else
+                # append max(0, nrows - ncols) zero rows, keep the upper triangle
+                triu!(vcat(ΔY, zeros(eltype(ΔY), max(0, nrows - ncols), ncols)))
+            end
+        end
+        return NO_FIELDS, Composite{TF}(; factors=Δfactors), DoesNotExist()
+    end
+    return getproperty(F, x), getproperty_LU_pullback
+end
+
+# these rules are needed because the primal calls a LAPACK function
+
+function frule((_, ΔF), ::typeof(LinearAlgebra.inv!), F::LU{<:Any,<:StridedMatrix})
+    # tangent of the inverse is -(U \ (L \ ∂L + ∂U / U) / L) * P; the primal runs last
+    # because `Lfac`/`Ufac` wrap `F.factors`, which `inv!` presumably overwrites
+    Δfactors = ΔF.factors
+    nrows, ncols = size(F.factors)  # equal whenever the primal does not error
+    k = min(nrows, ncols)
+    Lfac = UnitLowerTriangular(F.factors)
+    Ufac = UpperTriangular(F.factors)
+    acc = tril(nrows ≥ ncols ? Δfactors : view(Δfactors, :, 1:k), -1)  # ∂L
+    ∂U = triu(nrows ≤ ncols ? Δfactors : view(Δfactors, 1:k, :))
+    ldiv!(Lfac, acc)  # L \ ∂L
+    acc .+= rdiv!(∂U, Ufac)  # ... + ∂U / U
+    rmul!(rdiv!(ldiv!(Ufac, acc), Lfac), -1)  # -(U \ ... / L)
+    primal = LinearAlgebra.inv!(F)
+    return primal, acc[:, invperm(F.p)]  # right-multiplication by P
+end
+
+function rrule(::typeof(inv), F::LU{<:Any,<:StridedMatrix})
+    Y = inv(F)
+    function inv_LU_pullback(ΔY)
+        # the primal would have thrown unless `F.factors` is square
+        Ladj = UnitLowerTriangular(F.factors)'
+        Uadj = UpperTriangular(F.factors)'
+        # with G = -(U' \ ΔY * P' / L'), the cotangent of `factors` is
+        # tril(L' \ G, -1) + triu(G / U'), accumulated in the buffer `G`
+        G = ΔY[:, F.p]  # ΔY * P'
+        ldiv!(Uadj, G)
+        rdiv!(G, Ladj)
+        rmul!(G, -1)
+        strict_lower = tril!(Ladj \ G, -1)  # ∂L, computed before G is overwritten
+        rdiv!(G, Uadj)
+        triu!(G)  # ∂U
+        G .+= strict_lower
+        return NO_FIELDS, Composite{typeof(F)}(; factors=G)
+    end
+    return Y, inv_LU_pullback
+end
+
 #####
 ##### `svd`
 #####
diff --git a/test/rulesets/LinearAlgebra/factorization.jl b/test/rulesets/LinearAlgebra/factorization.jl
@@ -1,3 +1,12 @@
+# TODO: move this to FiniteDifferences
+function FiniteDifferences.to_vec(X::LU)
+    # flatten only `factors`; `ipiv` and `info` are carried over unchanged
+    factors_vec, factors_from_vec = to_vec(Matrix(X.factors))
+    # inverse map: rebuild an `LU` around the un-flattened factors
+    LU_from_vec(v) = LU(factors_from_vec(v), X.ipiv, X.info)
+    return factors_vec, LU_from_vec
+end
+
 function FiniteDifferences.to_vec(C::Cholesky)
     C_vec, factors_from_vec = to_vec(C.factors)
     function cholesky_from_vec(v)
@@ -12,6 +21,86 @@ function FiniteDifferences.to_vec(x::Val)
 end
 
 @testset "Factorizations" begin
+    @testset "lu decomposition" begin
+        n = 10  # column count; with m in (7, 10, 13) the cases are m < n, m == n and m > n
+        @testset "lu! frule" begin
+            @testset "lu!(A::Matrix{$T}, $pivot) for size(A)=($m, $n)" for
+                T in (Float64, ComplexF64),
+                pivot in (Val(true), Val(false)),
+                m in (7, 10, 13)
+
+                A = randn(T, m, n)
+                ΔA = rand_tangent(A)
+                frule_test(lu!, (A, ΔA), (pivot, nothing))  # NOTE(review): `nothing` presumably marks `pivot` as non-differentiable — confirm
+            end
+            @testset "check=false passed to primal function" begin
+                Asingular = zeros(n, n)  # all-zero input; the default call below must throw
+                ΔAsingular = rand_tangent(Asingular)
+                @test_throws SingularException frule(
+                    (Zero(), copy(ΔAsingular)), lu!, copy(Asingular), Val(true)
+                )
+                frule((Zero(), ΔAsingular), lu!, Asingular, Val(true); check=false)  # must not throw
+            end
+        end
+        @testset "lu rrule" begin
+            @testset "lu(A::Matrix{$T}, $pivot) for size(A)=($m, $n)" for
+                T in (Float64, ComplexF64),
+                pivot in (Val(true), Val(false)),
+                m in (7, 10, 13)
+
+                A = randn(T, m, n)
+                ΔA = rand_tangent(A)
+                F = lu(A, pivot)
+                Δfactors = rand_tangent(F.factors)
+                ΔF = Composite{typeof(F)}(; factors=Δfactors)  # output cotangent sets only `factors`
+                rrule_test(lu, ΔF, (A, ΔA), (pivot, nothing))
+            end
+            @testset "check=false passed to primal function" begin
+                Asingular = zeros(n, n)  # all-zero input; the default call below must throw
+                F = lu(Asingular, Val(true); check=false)
+                ΔF = Composite{typeof(F)}(; U=rand_tangent(F.U), L=rand_tangent(F.L))  # NOTE(review): sets `U`/`L`, not `factors` — confirm intended
+                @test_throws SingularException rrule(lu, Asingular, Val(true))
+                _, back = rrule(lu, Asingular, Val(true); check=false)
+                back(ΔF)  # only checks that the pullback runs without throwing
+            end
+        end
+        @testset "LU" begin
+            @testset "getproperty(::LU, k) rrule" begin
+                # test that the getproperty rrule composes correctly with the lu rrule
+                @testset "getproperty(lu(A::Matrix), :$k) for size(A)=($m, $n)" for
+                    k in (:U, :L, :factors),
+                    m in (7, 10, 13)
+
+                    A = randn(m, n)
+                    F = lu(A)
+                    X = getproperty(F, k)
+                    ΔF = Composite{typeof(F)}(; factors=rand_tangent(F.factors))
+                    ΔX = rand_tangent(X)  # cotangent shaped like the requested property
+                    rrule_test(getproperty, ΔX, (F, ΔF), (k, nothing); check_inferred=false)  # inference check skipped
+                end
+            end
+            @testset "matrix inverse using LU" begin
+                @testset "LinearAlgebra.inv!(::LU) frule" begin
+                    @testset "inv!(lu(::LU{$T,<:StridedMatrix}))" for T in (Float64,ComplexF64)
+                        A = randn(T, n, n)  # square, as `inv!` requires
+                        F = lu(A, Val(true))
+                        ΔF = Composite{typeof(F)}(; factors=rand_tangent(F.factors))
+                        frule_test(LinearAlgebra.inv!, (F, ΔF))
+                    end
+                end
+                @testset "inv(::LU) rrule" begin
+                    @testset "inv(::LU{$T,<:StridedMatrix})" for T in (Float64,ComplexF64)
+                        A = randn(T, n, n)  # square, as `inv` requires
+                        F = lu(A, Val(true))
+                        Y = inv(A)  # used only to draw a cotangent ΔY below
+                        ΔF = Composite{typeof(F)}(; factors=rand_tangent(F.factors))
+                        ΔY = rand_tangent(Y)
+                        rrule_test(inv, ΔY, (F, ΔF))
+                    end
+                end
+            end
+        end
+    end
     @testset "svd" begin
         for n in [4, 6, 10], m in [3, 5, 10]
             X = randn(n, m)