
Commit 78e0ee1

Allocation-free jtprod for Burer-Monteiro with low-rank constraints (#59)
* Allocation-free jtprod for Burer-Monteiro with low-rank constraints
* Fix format
* Fix allocation
* Fix
* Fix
* Add allocation tests
* Fix allocations
* Fix
* Update tests
* Fix test
* Fix format
* Fix
* Going on the wrong direction
* Revert "Going on the wrong direction" (this reverts commit b8ba18d)
* Reapply "Going on the wrong direction" (this reverts commit 5cbba7e)
* Fixes
* Fix format
* Update holy
* Fix format
* Much simpler solution
* Update tests
* update
* Fix format
* Fix format
* Add test
* Fix
* Fix format
* Update benchmarks
1 parent e5b0e1c commit 78e0ee1

10 files changed, +264 -41 lines changed

perf/holy.jl

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+using Dualization
+import SDPLRPlus
+include(joinpath(dirname(@__DIR__), "examples", "holy_model.jl"))
+n = 30
+A = data(n)
+lr = holy_lowrank(A)
+set_optimizer(lr, dual_optimizer(LRO.Optimizer))
+set_attribute(lr, "solver", LRO.BurerMonteiro.Solver)
+set_attribute(lr, "sub_solver", SDPLRPlus.Solver)
+set_attribute(lr, "ranks", [15])
+set_attribute(lr, "maxmajoriter", 0)
+set_attribute(lr, "square_scalars", true)
+optimize!(lr)
+
+solver = unsafe_backend(lr).dual_problem.dual_model.model.optimizer.solver;
+aux = solver.model;
+var = solver.solver.var;
+
+using BenchmarkTools
+import NLPModels
+const BM = LRO.BurerMonteiro
+
+function _jtprod(model, var)
+    C = LRO._mul_to!(buffer, B', LRO.right_factor(A))
+    C = LRO._rmul_diag!!(C, A.scaling)
+    lA = LRO.left_factor(A)
+    println("_add_mul!")
+    @btime LRO._add_mul!($res, $lA, $C', $α)
+end
+
+function jtprod(model, var)
+    x = var.Rt
+    y = view(var.y, 1:model.meta.ncon)
+    Jtv = var.Gt
+    println("jtprod!")
+    @btime NLPModels.jtprod!($model, $x, $y, $Jtv)
+
+    X = BM.Solution(x, model.dim)
+    JtV = BM.Solution(Jtv, model.dim)
+    println("Scalar jtprod!")
+    @btime BM.jtprod!(
+        $model,
+        $X,
+        $y,
+        LRO.left_factor($JtV, LRO.ScalarIndex),
+        LRO.ScalarIndex,
+    )
+
+    i = LRO.MatrixIndex(1)
+    println("Matrix jtprod!")
+    @btime BM.add_jtprod!($model, $X[$i], $y, $JtV[$i], $i)
+
+    j = 1
+    res = JtV[i].factor
+    A = LRO.jac(model.model, j, i)
+    B = X[i].factor
+    α = 2y[j]
+    buffer = model.jtprod_buffer[]
+    println("buffered_mul!")
+    @btime LRO.buffered_mul!($res, $A, $B, $α, true, $buffer)
+end
+
+jtprod(aux, var)
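
Aside on the benchmark style: the `$` interpolation in each `@btime` call keeps BenchmarkTools from timing global-variable access, and each call isolates one layer of the `jtprod!` chain (full product, scalar block, one matrix block, a single `buffered_mul!`). A minimal sketch (helper name hypothetical) of the zero-allocation property these timings are checking for, using Base's `@allocated`:

    # Hypothetical helper: call once to compile, then require that a
    # second call allocates zero bytes, as an allocation-free jtprod! should.
    function assert_alloc_free(f, args...)
        f(args...)                     # warm-up call triggers compilation
        bytes = @allocated f(args...)  # measure the post-compilation call
        bytes == 0 || error("expected 0 bytes, got $bytes")
        return nothing
    end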

perf/maxcut.jl

Lines changed: 7 additions & 2 deletions
@@ -42,7 +42,12 @@ function bench_rmul(A)
     x = rand(n, 1)
     y = similar(x)
     println("rmul")
-    @btime LinearAlgebra.mul!($y, $A, $x, 2.0, 1.0)
+    if A.factor isa AbstractVector
+        buffer = zeros(1)
+    else
+        buffer = zeros(1, LRO.max_rank(A))
+    end
+    @btime LRO.buffered_mul!($y, $A, $x, 2.0, 1.0, $buffer)
 end

 function bench(aux, var)
@@ -71,6 +76,7 @@ function bench_plus(args...; kws...)
 end

 function bench_lro(args...; vector, kws...)
+    println("vector ? $vector")
     model = maxcut(weights(args...; kws...), dual_optimizer(LRO.Optimizer); vector)
     set_attribute(model, "solver", LRO.BurerMonteiro.Solver)
     set_attribute(model, "sub_solver", SDPLRPlus.Solver)
@@ -90,4 +96,3 @@ p = 0.1
 bench_plus(n; p)
 bench_lro(n; p, vector = true)
 bench_lro(n; p, vector = false)
-bench_lro(n; p, vector = false)
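
The two buffer shapes above mirror the intermediate product in `buffered_mul!`: with `x` having a single column, `x' * right_factor(A)` is `1 × max_rank(A)` when the factor is a matrix, and a single entry when the factor is a vector (rank one). The sizing rule as a standalone sketch (helper name hypothetical), for `B` with `k` columns:

    # Hypothetical helper: scratch space for `B' * right_factor(A)`
    # when `B` has `k` columns; the benchmark above uses k = 1.
    rmul_buffer(A, k::Int) =
        A.factor isa AbstractVector ? zeros(k) : zeros(k, LRO.max_rank(A))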

perf/set_dot.jl

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,7 @@
 include(joinpath(dirname(@__DIR__), "examples", "maxcut.jl"))

+using BenchmarkTools
+
 # Important for Dualization
 function bench_set_dot(n)
     T = Float64

src/BurerMonteiro/model.jl

Lines changed: 66 additions & 14 deletions
@@ -1,15 +1,18 @@
-mutable struct Model{S,T,CT,AT} <: NLPModels.AbstractNLPModel{T,Vector{T}}
+mutable struct Model{S,T,CT,AT,JTB} <: NLPModels.AbstractNLPModel{T,Vector{T}}
     model::LRO.Model{T,CT,AT}
     dim::Dimensions{S}
     meta::NLPModels.NLPModelMeta{T,Vector{T}}
     counters::NLPModels.Counters
+    jtprod_buffer::JTB
     function Model{S}(model::LRO.Model{T,CT,AT}, ranks) where {S,T,CT,AT}
         dim = Dimensions{S}(model, ranks)
-        return new{S,T,CT,AT}(
+        jtprod_buffer = buffer_for_jtprod(model, dim)
+        return new{S,T,CT,AT,typeof(jtprod_buffer)}(
             model,
             dim,
             meta(dim, LRO.cons_constant(model)),
             NLPModels.Counters(),
+            jtprod_buffer,
         )
     end
 end
@@ -42,6 +45,7 @@ function set_rank!(model::Model, i::LRO.MatrixIndex, r)
     set_rank!(model.dim, i, r)
     # `nvar` has changed so we need to reset `model.meta`
     model.meta = meta(model.dim, model.meta.lcon)
+    model.jtprod_buffer = buffer_for_jtprod(model.model, model.dim)
     return
 end

@@ -71,7 +75,8 @@ function grad!(
     i::LRO.MatrixIndex,
 )
     C = LRO.grad(model.model, i)
-    LinearAlgebra.mul!(G.factor, C, X.factor)
+    buffer = _buffer(model.jtprod_buffer[i.value], C, X.factor)
+    LRO.buffered_mul!(G.factor, C, X.factor, true, false, buffer)
     G.factor .*= 2
     return
 end
@@ -152,16 +157,64 @@ function jtprod!(
     return JtV
 end

+const _RankOne{T} = LRO.AbstractFactorization{T,<:AbstractVector{T}}
+const _LowRank{T} = LRO.AbstractFactorization{T,<:AbstractMatrix{T}}
+
+function buffer_for_jtprod(
+    model::LRO.Model{T},
+    dim::Dimensions,
+    i::LRO.MatrixIndex,
+) where {T}
+    row = view(model.A, i.value, :)
+    C = model.C[i.value]
+    if any(A -> A isa _LowRank, row) || C isa _LowRank
+        ncols = maximum(row; init = 0) do A
+            if A isa _LowRank
+                return LRO.max_rank(A)
+            else
+                return 0
+            end
+        end
+        if C isa _LowRank
+            ncols = max(ncols, LRO.max_rank(C))
+        end
+        return zeros(T, dim.ranks[i.value], ncols)
+    elseif any(A -> A isa _RankOne, row) || C isa _RankOne
+        return zeros(T, dim.ranks[i.value])
+    end
+    return
+end
+
+function buffer_for_jtprod(model::LRO.Model, dim::Dimensions)
+    return buffer_for_jtprod.(model, dim, LRO.matrix_indices(model))
+end
+
+_buffer(_, ::AbstractMatrix, _) = nothing
+_buffer(buffer::AbstractVector, ::_RankOne, ::AbstractMatrix) = buffer
+function _buffer(buffer::AbstractMatrix, A::_LowRank, ::AbstractMatrix)
+    # FIXME with this if-else, we return a small Union but the compiler
+    # seems to forget about this Union and allocates later
+    #if size(buffer, 2) == LRO.max_rank(A)
+    #    buffer
+    #else
+    # Using this `view` instead of `buffer`, `AllocCheck` now
+    # sees possible allocations but `@allocated` sees none
+    return view(buffer, :, Base.OneTo(LRO.max_rank(A)))
+    #end
+end
+
 function add_jtprod!(
     model::Model,
     X::LRO.Factorization,
     y::AbstractVector,
     JtV::LRO.Factorization,
     i::LRO.MatrixIndex,
+    α = 2,
 )
     for j in eachindex(y)
         A = LRO.jac(model.model, j, i)
-        LinearAlgebra.mul!(JtV.factor, A, X.factor, 2y[j], true)
+        buffer = _buffer(model.jtprod_buffer[i.value], A, X.factor)
+        LRO.buffered_mul!(JtV.factor, A, X.factor, α * y[j], true, buffer)
     end
 end

@@ -185,8 +238,10 @@ function NLPModels.jtprod!(
     X = Solution(x, model.dim)
     JtV = Solution(Jtv, model.dim)
     jtprod!(model, X, y, LRO.left_factor(JtV, LRO.ScalarIndex), LRO.ScalarIndex)
-    for i in LRO.matrix_indices(model.model)
-        jtprod!(model, X[i], y, JtV[i], i)
+    for i::LRO.MatrixIndex in LRO.matrix_indices(model.model)
+        Xi = X[i]
+        JtVi = JtV[i]
+        jtprod!(model, Xi, y, JtVi, i)
     end
     return Jtv
 end
@@ -248,14 +303,11 @@ function NLPModels.hprod!(
         obj_weight,
     )
     for i in LRO.matrix_indices(model.model)
-        Vi = V[i].factor
-        C = LRO.grad(model.model, i)
-        Hvi = HV[i].factor
-        LinearAlgebra.mul!(Hvi, C, Vi, 2obj_weight, false)
-        for j in 1:model.meta.ncon
-            A = LRO.jac(model.model, j, i)
-            LinearAlgebra.mul!(Hvi, A, Vi, -2y[j], true)
-        end
+        Vi = V[i]
+        Hvi = HV[i]
+        grad!(model, Vi, Hvi, i)
+        Hvi.factor .*= obj_weight
+        add_jtprod!(model, Vi, y, Hvi, i, -2)
     end
     return Hv
 end
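
Why `buffer_for_jtprod` allocates a `dim.ranks[i] × ncols` buffer: inside `buffered_mul!` the buffer holds `X.factor' * right_factor(A)`, where `X.factor` is `n × r` with `r = dim.ranks[i]` and each low-rank `A` has a right factor of size `n × rank(A)`, so a single `r × max-rank` buffer, with `_buffer` carving out a column view per constraint, covers every `A` in the row. A shape check under assumed sizes:

    using LinearAlgebra

    # Assumed sizes for illustration: n = 100, iterate rank r = 15,
    # largest constraint rank 4, one particular constraint of rank 3.
    n, r, rmax = 100, 15, 4
    buffer = zeros(r, rmax)    # what buffer_for_jtprod would allocate
    Xf = randn(n, r)           # plays the role of X.factor
    V = randn(n, 3)            # right factor of the rank-3 constraint
    C = view(buffer, :, 1:3)   # what _buffer carves out for it
    mul!(C, Xf', V)            # fills the view; no fresh array needed
    @assert size(C) == (r, 3)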

src/BurerMonteiro/solution.jl

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,8 @@ function Dimensions{S}(model::LRO.Model, ranks) where {S}
     return Dimensions{S}(num_scalars, side_dimensions, ranks, offsets)
 end

+Base.broadcastable(d::Dimensions) = Ref(d)
+
 Base.length(d::Dimensions) = d.offsets[end]

 function set_rank!(d::Dimensions, i::LRO.MatrixIndex, rank)
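
The added `Base.broadcastable(d::Dimensions) = Ref(d)` is the standard Julia idiom for treating a struct as a scalar under broadcasting; it is what allows `buffer_for_jtprod.(model, dim, LRO.matrix_indices(model))` in `model.jl` to pass the same `dim` to every call instead of attempting to iterate it. The idiom on a toy type:

    struct Config
        scale::Float64
    end

    # Without this, `double.(c, xs)` would try to iterate the Config.
    Base.broadcastable(c::Config) = Ref(c)

    double(c::Config, x) = c.scale * x
    double.(Config(2.0), [1, 2, 3])   # == [2.0, 4.0, 6.0]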

src/factorization.jl

Lines changed: 56 additions & 21 deletions
@@ -18,6 +18,11 @@ function Base.getindex(m::AbstractFactorization, i::Int, j::Int)
     )
 end

+# Structural maximum rank
+function max_rank(m::AbstractFactorization)
+    return size(left_factor(m), 2)
+end
+
 """
     struct Factorization{
         T,
@@ -423,7 +428,11 @@ function _add_mul!(
     C::LinearAlgebra.AdjOrTrans,
     α,
 )
-    @assert axes(res, 2) == axes(C, 2)
+    # For a small sparse vector of two entries and `C` of length 15,
+    # the `@assert`s are apparently slower than what is gained by
+    # `@inbounds`. See the `jtprod` benchmark in `perf/holy.jl`.
+    #@assert axes(res, 1) == eachindex(x)
+    #@assert axes(res, 2) == axes(C, 2)
     for (row, val) in zip(x.nzind, x.nzval)
         γ = val * α
         for j in axes(res, 2)
@@ -468,41 +477,67 @@ function _add_mul!(
     end
 end

-function _mul!(res::AbstractVecOrMat, A::AbstractVecOrMat, B, α, β)
-    return LinearAlgebra.mul!(res, A, B, α, β)
-end
+# `MulAddMul(α, β)` is type unstable as its first two type parameters depend on whether `α` is one
+# and whether `β` is zero.
+# One way to avoid this allocation is to construct it and call the next method within `LinearAlgebra.@stable_muladdmul`,
+# which will add if-else clauses.
+# This is however not done when calling BLAS, since we just wrap and then unwrap this `MulAddMul`,
+# so it should be optimized out and hence no allocation should be incurred due to the type instability.
+# For this to work, we need `@inline`, as suggested by
+# https://github.com/JuliaLang/julia/pull/29634#issuecomment-440512432
+@inline _mul!(C, A, B, α, β) = LinearAlgebra.mul!(C, A, B, α, β)
+
+_mul_to!(::Nothing, A, B) = A * B
+_mul_to!(buffer, A, B) = LinearAlgebra.mul!(buffer, A, B)

-function _fact_mul!(
+@inline function buffered_mul!(
     res::AbstractVecOrMat,
     A::AbstractFactorization,
     B::AbstractVecOrMat,
-    α::Number,
-    β::Number,
+    α,
+    β,
+    buffer,
 )
     # TODO if `scaling` is `FillArrays.Fill`, we could just update `α`
-    C = _lmul_diag!!(A.scaling, right_factor(A)' * B)
+    # We'd like the rows to be the number of columns of `B`,
+    # as we take submatrices as subsets of columns of the buffer
+    # (for them to be contiguous), so we compute the transpose:
+    # `UΣV'B = U(B'VΣ)'`
+    C = _mul_to!(buffer, B', right_factor(A))
+    C = _rmul_diag!!(C, A.scaling)
     lA = left_factor(A)
-    return _mul!(res, lA, C, α, β)
+    return _mul!(res, lA, C', α, β)
 end

 # We want the same implementation for the two following ones but we can't use
 # `AbstractVecOrMat` as it would give ambiguity so we redirect to `_fact_mul!`
+function buffered_mul!(
+    res::AbstractVecOrMat,
+    A::AbstractMatrix,
+    B::AbstractVecOrMat,
+    α,
+    β,
+    _,
+)
+    return LinearAlgebra.mul!(res, A, B, α, β)
+end
+
 function LinearAlgebra.mul!(
-    res::AbstractMatrix,
-    A::AbstractFactorization,
-    B::AbstractMatrix,
-    α::Number,
-    β::Number,
+    ::AbstractVector,
+    ::AbstractFactorization,
+    ::AbstractVector,
+    ::Number,
+    ::Number,
 )
-    return _fact_mul!(res, A, B, α, β)
+    return error("This is inefficient, call `buffered_mul!` instead")
 end

 function LinearAlgebra.mul!(
-    res::AbstractVector,
-    A::AbstractFactorization,
-    B::AbstractVector,
-    α::Number,
-    β::Number,
+    ::AbstractMatrix,
+    ::AbstractFactorization,
+    ::AbstractMatrix,
+    ::Number,
+    ::Number,
 )
-    return _fact_mul!(res, A, B, α, β)
+    return error("This is inefficient, call `buffered_mul!` instead")
 end
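
The identity behind the reworked `buffered_mul!` is `UΣV'B = U(B'VΣ)'`: computing `B'V` first lets the buffer be sliced into contiguous column views, and the diagonal scaling commutes through the transpose. A quick numerical check with assumed sizes:

    using LinearAlgebra

    n, k, r = 8, 3, 2
    U, V, B = randn(n, r), randn(n, r), randn(n, k)
    Σ = Diagonal(rand(r))

    # Direct route versus the buffered route used above: B'V is formed
    # first, scaled on the right by Σ, then transposed back against U.
    @assert U * Σ * (V' * B) ≈ U * (B' * V * Σ)'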

src/model.jl

Lines changed: 4 additions & 1 deletion
@@ -110,9 +110,12 @@ num_scalars(model::Model) = length(model.d_lin)

 num_matrices(model::Model) = length(model.C)

+# Julia has trouble inferring the return type of constructors
+_matrix_index(i)::MatrixIndex = MatrixIndex(i)
+
 function matrix_indices(model::Union{Model,AbstractSolution})
     return MOI.Utilities.LazyMap{MatrixIndex}(
-        MatrixIndex,
+        _matrix_index,
         Base.OneTo(num_matrices(model)),
     )
 end
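
The `_matrix_index` helper works around the inference issue named in the comment: a constructor passed as a higher-order function can lose its return type, and the `::MatrixIndex` annotation on the helper pins it down. The same pattern on a toy wrapper (whether a given mapping needs it depends on the map type, `MOI.Utilities.LazyMap` here, and the Julia version):

    struct Id
        value::Int
    end

    # The return-type annotation gives callers a concrete element type
    # even if inference through the mapping function falls back to Any.
    _id(i)::Id = Id(i)

    ids = map(_id, 1:3)   # eltype(ids) == Id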
