
Commit 78e0ee1

Allocation-free jtprod for Burer-Monteiro with low-rank constraints (#59)
* Allocation-free jtprod for Burer-Monteiro with low-rank constraints
* Fix format
* Fix allocation
* Fix
* Fix
* Add allocation tests
* Fix allocations
* Fix
* Update tests
* Fix test
* Fix format
* Fix
* Going on the wrong direction
* Revert "Going on the wrong direction" (this reverts commit b8ba18d)
* Reapply "Going on the wrong direction" (this reverts commit 5cbba7e)
* Fixes
* Fix format
* Update holy
* Fix format
* Much simpler solution
* Update tests
* update
* Fix format
* Fix format
* Add test
* Fix
* Fix format
* Update benchmarks
1 parent e5b0e1c commit 78e0ee1

10 files changed, +264 -41 lines changed

perf/holy.jl

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+using Dualization
+import SDPLRPlus
+include(joinpath(dirname(@__DIR__), "examples", "holy_model.jl"))
+n = 30
+A = data(n)
+lr = holy_lowrank(A)
+set_optimizer(lr, dual_optimizer(LRO.Optimizer))
+set_attribute(lr, "solver", LRO.BurerMonteiro.Solver)
+set_attribute(lr, "sub_solver", SDPLRPlus.Solver)
+set_attribute(lr, "ranks", [15])
+set_attribute(lr, "maxmajoriter", 0)
+set_attribute(lr, "square_scalars", true)
+optimize!(lr)
+
+solver = unsafe_backend(lr).dual_problem.dual_model.model.optimizer.solver;
+aux = solver.model;
+var = solver.solver.var;
+
+using BenchmarkTools
+import NLPModels
+const BM = LRO.BurerMonteiro
+
+function _jtprod(model, var)
+    C = LRO._mul_to!(buffer, B', LRO.right_factor(A))
+    C = LRO._rmul_diag!!(C, A.scaling)
+    lA = LRO.left_factor(A)
+    println("_add_mul!")
+    @btime LRO._add_mul!($res, $lA, $C', $α)
+end
+
+function jtprod(model, var)
+    x = var.Rt
+    y = view(var.y, 1:model.meta.ncon)
+    Jtv = var.Gt
+    println("jtprod!")
+    @btime NLPModels.jtprod!($model, $x, $y, $Jtv)
+
+    X = BM.Solution(x, model.dim)
+    JtV = BM.Solution(Jtv, model.dim)
+    println("Scalar jtprod!")
+    @btime BM.jtprod!(
+        $model,
+        $X,
+        $y,
+        LRO.left_factor($JtV, LRO.ScalarIndex),
+        LRO.ScalarIndex,
+    )
+
+    i = LRO.MatrixIndex(1)
+    println("Matrix jtprod!")
+    @btime BM.add_jtprod!($model, $X[$i], $y, $JtV[$i], $i)
+
+    j = 1
+    res = JtV[i].factor
+    A = LRO.jac(model.model, j, i)
+    B = X[i].factor
+    α = 2y[j]
+    buffer = model.jtprod_buffer[]
+    println("buffered_mul!")
+    @btime LRO.buffered_mul!($res, $A, $B, $α, true, $buffer)
+end
+
+jtprod(aux, var)
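
Aside on the benchmark style: the `$` interpolation in each `@btime` call keeps BenchmarkTools from timing global-variable access, and each call isolates one layer of the `jtprod!` chain (full product, scalar block, one matrix block, a single `buffered_mul!`). A minimal sketch (helper name hypothetical) of the zero-allocation property these timings are checking for, using Base's `@allocated`:

    # Hypothetical helper: call once to compile, then require that a
    # second call allocates zero bytes, as an allocation-free jtprod! should.
    function assert_alloc_free(f, args...)
        f(args...)                     # warm-up call triggers compilation
        bytes = @allocated f(args...)  # measure the post-compilation call
        bytes == 0 || error("expected 0 bytes, got $bytes")
        return nothing
    end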

perf/maxcut.jl

Lines changed: 7 additions & 2 deletions
@@ -42,7 +42,12 @@ function bench_rmul(A)
     x = rand(n, 1)
     y = similar(x)
     println("rmul")
-    @btime LinearAlgebra.mul!($y, $A, $x, 2.0, 1.0)
+    if A.factor isa AbstractVector
+        buffer = zeros(1)
+    else
+        buffer = zeros(1, LRO.max_rank(A))
+    end
+    @btime LRO.buffered_mul!($y, $A, $x, 2.0, 1.0, $buffer)
 end

 function bench(aux, var)
@@ -71,6 +76,7 @@ function bench_plus(args...; kws...)
 end

 function bench_lro(args...; vector, kws...)
+    println("vector ? $vector")
     model = maxcut(weights(args...; kws...), dual_optimizer(LRO.Optimizer); vector)
     set_attribute(model, "solver", LRO.BurerMonteiro.Solver)
     set_attribute(model, "sub_solver", SDPLRPlus.Solver)
@@ -90,4 +96,3 @@ p = 0.1
 bench_plus(n; p)
 bench_lro(n; p, vector = true)
 bench_lro(n; p, vector = false)
-bench_lro(n; p, vector = false)
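
The two buffer shapes above mirror the intermediate product in `buffered_mul!`: with `x` having a single column, `x' * right_factor(A)` is `1 × max_rank(A)` when the factor is a matrix, and a single entry when the factor is a vector (rank one). The sizing rule as a standalone sketch (helper name hypothetical), for `B` with `k` columns:

    # Hypothetical helper: scratch space for `B' * right_factor(A)`
    # when `B` has `k` columns; the benchmark above uses k = 1.
    rmul_buffer(A, k::Int) =
        A.factor isa AbstractVector ? zeros(k) : zeros(k, LRO.max_rank(A))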

perf/set_dot.jl

Lines changed: 2 additions & 0 deletions
@@ -1,5 +1,7 @@
 include(joinpath(dirname(@__DIR__), "examples", "maxcut.jl"))

+using BenchmarkTools
+
 # Important for Dualization
 function bench_set_dot(n)
     T = Float64

src/BurerMonteiro/model.jl

Lines changed: 66 additions & 14 deletions
@@ -1,15 +1,18 @@
-mutable struct Model{S,T,CT,AT} <: NLPModels.AbstractNLPModel{T,Vector{T}}
+mutable struct Model{S,T,CT,AT,JTB} <: NLPModels.AbstractNLPModel{T,Vector{T}}
     model::LRO.Model{T,CT,AT}
     dim::Dimensions{S}
     meta::NLPModels.NLPModelMeta{T,Vector{T}}
     counters::NLPModels.Counters
+    jtprod_buffer::JTB
     function Model{S}(model::LRO.Model{T,CT,AT}, ranks) where {S,T,CT,AT}
         dim = Dimensions{S}(model, ranks)
-        return new{S,T,CT,AT}(
+        jtprod_buffer = buffer_for_jtprod(model, dim)
+        return new{S,T,CT,AT,typeof(jtprod_buffer)}(
             model,
             dim,
             meta(dim, LRO.cons_constant(model)),
             NLPModels.Counters(),
+            jtprod_buffer,
         )
     end
 end
@@ -42,6 +45,7 @@ function set_rank!(model::Model, i::LRO.MatrixIndex, r)
     set_rank!(model.dim, i, r)
     # `nvar` has changed so we need to reset `model.meta`
     model.meta = meta(model.dim, model.meta.lcon)
+    model.jtprod_buffer = buffer_for_jtprod(model.model, model.dim)
     return
 end

@@ -71,7 +75,8 @@ function grad!(
     i::LRO.MatrixIndex,
 )
     C = LRO.grad(model.model, i)
-    LinearAlgebra.mul!(G.factor, C, X.factor)
+    buffer = _buffer(model.jtprod_buffer[i.value], C, X.factor)
+    LRO.buffered_mul!(G.factor, C, X.factor, true, false, buffer)
     G.factor .*= 2
     return
 end
@@ -152,16 +157,64 @@ function jtprod!(
     return JtV
 end

+const _RankOne{T} = LRO.AbstractFactorization{T,<:AbstractVector{T}}
+const _LowRank{T} = LRO.AbstractFactorization{T,<:AbstractMatrix{T}}
+
+function buffer_for_jtprod(
+    model::LRO.Model{T},
+    dim::Dimensions,
+    i::LRO.MatrixIndex,
+) where {T}
+    row = view(model.A, i.value, :)
+    C = model.C[i.value]
+    if any(A -> A isa _LowRank, row) || C isa _LowRank
+        ncols = maximum(row; init = 0) do A
+            if A isa _LowRank
+                return LRO.max_rank(A)
+            else
+                return 0
+            end
+        end
+        if C isa _LowRank
+            ncols = max(ncols, LRO.max_rank(C))
+        end
+        return zeros(T, dim.ranks[i.value], ncols)
+    elseif any(A -> A isa _RankOne, row) || C isa _RankOne
+        return zeros(T, dim.ranks[i.value])
+    end
+    return
+end
+
+function buffer_for_jtprod(model::LRO.Model, dim::Dimensions)
+    return buffer_for_jtprod.(model, dim, LRO.matrix_indices(model))
+end
+
+_buffer(_, ::AbstractMatrix, _) = nothing
+_buffer(buffer::AbstractVector, ::_RankOne, ::AbstractMatrix) = buffer
+function _buffer(buffer::AbstractMatrix, A::_LowRank, ::AbstractMatrix)
+    # FIXME with this if-else, we return a small Union but the compiler
+    # seems to forget about this Union and allocates later
+    #if size(buffer, 2) == LRO.max_rank(A)
+    #    buffer
+    #else
+    # Using this `view` instead of `buffer`, `AllocCheck` now
+    # sees possible allocations but `@allocated` sees none
+    return view(buffer, :, Base.OneTo(LRO.max_rank(A)))
+    #end
+end
+
 function add_jtprod!(
     model::Model,
     X::LRO.Factorization,
     y::AbstractVector,
     JtV::LRO.Factorization,
     i::LRO.MatrixIndex,
+    α = 2,
 )
     for j in eachindex(y)
         A = LRO.jac(model.model, j, i)
-        LinearAlgebra.mul!(JtV.factor, A, X.factor, 2y[j], true)
+        buffer = _buffer(model.jtprod_buffer[i.value], A, X.factor)
+        LRO.buffered_mul!(JtV.factor, A, X.factor, α * y[j], true, buffer)
     end
 end

@@ -185,8 +238,10 @@ function NLPModels.jtprod!(
     X = Solution(x, model.dim)
     JtV = Solution(Jtv, model.dim)
     jtprod!(model, X, y, LRO.left_factor(JtV, LRO.ScalarIndex), LRO.ScalarIndex)
-    for i in LRO.matrix_indices(model.model)
-        jtprod!(model, X[i], y, JtV[i], i)
+    for i::LRO.MatrixIndex in LRO.matrix_indices(model.model)
+        Xi = X[i]
+        JtVi = JtV[i]
+        jtprod!(model, Xi, y, JtVi, i)
     end
     return Jtv
 end
@@ -248,14 +303,11 @@ function NLPModels.hprod!(
         obj_weight,
     )
     for i in LRO.matrix_indices(model.model)
-        Vi = V[i].factor
-        C = LRO.grad(model.model, i)
-        Hvi = HV[i].factor
-        LinearAlgebra.mul!(Hvi, C, Vi, 2obj_weight, false)
-        for j in 1:model.meta.ncon
-            A = LRO.jac(model.model, j, i)
-            LinearAlgebra.mul!(Hvi, A, Vi, -2y[j], true)
-        end
+        Vi = V[i]
+        Hvi = HV[i]
+        grad!(model, Vi, Hvi, i)
+        Hvi.factor .*= obj_weight
+        add_jtprod!(model, Vi, y, Hvi, i, -2)
     end
     return Hv
 end
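
Why `buffer_for_jtprod` allocates a `dim.ranks[i] × ncols` buffer: inside `buffered_mul!` the buffer holds `X.factor' * right_factor(A)`, where `X.factor` is `n × r` with `r = dim.ranks[i]` and each low-rank `A` has a right factor of size `n × rank(A)`, so a single `r × max-rank` buffer, with `_buffer` carving out a column view per constraint, covers every `A` in the row. A shape check under assumed sizes:

    using LinearAlgebra

    # Assumed sizes for illustration: n = 100, iterate rank r = 15,
    # largest constraint rank 4, one particular constraint of rank 3.
    n, r, rmax = 100, 15, 4
    buffer = zeros(r, rmax)    # what buffer_for_jtprod would allocate
    Xf = randn(n, r)           # plays the role of X.factor
    V = randn(n, 3)            # right factor of the rank-3 constraint
    C = view(buffer, :, 1:3)   # what _buffer carves out for it
    mul!(C, Xf', V)            # fills the view; no fresh array needed
    @assert size(C) == (r, 3)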

src/BurerMonteiro/solution.jl

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,8 @@ function Dimensions{S}(model::LRO.Model, ranks) where {S}
     return Dimensions{S}(num_scalars, side_dimensions, ranks, offsets)
 end

+Base.broadcastable(d::Dimensions) = Ref(d)
+
 Base.length(d::Dimensions) = d.offsets[end]

 function set_rank!(d::Dimensions, i::LRO.MatrixIndex, rank)
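
The added `Base.broadcastable(d::Dimensions) = Ref(d)` is the standard Julia idiom for treating a struct as a scalar under broadcasting; it is what allows `buffer_for_jtprod.(model, dim, LRO.matrix_indices(model))` in `model.jl` to pass the same `dim` to every call instead of attempting to iterate it. The idiom on a toy type:

    struct Config
        scale::Float64
    end

    # Without this, `double.(c, xs)` would try to iterate the Config.
    Base.broadcastable(c::Config) = Ref(c)

    double(c::Config, x) = c.scale * x
    double.(Config(2.0), [1, 2, 3])   # == [2.0, 4.0, 6.0]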

src/factorization.jl

Lines changed: 56 additions & 21 deletions
@@ -18,6 +18,11 @@ function Base.getindex(m::AbstractFactorization, i::Int, j::Int)
     )
 end

+# Structural maximum rank
+function max_rank(m::AbstractFactorization)
+    return size(left_factor(m), 2)
+end
+
 """
     struct Factorization{
         T,
@@ -423,7 +428,11 @@ function _add_mul!(
     C::LinearAlgebra.AdjOrTrans,
     α,
 )
-    @assert axes(res, 2) == axes(C, 2)
+    # For a small sparse vector of two entries and `C` of length 15,
+    # the `@assert`s are apparently slower than what is gained by
+    # `@inbounds`. See the `jtprod` benchmark in `perf/holy.jl`.
+    #@assert axes(res, 1) == eachindex(x)
+    #@assert axes(res, 2) == axes(C, 2)
     for (row, val) in zip(x.nzind, x.nzval)
         γ = val * α
         for j in axes(res, 2)
@@ -468,41 +477,67 @@ function _add_mul!(
     end
 end

-function _mul!(res::AbstractVecOrMat, A::AbstractVecOrMat, B, α, β)
-    return LinearAlgebra.mul!(res, A, B, α, β)
-end
+# `MulAddMul(α, β)` is type unstable as its first two type parameters depend on whether `α` is one
+# and whether `β` is zero.
+# One way to avoid this allocation is to construct it and call the next method within `LinearAlgebra.@stable_muladdmul`,
+# which will add if-else clauses.
+# This is however not done when calling BLAS, since we just wrap and then unwrap this `MulAddMul`,
+# so it should be optimized out and hence no allocation should be incurred due to the type instability.
+# For this to work, we need `@inline`, as suggested by
+# https://github.com/JuliaLang/julia/pull/29634#issuecomment-440512432
+@inline _mul!(C, A, B, α, β) = LinearAlgebra.mul!(C, A, B, α, β)
+
+_mul_to!(::Nothing, A, B) = A * B
+_mul_to!(buffer, A, B) = LinearAlgebra.mul!(buffer, A, B)

-function _fact_mul!(
+@inline function buffered_mul!(
     res::AbstractVecOrMat,
     A::AbstractFactorization,
     B::AbstractVecOrMat,
-    α::Number,
-    β::Number,
+    α,
+    β,
+    buffer,
 )
     # TODO if `scaling` is `FillArrays.Fill`, we could just update `α`
-    C = _lmul_diag!!(A.scaling, right_factor(A)' * B)
+    # We'd like the rows to be the number of columns of `B`,
+    # as we take submatrices as subsets of columns of the buffer
+    # (for them to be contiguous), so we compute the transpose:
+    # `UΣV'B = U(B'VΣ)'`
+    C = _mul_to!(buffer, B', right_factor(A))
+    C = _rmul_diag!!(C, A.scaling)
     lA = left_factor(A)
-    return _mul!(res, lA, C, α, β)
+    return _mul!(res, lA, C', α, β)
 end

 # We want the same implementation for the two following ones but we can't use
 # `AbstractVecOrMat` as it would give ambiguity so we redirect to `_fact_mul!`
+function buffered_mul!(
+    res::AbstractVecOrMat,
+    A::AbstractMatrix,
+    B::AbstractVecOrMat,
+    α,
+    β,
+    _,
+)
+    return LinearAlgebra.mul!(res, A, B, α, β)
+end
+
 function LinearAlgebra.mul!(
-    res::AbstractMatrix,
-    A::AbstractFactorization,
-    B::AbstractMatrix,
-    α::Number,
-    β::Number,
+    ::AbstractVector,
+    ::AbstractFactorization,
+    ::AbstractVector,
+    ::Number,
+    ::Number,
 )
-    return _fact_mul!(res, A, B, α, β)
+    return error("This is inefficient, call `buffered_mul!` instead")
 end

 function LinearAlgebra.mul!(
-    res::AbstractVector,
-    A::AbstractFactorization,
-    B::AbstractVector,
-    α::Number,
-    β::Number,
+    ::AbstractMatrix,
+    ::AbstractFactorization,
+    ::AbstractMatrix,
+    ::Number,
+    ::Number,
 )
-    return _fact_mul!(res, A, B, α, β)
+    return error("This is inefficient, call `buffered_mul!` instead")
 end
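
The identity behind the reworked `buffered_mul!` is `UΣV'B = U(B'VΣ)'`: computing `B'V` first lets the buffer be sliced into contiguous column views, and the diagonal scaling commutes through the transpose. A quick numerical check with assumed sizes:

    using LinearAlgebra

    n, k, r = 8, 3, 2
    U, V, B = randn(n, r), randn(n, r), randn(n, k)
    Σ = Diagonal(rand(r))

    # Direct route versus the buffered route used above: B'V is formed
    # first, scaled on the right by Σ, then transposed back against U.
    @assert U * Σ * (V' * B) ≈ U * (B' * V * Σ)'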

src/model.jl

Lines changed: 4 additions & 1 deletion
@@ -110,9 +110,12 @@ num_scalars(model::Model) = length(model.d_lin)

 num_matrices(model::Model) = length(model.C)

+# Julia has trouble inferring the return type of constructors
+_matrix_index(i)::MatrixIndex = MatrixIndex(i)
+
 function matrix_indices(model::Union{Model,AbstractSolution})
     return MOI.Utilities.LazyMap{MatrixIndex}(
-        MatrixIndex,
+        _matrix_index,
         Base.OneTo(num_matrices(model)),
     )
 end
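
The `_matrix_index` helper works around the inference issue named in the comment: a constructor passed as a higher-order function can lose its return type, and the `::MatrixIndex` annotation on the helper pins it down. The same pattern on a toy wrapper (whether a given mapping needs it depends on the map type, `MOI.Utilities.LazyMap` here, and the Julia version):

    struct Id
        value::Int
    end

    # The return-type annotation gives callers a concrete element type
    # even if inference through the mapping function falls back to Any.
    _id(i)::Id = Id(i)

    ids = map(_id, 1:3)   # eltype(ids) == Id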
