diff --git a/Project.toml b/Project.toml index 4c8aa86..83b63ed 100644 --- a/Project.toml +++ b/Project.toml @@ -4,7 +4,9 @@ authors = ["Klamkin", "Michael and contributors"] version = "1.0.0-DEV" [deps] +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Dualization = "191a621a-6537-11e9-281d-650236a99e60" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" JuMP = "4076af6c-e467-56ae-b986-b466b2749572" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" @@ -17,8 +19,6 @@ MathOptSetDistances = "=0.2.11" [extras] Clarabel = "61c947e1-3e6d-4ee4-985a-eec8c727bd6e" -DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" -ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" @@ -27,4 +27,4 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "HTTP", "JSON", "Clarabel", "HiGHS", "Pkg", "DifferentiationInterface", "ForwardDiff", "ParametricOptInterface"] +test = ["Test", "HTTP", "JSON", "Clarabel", "HiGHS", "Pkg", "ParametricOptInterface"] diff --git a/docs/src/index.md b/docs/src/index.md index 030d865..d4c5870 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,7 +1,224 @@ # L2ODLL.jl -Documentation for L2ODLL.jl - !!! warning This documentation is a work in progress. - Please open an issue if content is missing / erroneous \ No newline at end of file + Please open an issue if content is missing / erroneous. + +L2ODLL.jl implements the Dual Lagrangian Learning (DLL) method of [Tanneau and Hentenryck (2024)](https://arxiv.org/pdf/2402.03086) using JuMP. + +## Installation + +```julia +import Pkg +Pkg.add(; url="https://github.com/LearningToOptimize/L2ODLL.jl") +``` + +## Usage + + +This package simplifies the implementation of DLL by taking as input a primal JuMP model, then automatically generating the dual projection and completion functions which can be used in the training and inference of DLL models. The basic usage is as follows: + + +#### Define your (primal) model using JuMP + +For the purposes of this example, we'll use a portfolio optimization problem. +```julia +using JuMP, LinearAlgebra + +model = Model() + +# define constant problem data +Σ = [166 34 58; 34 64 4; 58 4 100] / 100^2 +N = size(Σ, 1) + +# define variables +@variable(model, x[1:N]) +set_lower_bound.(x, 0) # we explicitly set upper and lower bounds +set_upper_bound.(x, 1) # in order to use the BoundDecomposition + +# define parameteric problem data +μ0 = randn(N) +γ0 = rand() +@variable(model, μ[1:N] in MOI.Parameter.(μ0)) +@variable(model, γ in MOI.Parameter(γ0)) + +# define constraints +@constraint(model, simplex, sum(x) == 1) +@constraint(model, risk, [γ; cholesky(Σ).L * x] in SecondOrderCone()) + +# define objective +@objective(model, Max, dot(μ,x)) +``` + +#### Decompose and build the functions + +Since all the variables have finite bounds, L2ODLL will automatically pick the `BoundDecomposition`. +```julia +using L2ODLL + +L2ODLL.decompose!(model) +``` + +Now, L2ODLL has automatically generated the dual projection and completion layer. To compute the dual objective value and gradient with respect to the prediction, use: +```julia +param_value = ... # some values for μ and γ +y_predicted = nn(param_value) # e.g. neural network inference + +dobj = L2ODLL.dual_objective(model, y_predicted, param_value) +dobj_wrt_y = L2ODLL.dual_objective_gradient(model, y_predicted, param_value) +``` + +This also works with batches, using broadcasting: +```julia +dobj = L2ODLL.dual_objective.(model, y_predicted_batch, param_value_batch) +dobj_wrt_y = L2ODLL.dual_objective_gradient.(model, y_predicted_batch, param_value_batch) +``` + +!!! warning + These functions currently run on the CPU. A batched GPU-friendly version is coming soon. + +## Math Background + +### Decomposition + +In DLL, the primal constraints (dual variables) are decomposed into a predicted set and a completed set. +Consider the primal-dual pair: +```math +\begin{equation} +\begin{aligned} +& \min\nolimits_{x} & c^\top x +\\ +& \;\;\text{s.t.} & Ax + b \in \mathcal{C} +\\ +& & x \in \mathbb{R}^n +\end{aligned} +\quad\quad\quad\quad +\begin{aligned} +& \max\nolimits_{y} & - b^\top y +\\ +& \;\;\text{s.t.} & A^\top y = c +\\ +& & y \in \mathcal{C}^* +\end{aligned} +\end{equation} +``` +After the decomposition, we have: +```math +\begin{equation} +\begin{aligned} +& \min\nolimits_{x} & c^\top x +\\ +& \;\;\text{s.t.} & Ax + b \in \mathcal{C} +\\ +& \;\;\phantom{\text{s.t.}} & Hx + h \in \mathcal{K} +\\ +& & x \in \mathbb{R}^n +\end{aligned} +\quad\quad\quad\quad +\begin{aligned} +& \max\nolimits_{y} & - b^\top y +\\ +& \;\;\text{s.t.} & A^\top y + H^\top z = c +\\ +& & y \in \mathcal{C}^*,\; z \in \mathcal{K}^* +\end{aligned} +\end{equation} +``` + +Then, the completion model is: + +```math +\begin{equation} +\begin{aligned} +& \max\nolimits_{z} & - h^\top z - b^\top y +\\ +& \;\;\text{s.t.} & H z = c - A^\top y +\\ +& & z \in \mathcal{K}^* +\end{aligned} +\end{equation} +``` + +To train the neural network, we need the gradient of the optimal value with respect to the predicted $y$. This is $\nabla_y = -b-Ax$ where $x$ is the optimal dual solution corresponding to the affine constraints in the completion model. In the special cases below, we specify just the expression for $x$ in this formula. + + +#### Bounded Decomposition + +When all primal variables have finite upper and lower bounds, a natural way to decompose the constraints is to have $z$ correspond to the bound constraints, and $y$ correspond to the main constraints, i.e. + +```math +\begin{equation} +\begin{aligned} +& \min\nolimits_{x} & c^\top x +\\ +& \;\;\text{s.t.} & Ax + b \in \mathcal{C} +\\ +& & l \leq x \leq u +\end{aligned} +\quad\quad\quad\quad +\begin{aligned} +& \max\nolimits_{y,z_l,z_u} & - b^\top y - l^\top z_l - u^\top z_u +\\ +& \;\;\text{s.t.} & A^\top y + I z_l + I z_u = c +\\ +& & y \in \mathcal{C}^*,\; z_l \in \mathbb{R}_+^n,\; z_u \in \mathbb{R}_-^n +\end{aligned} +\end{equation} +``` + +Then, the completion model is: + +```math +\begin{equation} +\begin{aligned} +& \max\nolimits_{z_l,z_u} & - l^\top z_l - u^\top z_u - b^\top y +\\ +& \;\;\text{s.t.} & I z_l + I z_u = c - A^\top y +\\ +& & z_l \in \mathbb{R}_+^n,\; z_u \in \mathbb{R}_-^n +\end{aligned} +\end{equation} +``` + +This model admits a closed form solution, $z_l = |c-A^\top y|^+$ and $z_u = -|c-A^\top y|^-$. Furthermore, the $x$ that defines the (sub-)gradient is given element-wise by $l$ if $z_l > 0$, $u$ if $z_u < 0$, and $x\in[l,u]$ otherwise. + + +#### (Strictly) Convex QP + +In the convex QP case, the primal has a strictly convex quadratic objective function, i.e. $Q\succ 0$. In that case it is natural to use the main constraints as the predicted set and to complete the quadratic slack dual variables. + +```math +\begin{equation} +\begin{aligned} +& \min\nolimits_{x} & x^\top Q x + c^\top x +\\ +& \;\;\text{s.t.} & Ax + b \in \mathcal{C} +\\ +& & x \in \mathbb{R}^n +\end{aligned} +\quad\quad\quad\quad +\begin{aligned} +& \max\nolimits_{y} & - b^\top y - z^\top Q z +\\ +& \;\;\text{s.t.} & A^\top y + Qz = c +\\ +& & y \in \mathcal{C}^*,\; z \in \mathbb{R}^n +\end{aligned} +\end{equation} +``` + +Then, the completion model is: + +```math +\begin{equation} +\begin{aligned} +& \max\nolimits_{z} & - z^\top Q z - b^\top y +\\ +& \;\;\text{s.t.} & Q z = c - A^\top y +\\ +& & z \in \mathbb{R}^n +\end{aligned} +\end{equation} +``` + +This model admits a closed form solution, $z = Q^{-1}(c - A^\top y)$. Furthermore, the closed form dual solution in this case is $x=z$. diff --git a/src/L2ODLL.jl b/src/L2ODLL.jl index 36f799c..e9a9e6a 100644 --- a/src/L2ODLL.jl +++ b/src/L2ODLL.jl @@ -5,11 +5,15 @@ import JuMP import LinearAlgebra import MathOptSetDistances import SparseArrays +import DifferentiationInterface +import ForwardDiff const MOI = JuMP.MOI const MOIB = JuMP.MOIB const MOIU = JuMP.MOIU const MOSD = MathOptSetDistances +const DI = DifferentiationInterface +const ADTypes = DI.ADTypes abstract type AbstractDecomposition end # must have p_ref and y_ref @@ -26,6 +30,30 @@ struct DLLCache decomposition::AbstractDecomposition end +function decompose!(model::JuMP.Model, decomposition::AbstractDecomposition; + optimizer=nothing, proj_fn=nothing, dll_layer_builder=nothing +) + return build_cache(model, decomposition; optimizer, proj_fn, dll_layer_builder) +end + +function dual_objective(model::JuMP.Model, y_predicted, param_value) + cache = get_cache(model) + @assert length.(y_predicted) == L2ODLL.y_shape(cache) + return cache.dll_layer(y_predicted, param_value) +end + +function dual_objective_gradient(model::JuMP.Model, y_predicted, param_value; ad_type::ADTypes.AbstractADType=DI.AutoForwardDiff()) + cache = get_cache(model) + y_shape = L2ODLL.y_shape(cache) + @assert length.(y_predicted) == y_shape + dobj_wrt_y = DI.gradient( + (y,p) -> dual_objective(model, L2ODLL.unflatten_y(y, y_shape), p), + ad_type, + L2ODLL.flatten_y(y_predicted), DI.Constant(param_value) + ) + return L2ODLL.unflatten_y(dobj_wrt_y, y_shape) +end + function build_cache(model::JuMP.Model, decomposition::AbstractDecomposition; optimizer=nothing, proj_fn=nothing, dll_layer_builder=nothing ) @@ -43,7 +71,13 @@ function build_cache(model::JuMP.Model, decomposition::AbstractDecomposition; jump_builder(decomposition, proj_fn, dual_model, optimizer) end - return DLLCache(proj_fn, dll_layer, dual_model, decomposition) + cache = DLLCache(proj_fn, dll_layer, dual_model, decomposition) + model.ext[:_L2ODLL_cache] = cache + return cache +end + +function get_cache(model::JuMP.Model) + return model.ext[:_L2ODLL_cache] end function make_completion_model(cache::DLLCache) diff --git a/src/layers/generic.jl b/src/layers/generic.jl index fe2f6b0..dddcd74 100644 --- a/src/layers/generic.jl +++ b/src/layers/generic.jl @@ -16,7 +16,7 @@ function jump_builder(decomposition::AbstractDecomposition, proj_fn::Function, d JuMP.set_optimizer(completion_model, optimizer) silent && JuMP.set_silent(completion_model) completion_model.ext[:🔒] = ReentrantLock() - # TODO: use DiffOpt to define frule/rrule + # TODO: define frule/rrule using b-Ax # TODO: handle infeasibility? return (y_pred, param_value) -> begin lock(completion_model.ext[:🔒]) diff --git a/test/runtests.jl b/test/runtests.jl index feed3a8..455ed88 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -46,28 +46,28 @@ SOLVER = () -> ParametricOptInterface.Optimizer(HiGHS.Optimizer()); param_value = [0.1]; JuMP.set_parameter_value.([γ], param_value); - cqp_cache = L2ODLL.build_cache(m, L2ODLL.ConvexQP(m)); - cqp_y_pred = randn_like(L2ODLL.get_y(cqp_cache)); - dobj1 = cqp_cache.dll_layer(cqp_y_pred, param_value) - dobj, dobj_wrt_y_flat = DifferentiationInterface.value_and_gradient( - cqp_cache.dll_layer, - DifferentiationInterface.AutoForwardDiff(), - L2ODLL.flatten_y(cqp_y_pred), DifferentiationInterface.Constant(param_value) - ) - dobj_wrt_y = L2ODLL.unflatten_y(dobj_wrt_y_flat, L2ODLL.y_shape(cqp_cache)) - @test isapprox(dobj1, dobj, atol=1e-6) - - cqp_solver_cache = L2ODLL.build_cache(m, L2ODLL.ConvexQP(m), dll_layer_builder=(d,p,m) -> L2ODLL.jump_builder(d,p,m,SOLVER)); - dobj2 = cqp_solver_cache.dll_layer(cqp_y_pred, param_value) - @test isapprox(dobj1, dobj2, atol=1e-6) + L2ODLL.decompose!(m, L2ODLL.ConvexQP(m)); + + cqp_y_pred = randn_like(L2ODLL.get_y(L2ODLL.get_cache(m))); + + dobj = L2ODLL.dual_objective(m, cqp_y_pred, param_value) + dobj_wrt_y = L2ODLL.dual_objective_gradient(m, cqp_y_pred, param_value) + + m2 = JuMP.Model(); + JuMP.@variable(m2, x[1:N] >=0); + JuMP.@variable(m2, γ in JuMP.MOI.Parameter(0.0)); + JuMP.@objective(m2, Max, γ*LinearAlgebra.dot(μ,x) - x' * Σ * x); + JuMP.@constraint(m2, simplex, sum(x) == 1); + JuMP.set_parameter_value.([γ], param_value); + L2ODLL.decompose!(m2, L2ODLL.ConvexQP(m2), dll_layer_builder=(d,p,m) -> L2ODLL.jump_builder(d,p,m,SOLVER)); + dobj2 = L2ODLL.dual_objective(m2, cqp_y_pred, param_value) JuMP.set_optimizer(m, Clarabel.Optimizer); JuMP.set_silent(m); JuMP.optimize!(m) - cqp_y_true = JuMP.dual.(cqp_cache.decomposition.y_ref) - dobj1 = cqp_cache.dll_layer(cqp_y_true, param_value) - dobj2 = cqp_cache.dll_layer(cqp_y_true, param_value) - @test isapprox(dobj1, dobj2, atol=1e-6) && isapprox(dobj1, JuMP.objective_value(m), atol=1e-6) + cqp_y_true = JuMP.dual.(L2ODLL.get_cache(m).decomposition.y_ref) + dobj1 = L2ODLL.dual_objective(m, cqp_y_true, param_value) + @test isapprox(dobj1, JuMP.objective_value(m), atol=1e-6) end @testset "Markowitz SecondOrderCone" begin @@ -89,27 +89,33 @@ SOLVER = () -> ParametricOptInterface.Optimizer(HiGHS.Optimizer()); param_value = [randn(N); 0.1]; JuMP.set_parameter_value.([μ; γ], param_value); - blp_cache = L2ODLL.build_cache(m, L2ODLL.BoundDecomposition(m)); - blp_y_pred = randn_like(L2ODLL.get_y(blp_cache)); - dobj1 = blp_cache.dll_layer(blp_y_pred, param_value) - dobj, dobj_wrt_y = DifferentiationInterface.value_and_gradient( - (y,p) ->blp_cache.dll_layer(L2ODLL.unflatten_y(y, L2ODLL.y_shape(blp_cache)),p), - DifferentiationInterface.AutoForwardDiff(), - L2ODLL.flatten_y(blp_y_pred), DifferentiationInterface.Constant(param_value) - ) - dobj_wrt_y = L2ODLL.unflatten_y(dobj_wrt_y, L2ODLL.y_shape(blp_cache)) - @test isapprox(dobj1, dobj, atol=1e-6) - - blp_solver_cache = L2ODLL.build_cache(m, L2ODLL.BoundDecomposition(m), dll_layer_builder=(d,p,m) -> L2ODLL.jump_builder(d,p,m,SOLVER)); - dobj2 = blp_solver_cache.dll_layer(blp_y_pred, param_value) - @test isapprox(dobj1, dobj2, atol=1e-6) + L2ODLL.decompose!(m, L2ODLL.BoundDecomposition(m)); + + blp_y_pred = randn_like(L2ODLL.get_y(L2ODLL.get_cache(m))); + + dobj = L2ODLL.dual_objective(m, blp_y_pred, param_value) + dobj_wrt_y = L2ODLL.dual_objective_gradient(m, blp_y_pred, param_value) + + m2 = JuMP.Model(); + JuMP.@variable(m2, x[1:N]); + JuMP.set_lower_bound.(x, 0); + JuMP.set_upper_bound.(x, 1); + JuMP.@variable(m2, μ[1:N] in JuMP.MOI.Parameter.(0.0)); + JuMP.@variable(m2, γ in JuMP.MOI.Parameter(0.1)); + JuMP.@constraint(m2, simplex, sum(x) == 1); + JuMP.@constraint(m2, risk, [γ; LinearAlgebra.cholesky(Σ).L * x] in JuMP.SecondOrderCone()); + JuMP.@objective(m2, Max, LinearAlgebra.dot(μ,x)); + JuMP.set_parameter_value.([μ; γ], param_value); + + L2ODLL.decompose!(m2, L2ODLL.BoundDecomposition(m2), dll_layer_builder=(d,p,m) -> L2ODLL.jump_builder(d,p,m,SOLVER)); + dobj2 = L2ODLL.dual_objective(m2, blp_y_pred, param_value) JuMP.set_optimizer(m, Clarabel.Optimizer); JuMP.set_silent(m); JuMP.optimize!(m) - blp_y_true = JuMP.dual.(blp_cache.decomposition.y_ref) - dobj1 = blp_cache.dll_layer(blp_y_true, param_value) - dobj2 = blp_cache.dll_layer(blp_y_true, param_value) - @test isapprox(dobj1, dobj2, atol=1e-6) && isapprox(dobj1, JuMP.objective_value(m), atol=1e-6) + blp_y_true = JuMP.dual.(L2ODLL.get_cache(m).decomposition.y_ref) + + dobj1 = L2ODLL.dual_objective(m, blp_y_true, param_value) + @test isapprox(dobj1, JuMP.objective_value(m), atol=1e-6) end end \ No newline at end of file