# L2ODLL.jl

!!! warning
    This documentation is a work in progress.
    Please open an issue if content is missing or erroneous.

L2ODLL.jl implements the Dual Lagrangian Learning (DLL) method of [Tanneau and Van Hentenryck (2024)](https://arxiv.org/pdf/2402.03086) using JuMP.

## Installation

```julia
import Pkg
Pkg.add(; url="https://github.com/LearningToOptimize/L2ODLL.jl")
```

## Usage

This package simplifies the implementation of DLL: it takes a primal JuMP model as input and automatically generates the dual projection and completion functions used to train and evaluate DLL models. Basic usage is as follows.

#### Define your (primal) model using JuMP

For the purposes of this example, we'll use a portfolio optimization problem.
```julia
using JuMP, LinearAlgebra

model = Model()

# define constant problem data
Σ = [166 34 58; 34 64 4; 58 4 100] / 100^2
N = size(Σ, 1)

# define variables
@variable(model, x[1:N])
set_lower_bound.(x, 0) # we explicitly set upper and lower bounds
set_upper_bound.(x, 1) # in order to use the BoundDecomposition

# define parametric problem data
μ0 = randn(N)
γ0 = rand()
@variable(model, μ[1:N] in MOI.Parameter.(μ0))
@variable(model, γ in MOI.Parameter(γ0))

# define constraints
@constraint(model, simplex, sum(x) == 1)
@constraint(model, risk, [γ; cholesky(Σ).L * x] in SecondOrderCone())

# define objective
@objective(model, Max, dot(μ, x))
```
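
Optionally, you can sanity-check the primal model by solving it directly. This assumes you have a conic solver installed; Clarabel is used here as an arbitrary choice:
```julia
using Clarabel

set_optimizer(model, Clarabel.Optimizer)
optimize!(model)
value.(x)
```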

#### Decompose and build the functions

Since all the variables have finite bounds, L2ODLL will automatically pick the `BoundDecomposition`.
```julia
using L2ODLL

L2ODLL.decompose!(model)
```

Now, L2ODLL has automatically generated the dual projection and completion layer. To compute the dual objective value and its gradient with respect to the prediction, use:
```julia
param_value = ... # some values for μ and γ
y_predicted = nn(param_value) # e.g. neural network inference

dobj = L2ODLL.dual_objective(model, y_predicted, param_value)
dobj_wrt_y = L2ODLL.dual_objective_gradient(model, y_predicted, param_value)
```

This also works with batches, using broadcasting:
```julia
dobj = L2ODLL.dual_objective.(model, y_predicted_batch, param_value_batch)
dobj_wrt_y = L2ODLL.dual_objective_gradient.(model, y_predicted_batch, param_value_batch)
```

!!! warning
    These functions currently run on the CPU. A batched GPU-friendly version is coming soon.

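For training, one typically takes gradient steps that improve the dual bound. Below is a minimal self-supervised sketch with a toy linear model, assuming the max-dual convention of the Math Background section; the flat parameter layout `θ = [μ; γ]`, the sampling scheme, and the step size are illustrative assumptions, not part of the L2ODLL API:
```julia
n_θ = N + 1                        # assumed flat layout θ = [μ; γ]
n_y = length(y_predicted)          # prediction size fixed by the decomposition
W = 0.01 * randn(n_y, n_θ)         # toy linear "network" y = W * θ
η = 1e-2                           # step size

for iter in 1:1000
    θ = vcat(randn(N), rand())     # sample a parameter instance
    y = W * θ                      # predict the duals
    g = L2ODLL.dual_objective_gradient(model, y, θ)  # ∂(dual objective)/∂y
    global W += η * g * θ'         # chain rule through y = W * θ; ascend the dual bound
end
```
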
## Math Background

### Decomposition

In DLL, the primal constraints (i.e., their associated dual variables) are decomposed into a predicted set and a completed set.
Consider the primal-dual pair:
```math
\begin{equation}
\begin{aligned}
& \min\nolimits_{x} & c^\top x
\\
& \;\;\text{s.t.} & Ax + b \in \mathcal{C}
\\
& & x \in \mathbb{R}^n
\end{aligned}
\quad\quad\quad\quad
\begin{aligned}
& \max\nolimits_{y} & - b^\top y
\\
& \;\;\text{s.t.} & A^\top y = c
\\
& & y \in \mathcal{C}^*
\end{aligned}
\end{equation}
```
After the decomposition, we have:
```math
\begin{equation}
\begin{aligned}
& \min\nolimits_{x} & c^\top x
\\
& \;\;\text{s.t.} & Ax + b \in \mathcal{C}
\\
& \;\;\phantom{\text{s.t.}} & Hx + h \in \mathcal{K}
\\
& & x \in \mathbb{R}^n
\end{aligned}
\quad\quad\quad\quad
\begin{aligned}
& \max\nolimits_{y,z} & - b^\top y - h^\top z
\\
& \;\;\text{s.t.} & A^\top y + H^\top z = c
\\
& & y \in \mathcal{C}^*,\; z \in \mathcal{K}^*
\end{aligned}
\end{equation}
```

Then, the completion model is:

```math
\begin{equation}
\begin{aligned}
& \max\nolimits_{z} & - h^\top z - b^\top y
\\
& \;\;\text{s.t.} & H^\top z = c - A^\top y
\\
& & z \in \mathcal{K}^*
\end{aligned}
\end{equation}
```

To train the neural network, we need the gradient of the optimal value with respect to the predicted $y$. This is $\nabla_y = -b - Ax$, where $x$ is the optimal dual solution associated with the affine constraints in the completion model. In the special cases below, we give only the expression for $x$ in this formula.

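The formula follows from the Lagrangian of the completion model: attaching a multiplier $x$ to its equality constraint gives

```math
\mathcal{L}(z, x) = - h^\top z - b^\top y + x^\top \left( c - A^\top y - H^\top z \right),
\qquad
\nabla_y \mathcal{L} = - b - Ax,
```

and, by the envelope theorem, evaluating at the optimal $(z, x)$ yields the gradient of the optimal value.
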
#### Bounded Decomposition

When all primal variables have finite upper and lower bounds, a natural way to decompose the constraints is to have $z$ correspond to the bound constraints, and $y$ correspond to the main constraints, i.e.

```math
\begin{equation}
\begin{aligned}
& \min\nolimits_{x} & c^\top x
\\
& \;\;\text{s.t.} & Ax + b \in \mathcal{C}
\\
& & l \leq x \leq u
\end{aligned}
\quad\quad\quad\quad
\begin{aligned}
& \max\nolimits_{y,z_l,z_u} & - b^\top y + l^\top z_l + u^\top z_u
\\
& \;\;\text{s.t.} & A^\top y + I z_l + I z_u = c
\\
& & y \in \mathcal{C}^*,\; z_l \in \mathbb{R}_+^n,\; z_u \in \mathbb{R}_-^n
\end{aligned}
\end{equation}
```

Then, the completion model is:

```math
\begin{equation}
\begin{aligned}
& \max\nolimits_{z_l,z_u} & l^\top z_l + u^\top z_u - b^\top y
\\
& \;\;\text{s.t.} & I z_l + I z_u = c - A^\top y
\\
& & z_l \in \mathbb{R}_+^n,\; z_u \in \mathbb{R}_-^n
\end{aligned}
\end{equation}
```

This model admits a closed-form solution: elementwise, $z_l = \max(c - A^\top y, 0)$ and $z_u = \min(c - A^\top y, 0)$. Furthermore, the $x$ that defines the (sub-)gradient is given elementwise by $l$ where $z_l > 0$, by $u$ where $z_u < 0$, and by any $x \in [l, u]$ otherwise.

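As a standalone illustration of this closed form (generic data, not the L2ODLL API):
```julia
using LinearAlgebra

# generic problem data for illustration only
m, n = 2, 3
A = randn(m, n); b = randn(m); c = randn(n)
l = zeros(n); u = ones(n)
y = randn(m)                           # a predicted dual for the main constraints

r = c - A' * y                         # residual of the dual equality constraint
z_l = max.(r, 0)                       # multipliers of x ≥ l (nonnegative)
z_u = min.(r, 0)                       # multipliers of x ≤ u (nonpositive)
dobj = -b' * y + l' * z_l + u' * z_u   # completed dual objective
x = @. ifelse(z_l > 0, l, ifelse(z_u < 0, u, (l + u) / 2))  # any x ∈ [l, u] where r = 0
grad_y = -b - A * x                    # (sub)gradient of the dual value w.r.t. y
```
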
#### (Strictly) Convex QP

In the convex QP case, the primal has a strictly convex quadratic objective function, i.e. $Q \succ 0$. In that case, it is natural to use the main constraints as the predicted set and to complete the quadratic slack dual variables.

```math
\begin{equation}
\begin{aligned}
& \min\nolimits_{x} & \tfrac{1}{2} x^\top Q x + c^\top x
\\
& \;\;\text{s.t.} & Ax + b \in \mathcal{C}
\\
& & x \in \mathbb{R}^n
\end{aligned}
\quad\quad\quad\quad
\begin{aligned}
& \max\nolimits_{y,z} & - b^\top y - \tfrac{1}{2} z^\top Q z
\\
& \;\;\text{s.t.} & A^\top y + Qz = c
\\
& & y \in \mathcal{C}^*,\; z \in \mathbb{R}^n
\end{aligned}
\end{equation}
```

Then, the completion model is:

```math
\begin{equation}
\begin{aligned}
& \max\nolimits_{z} & - \tfrac{1}{2} z^\top Q z - b^\top y
\\
& \;\;\text{s.t.} & Q z = c - A^\top y
\\
& & z \in \mathbb{R}^n
\end{aligned}
\end{equation}
```

This model admits a closed-form solution, $z = Q^{-1}(c - A^\top y)$. Furthermore, the $x$ that defines the gradient is $x = -z = Q^{-1}(A^\top y - c)$, by stationarity of the primal Lagrangian.
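
Again as a standalone illustration (generic data, not the L2ODLL API):
```julia
using LinearAlgebra

# generic strictly convex QP data for illustration only
m, n = 2, 3
B = randn(n, n); Q = B' * B + I        # Q ≻ 0
A = randn(m, n); b = randn(m); c = randn(n)
y = randn(m)                           # a predicted dual for the main constraints

z = Q \ (c - A' * y)                   # closed-form completion
dobj = -b' * y - 0.5 * dot(z, Q * z)   # completed dual objective
x = -z                                 # multiplier that defines the gradient
grad_y = -b - A * x                    # gradient of the dual value w.r.t. y
```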