diff --git a/Project.toml b/Project.toml
index 4c8aa86..83b63ed 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,7 +4,9 @@ authors = ["Klamkin", "Michael <michael@klamkin.com> and contributors"]
 version = "1.0.0-DEV"
 
 [deps]
+DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
 Dualization = "191a621a-6537-11e9-281d-650236a99e60"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
@@ -17,8 +19,6 @@ MathOptSetDistances = "=0.2.11"
 
 [extras]
 Clarabel = "61c947e1-3e6d-4ee4-985a-eec8c727bd6e"
-DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
@@ -27,4 +27,4 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "HTTP", "JSON", "Clarabel", "HiGHS", "Pkg", "DifferentiationInterface", "ForwardDiff", "ParametricOptInterface"]
+test = ["Test", "HTTP", "JSON", "Clarabel", "HiGHS", "Pkg", "ParametricOptInterface"]
diff --git a/docs/src/index.md b/docs/src/index.md
index 030d865..d4c5870 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,7 +1,224 @@
 # L2ODLL.jl
 
-Documentation for L2ODLL.jl
-
 !!! warning
     This documentation is a work in progress.
-    Please open an issue if content is missing / erroneous
\ No newline at end of file
+    Please open an issue if content is missing / erroneous.
+
+L2ODLL.jl implements the Dual Lagrangian Learning (DLL) method of [Tanneau and Hentenryck (2024)](https://arxiv.org/pdf/2402.03086) using JuMP.
+
+## Installation
+
+```julia
+import Pkg
+Pkg.add(; url="https://github.com/LearningToOptimize/L2ODLL.jl")
+```
+
+## Usage
+
+
+This package simplifies the implementation of DLL by taking as input a primal JuMP model, then automatically generating the dual projection and completion functions which can be used in the training and inference of DLL models. The basic usage is as follows:
+
+
+#### Define your (primal) model using JuMP
+
+For the purposes of this example, we'll use a portfolio optimization problem.
+```julia
+using JuMP, LinearAlgebra
+
+model = Model()
+
+# define constant problem data
+Σ = [166 34 58; 34 64 4; 58 4 100] / 100^2
+N = size(Σ, 1)
+
+# define variables
+@variable(model, x[1:N])
+set_lower_bound.(x, 0)  # we explicitly set upper and lower bounds
+set_upper_bound.(x, 1)  #   in order to use the BoundDecomposition
+
+# define parameteric problem data
+μ0 = randn(N)
+γ0 = rand()
+@variable(model, μ[1:N] in MOI.Parameter.(μ0))
+@variable(model, γ in MOI.Parameter(γ0))
+
+# define constraints
+@constraint(model, simplex, sum(x) == 1)
+@constraint(model, risk, [γ; cholesky(Σ).L * x] in SecondOrderCone())
+
+# define objective
+@objective(model, Max, dot(μ,x))
+```
+
+#### Decompose and build the functions
+
+Since all the variables have finite bounds, L2ODLL will automatically pick the `BoundDecomposition`.
+```julia
+using L2ODLL
+
+L2ODLL.decompose!(model)
+```
+
+Now, L2ODLL has automatically generated the dual projection and completion layer. To compute the dual objective value and gradient with respect to the prediction, use:
+```julia
+param_value = ... # some values for μ and γ
+y_predicted = nn(param_value) # e.g. neural network inference
+
+dobj = L2ODLL.dual_objective(model, y_predicted, param_value)
+dobj_wrt_y = L2ODLL.dual_objective_gradient(model, y_predicted, param_value)
+```
+
+This also works with batches, using broadcasting:
+```julia
+dobj = L2ODLL.dual_objective.(model, y_predicted_batch, param_value_batch)
+dobj_wrt_y = L2ODLL.dual_objective_gradient.(model, y_predicted_batch, param_value_batch)
+```
+
+!!! warning
+    These functions currently run on the CPU. A batched GPU-friendly version is coming soon.
+
+## Math Background
+
+### Decomposition
+
+In DLL, the primal constraints (dual variables) are decomposed into a predicted set and a completed set.
+Consider the primal-dual pair:
+```math
+\begin{equation}
+\begin{aligned}
+& \min\nolimits_{x} & c^\top x
+\\
+& \;\;\text{s.t.} & Ax + b \in \mathcal{C}
+\\
+& & x \in \mathbb{R}^n
+\end{aligned}
+\quad\quad\quad\quad
+\begin{aligned}
+& \max\nolimits_{y} & - b^\top y
+\\
+& \;\;\text{s.t.} & A^\top y = c
+\\
+& & y \in \mathcal{C}^*
+\end{aligned}
+\end{equation}
+```
+After the decomposition, we have:
+```math
+\begin{equation}
+\begin{aligned}
+& \min\nolimits_{x} & c^\top x
+\\
+& \;\;\text{s.t.} & Ax + b \in \mathcal{C}
+\\
+& \;\;\phantom{\text{s.t.}} & Hx + h \in \mathcal{K}
+\\
+& & x \in \mathbb{R}^n
+\end{aligned}
+\quad\quad\quad\quad
+\begin{aligned}
+& \max\nolimits_{y} & - b^\top y
+\\
+& \;\;\text{s.t.} & A^\top y + H^\top z = c
+\\
+& & y \in \mathcal{C}^*,\; z \in \mathcal{K}^*
+\end{aligned}
+\end{equation}
+```
+
+Then, the completion model is:
+
+```math
+\begin{equation}
+\begin{aligned}
+& \max\nolimits_{z} & - h^\top z - b^\top y
+\\
+& \;\;\text{s.t.} & H z = c - A^\top y
+\\
+& & z \in \mathcal{K}^*
+\end{aligned}
+\end{equation}
+```
+
+To train the neural network, we need the gradient of the optimal value with respect to the predicted $y$. This is $\nabla_y = -b-Ax$ where $x$ is the optimal dual solution corresponding to the affine constraints in the completion model. In the special cases below, we specify just the expression for $x$ in this formula.
+
+
+#### Bounded Decomposition
+
+When all primal variables have finite upper and lower bounds, a natural way to decompose the constraints is to have $z$ correspond to the bound constraints, and $y$ correspond to the main constraints, i.e.
+
+```math
+\begin{equation}
+\begin{aligned}
+& \min\nolimits_{x} & c^\top x
+\\
+& \;\;\text{s.t.} & Ax + b \in \mathcal{C}
+\\
+& & l \leq x \leq u
+\end{aligned}
+\quad\quad\quad\quad
+\begin{aligned}
+& \max\nolimits_{y,z_l,z_u} & - b^\top y - l^\top z_l - u^\top z_u
+\\
+& \;\;\text{s.t.} & A^\top y + I z_l + I z_u = c
+\\
+& & y \in \mathcal{C}^*,\; z_l \in \mathbb{R}_+^n,\; z_u \in \mathbb{R}_-^n
+\end{aligned}
+\end{equation}
+```
+
+Then, the completion model is:
+
+```math
+\begin{equation}
+\begin{aligned}
+& \max\nolimits_{z_l,z_u} & - l^\top z_l - u^\top z_u - b^\top y
+\\
+& \;\;\text{s.t.} & I z_l + I z_u = c - A^\top y
+\\
+& & z_l \in \mathbb{R}_+^n,\; z_u \in \mathbb{R}_-^n
+\end{aligned}
+\end{equation}
+```
+
+This model admits a closed form solution, $z_l = |c-A^\top y|^+$ and $z_u = -|c-A^\top y|^-$. Furthermore, the $x$ that defines the (sub-)gradient is given element-wise by $l$ if $z_l > 0$, $u$ if $z_u < 0$, and $x\in[l,u]$ otherwise.
+
+
+#### (Strictly) Convex QP
+
+In the convex QP case, the primal has a strictly convex quadratic objective function, i.e. $Q\succ 0$. In that case it is natural to use the main constraints as the predicted set and to complete the quadratic slack dual variables.
+
+```math
+\begin{equation}
+\begin{aligned}
+& \min\nolimits_{x} & x^\top Q x + c^\top x
+\\
+& \;\;\text{s.t.} & Ax + b \in \mathcal{C}
+\\
+& & x \in \mathbb{R}^n
+\end{aligned}
+\quad\quad\quad\quad
+\begin{aligned}
+& \max\nolimits_{y} & - b^\top y - z^\top Q z
+\\
+& \;\;\text{s.t.} & A^\top y + Qz = c
+\\
+& & y \in \mathcal{C}^*,\; z \in \mathbb{R}^n
+\end{aligned}
+\end{equation}
+```
+
+Then, the completion model is:
+
+```math
+\begin{equation}
+\begin{aligned}
+& \max\nolimits_{z} & - z^\top Q z - b^\top y
+\\
+& \;\;\text{s.t.} & Q z = c - A^\top y
+\\
+& & z \in \mathbb{R}^n
+\end{aligned}
+\end{equation}
+```
+
+This model admits a closed form solution, $z = Q^{-1}(c - A^\top y)$. Furthermore, the closed form dual solution in this case is $x=z$.
diff --git a/src/L2ODLL.jl b/src/L2ODLL.jl
index 36f799c..e9a9e6a 100644
--- a/src/L2ODLL.jl
+++ b/src/L2ODLL.jl
@@ -5,11 +5,15 @@ import JuMP
 import LinearAlgebra
 import MathOptSetDistances
 import SparseArrays
+import DifferentiationInterface
+import ForwardDiff
 
 const MOI = JuMP.MOI
 const MOIB = JuMP.MOIB
 const MOIU = JuMP.MOIU
 const MOSD = MathOptSetDistances
+const DI = DifferentiationInterface
+const ADTypes = DI.ADTypes
 
 abstract type AbstractDecomposition end  # must have p_ref and y_ref
 
@@ -26,6 +30,30 @@ struct DLLCache
     decomposition::AbstractDecomposition
 end
 
+function decompose!(model::JuMP.Model, decomposition::AbstractDecomposition;
+    optimizer=nothing, proj_fn=nothing, dll_layer_builder=nothing
+)
+    return build_cache(model, decomposition; optimizer, proj_fn, dll_layer_builder)
+end
+
+function dual_objective(model::JuMP.Model, y_predicted, param_value)
+    cache = get_cache(model)
+    @assert length.(y_predicted) == L2ODLL.y_shape(cache)
+    return cache.dll_layer(y_predicted, param_value)
+end
+
+function dual_objective_gradient(model::JuMP.Model, y_predicted, param_value; ad_type::ADTypes.AbstractADType=DI.AutoForwardDiff())
+    cache = get_cache(model)
+    y_shape = L2ODLL.y_shape(cache)
+    @assert length.(y_predicted) == y_shape
+    dobj_wrt_y = DI.gradient(
+        (y,p) -> dual_objective(model, L2ODLL.unflatten_y(y, y_shape), p),
+        ad_type,
+        L2ODLL.flatten_y(y_predicted), DI.Constant(param_value)
+    )
+    return L2ODLL.unflatten_y(dobj_wrt_y, y_shape)
+end
+
 function build_cache(model::JuMP.Model, decomposition::AbstractDecomposition;
     optimizer=nothing, proj_fn=nothing, dll_layer_builder=nothing
 )
@@ -43,7 +71,13 @@ function build_cache(model::JuMP.Model, decomposition::AbstractDecomposition;
             jump_builder(decomposition, proj_fn, dual_model, optimizer)
         end
 
-    return DLLCache(proj_fn, dll_layer, dual_model, decomposition)
+    cache = DLLCache(proj_fn, dll_layer, dual_model, decomposition)
+    model.ext[:_L2ODLL_cache] = cache
+    return cache
+end
+
+function get_cache(model::JuMP.Model)
+    return model.ext[:_L2ODLL_cache]
 end
 
 function make_completion_model(cache::DLLCache)
diff --git a/src/layers/generic.jl b/src/layers/generic.jl
index fe2f6b0..dddcd74 100644
--- a/src/layers/generic.jl
+++ b/src/layers/generic.jl
@@ -16,7 +16,7 @@ function jump_builder(decomposition::AbstractDecomposition, proj_fn::Function, d
     JuMP.set_optimizer(completion_model, optimizer)
     silent && JuMP.set_silent(completion_model)
     completion_model.ext[:🔒] = ReentrantLock()
-    # TODO: use DiffOpt to define frule/rrule
+    # TODO: define frule/rrule using b-Ax
     # TODO: handle infeasibility?
     return (y_pred, param_value) -> begin
         lock(completion_model.ext[:🔒])
diff --git a/test/runtests.jl b/test/runtests.jl
index feed3a8..455ed88 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -46,28 +46,28 @@ SOLVER = () -> ParametricOptInterface.Optimizer(HiGHS.Optimizer());
         param_value = [0.1];
         JuMP.set_parameter_value.([γ], param_value);
 
-        cqp_cache = L2ODLL.build_cache(m, L2ODLL.ConvexQP(m));
-        cqp_y_pred = randn_like(L2ODLL.get_y(cqp_cache));
-        dobj1 = cqp_cache.dll_layer(cqp_y_pred, param_value)
-        dobj, dobj_wrt_y_flat = DifferentiationInterface.value_and_gradient(
-            cqp_cache.dll_layer,
-            DifferentiationInterface.AutoForwardDiff(),
-            L2ODLL.flatten_y(cqp_y_pred), DifferentiationInterface.Constant(param_value)
-        )
-        dobj_wrt_y = L2ODLL.unflatten_y(dobj_wrt_y_flat, L2ODLL.y_shape(cqp_cache))
-        @test isapprox(dobj1, dobj, atol=1e-6)
-
-        cqp_solver_cache = L2ODLL.build_cache(m, L2ODLL.ConvexQP(m), dll_layer_builder=(d,p,m) -> L2ODLL.jump_builder(d,p,m,SOLVER));
-        dobj2 = cqp_solver_cache.dll_layer(cqp_y_pred, param_value)
-        @test isapprox(dobj1, dobj2, atol=1e-6)
+        L2ODLL.decompose!(m, L2ODLL.ConvexQP(m));
+
+        cqp_y_pred = randn_like(L2ODLL.get_y(L2ODLL.get_cache(m)));
+
+        dobj = L2ODLL.dual_objective(m, cqp_y_pred, param_value)
+        dobj_wrt_y = L2ODLL.dual_objective_gradient(m, cqp_y_pred, param_value)
+
+        m2 = JuMP.Model();
+        JuMP.@variable(m2, x[1:N] >=0);
+        JuMP.@variable(m2, γ in JuMP.MOI.Parameter(0.0));
+        JuMP.@objective(m2, Max, γ*LinearAlgebra.dot(μ,x) - x' * Σ * x);
+        JuMP.@constraint(m2, simplex, sum(x) == 1);
+        JuMP.set_parameter_value.([γ], param_value);
+        L2ODLL.decompose!(m2, L2ODLL.ConvexQP(m2), dll_layer_builder=(d,p,m) -> L2ODLL.jump_builder(d,p,m,SOLVER));
+        dobj2 = L2ODLL.dual_objective(m2, cqp_y_pred, param_value)
 
         JuMP.set_optimizer(m, Clarabel.Optimizer);
         JuMP.set_silent(m);
         JuMP.optimize!(m)
-        cqp_y_true = JuMP.dual.(cqp_cache.decomposition.y_ref)
-        dobj1 = cqp_cache.dll_layer(cqp_y_true, param_value)
-        dobj2 = cqp_cache.dll_layer(cqp_y_true, param_value)
-        @test isapprox(dobj1, dobj2, atol=1e-6) && isapprox(dobj1, JuMP.objective_value(m), atol=1e-6)
+        cqp_y_true = JuMP.dual.(L2ODLL.get_cache(m).decomposition.y_ref)
+        dobj1 = L2ODLL.dual_objective(m, cqp_y_true, param_value)
+        @test isapprox(dobj1, JuMP.objective_value(m), atol=1e-6)
     end
 
     @testset "Markowitz SecondOrderCone" begin
@@ -89,27 +89,33 @@ SOLVER = () -> ParametricOptInterface.Optimizer(HiGHS.Optimizer());
         param_value = [randn(N); 0.1];
         JuMP.set_parameter_value.([μ; γ], param_value);
 
-        blp_cache = L2ODLL.build_cache(m, L2ODLL.BoundDecomposition(m));
-        blp_y_pred = randn_like(L2ODLL.get_y(blp_cache));
-        dobj1 = blp_cache.dll_layer(blp_y_pred, param_value)
-        dobj, dobj_wrt_y = DifferentiationInterface.value_and_gradient(
-            (y,p) ->blp_cache.dll_layer(L2ODLL.unflatten_y(y, L2ODLL.y_shape(blp_cache)),p),
-            DifferentiationInterface.AutoForwardDiff(),
-            L2ODLL.flatten_y(blp_y_pred), DifferentiationInterface.Constant(param_value)
-        )
-        dobj_wrt_y = L2ODLL.unflatten_y(dobj_wrt_y, L2ODLL.y_shape(blp_cache))
-        @test isapprox(dobj1, dobj, atol=1e-6)
-
-        blp_solver_cache = L2ODLL.build_cache(m, L2ODLL.BoundDecomposition(m), dll_layer_builder=(d,p,m) -> L2ODLL.jump_builder(d,p,m,SOLVER));
-        dobj2 = blp_solver_cache.dll_layer(blp_y_pred, param_value)
-        @test isapprox(dobj1, dobj2, atol=1e-6)
+        L2ODLL.decompose!(m, L2ODLL.BoundDecomposition(m));
+
+        blp_y_pred = randn_like(L2ODLL.get_y(L2ODLL.get_cache(m)));
+
+        dobj = L2ODLL.dual_objective(m, blp_y_pred, param_value)
+        dobj_wrt_y = L2ODLL.dual_objective_gradient(m, blp_y_pred, param_value)
+
+        m2 = JuMP.Model();
+        JuMP.@variable(m2, x[1:N]);
+        JuMP.set_lower_bound.(x, 0);
+        JuMP.set_upper_bound.(x, 1);
+        JuMP.@variable(m2, μ[1:N] in JuMP.MOI.Parameter.(0.0));
+        JuMP.@variable(m2, γ in JuMP.MOI.Parameter(0.1));
+        JuMP.@constraint(m2, simplex, sum(x) == 1);
+        JuMP.@constraint(m2, risk, [γ; LinearAlgebra.cholesky(Σ).L * x] in JuMP.SecondOrderCone());
+        JuMP.@objective(m2, Max, LinearAlgebra.dot(μ,x));
+        JuMP.set_parameter_value.([μ; γ], param_value);
+        
+        L2ODLL.decompose!(m2, L2ODLL.BoundDecomposition(m2), dll_layer_builder=(d,p,m) -> L2ODLL.jump_builder(d,p,m,SOLVER));
+        dobj2 = L2ODLL.dual_objective(m2, blp_y_pred, param_value)
 
         JuMP.set_optimizer(m, Clarabel.Optimizer);
         JuMP.set_silent(m);
         JuMP.optimize!(m)
-        blp_y_true = JuMP.dual.(blp_cache.decomposition.y_ref)
-        dobj1 = blp_cache.dll_layer(blp_y_true, param_value)
-        dobj2 = blp_cache.dll_layer(blp_y_true, param_value)
-        @test isapprox(dobj1, dobj2, atol=1e-6) && isapprox(dobj1, JuMP.objective_value(m), atol=1e-6)
+        blp_y_true = JuMP.dual.(L2ODLL.get_cache(m).decomposition.y_ref)
+
+        dobj1 = L2ODLL.dual_objective(m, blp_y_true, param_value)
+        @test isapprox(dobj1, JuMP.objective_value(m), atol=1e-6)
     end
 end
\ No newline at end of file