
Commit 1bcae8f

Author ArnaudDeza committed: initial slides
1 parent 9dce1fc commit 1bcae8f

File tree

8 files changed, +917 -0 lines changed

264 KB
Binary file not shown.

class02/SQP.tex

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
\section{Sequential Quadratic Programming (SQP)}

% ------------------------------------------------
\begin{frame}{What is SQP?}
\textbf{Idea:} Solve a nonlinear, constrained problem by repeatedly solving a \emph{quadratic program (QP)} built from local models.\\[4pt]
\begin{itemize}
\item Linearize constraints; quadratic model of the Lagrangian/objective.
\item Each iteration: solve a QP to get a step \(d\), update \(x \leftarrow x + \alpha d\).
\item Strength: strong local convergence (often superlinear) with good Hessian info.
\end{itemize}
\end{frame}

% ------------------------------------------------
\begin{frame}{Target Problem (NLP)}
\[
\min_{x \in \R^n} \ f(x)
\quad
\text{s.t.}\quad
g(x)=0,\quad h(x)\le 0
\]
\begin{itemize}
\item \(f:\R^n\!\to\!\R\), \(g:\R^n\!\to\!\R^{m}\) (equalities), \(h:\R^n\!\to\!\R^{p}\) (inequalities).
\item KKT recap (at candidate optimum \(x^\star\)):
\[
\exists \ \lambda \in \R^{m},\ \mu \in \R^{p}_{\ge 0}:
\ \grad f(x^\star) + \nabla g(x^\star)^T\lambda + \nabla h(x^\star)^T \mu = 0,
\]
\[
g(x^\star)=0,\quad h(x^\star)\le 0,\quad \mu \ge 0,\quad \mu \odot h(x^\star) = 0.
\]
\end{itemize}
\end{frame}

% ------------------------------------------------
\begin{frame}{From NLP to a QP (Local Model)}
At iterate \(x_k\) with multipliers \((\lambda_k,\mu_k)\):\\[4pt]
\textbf{Quadratic model of the Lagrangian}
\[
m_k(d) = \ip{\grad f(x_k)}{d} + \tfrac{1}{2} d^T B_k d
\]
with \(B_k \approx \nabla^2_{xx}\Lag(x_k,\lambda_k,\mu_k)\).\\[6pt]
\textbf{Linearized constraints}
\[
g(x_k) + \nabla g(x_k)\, d = 0,\qquad
h(x_k) + \nabla h(x_k)\, d \le 0.
\]
\end{frame}

% ------------------------------------------------
\begin{frame}{The SQP Subproblem (QP)}
\[
\begin{aligned}
\min_{d \in \R^n}\quad & \grad f(x_k)^T d + \tfrac{1}{2} d^T B_k d \\
\text{s.t.}\quad & \nabla g(x_k)\, d + g(x_k) = 0, \\
& \nabla h(x_k)\, d + h(x_k) \le 0.
\end{aligned}
\]
\begin{itemize}
\item Solve QP \(\Rightarrow\) step \(d_k\) and updated multipliers \((\lambda_{k+1},\mu_{k+1})\).
\item Update \(x_{k+1} = x_k + \alpha_k d_k\) (line search or trust-region).
\end{itemize}
\end{frame}
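
% ------------------------------------------------
% Illustrative addition (not part of the original deck): for the equality-only
% case, the QP above reduces to a single symmetric linear solve. A minimal
% Julia sketch; the function name solve_eq_qp is hypothetical.
\begin{frame}[fragile]{Sketch: Equality-Only QP as One Linear Solve (Julia)}
\begin{verbatim}
using LinearAlgebra

# QP:  min  gradf'd + 0.5 d'B d   s.t.  A d + c = 0
# Its optimality conditions form one symmetric "KKT" (saddle-point) system:
#   [ B  A' ] [ d      ]   [ -gradf ]
#   [ A  0  ] [ lambda ] = [ -c     ]
function solve_eq_qp(gradf, B, A, c)
    n, m = length(gradf), size(A, 1)
    K = [B A'; A zeros(m, m)]        # KKT matrix
    sol = K \ [-gradf; -c]           # one factorization per SQP iteration
    return sol[1:n], sol[n+1:end]    # step d_k and new multipliers
end
\end{verbatim}
\end{frame}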
63+
64+
% ------------------------------------------------
65+
\begin{frame}{Algorithm Sketch (SQP)}
66+
\begin{enumerate}
67+
\item Start with \(x_0\), multipliers \((\lambda_0,\mu_0)\), and \(B_0 \succ 0\).
68+
\item Build QP at \(x_k\) with \(B_k\), linearized constraints.
69+
\item Solve QP \(\Rightarrow\) get \(d_k\), \((\lambda_{k+1},\mu_{k+1})\).
70+
\item Globalize: line search on merit or use filter/TR to choose \(\alpha_k\).
71+
\item Update \(x_{k+1} = x_k + \alpha_k d_k\), update \(B_{k+1}\) (e.g., BFGS).
72+
\end{enumerate}
73+
\end{frame}
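
% ------------------------------------------------
% Illustrative addition (not part of the original deck): step 5 mentions a BFGS
% update of B_k. A standard choice in SQP is Powell's damped BFGS, which keeps
% B_k positive definite even when the curvature s'y is negative. A minimal
% Julia sketch; the name damped_bfgs_update is hypothetical.
\begin{frame}[fragile]{Sketch: Damped BFGS Update for \(B_k\)}
\begin{verbatim}
using LinearAlgebra

# Powell-damped BFGS:  s = x_{k+1} - x_k,
#   y = grad_x L(x_{k+1}, lambda_{k+1}) - grad_x L(x_k, lambda_{k+1})
function damped_bfgs_update(B, s, y)
    Bs, sy = B * s, dot(s, y)
    sBs = dot(s, Bs)
    # Damping: if curvature s'y is too small, blend y toward B*s.
    theta = sy >= 0.2 * sBs ? 1.0 : 0.8 * sBs / (sBs - sy)
    r = theta * y + (1 - theta) * Bs        # damped secant vector
    return B - (Bs * Bs') / sBs + (r * r') / dot(s, r)
end
\end{verbatim}
\emph{With this damping, \(s^T r \ge 0.2\, s^T B_k s > 0\), so \(B_{k+1}\) stays positive definite.}
\end{frame}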

% ------------------------------------------------
\begin{frame}{Toy Example (Local Models)}
\textbf{Problem:}
\[
\min_{x\in\R^2} \ \tfrac{1}{2}\norm{x}^2
\quad \text{s.t.} \quad g(x)=x_1^2 + x_2 - 1 = 0,\ \ h(x)=x_2 - 0.2 \le 0.
\]
At \(x_k\), build QP with
\[
\grad f(x_k)=x_k,\quad B_k=I,\quad
\nabla g(x_k) = \begin{bmatrix} 2x_{k,1} & 1 \end{bmatrix},\
\nabla h(x_k) = \begin{bmatrix} 0 & 1 \end{bmatrix}.
\]
Solve for \(d_k\), then \(x_{k+1}=x_k+\alpha_k d_k\) (a Julia sketch follows).
\end{frame}
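
% ------------------------------------------------
% Illustrative addition (not part of the original deck and not from the course
% notebooks): a complete SQP loop for this toy problem. With a single
% inequality and B = I positive definite, the QP subproblem can be solved by a
% two-case active-set check. All names (qp_step, sqp_toy) are hypothetical.
\begin{frame}[fragile]{Sketch: SQP Loop on the Toy Problem (Julia)}
\begin{verbatim}
using LinearAlgebra

# Toy problem: min 0.5||x||^2  s.t.  g(x) = 0,  h(x) <= 0
f(x) = 0.5 * dot(x, x);      gradf(x) = x
g(x) = x[1]^2 + x[2] - 1.0;  Jg(x) = [2x[1] 1.0]   # 1x2 Jacobian
h(x) = x[2] - 0.2;           Jh(x) = [0.0 1.0]

# QP subproblem via a two-case active-set check (valid here: one
# inequality, B positive definite).
function qp_step(x, B)
    # Case 1: assume h inactive (mu = 0); equality-only KKT solve.
    K = [B Jg(x)'; Jg(x) zeros(1, 1)]
    sol = K \ [-gradf(x); -g(x)]
    d = sol[1:2]
    if (Jh(x) * d)[1] + h(x) <= 1e-12
        return d, sol[3], 0.0          # linearized h already satisfied
    end
    # Case 2: h active; add it as an equality (mu = sol[4] >= 0 confirms).
    A = [Jg(x); Jh(x)]
    K = [B A'; A zeros(2, 2)]
    sol = K \ [-gradf(x); -g(x); -h(x)]
    return sol[1:2], sol[3], sol[4]
end

function sqp_toy(x0; iters = 20)
    x = copy(x0)
    B = Matrix{Float64}(I, 2, 2)       # B_k = I (exact Hessian of f here)
    for _ in 1:iters
        d, lam, mu = qp_step(x, B)
        x += d                         # alpha = 1: full steps suffice here
        norm(d) < 1e-10 && break
    end
    return x
end

@show sqp_toy([2.0, 2.0])   # -> approx [0.894, 0.2]; h active at solution
\end{verbatim}
\end{frame}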

% ------------------------------------------------
\begin{frame}{Globalization: Making SQP Robust}
SQP is an important method, and many issues must be addressed to obtain an \textbf{efficient} and \textbf{reliable} implementation:
\begin{itemize}
\item Efficient solution of the linear systems at each Newton iteration (the block structure of the matrix can be exploited).
\item Quasi-Newton approximations to the Hessian.
\item Trust region, line search, etc.\ to improve robustness (e.g., TR: restrict \(\norm{d}\) to maintain model validity); a merit line-search sketch follows this slide.
\item Treatment of constraints (equality and inequality) during the iterative process.
\item Selection of a good starting guess for $\lambda$.
\end{itemize}
\end{frame}
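
% ------------------------------------------------
% Illustrative addition (not part of the original deck): a common line-search
% globalization uses the l1 merit function
%   phi(x) = f(x) + rho * ( |g(x)| + max(h(x), 0) ),
% trading off optimality against feasibility. A simplified Julia sketch
% reusing f, g, h from the toy-example sketch; the names merit and backtrack
% are hypothetical, and the acceptance test below is a crude stand-in for the
% usual Armijo sufficient-decrease condition on the merit function.
\begin{frame}[fragile]{Sketch: Backtracking on an \(\ell_1\) Merit Function}
\begin{verbatim}
# l1 merit: objective plus weighted infeasibility (rho should dominate
# the multiplier magnitudes; f, g, h as in the toy example).
merit(x; rho = 10.0) = f(x) + rho * (abs(g(x)) + max(h(x), 0.0))

# Shrink alpha until the merit value decreases (simplified test).
function backtrack(x, d; rho = 10.0, shrink = 0.5, alpha_min = 1e-8)
    alpha = 1.0
    while merit(x + alpha * d; rho = rho) > merit(x; rho = rho) &&
          alpha > alpha_min
        alpha *= shrink
    end
    return alpha
end
\end{verbatim}
\end{frame}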

% ------------------------------------------------
\begin{frame}{Final Takeaways on SQP}
\textbf{When SQP vs.\ Interior-Point?}
\begin{itemize}
\item \textbf{SQP}: strong local convergence; warm-start friendly; natural for NMPC.
\item \textbf{IPM}: very robust for large, strictly feasible problems; good for dense inequality sets.
\item In practice: both are valuable; choose to match problem structure and runtime needs.
\end{itemize}
\textbf{Takeaways of SQP}
\begin{itemize}
\item SQP = Newton-like method using a sequence of structured QPs.
\item Globalization (merit/filter/TR) makes it reliable from poor starts.
\item Excellent fit for control (NMPC/trajectory optimization) due to sparsity and warm starts.
\end{itemize}
\end{frame}

class02/eq_constraints.tex

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
%\section{Part II -- Equality constraints: KKT, Newton vs. Gauss–Newton}
\section{Constrained Optimization}

% ==== Equality constraints: KKT, Newton vs. Gauss–Newton ====

\begin{frame}{Equality-constrained minimization: geometry and conditions}
\textbf{Problem.} $\min_{x\in\mathbb{R}^n} f(x)\quad \text{s.t.}\quad C(x)=0,\qquad C:\mathbb{R}^n\to\mathbb{R}^m$.

\medskip
\textbf{Geometric picture.} At an optimum on the manifold $C(x)=0$, the gradient must be orthogonal to the tangent space (no feasible descent direction remains):

$$
\grad f(x^\star)\ \perp\ \mathcal{T}_{x^\star}=\{p:\; J_C(x^\star)p=0\}.
$$

Equivalently, the gradient is a linear combination of constraint normals:

$$
\grad f(x^\star)+J_C(x^\star)^{\!T}\lambda^\star=0,\qquad C(x^\star)=0\quad(\lambda^\star\in\mathbb{R}^m).
$$

\medskip
\textbf{Lagrangian.} $L(x,\lambda)=f(x)+\lambda^{\!T}C(x)$.
\end{frame}

\begin{frame}{A nicer visual explanation/derivation of KKT conditions}
\begin{center}
Quick little whiteboard derivation
\end{center}
\end{frame}
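
% ------------------------------------------------
% Illustrative addition (not part of the original deck): a compressed version
% of the whiteboard argument, in case the slides are read without the lecture.
\begin{frame}{Whiteboard derivation, compressed (sketch)}
Take any smooth feasible curve $x(t)$ with $x(0)=x^\star$ and $C(x(t))=0$. Differentiating the constraint gives $J_C(x^\star)\,\dot{x}(0)=0$, so $\dot{x}(0)$ is an arbitrary tangent vector $p$. Optimality along the curve forces
$$
0=\tfrac{d}{dt}f(x(t))\Big|_{t=0}=\grad f(x^\star)^{\!T}p\qquad\text{for all }p\ \text{with } J_C(x^\star)p=0.
$$
Hence $\grad f(x^\star)\perp\ker J_C(x^\star)$, i.e.\ $\grad f(x^\star)\in\operatorname{range}\,J_C(x^\star)^{\!T}$, which (under full row rank of $J_C(x^\star)$) is exactly
$$
\grad f(x^\star)+J_C(x^\star)^{\!T}\lambda^\star=0\quad\text{for some }\lambda^\star\in\mathbb{R}^m.
$$
\end{frame}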

\begin{frame}{KKT system for equalities (first-order necessary conditions)}
\textbf{KKT (FOC).}

$$
\grad_x L(x,\lambda)=\grad f(x)+J_C(x)^{\!T}\lambda=0,\qquad \grad_\lambda L(x,\lambda)=C(x)=0.
$$

\textbf{Solve by Newton on KKT:} linearize both optimality and feasibility:

$$
\begin{bmatrix}
\hess f(x) + \sum_{i=1}^m \lambda_i\,\hess C_i(x) & J_C(x)^{\!T}\\[2pt]
J_C(x) & 0
\end{bmatrix}
\begin{bmatrix}\Delta x\\ \Delta\lambda\end{bmatrix}
=-
\begin{bmatrix}
\grad f(x)+J_C(x)^{\!T}\lambda\\ C(x)
\end{bmatrix}.
$$

\textit{Notes.} This is a symmetric \emph{saddle-point} system; typical solves use block elimination (Schur complement) or sparse factorizations.
\end{frame}

\begin{frame}{Move to Julia Code}
\begin{center}
\textbf{Quick Demo of Julia Notebook: part2\_eq\_constraints.ipynb}
\end{center}
(An illustrative stand-alone sketch follows on the next slide.)
\end{frame}
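
% ------------------------------------------------
% Illustrative addition (not part of the original deck, and not necessarily
% what the notebook contains): a self-contained Newton-on-KKT iteration for a
% single equality constraint, including the constraint-curvature term
% lambda * hess C. The name newton_kkt is hypothetical.
\begin{frame}[fragile]{Sketch: Newton on the KKT System (Julia)}
\begin{verbatim}
using LinearAlgebra

# min 0.5||x||^2  s.t.  C(x) = x1^2 + x2 - 1 = 0
C(x)     = x[1]^2 + x[2] - 1.0
gradC(x) = [2x[1], 1.0]                 # constraint gradient
HC       = [2.0 0.0; 0.0 0.0]           # hess C (constant here)

function newton_kkt(x, lam; iters = 10)
    for _ in 1:iters
        H = I + lam * HC                # hess_xx L = hess f + lam*hess C
        K = [H gradC(x); gradC(x)' 0.0] # saddle-point KKT matrix
        r = [x + lam * gradC(x); C(x)]  # KKT residual (grad f = x here)
        step = -(K \ r)                 # Newton step on (x, lambda)
        x, lam = x + step[1:2], lam + step[3]
        norm(r) < 1e-12 && break
    end
    return x, lam
end

@show newton_kkt([2.0, 2.0], 0.0)  # -> approx ([0.7071, 0.5], -0.5)
\end{verbatim}
\end{frame}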

\begin{frame}{Numerical practice: Newton on KKT}
\textbf{When it works best.}
\begin{itemize}
\item Near a regular solution with $J_{C}(x^\star)$ full row rank and positive-definite reduced Hessian.
\item With a globalization (line search on a merit function) and mild regularization for robustness.
\end{itemize}

\textbf{Common safeguards.}
\begin{itemize}
\item \emph{Regularize} the $(1,1)$ block to ensure a good search direction (e.g., add $\beta I$).
\item \emph{Merit/penalty} line search to balance feasibility vs.\ optimality during updates.
\item \emph{Scaling} constraints to improve conditioning of the KKT system.
\end{itemize}
\end{frame}

\begin{frame}{Gauss--Newton vs.\ full Newton on KKT}
\textbf{Full Newton Hessian of the Lagrangian:} $\nabla_{xx}^2 L(x,\lambda) = \hess f(x)+\sum_{i=1}^m \lambda_i\,\hess C_i(x)$

\textbf{Gauss--Newton approximation:} drop the \emph{constraint-curvature} term $\sum_i \lambda_i\,\hess C_i(x)$:

\begin{align*}
H_{\text{GN}}(x) &\approx \hess f(x).
\end{align*}

\textbf{Trade-offs (high level).}
\begin{itemize}
\item \emph{Full Newton:} fewer iterations near the solution, but each step is costlier and can be less robust far from it.
\item \emph{Gauss--Newton:} cheaper per step and often more stable; may need more iterations but wins in wall-clock time on many problems.
\end{itemize}

\textbf{Practice tip.} Start with GN (with line search); switch to full Newton (or add low-rank updates) as feasibility improves.
\end{frame}

% ==== Inequalities & KKT: complementarity ====

\begin{frame}{Inequality-constrained minimization and KKT}
\textbf{Problem.} $\quad \min f(x)\quad\text{s.t.}\quad c(x)\ge 0, \qquad c:\mathbb{R}^n\to\mathbb{R}^p$.

\textbf{KKT conditions (first-order).}

$$
\begin{aligned}
&\text{Stationarity:} && \grad f(x)-J_c(x)^{\!T}\lambda=0,\\
&\text{Primal feasibility:} && c(x)\ge 0,\\
&\text{Dual feasibility:} && \lambda\ge 0,\\
&\text{Complementarity:} && \lambda^{\!T}c(x)=0\quad(\text{i.e., }\lambda_i c_i(x)=0\ \forall i).
\end{aligned}
$$

\textbf{Interpretation.}
\begin{itemize}
\item \emph{Active} constraints: $c_i(x)=0 \Rightarrow \lambda_i$ may be positive (the constraint acts like an equality).
\item \emph{Inactive} constraints: $c_i(x)>0 \Rightarrow \lambda_i=0$ (no influence on optimality).
\end{itemize}
\end{frame}

\begin{frame}{Complementarity in plain English (and why Newton is tricky)}
\footnotesize

\textbf{What $\lambda_i c_i(x)=0$ means.}
\begin{itemize}
\item Tight constraint ($c_i=0$) $\Rightarrow$ can press back ($\lambda_i\ge0$).
\item Loose constraint ($c_i>0$) $\Rightarrow$ no force ($\lambda_i=0$).
\end{itemize}

\textbf{Why naive Newton fails.}
\begin{itemize}
\item Complementarity = nonsmooth + inequalities ($\lambda\ge0$, $c(x)\ge0$).
\item Equality-style Newton can violate nonnegativity or bounce across the boundary.
\end{itemize}

\textbf{Two main strategies (preview).}
\begin{itemize}
\item \emph{Active-set:} guess actives $\Rightarrow$ solve equality-constrained subproblem, update set.
\item \emph{Barrier/PDIP/ALM:} smooth or relax complementarity, damped Newton, drive relaxation $\to 0$ (see the sketch after this slide).
\end{itemize}
\end{frame}
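
% ------------------------------------------------
% Illustrative addition (not part of the original deck): one standard way to
% smooth complementarity is an NCP function such as Fischer--Burmeister,
%   phi(a, b) = a + b - sqrt(a^2 + b^2),
% which satisfies phi(a, b) = 0  <=>  a >= 0, b >= 0, ab = 0. Replacing each
% pair (lambda_i, c_i(x)) by phi(lambda_i, c_i(x)) = 0 turns the KKT system
% into (semi)smooth equations amenable to damped Newton. A tiny Julia check:
\begin{frame}[fragile]{Sketch: Smoothing Complementarity (Fischer--Burmeister)}
\begin{verbatim}
# Fischer-Burmeister NCP function: zero exactly on the complementarity set.
phi(a, b) = a + b - sqrt(a^2 + b^2)

phi(0.0, 3.0)   # =  0.0    (inactive constraint: lambda = 0, c > 0)
phi(2.0, 0.0)   # =  0.0    (active constraint:   lambda > 0, c = 0)
phi(1.0, 1.0)   # =  0.586  (violation: both positive, product nonzero)
phi(-1.0, 2.0)  # = -1.236  (violation: negative multiplier)
\end{verbatim}
\emph{A smoothed variant $\varphi_\tau(a,b)=a+b-\sqrt{a^2+b^2+2\tau}$ corresponds to the relaxed condition $\lambda_i c_i=\tau$ (an interior-point central path); drive $\tau\to 0$.}
\end{frame}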
