Commit bd4253b

add example docs
1 parent e059641 commit bd4253b

File tree

4 files changed: +266 -5 lines changed

examples/blind_deconv.py

Lines changed: 72 additions & 2 deletions
@@ -38,16 +38,78 @@ def _():
     return BiconvexProblem, cp, mo, np, plt


+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Introduction
+
+    Blind deconvolution is a technique for recovering a sharp signal or image from a blurred observation when the blur itself is unknown.
+    It jointly estimates the original signal and the blur kernel, using prior knowledge about their structure.
+
+    Suppose we are given a data vector $d \in \mathbf{R}^{m + n - 1}$, which is the convolution of an unknown sparse signal $x \in \mathbf{R}^n$ and an unknown smooth vector $y \in \mathbf{R}^m$ with bounded $\ell_\infty$-norm (i.e., bounded largest entry).
+    Additionally, we have the prior knowledge that both the vectors $x$ and $y$ are nonnegative.
+    The corresponding blind deconvolution problem can be formulated as the following biconvex optimization problem:
+
+    \[
+    \begin{array}{ll}
+    \text{minimize} & {\|x \otimes y - d\|}_2^2 + \alpha_{\rm sp} {\|x\|}_1 + \alpha_{\rm sm} {\|Dy\|}_2^2\\
+    \text{subject to} & x \succeq 0,\quad y \succeq 0\\
+    & {\|y\|}_\infty \leq \beta
+    \end{array}
+    \]
+
+    with variables $x$ and $y$, where $\alpha_{\rm sp}, \alpha_{\rm sm} > 0$ are the regularization parameters for the sparsity of $x$ and the smoothness of $y$, respectively, and $\beta > 0$ is the bound on the $\ell_\infty$-norm of the vector $y$.
+    The matrix $D \in \mathbf{R}^{(m - 1) \times m}$ is the first-order difference operator, given by
+
+    \[
+    D = \left[\begin{array}{ccccc}
+    1 & -1 &&&\\
+    & 1 & -1 &&\\
+    && \ddots & \ddots &\\
+    &&& 1 & -1
+    \end{array}\right] \in \mathbf{R}^{(m - 1) \times m},
+    \]
+
+    so that $Dy$ is the vector of successive differences of $y$.
+    The convolution $x \otimes y$ of the vectors $x$ and $y$ is given by
+
+    \[
+    {(x \otimes y)}_k = \sum_{i + j = k} x_i y_j,\quad k = 1, \ldots, m + n - 1.
+    \]
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Generate problem data
+    """)
+    return
+
+
 @app.cell
-def _(BiconvexProblem, conv, cp, np):
+def _(np):
     n = 120
     m = 40

     x0 = np.zeros(n)
     x0[6] = 1
     y0 = np.exp(-np.square(np.linspace(-2, 2, m)) * 2)
     d = np.convolve(x0, y0)
+    return d, m, n, x0, y0
+

+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Specify and solve the problem
+    """)
+    return
+
+
+@app.cell
+def _(BiconvexProblem, conv, cp, d, m, n):
     alpha_sp = 0.1
     alpha_sm = 0.2
     beta = 1
@@ -58,7 +120,15 @@ def _(BiconvexProblem, conv, cp, np):
     constr = [cp.norm(y, "inf") <= beta]
     prob = BiconvexProblem(cp.Minimize(obj), [[x], [y]], constr)
     prob.solve(cp.CLARABEL, gap_tolerance=1e-5, max_iter=200)
-    return d, x, x0, y, y0
+    return x, y
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Plot the results
+    """)
+    return


 @app.cell
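
To make the two operators in the formulation above concrete, here is a small NumPy check of the difference matrix $D$ and the convolution definition. It is an illustrative sketch only, not part of the committed notebook, and the variable names are hypothetical:

```python
import numpy as np

# Illustrative check of the operators used in the blind deconvolution
# formulation above (not part of the committed notebook).
n, m = 6, 4
x = np.random.rand(n)
y = np.random.rand(m)

# First-order difference operator D in R^{(m-1) x m}: (D y)_i = y_i - y_{i+1}.
D = np.eye(m - 1, m) - np.eye(m - 1, m, k=1)
assert np.allclose(D @ y, y[:-1] - y[1:])

# Convolution (x ⊗ y)_k = sum_{i + j = k} x_i y_j, k = 1, ..., m + n - 1,
# written with 0-based indices; it coincides with NumPy's full convolution.
xy = np.array([
    sum(x[i] * y[k - i] for i in range(n) if 0 <= k - i < m)
    for k in range(m + n - 1)
])
assert np.allclose(xy, np.convolve(x, y))
```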

examples/dict_learning.py

Lines changed: 50 additions & 1 deletion
@@ -38,14 +38,55 @@ def _():
     return BiconvexProblem, cp, mo, np, plt


+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Introduction
+
+    We consider the sparse dictionary learning problem: find a dictionary matrix $D \in \mathbf{R}^{m \times k}$ and a sparse code matrix $X \in \mathbf{R}^{k \times n}$ such that the data matrix $Y \in \mathbf{R}^{m \times n}$ is well approximated by the product $DX$, where $X$ is sparse and $D$ has bounded Frobenius norm.
+    The dictionary learning problem can be formulated as the following biconvex optimization problem:
+
+    \[
+    \begin{array}{ll}
+    \text{minimize} & {\|DX - Y\|}_F^2 + \alpha {\|X\|}_1\\
+    \text{subject to} & {\|D\|}_F \leq \beta
+    \end{array}
+    \]
+
+    with variables $D$ and $X$, where $\alpha > 0$ is the sparsity regularization parameter and $\beta > 0$ is the bound on the Frobenius norm of the dictionary matrix.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Generate problem data
+    """)
+    return
+
+
 @app.cell
-def _(BiconvexProblem, cp, np):
+def _(np):
     m = 10
     n = 20
     k = 20
     beta = 1

     Y = np.random.randn(m, n)
+    return Y, beta, k, m, n
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Specify and solve the problem
+    """)
+    return
+
+
+@app.cell
+def _(BiconvexProblem, Y, beta, cp, k, m, n, np):
     D = cp.Variable((m, k))
     X = cp.Variable((k, n))
     alpha = cp.Parameter(nonneg=True)
@@ -64,6 +105,14 @@ def _(BiconvexProblem, cp, np):
     return cards, errs


+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Plot the results
+    """)
+    return
+
+
 @app.cell
 def _(cards, errs, plt):
     fig, axs = plt.subplots(1, 1, figsize=(4, 3))
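
The committed cell solves this problem with the `BiconvexProblem` helper. As a point of reference, the same formulation can be attacked by plain alternating convex minimization, since each block of variables enters the objective convexly when the other block is fixed. The sketch below is an assumption-laden illustration (the hyperparameters and iteration count are arbitrary), not the notebook's implementation:

```python
import cvxpy as cp
import numpy as np

# Illustrative alternating minimization for the dictionary learning problem
# above; hyperparameters and iteration count are arbitrary choices.
np.random.seed(0)
m, n, k, alpha, beta = 10, 20, 20, 0.1, 1.0
Y = np.random.randn(m, n)

D_val = np.random.randn(m, k)
D_val *= beta / np.linalg.norm(D_val, "fro")  # feasible starting dictionary
for _ in range(20):
    # X-step: with D fixed, the problem is convex in X.
    X = cp.Variable((k, n))
    cp.Problem(cp.Minimize(
        cp.sum_squares(D_val @ X - Y) + alpha * cp.sum(cp.abs(X))
    )).solve()
    # D-step: with X fixed, the problem is convex in D.
    D = cp.Variable((m, k))
    cp.Problem(cp.Minimize(cp.sum_squares(D @ X.value - Y)),
               [cp.norm(D, "fro") <= beta]).solve()
    D_val = D.value
```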

examples/iohmm.py

Lines changed: 96 additions & 2 deletions
@@ -1,12 +1,14 @@
 import marimo

-__generated_with = "0.17.3"
+__generated_with = "0.17.6"
 app = marimo.App(width="medium")


 @app.cell(hide_code=True)
 def _(mo):
-    mo.md(r"""# Fitting Input-output Hidden Markov Models""")
+    mo.md(r"""
+    # Fitting Input-output Hidden Markov Models
+    """)
     return


@@ -36,6 +38,74 @@ def _():
     return BiconvexRelaxProblem, cp, mo, np, plt


+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Introduction
+
+    We consider the problem of fitting a logistic input-output hidden Markov model (IO-HMM) to a dataset.
+    Suppose we are given a dataset $(x(t), y(t))$, $t = 1, \ldots, m$, where each sample consists of an input feature vector $x(t) \in \mathbf{R}^n$ and an output label $y(t) \in \{0, 1\}$, generated from a $K$-state IO-HMM according to the following procedure:
+    Let $\hat{z}(t) \in \{1, \ldots, K\}$, $t = 1, \ldots, m$, be the state labels of the IO-HMM with initial state distribution $p_{\rm init} \in \mathbf{R}^K$ with $\mathbf{1}^T p_{\rm init} = 1$ and transition matrix $P_{\rm tr} \in \mathbf{R}^{K \times K}$ with $P_{\rm tr} \mathbf{1} = \mathbf{1}$.
+    At time step $t$, the state label $\hat{z}(t)$ is sampled according to
+
+    \[
+    \hat{z}(t) \sim \left\{
+    \begin{array}{ll}
+    {\rm Cat}(p_{\rm init}) & t = 1\\
+    {\rm Cat}(p_{\hat{z}(t - 1)}) & t > 1,
+    \end{array}\right.
+    \]
+
+    where the vector $p_{\hat{z}(t-1)} \in \mathbf{R}^K$ denotes the $\hat{z}(t-1)$th row of the matrix $P_{\rm tr}$, and ${\rm Cat}(p)$ denotes the categorical distribution with $p$ being the vector of category probabilities.
+    Then, given the feature vector $x(t) \in \mathbf{R}^n$, the output $y(t) \in \{0, 1\}$ of this IO-HMM at time step $t$ is generated from a logistic model, i.e.,
+
+    \[
+    \mathop{\bf prob}(y(t) = 1) = \frac{1}{1 + \exp(-{x(t)}^T \theta_{\hat{z}(t)})},
+    \]
+
+    where $\theta_{\hat{z}(t)} \in \{\theta_1, \ldots, \theta_K\} \subseteq \mathbf{R}^n$ is the coefficient vector associated with the state $\hat{z}(t)$.
+
+    We are interested in recovering the transition matrix $P_{\rm tr}$, the model parameters $\theta_1, \ldots, \theta_K$, and the unobserved state labels $\hat{z}(1), \ldots, \hat{z}(m)$, given the dataset $(x(t), y(t))$, $t = 1, \ldots, m$.
+    Noting that the transition matrix $P_{\rm tr}$ can easily be estimated from the state labels $\hat{z}(t)$, $t = 1, \ldots, m$, we consider the following biconvex optimization problem for fitting the IO-HMM:
+
+    \[
+    \begin{array}{ll}
+    \text{minimize} & -\sum_{t = 1}^{m} {z(t)}^T {\left(y(t){x(t)}^T \theta_k - \log(1 + \exp({x(t)}^T \theta_k))\right)}_{k = 1}^K\\
+    &\qquad + \alpha_\theta \sum_{k = 1}^{K} {\|\theta_k\|}^2_2 + \alpha_z \sum_{t = 1}^{m - 1} D_{\rm kl}(z(t), z(t + 1))\\
+    \text{subject to} & 0 \preceq z(t) \preceq \mathbf{1},\quad \mathbf{1}^T z(t) = 1,\quad t = 1, \ldots, m\\
+    & \theta_k \in {\cal C}_k,\quad k = 1, \ldots, K,
+    \end{array}
+    \]
+
+    where the optimization variables are $\theta_k \in \mathbf{R}^n$, $k = 1, \ldots, K$, and $z(t) \in \mathbf{R}^K$, $t = 1, \ldots, m$.
+    Note that the variable $z(t)$ is a soft assignment vector for the hidden state label $\hat{z}(t)$, where the $k$th entry of $z(t)$ indicates the probability of the state being $k$ at time step $t$; after solving the problem above, $\hat{z}(t)$ can be estimated as the index of the largest entry of $z(t)$.
+
+    Each component of this problem can be interpreted as follows:
+    The first term in the objective function is the negative log-likelihood of the observed data under the IO-HMM, given the state assignment probabilities $z(t)$, $t = 1, \ldots, m$, and the model parameters $\theta_k$, $k = 1, \ldots, K$.
+    The second term is a Tikhonov regularization on the model parameters $\theta_k$, with regularization parameter $\alpha_\theta > 0$.
+    The third term is a temporal smoothness regularization on the state assignment probabilities, where $D_{\rm kl}(p, q)$ denotes the Kullback-Leibler divergence between two probability distributions $p$ and $q$, and $\alpha_z > 0$ is the corresponding regularization parameter.
+    The constraints on the variables $z(t)$, $t = 1, \ldots, m$, ensure that they are valid probability distributions.
+    The sets ${\cal C}_k \subseteq \mathbf{R}^n$, $k = 1, \ldots, K$, are nonempty closed convex sets that encode potential prior knowledge about the model parameters $\theta_k$.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Generate problem data
+
+    We consider the case $n = 2$, where the feature vector for each sample is generated according to
+
+    \[
+    x(t) \sim ({\cal U}(-5, 5),\ 1),
+    \]
+
+    where ${\cal U}(a, b)$ denotes a uniform distribution over the interval $[a, b]$, and the second entry of $x(t)$ is always $1$ to account for the bias term.
+    """)
+    return
+
+
 @app.cell
 def _(np):
     m = 1800
@@ -58,6 +128,22 @@ def _(np):
     return K, coefs, labels, m, n, p_tr, xs, ys


+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Specify and solve the problem
+
+    To fully specify the biconvex problem, it is assumed that we are given the following prior knowledge about the coefficients:
+
+    \[
+    \theta_{1,1} \leq 0,\quad \theta_{2, 1} \geq 0,\quad \theta_{3, 1} \geq 0,\quad \theta_{2, 2} \geq \theta_{3, 2},
+    \]
+
+    where $\theta_{i, j}$ denotes the $j$th entry of the vector $\theta_i$.
+    """)
+    return
+
+
 @app.cell
 def _(BiconvexRelaxProblem, K, cp, m, n, xs, ys):
     thetas = cp.Variable((K, n))
@@ -87,6 +173,14 @@ def _(BiconvexRelaxProblem, K, cp, m, n, xs, ys):
     return thetas, zs


+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Plot the results
+    """)
+    return
+
+
 @app.cell
 def _(K, coefs, labels, m, np, plt, thetas, zs):
     fig, axs = plt.subplots(1, 2, figsize=(8, 3), width_ratios=(1.2, 1))
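
One way to see why the first objective term above is convex in the $\theta_k$ for fixed assignments is to write it with CVXPY's `logistic` atom. The sketch below illustrates that $\theta$-step only, with the soft assignments $z(t)$ held fixed; the data is random, the regularization value is arbitrary, and only two of the prior constraints are included (in 0-indexed form). It is not the notebook's `BiconvexRelaxProblem` setup:

```python
import cvxpy as cp
import numpy as np

# Illustrative theta-step for the IO-HMM objective above, with the soft
# assignments z(t) held fixed; data and parameters here are hypothetical.
np.random.seed(0)
m, n, K, alpha_theta = 200, 2, 3, 1e-3
xs = np.column_stack([np.random.uniform(-5, 5, m), np.ones(m)])  # features with bias entry
ys = np.random.randint(0, 2, m).astype(float)                    # binary labels
Z = np.full((m, K), 1.0 / K)                                     # fixed soft assignments

thetas = cp.Variable((K, n))
logits = xs @ thetas.T                                  # (m, K): entry (t, k) is x(t)^T theta_k
loglik = cp.multiply(np.tile(ys[:, None], (1, K)), logits) - cp.logistic(logits)
obj = -cp.sum(cp.multiply(Z, loglik)) + alpha_theta * cp.sum_squares(thetas)
constr = [thetas[0, 0] <= 0, thetas[1, 0] >= 0]         # two of the prior constraints, 0-indexed
cp.Problem(cp.Minimize(obj), constr).solve()
```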

examples/kmeans.py

Lines changed: 48 additions & 0 deletions
@@ -39,6 +39,38 @@ def _():
     return BiconvexProblem, cp, make_blobs, mo, np, plt


+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Introduction
+
+    Suppose we are given a set of data points $x_i \in \mathbf{R}^n$, $i = 1, \ldots, m$, and we would like to cluster them into $k$ groups using the $k$-means clustering method.
+    This corresponds to the following biconvex optimization problem:
+
+    \[
+    \begin{array}{ll}
+    \text{minimize} & \sum_{i = 1}^{m} z_i^T ({\|\bar{x}_1 - x_i\|}_2^2, \ldots, {\|\bar{x}_k - x_i\|}_2^2)\\
+    \text{subject to} & 0 \preceq z_i \preceq \mathbf{1},\quad \mathbf{1}^T z_i = 1,\quad i = 1, \ldots, m
+    \end{array}
+    \]
+
+    with variables $\bar{x}_i \in \mathbf{R}^n$, $i = 1, \ldots, k$, and $z_i \in \mathbf{R}^k$, $i = 1, \ldots, m$.
+
+    We can interpret the problem formulation as follows:
+    The variables $\bar{x}_1, \ldots, \bar{x}_k$ represent the cluster centroids, and each variable $z_i$ is a soft assignment vector for data point $x_i$, where the $j$th entry of $z_i$ indicates the probability of the sample $x_i$ belonging to cluster $j$.
+    The objective function is then the total within-cluster sum of squared distances, which we would like to minimize.
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Generate problem data
+    """)
+    return
+
+
 @app.cell
 def _(make_blobs):
     n = 2
@@ -49,6 +81,14 @@ def _(make_blobs):
     return k, m, n, xs


+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Specify and solve the problem
+    """)
+    return
+
+
 @app.cell
 def _(BiconvexProblem, cp, k, m, n, xs):
     xbars = cp.Variable((k, n))
@@ -62,6 +102,14 @@ def _(BiconvexProblem, cp, k, m, n, xs):
     return xbars, zs


+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Plot the results
+    """)
+    return
+
+
 @app.cell
 def _(np, plt, xbars, xs, zs):
     fig, axs = plt.subplots(1, 1, figsize=(3, 3))
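
The alternating structure of the formulation above is easy to see in plain NumPy: with the centroids fixed, the optimal $z_i$ puts all of its mass on the nearest centroid, and with the assignments fixed, each centroid is the mean of its assigned points. The sketch below is a minimal illustration with synthetic data and an arbitrary iteration count; it is not the notebook's `make_blobs`/`BiconvexProblem` code:

```python
import numpy as np

# Illustrative alternating minimization of the k-means objective above
# (Lloyd-style iterations); the data and settings are hypothetical.
rng = np.random.default_rng(0)
k, n = 3, 2
xs = np.concatenate([rng.normal(c, 0.5, (60, n)) for c in ((0, 0), (3, 0), (0, 3))])

xbars = xs[rng.choice(len(xs), k, replace=False)]  # initial centroids
for _ in range(10):
    # z-step: with centroids fixed, the optimal assignment concentrates on the
    # nearest centroid, so the soft assignment reduces to a hard one.
    dists = ((xs[:, None, :] - xbars[None, :, :]) ** 2).sum(axis=2)  # (m, k)
    zs = np.eye(k)[dists.argmin(axis=1)]
    # xbar-step: with assignments fixed, each centroid is the mean of its points.
    counts = np.maximum(zs.sum(axis=0), 1)  # guard against empty clusters
    xbars = (zs.T @ xs) / counts[:, None]
```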
