Commit 2f7b7ad

Merge pull request #185 from mrava87/feat-aapg
feat: added AndersonProximalGradient
2 parents 9a98393 + a4593a3 commit 2f7b7ad

5 files changed: +229 −12 lines

docs/source/api/index.rst
Lines changed: 1 addition & 0 deletions

@@ -157,6 +157,7 @@ Primal
     AcceleratedProximalGradient
     ADMM
     ADMML2
+    AndersonProximalGradient
     GeneralizedProximalGradient
     HQS
     LinearizedADMM

pyproximal/optimization/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -11,6 +11,8 @@
     ProximalPoint                  Proximal point algorithm (or proximal min.)
     ProximalGradient               Proximal gradient algorithm
     AcceleratedProximalGradient    Accelerated Proximal gradient algorithm
+    AndersonProximalGradient       Proximal gradient algorithm with Anderson acceleration
+    GeneralizedProximalGradient    Generalized Proximal gradient algorithm
     HQS                            Half Quadrating Splitting
     ADMM                           Alternating Direction Method of Multipliers
     ADMML2                         ADMM with L2 misfit term

pyproximal/optimization/primal.py
Lines changed: 199 additions & 3 deletions

@@ -156,7 +156,7 @@ def ProximalGradient(proxf, proxg, x0, epsg=1.,
         Proximal operator of g function
     x0 : :obj:`numpy.ndarray`
         Initial vector
-    epsg : :obj:`float` or :obj:`np.ndarray`, optional
+    epsg : :obj:`float` or :obj:`numpy.ndarray`, optional
         Scaling factor of g function
     tau : :obj:`float` or :obj:`numpy.ndarray`, optional
         Positive scalar weight, which should satisfy the following condition
@@ -195,7 +195,7 @@ def ProximalGradient(proxf, proxg, x0, epsg=1.,

     Notes
     -----
-    The Proximal point algorithm can be expressed by the following recursion:
+    The Proximal gradient algorithm can be expressed by the following recursion:

     .. math::

@@ -359,6 +359,202 @@ def AcceleratedProximalGradient(proxf, proxg, x0, tau=None, beta=0.5,
                               callback=callback, show=show)


+def AndersonProximalGradient(proxf, proxg, x0, epsg=1.,
+                             tau=None, niter=10,
+                             nhistory=10, epsr=1e-10,
+                             safeguard=False, tol=None,
+                             callback=None, show=False):
+    r"""Proximal gradient with Anderson acceleration
+
+    Solves the following minimization problem using the Proximal
+    gradient algorithm with Anderson acceleration:
+
+    .. math::
+
+        \mathbf{x} = \argmin_\mathbf{x} f(\mathbf{x}) + \epsilon g(\mathbf{x})
+
+    where :math:`f(\mathbf{x})` is a smooth convex function with a uniquely
+    defined gradient and :math:`g(\mathbf{x})` is any convex function that
+    has a known proximal operator.
+
+    Parameters
+    ----------
+    proxf : :obj:`pyproximal.ProxOperator`
+        Proximal operator of f function (must have ``grad`` implemented)
+    proxg : :obj:`pyproximal.ProxOperator`
+        Proximal operator of g function
+    x0 : :obj:`numpy.ndarray`
+        Initial vector
+    epsg : :obj:`float` or :obj:`numpy.ndarray`, optional
+        Scaling factor of g function
+    tau : :obj:`float` or :obj:`numpy.ndarray`, optional
+        Positive scalar weight, which should satisfy the following condition
+        to guarantee convergence: :math:`\tau \in (0, 1/L]` where ``L`` is
+        the Lipschitz constant of :math:`\nabla f`. When ``tau=None``,
+        backtracking is used to adaptively estimate the best tau at each
+        iteration. Finally, note that :math:`\tau` can be chosen to be a vector
+        when dealing with problems with multiple right-hand-sides
+    niter : :obj:`int`, optional
+        Number of iterations of iterative scheme
+    nhistory : :obj:`int`, optional
+        Number of previous iterates to be kept in memory (used to compute the
+        scaling factors)
+    epsr : :obj:`float`, optional
+        Scaling factor of the regularization added to :math:`\mathbf{R}^T \mathbf{R}`
+        before inversion
+    safeguard : :obj:`bool`, optional
+        Apply safeguarding strategy to the update (``True``) or not (``False``)
+    tol : :obj:`float`, optional
+        Tolerance on change of objective function (used as stopping criterion). If
+        ``tol=None``, run until ``niter`` is reached
+    callback : :obj:`callable`, optional
+        Function with signature (``callback(x)``) to call after each iteration
+        where ``x`` is the current model vector
+    show : :obj:`bool`, optional
+        Display iterations log
+
+    Returns
+    -------
+    x : :obj:`numpy.ndarray`
+        Inverted model
+
+    Notes
+    -----
+    The Proximal gradient algorithm with Anderson acceleration can be expressed by the
+    following recursion [1]_:
+
+    .. math::
+        m_k = \min(m, k)\\
+        \mathbf{g}^{k} = \mathbf{x}^{k} - \tau^k \nabla f(\mathbf{x}^k)\\
+        \mathbf{r}^{k} = \mathbf{g}^{k} - \mathbf{y}^{k}\\
+        \mathbf{G}^{k} = [\mathbf{g}^{k},..., \mathbf{g}^{k-m_k}]\\
+        \mathbf{R}^{k} = [\mathbf{r}^{k},..., \mathbf{r}^{k-m_k}]\\
+        \alpha_k = (\mathbf{R}^{kT} \mathbf{R}^{k})^{-1} \mathbf{1} / \mathbf{1}^T
+            (\mathbf{R}^{kT} \mathbf{R}^{k})^{-1} \mathbf{1}\\
+        \mathbf{y}^{k+1} = \mathbf{G}^{k} \alpha_k\\
+        \mathbf{x}^{k+1} = \prox_{\tau^{k+1} g}(\mathbf{y}^{k+1})
+
+    where :math:`m` equals ``nhistory``, :math:`k=1,2,...,n_{iter}`, :math:`\mathbf{y}^{0}=\mathbf{x}^{0}`,
+    :math:`\mathbf{y}^{1}=\mathbf{x}^{0} - \tau^0 \nabla f(\mathbf{x}^0)`,
+    :math:`\mathbf{x}^{1}=\prox_{\tau^0 g}(\mathbf{y}^{1})`, and
+    :math:`\mathbf{g}^{0}=\mathbf{y}^{1}`.
+
+    Refer to [1]_ for the guarded version of the algorithm (when ``safeguard=True``).
+
+    .. [1] Mai, V., and Johansson, M. "Anderson Acceleration of Proximal Gradient
+       Methods", 2020.
+
+    """
+    # check if epsg is a vector
+    if np.asarray(epsg).size == 1.:
+        epsg = epsg * np.ones(niter)
+        epsg_print = str(epsg[0])
+    else:
+        epsg_print = 'Multi'
+
+    if show:
+        tstart = time.time()
+        print('Proximal Gradient with Anderson Acceleration \n'
+              '---------------------------------------------------------\n'
+              'Proximal operator (f): %s\n'
+              'Proximal operator (g): %s\n'
+              'tau = %s\t\tepsg = %s\tniter = %d\n'
+              'nhist = %d\tepsr = %s\n'
+              'guard = %s\ttol = %s\n' % (type(proxf), type(proxg),
+                                          str(tau), epsg_print, niter,
+                                          nhistory, str(epsr),
+                                          str(safeguard), str(tol)))
+        head = '   Itn       x[0]          f           g      J=f+eps*g       tau'
+        print(head)
+
+    # initialize model
+    y = x0 - tau * proxf.grad(x0)
+    x = proxg.prox(y, epsg[0] * tau)
+    g = y.copy()
+    r = g - x0
+    R, G = [g, ], [r, ]
+    pf = proxf(x)
+    pfg = np.inf
+    tolbreak = False
+
+    # iterate
+    for iiter in range(niter):
+
+        # update fix point
+        g = x - tau * proxf.grad(x)
+        r = g - y
+
+        # update history vectors
+        R.insert(0, r)
+        G.insert(0, g)
+        if iiter >= nhistory - 1:
+            R.pop(-1)
+            G.pop(-1)
+
+        # solve for alpha coefficients
+        Rstack = np.vstack(R)
+        Rinv = np.linalg.pinv(Rstack @ Rstack.T + epsr * np.linalg.norm(Rstack) ** 2)
+        ones = np.ones(min(nhistory, iiter + 2))
+        Rinvones = Rinv @ ones
+        alpha = Rinvones / (ones[None] @ Rinvones)
+
+        if not safeguard:
+            # update auxiliary variable
+            y = np.vstack(G).T @ alpha
+
+            # update main variable
+            x = proxg.prox(y, epsg[iiter] * tau)
+
+        else:
+            # update auxiliary variable
+            ytest = np.vstack(G).T @ alpha
+
+            # update main variable
+            xtest = proxg.prox(ytest, epsg[iiter] * tau)
+
+            # check if function is decreased, otherwise do basic PG step
+            pfold, pf = pf, proxf(xtest)
+            if pf <= pfold - tau * np.linalg.norm(proxf.grad(x)) ** 2 / 2:
+                y = ytest
+                x = xtest
+            else:
+                x = proxg.prox(g, epsg[iiter] * tau)
+                y = g
+
+        # run callback
+        if callback is not None:
+            callback(x)
+
+        # tolerance check: break iterations if overall
+        # objective does not decrease below tolerance
+        if tol is not None:
+            pfgold = pfg
+            pf, pg = proxf(x), proxg(x)
+            pfg = pf + np.sum(epsg[iiter] * pg)
+            if np.abs(1.0 - pfg / pfgold) < tol:
+                tolbreak = True
+
+        # show iteration logger
+        if show:
+            if iiter < 10 or niter - iiter < 10 or iiter % (niter // 10) == 0:
+                if tol is None:
+                    pf, pg = proxf(x), proxg(x)
+                    pfg = pf + np.sum(epsg[iiter] * pg)
+                msg = '%6g %12.5e %10.3e %10.3e %10.3e %10.3e' % \
+                      (iiter + 1, np.real(to_numpy(x[0])) if x.ndim == 1 else np.real(to_numpy(x[0, 0])),
+                       pf, pg,
+                       pfg,
+                       tau)
+                print(msg)
+
+        # break if tolerance condition is met
+        if tolbreak:
+            break
+
+    if show:
+        print('\nTotal time (s) = %.2f' % (time.time() - tstart))
+        print('---------------------------------------------------------\n')
+    return x
+
+
 def GeneralizedProximalGradient(proxfs, proxgs, x0, tau,
                                 epsg=1., weights=None,
                                 eta=1., niter=10,
@@ -390,7 +586,7 @@ def GeneralizedProximalGradient(proxfs, proxgs, x0, tau,
         Positive scalar weight, which should satisfy the following condition
         to guarantees convergence: :math:`\tau \in (0, 1/L]` where ``L`` is
         the Lipschitz constant of :math:`\sum_{i=1}^n \nabla f_i`.
-    epsg : :obj:`float` or :obj:`np.ndarray`, optional
+    epsg : :obj:`float` or :obj:`numpy.ndarray`, optional
         Scaling factor(s) of ``g`` function(s)
     weights : :obj:`float`, optional
         Weighting factors of ``g`` functions. Must sum to 1.
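To make the recursion in the new docstring's Notes concrete, here is a minimal NumPy sketch of a single Anderson-accelerated step, mirroring the update used in the committed function (history of fixed-point values G and residuals R, regularized solve for the mixing weights alpha, extrapolation, then the proximal step). The names anderson_step, grad_f and prox_g are illustrative placeholders, not part of the commit; in the committed solver these roles are played by ``proxf.grad`` and ``proxg.prox``.

import numpy as np

def anderson_step(x, y, G, R, grad_f, prox_g, tau, eps, epsr=1e-10, nhistory=10):
    """One Anderson-accelerated proximal-gradient step (illustrative sketch)."""
    # fixed-point map and residual at the current iterate
    g = x - tau * grad_f(x)
    r = g - y
    # update the bounded history of fixed-point values and residuals
    R.insert(0, r)
    G.insert(0, g)
    if len(R) > nhistory:
        R.pop()
        G.pop()
    # solve the regularized normal equations for the mixing weights alpha
    Rstack = np.vstack(R)
    Rinv = np.linalg.pinv(Rstack @ Rstack.T + epsr * np.linalg.norm(Rstack) ** 2)
    ones = np.ones(len(R))
    Rinvones = Rinv @ ones
    alpha = Rinvones / (ones @ Rinvones)
    # extrapolate and apply the proximal operator of g
    y_new = np.vstack(G).T @ alpha
    x_new = prox_g(y_new, eps * tau)
    return x_new, y_new

# hypothetical smooth term f(x) = 0.5 * ||x - b||^2 and L1 prox (soft-thresholding)
b = np.arange(4.)
grad_f = lambda x: x - b
prox_g = lambda x, t: np.sign(x) * np.maximum(np.abs(x) - t, 0.)

x = y = np.zeros(4)
G, R = [], []
for _ in range(20):
    x, y = anderson_step(x, y, G, R, grad_f, prox_g, tau=1., eps=0.1, nhistory=5)

Note that on the first call the history holds a single entry, so the step reduces to a plain proximal-gradient update, just as in the committed initialization. The safeguarded variant in the commit additionally falls back to a plain proximal-gradient step whenever the extrapolated point does not decrease ``f`` sufficiently.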

pyproximal/proximal/Simplex.py
Lines changed: 1 addition & 1 deletion

@@ -204,7 +204,7 @@ def Simplex(n, radius, dims=None, axis=-1, maxiter=100,
     Notes
     -----
     As the Simplex is an indicator function, the proximal operator corresponds
-    to its orthogonal projection (see :class:`pyprox.projection.SimplexProj`
+    to its orthogonal projection (see :class:`pyproximal.projection.SimplexProj`
     for details.

     Note that ``tau`` does not have effect for this proximal operator, any
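As a quick illustration of the note above (the proximal operator of an indicator function is a projection, so ``tau`` has no influence on the result), a minimal sketch along the following lines could be used; the sizes and values are made up for illustration and the keyword call to ``Simplex`` simply mirrors the signature shown in the hunk header.

import numpy as np
import pyproximal

# simplex over 5 elements with unit radius (non-negative entries summing to the radius)
sim = pyproximal.proximal.Simplex(n=5, radius=1.)

x = np.array([0.3, 1.2, -0.4, 0.8, 0.1])
# the projection does not depend on tau: both calls should return the same vector
print(np.allclose(sim.prox(x, 1.), sim.prox(x, 10.)))  # expected: True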

tutorials/twist.py
Lines changed: 26 additions & 8 deletions

@@ -1,12 +1,13 @@
 r"""
-IHT, ISTA, FISTA, and TWIST for Compressive sensing
-===================================================
+IHT, ISTA, FISTA, AA-ISTA, and TWIST for Compressive sensing
+============================================================

-In this example we want to compare four popular solvers in compressive
-sensing problem, namely IHT, ISTA, FISTA, and TwIST. The first three can
+In this example we want to compare five popular solvers for a compressive
+sensing problem, namely IHT, ISTA, FISTA, AA-ISTA, and TwIST. The first three can
 be implemented using the same solver, namely the
 :py:class:`pyproximal.optimization.primal.ProximalGradient`, whilst the latter
-is implemented in :py:class:`pyproximal.optimization.primal.TwIST`.
+two are implemented using the :py:class:`pyproximal.optimization.primal.AndersonProximalGradient` and
+:py:class:`pyproximal.optimization.primal.TwIST` solvers, respectively.

 The IHT solver tries to solve the following unconstrained problem with a L0Ball
 regularization term:
@@ -48,7 +49,6 @@ def callback(x, pf, pg, eps, cost):
 A = np.random.randn(N, M)
 A = A / np.linalg.norm(A, axis=0)
 Aop = pylops.MatrixMult(A)
-Aop.explicit = False  # temporary solution whilst PyLops gets updated

 x = np.random.rand(M)
 x[x < 0.9] = 0
@@ -88,6 +88,17 @@ def callback(x, pf, pg, eps, cost):
                                              callback=lambda x: callback(x, l2, l1, eps, costf))
 niterf = len(costf)

+# Anderson accelerated ISTA
+l1 = pyproximal.proximal.L1()
+l2 = pyproximal.proximal.L2(Op=Aop, b=y)
+costa = []
+x_ander = \
+    pyproximal.optimization.primal.AndersonProximalGradient(l2, l1, tau=tau, x0=np.zeros(M),
+                                                            epsg=eps, niter=maxit,
+                                                            nhistory=5, show=False,
+                                                            callback=lambda x: callback(x, l2, l1, eps, costa))
+nitera = len(costa)
+
 # TWIST (Note that since the smallest eigenvalue is zero, we arbitrarily
 # choose a small value for the solver to converge stably)
 l1 = pyproximal.proximal.L1(sigma=eps)
@@ -111,16 +122,23 @@ def callback(x, pf, pg, eps, cost):
 m, s, b = ax.stem(x_fista, linefmt='--g', basefmt='--g',
                   markerfmt='go', label='FISTA')
 plt.setp(m, markersize=7)
+m, s, b = ax.stem(x_ander, linefmt='--m', basefmt='--m',
+                  markerfmt='mo', label='AA-ISTA')
+plt.setp(m, markersize=7)
 m, s, b = ax.stem(x_twist, linefmt='--b', basefmt='--b',
                   markerfmt='bo', label='TWIST')
 plt.setp(m, markersize=7)
 ax.set_title('Model', size=15, fontweight='bold')
 ax.legend()
 plt.tight_layout()

+###############################################################################
+# Finally, let's compare the convergence behaviour of the different algorithms
+
 fig, ax = plt.subplots(1, 1, figsize=(8, 3))
 ax.semilogy(costi, 'r', lw=2, label=r'$x_{ISTA} (niter=%d)$' % niteri)
 ax.semilogy(costf, 'g', lw=2, label=r'$x_{FISTA} (niter=%d)$' % niterf)
+ax.semilogy(costa, 'm', lw=2, label=r'$x_{AA-ISTA} (niter=%d)$' % nitera)
 ax.semilogy(costt, 'b', lw=2, label=r'$x_{TWIST} (niter=%d)$' % maxit)
 ax.set_title('Cost function', size=15, fontweight='bold')
 ax.set_xlabel('Iteration')
134152
# cost function since this is a constrained problem. This is however greatly influenced
135153
# by the fact that we assume exact knowledge of the number of non-zero coefficients.
136154
# When this information is not available, IHT may become suboptimal. In this case the
137-
# FISTA solver should always be preferred (over ISTA) and TwIST represents an
138-
# alternative worth checking.
155+
# FISTA or AA-ISTA solvers should always be preferred (over ISTA) and TwIST represents
156+
# an alternative worth checking.
139157
