Skip to content

Commit ea44694

Browse files
Merge pull request #52 from Quantmetry/test_rpca_analytic
Test rpca analytic
2 parents 907eb6c + 6c4dd50 commit ea44694

File tree

10 files changed

+379
-217
lines changed

10 files changed

+379
-217
lines changed

qolmat/imputations/imputers.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1532,8 +1532,6 @@ def _transform_element(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataF
15321532
X_select = X[is_na & is_valid]
15331533
y_imputed = self.estimators_[col].predict(X_select)
15341534
y_imputed = y_imputed.flatten().astype(float)
1535-
print("y_imputed")
1536-
print(y_imputed)
15371535

15381536
y_imputed = pd.Series(y_imputed, index=X_select.index)
15391537

qolmat/imputations/rpca/rpca.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
from __future__ import annotations
22

3-
from typing import Optional, Tuple, Union
3+
from typing import Tuple, Union
44

55
import numpy as np
66
from numpy.typing import NDArray
77
from sklearn.base import BaseEstimator, TransformerMixin
88

9-
from qolmat.imputations.rpca import rpca_utils
109
from qolmat.utils import utils
1110

1211

qolmat/imputations/rpca/rpca_noisy.py

Lines changed: 78 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44

55
import numpy as np
66
import scipy as scp
7-
from matplotlib import pyplot as plt
87
from numpy.typing import NDArray
8+
from sklearn import utils as sku
99

1010
from qolmat.imputations.rpca import rpca_utils as rpca_utils
1111
from qolmat.imputations.rpca.rpca import RPCA
12-
from sklearn import utils as sku
12+
from qolmat.utils.exceptions import CostFunctionRPCANotMinimized
1313

1414

1515
class RPCANoisy(RPCA):
@@ -45,7 +45,7 @@ class RPCANoisy(RPCA):
4545
tol: Optional[float]
4646
stopping criterion, minimum difference between 2 consecutive iterations. By default,
4747
the value is set to 1e-6
48-
norm: Optional[str]
48+
norm: str
4949
error norm, can be "L1" or "L2". By default, the value is set to "L2"
5050
"""
5151

@@ -54,24 +54,24 @@ def __init__(
5454
random_state: Union[None, int, np.random.RandomState] = None,
5555
period: int = 1,
5656
rank: Optional[int] = None,
57+
mu: Optional[float] = None,
5758
tau: Optional[float] = None,
5859
lam: Optional[float] = None,
5960
list_periods: List[int] = [],
6061
list_etas: List[float] = [],
6162
max_iterations: int = int(1e4),
6263
tol: float = 1e-6,
63-
norm: Optional[str] = "L2",
64-
do_report: bool = False,
64+
norm: str = "L2",
6565
) -> None:
6666
super().__init__(period=period, max_iterations=max_iterations, tol=tol)
6767
self.rng = sku.check_random_state(random_state)
6868
self.rank = rank
69+
self.mu = mu
6970
self.tau = tau
7071
self.lam = lam
7172
self.list_periods = list_periods
7273
self.list_etas = list_etas
7374
self.norm = norm
74-
self.do_report = do_report
7575

7676
def decompose_rpca_L1(
7777
self, D: NDArray, Omega: NDArray, lam: float, tau: float, rank: int
@@ -110,8 +110,8 @@ def decompose_rpca_L1(
110110
"""
111111
m, n = D.shape
112112
rho = 1.1
113-
mu = 1e-2
114-
mu_bar = mu * 1e10
113+
mu = self.mu or 1e-2
114+
mu_bar = mu * 1e3
115115

116116
# init
117117
Y = np.ones((m, n))
@@ -122,20 +122,17 @@ def decompose_rpca_L1(
122122
L = np.ones((m, rank))
123123
Q = np.ones((n, rank))
124124
R = [np.ones((m, n - period)) for period in self.list_periods]
125-
# temporal correlations
126-
H = [rpca_utils.toeplitz_matrix(period, n, model="column") for period in self.list_periods]
127125

128-
##
126+
# matrices for temporal correlation
127+
H = [rpca_utils.toeplitz_matrix(period, n, model="column") for period in self.list_periods]
129128
HHT = np.zeros((n, n))
130129
for index, _ in enumerate(self.list_periods):
131130
HHT += self.list_etas[index] * (H[index] @ H[index].T)
132131

133132
Ir = np.eye(rank)
134133
In = np.eye(n)
135134

136-
increments = np.full((self.max_iterations,), np.nan, dtype=float)
137-
138-
for iteration in range(self.max_iterations):
135+
for _ in range(self.max_iterations):
139136
X_temp = X.copy()
140137
A_temp = A.copy()
141138
L_temp = L.copy()
@@ -189,7 +186,6 @@ def decompose_rpca_L1(
189186
for index, _ in enumerate(self.list_periods):
190187
Rc = np.maximum(Rc, np.linalg.norm(R[index] - R_temp[index], np.inf))
191188
tol = np.amax(np.array([Xc, Ac, Lc, Qc, Rc]))
192-
increments[iteration] = tol
193189

194190
if tol < self.tol:
195191
break
@@ -202,7 +198,7 @@ def decompose_rpca_L2(
202198
self, D: NDArray, Omega: NDArray, lam: float, tau: float, rank: int
203199
) -> Tuple:
204200
"""
205-
Compute the noisy RPCA with a L1 time penalisation
201+
Compute the noisy RPCA with a L2 time penalisation
206202
207203
Parameters
208204
----------
@@ -237,14 +233,18 @@ def decompose_rpca_L2(
237233
m, n = D.shape
238234

239235
# init
240-
Y = np.zeros((m, n))
236+
Y = np.full_like(D, 0)
241237
X = D.copy()
242-
A = np.zeros((m, n))
243-
L = np.ones((m, rank))
244-
Q = np.ones((n, rank))
238+
A = np.full_like(D, 0)
239+
U, S, Vt = np.linalg.svd(X)
240+
U = U[:, :rank]
241+
S = S[:rank]
242+
Vt = Vt[:rank, :]
243+
L = U @ np.diag(np.sqrt(S))
244+
Q = Vt.transpose() @ np.diag(np.sqrt(S))
245245

246-
mu = 1e-2
247-
mu_bar = mu * 1e10
246+
mu = self.mu or 1e-2
247+
mu_bar = mu * 1e3
248248

249249
# matrices for temporal correlation
250250
H = [rpca_utils.toeplitz_matrix(period, n, model="column") for period in self.list_periods]
@@ -255,14 +255,7 @@ def decompose_rpca_L2(
255255
Ir = np.eye(rank)
256256
In = np.eye(n)
257257

258-
increment = np.full((self.max_iterations,), np.nan, dtype=float)
259-
errors_ano = []
260-
errors_nuclear = []
261-
errors_noise = []
262-
errors_lagrange = []
263-
self.list_report = []
264-
265-
for iteration in range(self.max_iterations):
258+
for _ in range(self.max_iterations):
266259
X_temp = X.copy()
267260
A_temp = A.copy()
268261
L_temp = L.copy()
@@ -273,10 +266,10 @@ def decompose_rpca_L2(
273266
b=(D - A + mu * L @ Q.T - Y).T,
274267
).T
275268

276-
if np.any(~Omega):
277-
A_omega = rpca_utils.soft_thresholding(D - X, lam)
278-
A_omega_C = D - X
279-
A = np.where(Omega, A_omega, A_omega_C)
269+
if np.any(np.isnan(D)):
270+
A_Omega = rpca_utils.soft_thresholding(D - X, lam)
271+
A_Omega_C = D - X
272+
A = np.where(Omega, A_Omega, A_Omega_C)
280273
else:
281274
A = rpca_utils.soft_thresholding(D - X, lam)
282275

@@ -300,43 +293,10 @@ def decompose_rpca_L2(
300293
Qc = np.linalg.norm(Q - Q_temp, np.inf)
301294

302295
tol = max([Xc, Ac, Lc, Qc])
303-
increment[iteration] = tol
304-
305-
_, values_singular, _ = np.linalg.svd(X, full_matrices=True)
306-
errors_ano.append(np.sum(np.abs(A)))
307-
errors_nuclear.append(np.sum(values_singular))
308-
errors_noise.append(np.sum((D - X - A) ** 2))
309-
errors_lagrange.append(np.sum((X - L @ Q.T) ** 2))
310-
311-
if self.do_report:
312-
self.list_report.append((D, X, A))
313296

314297
if tol < self.tol:
315298
break
316299

317-
if self.do_report:
318-
errors_ano_np = np.array(errors_ano)
319-
errors_nuclear_np = np.array(errors_nuclear)
320-
errors_noise_np = np.array(errors_noise)
321-
errors_lagrange_np = np.array(errors_lagrange)
322-
323-
plt.plot(lam * errors_ano_np, label="Cost (ano)")
324-
plt.plot(tau * errors_nuclear_np, label="Cost (SV)")
325-
plt.plot(0.5 * errors_noise_np, label="Cost (noise)")
326-
plt.plot(errors_lagrange_np, label="Cost (Lagrange)")
327-
plt.plot(
328-
lam * errors_ano_np + tau * errors_nuclear_np + errors_noise_np,
329-
label="Total",
330-
color="black",
331-
)
332-
plt.yscale("log")
333-
# plt.gca().twinx()
334-
# plt.plot(errors_cv, color="black")
335-
plt.grid()
336-
plt.yscale("log")
337-
plt.legend()
338-
plt.show()
339-
340300
X = L @ Q.T
341301

342302
M = X
@@ -411,7 +371,58 @@ def decompose_rpca(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
411371

412372
if self.norm == "L1":
413373
M, A, U, V = self.decompose_rpca_L1(D, Omega, lam, tau, rank)
374+
414375
elif self.norm == "L2":
415376
M, A, U, V = self.decompose_rpca_L2(D, Omega, lam, tau, rank)
416377

378+
self._check_cost_function_minimized(D, M, A, tau, lam, self.norm)
379+
417380
return M, A
381+
382+
@staticmethod
383+
def _check_cost_function_minimized(
384+
observations: NDArray,
385+
low_rank: NDArray,
386+
anomalies: NDArray,
387+
tau: float,
388+
lam: float,
389+
norm: str,
390+
):
391+
"""Check that the functional minimized by the RPCA
392+
is smaller at the end than at the beginning
393+
394+
Parameters
395+
----------
396+
observations : NDArray
397+
observations matrix with first linear interpolation
398+
low_rank : NDArray
399+
low_rank matrix resulting from RPCA
400+
anomalies : NDArray
401+
sparse matrix resulting from RPCA
402+
tau : float
403+
parameter penalizing the nuclear norm of the low rank part
404+
lam : float
405+
parameter penalizing the L1-norm of the anomaly/sparse part
406+
norm : str
407+
norm of the temporal penalisation. Has to be `L1` or `L2`
408+
409+
Raises
410+
------
411+
CostFunctionRPCANotMinimized
412+
The RPCA did not minimize the cost function:
413+
the final cost is at least equal to the starting one.
414+
"""
415+
value_start = tau * np.linalg.norm(observations, "nuc")
416+
if norm == "L1":
417+
anomalies_norm = np.sum(np.abs(anomalies))
418+
function_str = "||D-M-A||_2 + tau ||D||_* + lam ||A||_1"
419+
elif norm == "L2":
420+
anomalies_norm = np.sum(anomalies**2)
421+
function_str = "||D-M-A||_2 + tau ||D||_* + lam ||A||_2"
422+
value_end = (
423+
np.sum((observations - low_rank - anomalies) ** 2)
424+
+ tau * np.linalg.norm(low_rank, "nuc")
425+
+ lam * anomalies_norm
426+
)
427+
if value_start + 1e-4 <= value_end:
428+
raise CostFunctionRPCANotMinimized(function_str, float(value_start), float(value_end))

qolmat/imputations/rpca/rpca_pcp.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44

55
import numpy as np
66
from numpy.typing import NDArray
7+
from sklearn import utils as sku
78

89
from qolmat.imputations.rpca import rpca_utils
910
from qolmat.imputations.rpca.rpca import RPCA
10-
from sklearn import utils as sku
11+
from qolmat.utils.exceptions import CostFunctionRPCANotMinimized
1112

1213

1314
class RPCAPCP(RPCA):
@@ -69,11 +70,48 @@ def decompose_rpca(self, D: NDArray, Omega: NDArray) -> Tuple[NDArray, NDArray]:
6970
M = rpca_utils.svd_thresholding(D - A + Y / mu, 1 / mu)
7071
A = rpca_utils.soft_thresholding(D - M + Y / mu, lam / mu)
7172
A[~Omega] = (D - M)[~Omega]
73+
7274
Y += mu * (D - M - A)
7375

7476
error = np.linalg.norm(D - M - A, "fro") / D_norm
7577
errors[iteration] = error
7678

7779
if error < self.tol:
7880
break
81+
82+
self._check_cost_function_minimized(D, M, A, lam)
83+
7984
return M, A
85+
86+
@staticmethod
87+
def _check_cost_function_minimized(
88+
observations: NDArray,
89+
low_rank: NDArray,
90+
anomalies: NDArray,
91+
lam: float,
92+
):
93+
"""Check that the functional minimized by the RPCA
94+
is smaller at the end than at the beginning
95+
96+
Parameters
97+
----------
98+
observations : NDArray
99+
observations matrix with first linear interpolation
100+
low_rank : NDArray
101+
low_rank matrix resulting from RPCA
102+
anomalies : NDArray
103+
sparse matrix resulting from RPCA
104+
lam : float
105+
parameter penalizing the L1-norm of the anomaly/sparse part
106+
107+
Raises
108+
------
109+
CostFunctionRPCANotMinimized
110+
The RPCA did not minimize the cost function:
111+
the final cost is at least equal to the starting one.
112+
"""
113+
value_start = np.linalg.norm(observations, "nuc")
114+
value_end = np.linalg.norm(low_rank, "nuc") + lam * np.sum(np.abs(anomalies))
115+
if value_start + 1e-4 <= value_end:
116+
function_str = "||D||_* + lam ||A||_1"
117+
raise CostFunctionRPCANotMinimized(function_str, float(value_start), float(value_end))

qolmat/imputations/rpca/rpca_utils.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,10 @@
22
Modular utility functions for RPCA
33
"""
44

5-
import warnings
6-
from typing import List, Optional, Tuple
75

86
import numpy as np
9-
import pandas as pd
10-
import scipy
117
from numpy.typing import NDArray
128
from scipy.linalg import toeplitz
13-
from sklearn.neighbors import kneighbors_graph
149

1510

1611
def approx_rank(

qolmat/utils/exceptions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,14 @@ def __init__(self, subset: Any):
3131
super().__init__(f"Provided subset `{subset}` should be None or a list!")
3232

3333

34+
class CostFunctionRPCANotMinimized(Exception):
35+
def __init__(self, name_fct: str, value_start: float, value_end: float):
36+
super().__init__(
37+
f"RPCA algorithm may provide bad results. Function {name_fct} increased from"
38+
f" {value_start} to {value_end} instead of decreasing!"
39+
)
40+
41+
3442
class NotDimension2(Exception):
3543
def __init__(self, shape: Tuple[int, ...]):
3644
super().__init__(f"Provided matrix is of shape {shape}, which is not of dimension 2!")

0 commit comments

Comments
 (0)