Skip to content

Commit 8c227dc

Browse files
Merge pull request #76 from Quantmetry/softimpute
INIT: softimpute
2 parents b84cdbf + 7ae0b79 commit 8c227dc

File tree

4 files changed

+445
-4
lines changed

4 files changed

+445
-4
lines changed

qolmat/imputations/imputers.py

Lines changed: 117 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from qolmat.imputations import em_sampler
2020
from qolmat.imputations.rpca import rpca, rpca_noisy, rpca_pcp
21+
from qolmat.imputations import softimpute
2122
from qolmat.utils.exceptions import NotDataFrame
2223
from qolmat.utils.utils import HyperValue
2324

@@ -1772,6 +1773,120 @@ def _transform_element(
17721773
return df_imputed
17731774

17741775

1776+
class ImputerSoftImpute(_Imputer):
    """
    Imputer relying on the SoftImpute model of `qolmat.imputations.softimpute`.

    A `softimpute.SoftImpute` model is fitted on the values of the dataframe and is then
    used to fill its missing entries. Both `_fit_element` and `_transform_element` assert
    that `col == "__all__"`, i.e. only the non column-wise mode is handled.

    Parameters
    ----------
    groups : Tuple[str, ...], optional
        Forwarded to the `_Imputer` base class, by default ()
    columnwise : bool, optional
        Forwarded to the `_Imputer` base class, by default False
    random_state : Union[None, int, np.random.RandomState], optional
        Forwarded to the `_Imputer` base class, by default None
    period : int, optional
        Hyperparameter `period` of `softimpute.SoftImpute`, by default 1
    rank : int, optional
        Hyperparameter `rank` of `softimpute.SoftImpute`, by default 2
    tolerance : float, optional
        Hyperparameter `tolerance` of `softimpute.SoftImpute`, by default 1e-05
    tau : float, optional
        Hyperparameter `tau` of `softimpute.SoftImpute`, by default 0
    max_iterations : int, optional
        Hyperparameter `max_iterations` of `softimpute.SoftImpute`, by default 100
    verbose : bool, optional
        Hyperparameter `verbose` of `softimpute.SoftImpute`, by default False
    projected : bool, optional
        Hyperparameter `projected` of `softimpute.SoftImpute`, by default True
    """

    def __init__(
        self,
        groups: Tuple[str, ...] = (),
        columnwise: bool = False,
        random_state: Union[None, int, np.random.RandomState] = None,
        period: int = 1,
        rank: int = 2,
        tolerance: float = 1e-05,
        tau: float = 0,
        max_iterations: int = 100,
        verbose: bool = False,
        projected: bool = True,
    ):
        # Names of the attributes declared to the base class as imputer hyperparameters.
        hyperparameter_names = (
            "period",
            "rank",
            "tolerance",
            "tau",
            "max_iterations",
            "verbose",
            "projected",
        )
        super().__init__(
            imputer_params=hyperparameter_names,
            groups=groups,
            columnwise=columnwise,
            random_state=random_state,
        )
        self.period = period
        self.rank = rank
        self.tolerance = tolerance
        self.tau = tau
        self.max_iterations = max_iterations
        self.verbose = verbose
        self.projected = projected

    def _fit_element(
        self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0
    ) -> softimpute.SoftImpute:
        """
        Fit a SoftImpute model on the values of `df`.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe on which the model is fitted
        col : str, optional
            Column on which the imputer is fitted; must be "__all__", by default "__all__"
        ngroup : int, optional
            Id of the group on which the method is applied (not used in this method)

        Returns
        -------
        softimpute.SoftImpute
            The fitted SoftImpute model

        Raises
        ------
        NotDataFrame
            Input has to be a pandas.DataFrame.
        """
        self._check_dataframe(df)
        assert col == "__all__"
        params = self.get_hyperparams()
        estimator = softimpute.SoftImpute(random_state=self._rng, **params)
        return estimator.fit(df.values)

    def _transform_element(
        self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0
    ) -> pd.DataFrame:
        """
        Impute the dataframe `df` with the model previously fitted for group `ngroup`.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe to impute
        col : str, optional
            Column transformed by the imputer; must be "__all__", by default "__all__"
        ngroup : int, optional
            Id of the group, used to look up the fitted model in `self._dict_fitting`

        Returns
        -------
        pd.DataFrame
            Imputed dataframe, with the same index and columns as `df`

        Raises
        ------
        NotDataFrame
            Input has to be a pandas.DataFrame.
        """
        self._check_dataframe(df)
        assert col == "__all__"
        fitted = self._dict_fitting["__all__"][ngroup]
        values = fitted.transform(df.values)
        return pd.DataFrame(values, index=df.index, columns=df.columns)

    def _more_tags(self):
        """Return estimator tags marking two scikit-learn checks as expected failures."""
        reason = "This test shouldn't be running at all!"
        return {
            "_xfail_checks": {
                "check_fit2d_1sample": reason,
                "check_fit2d_1feature": reason,
            },
        }
1888+
1889+
17751890
class ImputerEM(_Imputer):
17761891
"""
17771892
This class implements an imputation method based on joint modelling and an inference using a
@@ -1873,7 +1988,7 @@ def get_model(self, **hyperparams) -> em_sampler.EM:
18731988

18741989
def _fit_element(
18751990
self, df: pd.DataFrame, col: str = "__all__", ngroup: int = 0
1876-
) -> IterativeImputer:
1991+
) -> em_sampler.EM:
18771992
"""
18781993
Fits the imputer on `df`, at the group and/or column level depending onself.groups and
18791994
self.columnwise.
@@ -1890,7 +2005,7 @@ def _fit_element(
18902005
Returns
18912006
-------
18922007
Any
1893-
Return fitted KNN model
2008+
Return fitted EM model
18942009
18952010
Raises
18962011
------

qolmat/imputations/softimpute.py

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
from __future__ import annotations
2+
3+
from typing import Optional, Union
4+
5+
import numpy as np
6+
from numpy.typing import NDArray
7+
from sklearn import utils as sku
8+
from sklearn.base import BaseEstimator, TransformerMixin
9+
10+
from qolmat.utils import utils
11+
from qolmat.imputations.rpca import rpca_utils
12+
13+
14+
class SoftImpute(BaseEstimator, TransformerMixin):
    """
    This class implements the SoftImpute ALS algorithm presented in
    Hastie, Trevor, et al. "Matrix completion and low-rank SVD
    via fast alternating least squares." The Journal of Machine Learning
    Research 16.1 (2015): 3367-3402.
    min_A,B || Proj(X - AB')||_F^2 + tau * (|| A ||_F^2 + || B ||_F^2)

    After `fit`, the attributes `u`, `d` and `v` hold the factors of the low-rank
    approximation, which `transform` rebuilds as `u @ diag(d) @ v.T`.

    Parameters
    ----------
    period : int
        Number of rows of the array if the array is 1D and
        reshaped into a 2D array. Corresponds to the period of the time series,
        if 1D time series is passed.
    rank : int, optional
        Estimated rank of the matrix. If None, it is estimated during `fit` with
        `rpca_utils.approx_rank` on the mean-filled data.
    tolerance : float
        Tolerance for the convergence criterion
    tau : float
        regularisation parameter
    max_iterations : int
        Maximum number of iterations
    random_state : int, optional
        The seed of the pseudo random number generator to use, for reproducibility
    verbose : bool
        flag for verbosity
    projected : bool
        If True, only imputed values are changed.
        If False, the matrix obtained via the algorithm is returned, by default True

    Examples
    --------
    >>> import numpy as np
    >>> from qolmat.imputations.softimpute import SoftImpute
    >>> X = np.array([[1, 2, np.nan, 4], [1, 5, 3, np.nan], [4, 2, 3, 2], [1, 1, 5, 4]])
    >>> X_imputed = SoftImpute().fit_transform(X)
    >>> print(X_imputed)  # doctest: +SKIP
    """

    def __init__(
        self,
        period: int = 1,
        rank: Optional[int] = 2,
        tolerance: float = 1e-05,
        tau: float = 0,
        max_iterations: int = 100,
        random_state: Union[None, int, np.random.RandomState] = None,
        verbose: bool = False,
        projected: bool = True,
    ):
        self.period = period
        self.rank = rank
        self.tolerance = tolerance
        self.tau = tau
        self.max_iterations = max_iterations
        self.random_state = sku.check_random_state(random_state)
        self.verbose = verbose
        self.projected = projected
        # Factors of the fitted low-rank approximation; empty until `fit` is called.
        self.u: NDArray = np.empty(0)
        self.d: NDArray = np.empty(0)
        self.v: NDArray = np.empty(0)

    def fit(self, X: NDArray, y=None) -> SoftImpute:
        """Fit the imputer on X.

        Parameters
        ----------
        X : NDArray
            Input data. NaN entries are treated as missing values.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            The fitted `SoftImpute` class instance.

        Raises
        ------
        AssertionError
            If the data returned by `utils.prepare_data` is not a numpy array.
        """
        X_imputed = X.copy()
        X_imputed = utils.prepare_data(X_imputed, self.period)

        if not isinstance(X_imputed, np.ndarray):
            raise AssertionError("Invalid type. X must be a NDArray.")

        n, m = X_imputed.shape
        mask = np.isnan(X_imputed)

        # Missing entries are initialised with the column means.
        col_means = np.nanmean(X_imputed, axis=0)
        np.copyto(X_imputed, col_means, where=mask)

        # The rank must be known before the factors are allocated. It is kept in a local
        # variable so that the hyperparameter `self.rank` is not overwritten by `fit`.
        rank = self.rank
        if rank is None:
            rank = rpca_utils.approx_rank(X_imputed)

        V = np.zeros((m, rank))
        # Random starting point for U, orthonormalised through an SVD.
        U = self.random_state.normal(0.0, 1.0, (n, rank))
        U, _, _ = np.linalg.svd(U, full_matrices=False)
        Dsq = np.ones((rank, 1))

        for iter_ in range(self.max_iterations):
            U_old = U
            V_old = V
            Dsq_old = Dsq

            # Step 1: U is fixed, V and Dsq are updated.
            B = U.T @ X_imputed
            if self.tau > 0:
                B = B * (Dsq / (Dsq + self.tau))
            V, singular_values, rotation_h = np.linalg.svd(B.T, full_matrices=False)
            Dsq = singular_values[:, np.newaxis]
            # np.linalg.svd returns the right factor already transposed (vh), hence the
            # transposition so that U @ (Dsq * V.T) equals U_previous @ B.
            U = U @ rotation_h.T
            X_hat = U @ (Dsq * V.T)
            X_imputed[mask] = X_hat[mask]

            # Step 2: V is fixed, U and Dsq are updated.
            A = (X_imputed @ V).T
            if self.tau > 0:
                A = A * (Dsq / (Dsq + self.tau))
            U, singular_values, rotation_h = np.linalg.svd(A.T, full_matrices=False)
            Dsq = singular_values[:, np.newaxis]
            # Same remark as above about the transposed right factor.
            V = V @ rotation_h.T
            X_hat = U @ (Dsq * V.T)
            X_imputed[mask] = X_hat[mask]

            ratio = self._check_convergence(U_old, Dsq_old, V_old, U, Dsq, V)
            if self.verbose:
                print(f"iter {iter_}: ratio = {round(ratio, 4)}")
            if ratio < self.tolerance:
                break

        self.u = U[:, :rank]
        self.d = Dsq[:rank]
        self.v = V[:, :rank]

        return self

    def _check_convergence(
        self,
        U_old: NDArray,
        Ds_qold: NDArray,
        V_old: NDArray,
        U: NDArray,
        Dsq: NDArray,
        V: NDArray,
    ) -> float:
        """Given a pair of iterates (U_old, Ds_qold, V_old) and (U, Dsq, V),
        it computes the relative change in Frobenius norm given by
        || U_old @ Ds_qold @ V_old.T - U @ Dsq @ V.T ||_F^2
        / || U_old @ Ds_qold @ V_old.T ||_F^2

        The numerator is expanded as
        sum(Ds_qold^2) + sum(Dsq^2) - 2 * trace(cross term),
        so that the full matrices never have to be formed.

        Parameters
        ----------
        U_old : NDArray
            previous matrix U
        Ds_qold : NDArray
            previous matrix Dsq, as a column vector
        V_old : NDArray
            previous matrix V
        U : NDArray
            current matrix U
        Dsq : NDArray
            current matrix Dsq, as a column vector
        V : NDArray
            current matrix V

        Returns
        -------
        float
            relative change

        Raises
        ------
        ValueError
            If one of the arguments is None.
        """
        if any(arg is None for arg in (U_old, Ds_qold, V_old, U, Dsq, V)):
            raise ValueError("One or more arguments are None.")

        denom = (Ds_qold**2).sum()
        utu = Dsq * (U.T @ U_old)
        vtv = Ds_qold * (V_old.T @ V)
        uvprod = (utu @ vtv).diagonal().sum()
        # Squared norm of the previous iterate + squared norm of the current iterate
        # - twice their inner product.
        num = denom + (Dsq**2).sum() - 2 * uvprod
        # The denominator is floored to avoid a division by zero.
        return num / max(denom, 1e-9)

    def transform(self, X: NDArray) -> NDArray:
        """Impute all missing values in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        X : NDArray
            The imputed dataset.

        Raises
        ------
        AssertionError
            If the result contains NaN values.
        """
        X_transformed = self.u @ np.diag(self.d.T[0]) @ (self.v).T
        if self.projected:
            # Observed values are kept as they are, only missing entries are replaced.
            X_ = utils.prepare_data(X, self.period)
            mask = np.isnan(X_)
            X_transformed[~mask] = X_[~mask]

        X_transformed = utils.get_shape_original(X_transformed, X.shape)

        # A single remaining NaN is already a failure of the imputation.
        if np.any(np.isnan(X_transformed)):
            raise AssertionError("Result contains NaN. This is a bug.")

        return X_transformed

tests/imputations/test_imputers.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -275,18 +275,31 @@ def test_ImputerRPCA_fit_transform(df: pd.DataFrame) -> None:
275275
np.testing.assert_allclose(result, expected, atol=1e-2)
276276

277277

278+
@pytest.mark.parametrize("df", [df_incomplete])
def test_ImputerSoftImpute_fit_transform(df: pd.DataFrame) -> None:
    """Compare `ImputerSoftImpute.fit_transform` with reference values (atol=1e-2)."""
    expected = pd.DataFrame(
        {
            "col1": [0, 1.327, 2, 3, 0.137],
            "col2": [-1, 0.099, 0.5, 0.122, 1.5],
        }
    )
    imputer = imputers.ImputerSoftImpute(
        columnwise=False, max_iterations=100, tau=0.3, random_state=4
    )
    result = imputer.fit_transform(df)
    np.testing.assert_allclose(result, expected, atol=1e-2)
291+
292+
278293
@pytest.mark.parametrize("df", [df_timeseries])
def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:
    """Compare `ImputerEM.fit_transform` (method="sample") with reference values (atol=1e-2)."""
    imputer = imputers.ImputerEM(method="sample", dt=1e-3, random_state=42)
    result = imputer.fit_transform(df)
    expected = pd.DataFrame(
        {
            "col1": list(range(20)),
            "col2": [0, 0.773, 2, 2.621, 2] + list(range(5, 20)),
        }
    )
    np.testing.assert_allclose(result, expected, atol=1e-2)
291304

292305

0 commit comments

Comments
 (0)