Skip to content

Commit 59307bc

Browse files
authored
More cleaning (#34)
Remove pandas, switch over to SVD impute, add progress bars.
1 parent 830fa39 commit 59307bc

File tree

12 files changed

+107
-550
lines changed

poetry.lock

Lines changed: 22 additions & 62 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ python = ">=3.9,<3.11"
1010
numpy = "^1.21"
1111
tensorly = "^0.7"
1212
matplotlib = "^3.5.0"
13-
pandas = "^1.3.5"
13+
tqdm = "^4.62.3"
1414

1515
[tool.poetry.dev-dependencies]
1616
pytest = "^6.2"

tensorpack/SVD_impute.py

Lines changed: 55 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -13,38 +13,70 @@
1313
from tensorly import partial_svd
1414
import numpy as np
1515

16-
from .soft_impute import Solver
1716

18-
F32PREC = np.finfo(np.float32).eps
19-
20-
def masked_mae(X_true, X_pred, mask):
21-
masked_diff = X_true[mask] - X_pred[mask]
22-
return np.mean(np.abs(masked_diff))
23-
24-
25-
class IterativeSVD(Solver):
17+
class IterativeSVD(object):
2618
def __init__(
2719
self,
2820
rank,
29-
convergence_threshold=0.00001,
30-
max_iters=200,
31-
init_fill_method="zero",
21+
convergence_threshold=1e-7,
22+
max_iters=500,
3223
random_state=None,
3324
min_value=None,
3425
max_value=None,
3526
verbose=False):
36-
Solver.__init__(
37-
self,
38-
fill_method=init_fill_method,
39-
min_value=min_value,
40-
max_value=max_value)
27+
self.min_value=min_value
28+
self.max_value=max_value
4129
self.rank = rank
4230
self.max_iters = max_iters
4331
self.convergence_threshold = convergence_threshold
4432
self.verbose = verbose
4533
self.random_state = random_state
4634

35+
def clip(self, X):
36+
"""
37+
Clip values to fall within any global or column-wise min/max constraints
38+
"""
39+
X = np.asarray(X)
40+
if self.min_value is not None:
41+
X[X < self.min_value] = self.min_value
42+
if self.max_value is not None:
43+
X[X > self.max_value] = self.max_value
44+
return X
45+
46+
def prepare_input_data(self, X):
47+
"""
48+
Check to make sure that the input matrix and its mask of missing
49+
values are valid. Returns X and missing mask.
50+
"""
51+
if X.dtype != "f" and X.dtype != "d":
52+
X = X.astype(float)
53+
54+
assert X.ndim == 2
55+
missing_mask = np.isnan(X)
56+
assert not missing_mask.all()
57+
return X, missing_mask
58+
59+
def fit_transform(self, X, y=None):
60+
"""
61+
Fit the imputer and then transform input `X`
62+
Note: all imputations should have a `fit_transform` method,
63+
but only some (like IterativeImputer in sklearn) also support inductive
64+
mode using `fit` or `fit_transform` on `X_train` and then `transform`
65+
on new `X_test`.
66+
"""
67+
X_original, missing_mask = self.prepare_input_data(X)
68+
observed_mask = ~missing_mask
69+
X_filled = X_original.copy()
70+
X_filled[missing_mask] = 0.0
71+
assert isinstance(X_filled, np.ndarray)
72+
X_result = self.solve(X_filled, missing_mask)
73+
assert isinstance(X_result, np.ndarray)
74+
X_result = self.clip(np.asarray(X_result))
75+
X_result[observed_mask] = X_original[observed_mask]
76+
return X_result
77+
4778
def _converged(self, X_old, X_new, missing_mask):
79+
F32PREC = np.finfo(np.float32).eps
4880
# check for convergence
4981
old_missing_values = X_old[missing_mask]
5082
new_missing_values = X_new[missing_mask]
@@ -59,19 +91,17 @@ def _converged(self, X_old, X_new, missing_mask):
5991
return (ssd / old_norm_squared) < self.convergence_threshold
6092

6193
def solve(self, X, missing_mask):
62-
# X = check_array(X, force_all_finite=False)
63-
6494
observed_mask = ~missing_mask
6595
X_filled = X
6696
for i in range(self.max_iters):
6797
curr_rank = self.rank
68-
U, S, V = partial_svd(X_filled, curr_rank, random_state=self.random_state)
69-
X_reconstructed = U @ np.diag(S) @ V
98+
self.U, S, V = partial_svd(X_filled, curr_rank, random_state=self.random_state)
99+
X_reconstructed = self.U @ np.diag(S) @ V
70100
X_reconstructed = self.clip(X_reconstructed)
71-
mae = masked_mae(
72-
X_true=X,
73-
X_pred=X_reconstructed,
74-
mask=observed_mask)
101+
102+
# Masked mae
103+
mae = np.mean(np.abs(X[observed_mask] - X_reconstructed[observed_mask]))
104+
75105
if self.verbose:
76106
print(
77107
"[IterativeSVD] Iter %d: observed MAE=%0.6f" % (

tensorpack/cmtf.py

Lines changed: 11 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,8 @@
88
from tensorly.tenalg import khatri_rao
99
from copy import deepcopy
1010
from tensorly.decomposition._cp import initialize_cp, parafac
11+
from tqdm import tqdm
1112
from .SVD_impute import IterativeSVD
12-
from .soft_impute import SoftImpute
1313

1414

1515
tl.set_backend('numpy')
@@ -176,8 +176,9 @@ def initialize_cmtf(tensor: np.ndarray, matrix: np.ndarray, rank: int):
176176
if np.sum(~np.isfinite(unfold)) > 0:
177177
si = IterativeSVD(rank=rank, random_state=1)
178178
unfold = si.fit_transform(unfold)
179-
180-
factors[0] = np.linalg.svd(unfold)[0][:, :rank]
179+
factors[0] = si.U
180+
else:
181+
factors[0] = np.linalg.svd(unfold)[0][:, :rank]
181182

182183
unfold = tl.unfold(tensor, 1)
183184
unfold = unfold[:, np.all(np.isfinite(unfold), axis=0)]
@@ -205,7 +206,7 @@ def initialize_cp(tensor: np.ndarray, rank: int):
205206
if tensor.shape[mode] >= rank:
206207
unfold = tl.unfold(tensor, mode)
207208
if contain_missing:
208-
si = SoftImpute(max_rank=rank)
209+
si = IterativeSVD(rank)
209210
unfold = si.fit_transform(unfold)
210211

211212
factors[mode] = partial_svd(unfold, rank, flip=True)[0]
@@ -250,7 +251,7 @@ def perform_CP(tOrig, r=6, tol=1e-6):
250251
return tFac
251252

252253

253-
def perform_CMTF(tOrig, mOrig, r=9, tol=1e-6, maxiter=50, qr=False):
254+
def perform_CMTF(tOrig, mOrig, r=9, tol=1e-6, maxiter=50, progress=True):
254255
""" Perform CMTF decomposition. """
255256
assert tOrig.dtype == float
256257
assert mOrig.dtype == float
@@ -265,9 +266,10 @@ def perform_CMTF(tOrig, mOrig, r=9, tol=1e-6, maxiter=50, qr=False):
265266
# Precalculate the missingness patterns
266267
uniqueInfo = np.unique(np.isfinite(unfolded.T), axis=1, return_inverse=True)
267268

268-
for _ in range(maxiter):
269+
tq = tqdm(range(maxiter), disable=(not progress))
270+
for _ in tq:
269271
tensor = np.nan_to_num(tOrig) + tl.cp_to_tensor(tFac) * np.isnan(tOrig)
270-
tFac = parafac(tensor, r, 200, init=tFac, verbose=False, fixed_modes=[0], mask=np.isfinite(tOrig), linesearch=True, tol=1e-9)
272+
tFac = parafac(tensor, r, 2000, init=tFac, verbose=False, fixed_modes=[0], mask=np.isfinite(tOrig), linesearch=True, tol=1e-9)
271273

272274
# Solve for the glycan matrix fit
273275
tFac.mFactor = np.linalg.lstsq(tFac.factors[0][missingM, :], mOrig[missingM, :], rcond=-1)[0].T
@@ -277,11 +279,9 @@ def perform_CMTF(tOrig, mOrig, r=9, tol=1e-6, maxiter=50, qr=False):
277279
kr = np.vstack((kr, tFac.mFactor))
278280
tFac.factors[0] = censored_lstsq(kr, unfolded.T, uniqueInfo)
279281

280-
if qr:
281-
tFac.factors[0] = np.linalg.qr(tFac.factors[0])[0]
282-
283282
R2X_last = R2X
284283
R2X = calcR2X(tFac, tOrig, mOrig)
284+
tq.set_postfix(R2X=R2X, delta=R2X - R2X_last, refresh=False)
285285
assert R2X > 0.0
286286

287287
if R2X - R2X_last < tol:
@@ -293,6 +293,4 @@ def perform_CMTF(tOrig, mOrig, r=9, tol=1e-6, maxiter=50, qr=False):
293293
tFac = sort_factors(tFac)
294294
tFac.R2X = R2X
295295

296-
print("R2X: " + str(tFac.R2X))
297-
298-
return tFac
296+
return tFac

tensorpack/decomposition.py

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import pickle
22
import numpy as np
3-
import pandas as pd
43
from numpy.linalg import norm
54
from tensorly import partial_svd
65
from .cmtf import perform_CP, calcR2X
@@ -19,7 +18,7 @@ def impute_missing_mat(dat):
1918

2019
diff = 1.0
2120
while diff > 1e-3:
22-
U, S, V = partial_svd(imp, min(dat.shape)-1)
21+
U, S, V = partial_svd(imp, min(dat.shape) - 1)
2322
scores = U @ np.diag(S)
2423
loadings = V
2524
recon = scores @ loadings
@@ -52,12 +51,11 @@ def perform_PCA(self, flattenon=0):
5251
scores = U @ np.diag(S)
5352
loadings = V
5453
recon = [scores[:, :rr] @ loadings[:rr, :] for rr in self.rrs]
55-
self.PCAR2X = [calcR2X(c, mIn = flatData) for c in recon]
54+
self.PCAR2X = [calcR2X(c, mIn=flatData) for c in recon]
5655
self.sizePCA = [sum(flatData.shape) * rr for rr in self.rrs]
5756

58-
5957
def Q2X_chord(self, drop=10, repeat=10):
60-
self.chordQ2X = None # df
58+
self.chordQ2X = None # df
6159
pass
6260

6361
def Q2X_entry(self, drop=10, repeat=10):
@@ -73,4 +71,4 @@ def load(self, pfile):
7371
tmp_dict = pickle.load(input_file)
7472
self.__dict__.update(tmp_dict)
7573

76-
pass
74+
pass

tensorpack/figureCommon.py

Lines changed: 0 additions & 18 deletions
This file was deleted.

0 commit comments

Comments
 (0)