Skip to content

Commit 97490a7

Browse files
authored
Merge pull request #6815 from pavlin-policar/remove-randomized-pca
[FIX] Remove Orange implementation of randomized PCA for sparse data
2 parents ce27d18 + 971cd90 commit 97490a7

File tree

6 files changed

+20
-327
lines changed

6 files changed

+20
-327
lines changed

Orange/projection/pca.py

Lines changed: 11 additions & 227 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,8 @@
1-
import numbers
2-
import six
31
import numpy as np
42
import scipy.sparse as sp
5-
from scipy.linalg import lu, qr, svd
6-
73
from sklearn import decomposition as skl_decomposition
8-
from sklearn.utils import check_array, check_random_state
9-
from sklearn.utils.extmath import svd_flip, safe_sparse_dot
10-
from sklearn.utils.validation import check_is_fitted
114

125
import Orange.data
13-
from Orange.statistics import util as ut
146
from Orange.data import Variable
157
from Orange.data.util import get_unique_names
168
from Orange.misc.wrapper_meta import WrapperMeta
@@ -20,223 +12,6 @@
2012
__all__ = ["PCA", "SparsePCA", "IncrementalPCA", "TruncatedSVD"]
2113

2214

23-
def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
24-
flip_sign=True, random_state=0):
25-
"""Compute the randomized PCA decomposition of a given matrix.
26-
27-
This method differs from the scikit-learn implementation in that it supports
28-
and handles sparse matrices well.
29-
30-
"""
31-
if n_iter == "auto":
32-
# Checks if the number of iterations is explicitly specified
33-
# Adjust n_iter. 7 was found a good compromise for PCA. See sklearn #5299
34-
n_iter = 7 if n_components < .1 * min(A.shape) else 4
35-
36-
n_samples, n_features = A.shape
37-
38-
c = np.atleast_2d(ut.nanmean(A, axis=0))
39-
40-
if n_samples >= n_features:
41-
Q = random_state.normal(size=(n_features, n_components + n_oversamples))
42-
if A.dtype.kind == "f":
43-
Q = Q.astype(A.dtype, copy=False)
44-
45-
Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
46-
47-
# Normalized power iterations
48-
for _ in range(n_iter):
49-
Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
50-
Q, _ = lu(Q, permute_l=True)
51-
Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
52-
Q, _ = lu(Q, permute_l=True)
53-
54-
Q, _ = qr(Q, mode="economic")
55-
56-
QA = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
57-
R, s, V = svd(QA.T, full_matrices=False)
58-
U = Q.dot(R)
59-
60-
else: # n_features > n_samples
61-
Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
62-
if A.dtype.kind == "f":
63-
Q = Q.astype(A.dtype, copy=False)
64-
65-
Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
66-
67-
# Normalized power iterations
68-
for _ in range(n_iter):
69-
Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
70-
Q, _ = lu(Q, permute_l=True)
71-
Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
72-
Q, _ = lu(Q, permute_l=True)
73-
74-
Q, _ = qr(Q, mode="economic")
75-
76-
QA = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
77-
U, s, R = svd(QA, full_matrices=False)
78-
V = R.dot(Q.T)
79-
80-
if flip_sign:
81-
U, V = svd_flip(U, V)
82-
83-
return U[:, :n_components], s[:n_components], V[:n_components, :]
84-
85-
86-
class ImprovedPCA(skl_decomposition.PCA):
87-
"""Patch sklearn PCA learner to include randomized PCA for sparse matrices.
88-
89-
Scikit-learn does not currently support sparse matrices at all, even though
90-
efficient methods exist for PCA. This class patches the default scikit-learn
91-
implementation to properly handle sparse matrices.
92-
93-
Notes
94-
-----
95-
- This should be removed once scikit-learn releases a version which
96-
implements this functionality.
97-
98-
"""
99-
# pylint: disable=too-many-branches
100-
def _fit(self, X):
101-
"""Dispatch to the right submethod depending on the chosen solver."""
102-
X = self._validate_data(
103-
X,
104-
dtype=[np.float64, np.float32],
105-
reset=False,
106-
accept_sparse=["csr", "csc"],
107-
copy=self.copy
108-
)
109-
110-
# Handle n_components==None
111-
if self.n_components is None:
112-
if self.svd_solver != "arpack":
113-
n_components = min(X.shape)
114-
else:
115-
n_components = min(X.shape) - 1
116-
else:
117-
n_components = self.n_components
118-
119-
# Handle svd_solver
120-
self._fit_svd_solver = self.svd_solver
121-
if self._fit_svd_solver == "auto":
122-
# Sparse data can only be handled with the randomized solver
123-
if sp.issparse(X):
124-
self._fit_svd_solver = "randomized"
125-
# Small problem or n_components == 'mle', just call full PCA
126-
elif max(X.shape) <= 500 or n_components == "mle":
127-
self._fit_svd_solver = "full"
128-
elif 1 <= n_components < .8 * min(X.shape):
129-
self._fit_svd_solver = "randomized"
130-
# This is also the case of n_components in (0,1)
131-
else:
132-
self._fit_svd_solver = "full"
133-
134-
# Ensure we don't try call arpack or full on a sparse matrix
135-
if sp.issparse(X) and self._fit_svd_solver != "randomized":
136-
raise ValueError("only the randomized solver supports sparse matrices")
137-
138-
# Call different fits for either full or truncated SVD
139-
if self._fit_svd_solver == "full":
140-
return self._fit_full(X, n_components)
141-
elif self._fit_svd_solver in ["arpack", "randomized"]:
142-
return self._fit_truncated(X, n_components, self._fit_svd_solver)
143-
else:
144-
raise ValueError(
145-
"Unrecognized svd_solver='{0}'".format(self._fit_svd_solver)
146-
)
147-
148-
def _fit_truncated(self, X, n_components, svd_solver):
149-
"""Fit the model by computing truncated SVD (by ARPACK or randomized) on X"""
150-
n_samples, n_features = X.shape
151-
152-
if isinstance(n_components, six.string_types):
153-
raise ValueError(
154-
"n_components=%r cannot be a string with svd_solver='%s'" %
155-
(n_components, svd_solver)
156-
)
157-
if not 1 <= n_components <= min(n_samples, n_features):
158-
raise ValueError(
159-
"n_components=%r must be between 1 and min(n_samples, "
160-
"n_features)=%r with svd_solver='%s'" % (
161-
n_components, min(n_samples, n_features), svd_solver
162-
)
163-
)
164-
if not isinstance(n_components, (numbers.Integral, np.integer)):
165-
raise ValueError(
166-
"n_components=%r must be of type int when greater than or "
167-
"equal to 1, was of type=%r" % (n_components, type(n_components))
168-
)
169-
if svd_solver == "arpack" and n_components == min(n_samples, n_features):
170-
raise ValueError(
171-
"n_components=%r must be strictly less than min(n_samples, "
172-
"n_features)=%r with svd_solver='%s'" % (
173-
n_components, min(n_samples, n_features), svd_solver
174-
)
175-
)
176-
177-
random_state = check_random_state(self.random_state)
178-
179-
self.mean_ = X.mean(axis=0)
180-
total_var = ut.var(X, axis=0, ddof=1)
181-
182-
if svd_solver == "arpack":
183-
# Center data
184-
X -= self.mean_
185-
# random init solution, as ARPACK does it internally
186-
v0 = random_state.uniform(-1, 1, size=min(X.shape))
187-
U, S, V = sp.linalg.svds(X, k=n_components, tol=self.tol, v0=v0)
188-
# svds doesn't abide by scipy.linalg.svd/randomized_svd
189-
# conventions, so reverse its outputs.
190-
S = S[::-1]
191-
# flip eigenvectors' sign to enforce deterministic output
192-
U, V = svd_flip(U[:, ::-1], V[::-1])
193-
194-
elif svd_solver == "randomized":
195-
# sign flipping is done inside
196-
U, S, V = randomized_pca(
197-
X,
198-
n_components=n_components,
199-
n_iter=self.iterated_power,
200-
flip_sign=True,
201-
random_state=random_state,
202-
)
203-
204-
self.n_samples_ = n_samples
205-
self.components_ = V
206-
self.n_components_ = n_components
207-
208-
# Get variance explained by singular values
209-
self.explained_variance_ = (S ** 2) / (n_samples - 1)
210-
self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
211-
self.singular_values_ = S.copy() # Store the singular values.
212-
213-
if self.n_components_ < min(n_features, n_samples):
214-
self.noise_variance_ = (total_var.sum() - self.explained_variance_.sum())
215-
self.noise_variance_ /= min(n_features, n_samples) - n_components
216-
else:
217-
self.noise_variance_ = 0
218-
219-
return U, S, V
220-
221-
def transform(self, X):
222-
check_is_fitted(self, ["mean_", "components_"], all_or_any=all)
223-
224-
X = self._validate_data(
225-
X,
226-
accept_sparse=["csr", "csc"],
227-
dtype=[np.float64, np.float32],
228-
reset=False,
229-
copy=self.copy
230-
)
231-
232-
if self.mean_ is not None:
233-
X = X - self.mean_
234-
X_transformed = np.dot(X, self.components_.T)
235-
if self.whiten:
236-
X_transformed /= np.sqrt(self.explained_variance_)
237-
return X_transformed
238-
239-
24015
class _FeatureScorerMixin(LearnerScorer):
24116
feature_type = Variable
24217
component = 0
@@ -250,7 +25,7 @@ def score(self, data):
25025

25126

25227
class PCA(SklProjector, _FeatureScorerMixin):
253-
__wraps__ = ImprovedPCA
28+
__wraps__ = skl_decomposition.PCA
25429
name = 'PCA'
25530
supports_sparse = True
25631

@@ -264,6 +39,15 @@ def fit(self, X, Y=None):
26439
params = self.params.copy()
26540
if params["n_components"] is not None:
26641
params["n_components"] = min(min(X.shape), params["n_components"])
42+
43+
# scikit-learn doesn't support requesting the same number of PCs as
44+
# there are columns when the data is sparse. In this case, densify the
45+
# data. Since we're essentially requesting back a PC matrix of the same
46+
# size as the original data, we will assume the matrix is small enough
47+
# to densify as well
48+
if sp.issparse(X) and params["n_components"] == min(X.shape):
49+
X = X.toarray()
50+
26751
proj = self.__wraps__(**params)
26852
proj = proj.fit(X, Y)
26953
return PCAModel(proj, self.domain, len(proj.components_))
@@ -339,7 +123,7 @@ def fit(self, X, Y=None):
339123
params = self.params.copy()
340124
# strict requirement in scikit fit_transform:
341125
# n_components must be < n_features
342-
params["n_components"] = min(min(X.shape)-1, params["n_components"])
126+
params["n_components"] = min(min(X.shape) - 1, params["n_components"])
343127

344128
proj = self.__wraps__(**params)
345129
proj = proj.fit(X, Y)

Orange/tests/test_pca.py

Lines changed: 2 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,12 @@
22
# pylint: disable=missing-docstring
33
import pickle
44
import unittest
5-
from unittest.mock import MagicMock
65

76
import numpy as np
8-
from sklearn.utils import check_random_state
97

10-
from Orange.data import Table, Domain
8+
from Orange.data import Table
119
from Orange.preprocess import Continuize, Normalize
12-
from Orange.projection import pca, PCA, SparsePCA, IncrementalPCA, TruncatedSVD
10+
from Orange.projection import PCA, SparsePCA, IncrementalPCA, TruncatedSVD
1311
from Orange.tests import test_filename
1412

1513

@@ -65,95 +63,6 @@ def __rnd_pca_test_helper(self, data, n_com, min_xpl_var):
6563
proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
6664
np.testing.assert_almost_equal(pca_model(data).X, proj)
6765

68-
def test_improved_randomized_pca_properly_called(self):
69-
# It doesn't matter what we put into the matrix
70-
x_ = np.random.normal(0, 1, (100, 20))
71-
x = Table.from_numpy(Domain.from_numpy(x_), x_)
72-
73-
pca.randomized_pca = MagicMock(wraps=pca.randomized_pca)
74-
PCA(10, svd_solver="randomized", random_state=42)(x)
75-
pca.randomized_pca.assert_called_once()
76-
77-
pca.randomized_pca.reset_mock()
78-
PCA(10, svd_solver="arpack", random_state=42)(x)
79-
pca.randomized_pca.assert_not_called()
80-
81-
def test_improved_randomized_pca_dense_data(self):
82-
"""Randomized PCA should work well on dense data."""
83-
random_state = check_random_state(42)
84-
85-
# Let's take a tall, skinny matrix
86-
x_ = random_state.normal(0, 1, (100, 20))
87-
x = Table.from_numpy(Domain.from_numpy(x_), x_)
88-
89-
pca = PCA(10, svd_solver="full", random_state=random_state)(x)
90-
rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)
91-
92-
np.testing.assert_almost_equal(
93-
pca.components_, rpca.components_, decimal=8
94-
)
95-
np.testing.assert_almost_equal(
96-
pca.explained_variance_, rpca.explained_variance_, decimal=8
97-
)
98-
np.testing.assert_almost_equal(
99-
pca.singular_values_, rpca.singular_values_, decimal=8
100-
)
101-
102-
# And take a short, fat matrix
103-
x_ = random_state.normal(0, 1, (20, 100))
104-
x = Table.from_numpy(Domain.from_numpy(x_), x_)
105-
106-
pca = PCA(10, svd_solver="full", random_state=random_state)(x)
107-
rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)
108-
109-
np.testing.assert_almost_equal(
110-
pca.components_, rpca.components_, decimal=8
111-
)
112-
np.testing.assert_almost_equal(
113-
pca.explained_variance_, rpca.explained_variance_, decimal=8
114-
)
115-
np.testing.assert_almost_equal(
116-
pca.singular_values_, rpca.singular_values_, decimal=8
117-
)
118-
119-
def test_improved_randomized_pca_sparse_data(self):
120-
"""Randomized PCA should work well on sparse data."""
121-
random_state = check_random_state(42)
122-
123-
# Let's take a tall, skinny matrix
124-
x_ = random_state.negative_binomial(1, 0.5, (100, 20))
125-
x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()
126-
127-
pca = PCA(10, svd_solver="full", random_state=random_state)(x.to_dense())
128-
rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)
129-
130-
np.testing.assert_almost_equal(
131-
pca.components_, rpca.components_, decimal=8
132-
)
133-
np.testing.assert_almost_equal(
134-
pca.explained_variance_, rpca.explained_variance_, decimal=8
135-
)
136-
np.testing.assert_almost_equal(
137-
pca.singular_values_, rpca.singular_values_, decimal=8
138-
)
139-
140-
# And take a short, fat matrix
141-
x_ = random_state.negative_binomial(1, 0.5, (20, 100))
142-
x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()
143-
144-
pca = PCA(10, svd_solver="full", random_state=random_state)(x.to_dense())
145-
rpca = PCA(10, svd_solver="randomized", random_state=random_state)(x)
146-
147-
np.testing.assert_almost_equal(
148-
pca.components_, rpca.components_, decimal=8
149-
)
150-
np.testing.assert_almost_equal(
151-
pca.explained_variance_, rpca.explained_variance_, decimal=8
152-
)
153-
np.testing.assert_almost_equal(
154-
pca.singular_values_, rpca.singular_values_, decimal=8
155-
)
156-
15766
def test_incremental_pca(self):
15867
data = self.ionosphere
15968
self.__ipca_test_helper(data, n_com=3, min_xpl_var=0.49)

Orange/widgets/unsupervised/tests/test_owpca.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def test_normalized_gives_correct_result(self, prepare_table):
223223
x = (x - x.mean(0)) / x.std(0)
224224
U, S, Va = np.linalg.svd(x)
225225
U, S, Va = U[:, :2], S[:2], Va[:2]
226-
U, Va = svd_flip(U, Va)
226+
U, Va = svd_flip(U, Va, u_based_decision=False)
227227
pca_embedding = U * S
228228

229229
np.testing.assert_almost_equal(widget_result.X, pca_embedding)

0 commit comments

Comments
 (0)