Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 11 additions & 227 deletions Orange/projection/pca.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
import numbers
import six
import numpy as np
import scipy.sparse as sp
from scipy.linalg import lu, qr, svd

from sklearn import decomposition as skl_decomposition
from sklearn.utils import check_array, check_random_state
from sklearn.utils.extmath import svd_flip, safe_sparse_dot
from sklearn.utils.validation import check_is_fitted

import Orange.data
from Orange.statistics import util as ut
from Orange.data import Variable
from Orange.data.util import get_unique_names
from Orange.misc.wrapper_meta import WrapperMeta
Expand All @@ -20,223 +12,6 @@
__all__ = ["PCA", "SparsePCA", "IncrementalPCA", "TruncatedSVD"]


def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
                   flip_sign=True, random_state=0):
    """Compute the randomized PCA decomposition of a given matrix.

    This method differs from the scikit-learn implementation in that it
    supports and handles sparse matrices well: centering is performed
    implicitly through the rank-one mean correction terms, so a sparse `A`
    is never densified.

    Parameters
    ----------
    A : array-like or sparse matrix, shape (n_samples, n_features)
        The matrix to decompose.
    n_components : int
        Number of principal components to compute.
    n_oversamples : int
        Extra random projection directions used to improve the accuracy of
        the range approximation.
    n_iter : int or "auto"
        Number of normalized power iterations. "auto" uses 7 for small
        component counts and 4 otherwise (see sklearn issue #5299).
    flip_sign : bool
        If True, flip signs with `svd_flip` for deterministic output.
    random_state : int, RandomState instance or None
        Seed or generator for the random projections.

    Returns
    -------
    U, s, V : ndarray
        Truncated SVD factors of the implicitly centered matrix:
        U is (n_samples, n_components), s is (n_components,),
        V is (n_components, n_features).
    """
    # Fix: accept ints/None for `random_state`. Previously the function
    # called `random_state.normal(...)` directly, so the documented default
    # of 0 (an int) would raise AttributeError unless callers pre-converted.
    random_state = check_random_state(random_state)

    if n_iter == "auto":
        # Checks if the number of iterations is explicitly specified
        # Adjust n_iter. 7 was found a good compromise for PCA. See sklearn #5299
        n_iter = 7 if n_components < .1 * min(A.shape) else 4

    n_samples, n_features = A.shape

    # Row vector of column means; subtracting the `c`-based terms below is
    # equivalent to centering A without materializing a dense copy.
    c = np.atleast_2d(ut.nanmean(A, axis=0))

    if n_samples >= n_features:
        Q = random_state.normal(size=(n_features, n_components + n_oversamples))
        if A.dtype.kind == "f":
            # Keep the input's float precision (e.g. float32 stays float32)
            Q = Q.astype(A.dtype, copy=False)

        Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        QA = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
        R, s, V = svd(QA.T, full_matrices=False)
        U = Q.dot(R)

    else:  # n_features > n_samples
        Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
        if A.dtype.kind == "f":
            # Keep the input's float precision (e.g. float32 stays float32)
            Q = Q.astype(A.dtype, copy=False)

        Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        QA = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
        U, s, R = svd(QA, full_matrices=False)
        V = R.dot(Q.T)

    if flip_sign:
        U, V = svd_flip(U, V)

    return U[:, :n_components], s[:n_components], V[:n_components, :]


class ImprovedPCA(skl_decomposition.PCA):
    """Patch sklearn PCA learner to include randomized PCA for sparse matrices.

    Scikit-learn does not currently support sparse matrices at all, even though
    efficient methods exist for PCA. This class patches the default scikit-learn
    implementation to properly handle sparse matrices.

    Notes
    -----
    - This should be removed once scikit-learn releases a version which
      implements this functionality.
    """
    # pylint: disable=too-many-branches
    def _fit(self, X):
        """Dispatch to the right submethod depending on the chosen solver."""
        X = self._validate_data(
            X,
            dtype=[np.float64, np.float32],
            reset=False,
            accept_sparse=["csr", "csc"],
            copy=self.copy
        )

        # Handle n_components==None
        if self.n_components is None:
            if self.svd_solver != "arpack":
                n_components = min(X.shape)
            else:
                # ARPACK cannot compute the full spectrum
                n_components = min(X.shape) - 1
        else:
            n_components = self.n_components

        # Handle svd_solver
        self._fit_svd_solver = self.svd_solver
        if self._fit_svd_solver == "auto":
            # Sparse data can only be handled with the randomized solver
            if sp.issparse(X):
                self._fit_svd_solver = "randomized"
            # Small problem or n_components == 'mle', just call full PCA
            elif max(X.shape) <= 500 or n_components == "mle":
                self._fit_svd_solver = "full"
            elif 1 <= n_components < .8 * min(X.shape):
                self._fit_svd_solver = "randomized"
            # This is also the case of n_components in (0,1)
            else:
                self._fit_svd_solver = "full"

        # Ensure we don't try call arpack or full on a sparse matrix
        if sp.issparse(X) and self._fit_svd_solver != "randomized":
            raise ValueError("only the randomized solver supports sparse matrices")

        # Call different fits for either full or truncated SVD
        if self._fit_svd_solver == "full":
            return self._fit_full(X, n_components)
        elif self._fit_svd_solver in ["arpack", "randomized"]:
            return self._fit_truncated(X, n_components, self._fit_svd_solver)
        else:
            raise ValueError(
                "Unrecognized svd_solver='{0}'".format(self._fit_svd_solver)
            )

    def _fit_truncated(self, X, n_components, svd_solver):
        """Fit the model by computing truncated SVD (by ARPACK or randomized) on X.

        Sets the fitted attributes (`mean_`, `components_`,
        `explained_variance_`, ...) on `self` and returns the SVD factors.

        Raises
        ------
        ValueError
            If `n_components` is a string, non-integral, or outside the
            range valid for the chosen solver.
        """
        n_samples, n_features = X.shape

        # Fix: `six.string_types` is a Python 2 compatibility shim; the rest
        # of this module is Python 3 only, so plain `str` is equivalent.
        if isinstance(n_components, str):
            raise ValueError(
                "n_components=%r cannot be a string with svd_solver='%s'" %
                (n_components, svd_solver)
            )
        if not 1 <= n_components <= min(n_samples, n_features):
            raise ValueError(
                "n_components=%r must be between 1 and min(n_samples, "
                "n_features)=%r with svd_solver='%s'" % (
                    n_components, min(n_samples, n_features), svd_solver
                )
            )
        if not isinstance(n_components, (numbers.Integral, np.integer)):
            raise ValueError(
                "n_components=%r must be of type int when greater than or "
                "equal to 1, was of type=%r" % (n_components, type(n_components))
            )
        if svd_solver == "arpack" and n_components == min(n_samples, n_features):
            raise ValueError(
                "n_components=%r must be strictly less than min(n_samples, "
                "n_features)=%r with svd_solver='%s'" % (
                    n_components, min(n_samples, n_features), svd_solver
                )
            )

        random_state = check_random_state(self.random_state)

        self.mean_ = X.mean(axis=0)
        # `ut.var` handles both dense and sparse X; ddof=1 for sample variance
        total_var = ut.var(X, axis=0, ddof=1)

        if svd_solver == "arpack":
            # Center data
            X -= self.mean_
            # random init solution, as ARPACK does it internally
            v0 = random_state.uniform(-1, 1, size=min(X.shape))
            U, S, V = sp.linalg.svds(X, k=n_components, tol=self.tol, v0=v0)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            S = S[::-1]
            # flip eigenvectors' sign to enforce deterministic output
            U, V = svd_flip(U[:, ::-1], V[::-1])

        elif svd_solver == "randomized":
            # sign flipping is done inside
            U, S, V = randomized_pca(
                X,
                n_components=n_components,
                n_iter=self.iterated_power,
                flip_sign=True,
                random_state=random_state,
            )

        self.n_samples_ = n_samples
        self.components_ = V
        self.n_components_ = n_components

        # Get variance explained by singular values
        self.explained_variance_ = (S ** 2) / (n_samples - 1)
        self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
        self.singular_values_ = S.copy()  # Store the singular values.

        if self.n_components_ < min(n_features, n_samples):
            # Average variance of the discarded components
            self.noise_variance_ = (total_var.sum() - self.explained_variance_.sum())
            self.noise_variance_ /= min(n_features, n_samples) - n_components
        else:
            self.noise_variance_ = 0

        return U, S, V

    def transform(self, X):
        """Apply dimensionality reduction to X.

        Unlike the sklearn implementation, this works on sparse X: instead of
        densifying via `X - mean_` up front, the projection relies on numpy's
        handling of the (matrix) mean subtraction.
        """
        check_is_fitted(self, ["mean_", "components_"], all_or_any=all)

        X = self._validate_data(
            X,
            accept_sparse=["csr", "csc"],
            dtype=[np.float64, np.float32],
            reset=False,
            copy=self.copy
        )

        if self.mean_ is not None:
            X = X - self.mean_
        X_transformed = np.dot(X, self.components_.T)
        if self.whiten:
            X_transformed /= np.sqrt(self.explained_variance_)
        return X_transformed


class _FeatureScorerMixin(LearnerScorer):
feature_type = Variable
component = 0
Expand All @@ -250,7 +25,7 @@ def score(self, data):


class PCA(SklProjector, _FeatureScorerMixin):
__wraps__ = ImprovedPCA
__wraps__ = skl_decomposition.PCA
name = 'PCA'
supports_sparse = True

Expand All @@ -264,6 +39,15 @@ def fit(self, X, Y=None):
params = self.params.copy()
if params["n_components"] is not None:
params["n_components"] = min(min(X.shape), params["n_components"])

# scikit-learn doesn't support requesting the same number of PCs as
# there are columns when the data is sparse. In this case, densify the
# data. Since we're essentially requesting back a PC matrix of the same
# size as the original data, we will assume the matrix is small enough
# to densify as well
if sp.issparse(X) and params["n_components"] == min(X.shape):
X = X.toarray()

proj = self.__wraps__(**params)
proj = proj.fit(X, Y)
return PCAModel(proj, self.domain, len(proj.components_))
Expand Down Expand Up @@ -339,7 +123,7 @@ def fit(self, X, Y=None):
params = self.params.copy()
# strict requirement in scikit fit_transform:
# n_components must be < n_features
params["n_components"] = min(min(X.shape)-1, params["n_components"])
params["n_components"] = min(min(X.shape) - 1, params["n_components"])

proj = self.__wraps__(**params)
proj = proj.fit(X, Y)
Expand Down
95 changes: 2 additions & 93 deletions Orange/tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@
# pylint: disable=missing-docstring
import pickle
import unittest
from unittest.mock import MagicMock

import numpy as np
from sklearn.utils import check_random_state

from Orange.data import Table, Domain
from Orange.data import Table
from Orange.preprocess import Continuize, Normalize
from Orange.projection import pca, PCA, SparsePCA, IncrementalPCA, TruncatedSVD
from Orange.projection import PCA, SparsePCA, IncrementalPCA, TruncatedSVD
from Orange.tests import test_filename


Expand Down Expand Up @@ -65,95 +63,6 @@ def __rnd_pca_test_helper(self, data, n_com, min_xpl_var):
proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
np.testing.assert_almost_equal(pca_model(data).X, proj)

def test_improved_randomized_pca_properly_called(self):
    """`randomized_pca` is used iff the randomized solver is requested."""
    # It doesn't matter what we put into the matrix
    x_ = np.random.normal(0, 1, (100, 20))
    x = Table.from_numpy(Domain.from_numpy(x_), x_)

    # Fix: the original assigned the mock to `pca.randomized_pca` and never
    # restored it, leaking the MagicMock into every subsequently-run test.
    original = pca.randomized_pca
    pca.randomized_pca = MagicMock(wraps=original)
    try:
        PCA(10, svd_solver="randomized", random_state=42)(x)
        pca.randomized_pca.assert_called_once()

        pca.randomized_pca.reset_mock()
        PCA(10, svd_solver="arpack", random_state=42)(x)
        pca.randomized_pca.assert_not_called()
    finally:
        # Always restore the real function on the module
        pca.randomized_pca = original

def test_improved_randomized_pca_dense_data(self):
    """Randomized PCA should work well on dense data."""
    random_state = check_random_state(42)

    def assert_equal_decompositions(a, b):
        # Both solvers must agree on components, variances and singular values
        np.testing.assert_almost_equal(a.components_, b.components_, decimal=8)
        np.testing.assert_almost_equal(
            a.explained_variance_, b.explained_variance_, decimal=8
        )
        np.testing.assert_almost_equal(
            a.singular_values_, b.singular_values_, decimal=8
        )

    # Let's take a tall, skinny matrix (n_samples >= n_features code path)
    x_ = random_state.normal(0, 1, (100, 20))
    x = Table.from_numpy(Domain.from_numpy(x_), x_)

    # Fix: named `full_pca`/`rand_pca` rather than `pca`, which shadowed the
    # `Orange.projection.pca` module imported at the top of this file.
    full_pca = PCA(10, svd_solver="full", random_state=random_state)(x)
    rand_pca = PCA(10, svd_solver="randomized", random_state=random_state)(x)
    assert_equal_decompositions(full_pca, rand_pca)

    # And take a short, fat matrix (n_features > n_samples code path)
    x_ = random_state.normal(0, 1, (20, 100))
    x = Table.from_numpy(Domain.from_numpy(x_), x_)

    full_pca = PCA(10, svd_solver="full", random_state=random_state)(x)
    rand_pca = PCA(10, svd_solver="randomized", random_state=random_state)(x)
    assert_equal_decompositions(full_pca, rand_pca)

def test_improved_randomized_pca_sparse_data(self):
    """Randomized PCA should work well on sparse data."""
    # Fix: docstring previously said "dense data" — a copy-paste error from
    # the dense-data test above.
    random_state = check_random_state(42)

    def assert_equal_decompositions(a, b):
        # Both solvers must agree on components, variances and singular values
        np.testing.assert_almost_equal(a.components_, b.components_, decimal=8)
        np.testing.assert_almost_equal(
            a.explained_variance_, b.explained_variance_, decimal=8
        )
        np.testing.assert_almost_equal(
            a.singular_values_, b.singular_values_, decimal=8
        )

    # Let's take a tall, skinny matrix; negative binomial counts give a
    # naturally sparse, non-negative matrix
    x_ = random_state.negative_binomial(1, 0.5, (100, 20))
    x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()

    # Fix: named `full_pca`/`rand_pca` rather than `pca`, which shadowed the
    # `Orange.projection.pca` module imported at the top of this file.
    full_pca = PCA(10, svd_solver="full", random_state=random_state)(x.to_dense())
    rand_pca = PCA(10, svd_solver="randomized", random_state=random_state)(x)
    assert_equal_decompositions(full_pca, rand_pca)

    # And take a short, fat matrix
    x_ = random_state.negative_binomial(1, 0.5, (20, 100))
    x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()

    full_pca = PCA(10, svd_solver="full", random_state=random_state)(x.to_dense())
    rand_pca = PCA(10, svd_solver="randomized", random_state=random_state)(x)
    assert_equal_decompositions(full_pca, rand_pca)

def test_incremental_pca(self):
data = self.ionosphere
self.__ipca_test_helper(data, n_com=3, min_xpl_var=0.49)
Expand Down
2 changes: 1 addition & 1 deletion Orange/widgets/unsupervised/tests/test_owpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def test_normalized_gives_correct_result(self, prepare_table):
x = (x - x.mean(0)) / x.std(0)
U, S, Va = np.linalg.svd(x)
U, S, Va = U[:, :2], S[:2], Va[:2]
U, Va = svd_flip(U, Va)
U, Va = svd_flip(U, Va, u_based_decision=False)
pca_embedding = U * S

np.testing.assert_almost_equal(widget_result.X, pca_embedding)
Expand Down
Loading