1- import numbers
2- import six
31import numpy as np
42import scipy .sparse as sp
5- from scipy .linalg import lu , qr , svd
6-
73from sklearn import decomposition as skl_decomposition
8- from sklearn .utils import check_array , check_random_state
9- from sklearn .utils .extmath import svd_flip , safe_sparse_dot
10- from sklearn .utils .validation import check_is_fitted
114
125import Orange .data
13- from Orange .statistics import util as ut
146from Orange .data import Variable
157from Orange .data .util import get_unique_names
168from Orange .misc .wrapper_meta import WrapperMeta
2012__all__ = ["PCA" , "SparsePCA" , "IncrementalPCA" , "TruncatedSVD" ]
2113
2214
def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
                   flip_sign=True, random_state=0):
    """Compute the randomized PCA decomposition of a given matrix.

    This method differs from the scikit-learn implementation in that it
    supports and handles sparse matrices well: ``A`` is never densified, and
    column-centering is applied implicitly via rank-one corrections with the
    column-mean vector ``c``.

    Parameters
    ----------
    A : ndarray or sparse matrix, shape (n_samples, n_features)
        The matrix to decompose.
    n_components : int
        Number of principal components to return.
    n_oversamples : int
        Extra random projection directions; improves the accuracy of the
        range approximation at small extra cost.
    n_iter : int or "auto"
        Number of normalized power iterations. ``"auto"`` picks 7 when few
        components are requested, otherwise 4 (compromise found for PCA,
        see sklearn issue #5299).
    flip_sign : bool
        If True, enforce a deterministic sign convention on the factors
        via ``svd_flip``.
    random_state : int, RandomState instance or None
        Seed or generator used to draw the Gaussian test matrix.

    Returns
    -------
    U : ndarray, shape (n_samples, n_components)
    s : ndarray, shape (n_components,)
    V : ndarray, shape (n_components, n_features)
        Truncated SVD factors of the (implicitly) column-centered ``A``.

    """
    # BUG FIX: the default (and any int/None seed) previously crashed on
    # `random_state.normal`; normalize to a RandomState first. Existing
    # callers that already pass a RandomState are returned unchanged.
    random_state = check_random_state(random_state)

    if n_iter == "auto":
        # Checks if the number of iterations is explicitly specified
        # Adjust n_iter. 7 was found a good compromise for PCA. See sklearn #5299
        n_iter = 7 if n_components < .1 * min(A.shape) else 4

    n_samples, n_features = A.shape

    # Column means, kept 2-D so the rank-one centering corrections below
    # broadcast correctly; nanmean also tolerates missing values.
    c = np.atleast_2d(ut.nanmean(A, axis=0))

    if n_samples >= n_features:
        Q = random_state.normal(size=(n_features, n_components + n_oversamples))
        if A.dtype.kind == "f":
            # Ensure f32 is preserved as f32
            Q = Q.astype(A.dtype, copy=False)

        # (A - 1c)Q computed without densifying A
        Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        # Project the centered data onto the orthonormal range basis and
        # take its (small) exact SVD
        QA = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
        R, s, V = svd(QA.T, full_matrices=False)
        U = Q.dot(R)

    else:  # n_features > n_samples
        Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
        if A.dtype.kind == "f":
            # Ensure f32 is preserved as f32
            Q = Q.astype(A.dtype, copy=False)

        # (A - 1c).T Q computed without densifying A
        Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])

        # Normalized power iterations
        for _ in range(n_iter):
            Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
            Q, _ = lu(Q, permute_l=True)
            Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
            Q, _ = lu(Q, permute_l=True)

        Q, _ = qr(Q, mode="economic")

        QA = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
        U, s, R = svd(QA, full_matrices=False)
        V = R.dot(Q.T)

    if flip_sign:
        # Deterministic output regardless of the random draw's sign
        U, V = svd_flip(U, V)

    # Drop the oversampled directions
    return U[:, :n_components], s[:n_components], V[:n_components, :]
84-
85-
class ImprovedPCA(skl_decomposition.PCA):
    """Patch sklearn PCA learner to include randomized PCA for sparse matrices.

    Scikit-learn does not currently support sparse matrices at all, even though
    efficient methods exist for PCA. This class patches the default scikit-learn
    implementation to properly handle sparse matrices.

    Notes
    -----
    - This should be removed once scikit-learn releases a version which
      implements this functionality.

    """
    # pylint: disable=too-many-branches
    def _fit(self, X):
        """Dispatch to the right submethod depending on the chosen solver."""
        X = self._validate_data(
            X,
            dtype=[np.float64, np.float32],
            reset=False,
            accept_sparse=["csr", "csc"],
            copy=self.copy
        )

        # Handle n_components==None
        if self.n_components is None:
            if self.svd_solver != "arpack":
                n_components = min(X.shape)
            else:
                # ARPACK cannot compute the full spectrum
                n_components = min(X.shape) - 1
        else:
            n_components = self.n_components

        # Handle svd_solver
        self._fit_svd_solver = self.svd_solver
        if self._fit_svd_solver == "auto":
            # Sparse data can only be handled with the randomized solver
            if sp.issparse(X):
                self._fit_svd_solver = "randomized"
            # Small problem or n_components == 'mle', just call full PCA
            elif max(X.shape) <= 500 or n_components == "mle":
                self._fit_svd_solver = "full"
            elif 1 <= n_components < .8 * min(X.shape):
                self._fit_svd_solver = "randomized"
            # This is also the case of n_components in (0,1)
            else:
                self._fit_svd_solver = "full"

        # Ensure we don't try call arpack or full on a sparse matrix
        if sp.issparse(X) and self._fit_svd_solver != "randomized":
            raise ValueError("only the randomized solver supports sparse matrices")

        # Call different fits for either full or truncated SVD
        if self._fit_svd_solver == "full":
            return self._fit_full(X, n_components)
        elif self._fit_svd_solver in ["arpack", "randomized"]:
            return self._fit_truncated(X, n_components, self._fit_svd_solver)
        else:
            raise ValueError(
                "Unrecognized svd_solver='{0}'".format(self._fit_svd_solver)
            )

    def _fit_truncated(self, X, n_components, svd_solver):
        """Fit the model by computing truncated SVD (by ARPACK or randomized) on X"""
        n_samples, n_features = X.shape

        # `six.string_types` was a Python-2 compatibility shim; on Python 3
        # the check is simply `str`, which drops the dead `six` dependency
        if isinstance(n_components, str):
            raise ValueError(
                "n_components=%r cannot be a string with svd_solver='%s'" %
                (n_components, svd_solver)
            )
        if not 1 <= n_components <= min(n_samples, n_features):
            raise ValueError(
                "n_components=%r must be between 1 and min(n_samples, "
                "n_features)=%r with svd_solver='%s'" % (
                    n_components, min(n_samples, n_features), svd_solver
                )
            )
        if not isinstance(n_components, (numbers.Integral, np.integer)):
            raise ValueError(
                "n_components=%r must be of type int when greater than or "
                "equal to 1, was of type=%r" % (n_components, type(n_components))
            )
        if svd_solver == "arpack" and n_components == min(n_samples, n_features):
            # ARPACK requires strictly fewer components than the matrix rank
            raise ValueError(
                "n_components=%r must be strictly less than min(n_samples, "
                "n_features)=%r with svd_solver='%s'" % (
                    n_components, min(n_samples, n_features), svd_solver
                )
            )

        random_state = check_random_state(self.random_state)

        # Column means / per-column variance; ut.var works on sparse too
        self.mean_ = X.mean(axis=0)
        total_var = ut.var(X, axis=0, ddof=1)

        if svd_solver == "arpack":
            # Center data
            X -= self.mean_
            # random init solution, as ARPACK does it internally
            v0 = random_state.uniform(-1, 1, size=min(X.shape))
            U, S, V = sp.linalg.svds(X, k=n_components, tol=self.tol, v0=v0)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            S = S[::-1]
            # flip eigenvectors' sign to enforce deterministic output
            U, V = svd_flip(U[:, ::-1], V[::-1])

        elif svd_solver == "randomized":
            # sign flipping is done inside; randomized_pca centers implicitly,
            # so sparse X is not densified here
            U, S, V = randomized_pca(
                X,
                n_components=n_components,
                n_iter=self.iterated_power,
                flip_sign=True,
                random_state=random_state,
            )

        self.n_samples_ = n_samples
        self.components_ = V
        self.n_components_ = n_components

        # Get variance explained by singular values
        self.explained_variance_ = (S ** 2) / (n_samples - 1)
        self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
        self.singular_values_ = S.copy()  # Store the singular values.

        if self.n_components_ < min(n_features, n_samples):
            # Average variance of the discarded components (PPCA noise term)
            self.noise_variance_ = (total_var.sum() - self.explained_variance_.sum())
            self.noise_variance_ /= min(n_features, n_samples) - n_components
        else:
            self.noise_variance_ = 0

        return U, S, V

    def transform(self, X):
        """Project X onto the fitted components; accepts sparse input."""
        check_is_fitted(self, ["mean_", "components_"], all_or_any=all)

        X = self._validate_data(
            X,
            accept_sparse=["csr", "csc"],
            dtype=[np.float64, np.float32],
            reset=False,
            copy=self.copy
        )

        if self.mean_ is not None:
            # NOTE: for sparse X this subtraction densifies the result;
            # acceptable since the projection below is dense anyway
            X = X - self.mean_
        X_transformed = np.dot(X, self.components_.T)
        if self.whiten:
            # Scale scores to unit component-wise variance
            X_transformed /= np.sqrt(self.explained_variance_)
        return X_transformed
238-
239-
24015class _FeatureScorerMixin (LearnerScorer ):
24116 feature_type = Variable
24217 component = 0
@@ -250,7 +25,7 @@ def score(self, data):
25025
25126
25227class PCA (SklProjector , _FeatureScorerMixin ):
253- __wraps__ = ImprovedPCA
28+ __wraps__ = skl_decomposition . PCA
25429 name = 'PCA'
25530 supports_sparse = True
25631
@@ -264,6 +39,15 @@ def fit(self, X, Y=None):
26439 params = self .params .copy ()
26540 if params ["n_components" ] is not None :
26641 params ["n_components" ] = min (min (X .shape ), params ["n_components" ])
42+
43+ # scikit-learn doesn't support requesting the same number of PCs as
44+ # there are columns when the data is sparse. In this case, densify the
45+ # data. Since we're essentially requesting back a PC matrix of the same
46+ # size as the original data, we will assume the matrix is small enough
47+ # to densify as well
48+ if sp .issparse (X ) and params ["n_components" ] == min (X .shape ):
49+ X = X .toarray ()
50+
26751 proj = self .__wraps__ (** params )
26852 proj = proj .fit (X , Y )
26953 return PCAModel (proj , self .domain , len (proj .components_ ))
@@ -339,7 +123,7 @@ def fit(self, X, Y=None):
339123 params = self .params .copy ()
340124 # strict requirement in scikit fit_transform:
341125 # n_components must be < n_features
342- params ["n_components" ] = min (min (X .shape )- 1 , params ["n_components" ])
126+ params ["n_components" ] = min (min (X .shape ) - 1 , params ["n_components" ])
343127
344128 proj = self .__wraps__ (** params )
345129 proj = proj .fit (X , Y )
0 commit comments