Skip to content

Commit d974ef4

Browse files
PCA: Preserve f32s & reduce memory footprint when computing means
1 parent dd6093d commit d974ef4

File tree

3 files changed: 44 additions and 17 deletions.

Orange/projection/pca.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import numbers
2-
32
import six
43
import numpy as np
54
import scipy.sparse as sp
65
from scipy.linalg import lu, qr, svd
6+
77
from sklearn import decomposition as skl_decomposition
88
from sklearn.utils import check_array, check_random_state
99
from sklearn.utils.extmath import svd_flip, safe_sparse_dot
@@ -45,10 +45,13 @@ def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
4545

4646
n_samples, n_features = A.shape
4747

48-
c = np.atleast_2d(A.mean(axis=0))
48+
c = np.atleast_2d(ut.nanmean(A, axis=0))
4949

5050
if n_samples >= n_features:
5151
Q = random_state.normal(size=(n_features, n_components + n_oversamples))
52+
if A.dtype.kind == "f":
53+
Q = Q.astype(A.dtype, copy=False)
54+
5255
Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
5356

5457
# Normalized power iterations
@@ -66,6 +69,9 @@ def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
6669

6770
else: # n_features > n_samples
6871
Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
72+
if A.dtype.kind == "f":
73+
Q = Q.astype(A.dtype, copy=False)
74+
6975
Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
7076

7177
# Normalized power iterations

Orange/statistics/util.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import bottleneck as bn
1111
from scipy import sparse as sp
1212
import scipy.stats.stats
13+
from sklearn.utils.sparsefuncs import mean_variance_axis
1314

1415

1516
def _count_nans_per_row_sparse(X, weights, dtype=None):
@@ -422,14 +423,15 @@ def nansum_sparse(x):
422423

423424
def nanmean(x, axis=None):
    """Equivalent of np.nanmean that supports sparse or dense matrices.

    Parameters
    ----------
    x : array-like or scipy.sparse matrix
        Input data. NaN entries are treated as missing values.
    axis : int or None, optional
        Axis along which the means are computed. ``None`` (the default)
        computes a single mean over all elements.

    Returns
    -------
    float or np.ndarray
        For dense input, whatever ``np.nanmean`` returns. For sparse input,
        a scalar when ``axis`` is None, otherwise the per-axis means
        computed by ``mean_variance_axis``.
    """
    if not sp.issparse(x):
        return np.nanmean(x, axis=axis)

    # mean_variance_axis accepts only CSR and CSC input, and the scalar path
    # below needs a flat numeric ``data`` array, so normalise other formats.
    if x.format not in ("csr", "csc"):
        x = x.tocsr()

    if axis is not None:
        means, _ = mean_variance_axis(x, axis=axis)
        return means

    # Averaging the per-column means would weight every column equally and
    # give a wrong result whenever columns contain different numbers of NaNs.
    # Instead, sum the stored values once (implicit zeros add nothing) and
    # divide by the number of non-NaN elements; this never densifies x.
    n_values = x.shape[0] * x.shape[1] - np.count_nonzero(np.isnan(x.data))
    if not n_values:
        import warnings  # local import: only needed on this rare path

        warnings.warn("Mean of empty slice", RuntimeWarning)
        return np.nan
    return np.nansum(x.data) / n_values
433435

434436

435437
def nanvar(x, axis=None, ddof=0):

Orange/tests/test_statistics.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from Orange.statistics.util import bincount, countnans, contingency, digitize, \
1111
mean, nanmax, nanmean, nanmedian, nanmin, nansum, nanunique, stats, std, \
1212
unique, var, nanstd, nanvar, nanmode
13+
from sklearn.utils import check_random_state
1314

1415

1516
def dense_sparse(test_case):
@@ -166,13 +167,6 @@ def test_mean(self):
166167
with self.assertWarns(UserWarning):
167168
mean([1, np.nan, 0])
168169

169-
def test_nanmean(self):
170-
for X in self.data:
171-
X_sparse = csr_matrix(X)
172-
np.testing.assert_array_equal(
173-
nanmean(X_sparse),
174-
np.nanmean(X))
175-
176170
def test_nanmode(self):
177171
X = np.array([[np.nan, np.nan, 1, 1],
178172
[2, np.nan, 1, 1]])
@@ -270,6 +264,31 @@ def test_nanstd_with_ddof(self):
270264
)
271265

272266

267+
class TestNanmean(unittest.TestCase):
    """Compare `nanmean` with `np.nanmean` for each supported axis.

    Every test receives `array` through the `dense_sparse` decorator and
    applies it to the fixture before calling `nanmean` (presumably it builds
    the dense or sparse variant under test -- see `dense_sparse`).
    """

    def setUp(self):
        # A fixed seed keeps the fixture identical between runs.
        self.random_state = check_random_state(42)
        self.x = self.random_state.uniform(size=(10, 5))
        # Put NaNs on the main diagonal so missing values are exercised.
        np.fill_diagonal(self.x, np.nan)

    @dense_sparse
    def test_axis_none(self, array):
        """The mean over all elements matches numpy's."""
        expected = np.nanmean(self.x)
        actual = nanmean(array(self.x))
        np.testing.assert_almost_equal(expected, actual)

    @dense_sparse
    def test_axis_0(self, array):
        """Means computed along axis 0 match numpy's."""
        expected = np.nanmean(self.x, axis=0)
        actual = nanmean(array(self.x), axis=0)
        np.testing.assert_almost_equal(expected, actual)

    @dense_sparse
    def test_axis_1(self, array):
        """Means computed along axis 1 match numpy's."""
        expected = np.nanmean(self.x, axis=1)
        actual = nanmean(array(self.x), axis=1)
        np.testing.assert_almost_equal(expected, actual)
291+
273292
class TestDigitize(unittest.TestCase):
274293
def setUp(self):
275294
# pylint: disable=bad-whitespace

0 commit comments

Comments
 (0)