PCA: Preserve f32s & reduce memory footprint when computing means

pavlin-policar · pavlin-policar · commit d974ef45a6ce · 2019-02-11T13:55:39.000+01:00
diff --git a/Orange/projection/pca.py b/Orange/projection/pca.py
@@ -1,9 +1,9 @@
 import numbers
-
 import six
 import numpy as np
 import scipy.sparse as sp
 from scipy.linalg import lu, qr, svd
+
 from sklearn import decomposition as skl_decomposition
 from sklearn.utils import check_array, check_random_state
 from sklearn.utils.extmath import svd_flip, safe_sparse_dot
@@ -45,10 +45,13 @@ def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
 
     n_samples, n_features = A.shape
 
-    c = np.atleast_2d(A.mean(axis=0))
+    c = np.atleast_2d(ut.nanmean(A, axis=0))
 
     if n_samples >= n_features:
         Q = random_state.normal(size=(n_features, n_components + n_oversamples))
+        if A.dtype.kind == "f":
+            Q = Q.astype(A.dtype, copy=False)
+
         Q = safe_sparse_dot(A, Q) - safe_sparse_dot(c, Q)
 
         # Normalized power iterations
@@ -66,6 +69,9 @@ def randomized_pca(A, n_components, n_oversamples=10, n_iter="auto",
 
     else:  # n_features > n_samples
         Q = random_state.normal(size=(n_samples, n_components + n_oversamples))
+        if A.dtype.kind == "f":
+            Q = Q.astype(A.dtype, copy=False)
+
         Q = safe_sparse_dot(A.T, Q) - safe_sparse_dot(c.T, Q.sum(axis=0)[None, :])
 
         # Normalized power iterations
diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py
@@ -10,6 +10,7 @@
 import bottleneck as bn
 from scipy import sparse as sp
 import scipy.stats.stats
+from sklearn.utils.sparsefuncs import mean_variance_axis
 
 
 def _count_nans_per_row_sparse(X, weights, dtype=None):
@@ -422,14 +423,34 @@ def nansum_sparse(x):
 
 def nanmean(x, axis=None):
-    """ Equivalent of np.nanmean that supports sparse or dense matrices. """
-    def nanmean_sparse(x):
-        n_values = np.prod(x.shape) - np.sum(np.isnan(x.data))
-        if not n_values:
-            warnings.warn(RuntimeWarning, "Mean of empty slice")
-            return np.nan
-        return np.nansum(x.data) / n_values
-
-    return _apply_func(x, np.nanmean, nanmean_sparse, axis=axis)
+    """Equivalent of np.nanmean that supports sparse or dense matrices.
+
+    Parameters
+    ----------
+    x : array-like or scipy.sparse matrix
+    axis : int or None
+        Axis along which the means are computed; `None` (the default)
+        averages over all elements.
+
+    Returns
+    -------
+    Scalar mean when `axis` is `None`, otherwise the means along `axis`.
+    """
+    if not sp.issparse(x):
+        return np.nanmean(x, axis=axis)
+
+    if axis is None:
+        # Averaging per-column means would weight every column equally and
+        # give a wrong result when columns hold different numbers of NaNs,
+        # so divide the sum of the stored values by the number of non-NaN
+        # cells (implicit zeros included) instead.
+        n_values = np.prod(x.shape) - np.sum(np.isnan(x.data))
+        if not n_values:
+            warnings.warn("Mean of empty slice", RuntimeWarning)
+            return np.nan
+        return np.nansum(x.data) / n_values
+
+    means, _ = mean_variance_axis(x, axis=axis)
+    return means
 
 
 def nanvar(x, axis=None, ddof=0):
diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py
@@ -10,6 +10,7 @@
 from Orange.statistics.util import bincount, countnans, contingency, digitize, \
     mean, nanmax, nanmean, nanmedian, nanmin, nansum, nanunique, stats, std, \
     unique, var, nanstd, nanvar, nanmode
+from sklearn.utils import check_random_state
 
 
 def dense_sparse(test_case):
@@ -166,13 +167,6 @@ def test_mean(self):
         with self.assertWarns(UserWarning):
             mean([1, np.nan, 0])
 
-    def test_nanmean(self):
-        for X in self.data:
-            X_sparse = csr_matrix(X)
-            np.testing.assert_array_equal(
-                nanmean(X_sparse),
-                np.nanmean(X))
-
     def test_nanmode(self):
         X = np.array([[np.nan, np.nan, 1, 1],
                       [2, np.nan, 1, 1]])
@@ -270,6 +264,31 @@ def test_nanstd_with_ddof(self):
             )
 
 
+class TestNanmean(unittest.TestCase):
+    """Check `nanmean` against `np.nanmean` on dense and sparse input."""
+
+    def setUp(self):
+        # 10x5 uniform sample whose main diagonal (one cell per column) is NaN
+        self.random_state = check_random_state(42)
+        self.x = self.random_state.uniform(size=(10, 5))
+        np.fill_diagonal(self.x, np.nan)
+
+    @dense_sparse
+    def test_axis_none(self, array):
+        expected = np.nanmean(self.x)
+        np.testing.assert_almost_equal(expected, nanmean(array(self.x)))
+
+    @dense_sparse
+    def test_axis_0(self, array):
+        expected = np.nanmean(self.x, axis=0)
+        np.testing.assert_almost_equal(expected, nanmean(array(self.x), axis=0))
+
+    @dense_sparse
+    def test_axis_1(self, array):
+        expected = np.nanmean(self.x, axis=1)
+        np.testing.assert_almost_equal(expected, nanmean(array(self.x), axis=1))
+
+
 class TestDigitize(unittest.TestCase):
     def setUp(self):
         # pylint: disable=bad-whitespace