biolab · nikicc · Jun 2, 2017 · May 30, 2017 · Jun 1, 2017 · Jun 2, 2017
diff --git a/Orange/preprocess/impute.py b/Orange/preprocess/impute.py
@@ -1,5 +1,5 @@
-import numpy
-from scipy.sparse import issparse
+import numpy as np
+import scipy.sparse as sp
 
 import Orange.data
 from Orange.statistics import distribution, basic_stats
@@ -26,11 +26,11 @@ def __init__(self, variable, value=0):
         self.value = value
 
     def transform(self, c):
-        if issparse(c):
-            c.data = numpy.where(numpy.isnan(c.data), self.value, c.data)
+        if sp.issparse(c):  # sparse: only explicitly stored entries are checked
+            c.data = np.where(np.isnan(c.data), self.value, c.data)  # rebinds c.data
             return c
         else:
-            return numpy.where(numpy.isnan(c), self.value, c)
+            return np.where(np.isnan(c), self.value, c)  # dense: returns a new array
 
 
 class BaseImputeMethod(Reprable):
@@ -83,7 +83,7 @@ class DropInstances(BaseImputeMethod):
 
     def __call__(self, data, variable):
         index = data.domain.index(variable)
-        return numpy.isnan(data[:, index]).reshape(-1)
+        return np.isnan(data[:, index]).reshape(-1)  # flattened mask, True where NaN
 
 
 class Average(BaseImputeMethod):
@@ -154,13 +154,13 @@ def __init__(self, variable, model):
 
     def __call__(self, data):
         if isinstance(data, Orange.data.Instance):
-            column = numpy.array([float(data[self.variable])])
+            column = np.array([float(data[self.variable])])
         else:
-            column = numpy.array(data.get_column_view(self.variable)[0],
+            column = np.array(data.get_column_view(self.variable)[0],
                                  copy=True)
 
-        mask = numpy.isnan(column)
-        if not numpy.any(mask):
+        mask = np.isnan(column)
+        if not np.any(mask):
             return column
 
         if isinstance(data, Orange.data.Instance):
@@ -224,7 +224,9 @@ def domain_with_class_var(domain, class_var):
 
 class IsDefined(Transformation):
     def transform(self, c):
-        return ~numpy.isnan(c)
+        if sp.issparse(c):  # np.isnan cannot be applied to a sparse matrix
+            c = c.toarray()  # implicit zeros become explicit, i.e. defined
+        return ~np.isnan(c)  # True where a value is present (not NaN)
 
 
 class AsValue(BaseImputeMethod):
@@ -243,7 +245,7 @@ def __call__(self, data, variable):
                 base_value=variable.base_value,
                 compute_value=Lookup(
                     variable,
-                    numpy.arange(len(variable.values), dtype=int),
+                    np.arange(len(variable.values), dtype=int),
                     unknown=len(variable.values))
                 )
             return var
@@ -281,29 +283,32 @@ def __init__(self, variable, distribution):
         self.distribution = distribution
 
         if variable.is_discrete:
-            counts = numpy.array(distribution)
+            counts = np.array(distribution)
         elif variable.is_continuous:
-            counts = numpy.array(distribution)[1, :]
+            counts = np.array(distribution)[1, :]
         else:
             raise TypeError("Only discrete and continuous "
                             "variables are supported")
-        csum = numpy.sum(counts)
+        csum = np.sum(counts)
         if csum > 0:
             self.sample_prob = counts / csum
         else:
-            self.sample_prob = numpy.ones_like(counts) / len(counts)
+            self.sample_prob = np.ones_like(counts) / len(counts)
 
     def transform(self, c):
-        c = numpy.array(c, copy=True)
-        nanindices = numpy.flatnonzero(numpy.isnan(c))
+        if not sp.issparse(c):
+            c = np.array(c, copy=True)
+        else:
+            c = c.toarray().ravel()
+        nanindices = np.flatnonzero(np.isnan(c))
 
         if self.variable.is_discrete:
-            sample = numpy.random.choice(
+            sample = np.random.choice(
                 len(self.variable.values), size=len(nanindices),
                 replace=True, p=self.sample_prob)
         else:
-            sample = numpy.random.choice(
-                numpy.asarray(self.distribution)[0, :], size=len(nanindices),
+            sample = np.random.choice(
+                np.asarray(self.distribution)[0, :], size=len(nanindices),
                 replace=True, p=self.sample_prob)
 
         c[nanindices] = sample
@@ -328,9 +333,9 @@ def __call__(self, data, variable):
             raise ValueError("'{}' has an unknown distribution"
                              .format(variable))
 
-        if variable.is_discrete and numpy.sum(dist) == 0:
+        if variable.is_discrete and np.sum(dist) == 0:
             dist += 1 / len(dist)
-        elif variable.is_continuous and numpy.sum(dist[1, :]) == 0:
+        elif variable.is_continuous and np.sum(dist[1, :]) == 0:
             dist[1, :] += 1 / dist.shape[1]
         return variable.copy(
             compute_value=ReplaceUnknownsRandom(variable, dist))
diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py
@@ -216,9 +216,9 @@ def weighted_mean():
         non_zero = np.bincount(X.nonzero()[1], minlength=X.shape[1])
         X = X.tocsc()
         return np.column_stack((
-            X.min(axis=0).toarray().ravel(),
-            X.max(axis=0).toarray().ravel(),
-            np.asarray(X.mean(axis=0)).ravel() if not weighted else weighted_mean(),
+            nanmin(X, axis=0),
+            nanmax(X, axis=0),
+            nanmean(X, axis=0) if not weighted else weighted_mean(),
             np.zeros(X.shape[1]),      # variance not supported
             X.shape[0] - non_zero,
             non_zero))
@@ -280,15 +280,31 @@ def mean(x):
     n_values = np.prod(x.shape)
     return np.sum(x.data) / n_values
 
 
-def nanmean(x):
+def nanmean(x, axis=None):
     """ Equivalent of np.nanmean that supports sparse or dense matrices. """
-    if not sp.issparse(x):
-        return np.nanmean(x)
-
-    n_values = np.prod(x.shape) - np.sum(np.isnan(x.data))
-    return np.nansum(x.data) / n_values
+    if not sp.issparse(x):
+        return np.nanmean(x, axis=axis)
+    if axis is None:
+        # Implicit zeros are values; only explicitly stored NaNs are skipped.
+        n_values = np.prod(x.shape) - np.sum(np.isnan(x.data))
+        return np.nansum(x.data) / n_values
+    if axis not in (0, 1):
+        raise NotImplementedError(
+            "nanmean on sparse data supports only axis=None, 0 or 1")
+
+    # Compress along the axis that is kept, so that indptr delimits the
+    # stored entries behind every resulting mean; then reduce with bincount
+    # instead of slicing the matrix row by row in Python.
+    major = x.tocsr() if axis == 1 else x.tocsc()
+    n_groups = len(major.indptr) - 1
+    group = np.repeat(np.arange(n_groups), np.diff(major.indptr))
+    is_nan = np.isnan(major.data)
+    sums = np.bincount(group, weights=np.where(is_nan, 0, major.data),
+                       minlength=n_groups)
+    nan_counts = np.bincount(group[is_nan], minlength=n_groups)
+    return sums / (x.shape[axis] - nan_counts)
 
 
 def unique(x, return_counts=False):
     """ Equivalent of np.unique that supports sparse or dense matrices. """

diff --git a/Orange/tests/test_impute.py b/Orange/tests/test_impute.py
@@ -158,7 +158,7 @@ def test_str(self):
 
 
 class TestAsValue(unittest.TestCase):
-    def test_replacement(self):
+    def _create_table(self):
         nan = np.nan
         X = [
             [1.0, nan, 0.0],
@@ -170,7 +170,11 @@ def test_replacement(self):
              data.ContinuousVariable("B"),
              data.ContinuousVariable("C"))
         )
-        table = data.Table.from_numpy(domain, np.array(X))
+        return data.Table.from_numpy(domain, np.array(X))
+
+    def test_replacement(self):
+        table = self._create_table()
+        domain = table.domain
 
         v1 = impute.AsValue()(table, domain[0])
         self.assertTrue(np.all(np.isfinite(v1.compute_value(table))))
@@ -200,6 +204,20 @@ def test_replacement(self):
              [3, 1.0, 0, 1.5, 0]]
         )
 
+    def test_sparse(self):
+        """
+        AsValue imputation must also work when the table's X is sparse.
+        GH-2357
+        """
+        table = self._create_table()
+        table.X = sp.csr_matrix(table.X)
+
+        _, indicator = impute.AsValue()(table, table.domain[1])
+        computed = indicator.compute_value(table)
+        self.assertTrue(np.all(np.isfinite(computed)))
+        self.assertEqual([indicator.str_val(v) for v in computed],
+                         ["undef", "def", "undef"])
+
 
 class TestModel(unittest.TestCase):
     def test_replacement(self):

diff --git a/Orange/tests/test_util.py b/Orange/tests/test_util.py
@@ -7,7 +7,9 @@
 
 from Orange.util import export_globals, flatten, deprecated, try_, deepgetattr, \
     OrangeDeprecationWarning
+from Orange.data import Table
 from Orange.data.util import vstack, hstack
+from Orange.statistics.util import stats
 
 SOMETHING = 0xf00babe
 
@@ -106,3 +108,12 @@ def assertCorrectArrayType(self, array, shape, sparsity):
     def test_raise_deprecations(self):
         with self.assertRaises(OrangeDeprecationWarning):
             warnings.warn('foo', OrangeDeprecationWarning)
+
+    def test_stats_sparse(self):
+        """
+        Sparse stats must match dense stats on the min, max and mean columns.
+        Variance is not supported for sparse data, hence [:, :3]. GH-2357
+        """
+        data = Table("iris")
+        sparse_x = sp.csr_matrix(data.X)
+        np.testing.assert_allclose(stats(sparse_x)[:, :3], stats(data.X)[:, :3])
diff --git a/Orange/widgets/data/owimpute.py b/Orange/widgets/data/owimpute.py
@@ -65,6 +65,7 @@ class OWImpute(OWWidget):
 
     class Error(OWWidget.Error):
         imputation_failed = Msg("Imputation failed for '{}'")
+        model_based_imputer_sparse = Msg("Model based imputer does not work for sparse data")
 
     DEFAULT_LEARNER = SimpleTreeLearner()
     METHODS = [AsDefault(), impute.DoNotImpute(), impute.Average(),
@@ -258,9 +259,13 @@ def commit(self):
 
             self.warning()
             self.Error.imputation_failed.clear()
+            self.Error.model_based_imputer_sparse.clear()
             with self.progressBar(len(self.varmodel)) as progress:
                 for i, var in enumerate(self.varmodel):
                     method = self.variable_methods.get(i, self.default_method)
+                    if isinstance(method, impute.Model) and data.is_sparse():
+                        self.Error.model_based_imputer_sparse()
+                        continue
 
                     try:
                         if not method.supports_variable(var):