Merge pull request #2572 from pavlin-policar/statistics-utils-unique

nikicc · web-flow · commit 63df7e1e3250 · 2017-10-21T22:22:41.000+02:00
[FIX] Statistics.unique: Fix Sparse Return Order For Negative Numbers
diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py
@@ -437,22 +437,31 @@ def unique(x, return_counts=False):
     r = np.unique(x.data, return_counts=return_counts)
     if not implicit_zeros:
         return r
+
     if return_counts:
+        zero_index = np.searchsorted(r[0], 0)
         if explicit_zeros:
             r[1][r[0] == 0.] += implicit_zeros
             return r
-        return np.insert(r[0], 0, 0), np.insert(r[1], 0, implicit_zeros)
+        return np.insert(r[0], zero_index, 0), np.insert(r[1], zero_index, implicit_zeros)
     else:
         if explicit_zeros:
             return r
-        return np.insert(r, 0, 0)
+        zero_index = np.searchsorted(r, 0)
+        return np.insert(r, zero_index, 0)
 
 
-def nanunique(x):
+def nanunique(*args, **kwargs):
     """ Return unique values while disregarding missing (np.nan) values.
     Supports sparse or dense matrices. """
-    r = unique(x)
-    return r[~np.isnan(r)]
+    result = unique(*args, **kwargs)
+
+    if isinstance(result, tuple):
+        result, counts = result
+        non_nan_mask = ~np.isnan(result)
+        return result[non_nan_mask], counts[non_nan_mask]
+
+    return result[~np.isnan(result)]
 
 
 def digitize(x, bins, right=False):
diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py
@@ -1,11 +1,9 @@
 import unittest
-import warnings
-from functools import wraps, partial
 from itertools import chain
+from functools import partial, wraps
 
 import numpy as np
-import scipy as sp
-from scipy.sparse import csr_matrix, issparse, csc_matrix
+from scipy.sparse import csr_matrix, issparse, lil_matrix, csc_matrix
 
 from Orange.statistics.util import bincount, countnans, contingency, stats, \
     nanmin, nanmax, unique, nanunique, mean, nanmean, digitize, var
@@ -128,44 +126,6 @@ def test_nanmin_nanmax(self):
                     nanmax(X_sparse, axis=axis),
                     np.nanmax(X, axis=axis))
 
-    def test_unique(self):
-        for X in self.data:
-            X_sparse = csr_matrix(X)
-            np.testing.assert_array_equal(
-                unique(X_sparse, return_counts=False),
-                np.unique(X, return_counts=False))
-
-            for a1, a2 in zip(unique(X_sparse, return_counts=True),
-                              np.unique(X, return_counts=True)):
-                np.testing.assert_array_equal(a1, a2)
-
-    def test_unique_explicit_zeros(self):
-        x1 = csr_matrix(np.eye(3))
-        x2 = csr_matrix(np.eye(3))
-
-        # set some of-diagonal to explicit zeros
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore",
-                                    category=sp.sparse.SparseEfficiencyWarning)
-            x2[0, 1] = 0
-            x2[1, 0] = 0
-
-        np.testing.assert_array_equal(
-            unique(x1, return_counts=False),
-            unique(x2, return_counts=False),
-        )
-        np.testing.assert_array_equal(
-            unique(x1, return_counts=True),
-            unique(x2, return_counts=True),
-        )
-
-    def test_nanunique(self):
-        x = csr_matrix(np.array([0, 1, 1, np.nan]))
-        np.testing.assert_array_equal(
-            nanunique(x),
-            np.array([0, 1])
-        )
-
     def test_mean(self):
         for X in self.data:
             X_sparse = csr_matrix(X)
@@ -420,3 +380,59 @@ def test_weights_with_transposed_x(self, array):
 
         expected = [3, 0, 2, 1]
         np.testing.assert_equal(bincount(x, w)[0], expected)
+
+
+class TestUnique(unittest.TestCase):
+    @dense_sparse
+    def test_returns_unique_values(self, array):
+        # pylint: disable=bad-whitespace
+        x = array([[-1., 1., 0., 2., 3., np.nan],
+                   [ 0., 0., 0., 3., 5., np.nan],
+                   [-1., 0., 0., 1., 7.,     6.]])
+        expected = [-1, 0, 1, 2, 3, 5, 6, 7, np.nan, np.nan]
+
+        np.testing.assert_equal(unique(x, return_counts=False), expected)
+
+    @dense_sparse
+    def test_returns_counts(self, array):
+        # pylint: disable=bad-whitespace
+        x = array([[-1., 1., 0., 2., 3., np.nan],
+                   [ 0., 0., 0., 3., 5., np.nan],
+                   [-1., 0., 0., 1., 7.,     6.]])
+        expected = [2, 6, 2, 1, 2, 1, 1, 1, 1, 1]
+
+        np.testing.assert_equal(unique(x, return_counts=True)[1], expected)
+
+    def test_sparse_explicit_zeros(self):
+        # Use `lil_matrix` to fix sparse warning for matrix construction
+        x = lil_matrix(np.eye(3))
+        x[0, 1] = 0
+        x[1, 0] = 0
+        x = x.tocsr()
+        # Test against identity matrix
+        y = csr_matrix(np.eye(3))
+
+        np.testing.assert_array_equal(
+            unique(y, return_counts=True),
+            unique(x, return_counts=True),
+        )
+
+    @dense_sparse
+    def test_nanunique_ignores_nans_in_values(self, array):
+        # pylint: disable=bad-whitespace
+        x = array([[-1., 1., 0., 2., 3., np.nan],
+                   [ 0., 0., 0., 3., 5., np.nan],
+                   [-1., 0., 0., 1., 7.,     6.]])
+        expected = [-1, 0, 1, 2, 3, 5, 6, 7]
+
+        np.testing.assert_equal(nanunique(x, return_counts=False), expected)
+
+    @dense_sparse
+    def test_nanunique_ignores_nans_in_counts(self, array):
+        # pylint: disable=bad-whitespace
+        x = array([[-1., 1., 0., 2., 3., np.nan],
+                   [ 0., 0., 0., 3., 5., np.nan],
+                   [-1., 0., 0., 1., 7.,     6.]])
+        expected = [2, 6, 2, 1, 2, 1, 1, 1]
+
+        np.testing.assert_equal(nanunique(x, return_counts=True)[1], expected)