Skip to content

Commit 63df7e1

Browse files
authored
Merge pull request #2572 from pavlin-policar/statistics-utils-unique
[FIX] Statistics.unique: Fix Sparse Return Order For Negative Numbers
2 parents 5f3d314 + 3e08ba0 commit 63df7e1

File tree

2 files changed

+72
-47
lines changed

2 files changed

+72
-47
lines changed

Orange/statistics/util.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -437,22 +437,31 @@ def unique(x, return_counts=False):
437437
r = np.unique(x.data, return_counts=return_counts)
438438
if not implicit_zeros:
439439
return r
440+
440441
if return_counts:
442+
zero_index = np.searchsorted(r[0], 0)
441443
if explicit_zeros:
442444
r[1][r[0] == 0.] += implicit_zeros
443445
return r
444-
return np.insert(r[0], 0, 0), np.insert(r[1], 0, implicit_zeros)
446+
return np.insert(r[0], zero_index, 0), np.insert(r[1], zero_index, implicit_zeros)
445447
else:
446448
if explicit_zeros:
447449
return r
448-
return np.insert(r, 0, 0)
450+
zero_index = np.searchsorted(r, 0)
451+
return np.insert(r, zero_index, 0)
449452

450453

451-
def nanunique(x):
454+
def nanunique(*args, **kwargs):
452455
""" Return unique values while disregarding missing (np.nan) values.
453456
Supports sparse or dense matrices. """
454-
r = unique(x)
455-
return r[~np.isnan(r)]
457+
result = unique(*args, **kwargs)
458+
459+
if isinstance(result, tuple):
460+
result, counts = result
461+
non_nan_mask = ~np.isnan(result)
462+
return result[non_nan_mask], counts[non_nan_mask]
463+
464+
return result[~np.isnan(result)]
456465

457466

458467
def digitize(x, bins, right=False):

Orange/tests/test_statistics.py

Lines changed: 58 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
import unittest
2-
import warnings
3-
from functools import wraps, partial
42
from itertools import chain
3+
from functools import partial, wraps
54

65
import numpy as np
7-
import scipy as sp
8-
from scipy.sparse import csr_matrix, issparse, csc_matrix
6+
from scipy.sparse import csr_matrix, issparse, lil_matrix, csc_matrix
97

108
from Orange.statistics.util import bincount, countnans, contingency, stats, \
119
nanmin, nanmax, unique, nanunique, mean, nanmean, digitize, var
@@ -128,44 +126,6 @@ def test_nanmin_nanmax(self):
128126
nanmax(X_sparse, axis=axis),
129127
np.nanmax(X, axis=axis))
130128

131-
def test_unique(self):
132-
for X in self.data:
133-
X_sparse = csr_matrix(X)
134-
np.testing.assert_array_equal(
135-
unique(X_sparse, return_counts=False),
136-
np.unique(X, return_counts=False))
137-
138-
for a1, a2 in zip(unique(X_sparse, return_counts=True),
139-
np.unique(X, return_counts=True)):
140-
np.testing.assert_array_equal(a1, a2)
141-
142-
def test_unique_explicit_zeros(self):
143-
x1 = csr_matrix(np.eye(3))
144-
x2 = csr_matrix(np.eye(3))
145-
146-
# set some off-diagonal entries to explicit zeros
147-
with warnings.catch_warnings():
148-
warnings.filterwarnings("ignore",
149-
category=sp.sparse.SparseEfficiencyWarning)
150-
x2[0, 1] = 0
151-
x2[1, 0] = 0
152-
153-
np.testing.assert_array_equal(
154-
unique(x1, return_counts=False),
155-
unique(x2, return_counts=False),
156-
)
157-
np.testing.assert_array_equal(
158-
unique(x1, return_counts=True),
159-
unique(x2, return_counts=True),
160-
)
161-
162-
def test_nanunique(self):
163-
x = csr_matrix(np.array([0, 1, 1, np.nan]))
164-
np.testing.assert_array_equal(
165-
nanunique(x),
166-
np.array([0, 1])
167-
)
168-
169129
def test_mean(self):
170130
for X in self.data:
171131
X_sparse = csr_matrix(X)
@@ -420,3 +380,59 @@ def test_weights_with_transposed_x(self, array):
420380

421381
expected = [3, 0, 2, 1]
422382
np.testing.assert_equal(bincount(x, w)[0], expected)
383+
384+
385+
class TestUnique(unittest.TestCase):
386+
@dense_sparse
387+
def test_returns_unique_values(self, array):
388+
# pylint: disable=bad-whitespace
389+
x = array([[-1., 1., 0., 2., 3., np.nan],
390+
[ 0., 0., 0., 3., 5., np.nan],
391+
[-1., 0., 0., 1., 7., 6.]])
392+
expected = [-1, 0, 1, 2, 3, 5, 6, 7, np.nan, np.nan]
393+
394+
np.testing.assert_equal(unique(x, return_counts=False), expected)
395+
396+
@dense_sparse
397+
def test_returns_counts(self, array):
398+
# pylint: disable=bad-whitespace
399+
x = array([[-1., 1., 0., 2., 3., np.nan],
400+
[ 0., 0., 0., 3., 5., np.nan],
401+
[-1., 0., 0., 1., 7., 6.]])
402+
expected = [2, 6, 2, 1, 2, 1, 1, 1, 1, 1]
403+
404+
np.testing.assert_equal(unique(x, return_counts=True)[1], expected)
405+
406+
def test_sparse_explicit_zeros(self):
407+
# Build with `lil_matrix` to avoid the SparseEfficiencyWarning raised when assigning entries of a `csr_matrix`
408+
x = lil_matrix(np.eye(3))
409+
x[0, 1] = 0
410+
x[1, 0] = 0
411+
x = x.tocsr()
412+
# Test against identity matrix
413+
y = csr_matrix(np.eye(3))
414+
415+
np.testing.assert_array_equal(
416+
unique(y, return_counts=True),
417+
unique(x, return_counts=True),
418+
)
419+
420+
@dense_sparse
421+
def test_nanunique_ignores_nans_in_values(self, array):
422+
# pylint: disable=bad-whitespace
423+
x = array([[-1., 1., 0., 2., 3., np.nan],
424+
[ 0., 0., 0., 3., 5., np.nan],
425+
[-1., 0., 0., 1., 7., 6.]])
426+
expected = [-1, 0, 1, 2, 3, 5, 6, 7]
427+
428+
np.testing.assert_equal(nanunique(x, return_counts=False), expected)
429+
430+
@dense_sparse
431+
def test_nanunique_ignores_nans_in_counts(self, array):
432+
# pylint: disable=bad-whitespace
433+
x = array([[-1., 1., 0., 2., 3., np.nan],
434+
[ 0., 0., 0., 3., 5., np.nan],
435+
[-1., 0., 0., 1., 7., 6.]])
436+
expected = [2, 6, 2, 1, 2, 1, 1, 1]
437+
438+
np.testing.assert_equal(nanunique(x, return_counts=True)[1], expected)

0 commit comments

Comments
 (0)