Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
b39db6e
Statistics.countnans: Fix sparse implementation and add axis support
pavlin-policar Sep 4, 2017
1d2bee0
Statistics.bincount: Fix sparse implementation
pavlin-policar Sep 8, 2017
ef2ba73
Statistics.tests: Implement dense_sparse decorator
pavlin-policar Sep 8, 2017
ee8634b
Statistics.countnans: Support 2d weights for sparse matrices
pavlin-policar Sep 8, 2017
941bd2b
Statistics.digitize: Move tests to own class and use dense_sparse dec…
pavlin-policar Sep 8, 2017
ea74b94
Statistics.bincount: Add weight support to sparse, add docstring
pavlin-policar Sep 8, 2017
ab5cc8b
Statistics: Implement sparse_count_zeros
pavlin-policar Sep 8, 2017
b4eb25a
Statistics.countnans: Add dtype param support to sparse
pavlin-policar Sep 8, 2017
ca4c80f
Table._compute_distributions: Fix parameter ordering to bincount call
pavlin-policar Sep 8, 2017
09ddc33
Statistics.sparse_has_zeros: Make public
pavlin-policar Sep 9, 2017
0057143
Table._compute_distributions: Correctly count zeros in sparse continu…
pavlin-policar Sep 9, 2017
a21af1a
DomainDistribution: Change tests to check for true zero counts
pavlin-policar Sep 9, 2017
d7d91c8
TestNormalize: Fix failing test due to previous handling of zeros in …
pavlin-policar Sep 9, 2017
afa3df8
Statistics.countnans: Fix copy=False param from coo.tocsr call
pavlin-policar Sep 9, 2017
6f12808
Pylint: Add pylint ignores to more human-friendly formatted matrices
pavlin-policar Sep 9, 2017
dd516a7
Statistics.countnans: Support csc_matrices
pavlin-policar Oct 20, 2017
e515f30
Statistics: Rename sparse_zeros to sparse_implicit_zeros
pavlin-policar Oct 20, 2017
e4206e2
Statistics.tests: Inject explicit zeros into dense_sparse decorator
pavlin-policar Oct 20, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 46 additions & 14 deletions Orange/statistics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,35 +30,67 @@ def _count_nans_per_row_sparse(X, weights):
return np.fromiter((np.isnan(row.data).sum() for row in X), dtype=np.float)


def bincount(X, max_val=None, weights=None, minlength=None):
def bincount(x, weights=None, max_val=None, minlength=None):
"""Return counts of values in array X.

Works kind of like np.bincount(), except that it also supports floating
arrays with nans.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you be willing to also document the max_val argument? This doesn't seem to be a numpy argument, and it's not obvious what it is for.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added thorough documentation.

Parameters
----------
x : array_like, 1 dimension, nonnegative ints
Input array.
weights : array_like, optional
Weights, array of the same shape as x.
max_val : int, optional
Indicates the maximum value we expect to find in x and sets the result
array size accordingly. E.g. if we set `max_val=2` yet the largest
value in x is 1, the result will still contain a bin for the value 2,
whose count will be 0. See examples for usage.
minlength : int, optional
A minimum number of bins for the output array. See numpy docs for info.

Returns
-------
Tuple[np.ndarray, int]
Returns the bincounts and the number of NaN values.

Examples
--------
In case `max_val` is provided, the return shape includes bins for these
values as well, even if they do not appear in the data. However, this will
not truncate the bincount if values larger than `max_val` are found.
>>> bincount([0, 0, 1, 1, 2], max_val=4)
(array([ 2., 2., 1., 0., 0.]), 0.0)
>>> bincount([0, 1, 2, 3, 4], max_val=2)
(array([ 1., 1., 1., 1., 1.]), 0.0)

"""
# Store the original matrix before any manipulation to check for sparse
X_ = X
if sp.issparse(X):
weights = weights[X.indices] if weights is not None else weights
X = X.data

X = np.asanyarray(X)
if X.dtype.kind == 'f' and bn.anynan(X):
nonnan = ~np.isnan(X)
X = X[nonnan]
x_original = x
if sp.issparse(x):
n_items = np.prod(x_original.shape)
zero_indices = np.setdiff1d(np.arange(n_items), x.indices, assume_unique=True)

if weights is not None:
zero_weights = weights[zero_indices].sum()
weights = weights[x.indices]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does x.indices work here for both csc and csr? If not, please cast it to whichever format (csc/csr) is required beforehand.

else:
zero_weights = np.prod(x_original.shape) - x_original.nnz

x = x.data

x = np.asanyarray(x)
if x.dtype.kind == 'f' and bn.anynan(x):
nonnan = ~np.isnan(x)
x = x[nonnan]
if weights is not None:
nans = (~nonnan * weights).sum(axis=0)
weights = weights[nonnan]
else:
nans = (~nonnan).sum(axis=0)
else:
nans = 0. if X.ndim == 1 else np.zeros(X.shape[1], dtype=float)
nans = 0. if x.ndim == 1 else np.zeros(x.shape[1], dtype=float)

if minlength is None and max_val is not None:
minlength = max_val + 1
Expand All @@ -67,12 +99,12 @@ def bincount(X, max_val=None, weights=None, minlength=None):
bc = np.array([])
else:
bc = np.bincount(
X.astype(np.int32, copy=False), weights=weights, minlength=minlength
x.astype(np.int32, copy=False), weights=weights, minlength=minlength
).astype(float)
# Since `csr_matrix.data` only contains the explicitly stored values, we must
# count the implicit zeros separately and set the appropriate bin
if sp.issparse(X_):
bc[0] = np.prod(X_.shape) - X_.nnz
if sp.issparse(x_original):
bc[0] = zero_weights
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be bc[0] = bc[0] + zero_weights to account for explicit zeros stored in x.data.


return bc, nans

Expand Down
32 changes: 32 additions & 0 deletions Orange/tests/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,35 @@ def test_maxval_adds_empty_bins(self, array):
expected = [0, 3, 2, 1, 0, 0]

np.testing.assert_equal(bincount(x, max_val=max_val)[0], expected)

@dense_sparse
def test_maxval_doesnt_truncate_values_when_too_small(self, array):
    """A `max_val` smaller than the largest value must not drop any bins."""
    # NOTE(review): `array` is injected by `dense_sparse` — presumably a
    # dense or sparse constructor; confirm against the decorator.
    data = array([1, 1, 1, 2, 3, 2])
    counts = bincount(data, max_val=1)[0]

    np.testing.assert_equal(counts, [0, 3, 2, 1])

@dense_sparse
def test_minlength_adds_empty_bins(self, array):
    """`minlength` pads the counts with empty bins up to the given size."""
    data = array([1, 1, 1, 2, 3, 2])
    counts = bincount(data, minlength=5)[0]

    np.testing.assert_equal(counts, [0, 3, 2, 1, 0])

@dense_sparse
def test_weights(self, array):
    """Each bin holds the sum of the weights of the values falling in it."""
    data = array([0, 0, 1, 1, 2, 2, 3, 3])
    weights = np.array([1, 2, 0, 0, 1, 1, 0, 1])

    counts = bincount(data, weights)[0]
    np.testing.assert_equal(counts, [3, 0, 2, 1])

@dense_sparse
def test_weights_with_nans(self, array):
    """Weights belonging to NaN entries must not contribute to any bin."""
    data = array([0, 0, 1, 1, np.nan, 2, np.nan, 3])
    weights = np.array([1, 2, 0, 0, 1, 1, 0, 1])

    counts = bincount(data, weights)[0]
    np.testing.assert_equal(counts, [3, 0, 1, 1])