biolab · pavlin-policar · Sep 4, 2017 · Sep 8, 2017 · Sep 8, 2017 · Sep 8, 2017
diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py
@@ -30,35 +30,67 @@ def _count_nans_per_row_sparse(X, weights):
     return np.fromiter((np.isnan(row.data).sum() for row in X), dtype=np.float)
 
 
-def bincount(X, max_val=None, weights=None, minlength=None):
+def bincount(x, weights=None, max_val=None, minlength=None):
     """Return counts of values in array X.
 
     Works kind of like np.bincount(), except that it also supports floating
     arrays with nans.
 
+    Parameters
+    ----------
+    x : array_like, 1 dimension, nonnegative ints
+        Input array; float arrays with NaNs and sparse matrices are accepted.
+    weights : array_like, optional
+        Weights, array of the same shape as x.
+    max_val : int, optional
+        Indicates the maximum value we expect to find in x and sets the result
+        array size accordingly. E.g. if we set `max_val=2` yet the largest
+        value in x is 1, the result will contain a bin for the value 2, and
+        that bin will be set to 0. See examples for usage.
+    minlength : int, optional
+        A minimum number of bins for the output array. See numpy docs for info.
+
     Returns
     -------
     Tuple[np.ndarray, int]
         Returns the bincounts and the number of NaN values.
 
+    Examples
+    --------
+    If `max_val` is provided, the result includes bins for all values up to
+    `max_val`, even if they do not appear in the data. However, this will
+    not truncate the bincount if values larger than `max_val` are found.
+    >>> bincount([0, 0, 1, 1, 2], max_val=4)
+    (array([ 2.,  2.,  1.,  0.,  0.]), 0.0)
+    >>> bincount([0, 1, 2, 3, 4], max_val=2)
+    (array([ 1.,  1.,  1.,  1.,  1.]), 0.0)
+
     """
     # Store the original matrix before any manipulation to check for sparse
-    X_ = X
-    if sp.issparse(X):
-        weights = weights[X.indices] if weights is not None else weights
-        X = X.data
-
-    X = np.asanyarray(X)
-    if X.dtype.kind == 'f' and bn.anynan(X):
-        nonnan = ~np.isnan(X)
-        X = X[nonnan]
+    x_original = x
+    if sp.issparse(x):
+        n_items = np.prod(x_original.shape)
+        zero_indices = np.setdiff1d(np.arange(n_items), x.indices, assume_unique=True)
+
+        if weights is not None:
+            zero_weights = weights[zero_indices].sum()
+            weights = weights[x.indices]
+        else:
+            zero_weights = np.prod(x_original.shape) - x_original.nnz
+
+        x = x.data
+
+    x = np.asanyarray(x)
+    if x.dtype.kind == 'f' and bn.anynan(x):
+        nonnan = ~np.isnan(x)
+        x = x[nonnan]
         if weights is not None:
             nans = (~nonnan * weights).sum(axis=0)
             weights = weights[nonnan]
         else:
             nans = (~nonnan).sum(axis=0)
     else:
-        nans = 0. if X.ndim == 1 else np.zeros(X.shape[1], dtype=float)
+        nans = 0. if x.ndim == 1 else np.zeros(x.shape[1], dtype=float)
 
     if minlength is None and max_val is not None:
         minlength = max_val + 1
@@ -67,12 +99,12 @@ def bincount(X, max_val=None, weights=None, minlength=None):
         bc = np.array([])
     else:
         bc = np.bincount(
-            X.astype(np.int32, copy=False), weights=weights, minlength=minlength
+            x.astype(np.int32, copy=False), weights=weights, minlength=minlength
         ).astype(float)
         # Since `csr_matrix.values` only contain non-zero values, we must count
         # those separately and set the appropriate bin
-        if sp.issparse(X_):
-            bc[0] = np.prod(X_.shape) - X_.nnz
+        if sp.issparse(x_original):
+            bc[0] = zero_weights
 
     return bc, nans
 

diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py
@@ -355,3 +355,35 @@ def test_maxval_adds_empty_bins(self, array):
         expected = [0, 3, 2, 1, 0, 0]
 
         np.testing.assert_equal(bincount(x, max_val=max_val)[0], expected)
+
+    @dense_sparse
+    def test_maxval_doesnt_truncate_values_when_too_small(self, array):
+        x = array([1, 1, 1, 2, 3, 2])
+        max_val = 1
+        expected = [0, 3, 2, 1]
+
+        np.testing.assert_equal(bincount(x, max_val=max_val)[0], expected)
+
+    @dense_sparse
+    def test_minlength_adds_empty_bins(self, array):
+        x = array([1, 1, 1, 2, 3, 2])
+        minlength = 5
+        expected = [0, 3, 2, 1, 0]
+
+        np.testing.assert_equal(bincount(x, minlength=minlength)[0], expected)
+
+    @dense_sparse
+    def test_weights(self, array):
+        x = array([0, 0, 1, 1, 2, 2, 3, 3])
+        w = np.array([1, 2, 0, 0, 1, 1, 0, 1])
+
+        expected = [3, 0, 2, 1]
+        np.testing.assert_equal(bincount(x, w)[0], expected)
+
+    @dense_sparse
+    def test_weights_with_nans(self, array):
+        x = array([0, 0, 1, 1, np.nan, 2, np.nan, 3])
+        w = np.array([1, 2, 0, 0, 1, 1, 0, 1])
+
+        expected = [3, 0, 1, 1]
+        np.testing.assert_equal(bincount(x, w)[0], expected)