Skip to content

Commit 5f3d314

Browse files
authored
Merge pull request #2698 from pavlin-policar/statistics-countnans-bincount
[FIX] Statistics.countnans/bincount: Fix NaN Counting, Consider Implicit Zeros
2 parents 7e64078 + b7bc576 commit 5f3d314

File tree

6 files changed

+526
-200
lines changed

6 files changed

+526
-200
lines changed

Orange/data/table.py

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
)
2020
from Orange.data.util import SharedComputeValue, vstack, hstack
2121
from Orange.statistics.util import bincount, countnans, contingency, \
22-
stats as fast_stats
22+
stats as fast_stats, sparse_has_implicit_zeros, sparse_count_implicit_zeros, \
23+
sparse_implicit_zero_weights
2324
from Orange.util import flatten
2425

2526
__all__ = ["dataset_dirs", "get_sample_datasets_dir", "RowInstance", "Table"]
@@ -1384,42 +1385,58 @@ def _compute_distributions(self, columns=None):
13841385
columns = range(len(self.domain.variables))
13851386
else:
13861387
columns = [self.domain.index(var) for var in columns]
1388+
13871389
distributions = []
13881390
if sp.issparse(self.X):
13891391
self.X = self.X.tocsc()
1392+
13901393
W = self.W.ravel() if self.has_weights() else None
1394+
13911395
for col in columns:
1392-
var = self.domain[col]
1396+
variable = self.domain[col]
1397+
1398+
# Select the correct data column from X, Y or metas
13931399
if 0 <= col < self.X.shape[1]:
1394-
m = self.X[:, col]
1400+
x = self.X[:, col]
13951401
elif col < 0:
1396-
m = self.metas[:, col * (-1) - 1]
1397-
if np.issubdtype(m.dtype, np.dtype(object)):
1398-
m = m.astype(float)
1402+
x = self.metas[:, col * (-1) - 1]
1403+
if np.issubdtype(x.dtype, np.dtype(object)):
1404+
x = x.astype(float)
13991405
else:
1400-
m = self._Y[:, col - self.X.shape[1]]
1401-
if var.is_discrete:
1402-
dist, unknowns = bincount(m, len(var.values) - 1, W)
1403-
elif not m.shape[0]:
1406+
x = self._Y[:, col - self.X.shape[1]]
1407+
1408+
if variable.is_discrete:
1409+
dist, unknowns = bincount(x, weights=W, max_val=len(variable.values) - 1)
1410+
elif not x.shape[0]:
14041411
dist, unknowns = np.zeros((2, 0)), 0
14051412
else:
14061413
if W is not None:
1407-
unknowns = countnans(m, W)
1408-
if sp.issparse(m):
1409-
arg_sort = np.argsort(m.data)
1410-
ranks = m.indices[arg_sort]
1411-
vals = np.vstack((m.data[arg_sort], W[ranks]))
1414+
if sp.issparse(x):
1415+
arg_sort = np.argsort(x.data)
1416+
ranks = x.indices[arg_sort]
1417+
vals = np.vstack((x.data[arg_sort], W[ranks]))
14121418
else:
1413-
ranks = np.argsort(m)
1414-
vals = np.vstack((m[ranks], W[ranks]))
1419+
ranks = np.argsort(x)
1420+
vals = np.vstack((x[ranks], W[ranks]))
14151421
else:
1416-
unknowns = countnans(m.astype(float))
1417-
if sp.issparse(m):
1418-
m = m.data
1419-
vals = np.ones((2, m.shape[0]))
1420-
vals[0, :] = m
1422+
x_values = x.data if sp.issparse(x) else x
1423+
vals = np.ones((2, x_values.shape[0]))
1424+
vals[0, :] = x_values
14211425
vals[0, :].sort()
1426+
14221427
dist = np.array(_valuecount.valuecount(vals))
1428+
# If sparse, then 0s will not be counted with `valuecount`, so
1429+
# we have to add them to the result manually.
1430+
if sp.issparse(x) and sparse_has_implicit_zeros(x):
1431+
if W is not None:
1432+
zero_weights = sparse_implicit_zero_weights(x, W).sum()
1433+
else:
1434+
zero_weights = sparse_count_implicit_zeros(x)
1435+
zero_vec = [0, zero_weights]
1436+
dist = np.insert(dist, np.searchsorted(dist[0], 0), zero_vec, axis=1)
1437+
# Since `countnans` assumes vector shape to be (1, n) and `x`
1438+
# shape is (n, 1), we pass the transpose
1439+
unknowns = countnans(x.T, W)
14231440
distributions.append((dist, unknowns))
14241441

14251442
return distributions

Orange/statistics/distribution.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,8 @@ def mean(self):
272272
return np.average(np.asarray(self[0]), weights=np.asarray(self[1]))
273273

274274
def variance(self):
    """Return the weighted (population) variance of the distribution.

    ``self[0]`` holds the distinct values and ``self[1]`` their weights,
    the same layout that ``mean`` relies on. The result is the weighted
    average of squared deviations from the weighted mean.
    """
    values = np.asarray(self[0])
    weights = np.asarray(self[1])
    mean = self.mean()
    # Vectorized on purpose: mirrors `mean`, which already uses np.average,
    # and avoids a Python-level loop over array elements.
    return np.average((values - mean) ** 2, weights=weights)
277277

278278
def standard_deviation(self):
279279
return math.sqrt(self.variance())

Orange/statistics/util.py

Lines changed: 158 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5,85 +5,199 @@
55
It also patches bottleneck to contain these functions.
66
"""
77
from warnings import warn
8-
import numpy as np
9-
import scipy.sparse as sp
8+
109
import bottleneck as bn
10+
import numpy as np
11+
from scipy import sparse as sp
1112

1213

13-
def _count_nans_per_row_sparse(X, weights, dtype=None):
    """Count the NaN (undefined) values in each row of a sparse matrix.

    Parameters
    ----------
    X : scipy.sparse matrix
        Only explicitly stored entries can be NaN, so only ``X.data`` is
        inspected.
    weights : np.ndarray or None
        ``None`` counts every NaN as 1. A 1-d array supplies one weight per
        row; a 2-d array supplies one weight per cell and is indexed with
        the (row, column) position of each NaN.
    dtype : dtype, optional
        Data type of the returned array; float64 when omitted.

    Returns
    -------
    np.ndarray
        1-d array of length ``X.shape[0]`` holding the (weighted) number of
        NaNs found in each row.
    """
    # COO exposes the row/column index of every stored entry, which lets us
    # aggregate with one `np.bincount` instead of looping over rows in Python.
    X = X.tocoo(copy=False)
    nan_mask = np.isnan(X.data)
    nan_rows = X.row[nan_mask]

    if weights is None:
        nan_weights = None
    elif weights.ndim == 1:
        nan_weights = weights[nan_rows]
    else:
        nan_weights = weights[nan_rows, X.col[nan_mask]]

    counts = np.bincount(nan_rows, weights=nan_weights, minlength=X.shape[0])
    return counts.astype(np.float64 if dtype is None else dtype, copy=False)
2332

24-
def bincount(X, max_val=None, weights=None, minlength=None):
33+
34+
def sparse_count_implicit_zeros(x):
    """Count the number of implicit zeros in a sparse matrix.

    Implicit zeros are the cells that are not stored at all; explicitly
    stored zeros are part of ``x.nnz`` and are therefore not counted here.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.
    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')
    # Multiply with Python ints: `np.prod` uses the platform's default
    # integer and may overflow for very large (but very sparse) shapes.
    n_cells = 1
    for dim in x.shape:
        n_cells *= int(dim)
    return n_cells - int(x.nnz)
39+
40+
41+
def sparse_has_implicit_zeros(x):
    """Check if a sparse matrix contains any implicit zeros.

    Returns True when at least one cell of `x` is not explicitly stored.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.
    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')
    # Multiply with Python ints: `np.prod` uses the platform's default
    # integer and may overflow for very large (but very sparse) shapes.
    n_cells = 1
    for dim in x.shape:
        n_cells *= int(dim)
    return n_cells != int(x.nnz)
46+
47+
48+
def sparse_implicit_zero_weights(x, weights):
    """Extract the weight of every implicit zero in a sparse matrix.

    Parameters
    ----------
    x : scipy.sparse matrix
    weights : array_like, 1 dimension
        One weight per row of `x` or, if the length does not match the
        number of rows, one weight per column.

    Returns
    -------
    np.ndarray
        The weights of the implicit zeros, ordered by the index of the
        row (or column) they belong to. A row with ``k`` implicit zeros
        contributes its weight ``k`` times, so ``.sum()`` of the result is
        the total weight of all implicit zeros.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.
    ValueError
        If the length of `weights` matches neither axis of `x`.
    NotImplementedError
        If `weights` is not 1-dimensional.
    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')

    weights = np.asarray(weights)
    if weights.ndim != 1:
        # Can easily be implemented using a coo_matrix
        raise NotImplementedError(
            'Computing zero weights on n-dimensional weight matrix is not implemented'
        )

    # Pick the compressed format whose `indices` run along the weighted axis
    n_weights = weights.shape[0]
    if x.shape[0] == n_weights:
        cells_per_item = x.shape[1]
        x = x.tocsc()
    elif x.shape[1] == n_weights:
        cells_per_item = x.shape[0]
        x = x.tocsr()
    else:
        raise ValueError(
            'Length of weights (%d) matches neither axis of x %r'
            % (n_weights, x.shape)
        )

    # Duplicate entries would be counted twice below; merge them on a copy
    # so the caller's matrix is left untouched.
    if not x.has_canonical_format:
        x = x.copy()
        x.sum_duplicates()

    stored_per_item = np.bincount(x.indices, minlength=n_weights)
    return np.repeat(weights, cells_per_item - stored_per_item)
67+
68+
69+
def bincount(x, weights=None, max_val=None, minlength=None):
    """Return counts of values in array X.

    Works kind of like np.bincount(), except that it also supports floating
    arrays with nans and sparse matrices (whose implicit zeros are counted
    in bin 0).

    Parameters
    ----------
    x : array_like, 1 dimension, nonnegative ints
        Input array.
    weights : array_like, optional
        Weights, array of the same shape as x.
    max_val : int, optional
        Indicates the maximum value we expect to find in X and sets the result
        array size accordingly. E.g. if we set `max_val=2` yet the largest
        value in X is 1, the result will contain a bin for the value 2, and
        will be set to 0. See examples for usage.
    minlength : int, optional
        A minimum number of bins for the output array. See numpy docs for info.

    Returns
    -------
    Tuple[np.ndarray, int]
        Returns the bincounts and the number of NaN values.

    Examples
    --------
    In case `max_val` is provided, the return shape includes bins for these
    values as well, even if they do not appear in the data. However, this will
    not truncate the bincount if values larger than `max_val` are found.
    >>> bincount([0, 0, 1, 1, 2], max_val=4)
    (array([ 2.,  2.,  1.,  0.,  0.]), 0.0)
    >>> bincount([0, 1, 2, 3, 4], max_val=2)
    (array([ 1.,  1.,  1.,  1.,  1.]), 0.0)

    """
    # Remember whether the input was sparse before `x` is replaced by its data
    is_sparse = sp.issparse(x)
    zero_weights = 0
    if is_sparse:
        if weights is not None:
            # Match weights and x axis so `indices` will be set appropriately
            if x.shape[0] == weights.shape[0]:
                x = x.tocsc()
            elif x.shape[1] == weights.shape[0]:
                x = x.tocsr()

            zero_weights = sparse_implicit_zero_weights(x, weights).sum()
            weights = weights[x.indices]
        else:
            zero_weights = sparse_count_implicit_zeros(x)

        x = x.data

    x = np.asanyarray(x)
    if x.dtype.kind == 'f' and bn.anynan(x):
        nonnan = ~np.isnan(x)
        x = x[nonnan]
        if weights is not None:
            nans = (~nonnan * weights).sum(axis=0)
            weights = weights[nonnan]
        else:
            nans = (~nonnan).sum(axis=0)
    else:
        nans = 0. if x.ndim == 1 else np.zeros(x.shape[1], dtype=float)

    if minlength is None and max_val is not None:
        minlength = max_val + 1

    if minlength is not None and minlength <= 0:
        bc = np.array([])
    else:
        bc = np.bincount(
            x.astype(np.int32, copy=False),
            weights=weights,
            # np.bincount requires an integer here; 0 means "no minimum"
            minlength=0 if minlength is None else minlength,
        ).astype(float)
        # Since `csr_matrix.data` only contains non-zero values or explicit
        # zeros, we must count implicit zeros separately and add them to the
        # explicit ones found before
        if is_sparse and zero_weights:
            if bc.size == 0:
                # No stored value was countable (empty or all-NaN data), so
                # the zero bin does not exist yet.
                bc = np.array([zero_weights], dtype=float)
            else:
                bc[0] += zero_weights

    return bc, nans
55150

56151

57-
def countnans(x, weights=None, axis=None, dtype=None, keepdims=False):
    """
    Count the undefined elements in an array along given axis.

    Parameters
    ----------
    x : array_like
        Dense array or scipy sparse matrix.
    weights : array_like, optional
        Weights to weight the nans with, before or after counting (depending
        on the weights shape).
    axis : int, optional
        For sparse input only `None`, 0 and 1 are supported.
    dtype : dtype, optional
        The data type of the returned array.
    keepdims : bool, optional
        Forwarded to `ndarray.sum` for dense input; not used for sparse input.

    Returns
    -------
    Union[np.ndarray, float]

    Raises
    ------
    ValueError
        For sparse input, if `axis` is not one of `None`, 0, 1, or if
        `axis=1` is requested for a matrix with a single row.

    """
    if not sp.issparse(x):
        x = np.asanyarray(x)
        isnan = np.isnan(x)
        if weights is not None and weights.shape == x.shape:
            isnan = isnan * weights

        counts = isnan.sum(axis=axis, dtype=dtype, keepdims=keepdims)
        if weights is not None and weights.shape != x.shape:
            counts = counts * weights
        return counts

    # Explicit raise instead of `assert`: asserts vanish under `python -O`
    if axis not in (None, 0, 1):
        raise ValueError('Only axis 0 and 1 are currently supported')
    # To have consistent behaviour with dense matrices, raise error when
    # `axis=1` and the array is 1d (e.g. [[1 2 3]])
    if x.shape[0] == 1 and axis == 1:
        raise ValueError('Axis %d is out of bounds' % axis)

    # The helper counts per row, so put the requested axis into the rows
    arr = x if axis == 1 else x.T

    if weights is not None:
        weights = weights if axis == 1 else weights.T

    arr = arr.tocsr()
    counts = _count_nans_per_row_sparse(arr, weights, dtype=dtype)

    # We want a scalar value if `axis=None` or if the sparse matrix is
    # actually a vector (e.g. [[1 2 3]]), but has `ndim=2` due to scipy
    # implementation
    if axis is None or x.shape[0] == 1:
        counts = counts.sum(dtype=dtype)

    return counts
88202

89203

@@ -234,17 +348,12 @@ def weighted_mean():
234348
X.shape[0] - nans))
235349

236350

237-
def _sparse_has_zeros(x):
238-
""" Check if sparse matrix contains any implicit zeros. """
239-
return np.prod(x.shape) != x.nnz
240-
241-
242351
def _nan_min_max(x, func, axis=0):
243352
if not sp.issparse(x):
244353
return func(x, axis=axis)
245354
if axis is None:
246355
extreme = func(x.data, axis=axis) if x.nnz else float('nan')
247-
if _sparse_has_zeros(x):
356+
if sparse_has_implicit_zeros(x):
248357
extreme = func([0, extreme])
249358
return extreme
250359
if axis == 0:
@@ -257,7 +366,7 @@ def _nan_min_max(x, func, axis=0):
257366
for row in x:
258367
values = row.data
259368
extreme = func(values) if values.size else float('nan')
260-
if _sparse_has_zeros(row):
369+
if sparse_has_implicit_zeros(row):
261370
extreme = func([0, extreme])
262371
r.append(extreme)
263372
return np.array(r)
@@ -323,7 +432,7 @@ def unique(x, return_counts=False):
323432
if not sp.issparse(x):
324433
return np.unique(x, return_counts=return_counts)
325434

326-
implicit_zeros = np.prod(x.shape) - x.nnz
435+
implicit_zeros = sparse_count_implicit_zeros(x)
327436
explicit_zeros = not np.all(x.data)
328437
r = np.unique(x.data, return_counts=return_counts)
329438
if not implicit_zeros:

0 commit comments

Comments
 (0)