diff --git a/Orange/data/table.py b/Orange/data/table.py index e3f22d6988c..fde63c6feef 100644 --- a/Orange/data/table.py +++ b/Orange/data/table.py @@ -19,7 +19,8 @@ ) from Orange.data.util import SharedComputeValue, vstack, hstack from Orange.statistics.util import bincount, countnans, contingency, \ - stats as fast_stats + stats as fast_stats, sparse_has_implicit_zeros, sparse_count_implicit_zeros, \ + sparse_implicit_zero_weights from Orange.util import flatten __all__ = ["dataset_dirs", "get_sample_datasets_dir", "RowInstance", "Table"] @@ -1384,42 +1385,58 @@ def _compute_distributions(self, columns=None): columns = range(len(self.domain.variables)) else: columns = [self.domain.index(var) for var in columns] + distributions = [] if sp.issparse(self.X): self.X = self.X.tocsc() + W = self.W.ravel() if self.has_weights() else None + for col in columns: - var = self.domain[col] + variable = self.domain[col] + + # Select the correct data column from X, Y or metas if 0 <= col < self.X.shape[1]: - m = self.X[:, col] + x = self.X[:, col] elif col < 0: - m = self.metas[:, col * (-1) - 1] - if np.issubdtype(m.dtype, np.dtype(object)): - m = m.astype(float) + x = self.metas[:, col * (-1) - 1] + if np.issubdtype(x.dtype, np.dtype(object)): + x = x.astype(float) else: - m = self._Y[:, col - self.X.shape[1]] - if var.is_discrete: - dist, unknowns = bincount(m, len(var.values) - 1, W) - elif not m.shape[0]: + x = self._Y[:, col - self.X.shape[1]] + + if variable.is_discrete: + dist, unknowns = bincount(x, weights=W, max_val=len(variable.values) - 1) + elif not x.shape[0]: dist, unknowns = np.zeros((2, 0)), 0 else: if W is not None: - unknowns = countnans(m, W) - if sp.issparse(m): - arg_sort = np.argsort(m.data) - ranks = m.indices[arg_sort] - vals = np.vstack((m.data[arg_sort], W[ranks])) + if sp.issparse(x): + arg_sort = np.argsort(x.data) + ranks = x.indices[arg_sort] + vals = np.vstack((x.data[arg_sort], W[ranks])) else: - ranks = np.argsort(m) - vals = np.vstack((m[ranks], W[ranks])) + ranks = np.argsort(x) + vals = np.vstack((x[ranks], W[ranks])) else: - unknowns = countnans(m.astype(float)) - if sp.issparse(m): - m = m.data - vals = np.ones((2, m.shape[0])) - vals[0, :] = m + x_values = x.data if sp.issparse(x) else x + vals = np.ones((2, x_values.shape[0])) + vals[0, :] = x_values vals[0, :].sort() + dist = np.array(_valuecount.valuecount(vals)) + # If sparse, then 0s will not be counted with `valuecount`, so + # we have to add them to the result manually. + if sp.issparse(x) and sparse_has_implicit_zeros(x): + if W is not None: + zero_weights = sparse_implicit_zero_weights(x, W).sum() + else: + zero_weights = sparse_count_implicit_zeros(x) + zero_vec = [0, zero_weights] + dist = np.insert(dist, np.searchsorted(dist[0], 0), zero_vec, axis=1) + # Since `countnans` assumes vector shape to be (1, n) and `x` + # shape is (n, 1), we pass the transpose + unknowns = countnans(x.T, W) distributions.append((dist, unknowns)) return distributions diff --git a/Orange/statistics/distribution.py b/Orange/statistics/distribution.py index 09511fa88d3..40809519843 100644 --- a/Orange/statistics/distribution.py +++ b/Orange/statistics/distribution.py @@ -272,8 +272,8 @@ def mean(self): return np.average(np.asarray(self[0]), weights=np.asarray(self[1])) def variance(self): - avg = self.mean() - return sum([((x-avg)**2)*w for x, w in zip(self[0], self[1])])/sum(self[1]) + mean = self.mean() + return sum(((x - mean) ** 2) * w for x, w in zip(self[0], self[1])) / sum(self[1]) def standard_deviation(self): return math.sqrt(self.variance()) diff --git a/Orange/statistics/util.py b/Orange/statistics/util.py index 478128a2f26..73f2de55af5 100644 --- a/Orange/statistics/util.py +++ b/Orange/statistics/util.py @@ -5,85 +5,199 @@ It also patches bottleneck to contain these functions. """ from warnings import warn -import numpy as np -import scipy.sparse as sp + import bottleneck as bn +import numpy as np +from scipy import sparse as sp -def _count_nans_per_row_sparse(X, weights): +def _count_nans_per_row_sparse(X, weights, dtype=None): """ Count the number of nans (undefined) values per row. """ - items_per_row = 1 if X.ndim == 1 else X.shape[1] - counts = np.ones(X.shape[0]) * items_per_row - nnz_per_row = np.bincount(X.indices, minlength=len(counts)) - counts -= nnz_per_row if weights is not None: - counts *= weights - return np.sum(counts) + X = X.tocoo(copy=False) + nonzero_mask = np.isnan(X.data) + nan_rows, nan_cols = X.row[nonzero_mask], X.col[nonzero_mask] + + if weights.ndim == 1: + data_weights = weights[nan_rows] + else: + data_weights = weights[nan_rows, nan_cols] + + w = sp.coo_matrix((data_weights, (nan_rows, nan_cols)), shape=X.shape) + w = w.tocsr() + + return np.fromiter((np.sum(row.data) for row in w), dtype=dtype) + return np.fromiter((np.isnan(row.data).sum() for row in X), dtype=dtype) -def bincount(X, max_val=None, weights=None, minlength=None): + +def sparse_count_implicit_zeros(x): + """ Count the number of implicit zeros in a sparse matrix. """ + if not sp.issparse(x): + raise TypeError('The matrix provided was not sparse.') + return np.prod(x.shape) - x.nnz + + +def sparse_has_implicit_zeros(x): + """ Check if sparse matrix contains any implicit zeros. """ + if not sp.issparse(x): + raise TypeError('The matrix provided was not sparse.') + return np.prod(x.shape) != x.nnz + + +def sparse_implicit_zero_weights(x, weights): + """ Extract the weight values of all zeros in a sparse matrix. """ + if not sp.issparse(x): + raise TypeError('The matrix provided was not sparse.') + + if weights.ndim == 1: + # Match weights and x axis so `indices` will be set appropriately + if x.shape[0] == weights.shape[0]: + x = x.tocsc() + elif x.shape[1] == weights.shape[0]: + x = x.tocsr() + n_items = np.prod(x.shape) + zero_indices = np.setdiff1d(np.arange(n_items), x.indices, assume_unique=True) + return weights[zero_indices] + else: + # Can easily be implemented using a coo_matrix + raise NotImplementedError( + 'Computing zero weights on ndimensinal weight matrix is not implemented' + ) + + +def bincount(x, weights=None, max_val=None, minlength=None): """Return counts of values in array X. Works kind of like np.bincount(), except that it also supports floating arrays with nans. + + Parameters + ---------- + x : array_like, 1 dimension, nonnegative ints + Input array. + weights : array_like, optional + Weights, array of the same shape as x. + max_val : int, optional + Indicates the maximum value we expect to find in X and sets the result + array size accordingly. E.g. if we set `max_val=2` yet the largest + value in X is 1, the result will contain a bin for the value 2, and + will be set to 0. See examples for usage. + minlength : int, optional + A minimum number of bins for the output array. See numpy docs for info. + + Returns + ------- + Tuple[np.ndarray, int] + Returns the bincounts and the number of NaN values. + + Examples + -------- + In case `max_val` is provided, the return shape includes bins for these + values as well, even if they do not appear in the data. However, this will + not truncate the bincount if values larger than `max_count` are found. + >>> bincount([0, 0, 1, 1, 2], max_val=4) + (array([ 2., 2., 1., 0., 0.]), 0.0) + >>> bincount([0, 1, 2, 3, 4], max_val=2) + (array([ 1., 1., 1., 1., 1.]), 0.0) + """ - if sp.issparse(X): - minlength = max_val + 1 - bin_weights = weights[X.indices] if weights is not None else None - return (np.bincount(X.data.astype(int), - weights=bin_weights, - minlength=minlength, ), - _count_nans_per_row_sparse(X, weights)) - - X = np.asanyarray(X) - if X.dtype.kind == 'f' and bn.anynan(X): - nonnan = ~np.isnan(X) - X = X[nonnan] + # Store the original matrix before any manipulation to check for sparse + x_original = x + if sp.issparse(x): + if weights is not None: + # Match weights and x axis so `indices` will be set appropriately + if x.shape[0] == weights.shape[0]: + x = x.tocsc() + elif x.shape[1] == weights.shape[0]: + x = x.tocsr() + + zero_weights = sparse_implicit_zero_weights(x, weights).sum() + weights = weights[x.indices] + else: + zero_weights = sparse_count_implicit_zeros(x) + + x = x.data + + x = np.asanyarray(x) + if x.dtype.kind == 'f' and bn.anynan(x): + nonnan = ~np.isnan(x) + x = x[nonnan] if weights is not None: nans = (~nonnan * weights).sum(axis=0) weights = weights[nonnan] else: nans = (~nonnan).sum(axis=0) else: - nans = 0. if X.ndim == 1 else np.zeros(X.shape[1], dtype=float) + nans = 0. if x.ndim == 1 else np.zeros(x.shape[1], dtype=float) + if minlength is None and max_val is not None: minlength = max_val + 1 - bc = np.array([]) if minlength is not None and minlength <= 0 else \ - np.bincount(X.astype(np.int32, copy=False), - weights=weights, minlength=minlength).astype(float) + + if minlength is not None and minlength <= 0: + bc = np.array([]) + else: + bc = np.bincount( + x.astype(np.int32, copy=False), weights=weights, minlength=minlength + ).astype(float) + # Since `csr_matrix.values` only contain non-zero values or explicit + # zeros, we must count implicit zeros separately and add them to the + # explicit ones found before + if sp.issparse(x_original): + bc[0] += zero_weights + return bc, nans -def countnans(X, weights=None, axis=None, dtype=None, keepdims=False): +def countnans(x, weights=None, axis=None, dtype=None, keepdims=False): """ - Count the undefined elements in arr along given axis. + Count the undefined elements in an array along given axis. Parameters ---------- - X : array_like - weights : array_like + x : array_like + weights : array_like, optional Weights to weight the nans with, before or after counting (depending on the weights shape). + axis : int, optional + dtype : dtype, optional + The data type of the returned array. Returns ------- - counts + Union[np.ndarray, float] + """ - if not sp.issparse(X): - X = np.asanyarray(X) - isnan = np.isnan(X) - if weights is not None and weights.shape == X.shape: + if not sp.issparse(x): + x = np.asanyarray(x) + isnan = np.isnan(x) + if weights is not None and weights.shape == x.shape: isnan = isnan * weights + counts = isnan.sum(axis=axis, dtype=dtype, keepdims=keepdims) - if weights is not None and weights.shape != X.shape: + if weights is not None and weights.shape != x.shape: counts = counts * weights else: - if any(attr is not None for attr in [axis, dtype]) or \ - keepdims is not False: - raise ValueError('Arguments axis, dtype and keepdims' - 'are not yet supported on sparse data!') + assert axis in [None, 0, 1], 'Only axis 0 and 1 are currently supported' + # To have consistent behaviour with dense matrices, raise error when + # `axis=1` and the array is 1d (e.g. [[1 2 3]]) + if x.shape[0] == 1 and axis == 1: + raise ValueError('Axis %d is out of bounds' % axis) + + arr = x if axis == 1 else x.T + + if weights is not None: + weights = weights if axis == 1 else weights.T + + arr = arr.tocsr() + counts = _count_nans_per_row_sparse(arr, weights, dtype=dtype) + + # We want a scalar value if `axis=None` or if the sparse matrix is + # actually a vector (e.g. [[1 2 3]]), but has `ndim=2` due to scipy + # implementation + if axis is None or x.shape[0] == 1: + counts = counts.sum(dtype=dtype) - counts = _count_nans_per_row_sparse(X, weights) return counts @@ -234,17 +348,12 @@ def weighted_mean(): X.shape[0] - nans)) -def _sparse_has_zeros(x): - """ Check if sparse matrix contains any implicit zeros. """ - return np.prod(x.shape) != x.nnz - - def _nan_min_max(x, func, axis=0): if not sp.issparse(x): return func(x, axis=axis) if axis is None: extreme = func(x.data, axis=axis) if x.nnz else float('nan') - if _sparse_has_zeros(x): + if sparse_has_implicit_zeros(x): extreme = func([0, extreme]) return extreme if axis == 0: @@ -257,7 +366,7 @@ def _nan_min_max(x, func, axis=0): for row in x: values = row.data extreme = func(values) if values.size else float('nan') - if _sparse_has_zeros(row): + if sparse_has_implicit_zeros(row): extreme = func([0, extreme]) r.append(extreme) return np.array(r) @@ -323,7 +432,7 @@ def unique(x, return_counts=False): if not sp.issparse(x): return np.unique(x, return_counts=return_counts) - implicit_zeros = np.prod(x.shape) - x.nnz + implicit_zeros = sparse_count_implicit_zeros(x) explicit_zeros = not np.all(x.data) r = np.unique(x.data, return_counts=return_counts) if not implicit_zeros: diff --git a/Orange/tests/test_distribution.py b/Orange/tests/test_distribution.py index ff2735ffca0..58d3add06fc 100644 --- a/Orange/tests/test_distribution.py +++ b/Orange/tests/test_distribution.py @@ -12,6 +12,7 @@ from Orange import data from Orange.tests import test_filename + class Distribution_DiscreteTestCase(unittest.TestCase): def setUp(self): self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0] @@ -291,6 +292,7 @@ def test_class_distribution(self): np.testing.assert_array_equal(disc, [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]) + class TestGetDistribution(unittest.TestCase): def test_get_distribution(self): d = data.Table("iris") @@ -338,93 +340,85 @@ def test_get_distributions(self): np.testing.assert_almost_equal(ddist[-1], [50, 50, 50]) def test_sparse_get_distributions(self): - def assert_dist_and_unknowns(computed, gold_dist): + def assert_dist_and_unknowns(computed, goal_dist): nonlocal d - gold_dist = np.array(gold_dist) - sum_dist = np.sum(gold_dist[1, :] if gold_dist.ndim == 2 else gold_dist) + goal_dist = np.array(goal_dist) + sum_dist = np.sum(goal_dist[1, :] if goal_dist.ndim == 2 else goal_dist) n_all = np.sum(d.W) if d.has_weights() else len(d) - np.testing.assert_almost_equal(computed, gold_dist) + np.testing.assert_almost_equal(computed, goal_dist) self.assertEqual(computed.unknowns, n_all - sum_dist) domain = data.Domain( - [data.DiscreteVariable("d%i" % i, values=list("abc")) - for i in range(10)] + + [data.DiscreteVariable("d%i" % i, values=list("abc")) for i in range(10)] + [data.ContinuousVariable("c%i" % i) for i in range(10)]) - # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 - # ------------------------------------------------------------ - # 2 2 1 1 2 1 1 1 2 0 2 - # 1 1 0 0 1 2 2 1 0 - # 1 2 0 - # - # 2 0 1 1.1 - # - sdata = np.array([2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2, - 1, 1, 0, 0, 1, 2, 2, 1, 0, - 1, 2, 0, - 2, 0, 1, 1.1]) - indices = [1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18, - 2, 3, 4, 5, 6, 8, 14, 16, 17, - 3, 5, 6, - 2, 5, 6, 13] - indptr = [0, 11, 20, 23, 23, 27] - X = sp.csr_matrix((sdata, indices, indptr), shape=(5, 20)) - d = data.Table.from_numpy(domain, X) + # pylint: disable=bad-whitespace + X = sp.csr_matrix( + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + # -------------------------------------------------------------------------------- + [[0, 2, 0, 2, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 1, 0, 2, np.nan, 2, 0], + [0, 0, 1, 1, np.nan, np.nan, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0, 1, np.nan, 0, 0], + [0, 0, 0, 1, 0, 2, np.nan, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1.1, 0, 0, 0, 0, 0, 0]] + ) + X[0, 0] = 0 + d = data.Table.from_numpy(domain, X) ddist = distribution.get_distributions(d) self.assertEqual(len(ddist), 20) - assert_dist_and_unknowns(ddist[0], [0, 0, 0]) - assert_dist_and_unknowns(ddist[1], [0, 0, 1]) - assert_dist_and_unknowns(ddist[2], [0, 1, 1]) - assert_dist_and_unknowns(ddist[3], [0, 2, 1]) - assert_dist_and_unknowns(ddist[4], [1, 1, 0]) + zeros = [5, 0, 0] + assert_dist_and_unknowns(ddist[0], zeros) + assert_dist_and_unknowns(ddist[1], [4, 0, 1]) + assert_dist_and_unknowns(ddist[2], [3, 1, 1]) + assert_dist_and_unknowns(ddist[3], [2, 2, 1]) + assert_dist_and_unknowns(ddist[4], [3, 1, 0]) assert_dist_and_unknowns(ddist[5], [2, 1, 1]) assert_dist_and_unknowns(ddist[6], [1, 2, 1]) - assert_dist_and_unknowns(ddist[7], [0, 0, 0]) - assert_dist_and_unknowns(ddist[8], [0, 0, 1]) - assert_dist_and_unknowns(ddist[9], [0, 1, 0]) - - z = np.zeros((2, 0)) - assert_dist_and_unknowns(ddist[10], z) - assert_dist_and_unknowns(ddist[11], z) - assert_dist_and_unknowns(ddist[12], z) - assert_dist_and_unknowns(ddist[13], [[1, 1.1], [1, 1]]) - assert_dist_and_unknowns(ddist[14], [[1, 2], [1, 1]]) - assert_dist_and_unknowns(ddist[15], z) - assert_dist_and_unknowns(ddist[16], [[1, 2], [1, 1]]) - assert_dist_and_unknowns(ddist[17], [[0], [2]]) - assert_dist_and_unknowns(ddist[18], [[2], [1]]) - assert_dist_and_unknowns(ddist[19], z) + assert_dist_and_unknowns(ddist[7], zeros) + assert_dist_and_unknowns(ddist[8], [4, 0, 1]) + assert_dist_and_unknowns(ddist[9], [4, 1, 0]) + + zeros = [[0], [5]] + assert_dist_and_unknowns(ddist[10], zeros) + assert_dist_and_unknowns(ddist[11], zeros) + assert_dist_and_unknowns(ddist[12], zeros) + assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [3, 1, 1]]) + assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [3, 1, 1]]) + assert_dist_and_unknowns(ddist[15], zeros) + assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [3, 1, 1]]) + assert_dist_and_unknowns(ddist[17], [[0], [3]]) + assert_dist_and_unknowns(ddist[18], [[0, 2], [4, 1]]) + assert_dist_and_unknowns(ddist[19], zeros) d.set_weights(np.array([1, 2, 3, 4, 5])) - ddist = distribution.get_distributions(d) self.assertEqual(len(ddist), 20) - assert_dist_and_unknowns(ddist[0], [0, 0, 0]) - assert_dist_and_unknowns(ddist[1], [0, 0, 1]) - assert_dist_and_unknowns(ddist[2], [0, 2, 5]) - assert_dist_and_unknowns(ddist[3], [0, 5, 1]) - assert_dist_and_unknowns(ddist[4], [2, 1, 0]) - assert_dist_and_unknowns(ddist[5], [7, 1, 3]) - assert_dist_and_unknowns(ddist[6], [3, 7, 1]) - assert_dist_and_unknowns(ddist[7], [0, 0, 0]) - assert_dist_and_unknowns(ddist[8], [0, 0, 2]) - assert_dist_and_unknowns(ddist[9], [0, 1, 0]) - - z = np.zeros((2, 0)) - assert_dist_and_unknowns(ddist[10], z) - assert_dist_and_unknowns(ddist[11], z) - assert_dist_and_unknowns(ddist[12], z) - assert_dist_and_unknowns(ddist[13], [[1, 1.1], [1, 5]]) - assert_dist_and_unknowns(ddist[14], [[1, 2], [1, 2]]) - assert_dist_and_unknowns(ddist[15], z) - assert_dist_and_unknowns(ddist[16], [[1, 2], [2, 1]]) - assert_dist_and_unknowns(ddist[17], [[0], [3]]) - assert_dist_and_unknowns(ddist[18], [[2], [1]]) - assert_dist_and_unknowns(ddist[19], z) + assert_dist_and_unknowns(ddist[0], [15, 0, 0]) + assert_dist_and_unknowns(ddist[1], [14, 0, 1]) + assert_dist_and_unknowns(ddist[2], [8, 2, 5]) + assert_dist_and_unknowns(ddist[3], [9, 5, 1]) + assert_dist_and_unknowns(ddist[4], [12, 1, 0]) + assert_dist_and_unknowns(ddist[5], [9, 1, 3]) + assert_dist_and_unknowns(ddist[6], [4, 7, 1]) + assert_dist_and_unknowns(ddist[7], [15, 0, 0]) + assert_dist_and_unknowns(ddist[8], [13, 0, 2]) + assert_dist_and_unknowns(ddist[9], [14, 1, 0]) + + zeros = [[0], [15]] + assert_dist_and_unknowns(ddist[10], zeros) + assert_dist_and_unknowns(ddist[11], zeros) + assert_dist_and_unknowns(ddist[12], zeros) + assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [9, 1, 5]]) + assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [12, 1, 2]]) + assert_dist_and_unknowns(ddist[15], zeros) + assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [12, 2, 1]]) + assert_dist_and_unknowns(ddist[17], [[0], [12]]) + assert_dist_and_unknowns(ddist[18], [[0, 2], [14, 1]]) + assert_dist_and_unknowns(ddist[19], zeros) def test_compute_distributions_metas(self): d = data.Table(test_filename("test9.tab")) @@ -439,5 +433,32 @@ def test_compute_distributions_metas(self): np.testing.assert_almost_equal(dist, [2, 3, 2]) self.assertEqual(nanc, 1) + +class TestContinuous(unittest.TestCase): + def test_mean(self): + # pylint: disable=bad-whitespace + x = np.array([[0, 5, 10], + [9, 0, 1]]) + dist = distribution.Continuous(x) + + self.assertEqual(dist.mean(), np.mean(([0] * 9) + [10])) + + def test_variance(self): + # pylint: disable=bad-whitespace + x = np.array([[0, 5, 10], + [9, 0, 1]]) + dist = distribution.Continuous(x) + + self.assertEqual(dist.variance(), np.var(([0] * 9) + [10])) + + def test_standard_deviation(self): + # pylint: disable=bad-whitespace + x = np.array([[0, 5, 10], + [9, 0, 1]]) + dist = distribution.Continuous(x) + + self.assertEqual(dist.standard_deviation(), np.std(([0] * 9) + [10])) + + if __name__ == "__main__": unittest.main() diff --git a/Orange/tests/test_normalize.py b/Orange/tests/test_normalize.py index c3497aa67d6..b6c42239015 100644 --- a/Orange/tests/test_normalize.py +++ b/Orange/tests/test_normalize.py @@ -97,17 +97,17 @@ def test_normalize_transform_by_span_zero_class(self): def test_normalize_sparse(self): domain = Domain([ContinuousVariable(str(i)) for i in range(3)]) + # pylint: disable=bad-whitespace X = sp.csr_matrix(np.array([ - [0, 0, 0,], [0, -1, -2], - [0, 1, 2], + [0, 1, 2], ])) data = Table.from_numpy(domain, X) + # pylint: disable=bad-whitespace solution = sp.csr_matrix(np.array([ - [0, 0, 0,], [0, -1, -1], - [0, 1, 1], + [0, 1, 1], ])) normalizer = Normalize() @@ -116,7 +116,7 @@ def test_normalize_sparse(self): # raise error for non-zero offsets data.X = sp.csr_matrix(np.array([ - [0, 0, 0, ], + [0, 0, 0], [0, 1, 3], [0, 2, 4], ])) diff --git a/Orange/tests/test_statistics.py b/Orange/tests/test_statistics.py index a8a0eca057f..8639c845d62 100644 --- a/Orange/tests/test_statistics.py +++ b/Orange/tests/test_statistics.py @@ -1,15 +1,40 @@ import unittest import warnings +from functools import wraps, partial from itertools import chain import numpy as np import scipy as sp -from scipy.sparse import csr_matrix, issparse +from scipy.sparse import csr_matrix, issparse, csc_matrix from Orange.statistics.util import bincount, countnans, contingency, stats, \ nanmin, nanmax, unique, nanunique, mean, nanmean, digitize, var +def dense_sparse(test_case): + # type: (Callable) -> Callable + """Run a single test case on both dense and sparse data.""" + @wraps(test_case) + def _wrapper(self): + + def sparse_with_explicit_zero(x, array): + """Inject one explicit zero into a sparse array.""" + np_array, sp_array = np.atleast_2d(x), array(x) + assert issparse(sp_array), 'Can not inject explicit zero into non-sparse matrix' + + zero_indices = np.argwhere(np_array == 0) + if zero_indices.size: + sp_array[tuple(zero_indices[0])] = 0 + + return sp_array + + test_case(self, lambda x: np.array(x)) + test_case(self, partial(sparse_with_explicit_zero, array=csr_matrix)) + test_case(self, partial(sparse_with_explicit_zero, array=csc_matrix)) + + return _wrapper + + class TestUtil(unittest.TestCase): def setUp(self): nan = float('nan') @@ -22,19 +47,6 @@ def setUp(self): np.ones((2, 3)), ] - def test_bincount(self): - hist, n_nans = bincount([0., 1., np.nan, 3]) - self.assertEqual(n_nans, 1) - np.testing.assert_equal(hist, [1, 1, 0, 1]) - - hist, n_nans = bincount([0., 1., 3], max_val=3) - self.assertEqual(n_nans, 0) - np.testing.assert_equal(hist, [1, 1, 0, 1]) - - def test_countnans(self): - np.testing.assert_equal(countnans([[1, np.nan], - [2, np.nan]], axis=0), [0, 2]) - def test_contingency(self): x = np.array([0, 1, 0, 2, np.nan]) y = np.array([0, 0, 1, np.nan, 0]) @@ -171,56 +183,63 @@ def test_nanmean(self): nanmean(X_sparse), np.nanmean(X)) - def test_digitize(self): - for x in self.data: - x_sparse = csr_matrix(x) + def test_var(self): + for data in self.data: + for axis in chain((None,), range(len(data.shape))): + # Can't use array_equal here due to differences on 1e-16 level + np.testing.assert_array_almost_equal( + var(csr_matrix(data), axis=axis), + np.var(data, axis=axis) + ) + + +class TestDigitize(unittest.TestCase): + def setUp(self): + # pylint: disable=bad-whitespace + self.data = [ + np.array([ + [0., 1., 0., np.nan, 3., 5.], + [0., 0., np.nan, np.nan, 5., np.nan], + [0., 0., 0., np.nan, 7., 6.]]), + np.zeros((2, 3)), + np.ones((2, 3)), + ] + + @dense_sparse + def test_digitize(self, array): + for x_original in self.data: + x = array(x_original) bins = np.arange(-2, 2) x_shape = x.shape np.testing.assert_array_equal( - np.digitize(x.flatten(), bins).reshape(x_shape), + np.digitize(x_original.flatten(), bins).reshape(x_shape), digitize(x, bins), - 'Digitize fails on dense data' - ) - np.testing.assert_array_equal( - np.digitize(x.flatten(), bins).reshape(x_shape), - digitize(x_sparse, bins), - 'Digitize fails on sparse data' ) - def test_digitize_right(self): - for x in self.data: - x_sparse = csr_matrix(x) + @dense_sparse + def test_digitize_right(self, array): + for x_original in self.data: + x = array(x_original) bins = np.arange(-2, 2) x_shape = x.shape np.testing.assert_array_equal( - np.digitize(x.flatten(), bins, right=True).reshape(x_shape), - digitize(x, bins, right=True), - 'Digitize fails on dense data' - ) - np.testing.assert_array_equal( - np.digitize(x.flatten(), bins, right=True).reshape(x_shape), - digitize(x_sparse, bins, right=True), - 'Digitize fails on sparse data' + np.digitize(x_original.flatten(), bins, right=True).reshape(x_shape), + digitize(x, bins, right=True) ) - def test_digitize_1d_array(self): + @dense_sparse + def test_digitize_1d_array(self, array): """A consistent return shape must be returned for both sparse and dense.""" - x = np.array([0, 1, 1, 0, np.nan, 0, 1]) - x_sparse = csr_matrix(x) + x_original = np.array([0, 1, 1, 0, np.nan, 0, 1]) + x = array(x_original) bins = np.arange(-2, 2) - x_shape = x.shape + x_shape = x_original.shape np.testing.assert_array_equal( - [np.digitize(x.flatten(), bins).reshape(x_shape)], + [np.digitize(x_original.flatten(), bins).reshape(x_shape)], digitize(x, bins), - 'Digitize fails on 1d dense data' - ) - np.testing.assert_array_equal( - [np.digitize(x.flatten(), bins).reshape(x_shape)], - digitize(x_sparse, bins), - 'Digitize fails on 1d sparse data' ) def test_digitize_sparse_zeroth_bin(self): @@ -233,11 +252,171 @@ def test_digitize_sparse_zeroth_bin(self): # Then digitize should return a sparse matrix self.assertTrue(issparse(digitize(data, bins))) - def test_var(self): - for data in self.data: - for axis in chain((None,), range(len(data.shape))): - # Can't use array_equal here due to differences on 1e-16 level - np.testing.assert_array_almost_equal( - var(csr_matrix(data), axis=axis), - np.var(data, axis=axis) - ) + +class TestCountnans(unittest.TestCase): + @dense_sparse + def test_1d_array(self, array): + x = array([0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1]) + self.assertEqual(countnans(x), 2) + + @dense_sparse + def test_1d_array_with_axis_0(self, array): + x = array([0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1]) + expected = 2 + + self.assertEqual(countnans(x, axis=0), expected) + + @dense_sparse + def test_1d_array_with_axis_1_raises_exception(self, array): + with self.assertRaises(ValueError): + countnans(array([0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1]), axis=1) + + @dense_sparse + def test_shape_matches_dense_and_sparse(self, array): + x = array([[0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1], + [1, 2, 2, 1, np.nan, 1, 2, 3, np.nan, 3]]) + expected = 4 + + self.assertEqual(countnans(x), expected) + + @dense_sparse + def test_shape_matches_dense_and_sparse_with_axis_0(self, array): + x = array([[0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1], + [1, 2, 2, 1, np.nan, 1, 2, np.nan, 3, 3]]) + expected = [0, 0, 0, 0, 1, 1, 0, 2, 0, 0] + + np.testing.assert_equal(countnans(x, axis=0), expected) + + @dense_sparse + def test_shape_matches_dense_and_sparse_with_axis_1(self, array): + x = array([[0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1], + [1, 2, 2, 1, np.nan, 1, 2, 3, np.nan, 3]]) + expected = [2, 2] + + np.testing.assert_equal(countnans(x, axis=1), expected) + + @dense_sparse + def test_2d_matrix(self, array): + x = array([[1, np.nan, 1, 2], + [2, np.nan, 2, 3]]) + expected = 2 + + self.assertEqual(countnans(x), expected) + + @dense_sparse + def test_on_columns(self, array): + x = array([[1, np.nan, 1, 2], + [2, np.nan, 2, 3]]) + expected = [0, 2, 0, 0] + + np.testing.assert_equal(countnans(x, axis=0), expected) + + @dense_sparse + def test_on_rows(self, array): + x = array([[1, np.nan, 1, 2], + [2, np.nan, 2, 3]]) + expected = [1, 1] + + np.testing.assert_equal(countnans(x, axis=1), expected) + + @dense_sparse + def test_1d_weights_with_axis_0(self, array): + x = array([[1, 1, np.nan, 1], + [np.nan, 1, 1, 1]]) + w = np.array([0.5, 1, 1, 1]) + + np.testing.assert_equal(countnans(x, w, axis=0), [.5, 0, 1, 0]) + + @dense_sparse + def test_1d_weights_with_axis_1(self, array): + x = array([[1, 1, np.nan, 1], + [np.nan, 1, 1, 1]]) + w = np.array([0.5, 1]) + + np.testing.assert_equal(countnans(x, w, axis=1), [.5, 1]) + + @dense_sparse + def test_2d_weights(self, array): + # pylint: disable=bad-whitespace + x = array([[np.nan, np.nan, 1, 1 ], + [ 0, np.nan, 2, np.nan ]]) + w = np.array([[1, 2, 3, 4], + [5, 6, 7, 8]]) + + np.testing.assert_equal(countnans(x, w), 17) + np.testing.assert_equal(countnans(x, w, axis=0), [1, 8, 0, 8]) + np.testing.assert_equal(countnans(x, w, axis=1), [3, 14]) + + @dense_sparse + def test_dtype(self, array): + x = array([0, np.nan, 2, 3]) + w = np.array([0, 1.5, 0, 0]) + + self.assertIsInstance(countnans(x, w, dtype=np.int32), np.int32) + self.assertEqual(countnans(x, w, dtype=np.int32), 1) + self.assertIsInstance(countnans(x, w, dtype=np.float64), np.float64) + self.assertEqual(countnans(x, w, dtype=np.float64), 1.5) + + +class TestBincount(unittest.TestCase): + @dense_sparse + def test_count_nans(self, array): + x = array([0, 0, 1, 2, np.nan, 2]) + expected = 1 + + np.testing.assert_equal(bincount(x)[1], expected) + + @dense_sparse + def test_adds_empty_bins(self, array): + x = array([0, 1, 3, 5]) + expected = [1, 1, 0, 1, 0, 1] + + np.testing.assert_equal(bincount(x)[0], expected) + + @dense_sparse + def test_maxval_adds_empty_bins(self, array): + x = array([1, 1, 1, 2, 3, 2]) + max_val = 5 + expected = [0, 3, 2, 1, 0, 0] + + np.testing.assert_equal(bincount(x, max_val=max_val)[0], expected) + + @dense_sparse + def test_maxval_doesnt_truncate_values_when_too_small(self, array): + x = array([1, 1, 1, 2, 3, 2]) + max_val = 1 + expected = [0, 3, 2, 1] + + np.testing.assert_equal(bincount(x, max_val=max_val)[0], expected) + + @dense_sparse + def test_minlength_adds_empty_bins(self, array): + x = array([1, 1, 1, 2, 3, 2]) + minlength = 5 + expected = [0, 3, 2, 1, 0] + + np.testing.assert_equal(bincount(x, minlength=minlength)[0], expected) + + @dense_sparse + def test_weights(self, array): + x = array([0, 0, 1, 1, 2, 2, 3, 3]) + w = np.array([1, 2, 0, 0, 1, 1, 0, 1]) + + expected = [3, 0, 2, 1] + np.testing.assert_equal(bincount(x, w)[0], expected) + + @dense_sparse + def test_weights_with_nans(self, array): + x = array([0, 0, 1, 1, np.nan, 2, np.nan, 3]) + w = np.array([1, 2, 0, 0, 1, 1, 0, 1]) + + expected = [3, 0, 1, 1] + np.testing.assert_equal(bincount(x, w)[0], expected) + + @dense_sparse + def test_weights_with_transposed_x(self, array): + x = array([0, 0, 1, 1, 2, 2, 3, 3]).T + w = np.array([1, 2, 0, 0, 1, 1, 0, 1]) + + expected = [3, 0, 2, 1] + np.testing.assert_equal(bincount(x, w)[0], expected)