Skip to content

Commit f415742

Browse files
authored
Merge pull request #1413 from kernc/bottlechest
Bottlechest, Bottleneck (v2)
2 parents 7d65e24 + 625c165 commit f415742

File tree

17 files changed

+328
-64
lines changed

17 files changed

+328
-64
lines changed

Orange/base.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22

33
import numpy as np
44
import scipy
5-
import bottlechest as bn
65

76
from Orange.data import Table, Storage, Instance, Value
87
from Orange.preprocess import (RemoveNaNClasses, Continuize,
98
RemoveNaNColumns, SklImpute)
109
from Orange.misc.wrapper_meta import WrapperMeta
10+
from Orange.data.util import one_hot
1111

1212
__all__ = ["Learner", "Model", "SklLearner", "SklModel"]
1313

@@ -157,11 +157,9 @@ def __call__(self, data, ret=Value):
157157
for c in self.domain.class_vars)
158158
probs = np.zeros(value.shape + (max_card,), float)
159159
for i, cvar in enumerate(self.domain.class_vars):
160-
probs[:, i, :], _ = bn.bincount(np.atleast_2d(value[:, i]),
161-
max_card - 1)
160+
probs[:, i, :] = one_hot(value[:, i])
162161
else:
163-
probs, _ = bn.bincount(np.atleast_2d(value),
164-
len(self.domain.class_var.values) - 1)
162+
probs = one_hot(value)
165163
if ret == Model.ValueProbs:
166164
return value, probs
167165
else:

Orange/data/filter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from ..misc.enum import Enum
77
import numpy as np
8-
import bottlechest as bn
8+
import bottleneck as bn
99
from Orange.data import Instance, Storage, Variable
1010

1111

Orange/data/io.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from urllib.parse import urlparse, unquote as urlunquote
1919
from urllib.request import urlopen
2020

21-
import bottlechest as bn
21+
import bottleneck as bn
2222
import numpy as np
2323
from chardet.universaldetector import UniversalDetector
2424

Orange/data/table.py

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313
from urllib.request import urlopen
1414
from urllib.error import URLError
1515

16-
import bottlechest as bn
16+
import bottleneck as bn
1717
from scipy import sparse as sp
1818

19+
from Orange.statistics.util import bincount, countnans, contingency, stats as fast_stats
1920
from .instance import *
2021
from Orange.util import flatten
2122
from Orange.data import Domain, Variable, StringVariable
@@ -935,12 +936,7 @@ def __determine_density(data):
935936
if data is None:
936937
return Storage.Missing
937938
if data is not None and sp.issparse(data):
938-
try:
939-
if bn.bincount(data.data, 1)[0][0] == 0:
940-
return Storage.SPARSE_BOOL
941-
except ValueError as e:
942-
pass
943-
return Storage.SPARSE
939+
return Storage.SPARSE_BOOL if (data.data == 1).all() else Storage.SPARSE
944940
else:
945941
return Storage.DENSE
946942

@@ -1212,19 +1208,19 @@ def _compute_basic_stats(self, columns=None,
12121208
stats = []
12131209
if not columns:
12141210
if self.domain.attributes:
1215-
rr.append(bn.stats(self.X, W))
1211+
rr.append(fast_stats(self.X, W))
12161212
if self.domain.class_vars:
1217-
rr.append(bn.stats(self._Y, W))
1213+
rr.append(fast_stats(self._Y, W))
12181214
if include_metas and self.domain.metas:
1219-
rr.append(bn.stats(self.metas, W))
1215+
rr.append(fast_stats(self.metas, W))
12201216
if len(rr):
12211217
stats = np.vstack(tuple(rr))
12221218
else:
12231219
columns = [self.domain.index(c) for c in columns]
12241220
nattrs = len(self.domain.attributes)
1225-
Xs = any(0 <= c < nattrs for c in columns) and bn.stats(self.X, W)
1226-
Ys = any(c >= nattrs for c in columns) and bn.stats(self._Y, W)
1227-
ms = any(c < 0 for c in columns) and bn.stats(self.metas, W)
1221+
Xs = any(0 <= c < nattrs for c in columns) and fast_stats(self.X, W)
1222+
Ys = any(c >= nattrs for c in columns) and fast_stats(self._Y, W)
1223+
ms = any(c < 0 for c in columns) and fast_stats(self.metas, W)
12281224
for column in columns:
12291225
if 0 <= column < nattrs:
12301226
stats.append(Xs[column, :])
@@ -1271,19 +1267,19 @@ def _get_matrix(M, cachedM, col):
12711267
if var.is_discrete:
12721268
if W is not None:
12731269
W = W.ravel()
1274-
dist, unknowns = bn.bincount(m, len(var.values) - 1, W)
1270+
dist, unknowns = bincount(m, len(var.values) - 1, W)
12751271
elif not len(m):
12761272
dist, unknowns = np.zeros((2, 0)), 0
12771273
else:
12781274
if W is not None:
12791275
ranks = np.argsort(m)
12801276
vals = np.vstack((m[ranks], W[ranks].flatten()))
1281-
unknowns = bn.countnans(m, W)
1277+
unknowns = countnans(m, W)
12821278
else:
12831279
vals = np.ones((2, m.shape[0]))
12841280
vals[0, :] = m
12851281
vals[0, :].sort()
1286-
unknowns = bn.countnans(m.astype(float))
1282+
unknowns = countnans(m.astype(float))
12871283
dist = np.array(_valuecount.valuecount(vals))
12881284
distributions.append((dist, unknowns))
12891285

@@ -1329,7 +1325,7 @@ def _compute_contingency(self, col_vars=None, row_var=None):
13291325
if row_data.dtype.kind != "f": #meta attributes can be stored as type object
13301326
row_data = row_data.astype(float)
13311327

1332-
unknown_rows = bn.countnans(row_data)
1328+
unknown_rows = countnans(row_data)
13331329
if unknown_rows:
13341330
nan_inds = np.isnan(row_data)
13351331
row_data = row_data[~nan_inds]
@@ -1355,13 +1351,13 @@ def _compute_contingency(self, col_vars=None, row_var=None):
13551351
max_vals = max(len(v[2].values) for v in disc_vars)
13561352
disc_indi = {i for _, i, _ in disc_vars}
13571353
mask = [i in disc_indi for i in range(arr.shape[1])]
1358-
conts, nans = bn.contingency(arr, row_data, max_vals - 1,
1359-
n_rows - 1, W, mask)
1354+
conts, nans = contingency(arr, row_data, max_vals - 1,
1355+
n_rows - 1, W, mask)
13601356
for col_i, arr_i, _ in disc_vars:
13611357
contingencies[col_i] = (conts[arr_i], nans[arr_i])
13621358
else:
13631359
for col_i, arr_i, var in disc_vars:
1364-
contingencies[col_i] = bn.contingency(
1360+
contingencies[col_i] = contingency(
13651361
arr[:, arr_i].astype(float),
13661362
row_data, len(var.values) - 1, n_rows - 1, W)
13671363

Orange/data/util.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""
2+
Data-manipulation utilities.
3+
"""
4+
import numpy as np
5+
import bottleneck as bn
6+
7+
8+
def one_hot(values, dtype=float, dim=None):
    """Return a one-hot transform of values.

    Parameters
    ----------
    values : 1d array
        Integer values (hopefully 0-max).
    dtype : dtype, optional
        Dtype of the output indicator array.
    dim : int, optional
        Number of indicator columns. Defaults to ``max(values) + 1``
        (0 for empty input).  Passing it explicitly allows a fixed
        width independent of which values actually occur.

    Returns
    -------
    result
        2d array with ones in respective indicator columns.

    Raises
    ------
    IndexError
        If any value is negative or >= `dim`.
    """
    # Convert up front so np.max yields an int (a float input would
    # otherwise produce a float size for np.eye) and so an empty input
    # no longer crashes np.max with "zero-size array" (BUG FIX).
    values = np.asanyarray(values, dtype=int)
    if dim is None:
        dim = 0 if values.size == 0 else np.max(values) + 1
    # Row i of eye(dim) is the indicator vector for value i; fancy
    # indexing picks one such row per element of `values`.
    return np.eye(dim, dtype=dtype)[values]
22+
23+
24+
def scale(values, min=0, max=1):
    """Return values linearly rescaled to the interval [min, max].

    NaNs are ignored when locating the data range and propagate through
    the rescaling arithmetic.  If all non-NaN values are equal, the data
    cannot be rescaled, so values are merely clipped into [min, max].

    Parameters
    ----------
    values : array_like
        Numeric data to rescale.
    min, max : number, optional
        Bounds of the target interval.  (The names shadow builtins, but
        are kept for backward compatibility with keyword callers.)
    """
    # BUG FIX: np.float_ was removed in NumPy 2.0 — use float() instead.
    # np.nanmin/np.nanmax are the standard equivalents of the bottleneck
    # calls used previously.
    minval = float(np.nanmin(values))
    ptp = np.nanmax(values) - minval
    if ptp == 0:
        # Degenerate range: everything equal — just clamp into the target.
        return np.clip(values, min, max)
    return (values - minval) / ptp * (max - min) + min

Orange/preprocess/preprocess.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66
import numpy as np
77
import sklearn.preprocessing as skl_preprocessing
8-
import bottlechest
8+
import bottleneck as bn
99

1010
import Orange.data
1111
from Orange.data import Table
@@ -198,8 +198,8 @@ def __call__(self, data):
198198
data : an input data set
199199
"""
200200

201-
oks = bottlechest.nanmin(data.X, axis=0) != \
202-
bottlechest.nanmax(data.X, axis=0)
201+
oks = bn.nanmin(data.X, axis=0) != \
202+
bn.nanmax(data.X, axis=0)
203203
atts = [data.domain.attributes[i] for i, ok in enumerate(oks) if ok]
204204
domain = Orange.data.Domain(atts, data.domain.class_vars,
205205
data.domain.metas)

Orange/statistics/util.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
"""
2+
This module provides alternatives for the few additional functions found in
3+
and once used from the bottlechest package (fork of bottleneck).
4+
5+
It also patches bottleneck to contain these functions.
6+
"""
7+
import numpy as np
8+
from scipy.sparse import issparse
9+
import bottleneck as bn
10+
11+
12+
def bincount(X, max_val=None, weights=None, minlength=None):
    """Return counts of values in array X.

    Works kind of like np.bincount(), except that it also supports floating
    arrays with nans: NaN entries are excluded from the counts and reported
    separately.

    Parameters
    ----------
    X : array_like
        Non-negative integers, possibly stored as floats with NaNs.
    max_val : int, optional
        Largest expected value; the result is padded to ``max_val + 1``
        bins.  Ignored when `minlength` is given explicitly.
    weights : array_like, optional
        Per-element weights, same length as X.
    minlength : int, optional
        Minimum number of bins, as in np.bincount().

    Returns
    -------
    (counts, nans) : tuple
        counts : np.bincount of the non-NaN values.
        nans : number of NaNs — a scalar for 1d X, per-column for 2d X.
    """
    X = np.asanyarray(X)
    if X.dtype.kind == 'f':
        nonnan = ~np.isnan(X)
        has_nans = not nonnan.all()
    else:
        has_nans = False
    if has_nans:
        nans = (~nonnan).sum(axis=0)
        # NOTE(review): boolean indexing flattens 2d input here, so 2d
        # callers are expected not to pass NaNs — confirm against callers.
        X = X[nonnan]
        if weights is not None:
            # Accept list-typed weights too (a plain list does not support
            # boolean-mask indexing).
            weights = np.asanyarray(weights)[nonnan]
    else:
        nans = 0 if X.ndim == 1 else np.zeros(X.shape[1])
    if minlength is None:
        # BUG FIX: modern np.bincount raises TypeError on minlength=None;
        # default to max_val + 1 bins, or 0 when neither argument is given.
        minlength = 0 if max_val is None else max_val + 1
    return (np.bincount(X.astype(np.int32, copy=False),
                        weights=weights,
                        minlength=minlength),
            nans)
33+
34+
35+
def countnans(X, weights=None, axis=None, dtype=None, keepdims=False):
    """
    Count the NaN elements of X along the given axis.

    Parameters
    ----------
    X : array_like
    weights : array_like
        Weights applied to the NaN counts.  If shaped like X, each NaN
        contributes its own weight before summing; otherwise the summed
        counts are multiplied by the weights afterwards.

    Returns
    -------
    counts
    """
    X = np.asanyarray(X)
    nan_mask = np.isnan(X)
    elementwise = weights is not None and weights.shape == X.shape
    if elementwise:
        # Weight each NaN individually before reducing.
        nan_mask = nan_mask * weights
    total = nan_mask.sum(axis=axis, dtype=dtype, keepdims=keepdims)
    if weights is not None and not elementwise:
        # Weights did not match X's shape: scale the reduced counts instead.
        total = total * weights
    return total
58+
59+
60+
def contingency(X, y, max_X=None, max_y=None, weights=None, mask=None):
    """
    Compute the contingency matrices for each column of X (excluding the masked)
    versus the vector y.

    If the array is 1-dimensional, a 2d contingency matrix is returned. If the
    array is 2d, the function returns a 3d array, with the first dimension
    corresponding to column index (variable in the input array).

    The rows of each contingency matrix correspond to the values of `y`, the
    columns to the values of the respective column of X (the matrix is built
    as an (nx, ny) bincount reshape, transposed to (ny, nx)).

    A subset of columns can be selected by the additional argument `mask`.
    Row weights (`weights`) are not implemented yet — only unit weights are
    accepted.

    The function also returns a count of NaN values per each value of `y`.

    Parameters
    ----------
    X : array_like
        With values in columns.
    y : 1d array
        Vector of true values.
    max_X : int
        The maximal value in the array.
    max_y : int
        The maximal value in `y`.
    weights : array_like, optional
        Row weights; only all-ones (or absent/empty) weights are supported.
    mask : sequence
        Discrete columns of X.

    Returns
    -------
    contingencies: (m × ny × nx) array
        m number of masked (used) columns (all if mask=None), i.e.
        for each column of X;
        ny number of uniques in y,
        nx number of uniques in column of X.
    nans : array_like
        Number of nans in each column of X for each unique value of y.

    Raises
    ------
    ValueError
        If `weights` contains any non-unit weight.
    """
    # BUG FIX: the original guard `np.unique(weights)[0] != 1` inspected only
    # the smallest weight, so e.g. weights == [1, 2] slipped through and was
    # then silently ignored.  Reject any non-unit weight vector instead.
    # (The np.any() guard is kept so absent/empty/all-zero weights still pass,
    # preserving the original's acceptance of those.)
    if weights is not None and np.any(weights) \
            and not np.all(np.asarray(weights) == 1):
        raise ValueError('weights not yet supported')

    was_1d = False
    if X.ndim == 1:
        X = X[..., np.newaxis]
        was_1d = True

    contingencies, nans = [], []
    ny = np.unique(y).size if max_y is None else max_y + 1
    for i in range(X.shape[1]):
        if mask is not None and not mask[i]:
            # Masked-out column: emit empty placeholders of the right shape.
            # NOTE(review): this branch needs max_X — masked columns with
            # max_X=None would raise TypeError; confirm callers always pass it.
            contingencies.append(np.zeros((ny, max_X + 1)))
            nans.append(np.zeros(ny))
            continue
        col = X[..., i]
        nx = np.unique(col[~np.isnan(col)]).size if max_X is None else max_X + 1
        if issparse(col):
            col = np.ravel(col.todense())
        # Encode each (y, col) pair as a single index y + ny*col, histogram
        # it, and unfold into an (nx, ny) matrix; transpose to (ny, nx).
        # NaN pairs are dropped by bincount's NaN handling.
        contingencies.append(
            bincount(y + ny * col,
                     minlength=ny * nx)[0].reshape(nx, ny).T)
        # Count, per y value, the rows where this column is NaN.
        nans.append(
            bincount(y[np.isnan(col)], minlength=ny)[0])
    if was_1d:
        return contingencies[0], nans[0]
    return np.array(contingencies), np.array(nans)
129+
130+
131+
def stats(X, weights=None, compute_variance=False):
    """
    Compute min, max, #nans, mean and variance.

    Result is a tuple (min, max, mean, variance, #nans, #non-nans) or an
    array of shape (len(X), 6).

    When `weights` are given, X is multiplied by them element-wise before
    any statistic is computed.  NOTE(review): this weights min/max as well,
    and the "mean" is the plain mean of the weighted values rather than a
    weighted mean — preserved from the original implementation; confirm
    this is the intended contract.

    Computation of variance requires an additional pass and is not enabled
    by default. Zeros are filled in instead of variance.

    Parameters
    ----------
    X : array_like, 1 or 2 dimensions
        Input array.
    weights : array_like, optional
        Weights, array of the same length as `x`.
    compute_variance : bool, optional
        If set to True, the function also computes variance.

    Returns
    -------
    out : a 6-element tuple or an array of shape (len(x), 6)
        Computed (min, max, mean, variance or 0, #nans, #non-nans)

    Raises
    ------
    ValueError
        If the length of the weight vector does not match the length of the
        array
    """
    X = np.asanyarray(X)
    if weights is not None:
        X = X * weights
    # BUG FIX: the original used X.shape[1] unconditionally, so any 1d
    # input crashed with IndexError (despite the documented "1 or 2
    # dimensions").  A 1d input is treated as a single column.
    ncols = X.shape[1] if X.ndim > 1 else 1
    is_numeric = np.issubdtype(X.dtype, np.number)
    # For non-numeric data, count falsy entries in place of NaNs.
    nans = (np.isnan(X) if is_numeric else ~X.astype(bool)).sum(axis=0)
    if compute_variance and is_numeric:
        variance = np.nanvar(X, axis=0)
    else:
        variance = np.zeros(ncols)
    return np.column_stack((
        np.nanmin(X, axis=0) if is_numeric else np.tile(np.inf, ncols),
        np.nanmax(X, axis=0) if is_numeric else np.tile(-np.inf, ncols),
        np.nanmean(X, axis=0) if is_numeric else np.zeros(ncols),
        variance,
        nans,
        X.shape[0] - nans))

0 commit comments

Comments
 (0)