Skip to content

Commit f415742

Browse files
authored
Merge pull request #1413 from kernc/bottlechest
Bottlechest, Bottleneck (v2)
2 parents 7d65e24 + 625c165 commit f415742

File tree

17 files changed

+328
-64
lines changed

17 files changed

+328
-64
lines changed

Orange/base.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22

33
import numpy as np
44
import scipy
5-
import bottlechest as bn
65

76
from Orange.data import Table, Storage, Instance, Value
87
from Orange.preprocess import (RemoveNaNClasses, Continuize,
98
RemoveNaNColumns, SklImpute)
109
from Orange.misc.wrapper_meta import WrapperMeta
10+
from Orange.data.util import one_hot
1111

1212
__all__ = ["Learner", "Model", "SklLearner", "SklModel"]
1313

@@ -157,11 +157,9 @@ def __call__(self, data, ret=Value):
157157
for c in self.domain.class_vars)
158158
probs = np.zeros(value.shape + (max_card,), float)
159159
for i, cvar in enumerate(self.domain.class_vars):
160-
probs[:, i, :], _ = bn.bincount(np.atleast_2d(value[:, i]),
161-
max_card - 1)
160+
probs[:, i, :] = one_hot(value[:, i])
162161
else:
163-
probs, _ = bn.bincount(np.atleast_2d(value),
164-
len(self.domain.class_var.values) - 1)
162+
probs = one_hot(value)
165163
if ret == Model.ValueProbs:
166164
return value, probs
167165
else:

Orange/data/filter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from ..misc.enum import Enum
77
import numpy as np
8-
import bottlechest as bn
8+
import bottleneck as bn
99
from Orange.data import Instance, Storage, Variable
1010

1111

Orange/data/io.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from urllib.parse import urlparse, unquote as urlunquote
1919
from urllib.request import urlopen
2020

21-
import bottlechest as bn
21+
import bottleneck as bn
2222
import numpy as np
2323
from chardet.universaldetector import UniversalDetector
2424

Orange/data/table.py

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313
from urllib.request import urlopen
1414
from urllib.error import URLError
1515

16-
import bottlechest as bn
16+
import bottleneck as bn
1717
from scipy import sparse as sp
1818

19+
from Orange.statistics.util import bincount, countnans, contingency, stats as fast_stats
1920
from .instance import *
2021
from Orange.util import flatten
2122
from Orange.data import Domain, Variable, StringVariable
@@ -935,12 +936,7 @@ def __determine_density(data):
935936
if data is None:
936937
return Storage.Missing
937938
if data is not None and sp.issparse(data):
938-
try:
939-
if bn.bincount(data.data, 1)[0][0] == 0:
940-
return Storage.SPARSE_BOOL
941-
except ValueError as e:
942-
pass
943-
return Storage.SPARSE
939+
return Storage.SPARSE_BOOL if (data.data == 1).all() else Storage.SPARSE
944940
else:
945941
return Storage.DENSE
946942

@@ -1212,19 +1208,19 @@ def _compute_basic_stats(self, columns=None,
12121208
stats = []
12131209
if not columns:
12141210
if self.domain.attributes:
1215-
rr.append(bn.stats(self.X, W))
1211+
rr.append(fast_stats(self.X, W))
12161212
if self.domain.class_vars:
1217-
rr.append(bn.stats(self._Y, W))
1213+
rr.append(fast_stats(self._Y, W))
12181214
if include_metas and self.domain.metas:
1219-
rr.append(bn.stats(self.metas, W))
1215+
rr.append(fast_stats(self.metas, W))
12201216
if len(rr):
12211217
stats = np.vstack(tuple(rr))
12221218
else:
12231219
columns = [self.domain.index(c) for c in columns]
12241220
nattrs = len(self.domain.attributes)
1225-
Xs = any(0 <= c < nattrs for c in columns) and bn.stats(self.X, W)
1226-
Ys = any(c >= nattrs for c in columns) and bn.stats(self._Y, W)
1227-
ms = any(c < 0 for c in columns) and bn.stats(self.metas, W)
1221+
Xs = any(0 <= c < nattrs for c in columns) and fast_stats(self.X, W)
1222+
Ys = any(c >= nattrs for c in columns) and fast_stats(self._Y, W)
1223+
ms = any(c < 0 for c in columns) and fast_stats(self.metas, W)
12281224
for column in columns:
12291225
if 0 <= column < nattrs:
12301226
stats.append(Xs[column, :])
@@ -1271,19 +1267,19 @@ def _get_matrix(M, cachedM, col):
12711267
if var.is_discrete:
12721268
if W is not None:
12731269
W = W.ravel()
1274-
dist, unknowns = bn.bincount(m, len(var.values) - 1, W)
1270+
dist, unknowns = bincount(m, len(var.values) - 1, W)
12751271
elif not len(m):
12761272
dist, unknowns = np.zeros((2, 0)), 0
12771273
else:
12781274
if W is not None:
12791275
ranks = np.argsort(m)
12801276
vals = np.vstack((m[ranks], W[ranks].flatten()))
1281-
unknowns = bn.countnans(m, W)
1277+
unknowns = countnans(m, W)
12821278
else:
12831279
vals = np.ones((2, m.shape[0]))
12841280
vals[0, :] = m
12851281
vals[0, :].sort()
1286-
unknowns = bn.countnans(m.astype(float))
1282+
unknowns = countnans(m.astype(float))
12871283
dist = np.array(_valuecount.valuecount(vals))
12881284
distributions.append((dist, unknowns))
12891285

@@ -1329,7 +1325,7 @@ def _compute_contingency(self, col_vars=None, row_var=None):
13291325
if row_data.dtype.kind != "f": #meta attributes can be stored as type object
13301326
row_data = row_data.astype(float)
13311327

1332-
unknown_rows = bn.countnans(row_data)
1328+
unknown_rows = countnans(row_data)
13331329
if unknown_rows:
13341330
nan_inds = np.isnan(row_data)
13351331
row_data = row_data[~nan_inds]
@@ -1355,13 +1351,13 @@ def _compute_contingency(self, col_vars=None, row_var=None):
13551351
max_vals = max(len(v[2].values) for v in disc_vars)
13561352
disc_indi = {i for _, i, _ in disc_vars}
13571353
mask = [i in disc_indi for i in range(arr.shape[1])]
1358-
conts, nans = bn.contingency(arr, row_data, max_vals - 1,
1359-
n_rows - 1, W, mask)
1354+
conts, nans = contingency(arr, row_data, max_vals - 1,
1355+
n_rows - 1, W, mask)
13601356
for col_i, arr_i, _ in disc_vars:
13611357
contingencies[col_i] = (conts[arr_i], nans[arr_i])
13621358
else:
13631359
for col_i, arr_i, var in disc_vars:
1364-
contingencies[col_i] = bn.contingency(
1360+
contingencies[col_i] = contingency(
13651361
arr[:, arr_i].astype(float),
13661362
row_data, len(var.values) - 1, n_rows - 1, W)
13671363

Orange/data/util.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""
2+
Data-manipulation utilities.
3+
"""
4+
import numpy as np
5+
import bottleneck as bn
6+
7+
8+
def one_hot(values, dtype=float, dim=None):
    """Return a one-hot transform of values.

    Parameters
    ----------
    values : 1d array
        Integer values (hopefully 0-max).
    dtype : dtype, optional
        Dtype of the output indicator array.
    dim : int, optional
        Number of indicator columns. Defaults to ``max(values) + 1``
        (0 for empty input).  Passing it explicitly allows a fixed
        width independent of which values actually occur.

    Returns
    -------
    result
        2d array with ones in respective indicator columns.

    Raises
    ------
    IndexError
        If any value is negative or >= `dim`.
    """
    # Convert up front so np.max yields an int (a float input would
    # otherwise produce a float size for np.eye) and so an empty input
    # no longer crashes np.max with "zero-size array" (BUG FIX).
    values = np.asanyarray(values, dtype=int)
    if dim is None:
        dim = 0 if values.size == 0 else np.max(values) + 1
    # Row i of eye(dim) is the indicator vector for value i; fancy
    # indexing picks one such row per element of `values`.
    return np.eye(dim, dtype=dtype)[values]
22+
23+
24+
def scale(values, min=0, max=1):
    """Return values linearly rescaled to the interval [min, max].

    NaNs are ignored when locating the data range and propagate through
    the rescaling arithmetic.  If all non-NaN values are equal, the data
    cannot be rescaled, so values are merely clipped into [min, max].

    Parameters
    ----------
    values : array_like
        Numeric data to rescale.
    min, max : number, optional
        Bounds of the target interval.  (The names shadow builtins, but
        are kept for backward compatibility with keyword callers.)
    """
    # BUG FIX: np.float_ was removed in NumPy 2.0 — use float() instead.
    # np.nanmin/np.nanmax are the standard equivalents of the bottleneck
    # calls used previously.
    minval = float(np.nanmin(values))
    ptp = np.nanmax(values) - minval
    if ptp == 0:
        # Degenerate range: everything equal — just clamp into the target.
        return np.clip(values, min, max)
    return (values - minval) / ptp * (max - min) + min

Orange/preprocess/preprocess.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66
import numpy as np
77
import sklearn.preprocessing as skl_preprocessing
8-
import bottlechest
8+
import bottleneck as bn
99

1010
import Orange.data
1111
from Orange.data import Table
@@ -198,8 +198,8 @@ def __call__(self, data):
198198
data : an input data set
199199
"""
200200

201-
oks = bottlechest.nanmin(data.X, axis=0) != \
202-
bottlechest.nanmax(data.X, axis=0)
201+
oks = bn.nanmin(data.X, axis=0) != \
202+
bn.nanmax(data.X, axis=0)
203203
atts = [data.domain.attributes[i] for i, ok in enumerate(oks) if ok]
204204
domain = Orange.data.Domain(atts, data.domain.class_vars,
205205
data.domain.metas)

Orange/statistics/util.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
"""
2+
This module provides alternatives for the few additional functions found in
3+
and once used from the bottlechest package (fork of bottleneck).
4+
5+
It also patches bottleneck to contain these functions.
6+
"""
7+
import numpy as np
8+
from scipy.sparse import issparse
9+
import bottleneck as bn
10+
11+
12+
def bincount(X, max_val=None, weights=None, minlength=None):
    """Return counts of values in array X.

    Works kind of like np.bincount(), except that it also supports floating
    arrays with nans: NaN entries are excluded from the counts and reported
    separately.

    Parameters
    ----------
    X : array_like
        Non-negative integers, possibly stored as floats with NaNs.
    max_val : int, optional
        Largest expected value; the result is padded to ``max_val + 1``
        bins.  Ignored when `minlength` is given explicitly.
    weights : array_like, optional
        Per-element weights, same length as X.
    minlength : int, optional
        Minimum number of bins, as in np.bincount().

    Returns
    -------
    (counts, nans) : tuple
        counts : np.bincount of the non-NaN values.
        nans : number of NaNs — a scalar for 1d X, per-column for 2d X.
    """
    X = np.asanyarray(X)
    if X.dtype.kind == 'f':
        nonnan = ~np.isnan(X)
        has_nans = not nonnan.all()
    else:
        has_nans = False
    if has_nans:
        nans = (~nonnan).sum(axis=0)
        # NOTE(review): boolean indexing flattens 2d input here, so 2d
        # callers are expected not to pass NaNs — confirm against callers.
        X = X[nonnan]
        if weights is not None:
            # Accept list-typed weights too (a plain list does not support
            # boolean-mask indexing).
            weights = np.asanyarray(weights)[nonnan]
    else:
        nans = 0 if X.ndim == 1 else np.zeros(X.shape[1])
    if minlength is None:
        # BUG FIX: modern np.bincount raises TypeError on minlength=None;
        # default to max_val + 1 bins, or 0 when neither argument is given.
        minlength = 0 if max_val is None else max_val + 1
    return (np.bincount(X.astype(np.int32, copy=False),
                        weights=weights,
                        minlength=minlength),
            nans)
33+
34+
35+
def countnans(X, weights=None, axis=None, dtype=None, keepdims=False):
    """
    Count the NaN elements of X along the given axis.

    Parameters
    ----------
    X : array_like
    weights : array_like
        Weights applied to the NaN counts.  If shaped like X, each NaN
        contributes its own weight before summing; otherwise the summed
        counts are multiplied by the weights afterwards.

    Returns
    -------
    counts
    """
    X = np.asanyarray(X)
    nan_mask = np.isnan(X)
    elementwise = weights is not None and weights.shape == X.shape
    if elementwise:
        # Weight each NaN individually before reducing.
        nan_mask = nan_mask * weights
    total = nan_mask.sum(axis=axis, dtype=dtype, keepdims=keepdims)
    if weights is not None and not elementwise:
        # Weights did not match X's shape: scale the reduced counts instead.
        total = total * weights
    return total
58+
59+
60+
def contingency(X, y, max_X=None, max_y=None, weights=None, mask=None):
    """
    Compute the contingency matrices for each column of X (excluding the masked)
    versus the vector y.

    If the array is 1-dimensional, a 2d contingency matrix is returned. If the
    array is 2d, the function returns a 3d array, with the first dimension
    corresponding to column index (variable in the input array).

    The rows of each contingency matrix correspond to the values of `y`, the
    columns to the values of the respective column of X (the matrix is built
    as an (nx, ny) bincount reshape, transposed to (ny, nx)).

    A subset of columns can be selected by the additional argument `mask`.
    Row weights (`weights`) are not implemented yet — only unit weights are
    accepted.

    The function also returns a count of NaN values per each value of `y`.

    Parameters
    ----------
    X : array_like
        With values in columns.
    y : 1d array
        Vector of true values.
    max_X : int
        The maximal value in the array.
    max_y : int
        The maximal value in `y`.
    weights : array_like, optional
        Row weights; only all-ones (or absent/empty) weights are supported.
    mask : sequence
        Discrete columns of X.

    Returns
    -------
    contingencies: (m × ny × nx) array
        m number of masked (used) columns (all if mask=None), i.e.
        for each column of X;
        ny number of uniques in y,
        nx number of uniques in column of X.
    nans : array_like
        Number of nans in each column of X for each unique value of y.

    Raises
    ------
    ValueError
        If `weights` contains any non-unit weight.
    """
    # BUG FIX: the original guard `np.unique(weights)[0] != 1` inspected only
    # the smallest weight, so e.g. weights == [1, 2] slipped through and was
    # then silently ignored.  Reject any non-unit weight vector instead.
    # (The np.any() guard is kept so absent/empty/all-zero weights still pass,
    # preserving the original's acceptance of those.)
    if weights is not None and np.any(weights) \
            and not np.all(np.asarray(weights) == 1):
        raise ValueError('weights not yet supported')

    was_1d = False
    if X.ndim == 1:
        X = X[..., np.newaxis]
        was_1d = True

    contingencies, nans = [], []
    ny = np.unique(y).size if max_y is None else max_y + 1
    for i in range(X.shape[1]):
        if mask is not None and not mask[i]:
            # Masked-out column: emit empty placeholders of the right shape.
            # NOTE(review): this branch needs max_X — masked columns with
            # max_X=None would raise TypeError; confirm callers always pass it.
            contingencies.append(np.zeros((ny, max_X + 1)))
            nans.append(np.zeros(ny))
            continue
        col = X[..., i]
        nx = np.unique(col[~np.isnan(col)]).size if max_X is None else max_X + 1
        if issparse(col):
            col = np.ravel(col.todense())
        # Encode each (y, col) pair as a single index y + ny*col, histogram
        # it, and unfold into an (nx, ny) matrix; transpose to (ny, nx).
        # NaN pairs are dropped by bincount's NaN handling.
        contingencies.append(
            bincount(y + ny * col,
                     minlength=ny * nx)[0].reshape(nx, ny).T)
        # Count, per y value, the rows where this column is NaN.
        nans.append(
            bincount(y[np.isnan(col)], minlength=ny)[0])
    if was_1d:
        return contingencies[0], nans[0]
    return np.array(contingencies), np.array(nans)
129+
130+
131+
def stats(X, weights=None, compute_variance=False):
    """
    Compute min, max, #nans, mean and variance.

    Result is a tuple (min, max, mean, variance, #nans, #non-nans) or an
    array of shape (len(X), 6).

    When `weights` are given, X is multiplied by them element-wise before
    any statistic is computed.  NOTE(review): this weights min/max as well,
    and the "mean" is the plain mean of the weighted values rather than a
    weighted mean — preserved from the original implementation; confirm
    this is the intended contract.

    Computation of variance requires an additional pass and is not enabled
    by default. Zeros are filled in instead of variance.

    Parameters
    ----------
    X : array_like, 1 or 2 dimensions
        Input array.
    weights : array_like, optional
        Weights, array of the same length as `x`.
    compute_variance : bool, optional
        If set to True, the function also computes variance.

    Returns
    -------
    out : a 6-element tuple or an array of shape (len(x), 6)
        Computed (min, max, mean, variance or 0, #nans, #non-nans)

    Raises
    ------
    ValueError
        If the length of the weight vector does not match the length of the
        array
    """
    X = np.asanyarray(X)
    if weights is not None:
        X = X * weights
    # BUG FIX: the original used X.shape[1] unconditionally, so any 1d
    # input crashed with IndexError (despite the documented "1 or 2
    # dimensions").  A 1d input is treated as a single column.
    ncols = X.shape[1] if X.ndim > 1 else 1
    is_numeric = np.issubdtype(X.dtype, np.number)
    # For non-numeric data, count falsy entries in place of NaNs.
    nans = (np.isnan(X) if is_numeric else ~X.astype(bool)).sum(axis=0)
    if compute_variance and is_numeric:
        variance = np.nanvar(X, axis=0)
    else:
        variance = np.zeros(ncols)
    return np.column_stack((
        np.nanmin(X, axis=0) if is_numeric else np.tile(np.inf, ncols),
        np.nanmax(X, axis=0) if is_numeric else np.tile(-np.inf, ncols),
        np.nanmean(X, axis=0) if is_numeric else np.zeros(ncols),
        variance,
        nans,
        X.shape[0] - nans))

0 commit comments

Comments
 (0)