55It also patches bottleneck to contain these functions.
66"""
77from warnings import warn
8- import numpy as np
9- import scipy .sparse as sp
8+
109import bottleneck as bn
10+ import numpy as np
11+ from scipy import sparse as sp
1112
1213
def _count_nans_per_row_sparse(X, weights, dtype=None):
    """Count the (optionally weighted) number of stored NaN values per row.

    Parameters
    ----------
    X : scipy sparse matrix
        Only explicitly stored entries are inspected.
    weights : np.ndarray or None
        If 1-dimensional, it is indexed by the row of each NaN entry; if
        2-dimensional, by the (row, column) position of each NaN entry.
        If None, every NaN counts as 1.
    dtype : dtype, optional
        Data type of the result; float64 is used when None.

    Returns
    -------
    np.ndarray
        One value per row of `X`.
    """
    # COO exposes the row/column of every stored entry directly, so the
    # per-row totals can be accumulated in a single vectorised pass instead of
    # slicing the matrix row by row in Python.
    X = X.tocoo(copy=False)
    nan_mask = np.isnan(X.data)
    nan_rows = X.row[nan_mask]

    if weights is None:
        nan_weights = None
    elif weights.ndim == 1:
        nan_weights = weights[nan_rows]
    else:
        nan_cols = X.col[nan_mask]
        # `ravel` flattens the result in case the indexing returned a matrix
        nan_weights = np.asarray(weights[nan_rows, nan_cols]).ravel()

    counts = np.bincount(nan_rows, weights=nan_weights, minlength=X.shape[0])
    return counts.astype(np.float64 if dtype is None else dtype)
2332
24- def bincount (X , max_val = None , weights = None , minlength = None ):
33+
def sparse_count_implicit_zeros(x):
    """Count the number of implicit zeros in a sparse matrix.

    Implicit zeros are the positions that have no stored entry; explicitly
    stored zeros are part of `x.nnz` and are therefore not counted.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.
    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')

    # Multiply with Python integers: `np.prod` uses a fixed-width integer
    # type and can silently overflow for very large shapes.
    n_items = 1
    for dim in x.shape:
        n_items *= int(dim)
    return n_items - int(x.nnz)
39+
40+
def sparse_has_implicit_zeros(x):
    """Check if sparse matrix contains any implicit zeros.

    Returns True when at least one position of `x` has no stored entry.

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.
    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')

    # Multiply with Python integers: `np.prod` uses a fixed-width integer
    # type and can silently overflow for very large shapes.
    n_items = 1
    for dim in x.shape:
        n_items *= int(dim)
    return n_items != int(x.nnz)
46+
47+
def sparse_implicit_zero_weights(x, weights):
    """Extract the weight values of all implicit zeros in a sparse matrix.

    Parameters
    ----------
    x : scipy sparse matrix
    weights : np.ndarray
        1-dimensional weights, one per row of `x` or, failing that, one per
        column of `x`.

    Returns
    -------
    np.ndarray
        The weight of every position of `x` that has no stored entry, i.e.
        each weight repeated once per implicit zero in its row (or column).

    Raises
    ------
    TypeError
        If `x` is not a scipy sparse matrix.
    ValueError
        If the length of `weights` matches neither axis of `x`.
    NotImplementedError
        If `weights` is not 1-dimensional.
    """
    if not sp.issparse(x):
        raise TypeError('The matrix provided was not sparse.')

    if weights.ndim != 1:
        # Can easily be implemented using a coo_matrix
        raise NotImplementedError(
            'Computing zero weights on ndimensinal weight matrix is not implemented'
        )

    n_rows, n_cols = x.shape
    n_weights = weights.shape[0]
    if n_rows == n_weights:
        # One weight per row: CSC `indices` hold the row of each stored entry
        stored = x.tocsc().indices
        line_length = n_cols
    elif n_cols == n_weights:
        # One weight per column: CSR `indices` hold the column of each entry
        stored = x.tocsr().indices
        line_length = n_rows
    else:
        raise ValueError(
            'The length of weights must match one of the axes of the matrix'
        )

    # Every row (or column) has `line_length` positions; those without a
    # stored entry are the implicit zeros. Assumes `x` holds no duplicate
    # entries for the same position.
    stored_per_line = np.bincount(stored, minlength=n_weights)
    return np.repeat(weights, line_length - stored_per_line)
67+
68+
def bincount(x, weights=None, max_val=None, minlength=None):
    """Return counts of values in array X.

    Works kind of like np.bincount(), except that it also supports floating
    arrays with nans and scipy sparse matrices.

    Parameters
    ----------
    x : array_like, 1 dimension, nonnegative ints
        Input array.
    weights : array_like, optional
        Weights, array of the same shape as x.
    max_val : int, optional
        Indicates the maximum value we expect to find in X and sets the result
        array size accordingly. E.g. if we set `max_val=2` yet the largest
        value in X is 1, the result will contain a bin for the value 2, and
        will be set to 0. See examples for usage.
    minlength : int, optional
        A minimum number of bins for the output array. See numpy docs for info.
        Takes precedence over `max_val`; a value <= 0 yields an empty result.

    Returns
    -------
    Tuple[np.ndarray, int]
        Returns the bincounts and the number of NaN values.

    Examples
    --------
    In case `max_val` is provided, the return shape includes bins for these
    values as well, even if they do not appear in the data. However, this will
    not truncate the bincount if values larger than `max_val` are found.
    >>> bincount([0, 0, 1, 1, 2], max_val=4)
    (array([2., 2., 1., 0., 0.]), 0.0)
    >>> bincount([0, 1, 2, 3, 4], max_val=2)
    (array([1., 1., 1., 1., 1.]), 0.0)

    """
    # Remember whether the input was sparse before `x` is replaced by its
    # stored values below
    is_sparse = sp.issparse(x)
    zero_weights = 0
    if is_sparse:
        if weights is not None:
            # Match weights and x axis so `indices` will be set appropriately
            if x.shape[0] == weights.shape[0]:
                x = x.tocsc()
            elif x.shape[1] == weights.shape[0]:
                x = x.tocsr()

            zero_weights = sparse_implicit_zero_weights(x, weights).sum()
            weights = weights[x.indices]
        else:
            zero_weights = sparse_count_implicit_zeros(x)

        x = x.data

    x = np.asanyarray(x)
    if x.dtype.kind == 'f' and bn.anynan(x):
        nonnan = ~np.isnan(x)
        x = x[nonnan]
        if weights is not None:
            nans = (~nonnan * weights).sum(axis=0)
            weights = weights[nonnan]
        else:
            nans = (~nonnan).sum(axis=0)
    else:
        nans = 0. if x.ndim == 1 else np.zeros(x.shape[1], dtype=float)

    if minlength is None and max_val is not None:
        minlength = max_val + 1

    if minlength is not None and minlength <= 0:
        return np.array([]), nans

    # `np.bincount` does not accept `minlength=None`, so fall back to 0
    n_bins = 0 if minlength is None else minlength
    if is_sparse:
        # The zero bin must exist even when the matrix stores no values
        n_bins = max(n_bins, 1)

    bc = np.bincount(
        x.astype(np.int32, copy=False), weights=weights, minlength=n_bins
    ).astype(float)
    # Since the `data` of a sparse matrix only contains non-zero values or
    # explicit zeros, we must count implicit zeros separately and add them to
    # the explicit ones found before
    if is_sparse:
        bc[0] += zero_weights

    return bc, nans
55150
56151
def countnans(x, weights=None, axis=None, dtype=None, keepdims=False):
    """
    Count the undefined elements in an array along given axis.

    Parameters
    ----------
    x : array_like or scipy sparse matrix
    weights : array_like, optional
        Weights to weight the nans with, before or after counting (depending
        on the weights shape).
    axis : int, optional
        For sparse input only None, 0 and 1 are supported.
    dtype : dtype, optional
        The data type of the returned array.
    keepdims : bool, optional
        Passed to the reduction for dense input; the sparse branch does not
        use it.

    Returns
    -------
    Union[np.ndarray, float]

    """
    # `weights` is documented as array_like, but `.shape` and `.T` are used
    # below, so plain sequences are converted first
    if weights is not None and not sp.issparse(weights):
        weights = np.asanyarray(weights)

    if not sp.issparse(x):
        x = np.asanyarray(x)
        isnan = np.isnan(x)
        # Same-shaped weights are applied element-wise before counting ...
        if weights is not None and weights.shape == x.shape:
            isnan = isnan * weights

        counts = isnan.sum(axis=axis, dtype=dtype, keepdims=keepdims)
        # ... any other shape scales the already reduced counts
        if weights is not None and weights.shape != x.shape:
            counts = counts * weights
    else:
        # NOTE: kept as an assert so the raised exception type is unchanged;
        # be aware that asserts are skipped when Python runs with -O
        assert axis in [None, 0, 1], 'Only axis 0 and 1 are currently supported'
        # To have consistent behaviour with dense matrices, raise error when
        # `axis=1` and the array is 1d (e.g. [[1 2 3]])
        if x.shape[0] == 1 and axis == 1:
            raise ValueError('Axis %d is out of bounds' % axis)

        # The helper counts per row, so transpose unless rows are requested
        arr = x if axis == 1 else x.T

        if weights is not None:
            weights = weights if axis == 1 else weights.T

        arr = arr.tocsr()
        counts = _count_nans_per_row_sparse(arr, weights, dtype=dtype)

        # We want a scalar value if `axis=None` or if the sparse matrix is
        # actually a vector (e.g. [[1 2 3]]), but has `ndim=2` due to scipy
        # implementation
        if axis is None or x.shape[0] == 1:
            counts = counts.sum(dtype=dtype)

    return counts
88202
89203
@@ -234,17 +348,12 @@ def weighted_mean():
234348 X .shape [0 ] - nans ))
235349
236350
237- def _sparse_has_zeros (x ):
238- """ Check if sparse matrix contains any implicit zeros. """
239- return np .prod (x .shape ) != x .nnz
240-
241-
242351def _nan_min_max (x , func , axis = 0 ):
243352 if not sp .issparse (x ):
244353 return func (x , axis = axis )
245354 if axis is None :
246355 extreme = func (x .data , axis = axis ) if x .nnz else float ('nan' )
247- if _sparse_has_zeros (x ):
356+ if sparse_has_implicit_zeros (x ):
248357 extreme = func ([0 , extreme ])
249358 return extreme
250359 if axis == 0 :
@@ -257,7 +366,7 @@ def _nan_min_max(x, func, axis=0):
257366 for row in x :
258367 values = row .data
259368 extreme = func (values ) if values .size else float ('nan' )
260- if _sparse_has_zeros (row ):
369+ if sparse_has_implicit_zeros (row ):
261370 extreme = func ([0 , extreme ])
262371 r .append (extreme )
263372 return np .array (r )
@@ -323,7 +432,7 @@ def unique(x, return_counts=False):
323432 if not sp .issparse (x ):
324433 return np .unique (x , return_counts = return_counts )
325434
326- implicit_zeros = np . prod ( x . shape ) - x . nnz
435+ implicit_zeros = sparse_count_implicit_zeros ( x )
327436 explicit_zeros = not np .all (x .data )
328437 r = np .unique (x .data , return_counts = return_counts )
329438 if not implicit_zeros :
0 commit comments