Skip to content

Commit 94e5261

Browse files
Add meta inference for ParallelPostFit.predict for sparse arrays (#889)
* Added work-around for sparse matrices Co-authored-by: Genevieve Buckley <[email protected]>
1 parent cf24100 commit 94e5261

File tree

3 files changed

+113
-6
lines changed

3 files changed

+113
-6
lines changed

dask_ml/wrappers.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Meta-estimators for parallelizing estimators using the scikit-learn API."""
22
import logging
3+
import warnings
34

45
import dask.array as da
56
import dask.dataframe as dd
@@ -662,9 +663,32 @@ def _get_output_dask_ar_meta_for_estimator(model_fn, estimator, input_dask_ar):
662663
"""
663664
# sklearn fails if the input array has size 0
664665
# It requires at least 1 sample to run successfully
665-
ar = np.zeros(
666-
shape=(1, input_dask_ar.shape[1]),
667-
dtype=input_dask_ar.dtype,
668-
like=input_dask_ar._meta,
669-
)
666+
input_meta = input_dask_ar._meta
667+
if hasattr(input_meta, "__array_function__"):
668+
ar = np.zeros(
669+
shape=(1, input_dask_ar.shape[1]),
670+
dtype=input_dask_ar.dtype,
671+
like=input_meta,
672+
)
673+
elif "scipy.sparse" in type(input_meta).__module__:
674+
# sparse matrices don't support
675+
# `like` due to non-implemented __array_function__
676+
# Refer to https://github.com/scipy/scipy/issues/10362
677+
# Note: the line below works for both cupy and scipy sparse matrices
678+
ar = type(input_meta)((1, input_dask_ar.shape[1]), dtype=input_dask_ar.dtype)
679+
else:
680+
func_name = model_fn.__name__.strip("_")
681+
msg = (
682+
f"Metadata for {func_name} is not provided, so Dask is "
683+
f"running the {func_name} "
684+
"function on a small dataset to guess output metadata. "
685+
"As a result, It is possible that Dask will guess incorrectly.\n"
686+
"To silence this warning, provide explicit "
687+
f"`{func_name}_meta` to the dask_ml.wrapper."
688+
"\nExample: \n"
689+
"wrap_clf = dask_ml.wrappers.Incremental(GradientBoostingClassifier(), "
690+
f"{func_name}_meta = np.array([1],dtype=np.int8))"
691+
)
692+
warnings.warn(msg)
693+
ar = np.zeros(shape=(1, input_dask_ar.shape[1]), dtype=input_dask_ar.dtype)
670694
return model_fn(ar, estimator)

tests/test_incremental.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import sklearn.datasets
88
import sklearn.model_selection
99
from dask.array.utils import assert_eq
10+
from scipy.sparse import csr_matrix
1011
from sklearn.base import clone
1112
from sklearn.linear_model import SGDClassifier, SGDRegressor
1213
from sklearn.pipeline import make_pipeline
@@ -210,3 +211,27 @@ def test_incremental_text_pipeline(container):
210211

211212
X2.compute_chunk_sizes()
212213
assert X2.shape == (300, vect.n_features)
214+
215+
preds = pipe.predict(X).compute()
216+
assert len(y) == len(preds)
217+
218+
219+
def test_incremental_sparse_inputs():
220+
X = csr_matrix((3, 4))
221+
y = np.asarray([0, 1, 1], dtype=np.int32)
222+
223+
X_da = da.from_array(X, chunks=(1, 4))
224+
y_da = da.from_array(y, chunks=(1))
225+
226+
clf = SGDClassifier(tol=1e-3)
227+
wrap_clf = dask_ml.wrappers.Incremental(
228+
SGDClassifier(tol=1e-3), scoring="accuracy", assume_equal_chunks=True,
229+
)
230+
231+
wrap_clf = wrap_clf.fit(X_da, y_da, classes=[0, 1])
232+
wrap_output = wrap_clf.predict(X_da).compute()
233+
234+
clf = clf.fit(X, y)
235+
clf_output = clf.predict(X).astype(np.int64)
236+
237+
assert_eq(clf_output, wrap_output, ignore_dtype=True)

tests/test_parallel_post_fit.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
import pandas as pd
66
import pytest
77
import sklearn.datasets
8+
from scipy.sparse import csr_matrix
89
from sklearn.decomposition import PCA
910
from sklearn.ensemble import GradientBoostingClassifier
10-
from sklearn.linear_model import LinearRegression, LogisticRegression
11+
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
1112
from sklearn.naive_bayes import CategoricalNB
1213
from sklearn.preprocessing import OneHotEncoder
1314

@@ -231,3 +232,60 @@ def test_auto_rechunk():
231232
X = X.rechunk({0: 100, 1: 10})
232233
X._chunks = (tuple(np.nan for _ in X.chunks[0]), X.chunks[1])
233234
clf.predict(X)
235+
236+
237+
def test_sparse_inputs():
238+
X = csr_matrix((3, 4))
239+
y = np.asarray([0, 0, 1], dtype=np.int32)
240+
241+
base = SGDClassifier(tol=1e-3)
242+
base = base.fit(X, y)
243+
244+
wrap = ParallelPostFit(base)
245+
X_da = da.from_array(X, chunks=(1, 4))
246+
247+
result = wrap.predict(X_da).compute()
248+
expected = base.predict(X)
249+
250+
assert_eq_ar(result, expected)
251+
252+
253+
def test_warning_on_dask_array_without_array_function():
254+
X, y = make_classification(n_samples=10, n_features=2, chunks=10)
255+
clf = ParallelPostFit(GradientBoostingClassifier())
256+
clf = clf.fit(X, y)
257+
258+
class FakeArray:
259+
def __init__(self, value):
260+
self.value = value
261+
262+
@property
263+
def ndim(self):
264+
return self.value.ndim
265+
266+
@property
267+
def len(self):
268+
return self.value.len
269+
270+
@property
271+
def dtype(self):
272+
return self.value.dtype
273+
274+
@property
275+
def shape(self):
276+
return self.value.shape
277+
278+
ar = FakeArray(np.zeros(shape=(2, 2)))
279+
fake_dask_ar = da.from_array(ar)
280+
fake_dask_ar._meta = FakeArray(np.zeros(shape=(0, 0)))
281+
282+
with pytest.warns(
283+
UserWarning, match="provide explicit `predict_meta` to the dask_ml.wrapper"
284+
):
285+
clf.predict(fake_dask_ar)
286+
287+
with pytest.warns(
288+
UserWarning,
289+
match="provide explicit `predict_proba_meta` to the dask_ml.wrapper",
290+
):
291+
clf.predict_proba(fake_dask_ar)

0 commit comments

Comments
 (0)