Skip to content

Commit fa40fa3

Browse files
authored
Update coefficient assignment (#914)
1 parent 7b8c8e0 commit fa40fa3

File tree

4 files changed

+126
-5
lines changed

4 files changed

+126
-5
lines changed

dask_ml/linear_model/glm.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ def fit(self, X, y=None):
191191
self.intercept_ = self._coef[-1]
192192
else:
193193
self.coef_ = self._coef
194+
self.intercept_ = 0.0
194195
return self
195196

196197
def _check_array(self, X):

dask_ml/linear_model/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,12 @@ def log1p(A):
2828

2929
@dispatch(np.ndarray)
3030
def add_intercept(X):
31-
return np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)
31+
return _add_intercept(X)
3232

3333

3434
def _add_intercept(x):
3535
ones = np.ones((x.shape[0], 1), dtype=x.dtype)
36-
return np.concatenate([ones, x], axis=1)
36+
return np.concatenate([x, ones], axis=1)
3737

3838

3939
@dispatch(da.Array) # noqa: F811

tests/conftest.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,37 @@ def scheduler(request):
146146
yield cluster
147147
else:
148148
yield not_cluster
149+
150+
151+
@pytest.fixture
152+
def medium_size_regression():
153+
"""X, y pair for regression with N >> p.
154+
155+
There are many more samples in this problem than there are
156+
features. Useful for testing stability of solutions.
157+
"""
158+
X, y = make_regression(
159+
chunks=100, n_samples=500, n_features=100, n_informative=100, random_state=0
160+
)
161+
return X, y
162+
163+
164+
@pytest.fixture
165+
def medium_size_counts():
166+
"""X, y pair for classification with N >> p.
167+
168+
The samples outnumber the total features, leading to
169+
greater stability of the solutions. Useful for testing
170+
the accuracy of solvers.
171+
"""
172+
sample_size = 500
173+
n_features = 100
174+
X, y = make_counts(
175+
chunks=100,
176+
n_samples=sample_size,
177+
n_features=n_features,
178+
n_informative=n_features,
179+
random_state=0,
180+
scale=1 / np.sqrt(n_features),
181+
)
182+
return X, y

tests/linear_model/test_glm.py

Lines changed: 89 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
import dask.array as da
22
import dask.dataframe as dd
33
import numpy as np
4+
import numpy.linalg as LA
45
import pandas as pd
56
import pytest
7+
import sklearn.linear_model
68
from dask.dataframe.utils import assert_eq
79
from dask_glm.regularizers import Regularizer
810
from sklearn.pipeline import make_pipeline
911

12+
import dask_ml.linear_model
1013
from dask_ml.datasets import make_classification, make_counts, make_regression
1114
from dask_ml.linear_model import LinearRegression, LogisticRegression, PoissonRegression
1215
from dask_ml.linear_model.utils import add_intercept
@@ -39,9 +42,6 @@ def get_params(self, deep=True):
3942
return {}
4043

4144

42-
X, y = make_classification(chunks=50)
43-
44-
4545
def test_lr_init(solver):
4646
LogisticRegression(solver=solver)
4747

@@ -174,6 +174,15 @@ def test_add_intercept_raises_chunks():
174174
assert m.match("Chunking is only allowed")
175175

176176

177+
def test_add_intercept_ordering():
178+
"""Tests that add_intercept gives same result for dask / numpy objects"""
179+
X_np = np.arange(100).reshape(20, 5)
180+
X_da = da.from_array(X_np, chunks=(20, 5))
181+
np_result = add_intercept(X_np)
182+
da_result = add_intercept(X_da)
183+
da.utils.assert_eq(np_result, da_result)
184+
185+
177186
def test_lr_score():
178187
X = da.from_array(np.arange(1000).reshape(1000, 1))
179188
lr = LinearRegression()
@@ -203,3 +212,80 @@ def test_logistic_predict_proba_shape():
203212
lr.fit(X, y)
204213
prob = lr.predict_proba(X)
205214
assert prob.shape == (100, 2)
215+
216+
217+
@pytest.mark.parametrize(
218+
"est,data",
219+
[
220+
(LinearRegression, "single_chunk_regression"),
221+
(LogisticRegression, "single_chunk_classification"),
222+
(PoissonRegression, "single_chunk_count_classification"),
223+
],
224+
)
225+
def test_model_coef_dask_numpy(est, data, request):
226+
"""Tests that models return same coefficients and intercepts with array types"""
227+
X, y = request.getfixturevalue(data)
228+
np_mod, da_mod = est(fit_intercept=True), est(fit_intercept=True)
229+
da_mod.fit(X, y)
230+
np_mod.fit(X.compute(), y.compute())
231+
da_coef = np.hstack((da_mod.coef_, da_mod.intercept_))
232+
np_coef = np.hstack((np_mod.coef_, np_mod.intercept_))
233+
234+
rel_error = LA.norm(da_coef - np_coef) / LA.norm(np_coef)
235+
assert rel_error < 1e-8
236+
237+
238+
# fmt: off
239+
@pytest.mark.parametrize("solver", ["newton", "lbfgs"])
240+
@pytest.mark.parametrize("fit_intercept", [True, False])
241+
@pytest.mark.parametrize(
242+
"est, skl_params, data_generator",
243+
[
244+
("LinearRegression", {}, "medium_size_regression"),
245+
("LogisticRegression", {"penalty": "none"}, "single_chunk_classification"),
246+
("PoissonRegression", {"alpha": 0}, "medium_size_counts"),
247+
],
248+
)
249+
def test_model_against_sklearn(
250+
est, skl_params, data_generator, fit_intercept, solver, request
251+
):
252+
"""
253+
Test accuracy of model predictions and coefficients.
254+
255+
All tests of model coefficients are done via relative error, the
256+
standard for optimization proofs, and by the numpy utility
257+
``np.testing.assert_allclose``. This ensures that the model coefficients
258+
match up with SK Learn.
259+
"""
260+
X, y = request.getfixturevalue(data_generator)
261+
262+
# sklearn uses 'PoissonRegressor' while dask-ml uses 'PoissonRegression'
263+
assert est in ["LinearRegression", "LogisticRegression", "PoissonRegression"]
264+
EstDask = getattr(dask_ml.linear_model, est)
265+
EstSklearn = getattr(
266+
sklearn.linear_model, est if "Poisson" not in est else "PoissonRegressor"
267+
)
268+
269+
dask_ml_model = EstDask(
270+
fit_intercept=fit_intercept, solver=solver, penalty="l2", C=1e8, max_iter=500
271+
)
272+
dask_ml_model.fit(X, y)
273+
274+
# skl_model has to be fit with numpy data
275+
skl_model = EstSklearn(fit_intercept=fit_intercept, **skl_params)
276+
skl_model.fit(X.compute(), y.compute())
277+
278+
# test coefficients
279+
est, truth = np.hstack((dask_ml_model.intercept_, dask_ml_model.coef_)), np.hstack(
280+
(skl_model.intercept_, skl_model.coef_.flatten())
281+
)
282+
rel_error = LA.norm(est - truth) / LA.norm(truth)
283+
assert rel_error < 1e-3
284+
285+
np.testing.assert_allclose(truth, est, rtol=1e-3, atol=2e-4)
286+
287+
# test predictions
288+
skl_preds = skl_model.predict(X.compute())
289+
dml_preds = dask_ml_model.predict(X)
290+
291+
np.testing.assert_allclose(skl_preds, dml_preds, rtol=1e-3, atol=2e-3)

0 commit comments

Comments
 (0)