|
1 | 1 | import dask.array as da |
2 | 2 | import dask.dataframe as dd |
3 | 3 | import numpy as np |
| 4 | +import numpy.linalg as LA |
4 | 5 | import pandas as pd |
5 | 6 | import pytest |
| 7 | +import sklearn.linear_model |
6 | 8 | from dask.dataframe.utils import assert_eq |
7 | 9 | from dask_glm.regularizers import Regularizer |
8 | 10 | from sklearn.pipeline import make_pipeline |
9 | 11 |
|
| 12 | +import dask_ml.linear_model |
10 | 13 | from dask_ml.datasets import make_classification, make_counts, make_regression |
11 | 14 | from dask_ml.linear_model import LinearRegression, LogisticRegression, PoissonRegression |
12 | 15 | from dask_ml.linear_model.utils import add_intercept |
@@ -39,9 +42,6 @@ def get_params(self, deep=True): |
39 | 42 | return {} |
40 | 43 |
|
41 | 44 |
|
42 | | -X, y = make_classification(chunks=50) |
43 | | - |
44 | | - |
45 | 45 | def test_lr_init(solver): |
46 | 46 | LogisticRegression(solver=solver) |
47 | 47 |
|
@@ -174,6 +174,15 @@ def test_add_intercept_raises_chunks(): |
174 | 174 | assert m.match("Chunking is only allowed") |
175 | 175 |
|
176 | 176 |
|
| 177 | +def test_add_intercept_ordering(): |
| 178 | + """Tests that add_intercept gives same result for dask / numpy objects""" |
| 179 | + X_np = np.arange(100).reshape(20, 5) |
| 180 | + X_da = da.from_array(X_np, chunks=(20, 5)) |
| 181 | + np_result = add_intercept(X_np) |
| 182 | + da_result = add_intercept(X_da) |
| 183 | + da.utils.assert_eq(np_result, da_result) |
| 184 | + |
| 185 | + |
177 | 186 | def test_lr_score(): |
178 | 187 | X = da.from_array(np.arange(1000).reshape(1000, 1)) |
179 | 188 | lr = LinearRegression() |
@@ -203,3 +212,80 @@ def test_logistic_predict_proba_shape(): |
203 | 212 | lr.fit(X, y) |
204 | 213 | prob = lr.predict_proba(X) |
205 | 214 | assert prob.shape == (100, 2) |
| 215 | + |
| 216 | + |
| 217 | +@pytest.mark.parametrize( |
| 218 | + "est,data", |
| 219 | + [ |
| 220 | + (LinearRegression, "single_chunk_regression"), |
| 221 | + (LogisticRegression, "single_chunk_classification"), |
| 222 | + (PoissonRegression, "single_chunk_count_classification"), |
| 223 | + ], |
| 224 | +) |
| 225 | +def test_model_coef_dask_numpy(est, data, request): |
| 226 | + """Tests that models return same coefficients and intercepts with array types""" |
| 227 | + X, y = request.getfixturevalue(data) |
| 228 | + np_mod, da_mod = est(fit_intercept=True), est(fit_intercept=True) |
| 229 | + da_mod.fit(X, y) |
| 230 | + np_mod.fit(X.compute(), y.compute()) |
| 231 | + da_coef = np.hstack((da_mod.coef_, da_mod.intercept_)) |
| 232 | + np_coef = np.hstack((np_mod.coef_, np_mod.intercept_)) |
| 233 | + |
| 234 | + rel_error = LA.norm(da_coef - np_coef) / LA.norm(np_coef) |
| 235 | + assert rel_error < 1e-8 |
| 236 | + |
| 237 | + |
| 238 | +# fmt: off |
| 239 | +@pytest.mark.parametrize("solver", ["newton", "lbfgs"]) |
| 240 | +@pytest.mark.parametrize("fit_intercept", [True, False]) |
| 241 | +@pytest.mark.parametrize( |
| 242 | + "est, skl_params, data_generator", |
| 243 | + [ |
| 244 | + ("LinearRegression", {}, "medium_size_regression"), |
| 245 | + ("LogisticRegression", {"penalty": "none"}, "single_chunk_classification"), |
| 246 | + ("PoissonRegression", {"alpha": 0}, "medium_size_counts"), |
| 247 | + ], |
| 248 | +) |
| 249 | +def test_model_against_sklearn( |
| 250 | + est, skl_params, data_generator, fit_intercept, solver, request |
| 251 | +): |
| 252 | + """ |
| 253 | + Test accuracy of model predictions and coefficients. |
| 254 | +
|
| 255 | + All tests of model coefficients are done via relative error, the |
| 256 | + standard for optimization proofs, and by the numpy utility |
| 257 | + ``np.testing.assert_allclose``. This ensures that the model coefficients |
| 258 | + match up with SK Learn. |
| 259 | + """ |
| 260 | + X, y = request.getfixturevalue(data_generator) |
| 261 | + |
| 262 | + # sklearn uses 'PoissonRegressor' while dask-ml uses 'PoissonRegression' |
| 263 | + assert est in ["LinearRegression", "LogisticRegression", "PoissonRegression"] |
| 264 | + EstDask = getattr(dask_ml.linear_model, est) |
| 265 | + EstSklearn = getattr( |
| 266 | + sklearn.linear_model, est if "Poisson" not in est else "PoissonRegressor" |
| 267 | + ) |
| 268 | + |
| 269 | + dask_ml_model = EstDask( |
| 270 | + fit_intercept=fit_intercept, solver=solver, penalty="l2", C=1e8, max_iter=500 |
| 271 | + ) |
| 272 | + dask_ml_model.fit(X, y) |
| 273 | + |
| 274 | + # skl_model has to be fit with numpy data |
| 275 | + skl_model = EstSklearn(fit_intercept=fit_intercept, **skl_params) |
| 276 | + skl_model.fit(X.compute(), y.compute()) |
| 277 | + |
| 278 | + # test coefficients |
| 279 | + est, truth = np.hstack((dask_ml_model.intercept_, dask_ml_model.coef_)), np.hstack( |
| 280 | + (skl_model.intercept_, skl_model.coef_.flatten()) |
| 281 | + ) |
| 282 | + rel_error = LA.norm(est - truth) / LA.norm(truth) |
| 283 | + assert rel_error < 1e-3 |
| 284 | + |
| 285 | + np.testing.assert_allclose(truth, est, rtol=1e-3, atol=2e-4) |
| 286 | + |
| 287 | + # test predictions |
| 288 | + skl_preds = skl_model.predict(X.compute()) |
| 289 | + dml_preds = dask_ml_model.predict(X) |
| 290 | + |
| 291 | + np.testing.assert_allclose(skl_preds, dml_preds, rtol=1e-3, atol=2e-3) |
0 commit comments