diff --git a/dask_ml/_compat.py b/dask_ml/_compat.py
index 165387704..bbc907797 100644
--- a/dask_ml/_compat.py
+++ b/dask_ml/_compat.py
@@ -23,6 +23,8 @@
 DASK_2_28_0 = DASK_VERSION > packaging.version.parse("2.27.0")
 DASK_2021_02_0 = DASK_VERSION >= packaging.version.parse("2021.02.0")
 DASK_2022_01_0 = DASK_VERSION > packaging.version.parse("2021.12.0")
+DASK_2025_3_0 = DASK_VERSION >= packaging.version.parse("2025.3.0")
+DASK_2025_5_0 = DASK_VERSION >= packaging.version.parse("2025.5.0")
 DISTRIBUTED_2_5_0 = DISTRIBUTED_VERSION > packaging.version.parse("2.5.0")
 DISTRIBUTED_2_11_0 = DISTRIBUTED_VERSION > packaging.version.parse("2.10.0")  # dev
 DISTRIBUTED_2021_02_0 = DISTRIBUTED_VERSION >= packaging.version.parse("2021.02.0")
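
The two new flags follow the existing pattern in `_compat.py`: parse the installed `dask` version once at import time and compare it against a pinned release, so test modules can use plain booleans in skip conditions. A minimal sketch of that pattern (the `DASK_VERSION` assignment is an assumption about code outside this hunk; only the two flag comparisons appear in the diff):

    # Sketch of the version-flag pattern used by dask_ml/_compat.py.
    # Assumption: DASK_VERSION is parsed from the installed dask package
    # near the top of the module; it is not part of this hunk.
    import dask
    import packaging.version

    DASK_VERSION = packaging.version.parse(dask.__version__)

    # Plain booleans evaluated once at import time, so test modules can
    # reference them in skipif conditions without re-parsing versions.
    DASK_2025_3_0 = DASK_VERSION >= packaging.version.parse("2025.3.0")
    DASK_2025_5_0 = DASK_VERSION >= packaging.version.parse("2025.5.0")
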
diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py
index 2bbe31526..6abaa77c1 100644
--- a/dask_ml/model_selection/_search.py
+++ b/dask_ml/model_selection/_search.py
@@ -1521,7 +1521,7 @@ def fit(self, X, y=None, groups=None, **fit_params):
             The number of cross-validation splits (folds/iterations).
 
         Notes
-        -------
+        -----
         The parameters selected are those that maximize the score of the left out
         data, unless an explicit score is passed in which case it is used instead.
         """
diff --git a/docs/_static/.gitignore b/docs/_static/.gitignore
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/source/conf.py b/docs/source/conf.py
index d7c4d891e..1b974ffbd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -86,7 +86,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
diff --git a/docs/source/hyper-parameter-search.rst b/docs/source/hyper-parameter-search.rst
index d60af7c88..7ac14983d 100644
--- a/docs/source/hyper-parameter-search.rst
+++ b/docs/source/hyper-parameter-search.rst
@@ -419,8 +419,7 @@ Basic use
 This section uses :class:`~dask_ml.model_selection.HyperbandSearchCV`, but it can
 also be applied to :class:`~dask_ml.model_selection.IncrementalSearchCV` too.
 
-.. ipython:: python
-   :okwarning:
+.. code-block:: python
 
    from dask.distributed import Client
   from dask_ml.datasets import make_classification
@@ -432,14 +431,14 @@ also be applied to :class:`~dask_ml.model_selection.IncrementalSearchCV` too.
 Our underlying model is an :class:`sklearn.linear_model.SGDClassifier`. We
 specify a few parameters common to each clone of the model:
 
-.. ipython:: python
+.. code-block:: python
 
    from sklearn.linear_model import SGDClassifier
   clf = SGDClassifier(tol=1e-3, penalty='elasticnet', random_state=0)
 
 We also define the distribution of parameters from which we will sample:
 
-.. ipython:: python
+.. code-block:: python
 
    from scipy.stats import uniform, loguniform
   params = {'alpha': loguniform(1e-2, 1e0),  # or np.logspace
@@ -449,7 +448,7 @@ We also define the distribution of parameters from which we will sample:
 Finally we create many random models in this parameter space and
 train-and-score them until we find the best one.
 
-.. ipython:: python
+.. code-block:: python
 
    from dask_ml.model_selection import HyperbandSearchCV
 
@@ -465,7 +464,7 @@ larger-than-memory Dask Array, you'll exhaust your machine's memory.
 If you plan to use post-estimation features like scoring or prediction, we
 recommend using :class:`dask_ml.wrappers.ParallelPostFit`.
 
-.. ipython:: python
+.. code-block:: python
 
    from dask_ml.wrappers import ParallelPostFit
   params = {'estimator__alpha': loguniform(1e-2, 1e0),
@@ -523,14 +522,14 @@ Hyperband parameters: rule-of-thumb
 These fall out pretty naturally once it's known how long to train the best
 model and very approximately how many parameters to sample:
 
-.. ipython:: python
+.. code-block:: python
 
    n_examples = 20 * len(X_train)  # 20 passes through dataset for best model
   n_params = 94  # sample approximately 100 parameters; more than 94 will be sampled
 
 With this, it's easy to use a rule-of-thumb to compute the inputs to Hyperband:
 
-.. ipython:: python
+.. code-block:: python
 
    max_iter = n_params
   chunk_size = n_examples // n_params  # implicit
@@ -538,7 +537,7 @@ With this, it's easy to use a rule-of-thumb to compute the inputs to Hyperband:
 Now that we've determined the inputs, let's create our search object and
 rechunk the Dask array:
 
-.. ipython:: python
+.. code-block:: python
 
    clf = SGDClassifier(tol=1e-3, penalty='elasticnet', random_state=0)
   params = {'alpha': loguniform(1e-2, 1e0),  # or np.logspace
@@ -567,7 +566,7 @@ rule-of-thumb in the "Notes" section of
 However, this does not explicitly mention the amount of computation performed
 -- it's only an approximation. The amount of computation can be viewed like so:
 
-.. ipython:: python
+.. code-block:: python
 
    search.metadata["partial_fit_calls"]  # best model will see `max_iter` chunks
   search.metadata["n_models"]  # actual number of parameters to sample
@@ -578,7 +577,7 @@ amount of computation.
 Let's fit :class:`~dask_ml.model_selection.HyperbandSearchCV` with these
 different chunks:
 
-.. ipython:: python
+.. code-block:: python
 
    search.fit(X_train, y_train, classes=[0, 1]);
   search.best_params_
diff --git a/pyproject.toml b/pyproject.toml
index f4ba3511c..6f2be0f60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,7 @@ dev = [
     "sphinx-gallery",
     "sphinx-rtd-theme",
 ]
-docs = ["nbsphinx", "numpydoc", "sphinx", "sphinx-gallery", "sphinx-rtd-theme"]
+docs = ["nbsphinx", "numpydoc", "sphinx", "sphinx-gallery", "sphinx-rtd-theme", "dask-sphinx-theme"]
 test = [
     "black",
     "coverage",
diff --git a/tests/linear_model/test_glm.py b/tests/linear_model/test_glm.py
index 1de8de496..1cfbcd1f6 100644
--- a/tests/linear_model/test_glm.py
+++ b/tests/linear_model/test_glm.py
@@ -10,12 +10,19 @@
 from sklearn.pipeline import make_pipeline
 
 import dask_ml.linear_model
+import dask_ml._compat
 from dask_ml.datasets import make_classification, make_counts, make_regression
 from dask_ml.linear_model import LinearRegression, LogisticRegression, PoissonRegression
 from dask_ml.linear_model.utils import add_intercept
 from dask_ml.model_selection import GridSearchCV
 
 
+pytestmark = pytest.mark.skipif(
+    dask_ml._compat.DASK_2025_3_0,
+    reason="https://github.com/dask/dask-ml/issues/1016",
+)
+
+
 @pytest.fixture(params=[r() for r in Regularizer.__subclasses__()])
 def solver(request):
     """Parametrized fixture for all the solver names"""
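
The module-level `pytestmark` assignment above gates the whole file: pytest applies a module-level `pytestmark` marker to every test it collects there, so all of `test_glm.py` is skipped on dask>=2025.3.0 until dask-ml#1016 is resolved. A minimal, self-contained sketch of the mechanism (the module and test names here are hypothetical, not part of the diff):

    # demo_skip_module.py -- hypothetical test module illustrating the
    # module-level skipif gating used throughout this PR.
    import pytest

    import dask_ml._compat  # provides the flag added in the _compat.py hunk

    # pytest applies this marker to every test function in the module.
    pytestmark = pytest.mark.skipif(
        dask_ml._compat.DASK_2025_3_0,
        reason="https://github.com/dask/dask-ml/issues/1016",
    )


    def test_anything():
        # Never runs on dask>=2025.3.0; reported as skipped with the reason URL.
        assert True

The same pattern repeats in each of the model_selection test modules below.
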
diff --git a/tests/model_selection/test_hyperband.py b/tests/model_selection/test_hyperband.py
index af8a8a30f..e92b9bed6 100644
--- a/tests/model_selection/test_hyperband.py
+++ b/tests/model_selection/test_hyperband.py
@@ -15,7 +15,7 @@
 )
 from sklearn.linear_model import SGDClassifier
 
-from dask_ml._compat import DISTRIBUTED_2_5_0, SK_LOG_LOSS
+from dask_ml._compat import SK_LOG_LOSS, DASK_2025_3_0
 from dask_ml.datasets import make_classification
 from dask_ml.model_selection import (
     HyperbandSearchCV,
@@ -26,7 +26,10 @@
 from dask_ml.utils import ConstantFunction
 from dask_ml.wrappers import Incremental
 
-pytestmark = pytest.mark.skipif(not DISTRIBUTED_2_5_0, reason="hangs")
+pytestmark = pytest.mark.skipif(
+    DASK_2025_3_0,
+    reason="https://github.com/dask/dask-ml/issues/1016",
+)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/model_selection/test_incremental.py b/tests/model_selection/test_incremental.py
index f696ac2e1..9e5889042 100644
--- a/tests/model_selection/test_incremental.py
+++ b/tests/model_selection/test_incremental.py
@@ -27,7 +27,7 @@
 from sklearn.model_selection import ParameterGrid, ParameterSampler
 from sklearn.utils import check_random_state
 
-from dask_ml._compat import DISTRIBUTED_2_5_0, SK_LOG_LOSS
+from dask_ml._compat import DASK_2025_3_0, SK_LOG_LOSS
 from dask_ml.datasets import make_classification
 from dask_ml.model_selection import (
     HyperbandSearchCV,
@@ -39,7 +39,10 @@
 from dask_ml.utils import ConstantFunction
 
 pytestmark = [
-    pytest.mark.skipif(not DISTRIBUTED_2_5_0, reason="hangs"),
+    pytest.mark.skipif(
+        DASK_2025_3_0,
+        reason="https://github.com/dask/dask-ml/issues/1016",
+    ),
     pytest.mark.filterwarnings("ignore:decay_rate"),
 ]  # decay_rate warnings are tested in test_incremental_warns.py
 
diff --git a/tests/model_selection/test_incremental_warns.py b/tests/model_selection/test_incremental_warns.py
index d03f0a918..c45673640 100644
--- a/tests/model_selection/test_incremental_warns.py
+++ b/tests/model_selection/test_incremental_warns.py
@@ -2,11 +2,18 @@
 import pytest
 from distributed.utils_test import gen_cluster
 
+import dask_ml._compat
 from dask_ml.datasets import make_classification
 from dask_ml.model_selection import IncrementalSearchCV, InverseDecaySearchCV
 from dask_ml.utils import ConstantFunction
 
 
+pytestmark = pytest.mark.skipif(
+    dask_ml._compat.DASK_2025_3_0,
+    reason="https://github.com/dask/dask-ml/issues/1016",
+)
+
+
 @gen_cluster(client=True)
 async def test_warns_decay_rate(c, s, a, b):
     X, y = make_classification(n_samples=100, n_features=5, chunks=10)
diff --git a/tests/model_selection/test_successive_halving.py b/tests/model_selection/test_successive_halving.py
index 0108c4d62..41b43a986 100644
--- a/tests/model_selection/test_successive_halving.py
+++ b/tests/model_selection/test_successive_halving.py
@@ -4,11 +4,14 @@
 from sklearn.datasets import make_classification
 from sklearn.linear_model import SGDClassifier
 
-from dask_ml._compat import DISTRIBUTED_2_5_0
+import dask_ml._compat
 from dask_ml.model_selection import SuccessiveHalvingSearchCV
 from dask_ml.utils import ConstantFunction
 
-pytestmark = pytest.mark.skipif(not DISTRIBUTED_2_5_0, reason="hangs")
+pytestmark = pytest.mark.skipif(
+    dask_ml._compat.DASK_2025_3_0,
+    reason="https://github.com/dask/dask-ml/issues/1016",
+)
 
 
 @gen_cluster(client=True)
diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index 0880b27fe..7b394def4 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -17,6 +17,7 @@
 import dask_ml.preprocessing as dpp
 from dask_ml.datasets import make_classification
 from dask_ml.utils import assert_estimator_equal
+from dask_ml._compat import DASK_2025_5_0
 
 X, y = make_classification(chunks=50)
 df = X.to_dask_dataframe().rename(columns=str)
@@ -276,6 +277,10 @@
         dqt = dpp.QuantileTransformer()
         dqt.fit(dX)
 
+    @pytest.mark.skipif(
+        not DASK_2025_5_0,
+        reason="https://github.com/dask/dask/pull/11943",
+    )
     def test_fit_transform_frame(self):
         df = pd.DataFrame(np.random.randn(1000, 3))
         ddf = dd.from_pandas(df, 2)
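
Note the inverted condition in the last hunk: the model-selection modules skip when `DASK_2025_3_0` is true (newer dask triggers the regression tracked in dask-ml#1016), while `test_fit_transform_frame` skips when `not DASK_2025_5_0` is true, because it depends on the fix from dask/dask#11943 and so only runs on newer dask. A hedged sketch of what that gated test exercises, mirroring the `df`/`ddf` setup visible in the hunk (the return type and printed shape are assumptions, not asserted by the diff):

    # Sketch of the code path guarded by the skipif above: fitting and
    # transforming a dask DataFrame with dask_ml's QuantileTransformer.
    # Assumes dask>=2025.5.0, per the marker (see dask/dask#11943).
    import numpy as np
    import pandas as pd
    import dask.dataframe as dd

    from dask_ml.preprocessing import QuantileTransformer

    df = pd.DataFrame(np.random.randn(1000, 3))
    ddf = dd.from_pandas(df, npartitions=2)  # same setup as the test body

    qt = QuantileTransformer()
    result = qt.fit_transform(ddf)  # lazy dask collection
    print(result.compute().shape)   # expected: (1000, 3)
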