diff --git a/dask_ml/_compat.py b/dask_ml/_compat.py
index 165387704..bbc907797 100644
--- a/dask_ml/_compat.py
+++ b/dask_ml/_compat.py
@@ -23,6 +23,8 @@
 DASK_2_28_0 = DASK_VERSION > packaging.version.parse("2.27.0")
 DASK_2021_02_0 = DASK_VERSION >= packaging.version.parse("2021.02.0")
 DASK_2022_01_0 = DASK_VERSION > packaging.version.parse("2021.12.0")
+DASK_2025_3_0 = DASK_VERSION >= packaging.version.parse("2025.3.0")
+DASK_2025_5_0 = DASK_VERSION >= packaging.version.parse("2025.5.0")
 DISTRIBUTED_2_5_0 = DISTRIBUTED_VERSION > packaging.version.parse("2.5.0")
 DISTRIBUTED_2_11_0 = DISTRIBUTED_VERSION > packaging.version.parse("2.10.0")  # dev
 DISTRIBUTED_2021_02_0 = DISTRIBUTED_VERSION >= packaging.version.parse("2021.02.0")
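
The two new flags follow the existing pattern in `_compat.py`: parse the installed `dask` version once at import time and compare it against a pinned release, so test modules can use plain booleans in skip conditions. A minimal sketch of that pattern (the `DASK_VERSION` assignment is an assumption about code outside this hunk; only the two flag comparisons appear in the diff):

    # Sketch of the version-flag pattern used by dask_ml/_compat.py.
    # Assumption: DASK_VERSION is parsed from the installed dask package
    # near the top of the module; it is not part of this hunk.
    import dask
    import packaging.version

    DASK_VERSION = packaging.version.parse(dask.__version__)

    # Plain booleans evaluated once at import time, so test modules can
    # reference them in skipif conditions without re-parsing versions.
    DASK_2025_3_0 = DASK_VERSION >= packaging.version.parse("2025.3.0")
    DASK_2025_5_0 = DASK_VERSION >= packaging.version.parse("2025.5.0")
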
diff --git a/dask_ml/model_selection/_search.py b/dask_ml/model_selection/_search.py
index 2bbe31526..6abaa77c1 100644
--- a/dask_ml/model_selection/_search.py
+++ b/dask_ml/model_selection/_search.py
@@ -1521,7 +1521,7 @@ def fit(self, X, y=None, groups=None, **fit_params):
             The number of cross-validation splits (folds/iterations).
 
         Notes
-        -------
+        -----
         The parameters selected are those that maximize the score of the left out
         data, unless an explicit score is passed in which case it is used instead.
         """
diff --git a/docs/_static/.gitignore b/docs/_static/.gitignore
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/source/conf.py b/docs/source/conf.py
index d7c4d891e..1b974ffbd 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -86,7 +86,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
diff --git a/docs/source/hyper-parameter-search.rst b/docs/source/hyper-parameter-search.rst
index d60af7c88..7ac14983d 100644
--- a/docs/source/hyper-parameter-search.rst
+++ b/docs/source/hyper-parameter-search.rst
@@ -419,8 +419,7 @@ Basic use
 This section uses :class:`~dask_ml.model_selection.HyperbandSearchCV`, but it can
 also be applied to :class:`~dask_ml.model_selection.IncrementalSearchCV` too.
 
-.. ipython:: python
-   :okwarning:
+.. code-block:: python
 
    from dask.distributed import Client
   from dask_ml.datasets import make_classification
@@ -432,14 +431,14 @@ also be applied to :class:`~dask_ml.model_selection.IncrementalSearchCV` too.
 Our underlying model is an :class:`sklearn.linear_model.SGDClassifier`. We
 specify a few parameters common to each clone of the model:
 
-.. ipython:: python
+.. code-block:: python
 
    from sklearn.linear_model import SGDClassifier
   clf = SGDClassifier(tol=1e-3, penalty='elasticnet', random_state=0)
 
 We also define the distribution of parameters from which we will sample:
 
-.. ipython:: python
+.. code-block:: python
 
    from scipy.stats import uniform, loguniform
   params = {'alpha': loguniform(1e-2, 1e0),  # or np.logspace
@@ -449,7 +448,7 @@ We also define the distribution of parameters from which we will sample:
 Finally we create many random models in this parameter space and
 train-and-score them until we find the best one.
 
-.. ipython:: python
+.. code-block:: python
 
    from dask_ml.model_selection import HyperbandSearchCV
 
@@ -465,7 +464,7 @@ larger-than-memory Dask Array, you'll exhaust your machine's memory.
 If you plan to use post-estimation features like scoring or prediction, we
 recommend using :class:`dask_ml.wrappers.ParallelPostFit`.
 
-.. ipython:: python
+.. code-block:: python
 
    from dask_ml.wrappers import ParallelPostFit
   params = {'estimator__alpha': loguniform(1e-2, 1e0),
@@ -523,14 +522,14 @@ Hyperband parameters: rule-of-thumb
 These fall out pretty naturally once it's known how long to train the best
 model and very approximately how many parameters to sample:
 
-.. ipython:: python
+.. code-block:: python
 
    n_examples = 20 * len(X_train)  # 20 passes through dataset for best model
   n_params = 94  # sample approximately 100 parameters; more than 94 will be sampled
 
 With this, it's easy to use a rule-of-thumb to compute the inputs to Hyperband:
 
-.. ipython:: python
+.. code-block:: python
 
    max_iter = n_params
   chunk_size = n_examples // n_params  # implicit
@@ -538,7 +537,7 @@ With this, it's easy to use a rule-of-thumb to compute the inputs to Hyperband:
 Now that we've determined the inputs, let's create our search object and
 rechunk the Dask array:
 
-.. ipython:: python
+.. code-block:: python
 
    clf = SGDClassifier(tol=1e-3, penalty='elasticnet', random_state=0)
   params = {'alpha': loguniform(1e-2, 1e0),  # or np.logspace
@@ -567,7 +566,7 @@ rule-of-thumb in the "Notes" section of
 However, this does not explicitly mention the amount of computation performed
 -- it's only an approximation. The amount of computation can be viewed like so:
 
-.. ipython:: python
+.. code-block:: python
 
    search.metadata["partial_fit_calls"]  # best model will see `max_iter` chunks
   search.metadata["n_models"]  # actual number of parameters to sample
@@ -578,7 +577,7 @@ amount of computation.
 Let's fit :class:`~dask_ml.model_selection.HyperbandSearchCV` with these
 different chunks:
 
-.. ipython:: python
+.. code-block:: python
 
    search.fit(X_train, y_train, classes=[0, 1]);
   search.best_params_
diff --git a/pyproject.toml b/pyproject.toml
index f4ba3511c..6f2be0f60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,7 @@ dev = [
     "sphinx-gallery",
     "sphinx-rtd-theme",
 ]
-docs = ["nbsphinx", "numpydoc", "sphinx", "sphinx-gallery", "sphinx-rtd-theme"]
+docs = ["nbsphinx", "numpydoc", "sphinx", "sphinx-gallery", "sphinx-rtd-theme", "dask-sphinx-theme"]
 test = [
     "black",
     "coverage",
diff --git a/tests/linear_model/test_glm.py b/tests/linear_model/test_glm.py
index 1de8de496..1cfbcd1f6 100644
--- a/tests/linear_model/test_glm.py
+++ b/tests/linear_model/test_glm.py
@@ -10,12 +10,19 @@
 from sklearn.pipeline import make_pipeline
 
 import dask_ml.linear_model
+import dask_ml._compat
 from dask_ml.datasets import make_classification, make_counts, make_regression
 from dask_ml.linear_model import LinearRegression, LogisticRegression, PoissonRegression
 from dask_ml.linear_model.utils import add_intercept
 from dask_ml.model_selection import GridSearchCV
 
 
+pytestmark = pytest.mark.skipif(
+    dask_ml._compat.DASK_2025_3_0,
+    reason="https://github.com/dask/dask-ml/issues/1016",
+)
+
+
 @pytest.fixture(params=[r() for r in Regularizer.__subclasses__()])
 def solver(request):
     """Parametrized fixture for all the solver names"""
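
The module-level `pytestmark` assignment above gates the whole file: pytest applies a module-level `pytestmark` marker to every test it collects there, so all of `test_glm.py` is skipped on dask>=2025.3.0 until dask-ml#1016 is resolved. A minimal, self-contained sketch of the mechanism (the module and test names here are hypothetical, not part of the diff):

    # demo_skip_module.py -- hypothetical test module illustrating the
    # module-level skipif gating used throughout this PR.
    import pytest

    import dask_ml._compat  # provides the flag added in the _compat.py hunk

    # pytest applies this marker to every test function in the module.
    pytestmark = pytest.mark.skipif(
        dask_ml._compat.DASK_2025_3_0,
        reason="https://github.com/dask/dask-ml/issues/1016",
    )


    def test_anything():
        # Never runs on dask>=2025.3.0; reported as skipped with the reason URL.
        assert True

The same pattern repeats in each of the model_selection test modules below.
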
diff --git a/tests/model_selection/test_hyperband.py b/tests/model_selection/test_hyperband.py
index af8a8a30f..e92b9bed6 100644
--- a/tests/model_selection/test_hyperband.py
+++ b/tests/model_selection/test_hyperband.py
@@ -15,7 +15,7 @@
 )
 from sklearn.linear_model import SGDClassifier
 
-from dask_ml._compat import DISTRIBUTED_2_5_0, SK_LOG_LOSS
+from dask_ml._compat import SK_LOG_LOSS, DASK_2025_3_0
 from dask_ml.datasets import make_classification
 from dask_ml.model_selection import (
     HyperbandSearchCV,
@@ -26,7 +26,10 @@
 from dask_ml.utils import ConstantFunction
 from dask_ml.wrappers import Incremental
 
-pytestmark = pytest.mark.skipif(not DISTRIBUTED_2_5_0, reason="hangs")
+pytestmark = pytest.mark.skipif(
+    DASK_2025_3_0,
+    reason="https://github.com/dask/dask-ml/issues/1016",
+)
 
 
 @pytest.mark.parametrize(
diff --git a/tests/model_selection/test_incremental.py b/tests/model_selection/test_incremental.py
index f696ac2e1..9e5889042 100644
--- a/tests/model_selection/test_incremental.py
+++ b/tests/model_selection/test_incremental.py
@@ -27,7 +27,7 @@
 from sklearn.model_selection import ParameterGrid, ParameterSampler
 from sklearn.utils import check_random_state
 
-from dask_ml._compat import DISTRIBUTED_2_5_0, SK_LOG_LOSS
+from dask_ml._compat import DASK_2025_3_0, SK_LOG_LOSS
 from dask_ml.datasets import make_classification
 from dask_ml.model_selection import (
     HyperbandSearchCV,
@@ -39,7 +39,10 @@
 from dask_ml.utils import ConstantFunction
 
 pytestmark = [
-    pytest.mark.skipif(not DISTRIBUTED_2_5_0, reason="hangs"),
+    pytest.mark.skipif(
+        DASK_2025_3_0,
+        reason="https://github.com/dask/dask-ml/issues/1016",
+    ),
     pytest.mark.filterwarnings("ignore:decay_rate"),
 ]  # decay_rate warnings are tested in test_incremental_warns.py
 
diff --git a/tests/model_selection/test_incremental_warns.py b/tests/model_selection/test_incremental_warns.py
index d03f0a918..c45673640 100644
--- a/tests/model_selection/test_incremental_warns.py
+++ b/tests/model_selection/test_incremental_warns.py
@@ -2,11 +2,18 @@
 import pytest
 from distributed.utils_test import gen_cluster
 
+import dask_ml._compat
 from dask_ml.datasets import make_classification
 from dask_ml.model_selection import IncrementalSearchCV, InverseDecaySearchCV
 from dask_ml.utils import ConstantFunction
 
 
+pytestmark = pytest.mark.skipif(
+    dask_ml._compat.DASK_2025_3_0,
+    reason="https://github.com/dask/dask-ml/issues/1016",
+)
+
+
 @gen_cluster(client=True)
 async def test_warns_decay_rate(c, s, a, b):
     X, y = make_classification(n_samples=100, n_features=5, chunks=10)
diff --git a/tests/model_selection/test_successive_halving.py b/tests/model_selection/test_successive_halving.py
index 0108c4d62..41b43a986 100644
--- a/tests/model_selection/test_successive_halving.py
+++ b/tests/model_selection/test_successive_halving.py
@@ -4,11 +4,14 @@
 from sklearn.datasets import make_classification
 from sklearn.linear_model import SGDClassifier
 
-from dask_ml._compat import DISTRIBUTED_2_5_0
+import dask_ml._compat
 from dask_ml.model_selection import SuccessiveHalvingSearchCV
 from dask_ml.utils import ConstantFunction
 
-pytestmark = pytest.mark.skipif(not DISTRIBUTED_2_5_0, reason="hangs")
+pytestmark = pytest.mark.skipif(
+    dask_ml._compat.DASK_2025_3_0,
+    reason="https://github.com/dask/dask-ml/issues/1016",
+)
 
 
 @gen_cluster(client=True)
diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
index 0880b27fe..7b394def4 100644
--- a/tests/preprocessing/test_data.py
+++ b/tests/preprocessing/test_data.py
@@ -17,6 +17,7 @@
 import dask_ml.preprocessing as dpp
 from dask_ml.datasets import make_classification
 from dask_ml.utils import assert_estimator_equal
+from dask_ml._compat import DASK_2025_5_0
 
 X, y = make_classification(chunks=50)
 df = X.to_dask_dataframe().rename(columns=str)
@@ -276,6 +277,10 @@
         dqt = dpp.QuantileTransformer()
         dqt.fit(dX)
 
+    @pytest.mark.skipif(
+        not DASK_2025_5_0,
+        reason="https://github.com/dask/dask/pull/11943",
+    )
     def test_fit_transform_frame(self):
         df = pd.DataFrame(np.random.randn(1000, 3))
         ddf = dd.from_pandas(df, 2)
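
Note the inverted condition in the last hunk: the model-selection modules skip when `DASK_2025_3_0` is true (newer dask triggers the regression tracked in dask-ml#1016), while `test_fit_transform_frame` skips when `not DASK_2025_5_0` is true, because it depends on the fix from dask/dask#11943 and so only runs on newer dask. A hedged sketch of what that gated test exercises, mirroring the `df`/`ddf` setup visible in the hunk (the return type and printed shape are assumptions, not asserted by the diff):

    # Sketch of the code path guarded by the skipif above: fitting and
    # transforming a dask DataFrame with dask_ml's QuantileTransformer.
    # Assumes dask>=2025.5.0, per the marker (see dask/dask#11943).
    import numpy as np
    import pandas as pd
    import dask.dataframe as dd

    from dask_ml.preprocessing import QuantileTransformer

    df = pd.DataFrame(np.random.randn(1000, 3))
    ddf = dd.from_pandas(df, npartitions=2)  # same setup as the test body

    qt = QuantileTransformer()
    result = qt.fit_transform(ddf)  # lazy dask collection
    print(result.compute().shape)   # expected: (1000, 3)
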