Test dataset for dask dataframe with str columns. (dmlc#11310)

trivialfis · web-flow · commit bb10ba32c7bf · 2025-03-06T02:11:13.000+08:00
diff --git a/python-package/xgboost/testing/dask.py b/python-package/xgboost/testing/dask.py
@@ -16,6 +16,8 @@
 
 from .. import dask as dxgb
 from ..dask import _get_rabit_args
+from .data import make_batches
+from .data import make_categorical as make_cat_local
 
 
 def check_init_estimation_clf(
@@ -113,7 +115,7 @@ def check_external_memory(  # pylint: disable=too-many-locals
     n_threads = get_worker().state.nthreads
     with xgb.collective.CommunicatorContext(dmlc_communicator="rabit", **comm_args):
         it = tm.IteratorForTest(
-            *tm.make_batches(
+            *make_batches(
                 n_samples_per_batch,
                 n_features,
                 n_batches,
@@ -138,7 +140,7 @@ def check_external_memory(  # pylint: disable=too-many-locals
 
     lx, ly, lw = [], [], []
     for i in range(n_workers):
-        x, y, w = tm.make_batches(
+        x, y, w = make_batches(
             n_samples_per_batch,
             n_features,
             n_batches,
@@ -254,3 +256,57 @@ def check_no_group_split(client: Client, device: str) -> None:
     ndcg = ltr.evals_result()["validation_0"]["ndcg@32"]
     assert tm.non_decreasing(ndcg[:16], tolerance=1e-2), ndcg
     np.testing.assert_allclose(ndcg[-1], 1.0, rtol=1e-2)
+
+
+def make_categorical(  # pylint: disable=too-many-locals, too-many-arguments
+    client: Client,
+    n_samples: int,
+    n_features: int,
+    n_categories: int,
+    *,
+    onehot: bool = False,
+    cat_dtype: np.typing.DTypeLike = np.int64,
+) -> Tuple[dd.DataFrame, dd.Series]:
+    """Synthesize categorical data with dask, submitting one chunk per worker."""
+    workers = get_client_workers(client)
+    n_workers = len(workers)
+    dfs = []
+
+    def pack(**kwargs: Any) -> dd.DataFrame:
+        X, y = make_cat_local(**kwargs)
+        X["label"] = y  # carry the label as a column; split off again below
+        return X
+
+    meta = pack(  # one-row sample, passed to dd.from_delayed as `meta`
+        n_samples=1,
+        n_features=n_features,
+        n_categories=n_categories,
+        onehot=False,
+        cat_dtype=cat_dtype,
+    )
+
+    for i, worker in enumerate(workers):
+        l_n_samples = min(  # NOTE(review): remainder seems dropped; confirm
+            n_samples // n_workers, n_samples - i * (n_samples // n_workers)
+        )
+        # make sure there's at least one sample for testing empty DMatrix
+        if n_samples == 1 and i == 0:
+            l_n_samples = 1
+        future = client.submit(
+            pack,
+            n_samples=l_n_samples,
+            n_features=n_features,
+            n_categories=n_categories,
+            cat_dtype=cat_dtype,
+            onehot=False,  # encoding, if requested, is applied once at the end
+            workers=[worker],  # restrict the task to this worker
+        )
+        dfs.append(future)
+
+    df: dd.DataFrame = cast(dd.DataFrame, dd.from_delayed(dfs, meta=meta))
+    y = df["label"]
+    X = df[df.columns.difference(["label"])]  # every column except the label
+
+    if onehot:
+        return dd.get_dummies(X), y
+    return X, y
diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
@@ -15,15 +15,14 @@
 import xgboost as xgb
 from xgboost import testing as tm
 from xgboost.collective import CommunicatorContext
-from xgboost.testing.dask import get_rabit_args
+from xgboost.testing.dask import get_rabit_args, make_categorical
 from xgboost.testing.params import hist_parameter_strategy
 
 from ..test_with_dask.test_with_dask import (
     generate_array,
 )
 from ..test_with_dask.test_with_dask import kCols as random_cols
 from ..test_with_dask.test_with_dask import (
-    make_categorical,
     run_auc,
     run_boost_from_prediction,
     run_boost_from_prediction_multi_class,
@@ -256,7 +255,7 @@ def test_categorical(self, local_cuda_client: Client) -> None:
         X, y = make_categorical(local_cuda_client, 10000, 30, 13)
         X = X.to_backend("cudf")
 
-        X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True)
+        X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, onehot=True)
         X_onehot = X_onehot.to_backend("cudf")
         run_categorical(local_cuda_client, "hist", "cuda", X, X_onehot, y)
 
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -31,7 +31,12 @@
 from xgboost import testing as tm
 from xgboost.collective import Config as CollConfig
 from xgboost.dask import DaskDMatrix
-from xgboost.testing.dask import check_init_estimation, check_uneven_nan, get_rabit_args
+from xgboost.testing.dask import (
+    check_init_estimation,
+    check_uneven_nan,
+    get_rabit_args,
+    make_categorical,
+)
 from xgboost.testing.params import hist_cache_strategy, hist_parameter_strategy
 from xgboost.testing.shared import (
     get_feature_weights,
@@ -71,52 +76,6 @@ def client(cluster: "LocalCluster") -> Generator:
 kWorkers = 5
 
 
-def make_categorical(
-    client: Client,
-    n_samples: int,
-    n_features: int,
-    n_categories: int,
-    onehot: bool = False,
-) -> Tuple[dd.DataFrame, dd.Series]:
-    workers = tm.dask.get_client_workers(client)
-    n_workers = len(workers)
-    dfs = []
-
-    def pack(**kwargs: Any) -> dd.DataFrame:
-        X, y = tm.make_categorical(**kwargs)
-        X["label"] = y
-        return X
-
-    meta = pack(
-        n_samples=1, n_features=n_features, n_categories=n_categories, onehot=False
-    )
-
-    for i, worker in enumerate(workers):
-        l_n_samples = min(
-            n_samples // n_workers, n_samples - i * (n_samples // n_workers)
-        )
-        # make sure there's at least one sample for testing empty DMatrix
-        if n_samples == 1 and i == 0:
-            l_n_samples = 1
-        future = client.submit(
-            pack,
-            n_samples=l_n_samples,
-            n_features=n_features,
-            n_categories=n_categories,
-            onehot=False,
-            workers=[worker],
-        )
-        dfs.append(future)
-
-    df = dd.from_delayed(dfs, meta=meta)
-    y = df["label"]
-    X = df[df.columns.difference(["label"])]
-
-    if onehot:
-        return dd.get_dummies(X), y
-    return X, y
-
-
 def generate_array(
     with_weights: bool = False,
 ) -> Tuple[da.Array, da.Array, Optional[da.Array]]: