Skip to content

Commit bb10ba3

Browse files
authored
Test dataset for dask dataframe with str columns. (dmlc#11310)
1 parent 4349d1d commit bb10ba3

File tree

3 files changed

+66
-52
lines changed

3 files changed

+66
-52
lines changed

python-package/xgboost/testing/dask.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
from .. import dask as dxgb
1818
from ..dask import _get_rabit_args
19+
from .data import make_batches
20+
from .data import make_categorical as make_cat_local
1921

2022

2123
def check_init_estimation_clf(
@@ -113,7 +115,7 @@ def check_external_memory( # pylint: disable=too-many-locals
113115
n_threads = get_worker().state.nthreads
114116
with xgb.collective.CommunicatorContext(dmlc_communicator="rabit", **comm_args):
115117
it = tm.IteratorForTest(
116-
*tm.make_batches(
118+
*make_batches(
117119
n_samples_per_batch,
118120
n_features,
119121
n_batches,
@@ -138,7 +140,7 @@ def check_external_memory( # pylint: disable=too-many-locals
138140

139141
lx, ly, lw = [], [], []
140142
for i in range(n_workers):
141-
x, y, w = tm.make_batches(
143+
x, y, w = make_batches(
142144
n_samples_per_batch,
143145
n_features,
144146
n_batches,
@@ -254,3 +256,57 @@ def check_no_group_split(client: Client, device: str) -> None:
254256
ndcg = ltr.evals_result()["validation_0"]["ndcg@32"]
255257
assert tm.non_decreasing(ndcg[:16], tolerance=1e-2), ndcg
256258
np.testing.assert_allclose(ndcg[-1], 1.0, rtol=1e-2)
259+
260+
261+
def make_categorical( # pylint: disable=too-many-locals, too-many-arguments
262+
client: Client,
263+
n_samples: int,
264+
n_features: int,
265+
n_categories: int,
266+
*,
267+
onehot: bool = False,
268+
cat_dtype: np.typing.DTypeLike = np.int64,
269+
) -> Tuple[dd.DataFrame, dd.Series]:
270+
"""Synthesize categorical data with dask."""
271+
workers = get_client_workers(client)
272+
n_workers = len(workers)
273+
dfs = []
274+
275+
def pack(**kwargs: Any) -> dd.DataFrame:
276+
X, y = make_cat_local(**kwargs)
277+
X["label"] = y
278+
return X
279+
280+
meta = pack(
281+
n_samples=1,
282+
n_features=n_features,
283+
n_categories=n_categories,
284+
onehot=False,
285+
cat_dtype=cat_dtype,
286+
)
287+
288+
for i, worker in enumerate(workers):
289+
l_n_samples = min(
290+
n_samples // n_workers, n_samples - i * (n_samples // n_workers)
291+
)
292+
# make sure there's at least one sample for testing empty DMatrix
293+
if n_samples == 1 and i == 0:
294+
l_n_samples = 1
295+
future = client.submit(
296+
pack,
297+
n_samples=l_n_samples,
298+
n_features=n_features,
299+
n_categories=n_categories,
300+
cat_dtype=cat_dtype,
301+
onehot=False,
302+
workers=[worker],
303+
)
304+
dfs.append(future)
305+
306+
df: dd.DataFrame = cast(dd.DataFrame, dd.from_delayed(dfs, meta=meta))
307+
y = df["label"]
308+
X = df[df.columns.difference(["label"])]
309+
310+
if onehot:
311+
return dd.get_dummies(X), y
312+
return X, y

tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,14 @@
1515
import xgboost as xgb
1616
from xgboost import testing as tm
1717
from xgboost.collective import CommunicatorContext
18-
from xgboost.testing.dask import get_rabit_args
18+
from xgboost.testing.dask import get_rabit_args, make_categorical
1919
from xgboost.testing.params import hist_parameter_strategy
2020

2121
from ..test_with_dask.test_with_dask import (
2222
generate_array,
2323
)
2424
from ..test_with_dask.test_with_dask import kCols as random_cols
2525
from ..test_with_dask.test_with_dask import (
26-
make_categorical,
2726
run_auc,
2827
run_boost_from_prediction,
2928
run_boost_from_prediction_multi_class,
@@ -256,7 +255,7 @@ def test_categorical(self, local_cuda_client: Client) -> None:
256255
X, y = make_categorical(local_cuda_client, 10000, 30, 13)
257256
X = X.to_backend("cudf")
258257

259-
X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True)
258+
X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, onehot=True)
260259
X_onehot = X_onehot.to_backend("cudf")
261260
run_categorical(local_cuda_client, "hist", "cuda", X, X_onehot, y)
262261

tests/test_distributed/test_with_dask/test_with_dask.py

Lines changed: 6 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@
3131
from xgboost import testing as tm
3232
from xgboost.collective import Config as CollConfig
3333
from xgboost.dask import DaskDMatrix
34-
from xgboost.testing.dask import check_init_estimation, check_uneven_nan, get_rabit_args
34+
from xgboost.testing.dask import (
35+
check_init_estimation,
36+
check_uneven_nan,
37+
get_rabit_args,
38+
make_categorical,
39+
)
3540
from xgboost.testing.params import hist_cache_strategy, hist_parameter_strategy
3641
from xgboost.testing.shared import (
3742
get_feature_weights,
@@ -71,52 +76,6 @@ def client(cluster: "LocalCluster") -> Generator:
7176
kWorkers = 5
7277

7378

74-
def make_categorical(
75-
client: Client,
76-
n_samples: int,
77-
n_features: int,
78-
n_categories: int,
79-
onehot: bool = False,
80-
) -> Tuple[dd.DataFrame, dd.Series]:
81-
workers = tm.dask.get_client_workers(client)
82-
n_workers = len(workers)
83-
dfs = []
84-
85-
def pack(**kwargs: Any) -> dd.DataFrame:
86-
X, y = tm.make_categorical(**kwargs)
87-
X["label"] = y
88-
return X
89-
90-
meta = pack(
91-
n_samples=1, n_features=n_features, n_categories=n_categories, onehot=False
92-
)
93-
94-
for i, worker in enumerate(workers):
95-
l_n_samples = min(
96-
n_samples // n_workers, n_samples - i * (n_samples // n_workers)
97-
)
98-
# make sure there's at least one sample for testing empty DMatrix
99-
if n_samples == 1 and i == 0:
100-
l_n_samples = 1
101-
future = client.submit(
102-
pack,
103-
n_samples=l_n_samples,
104-
n_features=n_features,
105-
n_categories=n_categories,
106-
onehot=False,
107-
workers=[worker],
108-
)
109-
dfs.append(future)
110-
111-
df = dd.from_delayed(dfs, meta=meta)
112-
y = df["label"]
113-
X = df[df.columns.difference(["label"])]
114-
115-
if onehot:
116-
return dd.get_dummies(X), y
117-
return X, y
118-
119-
12079
def generate_array(
12180
with_weights: bool = False,
12281
) -> Tuple[da.Array, da.Array, Optional[da.Array]]:

0 commit comments

Comments
 (0)