Skip to content

Commit 8fb2468

Browse files
authored
Store categories from iterator. (dmlc#11313)
- Support iterator with `DMatrix`, `QuantileDMatrix`, `ExtMemQuantileDMatrix`. - Move some test utilities. - Check consistency between batches.
1 parent cefc196 commit 8fb2468

23 files changed

+394
-213
lines changed

include/xgboost/data.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -710,10 +710,9 @@ class DMatrix {
710710
/**
711711
* @brief Accessor for the string representation of the categories.
712712
*/
713-
CatContainer const* Cats() const { return this->CatsShared().get(); }
714-
[[nodiscard]] virtual std::shared_ptr<CatContainer const> CatsShared() const {
715-
LOG(FATAL) << "Not implemented for the current DMatrix type.";
716-
return nullptr;
713+
[[nodiscard]] CatContainer const* Cats() const { return this->CatsShared().get(); }
714+
[[nodiscard]] std::shared_ptr<CatContainer const> CatsShared() const {
715+
return this->Info().CatsShared();
717716
}
718717

719718
protected:

python-package/xgboost/core.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,38 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
477477
`X`) as key. Don't repeat the `X` for multiple batches with different meta data
478478
(like `label`), make a copy if necessary.
479479
480+
.. note::
481+
482+
When the input for each batch is a DataFrame, we assume categories are
483+
consistently encoded for all batches. For example, given two dataframes for two
484+
batches, this is invalid:
485+
486+
.. code-block::
487+
488+
import pandas as pd
489+
490+
x0 = pd.DataFrame({"a": [0, 1]}, dtype="category")
491+
x1 = pd.DataFrame({"a": [1, 2]}, dtype="category")
492+
493+
This is invalid because `x0` has `[0, 1]` as categories while `x1` has `[1,
494+
2]`. They should share the same set of categories and encoding:
495+
496+
.. code-block::
497+
498+
import numpy as np
499+
500+
categories = np.array([0, 1, 2])
501+
x0["a"] = pd.Categorical.from_codes(
502+
codes=np.array([0, 1]), categories=categories
503+
)
504+
x1["a"] = pd.Categorical.from_codes(
505+
codes=np.array([1, 2]), categories=categories
506+
)
507+
508+
You can ensure consistent encoding in your preprocessing step; be careful
509+
that the data is stored in formats that preserve the encoding when chunking the
510+
data.
511+
480512
Parameters
481513
----------
482514
cache_prefix :
@@ -861,15 +893,16 @@ def __init__(
861893
862894
Experimental support of specializing for categorical features.
863895
864-
If passing 'True' and 'data' is a data frame (from supported libraries such
865-
as Pandas, Modin or cuDF), columns of categorical types will automatically
866-
be set to be of categorical type (feature_type='c') in the resulting
867-
DMatrix.
896+
If passing `True` and `data` is a data frame (from supported libraries such as
897+
Pandas, Modin or cuDF), the DMatrix recognizes categorical columns and
898+
automatically sets the `feature_types` parameter. If `data` is not a data
899+
frame, this argument is ignored.
868900
869-
If passing 'False' and 'data' is a data frame with categorical columns,
870-
it will result in an error being thrown.
901+
If passing `False` and `data` is a data frame with categorical columns, it
902+
will result in an error.
871903
872-
If 'data' is not a data frame, this argument is ignored.
904+
See the notes in :py:class:`DataIter` for the consistency requirement when the
905+
input is an iterator.
873906
874907
JSON/UBJSON serialization format is required for this.
875908

python-package/xgboost/testing/__init__.py

Lines changed: 1 addition & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
"""
55

66
# pylint: disable=invalid-name,missing-function-docstring
7-
import gc
87
import importlib.util
98
import os
109
import platform
@@ -41,6 +40,7 @@
4140

4241
from .._typing import PathLike
4342
from .data import (
43+
IteratorForTest,
4444
get_california_housing,
4545
get_cancer,
4646
get_digits,
@@ -217,65 +217,6 @@ def skip_win() -> PytestSkip:
217217
return {"reason": "Unsupported platform.", "condition": is_windows()}
218218

219219

220-
class IteratorForTest(xgb.core.DataIter):
221-
"""Iterator for testing streaming DMatrix. (external memory, quantile)"""
222-
223-
def __init__( # pylint: disable=too-many-arguments
224-
self,
225-
X: Sequence,
226-
y: Sequence,
227-
w: Optional[Sequence],
228-
*,
229-
cache: Optional[str],
230-
on_host: bool = False,
231-
min_cache_page_bytes: Optional[int] = None,
232-
) -> None:
233-
assert len(X) == len(y)
234-
self.X = X
235-
self.y = y
236-
self.w = w
237-
self.it = 0
238-
super().__init__(
239-
cache_prefix=cache,
240-
on_host=on_host,
241-
min_cache_page_bytes=min_cache_page_bytes,
242-
)
243-
244-
def next(self, input_data: Callable) -> bool:
245-
if self.it == len(self.X):
246-
return False
247-
248-
with pytest.raises(TypeError, match="Keyword argument"):
249-
input_data(self.X[self.it], self.y[self.it], None)
250-
251-
# Use copy to make sure the iterator doesn't hold a reference to the data.
252-
input_data(
253-
data=self.X[self.it].copy(),
254-
label=self.y[self.it].copy(),
255-
weight=self.w[self.it].copy() if self.w else None,
256-
)
257-
gc.collect() # clear up the copy, see if XGBoost access freed memory.
258-
self.it += 1
259-
return True
260-
261-
def reset(self) -> None:
262-
self.it = 0
263-
264-
def as_arrays(
265-
self,
266-
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
267-
if isinstance(self.X[0], sparse.csr_matrix):
268-
X = sparse.vstack(self.X, format="csr")
269-
else:
270-
X = np.concatenate(self.X, axis=0)
271-
y = np.concatenate(self.y, axis=0)
272-
if self.w:
273-
w = np.concatenate(self.w, axis=0)
274-
else:
275-
w = None
276-
return X, y, w
277-
278-
279220
def make_regression(
280221
n_samples: int, n_features: int, use_cupy: bool
281222
) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:

python-package/xgboost/testing/data.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# pylint: disable=invalid-name, too-many-lines
22
"""Utilities for data generation."""
3+
import gc
34
import multiprocessing
45
import os
56
import string
@@ -15,6 +16,7 @@
1516
List,
1617
NamedTuple,
1718
Optional,
19+
Sequence,
1820
Set,
1921
Tuple,
2022
Type,
@@ -28,7 +30,7 @@
2830
from numpy.random import Generator as RNG
2931
from scipy import sparse
3032

31-
from ..core import DMatrix, QuantileDMatrix
33+
from ..core import DataIter, DMatrix, QuantileDMatrix
3234
from ..data import is_pd_cat_dtype, pandas_pyarrow_mapper
3335
from ..sklearn import ArrayLike, XGBRanker
3436
from ..training import train as train_fn
@@ -1043,3 +1045,63 @@ def make_categorical(
10431045
df = cudf.from_pandas(df)
10441046
label = cupy.array(label)
10451047
return df, label
1048+
1049+
1050+
class IteratorForTest(DataIter):
1051+
"""Iterator for testing streaming DMatrix. (external memory, quantile)"""
1052+
1053+
def __init__( # pylint: disable=too-many-arguments
1054+
self,
1055+
X: Sequence,
1056+
y: Sequence,
1057+
w: Optional[Sequence],
1058+
*,
1059+
cache: Optional[str],
1060+
on_host: bool = False,
1061+
min_cache_page_bytes: Optional[int] = None,
1062+
) -> None:
1063+
assert len(X) == len(y)
1064+
self.X = X
1065+
self.y = y
1066+
self.w = w
1067+
self.it = 0
1068+
super().__init__(
1069+
cache_prefix=cache,
1070+
on_host=on_host,
1071+
min_cache_page_bytes=min_cache_page_bytes,
1072+
)
1073+
1074+
def next(self, input_data: Callable) -> bool:
1075+
if self.it == len(self.X):
1076+
return False
1077+
1078+
with pytest.raises(TypeError, match="Keyword argument"):
1079+
input_data(self.X[self.it], self.y[self.it], None)
1080+
1081+
# Use copy to make sure the iterator doesn't hold a reference to the data.
1082+
input_data(
1083+
data=self.X[self.it].copy(),
1084+
label=self.y[self.it].copy(),
1085+
weight=self.w[self.it].copy() if self.w else None,
1086+
)
1087+
gc.collect() # clear up the copy, see if XGBoost access freed memory.
1088+
self.it += 1
1089+
return True
1090+
1091+
def reset(self) -> None:
1092+
self.it = 0
1093+
1094+
def as_arrays(
1095+
self,
1096+
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
1097+
"""Return concatenated arrays."""
1098+
if isinstance(self.X[0], sparse.csr_matrix):
1099+
X = sparse.vstack(self.X, format="csr")
1100+
else:
1101+
X = np.concatenate(self.X, axis=0)
1102+
y = np.concatenate(self.y, axis=0)
1103+
if self.w:
1104+
w = np.concatenate(self.w, axis=0)
1105+
else:
1106+
w = None
1107+
return X, y, w

python-package/xgboost/testing/data_iter.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,15 @@ def reset(self) -> None:
7575
it = _InvalidCatIter()
7676
import pytest
7777

78-
with pytest.raises(ValueError, match="Inconsistent feature types between batches"):
78+
with pytest.raises(ValueError, match="Inconsistent number of categories between"):
7979
ExtMemQuantileDMatrix(it, enable_categorical=True)
8080

81+
with pytest.raises(ValueError, match="Inconsistent number of categories between"):
82+
QuantileDMatrix(it, enable_categorical=True)
83+
84+
with pytest.raises(ValueError, match="Inconsistent feature types"):
85+
DMatrix(it, enable_categorical=True)
86+
8187

8288
def check_uneven_sizes(device: str) -> None:
8389
"""Tests for having irregular data shapes."""

0 commit comments

Comments
 (0)