Skip to content

Commit 8fb2468

Browse files
authored
Store categories from iterator. (dmlc#11313)
- Support iterator with `DMatrix`, `QuantileDMatrix`, `ExtMemQuantileDMatrix`. - Move some test utilities. - Check consistency between batches.
1 parent cefc196 commit 8fb2468

23 files changed

+394
-213
lines changed

include/xgboost/data.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -710,10 +710,9 @@ class DMatrix {
710710
/**
711711
* @brief Accessor for the string representation of the categories.
712712
*/
713-
CatContainer const* Cats() const { return this->CatsShared().get(); }
714-
[[nodiscard]] virtual std::shared_ptr<CatContainer const> CatsShared() const {
715-
LOG(FATAL) << "Not implemented for the current DMatrix type.";
716-
return nullptr;
713+
[[nodiscard]] CatContainer const* Cats() const { return this->CatsShared().get(); }
714+
[[nodiscard]] std::shared_ptr<CatContainer const> CatsShared() const {
715+
return this->Info().CatsShared();
717716
}
718717

719718
protected:

python-package/xgboost/core.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,38 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
477477
`X`) as key. Don't repeat the `X` for multiple batches with different meta data
478478
(like `label`), make a copy if necessary.
479479
480+
.. note::
481+
482+
When the input for each batch is a DataFrame, we assume categories are
483+
consistently encoded for all batches. For example, given two dataframes for two
484+
batches, this is invalid:
485+
486+
.. code-block::
487+
488+
import pandas as pd
489+
490+
x0 = pd.DataFrame({"a": [0, 1]}, dtype="category")
491+
x1 = pd.DataFrame({"a": [1, 2]}, dtype="category")
492+
493+
This is invalid because `x0` has `[0, 1]` as categories while `x1` has `[1,
494+
2]`. They should share the same set of categories and encoding:
495+
496+
.. code-block::
497+
498+
import numpy as np
499+
500+
categories = np.array([0, 1, 2])
501+
x0["a"] = pd.Categorical.from_codes(
502+
codes=np.array([0, 1]), categories=categories
503+
)
504+
x1["a"] = pd.Categorical.from_codes(
505+
codes=np.array([1, 2]), categories=categories
506+
)
507+
508+
You can ensure consistent encoding in your preprocessing step; be careful
509+
that the data is stored in formats that preserve the encoding when chunking the
510+
data.
511+
480512
Parameters
481513
----------
482514
cache_prefix :
@@ -861,15 +893,16 @@ def __init__(
861893
862894
Experimental support of specializing for categorical features.
863895
864-
If passing 'True' and 'data' is a data frame (from supported libraries such
865-
as Pandas, Modin or cuDF), columns of categorical types will automatically
866-
be set to be of categorical type (feature_type='c') in the resulting
867-
DMatrix.
896+
If passing `True` and `data` is a data frame (from supported libraries such as
897+
Pandas, Modin or cuDF), the DMatrix recognizes categorical columns and
898+
automatically sets the `feature_types` parameter. If `data` is not a data
899+
frame, this argument is ignored.
868900
869-
If passing 'False' and 'data' is a data frame with categorical columns,
870-
it will result in an error being thrown.
901+
If passing `False` and `data` is a data frame with categorical columns, it
902+
will result in an error.
871903
872-
If 'data' is not a data frame, this argument is ignored.
904+
See the notes in :py:class:`DataIter` for the consistency requirement when the
905+
input is an iterator.
873906
874907
JSON/UBJSON serialization format is required for this.
875908

python-package/xgboost/testing/__init__.py

Lines changed: 1 addition & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
"""
55

66
# pylint: disable=invalid-name,missing-function-docstring
7-
import gc
87
import importlib.util
98
import os
109
import platform
@@ -41,6 +40,7 @@
4140

4241
from .._typing import PathLike
4342
from .data import (
43+
IteratorForTest,
4444
get_california_housing,
4545
get_cancer,
4646
get_digits,
@@ -217,65 +217,6 @@ def skip_win() -> PytestSkip:
217217
return {"reason": "Unsupported platform.", "condition": is_windows()}
218218

219219

220-
class IteratorForTest(xgb.core.DataIter):
221-
"""Iterator for testing streaming DMatrix. (external memory, quantile)"""
222-
223-
def __init__( # pylint: disable=too-many-arguments
224-
self,
225-
X: Sequence,
226-
y: Sequence,
227-
w: Optional[Sequence],
228-
*,
229-
cache: Optional[str],
230-
on_host: bool = False,
231-
min_cache_page_bytes: Optional[int] = None,
232-
) -> None:
233-
assert len(X) == len(y)
234-
self.X = X
235-
self.y = y
236-
self.w = w
237-
self.it = 0
238-
super().__init__(
239-
cache_prefix=cache,
240-
on_host=on_host,
241-
min_cache_page_bytes=min_cache_page_bytes,
242-
)
243-
244-
def next(self, input_data: Callable) -> bool:
245-
if self.it == len(self.X):
246-
return False
247-
248-
with pytest.raises(TypeError, match="Keyword argument"):
249-
input_data(self.X[self.it], self.y[self.it], None)
250-
251-
# Use copy to make sure the iterator doesn't hold a reference to the data.
252-
input_data(
253-
data=self.X[self.it].copy(),
254-
label=self.y[self.it].copy(),
255-
weight=self.w[self.it].copy() if self.w else None,
256-
)
257-
gc.collect() # clear up the copy, see if XGBoost access freed memory.
258-
self.it += 1
259-
return True
260-
261-
def reset(self) -> None:
262-
self.it = 0
263-
264-
def as_arrays(
265-
self,
266-
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
267-
if isinstance(self.X[0], sparse.csr_matrix):
268-
X = sparse.vstack(self.X, format="csr")
269-
else:
270-
X = np.concatenate(self.X, axis=0)
271-
y = np.concatenate(self.y, axis=0)
272-
if self.w:
273-
w = np.concatenate(self.w, axis=0)
274-
else:
275-
w = None
276-
return X, y, w
277-
278-
279220
def make_regression(
280221
n_samples: int, n_features: int, use_cupy: bool
281222
) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:

python-package/xgboost/testing/data.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# pylint: disable=invalid-name, too-many-lines
22
"""Utilities for data generation."""
3+
import gc
34
import multiprocessing
45
import os
56
import string
@@ -15,6 +16,7 @@
1516
List,
1617
NamedTuple,
1718
Optional,
19+
Sequence,
1820
Set,
1921
Tuple,
2022
Type,
@@ -28,7 +30,7 @@
2830
from numpy.random import Generator as RNG
2931
from scipy import sparse
3032

31-
from ..core import DMatrix, QuantileDMatrix
33+
from ..core import DataIter, DMatrix, QuantileDMatrix
3234
from ..data import is_pd_cat_dtype, pandas_pyarrow_mapper
3335
from ..sklearn import ArrayLike, XGBRanker
3436
from ..training import train as train_fn
@@ -1043,3 +1045,63 @@ def make_categorical(
10431045
df = cudf.from_pandas(df)
10441046
label = cupy.array(label)
10451047
return df, label
1048+
1049+
1050+
class IteratorForTest(DataIter):
1051+
"""Iterator for testing streaming DMatrix. (external memory, quantile)"""
1052+
1053+
def __init__( # pylint: disable=too-many-arguments
1054+
self,
1055+
X: Sequence,
1056+
y: Sequence,
1057+
w: Optional[Sequence],
1058+
*,
1059+
cache: Optional[str],
1060+
on_host: bool = False,
1061+
min_cache_page_bytes: Optional[int] = None,
1062+
) -> None:
1063+
assert len(X) == len(y)
1064+
self.X = X
1065+
self.y = y
1066+
self.w = w
1067+
self.it = 0
1068+
super().__init__(
1069+
cache_prefix=cache,
1070+
on_host=on_host,
1071+
min_cache_page_bytes=min_cache_page_bytes,
1072+
)
1073+
1074+
def next(self, input_data: Callable) -> bool:
1075+
if self.it == len(self.X):
1076+
return False
1077+
1078+
with pytest.raises(TypeError, match="Keyword argument"):
1079+
input_data(self.X[self.it], self.y[self.it], None)
1080+
1081+
# Use copy to make sure the iterator doesn't hold a reference to the data.
1082+
input_data(
1083+
data=self.X[self.it].copy(),
1084+
label=self.y[self.it].copy(),
1085+
weight=self.w[self.it].copy() if self.w else None,
1086+
)
1087+
gc.collect() # clear up the copy, see if XGBoost access freed memory.
1088+
self.it += 1
1089+
return True
1090+
1091+
def reset(self) -> None:
1092+
self.it = 0
1093+
1094+
def as_arrays(
1095+
self,
1096+
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
1097+
"""Return concatenated arrays."""
1098+
if isinstance(self.X[0], sparse.csr_matrix):
1099+
X = sparse.vstack(self.X, format="csr")
1100+
else:
1101+
X = np.concatenate(self.X, axis=0)
1102+
y = np.concatenate(self.y, axis=0)
1103+
if self.w:
1104+
w = np.concatenate(self.w, axis=0)
1105+
else:
1106+
w = None
1107+
return X, y, w

python-package/xgboost/testing/data_iter.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,15 @@ def reset(self) -> None:
7575
it = _InvalidCatIter()
7676
import pytest
7777

78-
with pytest.raises(ValueError, match="Inconsistent feature types between batches"):
78+
with pytest.raises(ValueError, match="Inconsistent number of categories between"):
7979
ExtMemQuantileDMatrix(it, enable_categorical=True)
8080

81+
with pytest.raises(ValueError, match="Inconsistent number of categories between"):
82+
QuantileDMatrix(it, enable_categorical=True)
83+
84+
with pytest.raises(ValueError, match="Inconsistent feature types"):
85+
DMatrix(it, enable_categorical=True)
86+
8187

8288
def check_uneven_sizes(device: str) -> None:
8389
"""Tests for having irregular data shapes."""

0 commit comments

Comments
 (0)