Skip to content

Commit d81e319

Browse files
trivialfis and hcho3 authored
Fixes for the latest pandas. (dmlc#10266)
Co-authored-by: Philip Hyunsu Cho <[email protected]>
1 parent 5e816e6 commit d81e319

File tree

2 files changed

+36
-28
lines changed

2 files changed

+36
-28
lines changed

python-package/xgboost/data.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -370,10 +370,8 @@ def pandas_feature_info(
370370
if feature_names is None and meta is None:
371371
if isinstance(data.columns, pd.MultiIndex):
372372
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
373-
elif isinstance(data.columns, (pd.Index, pd.RangeIndex)):
374-
feature_names = list(map(str, data.columns))
375373
else:
376-
feature_names = data.columns.format()
374+
feature_names = list(data.columns.map(str))
377375

378376
# handle feature types
379377
if feature_types is None and meta is None:
@@ -865,18 +863,30 @@ def _is_cudf_df(data: DataType) -> bool:
865863
return lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
866864

867865

866+
def _get_cudf_cat_predicate() -> Callable[[Any], bool]:
    """Return a callable that tells whether a dtype is a cuDF categorical dtype.

    The lookup is attempted in this order:

    1. ``cudf.CategoricalDtype`` -- when importable, the returned predicate is a
       plain ``isinstance`` check against that class.
    2. ``cudf.api.types.is_categorical_dtype``
    3. ``cudf.utils.dtypes.is_categorical_dtype``

    The ``ImportError`` from the last attempt is not caught, so it propagates to
    the caller when none of the three locations can be imported.

    """
    try:
        from cudf import CategoricalDtype
    except ImportError:
        # Fall through to the function-based helpers below.
        pass
    else:

        def is_categorical_dtype(dtype: Any) -> bool:
            return isinstance(dtype, CategoricalDtype)

        return is_categorical_dtype

    try:
        from cudf.api.types import is_categorical_dtype  # type: ignore
    except ImportError:
        from cudf.utils.dtypes import is_categorical_dtype  # type: ignore

    return is_categorical_dtype
880+
881+
868882
def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes:
869883
"""Extract CuDF __cuda_array_interface__. This is special as it returns a new list
870884
of data and a list of array interfaces. The data is list of categorical codes that
871885
caller can safely ignore, but have to keep their reference alive until usage of
872886
array interface is finished.
873887
874888
"""
875-
try:
876-
from cudf.api.types import is_categorical_dtype
877-
except ImportError:
878-
from cudf.utils.dtypes import is_categorical_dtype
879-
889+
is_categorical_dtype = _get_cudf_cat_predicate()
880890
interfaces = []
881891

882892
def append(interface: dict) -> None:
@@ -908,12 +918,13 @@ def _transform_cudf_df(
908918
feature_types: Optional[FeatureTypes],
909919
enable_categorical: bool,
910920
) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]:
921+
911922
try:
912-
from cudf.api.types import is_bool_dtype, is_categorical_dtype
923+
from cudf.api.types import is_bool_dtype
913924
except ImportError:
914-
from cudf.utils.dtypes import is_categorical_dtype
915925
from pandas.api.types import is_bool_dtype
916926

927+
is_categorical_dtype = _get_cudf_cat_predicate()
917928
# Work around https://github.com/dmlc/xgboost/issues/10181
918929
if _is_cudf_ser(data):
919930
if is_bool_dtype(data.dtype):
@@ -941,15 +952,8 @@ def _transform_cudf_df(
941952
feature_names = [data.name]
942953
elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
943954
feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
944-
elif (
945-
lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
946-
or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
947-
# Unique to cuDF, no equivalence in pandas 1.3.3
948-
or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
949-
):
950-
feature_names = list(map(str, data.columns))
951955
else:
952-
feature_names = data.columns.format()
956+
feature_names = list(data.columns.map(str))
953957

954958
# handle feature types
955959
if feature_types is None:

tests/python/test_with_pandas.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -280,10 +280,12 @@ def test_pandas_sparse(self):
280280
}
281281
)
282282
y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
283-
dtrain = xgb.DMatrix(X, y)
283+
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
284+
dtrain = xgb.DMatrix(X, y)
284285
booster = xgb.train({}, dtrain, num_boost_round=4)
285-
predt_sparse = booster.predict(xgb.DMatrix(X))
286-
predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
286+
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
287+
predt_sparse = booster.predict(xgb.DMatrix(X))
288+
predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
287289
np.testing.assert_allclose(predt_sparse, predt_dense)
288290

289291
def test_pandas_label(
@@ -572,14 +574,16 @@ def test_pandas_sparse_column_split(self):
572574
y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
573575

574576
def verify_pandas_sparse():
575-
dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL)
577+
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
578+
dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL)
576579
booster = xgb.train({}, dtrain, num_boost_round=4)
577-
predt_sparse = booster.predict(
578-
xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
579-
)
580-
predt_dense = booster.predict(
581-
xgb.DMatrix(X.sparse.to_dense(), data_split_mode=DataSplitMode.COL)
582-
)
580+
with pytest.warns(UserWarning, match="Sparse arrays from pandas"):
581+
predt_sparse = booster.predict(
582+
xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
583+
)
584+
predt_dense = booster.predict(
585+
xgb.DMatrix(X.sparse.to_dense(), data_split_mode=DataSplitMode.COL)
586+
)
583587
np.testing.assert_allclose(predt_sparse, predt_dense)
584588

585589
tm.run_with_rabit(world_size=3, test_fn=verify_pandas_sparse)

0 commit comments

Comments
 (0)