Validate features for inplace predict. (dmlc#8359)

trivialfis · web-flow · commit c884b9e88812 · 2022-10-19T23:05:36.000+08:00
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
@@ -406,10 +406,7 @@ def c_array(
 
 
 def _prediction_output(
-    shape: CNumericPtr,
-    dims: c_bst_ulong,
-    predts: CFloatPtr,
-    is_cuda: bool
+    shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
 ) -> NumpyOrCupy:
     arr_shape = ctypes2numpy(shape, dims.value, np.uint64)
     length = int(np.prod(arr_shape))
@@ -1555,7 +1552,7 @@ def __init__(
                                          ctypes.byref(self.handle)))
         for d in cache:
             # Validate feature only after the feature names are saved into booster.
-            self._validate_features(d)
+            self._validate_dmatrix_features(d)
 
         if isinstance(model_file, Booster):
             assert self.handle is not None
@@ -1914,7 +1911,7 @@ def update(
         """
         if not isinstance(dtrain, DMatrix):
             raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
-        self._validate_features(dtrain)
+        self._validate_dmatrix_features(dtrain)
 
         if fobj is None:
             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
@@ -1946,7 +1943,7 @@ def boost(self, dtrain: DMatrix, grad: np.ndarray, hess: np.ndarray) -> None:
             )
         if not isinstance(dtrain, DMatrix):
             raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
-        self._validate_features(dtrain)
+        self._validate_dmatrix_features(dtrain)
 
         _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
                                                c_array(ctypes.c_float, grad),
@@ -1982,7 +1979,7 @@ def eval_set(
                 raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}")
             if not isinstance(d[1], str):
                 raise TypeError(f"expected string, got {type(d[1]).__name__}")
-            self._validate_features(d[0])
+            self._validate_dmatrix_features(d[0])
 
         dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
         evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
@@ -2033,7 +2030,7 @@ def eval(self, data: DMatrix, name: str = 'eval', iteration: int = 0) -> str:
         result: str
             Evaluation result string.
         """
-        self._validate_features(data)
+        self._validate_dmatrix_features(data)
         return self.eval_set([(data, name)], iteration)
 
     # pylint: disable=too-many-function-args
@@ -2136,7 +2133,7 @@ def predict(
         if not isinstance(data, DMatrix):
             raise TypeError('Expecting data to be a DMatrix object, got: ', type(data))
         if validate_features:
-            self._validate_features(data)
+            self._validate_dmatrix_features(data)
         iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range)
         args = {
             "type": 0,
@@ -2184,8 +2181,8 @@ def inplace_predict(
         base_margin: Any = None,
         strict_shape: bool = False
     ) -> NumpyOrCupy:
-        """Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction does not
-        cache the prediction result.
+        """Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction
+        does not cache the prediction result.
 
         Calling only ``inplace_predict`` in multiple threads is safe and lock
         free.  But the safety does not hold when used in conjunction with other
@@ -2273,18 +2270,22 @@ def inplace_predict(
                 )
 
         from .data import (
-            _is_pandas_df,
-            _transform_pandas_df,
+            _array_interface,
             _is_cudf_df,
             _is_cupy_array,
-            _array_interface,
+            _is_pandas_df,
+            _transform_pandas_df,
         )
+
         enable_categorical = _has_categorical(self, data)
         if _is_pandas_df(data):
-            data, _, _ = _transform_pandas_df(data, enable_categorical)
+            data, fns, _ = _transform_pandas_df(data, enable_categorical)
+            if validate_features:
+                self._validate_features(fns)
 
         if isinstance(data, np.ndarray):
             from .data import _ensure_np_dtype
+
             data, _ = _ensure_np_dtype(data, data.dtype)
             _check_call(
                 _LIB.XGBoosterPredictFromDense(
@@ -2334,10 +2335,13 @@ def inplace_predict(
             return _prediction_output(shape, dims, preds, True)
         if _is_cudf_df(data):
             from .data import _cudf_array_interfaces, _transform_cudf_df
-            data, cat_codes, _, _ = _transform_cudf_df(
+
+            data, cat_codes, fns, _ = _transform_cudf_df(
                 data, None, None, enable_categorical
             )
             interfaces_str = _cudf_array_interfaces(data, cat_codes)
+            if validate_features:
+                self._validate_features(fns)
             _check_call(
                 _LIB.XGBoosterPredictFromCudaColumnar(
                     self.handle,
@@ -2723,40 +2727,55 @@ def trees_to_dataframe(self, fmap: Union[str, os.PathLike] = '') -> DataFrame:
         # pylint: disable=no-member
         return df.sort(['Tree', 'Node']).reset_index(drop=True)
 
-    def _validate_features(self, data: DMatrix) -> None:
-        """
-        Validate Booster and data's feature_names are identical.
-        Set feature_names and feature_types from DMatrix
-        """
+    def _validate_dmatrix_features(self, data: DMatrix) -> None:
         if data.num_row() == 0:
             return
 
+        fn = data.feature_names
+        ft = data.feature_types
+        # Be consistent with versions before 1.7, "validate" actually modifies the
+        # booster.
         if self.feature_names is None:
-            self.feature_names = data.feature_names
-            self.feature_types = data.feature_types
-        if data.feature_names is None and self.feature_names is not None:
+            self.feature_names = fn
+        if self.feature_types is None:
+            self.feature_types = ft
+
+        self._validate_features(fn)
+
+    def _validate_features(self, feature_names: Optional[FeatureNames]) -> None:
+        if self.feature_names is None:
+            return
+
+        if feature_names is None and self.feature_names is not None:
             raise ValueError(
-                "training data did not have the following fields: " +
-                ", ".join(self.feature_names)
+                "training data did not have the following fields: "
+                + ", ".join(self.feature_names)
             )
-        # Booster can't accept data with different feature names
-        if self.feature_names != data.feature_names:
-            dat_missing = set(cast(FeatureNames, self.feature_names)) - \
-                          set(cast(FeatureNames, data.feature_names))
-            my_missing = set(cast(FeatureNames, data.feature_names)) - \
-                         set(cast(FeatureNames, self.feature_names))
 
-            msg = 'feature_names mismatch: {0} {1}'
+        if self.feature_names != feature_names:
+            dat_missing = set(cast(FeatureNames, self.feature_names)) - set(
+                cast(FeatureNames, feature_names)
+            )
+            my_missing = set(cast(FeatureNames, feature_names)) - set(
+                cast(FeatureNames, self.feature_names)
+            )
+
+            msg = "feature_names mismatch: {0} {1}"
 
             if dat_missing:
-                msg += ('\nexpected ' + ', '.join(
-                    str(s) for s in dat_missing) + ' in input data')
+                msg += (
+                    "\nexpected "
+                    + ", ".join(str(s) for s in dat_missing)
+                    + " in input data"
+                )
 
             if my_missing:
-                msg += ('\ntraining data did not have the following fields: ' +
-                        ', '.join(str(s) for s in my_missing))
+                msg += (
+                    "\ntraining data did not have the following fields: "
+                    + ", ".join(str(s) for s in my_missing)
+                )
 
-            raise ValueError(msg.format(self.feature_names, data.feature_names))
+            raise ValueError(msg.format(self.feature_names, feature_names))
 
     def get_split_value_histogram(
         self,
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
@@ -2,6 +2,7 @@
 import importlib.util
 import json
 import os
+import random
 import tempfile
 from typing import Callable, Optional
 
@@ -998,34 +999,40 @@ def test_deprecate_position_arg():
 def test_pandas_input():
     import pandas as pd
     from sklearn.calibration import CalibratedClassifierCV
+
     rng = np.random.RandomState(1994)
 
     kRows = 100
     kCols = 6
 
-    X = rng.randint(low=0, high=2, size=kRows*kCols)
+    X = rng.randint(low=0, high=2, size=kRows * kCols)
     X = X.reshape(kRows, kCols)
 
     df = pd.DataFrame(X)
     feature_names = []
     for i in range(1, kCols):
-        feature_names += ['k'+str(i)]
+        feature_names += ["k" + str(i)]
 
-    df.columns = ['status'] + feature_names
+    df.columns = ["status"] + feature_names
 
-    target = df['status']
-    train = df.drop(columns=['status'])
+    target = df["status"]
+    train = df.drop(columns=["status"])
     model = xgb.XGBClassifier()
     model.fit(train, target)
     np.testing.assert_equal(model.feature_names_in_, np.array(feature_names))
 
-    clf_isotonic = CalibratedClassifierCV(model,
-                                          cv='prefit', method='isotonic')
+    columns = list(train.columns)
+    random.shuffle(columns, lambda: 0.1)
+    df_incorrect = df[columns]
+    with pytest.raises(ValueError):
+        model.predict(df_incorrect)
+
+    clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic")
     clf_isotonic.fit(train, target)
-    assert isinstance(clf_isotonic.calibrated_classifiers_[0].base_estimator,
-                      xgb.XGBClassifier)
-    np.testing.assert_allclose(np.array(clf_isotonic.classes_),
-                               np.array([0, 1]))
+    assert isinstance(
+        clf_isotonic.calibrated_classifiers_[0].base_estimator, xgb.XGBClassifier
+    )
+    np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))
 
 
 def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):