[backport][pyspark] Support columnar input for cpu pipeline (dmlc#11299) (dmlc#11301)

trivialfis · wbo4958 · web-flow · commit 532318d569da · 2025-03-03T16:12:36.000+08:00
Co-authored-by: Bobby Wang <wbo4958@gmail.com>
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
@@ -449,12 +449,6 @@ def _validate_params(self) -> None:
                 "The `exact` tree method is not supported for distributed systems."
             )
 
-        if self.getOrDefault(self.features_cols):
-            if not self._run_on_gpu():
-                raise ValueError(
-                    "features_col param with list value requires `device=cuda`."
-                )
-
         if self.getOrDefault("objective") is not None:
             if not isinstance(self.getOrDefault("objective"), str):
                 raise ValueError("Only string type 'objective' param is allowed.")
diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -1796,6 +1796,23 @@ def check_conf(conf: Config) -> None:
             loaded_model = SparkXGBClassifierModel.load(path)
             check_conf(loaded_model.getOrDefault(loaded_model.coll_cfg))
 
+    def test_classifier_with_multi_cols(self):
+        df = self.session.createDataFrame(
+            [
+                (1.0, 2.0, 0),
+                (3.1, 4.2, 1),
+            ],
+            ["a", "b", "label"],
+        )
+        features = ["a", "b"]
+        cls = SparkXGBClassifier(features_col=features, device="cpu", n_estimators=2)
+        model = cls.fit(df)
+        self.assertEqual(features, model.getOrDefault(model.features_cols))
+        self.assertTrue(not model.isSet(model.featuresCol))
+
+        # No exception
+        model.transform(df).collect()
+
 
 LTRData = namedtuple(
     "LTRData",