Skip to content

Commit bdc1a3c

Browse files
authored
Fix pyspark parameter. (dmlc#9460)
- Don't pass the `use_gpu` parameter to the learner.
- Fix GPU approx with PySpark.
1 parent 428f6cb commit bdc1a3c

File tree

3 files changed

+16
-9
lines changed

3 files changed

+16
-9
lines changed

python-package/xgboost/spark/core.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -115,6 +115,7 @@
115115
"qid_col",
116116
"repartition_random_shuffle",
117117
"pred_contrib_col",
118+
"use_gpu",
118119
]
119120

120121
_non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
@@ -349,11 +350,9 @@ def _validate_params(self) -> None:
349350
)
350351

351352
tree_method = self.getOrDefault(self.getParam("tree_method"))
352-
if (
353-
self.getOrDefault(self.use_gpu) or use_cuda(self.getOrDefault(self.device))
354-
) and not _can_use_qdm(tree_method):
353+
if tree_method == "exact":
355354
raise ValueError(
356-
f"The `{tree_method}` tree method is not supported on GPU."
355+
"The `exact` tree method is not supported for distributed systems."
357356
)
358357

359358
if self.getOrDefault(self.features_cols):

tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -151,12 +151,18 @@ def spark_diabetes_dataset_feature_cols(spark_session_with_gpu):
151151
return train_df, test_df, data.feature_names
152152

153153

154-
def test_sparkxgb_classifier_with_gpu(spark_iris_dataset):
154+
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
155+
def test_sparkxgb_classifier_with_gpu(tree_method: str, spark_iris_dataset) -> None:
155156
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
156157

157-
classifier = SparkXGBClassifier(device="cuda", num_workers=num_workers)
158+
classifier = SparkXGBClassifier(
159+
device="cuda", num_workers=num_workers, tree_method=tree_method
160+
)
158161
train_df, test_df = spark_iris_dataset
159162
model = classifier.fit(train_df)
163+
config = json.loads(model.get_booster().save_config())
164+
ctx = config["learner"]["generic_param"]
165+
assert ctx["device"] == "cuda:0"
160166
pred_result_df = model.transform(test_df)
161167
evaluator = MulticlassClassificationEvaluator(metricName="f1")
162168
f1 = evaluator.evaluate(pred_result_df)

tests/test_distributed/test_with_spark/test_spark_local.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -456,7 +456,9 @@ def check_sub_dict_match(
456456
assert sub_dist[k] == whole_dict[k], f"check on {k} failed"
457457

458458

459-
def get_params_map(params_kv: dict, estimator: Type) -> dict:
459+
def get_params_map(
460+
params_kv: dict, estimator: xgb.spark.core._SparkXGBEstimator
461+
) -> dict:
460462
return {getattr(estimator, k): v for k, v in params_kv.items()}
461463

462464

@@ -870,10 +872,10 @@ def test_regressor_model_pipeline_save_load(self, reg_data: RegData) -> None:
870872

871873
def test_device_param(self, reg_data: RegData, clf_data: ClfData) -> None:
872874
clf = SparkXGBClassifier(device="cuda", tree_method="exact")
873-
with pytest.raises(ValueError, match="not supported on GPU"):
875+
with pytest.raises(ValueError, match="not supported for distributed"):
874876
clf.fit(clf_data.cls_df_train)
875877
regressor = SparkXGBRegressor(device="cuda", tree_method="exact")
876-
with pytest.raises(ValueError, match="not supported on GPU"):
878+
with pytest.raises(ValueError, match="not supported for distributed"):
877879
regressor.fit(reg_data.reg_df_train)
878880

879881
reg = SparkXGBRegressor(device="cuda", tree_method="gpu_hist")

0 commit comments

Comments
 (0)