32
32
ShortType ,
33
33
)
34
34
from scipy .special import expit , softmax # pylint: disable=no-name-in-module
35
+ from xgboost .compat import is_cudf_available
35
36
from xgboost .core import Booster
36
37
from xgboost .training import train as worker_train
37
38
@@ -759,7 +760,8 @@ def _fit(self, dataset):
759
760
k : v for k , v in train_call_kwargs_params .items () if v is not None
760
761
}
761
762
dmatrix_kwargs = {k : v for k , v in dmatrix_kwargs .items () if v is not None }
762
- use_qdm = booster_params .get ("tree_method" , None ) in ("hist" , "gpu_hist" )
763
+
764
+ use_hist = booster_params .get ("tree_method" , None ) in ("hist" , "gpu_hist" )
763
765
764
766
def _train_booster (pandas_df_iter ):
765
767
"""Takes in an RDD partition and outputs a booster for that partition after
@@ -773,6 +775,15 @@ def _train_booster(pandas_df_iter):
773
775
774
776
gpu_id = None
775
777
778
+ # If cuDF is not installed, then use DMatrix instead of QDM,
779
+ # because without cuDF, DMatrix performs better than QDM.
780
+ # Note: `is_cudf_available` is checked on the spark worker side because
781
+ # the spark worker might have a different python environment from the driver side.
782
+ if use_gpu :
783
+ use_qdm = use_hist and is_cudf_available ()
784
+ else :
785
+ use_qdm = use_hist
786
+
776
787
if use_qdm and (booster_params .get ("max_bin" , None ) is not None ):
777
788
dmatrix_kwargs ["max_bin" ] = booster_params ["max_bin" ]
778
789
0 commit comments