@@ -143,10 +143,9 @@ struct SparsePageLoader {
 };
 
 struct EllpackLoader {
-  EllpackDeviceAccessor const& matrix;
-  XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_idx_t,
-                               float)
-      : matrix{m} {}
+  EllpackDeviceAccessor matrix;
+  XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor m, bool, bst_feature_t, bst_idx_t, float)
+      : matrix{std::move(m)} {}
   [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
     auto gidx = matrix.GetBinIndex<false>(ridx, fidx);
     if (gidx == -1) {
@@ -162,6 +161,8 @@ struct EllpackLoader {
     }
     return matrix.gidx_fvalue_map[gidx - 1];
   }
+  [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumCols() const { return this->matrix.NumFeatures(); }
+  [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return this->matrix.n_rows; }
 };
 
 template <typename Batch>
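These two accessors complete the implicit dataset interface that the gpu_treeshap kernels read from: any type exposing `GetElement(row, col)`, `NumRows()`, and `NumCols()` can serve as the `X` argument passed to `GPUTreeShap` further down. A minimal host-only sketch of that interface, using a hypothetical `ToyDenseView` that is not part of XGBoost:

```cpp
// Sketch of the dataset interface assumed by the SHAP kernels.
// ToyDenseView is a made-up illustration type; the real EllpackLoader
// resolves GetElement through the compressed ELLPACK bin index instead.
#include <cstddef>
#include <iostream>
#include <vector>

struct ToyDenseView {
  std::vector<float> data;  // row-major, n_rows x n_cols
  std::size_t n_rows{0};
  std::size_t n_cols{0};

  // Same member shape as EllpackLoader::GetElement/NumRows/NumCols above.
  [[nodiscard]] float GetElement(std::size_t ridx, std::size_t fidx) const {
    return data[ridx * n_cols + fidx];
  }
  [[nodiscard]] std::size_t NumRows() const { return n_rows; }
  [[nodiscard]] std::size_t NumCols() const { return n_cols; }
};

int main() {
  ToyDenseView X{{1.f, 2.f, 3.f, 4.f}, 2, 2};
  std::cout << X.GetElement(1, 0) << " in a " << X.NumRows() << "x" << X.NumCols() << " view\n";
}
```

Answering `GetElement` through the bin index is what lets SHAP run on a `QuantileDMatrix` without materializing a `SparsePage`; it also motivates the switch to holding the accessor by value, since the loader is now copied into device code.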
@@ -1031,9 +1032,6 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-    if (!p_fmat->PageExists<SparsePage>()) {
-      LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
-    }
     CHECK(!p_fmat->Info().IsColumnSplit())
         << "Predict contribution support for column-wise data split is not yet implemented.";
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
@@ -1047,8 +1045,8 @@ class GPUPredictor : public xgboost::Predictor {
     // allocate space for (number of features + bias) times the number of rows
     size_t contributions_columns =
         model.learner_model_param->num_feature + 1;  // +1 for bias
-    out_contribs->Resize(p_fmat->Info().num_row_ * contributions_columns *
-                         model.learner_model_param->num_output_group);
+    auto dim_size = contributions_columns * model.learner_model_param->num_output_group;
+    out_contribs->Resize(p_fmat->Info().num_row_ * dim_size);
     out_contribs->Fill(0.0f);
     auto phis = out_contribs->DeviceSpan();
 
@@ -1058,16 +1056,27 @@ class GPUPredictor : public xgboost::Predictor {
     d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
     ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
-    for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
-      batch.data.SetDevice(ctx_->Device());
-      batch.offset.SetDevice(ctx_->Device());
-      SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
-                       model.learner_model_param->num_feature);
-      auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
-      gpu_treeshap::GPUTreeShap<dh::XGBDeviceAllocator<int>>(
-          X, device_paths.begin(), device_paths.end(), ngroup, begin,
-          dh::tend(phis));
+    if (p_fmat->PageExists<SparsePage>()) {
+      for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
+        batch.data.SetDevice(ctx_->Device());
+        batch.offset.SetDevice(ctx_->Device());
+        SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
+                         model.learner_model_param->num_feature);
+        auto begin = dh::tbegin(phis) + batch.base_rowid * dim_size;
+        gpu_treeshap::GPUTreeShap<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
+    } else {
+      for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
+        EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
+        auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
+                               std::numeric_limits<float>::quiet_NaN()};
+        auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
+        gpu_treeshap::GPUTreeShap<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
     }
+
     // Add the base margin term to last column
     p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
     const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
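The batched dispatch relies on a fixed per-row stride in the output buffer: each row owns `contributions_columns * num_output_group` floats, so a batch starting at `base_rowid` writes at `phis + base_rowid * dim_size` and batches never overlap. A toy recomputation of that arithmetic (all numbers made up; only `dim_size` mirrors the variable introduced in this diff):

```cpp
// Sanity-check of the phis buffer layout assumed by the batched SHAP dispatch:
// one (num_feature + 1)-wide block per (row, group), concatenated over rows.
#include <cstddef>
#include <iostream>

int main() {
  std::size_t num_feature = 3, ngroup = 2, num_rows = 10;
  std::size_t contributions_columns = num_feature + 1;    // +1 for bias
  std::size_t dim_size = contributions_columns * ngroup;  // per-row stride
  std::size_t total = num_rows * dim_size;                // the Resize() amount

  // A batch starting at base_rowid = 4 lands at a disjoint sub-range.
  std::size_t base_rowid = 4;
  std::cout << "buffer size: " << total << ", batch offset: " << base_rowid * dim_size << "\n";
}
```

Hoisting the stride into `dim_size` also fixes a latent inconsistency in the old loop, which offset by `contributions_columns` alone and so ignored the group dimension.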
@@ -1094,9 +1103,6 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-    if (!p_fmat->PageExists<SparsePage>()) {
-      LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
-    }
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     out_contribs->SetDevice(ctx_->Device());
     if (tree_end == 0 || tree_end > model.trees.size()) {
@@ -1108,9 +1114,9 @@ class GPUPredictor : public xgboost::Predictor {
     // allocate space for (number of features + bias) times the number of rows
     size_t contributions_columns =
         model.learner_model_param->num_feature + 1;  // +1 for bias
-    out_contribs->Resize(p_fmat->Info().num_row_ * contributions_columns *
-                         contributions_columns *
-                         model.learner_model_param->num_output_group);
+    auto dim_size =
+        contributions_columns * contributions_columns * model.learner_model_param->num_output_group;
+    out_contribs->Resize(p_fmat->Info().num_row_ * dim_size);
     out_contribs->Fill(0.0f);
     auto phis = out_contribs->DeviceSpan();
 
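Same layout argument for the interaction case, except each row now owns a full `(num_feature + 1) x (num_feature + 1)` matrix per output group. A quick check with toy numbers:

```cpp
// Toy check of the interaction buffer stride introduced above.
#include <cstddef>
#include <iostream>

int main() {
  std::size_t num_feature = 3, ngroup = 2;
  std::size_t cols = num_feature + 1;            // +1 for bias
  std::size_t dim_size = cols * cols * ngroup;   // per-row stride for interactions
  std::cout << dim_size << " floats per row\n";  // 4 * 4 * 2 = 32
}
```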
@@ -1120,16 +1126,29 @@ class GPUPredictor : public xgboost::Predictor {
     d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
     ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
-    for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
-      batch.data.SetDevice(ctx_->Device());
-      batch.offset.SetDevice(ctx_->Device());
-      SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
-                       model.learner_model_param->num_feature);
-      auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
-      gpu_treeshap::GPUTreeShapInteractions<dh::XGBDeviceAllocator<int>>(
-          X, device_paths.begin(), device_paths.end(), ngroup, begin,
-          dh::tend(phis));
+    if (p_fmat->PageExists<SparsePage>()) {
+      for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
+        batch.data.SetDevice(ctx_->Device());
+        batch.offset.SetDevice(ctx_->Device());
+        SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
+                         model.learner_model_param->num_feature);
+        auto begin = dh::tbegin(phis) + batch.base_rowid * dim_size;
+        gpu_treeshap::GPUTreeShapInteractions<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
+    } else {
+      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
+        auto impl = batch.Impl();
+        auto acc =
+            impl->GetDeviceAccessor(ctx_->Device(), p_fmat->Info().feature_types.ConstDeviceSpan());
+        auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
+        auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
+                               std::numeric_limits<float>::quiet_NaN()};
+        gpu_treeshap::GPUTreeShapInteractions<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
     }
+
     // Add the base margin term to last column
     p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
     const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
@@ -1180,51 +1199,35 @@ class GPUPredictor : public xgboost::Predictor {
     bool use_shared = shared_memory_bytes != 0;
     bst_feature_t num_features = info.num_col_;
 
+    auto launch = [&](auto fn, std::uint32_t grid, auto data, bst_idx_t batch_offset) {
+      dh::LaunchKernel{grid, kBlockThreads, shared_memory_bytes}(
+          fn, data, d_model.nodes.ConstDeviceSpan(),
+          predictions->DeviceSpan().subspan(batch_offset), d_model.tree_segments.ConstDeviceSpan(),
+
+          d_model.split_types.ConstDeviceSpan(), d_model.categories_tree_segments.ConstDeviceSpan(),
+          d_model.categories_node_segments.ConstDeviceSpan(), d_model.categories.ConstDeviceSpan(),
+
+          d_model.tree_beg_, d_model.tree_end_, num_features, num_rows, use_shared,
+          std::numeric_limits<float>::quiet_NaN());
+    };
+
     if (p_fmat->PageExists<SparsePage>()) {
+      bst_idx_t batch_offset = 0;
       for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
         batch.data.SetDevice(ctx_->Device());
         batch.offset.SetDevice(ctx_->Device());
-        bst_idx_t batch_offset = 0;
         SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
                             model.learner_model_param->num_feature};
-        size_t num_rows = batch.Size();
-        auto grid =
-            static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
-        dh::LaunchKernel{grid, kBlockThreads, shared_memory_bytes}(
-            PredictLeafKernel<SparsePageLoader, SparsePageView>, data,
-            d_model.nodes.ConstDeviceSpan(),
-            predictions->DeviceSpan().subspan(batch_offset),
-            d_model.tree_segments.ConstDeviceSpan(),
-
-            d_model.split_types.ConstDeviceSpan(),
-            d_model.categories_tree_segments.ConstDeviceSpan(),
-            d_model.categories_node_segments.ConstDeviceSpan(),
-            d_model.categories.ConstDeviceSpan(),
-
-            d_model.tree_beg_, d_model.tree_end_, num_features, num_rows,
-            use_shared, std::numeric_limits<float>::quiet_NaN());
+        auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
+        launch(PredictLeafKernel<SparsePageLoader, SparsePageView>, grid, data, batch_offset);
         batch_offset += batch.Size();
       }
     } else {
+      bst_idx_t batch_offset = 0;
       for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
-        bst_idx_t batch_offset = 0;
         EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
-        size_t num_rows = batch.Size();
-        auto grid =
-            static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
-        dh::LaunchKernel{grid, kBlockThreads, shared_memory_bytes}(
-            PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, data,
-            d_model.nodes.ConstDeviceSpan(),
-            predictions->DeviceSpan().subspan(batch_offset),
-            d_model.tree_segments.ConstDeviceSpan(),
-
-            d_model.split_types.ConstDeviceSpan(),
-            d_model.categories_tree_segments.ConstDeviceSpan(),
-            d_model.categories_node_segments.ConstDeviceSpan(),
-            d_model.categories.ConstDeviceSpan(),
-
-            d_model.tree_beg_, d_model.tree_end_, num_features, num_rows,
-            use_shared, std::numeric_limits<float>::quiet_NaN());
+        auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
+        launch(PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, grid, data, batch_offset);
         batch_offset += batch.Size();
       }
     }
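The `launch` helper is a generic lambda: it captures the long shared argument list once and takes the kernel instantiation plus the per-batch pieces as parameters, so the two branches differ only in loader type and data view. This change also hoists `batch_offset` out of the loops, fixing the old code that reset it to zero on every batch. A standalone sketch of the deduplication pattern, with plain host functions standing in for the CUDA kernels (`KernelA`/`KernelB` are hypothetical):

```cpp
// Sketch: a generic lambda factors out a repeated call site; each branch
// supplies only the function and the per-batch arguments that differ.
#include <cstddef>
#include <iostream>
#include <vector>

void KernelA(std::vector<float> const& data, std::size_t offset) {
  std::cout << "A: " << data.size() << " rows at offset " << offset << "\n";
}
void KernelB(std::vector<float> const& data, std::size_t offset) {
  std::cout << "B: " << data.size() << " rows at offset " << offset << "\n";
}

int main() {
  // Stands in for dh::LaunchKernel{grid, kBlockThreads, ...}(fn, ...) above.
  auto launch = [&](auto fn, auto const& data, std::size_t batch_offset) {
    fn(data, batch_offset);
  };
  std::vector<float> sparse{1.f, 2.f}, ellpack{3.f, 4.f, 5.f};
  launch(KernelA, sparse, 0);   // SparsePage path
  launch(KernelB, ellpack, 2);  // EllpackPage path
}
```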