@@ -143,10 +143,9 @@ struct SparsePageLoader {
 };
 
 struct EllpackLoader {
-  EllpackDeviceAccessor const& matrix;
-  XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_idx_t,
-                               float)
-      : matrix{m} {}
+  EllpackDeviceAccessor matrix;
+  XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor m, bool, bst_feature_t, bst_idx_t, float)
+      : matrix{std::move(m)} {}
   [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
     auto gidx = matrix.GetBinIndex<false>(ridx, fidx);
     if (gidx == -1) {
@@ -162,6 +161,8 @@ struct EllpackLoader {
     }
     return matrix.gidx_fvalue_map[gidx - 1];
   }
+  [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumCols() const { return this->matrix.NumFeatures(); }
+  [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return this->matrix.n_rows; }
 };
 
 template <typename Batch>
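These two accessors complete the implicit dataset interface that the gpu_treeshap kernels read from: any type exposing `GetElement(row, col)`, `NumRows()`, and `NumCols()` can serve as the `X` argument passed to `GPUTreeShap` further down. A minimal host-only sketch of that interface, using a hypothetical `ToyDenseView` that is not part of XGBoost:

```cpp
// Sketch of the dataset interface assumed by the SHAP kernels.
// ToyDenseView is a made-up illustration type; the real EllpackLoader
// resolves GetElement through the compressed ELLPACK bin index instead.
#include <cstddef>
#include <iostream>
#include <vector>

struct ToyDenseView {
  std::vector<float> data;  // row-major, n_rows x n_cols
  std::size_t n_rows{0};
  std::size_t n_cols{0};

  // Same member shape as EllpackLoader::GetElement/NumRows/NumCols above.
  [[nodiscard]] float GetElement(std::size_t ridx, std::size_t fidx) const {
    return data[ridx * n_cols + fidx];
  }
  [[nodiscard]] std::size_t NumRows() const { return n_rows; }
  [[nodiscard]] std::size_t NumCols() const { return n_cols; }
};

int main() {
  ToyDenseView X{{1.f, 2.f, 3.f, 4.f}, 2, 2};
  std::cout << X.GetElement(1, 0) << " in a " << X.NumRows() << "x" << X.NumCols() << " view\n";
}
```

Answering `GetElement` through the bin index is what lets SHAP run on a `QuantileDMatrix` without materializing a `SparsePage`; it also motivates the switch to holding the accessor by value, since the loader is now copied into device code.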
@@ -1031,9 +1032,6 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-    if (!p_fmat->PageExists<SparsePage>()) {
-      LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
-    }
     CHECK(!p_fmat->Info().IsColumnSplit())
         << "Predict contribution support for column-wise data split is not yet implemented.";
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
@@ -1047,8 +1045,8 @@ class GPUPredictor : public xgboost::Predictor {
     // allocate space for (number of features + bias) times the number of rows
     size_t contributions_columns =
         model.learner_model_param->num_feature + 1;  // +1 for bias
-    out_contribs->Resize(p_fmat->Info().num_row_ * contributions_columns *
-                         model.learner_model_param->num_output_group);
+    auto dim_size = contributions_columns * model.learner_model_param->num_output_group;
+    out_contribs->Resize(p_fmat->Info().num_row_ * dim_size);
     out_contribs->Fill(0.0f);
     auto phis = out_contribs->DeviceSpan();
 
@@ -1058,16 +1056,27 @@ class GPUPredictor : public xgboost::Predictor {
     d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
     ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
-    for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
-      batch.data.SetDevice(ctx_->Device());
-      batch.offset.SetDevice(ctx_->Device());
-      SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
-                       model.learner_model_param->num_feature);
-      auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
-      gpu_treeshap::GPUTreeShap<dh::XGBDeviceAllocator<int>>(
-          X, device_paths.begin(), device_paths.end(), ngroup, begin,
-          dh::tend(phis));
+    if (p_fmat->PageExists<SparsePage>()) {
+      for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
+        batch.data.SetDevice(ctx_->Device());
+        batch.offset.SetDevice(ctx_->Device());
+        SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
+                         model.learner_model_param->num_feature);
+        auto begin = dh::tbegin(phis) + batch.base_rowid * dim_size;
+        gpu_treeshap::GPUTreeShap<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
+    } else {
+      for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
+        EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
+        auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
+                               std::numeric_limits<float>::quiet_NaN()};
+        auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
+        gpu_treeshap::GPUTreeShap<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
     }
+
     // Add the base margin term to last column
     p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
     const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
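The batched dispatch relies on a fixed per-row stride in the output buffer: each row owns `contributions_columns * num_output_group` floats, so a batch starting at `base_rowid` writes at `phis + base_rowid * dim_size` and batches never overlap. A toy recomputation of that arithmetic (all numbers made up; only `dim_size` mirrors the variable introduced in this diff):

```cpp
// Sanity-check of the phis buffer layout assumed by the batched SHAP dispatch:
// one (num_feature + 1)-wide block per (row, group), concatenated over rows.
#include <cstddef>
#include <iostream>

int main() {
  std::size_t num_feature = 3, ngroup = 2, num_rows = 10;
  std::size_t contributions_columns = num_feature + 1;    // +1 for bias
  std::size_t dim_size = contributions_columns * ngroup;  // per-row stride
  std::size_t total = num_rows * dim_size;                // the Resize() amount

  // A batch starting at base_rowid = 4 lands at a disjoint sub-range.
  std::size_t base_rowid = 4;
  std::cout << "buffer size: " << total << ", batch offset: " << base_rowid * dim_size << "\n";
}
```

Hoisting the stride into `dim_size` also fixes a latent inconsistency in the old loop, which offset by `contributions_columns` alone and so ignored the group dimension.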
@@ -1094,9 +1103,6 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-    if (!p_fmat->PageExists<SparsePage>()) {
-      LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
-    }
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     out_contribs->SetDevice(ctx_->Device());
     if (tree_end == 0 || tree_end > model.trees.size()) {
@@ -1108,9 +1114,9 @@ class GPUPredictor : public xgboost::Predictor {
     // allocate space for (number of features + bias) times the number of rows
     size_t contributions_columns =
         model.learner_model_param->num_feature + 1;  // +1 for bias
-    out_contribs->Resize(p_fmat->Info().num_row_ * contributions_columns *
-                         contributions_columns *
-                         model.learner_model_param->num_output_group);
+    auto dim_size =
+        contributions_columns * contributions_columns * model.learner_model_param->num_output_group;
+    out_contribs->Resize(p_fmat->Info().num_row_ * dim_size);
     out_contribs->Fill(0.0f);
     auto phis = out_contribs->DeviceSpan();
 
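Same layout argument for the interaction case, except each row now owns a full `(num_feature + 1) x (num_feature + 1)` matrix per output group. A quick check with toy numbers:

```cpp
// Toy check of the interaction buffer stride introduced above.
#include <cstddef>
#include <iostream>

int main() {
  std::size_t num_feature = 3, ngroup = 2;
  std::size_t cols = num_feature + 1;            // +1 for bias
  std::size_t dim_size = cols * cols * ngroup;   // per-row stride for interactions
  std::cout << dim_size << " floats per row\n";  // 4 * 4 * 2 = 32
}
```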
@@ -1120,16 +1126,29 @@ class GPUPredictor : public xgboost::Predictor {
     d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
     ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
-    for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
-      batch.data.SetDevice(ctx_->Device());
-      batch.offset.SetDevice(ctx_->Device());
-      SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
-                       model.learner_model_param->num_feature);
-      auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
-      gpu_treeshap::GPUTreeShapInteractions<dh::XGBDeviceAllocator<int>>(
-          X, device_paths.begin(), device_paths.end(), ngroup, begin,
-          dh::tend(phis));
+    if (p_fmat->PageExists<SparsePage>()) {
+      for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
+        batch.data.SetDevice(ctx_->Device());
+        batch.offset.SetDevice(ctx_->Device());
+        SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
+                         model.learner_model_param->num_feature);
+        auto begin = dh::tbegin(phis) + batch.base_rowid * dim_size;
+        gpu_treeshap::GPUTreeShapInteractions<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
+    } else {
+      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
+        auto impl = batch.Impl();
+        auto acc =
+            impl->GetDeviceAccessor(ctx_->Device(), p_fmat->Info().feature_types.ConstDeviceSpan());
+        auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
+        auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
+                               std::numeric_limits<float>::quiet_NaN()};
+        gpu_treeshap::GPUTreeShapInteractions<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
     }
+
     // Add the base margin term to last column
     p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
     const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
@@ -1180,51 +1199,35 @@ class GPUPredictor : public xgboost::Predictor {
     bool use_shared = shared_memory_bytes != 0;
     bst_feature_t num_features = info.num_col_;
 
+    auto launch = [&](auto fn, std::uint32_t grid, auto data, bst_idx_t batch_offset) {
+      dh::LaunchKernel{grid, kBlockThreads, shared_memory_bytes}(
+          fn, data, d_model.nodes.ConstDeviceSpan(),
+          predictions->DeviceSpan().subspan(batch_offset), d_model.tree_segments.ConstDeviceSpan(),
+
+          d_model.split_types.ConstDeviceSpan(), d_model.categories_tree_segments.ConstDeviceSpan(),
+          d_model.categories_node_segments.ConstDeviceSpan(), d_model.categories.ConstDeviceSpan(),
+
+          d_model.tree_beg_, d_model.tree_end_, num_features, num_rows, use_shared,
+          std::numeric_limits<float>::quiet_NaN());
+    };
+
     if (p_fmat->PageExists<SparsePage>()) {
+      bst_idx_t batch_offset = 0;
       for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
         batch.data.SetDevice(ctx_->Device());
         batch.offset.SetDevice(ctx_->Device());
-        bst_idx_t batch_offset = 0;
         SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
                             model.learner_model_param->num_feature};
-        size_t num_rows = batch.Size();
-        auto grid =
-            static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
-        dh::LaunchKernel{grid, kBlockThreads, shared_memory_bytes}(
-            PredictLeafKernel<SparsePageLoader, SparsePageView>, data,
-            d_model.nodes.ConstDeviceSpan(),
-            predictions->DeviceSpan().subspan(batch_offset),
-            d_model.tree_segments.ConstDeviceSpan(),
-
-            d_model.split_types.ConstDeviceSpan(),
-            d_model.categories_tree_segments.ConstDeviceSpan(),
-            d_model.categories_node_segments.ConstDeviceSpan(),
-            d_model.categories.ConstDeviceSpan(),
-
-            d_model.tree_beg_, d_model.tree_end_, num_features, num_rows,
-            use_shared, std::numeric_limits<float>::quiet_NaN());
+        auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
+        launch(PredictLeafKernel<SparsePageLoader, SparsePageView>, grid, data, batch_offset);
         batch_offset += batch.Size();
       }
     } else {
+      bst_idx_t batch_offset = 0;
       for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
-        bst_idx_t batch_offset = 0;
         EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
-        size_t num_rows = batch.Size();
-        auto grid =
-            static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
-        dh::LaunchKernel{grid, kBlockThreads, shared_memory_bytes}(
-            PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, data,
-            d_model.nodes.ConstDeviceSpan(),
-            predictions->DeviceSpan().subspan(batch_offset),
-            d_model.tree_segments.ConstDeviceSpan(),
-
-            d_model.split_types.ConstDeviceSpan(),
-            d_model.categories_tree_segments.ConstDeviceSpan(),
-            d_model.categories_node_segments.ConstDeviceSpan(),
-            d_model.categories.ConstDeviceSpan(),
-
-            d_model.tree_beg_, d_model.tree_end_, num_features, num_rows,
-            use_shared, std::numeric_limits<float>::quiet_NaN());
+        auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
+        launch(PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, grid, data, batch_offset);
         batch_offset += batch.Size();
       }
     }
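The `launch` helper is a generic lambda: it captures the long shared argument list once and takes the kernel instantiation plus the per-batch pieces as parameters, so the two branches differ only in loader type and data view. This change also hoists `batch_offset` out of the loops, fixing the old code that reset it to zero on every batch. A standalone sketch of the deduplication pattern, with plain host functions standing in for the CUDA kernels (`KernelA`/`KernelB` are hypothetical):

```cpp
// Sketch: a generic lambda factors out a repeated call site; each branch
// supplies only the function and the per-batch arguments that differ.
#include <cstddef>
#include <iostream>
#include <vector>

void KernelA(std::vector<float> const& data, std::size_t offset) {
  std::cout << "A: " << data.size() << " rows at offset " << offset << "\n";
}
void KernelB(std::vector<float> const& data, std::size_t offset) {
  std::cout << "B: " << data.size() << " rows at offset " << offset << "\n";
}

int main() {
  // Stands in for dh::LaunchKernel{grid, kBlockThreads, ...}(fn, ...) above.
  auto launch = [&](auto fn, auto const& data, std::size_t batch_offset) {
    fn(data, batch_offset);
  };
  std::vector<float> sparse{1.f, 2.f}, ellpack{3.f, 4.f, 5.f};
  launch(KernelA, sparse, 0);   // SparsePage path
  launch(KernelB, ellpack, 2);  // EllpackPage path
}
```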