From 2cace856149fc708008a8218f70a4f76248d5989 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <d.razdoburdin@gmail.com>
Date: Mon, 10 Mar 2025 17:47:01 +0100
Subject: [PATCH] Revert "fix training continuation for iGPUs (#71)"

This reverts commit 3d067f496c656cf1ea0a64db15ddf8a88cd4c6f7.
---
 plugin/sycl/data/gradient_index.cc            | 76 ++++++++++---------
 plugin/sycl/data/gradient_index.h             | 52 ++++++++++---
 plugin/sycl/predictor/predictor.cc            |  7 +-
 .../cpp/plugin/test_sycl_partition_builder.cc |  2 +-
 .../test_sycl_training_continuation.py        |  4 +-
 5 files changed, 89 insertions(+), 52 deletions(-)
diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc
index a18ca0798278..e6182e07b976 100644
--- a/plugin/sycl/data/gradient_index.cc
+++ b/plugin/sycl/data/gradient_index.cc
@@ -50,9 +50,10 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) {
 
 template <typename BinIdxType, bool isDense>
 void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
-                                    Context const * ctx,
                                     BinIdxType* index_data,
-                                    DMatrix *dmat) {
+                                    DMatrix *dmat,
+                                    size_t nbins,
+                                    size_t row_stride) {
   if (nbins == 0) return;
   const bst_float* cut_values = cut.cut_values_.ConstDevicePointer();
   const uint32_t* cut_ptrs = cut.cut_ptrs_.ConstDevicePointer();
@@ -60,19 +61,17 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
 
   BinIdxType* sort_data = reinterpret_cast<BinIdxType*>(sort_buff.Data());
 
+  ::sycl::event event;
   for (auto &batch : dmat->GetBatches<SparsePage>()) {
-    batch.data.SetDevice(ctx->Device());
-    batch.offset.SetDevice(ctx->Device());
-
-    const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer();
-    const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer();
-    size_t batch_size = batch.Size();
-    if (batch_size > 0) {
-      const auto base_rowid = batch.base_rowid;
-      size_t row_stride = this->row_stride;
-      size_t nbins = this->nbins;
-      qu->submit([&](::sycl::handler& cgh) {
-         cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) {
+    for (auto &batch : dmat->GetBatches<SparsePage>()) {
+      const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer();
+      const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer();
+      size_t batch_size = batch.Size();
+      if (batch_size > 0) {
+        const auto base_rowid = batch.base_rowid;
+        event = qu->submit([&](::sycl::handler& cgh) {
+          cgh.depends_on(event);
+          cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) {
           const size_t i = pid.get_id(0);
           const size_t ibegin = offset_vec[i];
           const size_t iend = offset_vec[i + 1];
@@ -93,22 +92,23 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
           }
         });
       });
-      qu->wait();
+      }
     }
   }
+  qu->wait();
 }
 
-void GHistIndexMatrix::ResizeIndex(::sycl::queue* qu, size_t n_index) {
-  if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense_) {
+void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) {
+  if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
     index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize);
-    index.Resize(qu, (sizeof(uint8_t)) * n_index);
+    index.Resize((sizeof(uint8_t)) * n_index);
   } else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max())  &&
-    max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense_) {
+    max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense) {
     index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize);
-    index.Resize(qu, (sizeof(uint16_t)) * n_index);
+    index.Resize((sizeof(uint16_t)) * n_index);
   } else {
     index.SetBinTypeSize(BinTypeSize::kUint32BinsTypeSize);
-    index.Resize(qu, (sizeof(uint32_t)) * n_index);
+    index.Resize((sizeof(uint32_t)) * n_index);
   }
 }
 
@@ -122,50 +122,52 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
   cut.SetDevice(ctx->Device());
 
   max_num_bins = max_bins;
-  nbins = cut.Ptrs().back();
+  const uint32_t nbins = cut.Ptrs().back();
+  this->nbins = nbins;
 
   hit_count.SetDevice(ctx->Device());
   hit_count.Resize(nbins, 0);
 
+  this->p_fmat = dmat;
   const bool isDense = dmat->IsDense();
   this->isDense_ = isDense;
 
+  index.setQueue(qu);
+
   row_stride = 0;
   size_t n_rows = 0;
-  if (!isDense) {
-    for (const auto& batch : dmat->GetBatches<SparsePage>()) {
-      const auto& row_offset = batch.offset.ConstHostVector();
-      n_rows += batch.Size();
-      for (auto i = 1ull; i < row_offset.size(); i++) {
-        row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
-      }
+  for (const auto& batch : dmat->GetBatches<SparsePage>()) {
+    const auto& row_offset = batch.offset.ConstHostVector();
+    batch.data.SetDevice(ctx->Device());
+    batch.offset.SetDevice(ctx->Device());
+    n_rows += batch.Size();
+    for (auto i = 1ull; i < row_offset.size(); i++) {
+      row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
     }
-  } else {
-    row_stride = nfeatures;
-    n_rows = dmat->Info().num_row_;
   }
 
   const size_t n_offsets = cut.cut_ptrs_.Size() - 1;
   const size_t n_index = n_rows * row_stride;
-  ResizeIndex(qu, n_index);
+  ResizeIndex(n_index, isDense);
 
   CHECK_GT(cut.cut_values_.Size(), 0U);
 
   if (isDense) {
     BinTypeSize curent_bin_size = index.GetBinTypeSize();
     if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) {
-      SetIndexData<uint8_t, true>(qu, ctx, index.data<uint8_t>(), dmat);
+      SetIndexData<uint8_t, true>(qu, index.data<uint8_t>(), dmat, nbins, row_stride);
+
     } else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) {
-      SetIndexData<uint16_t, true>(qu, ctx, index.data<uint16_t>(), dmat);
+      SetIndexData<uint16_t, true>(qu, index.data<uint16_t>(), dmat, nbins, row_stride);
     } else {
       CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize);
-      SetIndexData<uint32_t, true>(qu, ctx, index.data<uint32_t>(), dmat);
+      SetIndexData<uint32_t, true>(qu, index.data<uint32_t>(), dmat, nbins, row_stride);
     }
   /* For sparse DMatrix we have to store index of feature for each bin
      in index field to chose right offset. So offset is nullptr and index is not reduced */
   } else {
     sort_buff.Resize(qu, n_rows * row_stride * sizeof(uint32_t));
-    SetIndexData<uint32_t, false>(qu, ctx, index.data<uint32_t>(), dmat);
+    SetIndexData<uint32_t, false>(qu, index.data<uint32_t>(), dmat, nbins, row_stride);
   }
 }
 
diff --git a/plugin/sycl/data/gradient_index.h b/plugin/sycl/data/gradient_index.h
index 15748fa7f47f..b88f2a8015ce 100644
--- a/plugin/sycl/data/gradient_index.h
+++ b/plugin/sycl/data/gradient_index.h
@@ -31,9 +31,21 @@ struct Index {
   Index& operator=(Index&& i) = delete;
   void SetBinTypeSize(BinTypeSize binTypeSize) {
     binTypeSize_ = binTypeSize;
-    CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize  ||
-          binTypeSize == BinTypeSize::kUint16BinsTypeSize ||
-          binTypeSize == BinTypeSize::kUint32BinsTypeSize);
+    switch (binTypeSize) {
+      case BinTypeSize::kUint8BinsTypeSize:
+        func_ = &GetValueFromUint8;
+        break;
+      case BinTypeSize::kUint16BinsTypeSize:
+        func_ = &GetValueFromUint16;
+        break;
+      case BinTypeSize::kUint32BinsTypeSize:
+        func_ = &GetValueFromUint32;
+        break;
+      default:
+        CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize  ||
+              binTypeSize == BinTypeSize::kUint16BinsTypeSize ||
+              binTypeSize == BinTypeSize::kUint32BinsTypeSize);
+    }
   }
   BinTypeSize GetBinTypeSize() const {
     return binTypeSize_;
@@ -53,8 +65,8 @@ struct Index {
     return data_.Size() / (binTypeSize_);
   }
 
-  void Resize(::sycl::queue* qu, const size_t nBytesData) {
-    data_.Resize(qu, nBytesData);
+  void Resize(const size_t nBytesData) {
+    data_.Resize(qu_, nBytesData);
   }
 
   uint8_t* begin() const {
@@ -65,9 +77,28 @@ struct Index {
     return data_.End();
   }
 
+  void setQueue(::sycl::queue* qu) {
+    qu_ = qu;
+  }
+
  private:
+  static uint32_t GetValueFromUint8(const uint8_t* t, size_t i) {
+    return reinterpret_cast<const uint8_t*>(t)[i];
+  }
+  static uint32_t GetValueFromUint16(const uint8_t* t, size_t i) {
+    return reinterpret_cast<const uint16_t*>(t)[i];
+  }
+  static uint32_t GetValueFromUint32(const uint8_t* t, size_t i) {
+    return reinterpret_cast<const uint32_t*>(t)[i];
+  }
+
+  using Func = uint32_t (*)(const uint8_t*, size_t);
+
   USMVector<uint8_t, MemoryType::on_device> data_;
   BinTypeSize binTypeSize_ {BinTypeSize::kUint8BinsTypeSize};
+  Func func_;
+
+  ::sycl::queue* qu_;
 };
 
 /*!
@@ -85,19 +116,22 @@ struct GHistIndexMatrix {
   USMVector<uint8_t, MemoryType::on_device> sort_buff;
   /*! \brief The corresponding cuts */
   xgboost::common::HistogramCuts cut;
+  DMatrix* p_fmat;
   size_t max_num_bins;
   size_t nbins;
   size_t nfeatures;
   size_t row_stride;
 
   // Create a global histogram matrix based on a given DMatrix device wrapper
-  void Init(::sycl::queue* qu, Context const * ctx, DMatrix *dmat, int max_num_bins);
+  void Init(::sycl::queue* qu, Context const * ctx,
+            DMatrix *dmat, int max_num_bins);
 
   template <typename BinIdxType, bool isDense>
-  void SetIndexData(::sycl::queue* qu, Context const * ctx, BinIdxType* index_data,
-                    DMatrix *dmat);
+  void SetIndexData(::sycl::queue* qu, BinIdxType* index_data,
+                    DMatrix *dmat,
+                    size_t nbins, size_t row_stride);
 
-  void ResizeIndex(::sycl::queue* qu, size_t n_index);
+  void ResizeIndex(size_t n_index, bool isDense);
 
   inline void GetFeatureCounts(size_t* counts) const {
     auto nfeature = cut.cut_ptrs_.Size() - 1;
diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc
index 9eb05271084c..43356f64eb0b 100755
--- a/plugin/sycl/predictor/predictor.cc
+++ b/plugin/sycl/predictor/predictor.cc
@@ -291,7 +291,7 @@ class Predictor : public xgboost::Predictor {
         }
 
         if (num_group == 1) {
-          float& sum = out_predictions[row_idx];
+          float sum = 0.0;
           for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
             const Node* first_node = nodes + first_node_position[tree_idx - tree_begin];
             if constexpr (any_missing) {
@@ -300,6 +300,7 @@ class Predictor : public xgboost::Predictor {
               sum += GetLeafWeight(first_node, fval_buff_row_ptr);
             }
           }
+          out_predictions[row_idx] += sum;
         } else {
           for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
             const Node* first_node = nodes + first_node_position[tree_idx - tree_begin];
@@ -332,6 +333,7 @@ class Predictor : public xgboost::Predictor {
     int num_features = dmat->Info().num_col_;
 
     float* out_predictions = out_preds->DevicePointer();
+    ::sycl::event event;
     for (auto &batch : dmat->GetBatches<SparsePage>()) {
       batch.data.SetDevice(ctx_->Device());
       batch.offset.SetDevice(ctx_->Device());
@@ -341,7 +343,6 @@ class Predictor : public xgboost::Predictor {
       if (batch_size > 0) {
         const auto base_rowid = batch.base_rowid;
 
-        ::sycl::event event;
         if (needs_buffer_update) {
           fval_buff.ResizeNoCopy(qu_, num_features * batch_size);
           if constexpr (any_missing) {
@@ -353,9 +354,9 @@ class Predictor : public xgboost::Predictor {
                                    row_ptr, batch_size, num_features,
                                    num_group, tree_begin, tree_end);
         needs_buffer_update = (batch_size != out_preds->Size());
-        qu_->wait();
       }
     }
+    qu_->wait();
   }
 
   mutable USMVector<float,   MemoryType::on_device> fval_buff;
diff --git a/tests/cpp/plugin/test_sycl_partition_builder.cc b/tests/cpp/plugin/test_sycl_partition_builder.cc
index 584b5c26fb72..5928988c6441 100644
--- a/tests/cpp/plugin/test_sycl_partition_builder.cc
+++ b/tests/cpp/plugin/test_sycl_partition_builder.cc
@@ -67,7 +67,7 @@ void TestPartitioning(float sparsity, int max_bins) {
 
   std::vector<uint8_t> ridx_left(num_rows, 0);
   std::vector<uint8_t> ridx_right(num_rows, 0);
-  for (auto &batch : p_fmat->GetBatches<SparsePage>()) {
+  for (auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
     const auto& data_vec = batch.data.HostVector();
     const auto& offset_vec = batch.offset.HostVector();
 
diff --git a/tests/python-sycl/test_sycl_training_continuation.py b/tests/python-sycl/test_sycl_training_continuation.py
index 71d5965600e7..e2a11c987bb4 100644
--- a/tests/python-sycl/test_sycl_training_continuation.py
+++ b/tests/python-sycl/test_sycl_training_continuation.py
@@ -9,8 +9,8 @@ class TestSYCLTrainingContinuation:
     def run_training_continuation(self, use_json):
         kRows = 64
         kCols = 32
-        X = rng.randn(kRows, kCols)
-        y = rng.randn(kRows)
+        X = np.random.randn(kRows, kCols)
+        y = np.random.randn(kRows)
         dtrain = xgb.DMatrix(X, y)
         params = {
             "device": "sycl",