From 2cace856149fc708008a8218f70a4f76248d5989 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Mon, 10 Mar 2025 17:47:01 +0100 Subject: [PATCH] Revert "fix training continuation for iGPUs (#71)" This reverts commit 3d067f496c656cf1ea0a64db15ddf8a88cd4c6f7. --- plugin/sycl/data/gradient_index.cc | 76 ++++++++++--------- plugin/sycl/data/gradient_index.h | 52 ++++++++++--- plugin/sycl/predictor/predictor.cc | 7 +- .../cpp/plugin/test_sycl_partition_builder.cc | 2 +- .../test_sycl_training_continuation.py | 4 +- 5 files changed, 89 insertions(+), 52 deletions(-) diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc index a18ca0798278..e6182e07b976 100644 --- a/plugin/sycl/data/gradient_index.cc +++ b/plugin/sycl/data/gradient_index.cc @@ -50,9 +50,10 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) { template void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, - Context const * ctx, BinIdxType* index_data, - DMatrix *dmat) { + DMatrix *dmat, + size_t nbins, + size_t row_stride) { if (nbins == 0) return; const bst_float* cut_values = cut.cut_values_.ConstDevicePointer(); const uint32_t* cut_ptrs = cut.cut_ptrs_.ConstDevicePointer(); @@ -60,19 +61,17 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, BinIdxType* sort_data = reinterpret_cast(sort_buff.Data()); + ::sycl::event event; for (auto &batch : dmat->GetBatches()) { - batch.data.SetDevice(ctx->Device()); - batch.offset.SetDevice(ctx->Device()); - - const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer(); - const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer(); - size_t batch_size = batch.Size(); - if (batch_size > 0) { - const auto base_rowid = batch.base_rowid; - size_t row_stride = this->row_stride; - size_t nbins = this->nbins; - qu->submit([&](::sycl::handler& cgh) { - cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) { + for (auto &batch : dmat->GetBatches()) { + const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer(); + const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer(); + size_t batch_size = batch.Size(); + if (batch_size > 0) { + const auto base_rowid = batch.base_rowid; + event = qu->submit([&](::sycl::handler& cgh) { + cgh.depends_on(event); + cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) { const size_t i = pid.get_id(0); const size_t ibegin = offset_vec[i]; const size_t iend = offset_vec[i + 1]; @@ -93,22 +92,23 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu, } }); }); - qu->wait(); + } } } + qu->wait(); } -void GHistIndexMatrix::ResizeIndex(::sycl::queue* qu, size_t n_index) { - if ((max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense_) { +void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) { + if ((max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense) { index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize); - index.Resize(qu, (sizeof(uint8_t)) * n_index); + index.Resize((sizeof(uint8_t)) * n_index); } else if ((max_num_bins - 1 > static_cast(std::numeric_limits::max()) && - max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense_) { + max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense) { index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize); - index.Resize(qu, (sizeof(uint16_t)) * n_index); + index.Resize((sizeof(uint16_t)) * n_index); } else { index.SetBinTypeSize(BinTypeSize::kUint32BinsTypeSize); - index.Resize(qu, (sizeof(uint32_t)) * n_index); + index.Resize((sizeof(uint32_t)) * n_index); } } @@ -122,50 +122,52 @@ void GHistIndexMatrix::Init(::sycl::queue* qu, cut.SetDevice(ctx->Device()); max_num_bins = max_bins; - nbins = cut.Ptrs().back(); + const uint32_t nbins = cut.Ptrs().back(); + this->nbins = nbins; hit_count.SetDevice(ctx->Device()); hit_count.Resize(nbins, 0); + this->p_fmat = dmat; const bool isDense = dmat->IsDense(); this->isDense_ = isDense; + index.setQueue(qu); + row_stride = 0; size_t n_rows = 0; - if (!isDense) { - for (const auto& batch : dmat->GetBatches()) { - const auto& row_offset = batch.offset.ConstHostVector(); - n_rows += batch.Size(); - for (auto i = 1ull; i < row_offset.size(); i++) { - row_stride = std::max(row_stride, static_cast(row_offset[i] - row_offset[i - 1])); - } + for (const auto& batch : dmat->GetBatches()) { + const auto& row_offset = batch.offset.ConstHostVector(); + batch.data.SetDevice(ctx->Device()); + batch.offset.SetDevice(ctx->Device()); + n_rows += batch.Size(); + for (auto i = 1ull; i < row_offset.size(); i++) { + row_stride = std::max(row_stride, static_cast(row_offset[i] - row_offset[i - 1])); } - } else { - row_stride = nfeatures; - n_rows = dmat->Info().num_row_; } const size_t n_offsets = cut.cut_ptrs_.Size() - 1; const size_t n_index = n_rows * row_stride; - ResizeIndex(qu, n_index); + ResizeIndex(n_index, isDense); CHECK_GT(cut.cut_values_.Size(), 0U); if (isDense) { BinTypeSize curent_bin_size = index.GetBinTypeSize(); if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) { - SetIndexData(qu, ctx, index.data(), dmat); + SetIndexData(qu, index.data(), dmat, nbins, row_stride); + } else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) { - SetIndexData(qu, ctx, index.data(), dmat); + SetIndexData(qu, index.data(), dmat, nbins, row_stride); } else { CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize); - SetIndexData(qu, ctx, index.data(), dmat); + SetIndexData(qu, index.data(), dmat, nbins, row_stride); } /* For sparse DMatrix we have to store index of feature for each bin in index field to chose right offset. So offset is nullptr and index is not reduced */ } else { sort_buff.Resize(qu, n_rows * row_stride * sizeof(uint32_t)); - SetIndexData(qu, ctx, index.data(), dmat); + SetIndexData(qu, index.data(), dmat, nbins, row_stride); } } diff --git a/plugin/sycl/data/gradient_index.h b/plugin/sycl/data/gradient_index.h index 15748fa7f47f..b88f2a8015ce 100644 --- a/plugin/sycl/data/gradient_index.h +++ b/plugin/sycl/data/gradient_index.h @@ -31,9 +31,21 @@ struct Index { Index& operator=(Index&& i) = delete; void SetBinTypeSize(BinTypeSize binTypeSize) { binTypeSize_ = binTypeSize; - CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize || - binTypeSize == BinTypeSize::kUint16BinsTypeSize || - binTypeSize == BinTypeSize::kUint32BinsTypeSize); + switch (binTypeSize) { + case BinTypeSize::kUint8BinsTypeSize: + func_ = &GetValueFromUint8; + break; + case BinTypeSize::kUint16BinsTypeSize: + func_ = &GetValueFromUint16; + break; + case BinTypeSize::kUint32BinsTypeSize: + func_ = &GetValueFromUint32; + break; + default: + CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize || + binTypeSize == BinTypeSize::kUint16BinsTypeSize || + binTypeSize == BinTypeSize::kUint32BinsTypeSize); + } } BinTypeSize GetBinTypeSize() const { return binTypeSize_; @@ -53,8 +65,8 @@ struct Index { return data_.Size() / (binTypeSize_); } - void Resize(::sycl::queue* qu, const size_t nBytesData) { - data_.Resize(qu, nBytesData); + void Resize(const size_t nBytesData) { + data_.Resize(qu_, nBytesData); } uint8_t* begin() const { @@ -65,9 +77,28 @@ struct Index { return data_.End(); } + void setQueue(::sycl::queue* qu) { + qu_ = qu; + } + private: + static uint32_t GetValueFromUint8(const uint8_t* t, size_t i) { + return reinterpret_cast(t)[i]; + } + static uint32_t GetValueFromUint16(const uint8_t* t, size_t i) { + return reinterpret_cast(t)[i]; + } + static uint32_t GetValueFromUint32(const uint8_t* t, size_t i) { + return reinterpret_cast(t)[i]; + } + + using Func = uint32_t (*)(const uint8_t*, size_t); + USMVector data_; BinTypeSize binTypeSize_ {BinTypeSize::kUint8BinsTypeSize}; + Func func_; + + ::sycl::queue* qu_; }; /*! @@ -85,19 +116,22 @@ struct GHistIndexMatrix { USMVector sort_buff; /*! \brief The corresponding cuts */ xgboost::common::HistogramCuts cut; + DMatrix* p_fmat; size_t max_num_bins; size_t nbins; size_t nfeatures; size_t row_stride; // Create a global histogram matrix based on a given DMatrix device wrapper - void Init(::sycl::queue* qu, Context const * ctx, DMatrix *dmat, int max_num_bins); + void Init(::sycl::queue* qu, Context const * ctx, + DMatrix *dmat, int max_num_bins); template - void SetIndexData(::sycl::queue* qu, Context const * ctx, BinIdxType* index_data, - DMatrix *dmat); + void SetIndexData(::sycl::queue* qu, BinIdxType* index_data, + DMatrix *dmat, + size_t nbins, size_t row_stride); - void ResizeIndex(::sycl::queue* qu, size_t n_index); + void ResizeIndex(size_t n_index, bool isDense); inline void GetFeatureCounts(size_t* counts) const { auto nfeature = cut.cut_ptrs_.Size() - 1; diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc index 9eb05271084c..43356f64eb0b 100755 --- a/plugin/sycl/predictor/predictor.cc +++ b/plugin/sycl/predictor/predictor.cc @@ -291,7 +291,7 @@ class Predictor : public xgboost::Predictor { } if (num_group == 1) { - float& sum = out_predictions[row_idx]; + float sum = 0.0; for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; if constexpr (any_missing) { @@ -300,6 +300,7 @@ class Predictor : public xgboost::Predictor { sum += GetLeafWeight(first_node, fval_buff_row_ptr); } } + out_predictions[row_idx] += sum; } else { for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { const Node* first_node = nodes + first_node_position[tree_idx - tree_begin]; @@ -332,6 +333,7 @@ class Predictor : public xgboost::Predictor { int num_features = dmat->Info().num_col_; float* out_predictions = out_preds->DevicePointer(); + ::sycl::event event; for (auto &batch : dmat->GetBatches()) { batch.data.SetDevice(ctx_->Device()); batch.offset.SetDevice(ctx_->Device()); @@ -341,7 +343,6 @@ class Predictor : public xgboost::Predictor { if (batch_size > 0) { const auto base_rowid = batch.base_rowid; - ::sycl::event event; if (needs_buffer_update) { fval_buff.ResizeNoCopy(qu_, num_features * batch_size); if constexpr (any_missing) { @@ -353,9 +354,9 @@ class Predictor : public xgboost::Predictor { row_ptr, batch_size, num_features, num_group, tree_begin, tree_end); needs_buffer_update = (batch_size != out_preds->Size()); - qu_->wait(); } } + qu_->wait(); } mutable USMVector fval_buff; diff --git a/tests/cpp/plugin/test_sycl_partition_builder.cc b/tests/cpp/plugin/test_sycl_partition_builder.cc index 584b5c26fb72..5928988c6441 100644 --- a/tests/cpp/plugin/test_sycl_partition_builder.cc +++ b/tests/cpp/plugin/test_sycl_partition_builder.cc @@ -67,7 +67,7 @@ void TestPartitioning(float sparsity, int max_bins) { std::vector ridx_left(num_rows, 0); std::vector ridx_right(num_rows, 0); - for (auto &batch : p_fmat->GetBatches()) { + for (auto &batch : gmat.p_fmat->GetBatches()) { const auto& data_vec = batch.data.HostVector(); const auto& offset_vec = batch.offset.HostVector(); diff --git a/tests/python-sycl/test_sycl_training_continuation.py b/tests/python-sycl/test_sycl_training_continuation.py index 71d5965600e7..e2a11c987bb4 100644 --- a/tests/python-sycl/test_sycl_training_continuation.py +++ b/tests/python-sycl/test_sycl_training_continuation.py @@ -9,8 +9,8 @@ class TestSYCLTrainingContinuation: def run_training_continuation(self, use_json): kRows = 64 kCols = 32 - X = rng.randn(kRows, kCols) - y = rng.randn(kRows) + X = np.random.randn(kRows, kCols) + y = np.random.randn(kRows) dtrain = xgb.DMatrix(X, y) params = { "device": "sycl",