diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f4c487f8c61a..f176dfc7f6a8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -90,6 +90,10 @@ jobs: run: | conda info conda list + - name: Display SYCL devices + shell: bash -l {0} + run: | + lscpu - name: Build and install XGBoost shell: bash -l {0} run: | diff --git a/plugin/sycl/tree/hist_updater.cc b/plugin/sycl/tree/hist_updater.cc index 7ce363145ac8..da39cd8a723f 100644 --- a/plugin/sycl/tree/hist_updater.cc +++ b/plugin/sycl/tree/hist_updater.cc @@ -183,7 +183,7 @@ void HistUpdater::EvaluateAndApplySplits( int *num_leaves, int depth, std::vector *temp_qexpand_depth) { - EvaluateSplits(qexpand_depth_wise_, gmat, hist_, *p_tree); + EvaluateSplits(qexpand_depth_wise_, gmat, *p_tree); std::vector nodes_for_apply_split; AddSplitsToTree(gmat, p_tree, num_leaves, depth, @@ -280,7 +280,7 @@ void HistUpdater::ExpandWithLossGuide( this->InitNewNode(ExpandEntry::kRootNid, gmat, gpair, *p_fmat, *p_tree); - this->EvaluateSplits({node}, gmat, hist_, *p_tree); + this->EvaluateSplits({node}, gmat, *p_tree); node.split.loss_chg = snode_host_[ExpandEntry::kRootNid].best.loss_chg; qexpand_loss_guided_->push(node); @@ -325,7 +325,7 @@ void HistUpdater::ExpandWithLossGuide( snode_host_[cleft].weight, snode_host_[cright].weight); interaction_constraints_.Split(nid, featureid, cleft, cright); - this->EvaluateSplits({left_node, right_node}, gmat, hist_, *p_tree); + this->EvaluateSplits({left_node, right_node}, gmat, *p_tree); left_node.split.loss_chg = snode_host_[cleft].best.loss_chg; right_node.split.loss_chg = snode_host_[cright].best.loss_chg; @@ -472,7 +472,7 @@ void HistUpdater::InitSampling( }); }); } else { - // Use oneDPL uniform for better perf, as far as bernoulli_distribution uses fp64 + // Use oneDPL uniform, as far as bernoulli_distribution uses fp64 event = qu_.submit([&](::sycl::handler& cgh) { auto flag_buf_acc = flag_buf.get_access<::sycl::access::mode::read_write>(cgh); cgh.parallel_for<>(::sycl::range<1>(::sycl::range<1>(num_rows)), @@ -649,45 +649,32 @@ template void HistUpdater::EvaluateSplits( const std::vector& nodes_set, const common::GHistIndexMatrix& gmat, - const common::HistCollection& hist, const RegTree& tree) { builder_monitor_.Start("EvaluateSplits"); const size_t n_nodes_in_set = nodes_set.size(); using FeatureSetType = std::shared_ptr>; - std::vector features_sets(n_nodes_in_set); // Generate feature set for each tree node - size_t total_features = 0; - for (size_t nid_in_set = 0; nid_in_set < n_nodes_in_set; ++nid_in_set) { - const int32_t nid = nodes_set[nid_in_set].nid; - features_sets[nid_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nid)); - for (size_t idx = 0; idx < features_sets[nid_in_set]->Size(); idx++) { - const auto fid = features_sets[nid_in_set]->ConstHostVector()[idx]; - if (interaction_constraints_.Query(nid, fid)) { - total_features++; - } - } - } - - split_queries_host_.resize(total_features); size_t pos = 0; - for (size_t nid_in_set = 0; nid_in_set < n_nodes_in_set; ++nid_in_set) { - const size_t nid = nodes_set[nid_in_set].nid; - - for (size_t idx = 0; idx < features_sets[nid_in_set]->Size(); idx++) { - const auto fid = features_sets[nid_in_set]->ConstHostVector()[idx]; + const bst_node_t nid = nodes_set[nid_in_set].nid; + FeatureSetType features_set = column_sampler_->GetFeatureSet(tree.GetDepth(nid)); + for (size_t idx = 0; idx < features_set->Size(); idx++) { + const size_t fid = features_set->ConstHostVector()[idx]; if (interaction_constraints_.Query(nid, fid)) { - split_queries_host_[pos].nid = nid; - split_queries_host_[pos].fid = fid; - split_queries_host_[pos].hist = hist[nid].DataConst(); - split_queries_host_[pos].best = snode_host_[nid].best; - pos++; + auto this_hist = hist_[nid].DataConst(); + if (pos < split_queries_host_.size()) { + split_queries_host_[pos] = SplitQuery{nid, fid, this_hist}; + } else { + split_queries_host_.push_back({nid, fid, this_hist}); + } + ++pos; } } } + const size_t total_features = pos; split_queries_device_.Resize(&qu_, total_features); auto event = qu_.memcpy(split_queries_device_.Data(), split_queries_host_.data(), @@ -702,10 +689,14 @@ void HistUpdater::EvaluateSplits( snode_device_.ResizeNoCopy(&qu_, snode_host_.size()); event = qu_.memcpy(snode_device_.Data(), snode_host_.data(), snode_host_.size() * sizeof(NodeEntry), event); - const NodeEntry* snode = snode_device_.DataConst(); + const NodeEntry* snode = snode_device_.Data(); const float min_child_weight = param_.min_child_weight; + best_splits_device_.ResizeNoCopy(&qu_, total_features); + if (best_splits_host_.size() < total_features) best_splits_host_.resize(total_features); + SplitEntry* best_splits = best_splits_device_.Data(); + event = qu_.submit([&](::sycl::handler& cgh) { cgh.depends_on(event); cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(total_features, sub_group_size_), @@ -717,17 +708,18 @@ void HistUpdater::EvaluateSplits( int fid = split_queries_device[i].fid; const GradientPairT* hist_data = split_queries_device[i].hist; + best_splits[i] = snode[nid].best; EnumerateSplit(sg, cut_ptr, cut_val, hist_data, snode[nid], - &(split_queries_device[i].best), fid, nid, evaluator, min_child_weight); + &(best_splits[i]), fid, nid, evaluator, min_child_weight); }); }); - event = qu_.memcpy(split_queries_host_.data(), split_queries_device_.Data(), - total_features * sizeof(SplitQuery), event); + event = qu_.memcpy(best_splits_host_.data(), best_splits, + total_features * sizeof(SplitEntry), event); qu_.wait(); for (size_t i = 0; i < total_features; i++) { int nid = split_queries_host_[i].nid; - snode_host_[nid].best.Update(split_queries_host_[i].best); + snode_host_[nid].best.Update(best_splits_host_[i]); } builder_monitor_.Stop("EvaluateSplits"); diff --git a/plugin/sycl/tree/hist_updater.h b/plugin/sycl/tree/hist_updater.h index 933343ba170f..92e960e668b7 100644 --- a/plugin/sycl/tree/hist_updater.h +++ b/plugin/sycl/tree/hist_updater.h @@ -95,9 +95,8 @@ class HistUpdater { friend class DistributedHistRowsAdder; struct SplitQuery { - int nid; - int fid; - SplitEntry best; + bst_node_t nid; + size_t fid; const GradientPairT* hist; }; @@ -106,7 +105,6 @@ class HistUpdater { void EvaluateSplits(const std::vector& nodes_set, const common::GHistIndexMatrix& gmat, - const common::HistCollection& hist, const RegTree& tree); // Enumerate the split values of specific feature @@ -222,6 +220,9 @@ class HistUpdater { std::vector split_queries_host_; USMVector split_queries_device_; + USMVector, MemoryType::on_device> best_splits_device_; + std::vector> best_splits_host_; + TreeEvaluator tree_evaluator_; FeatureInteractionConstraintHost interaction_constraints_; diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/tests/ci_build/conda_env/linux_sycl_test.yml index 1356fb8ed048..edac720c34f5 100644 --- a/tests/ci_build/conda_env/linux_sycl_test.yml +++ b/tests/ci_build/conda_env/linux_sycl_test.yml @@ -1,7 +1,7 @@ name: linux_sycl_test channels: - conda-forge -- intel +- https://software.repos.intel.com/python/conda/ dependencies: - python=3.8 - cmake diff --git a/tests/cpp/plugin/test_sycl_hist_updater.cc b/tests/cpp/plugin/test_sycl_hist_updater.cc index bcc49d7f9ef2..24270e76143f 100644 --- a/tests/cpp/plugin/test_sycl_hist_updater.cc +++ b/tests/cpp/plugin/test_sycl_hist_updater.cc @@ -52,6 +52,13 @@ class TestHistUpdater : public HistUpdater { HistUpdater::InitNewNode(nid, gmat, gpair, fmat, tree); return HistUpdater::snode_host_[nid]; } + + auto TestEvaluateSplits(const std::vector& nodes_set, + const common::GHistIndexMatrix& gmat, + const RegTree& tree) { + HistUpdater::EvaluateSplits(nodes_set, gmat, tree); + return HistUpdater::snode_host_; + } }; void GenerateRandomGPairs(::sycl::queue* qu, GradientPair* gpair_ptr, size_t num_rows, bool has_neg_hess) { @@ -301,6 +308,83 @@ void TestHistUpdaterInitNewNode(const xgboost::tree::TrainParam& param, float sp EXPECT_NEAR(snode.stats.GetHess(), grad_stat.GetHess(), 1e-6 * grad_stat.GetHess()); } +template +void TestHistUpdaterEvaluateSplits(const xgboost::tree::TrainParam& param) { + const size_t num_rows = 1u << 8; + const size_t num_columns = 2; + const size_t n_bins = 32; + + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(ctx.Device()); + ObjInfo task{ObjInfo::kRegression}; + + auto p_fmat = RandomDataGenerator{num_rows, num_columns, 0.0f}.GenerateDMatrix(); + + FeatureInteractionConstraintHost int_constraints; + + TestHistUpdater updater(&ctx, qu, param, int_constraints, p_fmat.get()); + updater.SetHistSynchronizer(new BatchHistSynchronizer()); + updater.SetHistRowsAdder(new BatchHistRowsAdder()); + + USMVector gpair(&qu, num_rows); + auto* gpair_ptr = gpair.Data(); + GenerateRandomGPairs(&qu, gpair_ptr, num_rows, false); + + DeviceMatrix dmat; + dmat.Init(qu, p_fmat.get()); + common::GHistIndexMatrix gmat; + gmat.Init(qu, &ctx, dmat, n_bins); + + RegTree tree; + tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0); + ExpandEntry node(ExpandEntry::kRootNid, tree.GetDepth(ExpandEntry::kRootNid)); + + auto* row_set_collection = updater.TestInitData(gmat, gpair, *p_fmat, tree); + auto& row_idxs = row_set_collection->Data(); + const size_t* row_idxs_ptr = row_idxs.DataConst(); + const auto* hist = updater.TestBuildHistogramsLossGuide(node, gmat, &tree, gpair); + const auto snode_init = updater.TestInitNewNode(ExpandEntry::kRootNid, gmat, gpair, *p_fmat, tree); + + const auto snode_updated = updater.TestEvaluateSplits({node}, gmat, tree); + auto best_loss_chg = snode_updated[0].best.loss_chg; + auto stats = snode_init.stats; + auto root_gain = snode_init.root_gain; + + // Check all splits manually. Save the best one and compare with the ans + TreeEvaluator tree_evaluator(qu, param, num_columns); + auto evaluator = tree_evaluator.GetEvaluator(); + const uint32_t* cut_ptr = gmat.cut_device.Ptrs().DataConst(); + const size_t size = gmat.cut_device.Ptrs().Size(); + int n_better_splits = 0; + const auto* hist_ptr = (*hist)[0].DataConst(); + std::vector best_loss_chg_des(1, -1); + { + ::sycl::buffer best_loss_chg_buff(best_loss_chg_des.data(), 1); + qu.submit([&](::sycl::handler& cgh) { + auto best_loss_chg_acc = best_loss_chg_buff.template get_access<::sycl::access::mode::read_write>(cgh); + cgh.single_task<>([=]() { + for (size_t i = 1; i < size; ++i) { + GradStats left(0, 0); + GradStats right = stats - left; + for (size_t j = cut_ptr[i-1]; j < cut_ptr[i]; ++j) { + auto loss_change = evaluator.CalcSplitGain(0, i - 1, left, right) - root_gain; + if (loss_change > best_loss_chg_acc[0]) { + best_loss_chg_acc[0] = loss_change; + } + left.Add(hist_ptr[j].GetGrad(), hist_ptr[j].GetHess()); + right = stats - left; + } + } + }); + }).wait(); + } + + ASSERT_NEAR(best_loss_chg_des[0], best_loss_chg, 1e-6); +} + TEST(SyclHistUpdater, Sampling) { xgboost::tree::TrainParam param; param.UpdateAllowUnknown(Args{{"subsample", "0.7"}}); @@ -340,4 +424,12 @@ TEST(SyclHistUpdater, InitNewNode) { TestHistUpdaterInitNewNode(param, 0.5); } +TEST(SyclHistUpdater, EvaluateSplits) { + xgboost::tree::TrainParam param; + param.UpdateAllowUnknown(Args{{"max_depth", "3"}}); + + TestHistUpdaterEvaluateSplits(param); + TestHistUpdaterEvaluateSplits(param); +} + } // namespace xgboost::sycl::tree