Commit 61dd854

[EM] Refactor GPU histogram builder. (dmlc#10764)

- Expose the maximum number of cached nodes to be consistent with the CPU implementation. Also easier for testing.
- Extract the subtraction trick for easier testing.
- Split up the `GradientQuantiser` to avoid circular dependency.

1 parent 34937fe, commit 61dd854
17 files changed: +394 -187 lines

doc/parameter.rst
Lines changed: 3 additions & 3 deletions

```diff
@@ -232,12 +232,12 @@ Parameters for Tree Booster
 
 * ``max_cached_hist_node``, [default = 65536]
 
-  Maximum number of cached nodes for CPU histogram.
+  Maximum number of cached nodes for histogram.
 
   .. versionadded:: 2.0.0
 
-  - For most of the cases this parameter should not be set except for growing deep trees
-    on CPU.
+  - For most of the cases this parameter should not be set except for growing deep
+    trees. After 3.0, this parameter affects GPU algorithms as well.
 
 .. _cat-param:
 
```
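
For a sense of scale: per the `DeviceHistogramStorage` changes further down, each cached node stores one `GradientPairInt64` per bin, i.e. two 64-bit counters (`kNumItemsInGradientSum == 2`). A back-of-the-envelope sketch of the bound this parameter implies, with illustrative bin and feature counts (not from the source):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>

// Sketch of the cache bound implied by max_cached_hist_node. Mirrors the
// stop_growing_size_ arithmetic in DeviceHistogramStorage::Reset below:
// cached values = n_total_bins * max_cached_nodes * 2 (grad + hess per bin).
int main() {
  std::size_t n_total_bins = 256 * 100;  // illustrative: max_bin=256 over 100 features
  std::size_t max_cached_nodes = 65536;  // documented default for max_cached_hist_node
  std::size_t n_values = n_total_bins * max_cached_nodes * 2;
  std::cout << "bytes if bound is reached: " << n_values * sizeof(std::int64_t) << "\n";
}
```

With the default, the bound is effectively never hit for shallow trees; the storage only allocates histograms for nodes actually built and falls back to an uncached overflow buffer past the limit.
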
include/xgboost/c_api.h
Lines changed: 1 addition & 0 deletions

```diff
@@ -522,6 +522,7 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand
  * - nthread (optional): Number of threads used for initializing DMatrix.
  * - max_bin (optional): Maximum number of bins for building histogram. Must be consistent with
  *     the corresponding booster training parameter.
+ * - on_host (optional): Whether the data should be placed on host memory. Used by GPU inputs.
  * @param out The created Quantile DMatrix.
  *
  * @return 0 when success, -1 when failure happens
```
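
A compilable sketch of where the new `on_host` flag goes in the JSON config; the iterator callbacks here are stubs, and a real caller must stream batches through the proxy DMatrix in `next`:

```cpp
#include <xgboost/c_api.h>

#include <cstdio>

// Stub callbacks, shown only to place the config string: a real `next` feeds
// one batch into the proxy and returns 1, then 0 once the data is exhausted.
static void Reset(DataIterHandle) {}
static int Next(DataIterHandle) { return 0; }

int main() {
  DMatrixHandle proxy, out;
  XGProxyDMatrixCreate(&proxy);
  // `on_host` joins the existing optional fields (nthread, max_bin).
  char const* config = R"({"missing": NaN, "nthread": 8, "max_bin": 256, "on_host": true})";
  int rc = XGQuantileDMatrixCreateFromCallback(/*iter=*/nullptr, proxy, /*ref=*/nullptr, Reset,
                                               Next, config, &out);
  std::printf("rc=%d\n", rc);
  return rc;
}
```
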

src/data/ellpack_page_raw_format.cu
Lines changed: 4 additions & 4 deletions

```diff
@@ -60,10 +60,10 @@ template <typename T>
   RET_IF_NOT(fi->Read(&impl->is_dense));
   RET_IF_NOT(fi->Read(&impl->row_stride));
 
-  if (has_hmm_ats_ && !this->param_.prefetch_copy) {
-    RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
-  } else {
+  if (this->param_.prefetch_copy || !has_hmm_ats_) {
     RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer));
+  } else {
+    RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
   }
   RET_IF_NOT(fi->Read(&impl->base_rowid));
   dh::DefaultStream().Sync();
@@ -95,7 +95,7 @@ template <typename T>
   CHECK(this->cuts_->cut_values_.DeviceCanRead());
   impl->SetCuts(this->cuts_);
 
-  fi->Read(page, this->param_.prefetch_copy);
+  fi->Read(page, this->param_.prefetch_copy || !this->has_hmm_ats_);
   dh::DefaultStream().Sync();
 
   return true;
```
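
In the first hunk, the rewritten condition is the exact negation of the old one (De Morgan), so the branches merely swap to put the device-read path first; the second hunk additionally forces a copy whenever `has_hmm_ats_` is false. A minimal equivalence check for the first hunk:

```cpp
#include <cassert>

// Exhaustive check that the branch swap preserves behaviour:
// old ReadVec condition:        has_hmm_ats_ && !prefetch_copy
// new ReadDeviceVec condition:  prefetch_copy || !has_hmm_ats_
int main() {
  for (bool has_hmm_ats : {false, true}) {
    for (bool prefetch_copy : {false, true}) {
      bool old_host_read = has_hmm_ats && !prefetch_copy;
      bool new_device_read = prefetch_copy || !has_hmm_ats;
      assert(old_host_read == !new_device_read);  // identical branch selection
    }
  }
  return 0;
}
```
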

src/tree/gpu_hist/expand_entry.cuh
Lines changed: 4 additions & 4 deletions

```diff
@@ -1,15 +1,15 @@
 /**
- * Copyright 2020-2023, XGBoost Contributors
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #ifndef EXPAND_ENTRY_CUH_
 #define EXPAND_ENTRY_CUH_
 
 #include <limits>   // for numeric_limits
 #include <utility>  // for move
 
-#include "../param.h"
-#include "../updater_gpu_common.cuh"
-#include "xgboost/base.h"  // for bst_node_t
+#include "../param.h"                 // for TrainParam
+#include "../updater_gpu_common.cuh"  // for DeviceSplitCandidate
+#include "xgboost/base.h"             // for bst_node_t
 
 namespace xgboost::tree {
 struct GPUExpandEntry {
```

src/tree/gpu_hist/histogram.cu
Lines changed: 24 additions & 3 deletions

```diff
@@ -356,13 +356,19 @@ class DeviceHistogramBuilderImpl {
 };
 
 DeviceHistogramBuilder::DeviceHistogramBuilder()
-    : p_impl_{std::make_unique<DeviceHistogramBuilderImpl>()} {}
+    : p_impl_{std::make_unique<DeviceHistogramBuilderImpl>()} {
+  monitor_.Init(__func__);
+}
 
 DeviceHistogramBuilder::~DeviceHistogramBuilder() = default;
 
-void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups,
-                                   bool force_global_memory) {
+void DeviceHistogramBuilder::Reset(Context const* ctx, std::size_t max_cached_hist_nodes,
+                                   FeatureGroupsAccessor const& feature_groups,
+                                   bst_bin_t n_total_bins, bool force_global_memory) {
+  this->monitor_.Start(__func__);
   this->p_impl_->Reset(ctx, feature_groups, force_global_memory);
+  this->hist_.Reset(ctx, n_total_bins, max_cached_hist_nodes);
+  this->monitor_.Stop(__func__);
 }
 
 void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
@@ -372,6 +378,21 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
                                             common::Span<const cuda_impl::RowIndexT> ridx,
                                             common::Span<GradientPairInt64> histogram,
                                             GradientQuantiser rounding) {
+  this->monitor_.Start(__func__);
   this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding);
+  this->monitor_.Stop(__func__);
+}
+
+void DeviceHistogramBuilder::AllReduceHist(Context const* ctx, MetaInfo const& info,
+                                           bst_node_t nidx, std::size_t num_histograms) {
+  this->monitor_.Start(__func__);
+  auto d_node_hist = hist_.GetNodeHistogram(nidx);
+  using ReduceT = typename std::remove_pointer<decltype(d_node_hist.data())>::type::ValueT;
+  auto rc = collective::GlobalSum(
+      ctx, info,
+      linalg::MakeVec(reinterpret_cast<ReduceT*>(d_node_hist.data()),
+                      d_node_hist.size() * 2 * num_histograms, ctx->Device()));
+  SafeColl(rc);
+  this->monitor_.Stop(__func__);
 }
 }  // namespace xgboost::tree
```
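
The `* 2 * num_histograms` element count in `AllReduceHist` follows from two facts visible in the headers: each `GradientPairInt64` holds two 64-bit values, and histograms for a batch of nodes are allocated contiguously, so one collective call covers them all. A host-side sketch of that flattening, with the collective stubbed out and simplified types (not the real `collective::GlobalSum` signature):

```cpp
#include <cstddef>
#include <cstdint>

struct GradientPairInt64Sketch {  // simplified stand-in for GradientPairInt64
  std::int64_t grad, hess;
};

// Stand-in for the collective allreduce: element-wise sum over n values
// across workers. No-op in this sketch.
void GlobalSumStub(std::int64_t* values, std::size_t n) {}

void AllReduceHistSketch(GradientPairInt64Sketch* first_node_hist, std::size_t bins_per_node,
                         std::size_t num_histograms) {
  // Two scalars per bin, num_histograms node histograms back to back in
  // memory, hence a single reduction over bins * 2 * num_histograms values.
  auto* flat = reinterpret_cast<std::int64_t*>(first_node_hist);
  GlobalSumStub(flat, bins_per_node * 2 * num_histograms);
}
```
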

src/tree/gpu_hist/histogram.cuh
Lines changed: 83 additions & 47 deletions

```diff
@@ -9,7 +9,9 @@
 #include "../../common/device_helpers.cuh"  // for LaunchN
 #include "../../common/device_vector.cuh"   // for device_vector
 #include "../../data/ellpack_page.cuh"      // for EllpackDeviceAccessor
+#include "expand_entry.cuh"                 // for GPUExpandEntry
 #include "feature_groups.cuh"               // for FeatureGroupsAccessor
+#include "quantiser.cuh"                    // for GradientQuantiser
 #include "xgboost/base.h"                   // for GradientPair, GradientPairInt64
 #include "xgboost/context.h"                // for Context
 #include "xgboost/span.h"                   // for Span
@@ -34,92 +36,67 @@ XGBOOST_DEV_INLINE void AtomicAdd64As32(int64_t* dst, int64_t src) {
   atomicAdd(y_high, sig);
 }
 
-class GradientQuantiser {
- private:
-  /* Convert gradient to fixed point representation. */
-  GradientPairPrecise to_fixed_point_;
-  /* Convert fixed point representation back to floating point. */
-  GradientPairPrecise to_floating_point_;
-
- public:
-  GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair, MetaInfo const& info);
-  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
-    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-                                      gpair.GetHess() * to_fixed_point_.GetHess());
-    return adjusted;
-  }
-  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64
-  ToFixedPoint(GradientPairPrecise const& gpair) const {
-    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-                                      gpair.GetHess() * to_fixed_point_.GetHess());
-    return adjusted;
-  }
-  [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
-  ToFloatingPoint(const GradientPairInt64& gpair) const {
-    auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
-    auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
-    return {g,h};
-  }
-};
+namespace cuda_impl {
+// Start with about 16mb
+std::size_t constexpr DftReserveSize() { return 1 << 22; }
+}  // namespace cuda_impl
 
 /**
  * @brief Data storage for node histograms on device. Automatically expands.
  *
- * @tparam kStopGrowingSize  Do not grow beyond this size
- *
  * @author  Rory
  * @date    28/07/2018
  */
-template <size_t kStopGrowingSize = 1 << 28>
 class DeviceHistogramStorage {
  private:
   using GradientSumT = GradientPairInt64;
+  std::size_t stop_growing_size_{0};
   /** @brief Map nidx to starting index of its histogram. */
   std::map<int, size_t> nidx_map_;
   // Large buffer of zeroed memory, caches histograms
   dh::device_vector<typename GradientSumT::ValueT> data_;
-  // If we run out of storage allocate one histogram at a time
-  // in overflow. Not cached, overwritten when a new histogram
-  // is requested
+  // If we run out of storage allocate one histogram at a time in overflow. Not cached,
+  // overwritten when a new histogram is requested
   dh::device_vector<typename GradientSumT::ValueT> overflow_;
   std::map<int, size_t> overflow_nidx_map_;
   int n_bins_;
-  DeviceOrd device_id_;
-  static constexpr size_t kNumItemsInGradientSum =
+  static constexpr std::size_t kNumItemsInGradientSum =
       sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
   static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
 
  public:
-  // Start with about 16mb
-  DeviceHistogramStorage() { data_.reserve(1 << 22); }
-  void Init(DeviceOrd device_id, int n_bins) {
-    this->n_bins_ = n_bins;
-    this->device_id_ = device_id;
-  }
+  explicit DeviceHistogramStorage() { data_.reserve(cuda_impl::DftReserveSize()); }
 
-  void Reset(Context const* ctx) {
+  void Reset(Context const* ctx, bst_bin_t n_total_bins, std::size_t max_cached_nodes) {
+    this->n_bins_ = n_total_bins;
     auto d_data = data_.data().get();
     dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
                 [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
     nidx_map_.clear();
     overflow_nidx_map_.clear();
+
+    auto max_cached_bin_values =
+        static_cast<std::size_t>(n_total_bins) * max_cached_nodes * kNumItemsInGradientSum;
+    this->stop_growing_size_ = max_cached_bin_values;
   }
-  [[nodiscard]] bool HistogramExists(int nidx) const {
+
+  [[nodiscard]] bool HistogramExists(bst_node_t nidx) const {
     return nidx_map_.find(nidx) != nidx_map_.cend() ||
            overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
   }
   [[nodiscard]] int Bins() const { return n_bins_; }
   [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
   dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
 
-  void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
+  void AllocateHistograms(Context const* ctx, std::vector<bst_node_t> const& new_nidxs) {
     for (int nidx : new_nidxs) {
       CHECK(!HistogramExists(nidx));
     }
     // Number of items currently used in data
     const size_t used_size = nidx_map_.size() * HistogramSize();
     const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
-    if (used_size >= kStopGrowingSize) {
+    CHECK_GE(this->stop_growing_size_, kNumItemsInGradientSum);
+    if (used_size >= this->stop_growing_size_) {
       // Use overflow
       // Delete previous entries
       overflow_nidx_map_.clear();
@@ -171,18 +148,77 @@ class DeviceHistogramBuilderImpl;
 
 class DeviceHistogramBuilder {
   std::unique_ptr<DeviceHistogramBuilderImpl> p_impl_;
+  DeviceHistogramStorage hist_;
+  common::Monitor monitor_;
 
  public:
-  DeviceHistogramBuilder();
+  explicit DeviceHistogramBuilder();
   ~DeviceHistogramBuilder();
 
-  void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups,
+  void Reset(Context const* ctx, std::size_t max_cached_hist_nodes,
+             FeatureGroupsAccessor const& feature_groups, bst_bin_t n_total_bins,
              bool force_global_memory);
   void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
                       FeatureGroupsAccessor const& feature_groups,
                       common::Span<GradientPair const> gpair,
                       common::Span<const std::uint32_t> ridx,
                       common::Span<GradientPairInt64> histogram, GradientQuantiser rounding);
+
+  [[nodiscard]] auto GetNodeHistogram(bst_node_t nidx) { return hist_.GetNodeHistogram(nidx); }
+
+  // num histograms is the number of contiguous histograms in memory to reduce over
+  void AllReduceHist(Context const* ctx, MetaInfo const& info, bst_node_t nidx,
+                     std::size_t num_histograms);
+
+  // Attempt to do subtraction trick
+  // return true if succeeded
+  [[nodiscard]] bool SubtractionTrick(bst_node_t nidx_parent, bst_node_t nidx_histogram,
+                                      bst_node_t nidx_subtraction) {
+    if (!hist_.HistogramExists(nidx_histogram) || !hist_.HistogramExists(nidx_parent)) {
+      return false;
+    }
+    auto d_node_hist_parent = hist_.GetNodeHistogram(nidx_parent);
+    auto d_node_hist_histogram = hist_.GetNodeHistogram(nidx_histogram);
+    auto d_node_hist_subtraction = hist_.GetNodeHistogram(nidx_subtraction);
+
+    dh::LaunchN(d_node_hist_parent.size(), [=] __device__(size_t idx) {
+      d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx];
+    });
+    return true;
+  }
+
+  [[nodiscard]] auto SubtractHist(std::vector<GPUExpandEntry> const& candidates,
+                                  std::vector<bst_node_t> const& build_nidx,
+                                  std::vector<bst_node_t> const& subtraction_nidx) {
+    this->monitor_.Start(__func__);
+    std::vector<bst_node_t> need_build;
+    for (std::size_t i = 0; i < subtraction_nidx.size(); i++) {
+      auto build_hist_nidx = build_nidx.at(i);
+      auto subtraction_trick_nidx = subtraction_nidx.at(i);
+      auto parent_nidx = candidates.at(i).nid;
+
+      if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
+        need_build.push_back(subtraction_trick_nidx);
+      }
+    }
+    this->monitor_.Stop(__func__);
+    return need_build;
+  }
+
+  void AllocateHistograms(Context const* ctx, std::vector<bst_node_t> const& nodes_to_build,
+                          std::vector<bst_node_t> const& nodes_to_sub) {
+    this->monitor_.Start(__func__);
+    std::vector<bst_node_t> all_new = nodes_to_build;
+    all_new.insert(all_new.end(), nodes_to_sub.cbegin(), nodes_to_sub.cend());
+    // Allocate the histograms
+    // Guaranteed contiguous memory
+    this->AllocateHistograms(ctx, all_new);
+    this->monitor_.Stop(__func__);
+  }
+
+  void AllocateHistograms(Context const* ctx, std::vector<int> const& new_nidxs) {
+    this->hist_.AllocateHistograms(ctx, new_nidxs);
+  }
 };
 }  // namespace xgboost::tree
 #endif  // HISTOGRAM_CUH_
```
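
The extracted `SubtractionTrick`/`SubtractHist` pair relies on the invariant that a parent's histogram is the bin-wise sum of its two children's, so after building one child the sibling comes from a cheap subtraction instead of a second build pass. A host-side sketch of the same arithmetic the `dh::LaunchN` kernel performs, with simplified types:

```cpp
#include <cstdint>
#include <vector>

struct PairSketch {  // simplified stand-in for GradientPairInt64
  std::int64_t grad{0}, hess{0};
};

// parent = left + right bin-wise, hence sibling = parent - built child.
// The GPU version does exactly this, one bin per CUDA thread.
std::vector<PairSketch> SubtractionTrickSketch(std::vector<PairSketch> const& parent,
                                               std::vector<PairSketch> const& built_child) {
  std::vector<PairSketch> sibling(parent.size());
  for (std::size_t i = 0; i < parent.size(); ++i) {
    sibling[i] = {parent[i].grad - built_child[i].grad, parent[i].hess - built_child[i].hess};
  }
  return sibling;
}
```

`SubtractHist` falls back to a real build (via `need_build`) whenever either source histogram has been evicted from the bounded cache, which is exactly the case the new `max_cached_hist_node` bound can trigger on GPU.
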

src/tree/gpu_hist/quantiser.cuh
Lines changed: 39 additions & 0 deletions

```diff
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2020-2024, XGBoost Contributors
+ */
+#pragma once
+#include "xgboost/base.h"     // for GradientPairPrecise, GradientPairInt64
+#include "xgboost/context.h"  // for Context
+#include "xgboost/data.h"     // for MetaInfo
+#include "xgboost/span.h"     // for Span
+
+namespace xgboost::tree {
+class GradientQuantiser {
+ private:
+  /* Convert gradient to fixed point representation. */
+  GradientPairPrecise to_fixed_point_;
+  /* Convert fixed point representation back to floating point. */
+  GradientPairPrecise to_floating_point_;
+
+ public:
+  GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair,
+                    MetaInfo const& info);
+  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
+    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
+                                      gpair.GetHess() * to_fixed_point_.GetHess());
+    return adjusted;
+  }
+  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64
+  ToFixedPoint(GradientPairPrecise const& gpair) const {
+    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
+                                      gpair.GetHess() * to_fixed_point_.GetHess());
+    return adjusted;
+  }
+  [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
+  ToFloatingPoint(const GradientPairInt64& gpair) const {
+    auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
+    auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
+    return {g, h};
+  }
+};
+}  // namespace xgboost::tree
```
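
A host-side sketch of the round trip this class provides, with made-up scale factors (the real constructor derives `to_fixed_point_`/`to_floating_point_` from the gradient statistics). The point of the fixed-point form is that integer accumulation is order-independent, so parallel histogram builds stay deterministic:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Illustrative scales only; GradientQuantiser computes these from the data.
  double to_fixed = 1e6;
  double to_float = 1.0 / to_fixed;
  double grad = 0.123456789;
  std::int64_t q = static_cast<std::int64_t>(grad * to_fixed);  // cf. ToFixedPoint
  double back = q * to_float;                                   // cf. ToFloatingPoint
  std::cout << grad << " -> " << q << " -> " << back << "\n";   // tiny rounding loss
}
```
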

src/tree/hist/hist_cache.h
Lines changed: 4 additions & 4 deletions

```diff
@@ -11,7 +11,7 @@
 #include "../../common/hist_util.h"          // for GHistRow, ConstGHistRow
 #include "../../common/ref_resource_view.h"  // for ReallocVector
 #include "xgboost/base.h"                    // for bst_node_t, bst_bin_t
-#include "xgboost/logging.h"                 // for CHECK_GT
+#include "xgboost/logging.h"                 // for CHECK_EQ
 #include "xgboost/span.h"                    // for Span
 
 namespace xgboost::tree {
@@ -40,7 +40,7 @@ class BoundedHistCollection {
   // number of histogram bins across all features
   bst_bin_t n_total_bins_{0};
   // limits the number of nodes that can be in the cache for each tree
-  std::size_t n_cached_nodes_{0};
+  std::size_t max_cached_nodes_{0};
   // whether the tree has grown beyond the cache limit
   bool has_exceeded_{false};
 
@@ -58,7 +58,7 @@ class BoundedHistCollection {
   }
   void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
     n_total_bins_ = n_total_bins;
-    n_cached_nodes_ = n_cached_nodes;
+    max_cached_nodes_ = n_cached_nodes;
     this->Clear(false);
   }
   /**
@@ -73,7 +73,7 @@ class BoundedHistCollection {
   [[nodiscard]] bool CanHost(common::Span<bst_node_t const> nodes_to_build,
                              common::Span<bst_node_t const> nodes_to_sub) const {
     auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
-    return n_new_nodes + node_map_.size() <= n_cached_nodes_;
+    return n_new_nodes + node_map_.size() <= max_cached_nodes_;
   }
 
   /**
```

src/tree/hist/histogram.h
Lines changed: 1 addition & 1 deletion

```diff
@@ -61,7 +61,7 @@ class HistogramBuilder {
             bool is_col_split, HistMakerTrainParam const *param) {
     n_threads_ = ctx->Threads();
     param_ = p;
-    hist_.Reset(total_bins, param->max_cached_hist_node);
+    hist_.Reset(total_bins, param->MaxCachedHistNodes(ctx->Device()));
     buffer_.Init(total_bins);
     is_distributed_ = is_distributed;
     is_col_split_ = is_col_split;
```
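
`MaxCachedHistNodes` itself is not part of this diff; presumably it resolves `max_cached_hist_node` with a device-dependent default, in line with the documentation change above. A purely hypothetical sketch of that shape, with names and defaults assumed rather than taken from the source:

```cpp
#include <cstddef>

// Hypothetical sketch only: the real HistMakerTrainParam::MaxCachedHistNodes
// is not shown in this commit. Illustrates a user-value-or-device-default shape.
std::size_t MaxCachedHistNodesSketch(bool is_cuda, bool user_set, std::size_t user_value) {
  constexpr std::size_t kDftCpu = 65536;  // documented default
  constexpr std::size_t kDftGpu = 65536;  // assumed; could differ per device
  if (user_set) {
    return user_value;
  }
  return is_cuda ? kDftGpu : kDftCpu;
}
```
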
