Commit 61dd854

[EM] Refactor GPU histogram builder. (dmlc#10764)

- Expose the maximum number of cached nodes to be consistent with the CPU implementation. Also easier for testing.
- Extract the subtraction trick for easier testing.
- Split up the `GradientQuantiser` to avoid circular dependency.

1 parent 34937fe, commit 61dd854
17 files changed: +394 -187 lines

doc/parameter.rst
Lines changed: 3 additions & 3 deletions

```diff
@@ -232,12 +232,12 @@ Parameters for Tree Booster
 
 * ``max_cached_hist_node``, [default = 65536]
 
-  Maximum number of cached nodes for CPU histogram.
+  Maximum number of cached nodes for histogram.
 
   .. versionadded:: 2.0.0
 
-  - For most of the cases this parameter should not be set except for growing deep trees
-    on CPU.
+  - For most of the cases this parameter should not be set except for growing deep
+    trees. After 3.0, this parameter affects GPU algorithms as well.
 
 .. _cat-param:
 
```
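
For a sense of scale: per the `DeviceHistogramStorage` changes further down, each cached node stores one `GradientPairInt64` per bin, i.e. two 64-bit counters (`kNumItemsInGradientSum == 2`). A back-of-the-envelope sketch of the bound this parameter implies, with illustrative bin and feature counts (not from the source):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>

// Sketch of the cache bound implied by max_cached_hist_node. Mirrors the
// stop_growing_size_ arithmetic in DeviceHistogramStorage::Reset below:
// cached values = n_total_bins * max_cached_nodes * 2 (grad + hess per bin).
int main() {
  std::size_t n_total_bins = 256 * 100;  // illustrative: max_bin=256 over 100 features
  std::size_t max_cached_nodes = 65536;  // documented default for max_cached_hist_node
  std::size_t n_values = n_total_bins * max_cached_nodes * 2;
  std::cout << "bytes if bound is reached: " << n_values * sizeof(std::int64_t) << "\n";
}
```

With the default, the bound is effectively never hit for shallow trees; the storage only allocates histograms for nodes actually built and falls back to an uncached overflow buffer past the limit.
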
include/xgboost/c_api.h
Lines changed: 1 addition & 0 deletions

```diff
@@ -522,6 +522,7 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand
  * - nthread (optional): Number of threads used for initializing DMatrix.
  * - max_bin (optional): Maximum number of bins for building histogram. Must be consistent with
  *     the corresponding booster training parameter.
+ * - on_host (optional): Whether the data should be placed on host memory. Used by GPU inputs.
  * @param out The created Quantile DMatrix.
  *
  * @return 0 when success, -1 when failure happens
```
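
A compilable sketch of where the new `on_host` flag goes in the JSON config; the iterator callbacks here are stubs, and a real caller must stream batches through the proxy DMatrix in `next`:

```cpp
#include <xgboost/c_api.h>

#include <cstdio>

// Stub callbacks, shown only to place the config string: a real `next` feeds
// one batch into the proxy and returns 1, then 0 once the data is exhausted.
static void Reset(DataIterHandle) {}
static int Next(DataIterHandle) { return 0; }

int main() {
  DMatrixHandle proxy, out;
  XGProxyDMatrixCreate(&proxy);
  // `on_host` joins the existing optional fields (nthread, max_bin).
  char const* config = R"({"missing": NaN, "nthread": 8, "max_bin": 256, "on_host": true})";
  int rc = XGQuantileDMatrixCreateFromCallback(/*iter=*/nullptr, proxy, /*ref=*/nullptr, Reset,
                                               Next, config, &out);
  std::printf("rc=%d\n", rc);
  return rc;
}
```
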

src/data/ellpack_page_raw_format.cu
Lines changed: 4 additions & 4 deletions

```diff
@@ -60,10 +60,10 @@ template <typename T>
   RET_IF_NOT(fi->Read(&impl->is_dense));
   RET_IF_NOT(fi->Read(&impl->row_stride));
 
-  if (has_hmm_ats_ && !this->param_.prefetch_copy) {
-    RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
-  } else {
+  if (this->param_.prefetch_copy || !has_hmm_ats_) {
     RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer));
+  } else {
+    RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
   }
   RET_IF_NOT(fi->Read(&impl->base_rowid));
   dh::DefaultStream().Sync();
@@ -95,7 +95,7 @@ template <typename T>
   CHECK(this->cuts_->cut_values_.DeviceCanRead());
   impl->SetCuts(this->cuts_);
 
-  fi->Read(page, this->param_.prefetch_copy);
+  fi->Read(page, this->param_.prefetch_copy || !this->has_hmm_ats_);
   dh::DefaultStream().Sync();
 
   return true;
```
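
In the first hunk, the rewritten condition is the exact negation of the old one (De Morgan), so the branches merely swap to put the device-read path first; the second hunk additionally forces a copy whenever `has_hmm_ats_` is false. A minimal equivalence check for the first hunk:

```cpp
#include <cassert>

// Exhaustive check that the branch swap preserves behaviour:
// old ReadVec condition:        has_hmm_ats_ && !prefetch_copy
// new ReadDeviceVec condition:  prefetch_copy || !has_hmm_ats_
int main() {
  for (bool has_hmm_ats : {false, true}) {
    for (bool prefetch_copy : {false, true}) {
      bool old_host_read = has_hmm_ats && !prefetch_copy;
      bool new_device_read = prefetch_copy || !has_hmm_ats;
      assert(old_host_read == !new_device_read);  // identical branch selection
    }
  }
  return 0;
}
```
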

src/tree/gpu_hist/expand_entry.cuh
Lines changed: 4 additions & 4 deletions

```diff
@@ -1,15 +1,15 @@
 /**
- * Copyright 2020-2023, XGBoost Contributors
+ * Copyright 2020-2024, XGBoost Contributors
  */
 #ifndef EXPAND_ENTRY_CUH_
 #define EXPAND_ENTRY_CUH_
 
 #include <limits>   // for numeric_limits
 #include <utility>  // for move
 
-#include "../param.h"
-#include "../updater_gpu_common.cuh"
-#include "xgboost/base.h"  // for bst_node_t
+#include "../param.h"                 // for TrainParam
+#include "../updater_gpu_common.cuh"  // for DeviceSplitCandidate
+#include "xgboost/base.h"             // for bst_node_t
 
 namespace xgboost::tree {
 struct GPUExpandEntry {
```

src/tree/gpu_hist/histogram.cu
Lines changed: 24 additions & 3 deletions

```diff
@@ -356,13 +356,19 @@ class DeviceHistogramBuilderImpl {
 };
 
 DeviceHistogramBuilder::DeviceHistogramBuilder()
-    : p_impl_{std::make_unique<DeviceHistogramBuilderImpl>()} {}
+    : p_impl_{std::make_unique<DeviceHistogramBuilderImpl>()} {
+  monitor_.Init(__func__);
+}
 
 DeviceHistogramBuilder::~DeviceHistogramBuilder() = default;
 
-void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups,
-                                   bool force_global_memory) {
+void DeviceHistogramBuilder::Reset(Context const* ctx, std::size_t max_cached_hist_nodes,
+                                   FeatureGroupsAccessor const& feature_groups,
+                                   bst_bin_t n_total_bins, bool force_global_memory) {
+  this->monitor_.Start(__func__);
   this->p_impl_->Reset(ctx, feature_groups, force_global_memory);
+  this->hist_.Reset(ctx, n_total_bins, max_cached_hist_nodes);
+  this->monitor_.Stop(__func__);
 }
 
 void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
@@ -372,6 +378,21 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
                                             common::Span<const cuda_impl::RowIndexT> ridx,
                                             common::Span<GradientPairInt64> histogram,
                                             GradientQuantiser rounding) {
+  this->monitor_.Start(__func__);
   this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding);
+  this->monitor_.Stop(__func__);
+}
+
+void DeviceHistogramBuilder::AllReduceHist(Context const* ctx, MetaInfo const& info,
+                                           bst_node_t nidx, std::size_t num_histograms) {
+  this->monitor_.Start(__func__);
+  auto d_node_hist = hist_.GetNodeHistogram(nidx);
+  using ReduceT = typename std::remove_pointer<decltype(d_node_hist.data())>::type::ValueT;
+  auto rc = collective::GlobalSum(
+      ctx, info,
+      linalg::MakeVec(reinterpret_cast<ReduceT*>(d_node_hist.data()),
+                      d_node_hist.size() * 2 * num_histograms, ctx->Device()));
+  SafeColl(rc);
+  this->monitor_.Stop(__func__);
 }
 }  // namespace xgboost::tree
```
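
The `* 2 * num_histograms` element count in `AllReduceHist` follows from two facts visible in the headers: each `GradientPairInt64` holds two 64-bit values, and histograms for a batch of nodes are allocated contiguously, so one collective call covers them all. A host-side sketch of that flattening, with the collective stubbed out and simplified types (not the real `collective::GlobalSum` signature):

```cpp
#include <cstddef>
#include <cstdint>

struct GradientPairInt64Sketch {  // simplified stand-in for GradientPairInt64
  std::int64_t grad, hess;
};

// Stand-in for the collective allreduce: element-wise sum over n values
// across workers. No-op in this sketch.
void GlobalSumStub(std::int64_t* values, std::size_t n) {}

void AllReduceHistSketch(GradientPairInt64Sketch* first_node_hist, std::size_t bins_per_node,
                         std::size_t num_histograms) {
  // Two scalars per bin, num_histograms node histograms back to back in
  // memory, hence a single reduction over bins * 2 * num_histograms values.
  auto* flat = reinterpret_cast<std::int64_t*>(first_node_hist);
  GlobalSumStub(flat, bins_per_node * 2 * num_histograms);
}
```
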

src/tree/gpu_hist/histogram.cuh
Lines changed: 83 additions & 47 deletions

```diff
@@ -9,7 +9,9 @@
 #include "../../common/device_helpers.cuh"  // for LaunchN
 #include "../../common/device_vector.cuh"   // for device_vector
 #include "../../data/ellpack_page.cuh"      // for EllpackDeviceAccessor
+#include "expand_entry.cuh"                 // for GPUExpandEntry
 #include "feature_groups.cuh"               // for FeatureGroupsAccessor
+#include "quantiser.cuh"                    // for GradientQuantiser
 #include "xgboost/base.h"                   // for GradientPair, GradientPairInt64
 #include "xgboost/context.h"                // for Context
 #include "xgboost/span.h"                   // for Span
@@ -34,92 +36,67 @@ XGBOOST_DEV_INLINE void AtomicAdd64As32(int64_t* dst, int64_t src) {
   atomicAdd(y_high, sig);
 }
 
-class GradientQuantiser {
- private:
-  /* Convert gradient to fixed point representation. */
-  GradientPairPrecise to_fixed_point_;
-  /* Convert fixed point representation back to floating point. */
-  GradientPairPrecise to_floating_point_;
-
- public:
-  GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair, MetaInfo const& info);
-  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
-    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-                                      gpair.GetHess() * to_fixed_point_.GetHess());
-    return adjusted;
-  }
-  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64
-  ToFixedPoint(GradientPairPrecise const& gpair) const {
-    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-                                      gpair.GetHess() * to_fixed_point_.GetHess());
-    return adjusted;
-  }
-  [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
-  ToFloatingPoint(const GradientPairInt64& gpair) const {
-    auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
-    auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
-    return {g,h};
-  }
-};
+namespace cuda_impl {
+// Start with about 16mb
+std::size_t constexpr DftReserveSize() { return 1 << 22; }
+}  // namespace cuda_impl
 
 /**
  * @brief Data storage for node histograms on device. Automatically expands.
  *
- * @tparam kStopGrowingSize  Do not grow beyond this size
- *
  * @author  Rory
  * @date    28/07/2018
  */
-template <size_t kStopGrowingSize = 1 << 28>
 class DeviceHistogramStorage {
  private:
   using GradientSumT = GradientPairInt64;
+  std::size_t stop_growing_size_{0};
   /** @brief Map nidx to starting index of its histogram. */
   std::map<int, size_t> nidx_map_;
   // Large buffer of zeroed memory, caches histograms
   dh::device_vector<typename GradientSumT::ValueT> data_;
-  // If we run out of storage allocate one histogram at a time
-  // in overflow. Not cached, overwritten when a new histogram
-  // is requested
+  // If we run out of storage allocate one histogram at a time in overflow. Not cached,
+  // overwritten when a new histogram is requested
   dh::device_vector<typename GradientSumT::ValueT> overflow_;
   std::map<int, size_t> overflow_nidx_map_;
   int n_bins_;
-  DeviceOrd device_id_;
-  static constexpr size_t kNumItemsInGradientSum =
+  static constexpr std::size_t kNumItemsInGradientSum =
       sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
   static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
 
  public:
-  // Start with about 16mb
-  DeviceHistogramStorage() { data_.reserve(1 << 22); }
-  void Init(DeviceOrd device_id, int n_bins) {
-    this->n_bins_ = n_bins;
-    this->device_id_ = device_id;
-  }
+  explicit DeviceHistogramStorage() { data_.reserve(cuda_impl::DftReserveSize()); }
 
-  void Reset(Context const* ctx) {
+  void Reset(Context const* ctx, bst_bin_t n_total_bins, std::size_t max_cached_nodes) {
+    this->n_bins_ = n_total_bins;
     auto d_data = data_.data().get();
     dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
                 [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
     nidx_map_.clear();
     overflow_nidx_map_.clear();
+
+    auto max_cached_bin_values =
+        static_cast<std::size_t>(n_total_bins) * max_cached_nodes * kNumItemsInGradientSum;
+    this->stop_growing_size_ = max_cached_bin_values;
   }
-  [[nodiscard]] bool HistogramExists(int nidx) const {
+
+  [[nodiscard]] bool HistogramExists(bst_node_t nidx) const {
     return nidx_map_.find(nidx) != nidx_map_.cend() ||
            overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
   }
   [[nodiscard]] int Bins() const { return n_bins_; }
   [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
   dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }
 
-  void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
+  void AllocateHistograms(Context const* ctx, std::vector<bst_node_t> const& new_nidxs) {
     for (int nidx : new_nidxs) {
       CHECK(!HistogramExists(nidx));
     }
     // Number of items currently used in data
     const size_t used_size = nidx_map_.size() * HistogramSize();
     const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
-    if (used_size >= kStopGrowingSize) {
+    CHECK_GE(this->stop_growing_size_, kNumItemsInGradientSum);
+    if (used_size >= this->stop_growing_size_) {
       // Use overflow
       // Delete previous entries
       overflow_nidx_map_.clear();
@@ -171,18 +148,77 @@ class DeviceHistogramBuilderImpl;
 
 class DeviceHistogramBuilder {
   std::unique_ptr<DeviceHistogramBuilderImpl> p_impl_;
+  DeviceHistogramStorage hist_;
+  common::Monitor monitor_;
 
  public:
-  DeviceHistogramBuilder();
+  explicit DeviceHistogramBuilder();
   ~DeviceHistogramBuilder();
 
-  void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups,
+  void Reset(Context const* ctx, std::size_t max_cached_hist_nodes,
+             FeatureGroupsAccessor const& feature_groups, bst_bin_t n_total_bins,
              bool force_global_memory);
   void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
                       FeatureGroupsAccessor const& feature_groups,
                       common::Span<GradientPair const> gpair,
                       common::Span<const std::uint32_t> ridx,
                       common::Span<GradientPairInt64> histogram, GradientQuantiser rounding);
+
+  [[nodiscard]] auto GetNodeHistogram(bst_node_t nidx) { return hist_.GetNodeHistogram(nidx); }
+
+  // num histograms is the number of contiguous histograms in memory to reduce over
+  void AllReduceHist(Context const* ctx, MetaInfo const& info, bst_node_t nidx,
+                     std::size_t num_histograms);
+
+  // Attempt to do subtraction trick
+  // return true if succeeded
+  [[nodiscard]] bool SubtractionTrick(bst_node_t nidx_parent, bst_node_t nidx_histogram,
+                                      bst_node_t nidx_subtraction) {
+    if (!hist_.HistogramExists(nidx_histogram) || !hist_.HistogramExists(nidx_parent)) {
+      return false;
+    }
+    auto d_node_hist_parent = hist_.GetNodeHistogram(nidx_parent);
+    auto d_node_hist_histogram = hist_.GetNodeHistogram(nidx_histogram);
+    auto d_node_hist_subtraction = hist_.GetNodeHistogram(nidx_subtraction);
+
+    dh::LaunchN(d_node_hist_parent.size(), [=] __device__(size_t idx) {
+      d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx];
+    });
+    return true;
+  }
+
+  [[nodiscard]] auto SubtractHist(std::vector<GPUExpandEntry> const& candidates,
+                                  std::vector<bst_node_t> const& build_nidx,
+                                  std::vector<bst_node_t> const& subtraction_nidx) {
+    this->monitor_.Start(__func__);
+    std::vector<bst_node_t> need_build;
+    for (std::size_t i = 0; i < subtraction_nidx.size(); i++) {
+      auto build_hist_nidx = build_nidx.at(i);
+      auto subtraction_trick_nidx = subtraction_nidx.at(i);
+      auto parent_nidx = candidates.at(i).nid;
+
+      if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
+        need_build.push_back(subtraction_trick_nidx);
+      }
+    }
+    this->monitor_.Stop(__func__);
+    return need_build;
+  }
+
+  void AllocateHistograms(Context const* ctx, std::vector<bst_node_t> const& nodes_to_build,
+                          std::vector<bst_node_t> const& nodes_to_sub) {
+    this->monitor_.Start(__func__);
+    std::vector<bst_node_t> all_new = nodes_to_build;
+    all_new.insert(all_new.end(), nodes_to_sub.cbegin(), nodes_to_sub.cend());
+    // Allocate the histograms
+    // Guaranteed contiguous memory
+    this->AllocateHistograms(ctx, all_new);
+    this->monitor_.Stop(__func__);
+  }
+
+  void AllocateHistograms(Context const* ctx, std::vector<int> const& new_nidxs) {
+    this->hist_.AllocateHistograms(ctx, new_nidxs);
+  }
 };
 }  // namespace xgboost::tree
 #endif  // HISTOGRAM_CUH_
```
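
The extracted `SubtractionTrick`/`SubtractHist` pair relies on the invariant that a parent's histogram is the bin-wise sum of its two children's, so after building one child the sibling comes from a cheap subtraction instead of a second build pass. A host-side sketch of the same arithmetic the `dh::LaunchN` kernel performs, with simplified types:

```cpp
#include <cstdint>
#include <vector>

struct PairSketch {  // simplified stand-in for GradientPairInt64
  std::int64_t grad{0}, hess{0};
};

// parent = left + right bin-wise, hence sibling = parent - built child.
// The GPU version does exactly this, one bin per CUDA thread.
std::vector<PairSketch> SubtractionTrickSketch(std::vector<PairSketch> const& parent,
                                               std::vector<PairSketch> const& built_child) {
  std::vector<PairSketch> sibling(parent.size());
  for (std::size_t i = 0; i < parent.size(); ++i) {
    sibling[i] = {parent[i].grad - built_child[i].grad, parent[i].hess - built_child[i].hess};
  }
  return sibling;
}
```

`SubtractHist` falls back to a real build (via `need_build`) whenever either source histogram has been evicted from the bounded cache, which is exactly the case the new `max_cached_hist_node` bound can trigger on GPU.
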

src/tree/gpu_hist/quantiser.cuh
Lines changed: 39 additions & 0 deletions

```diff
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2020-2024, XGBoost Contributors
+ */
+#pragma once
+#include "xgboost/base.h"     // for GradientPairPrecise, GradientPairInt64
+#include "xgboost/context.h"  // for Context
+#include "xgboost/data.h"     // for MetaInfo
+#include "xgboost/span.h"     // for Span
+
+namespace xgboost::tree {
+class GradientQuantiser {
+ private:
+  /* Convert gradient to fixed point representation. */
+  GradientPairPrecise to_fixed_point_;
+  /* Convert fixed point representation back to floating point. */
+  GradientPairPrecise to_floating_point_;
+
+ public:
+  GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair,
+                    MetaInfo const& info);
+  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
+    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
+                                      gpair.GetHess() * to_fixed_point_.GetHess());
+    return adjusted;
+  }
+  [[nodiscard]] XGBOOST_DEVICE GradientPairInt64
+  ToFixedPoint(GradientPairPrecise const& gpair) const {
+    auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
+                                      gpair.GetHess() * to_fixed_point_.GetHess());
+    return adjusted;
+  }
+  [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
+  ToFloatingPoint(const GradientPairInt64& gpair) const {
+    auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
+    auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
+    return {g, h};
+  }
+};
+}  // namespace xgboost::tree
```
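
A host-side sketch of the round trip this class provides, with made-up scale factors (the real constructor derives `to_fixed_point_`/`to_floating_point_` from the gradient statistics). The point of the fixed-point form is that integer accumulation is order-independent, so parallel histogram builds stay deterministic:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Illustrative scales only; GradientQuantiser computes these from the data.
  double to_fixed = 1e6;
  double to_float = 1.0 / to_fixed;
  double grad = 0.123456789;
  std::int64_t q = static_cast<std::int64_t>(grad * to_fixed);  // cf. ToFixedPoint
  double back = q * to_float;                                   // cf. ToFloatingPoint
  std::cout << grad << " -> " << q << " -> " << back << "\n";   // tiny rounding loss
}
```
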

src/tree/hist/hist_cache.h
Lines changed: 4 additions & 4 deletions

```diff
@@ -11,7 +11,7 @@
 #include "../../common/hist_util.h"          // for GHistRow, ConstGHistRow
 #include "../../common/ref_resource_view.h"  // for ReallocVector
 #include "xgboost/base.h"                    // for bst_node_t, bst_bin_t
-#include "xgboost/logging.h"                 // for CHECK_GT
+#include "xgboost/logging.h"                 // for CHECK_EQ
 #include "xgboost/span.h"                    // for Span
 
 namespace xgboost::tree {
@@ -40,7 +40,7 @@ class BoundedHistCollection {
   // number of histogram bins across all features
   bst_bin_t n_total_bins_{0};
   // limits the number of nodes that can be in the cache for each tree
-  std::size_t n_cached_nodes_{0};
+  std::size_t max_cached_nodes_{0};
   // whether the tree has grown beyond the cache limit
   bool has_exceeded_{false};
 
@@ -58,7 +58,7 @@ class BoundedHistCollection {
   }
   void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
     n_total_bins_ = n_total_bins;
-    n_cached_nodes_ = n_cached_nodes;
+    max_cached_nodes_ = n_cached_nodes;
     this->Clear(false);
   }
   /**
@@ -73,7 +73,7 @@ class BoundedHistCollection {
   [[nodiscard]] bool CanHost(common::Span<bst_node_t const> nodes_to_build,
                              common::Span<bst_node_t const> nodes_to_sub) const {
     auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
-    return n_new_nodes + node_map_.size() <= n_cached_nodes_;
+    return n_new_nodes + node_map_.size() <= max_cached_nodes_;
   }
 
   /**
```

src/tree/hist/histogram.h
Lines changed: 1 addition & 1 deletion

```diff
@@ -61,7 +61,7 @@ class HistogramBuilder {
             bool is_col_split, HistMakerTrainParam const *param) {
     n_threads_ = ctx->Threads();
     param_ = p;
-    hist_.Reset(total_bins, param->max_cached_hist_node);
+    hist_.Reset(total_bins, param->MaxCachedHistNodes(ctx->Device()));
     buffer_.Init(total_bins);
     is_distributed_ = is_distributed;
     is_col_split_ = is_col_split;
```
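
`MaxCachedHistNodes` itself is not part of this diff; presumably it resolves `max_cached_hist_node` with a device-dependent default, in line with the documentation change above. A purely hypothetical sketch of that shape, with names and defaults assumed rather than taken from the source:

```cpp
#include <cstddef>

// Hypothetical sketch only: the real HistMakerTrainParam::MaxCachedHistNodes
// is not shown in this commit. Illustrates a user-value-or-device-default shape.
std::size_t MaxCachedHistNodesSketch(bool is_cuda, bool user_set, std::size_t user_value) {
  constexpr std::size_t kDftCpu = 65536;  // documented default
  constexpr std::size_t kDftGpu = 65536;  // assumed; could differ per device
  if (user_set) {
    return user_value;
  }
  return is_cuda ? kDftGpu : kDftCpu;
}
```
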
