#include "../../common/device_helpers.cuh"  // for LaunchN
#include "../../common/device_vector.cuh"  // for device_vector
#include "../../data/ellpack_page.cuh"  // for EllpackDeviceAccessor
+ #include "expand_entry.cuh"  // for GPUExpandEntry
#include "feature_groups.cuh"  // for FeatureGroupsAccessor
+ #include "quantiser.cuh"  // for GradientQuantiser
#include "xgboost/base.h"  // for GradientPair, GradientPairInt64
#include "xgboost/context.h"  // for Context
#include "xgboost/span.h"  // for Span
@@ -34,92 +36,67 @@ XGBOOST_DEV_INLINE void AtomicAdd64As32(int64_t* dst, int64_t src) {
  atomicAdd(y_high, sig);
}

- class GradientQuantiser {
-  private:
-   /* Convert gradient to fixed point representation. */
-   GradientPairPrecise to_fixed_point_;
-   /* Convert fixed point representation back to floating point. */
-   GradientPairPrecise to_floating_point_;
-
-  public:
-   GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair, MetaInfo const& info);
-   [[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
-     auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-                                       gpair.GetHess() * to_fixed_point_.GetHess());
-     return adjusted;
-   }
-   [[nodiscard]] XGBOOST_DEVICE GradientPairInt64
-   ToFixedPoint(GradientPairPrecise const& gpair) const {
-     auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
-                                       gpair.GetHess() * to_fixed_point_.GetHess());
-     return adjusted;
-   }
-   [[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
-   ToFloatingPoint(const GradientPairInt64& gpair) const {
-     auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
-     auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
-     return {g, h};
-   }
- };
+ namespace cuda_impl {
+ // Start with about 16mb
+ std::size_t constexpr DftReserveSize() { return 1 << 22; }
+ }  // namespace cuda_impl

/**
 * @brief Data storage for node histograms on device. Automatically expands.
 *
-  * @tparam kStopGrowingSize Do not grow beyond this size
-  *
 * @author Rory
 * @date 28/07/2018
 */
- template <size_t kStopGrowingSize = 1 << 28>
class DeviceHistogramStorage {
 private:
  using GradientSumT = GradientPairInt64;
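+   // Maximum number of cached bin values (ValueT items) before new histograms spill into
+   // the overflow buffer; configured by Reset().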
+   std::size_t stop_growing_size_{0};
  /** @brief Map nidx to starting index of its histogram. */
  std::map<int, size_t> nidx_map_;
  // Large buffer of zeroed memory, caches histograms
  dh::device_vector<typename GradientSumT::ValueT> data_;
-   // If we run out of storage allocate one histogram at a time
-   // in overflow. Not cached, overwritten when a new histogram
-   // is requested
+   // If we run out of storage allocate one histogram at a time in overflow. Not cached,
+   // overwritten when a new histogram is requested
  dh::device_vector<typename GradientSumT::ValueT> overflow_;
  std::map<int, size_t> overflow_nidx_map_;
  int n_bins_;
-   DeviceOrd device_id_;
-   static constexpr size_t kNumItemsInGradientSum =
+   static constexpr std::size_t kNumItemsInGradientSum =
      sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
  static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");

 public:
-   // Start with about 16mb
-   DeviceHistogramStorage() { data_.reserve(1 << 22); }
-   void Init(DeviceOrd device_id, int n_bins) {
-     this->n_bins_ = n_bins;
-     this->device_id_ = device_id;
-   }
+   explicit DeviceHistogramStorage() { data_.reserve(cuda_impl::DftReserveSize()); }

-   void Reset(Context const* ctx) {
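+   // Reset all cached histograms and configure the cache: n_total_bins bins per histogram,
+   // keeping at most max_cached_nodes node histograms before the overflow buffer is used.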
+   void Reset(Context const* ctx, bst_bin_t n_total_bins, std::size_t max_cached_nodes) {
+     this->n_bins_ = n_total_bins;
    auto d_data = data_.data().get();
    dh::LaunchN(data_.size(), ctx->CUDACtx()->Stream(),
                [=] __device__(size_t idx) { d_data[idx] = 0.0f; });
    nidx_map_.clear();
    overflow_nidx_map_.clear();
+
+     auto max_cached_bin_values =
+         static_cast<std::size_t>(n_total_bins) * max_cached_nodes * kNumItemsInGradientSum;
+     this->stop_growing_size_ = max_cached_bin_values;
  }
-   [[nodiscard]] bool HistogramExists(int nidx) const {
+
+   [[nodiscard]] bool HistogramExists(bst_node_t nidx) const {
    return nidx_map_.find(nidx) != nidx_map_.cend() ||
           overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend();
  }
  [[nodiscard]] int Bins() const { return n_bins_; }
  [[nodiscard]] size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; }
  dh::device_vector<typename GradientSumT::ValueT>& Data() { return data_; }

-   void AllocateHistograms(Context const* ctx, const std::vector<int>& new_nidxs) {
+   void AllocateHistograms(Context const* ctx, std::vector<bst_node_t> const& new_nidxs) {
    for (int nidx : new_nidxs) {
      CHECK(!HistogramExists(nidx));
    }
    // Number of items currently used in data
    const size_t used_size = nidx_map_.size() * HistogramSize();
    const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size();
-     if (used_size >= kStopGrowingSize) {
+     CHECK_GE(this->stop_growing_size_, kNumItemsInGradientSum);
+     if (used_size >= this->stop_growing_size_) {
      // Use overflow
      // Delete previous entries
      overflow_nidx_map_.clear();
@@ -171,18 +148,77 @@ class DeviceHistogramBuilderImpl;

class DeviceHistogramBuilder {
  std::unique_ptr<DeviceHistogramBuilderImpl> p_impl_;
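+   // Cached per-node histograms and a monitor for timing the builder's operations.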
+   DeviceHistogramStorage hist_;
+   common::Monitor monitor_;

 public:
-   DeviceHistogramBuilder();
+   explicit DeviceHistogramBuilder();
  ~DeviceHistogramBuilder();

-   void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups,
+   void Reset(Context const* ctx, std::size_t max_cached_hist_nodes,
+              FeatureGroupsAccessor const& feature_groups, bst_bin_t n_total_bins,
             bool force_global_memory);
  void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
                      FeatureGroupsAccessor const& feature_groups,
                      common::Span<GradientPair const> gpair,
                      common::Span<const std::uint32_t> ridx,
                      common::Span<GradientPairInt64> histogram, GradientQuantiser rounding);
+
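+   // Return the cached histogram of a node as a span of quantised gradient sums.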
+   [[nodiscard]] auto GetNodeHistogram(bst_node_t nidx) { return hist_.GetNodeHistogram(nidx); }
+
+   // num histograms is the number of contiguous histograms in memory to reduce over
+   void AllReduceHist(Context const* ctx, MetaInfo const& info, bst_node_t nidx,
+                      std::size_t num_histograms);
+
+   // Attempt to do subtraction trick
+   // return true if succeeded
+   [[nodiscard]] bool SubtractionTrick(bst_node_t nidx_parent, bst_node_t nidx_histogram,
+                                       bst_node_t nidx_subtraction) {
+     if (!hist_.HistogramExists(nidx_histogram) || !hist_.HistogramExists(nidx_parent)) {
+       return false;
+     }
+     auto d_node_hist_parent = hist_.GetNodeHistogram(nidx_parent);
+     auto d_node_hist_histogram = hist_.GetNodeHistogram(nidx_histogram);
+     auto d_node_hist_subtraction = hist_.GetNodeHistogram(nidx_subtraction);
+
+     dh::LaunchN(d_node_hist_parent.size(), [=] __device__(size_t idx) {
+       d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx];
+     });
+     return true;
+   }
+
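+   // Apply the subtraction trick to each candidate; returns the subtraction nodes whose
+   // histograms could not be derived and therefore still need to be built directly.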
+   [[nodiscard]] auto SubtractHist(std::vector<GPUExpandEntry> const& candidates,
+                                   std::vector<bst_node_t> const& build_nidx,
+                                   std::vector<bst_node_t> const& subtraction_nidx) {
+     this->monitor_.Start(__func__);
+     std::vector<bst_node_t> need_build;
+     for (std::size_t i = 0; i < subtraction_nidx.size(); i++) {
+       auto build_hist_nidx = build_nidx.at(i);
+       auto subtraction_trick_nidx = subtraction_nidx.at(i);
+       auto parent_nidx = candidates.at(i).nid;
+
+       if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
+         need_build.push_back(subtraction_trick_nidx);
+       }
+     }
+     this->monitor_.Stop(__func__);
+     return need_build;
+   }
+
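+   // Allocate histograms for the nodes to build and the subtraction-trick nodes in a single
+   // contiguous block of storage.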
+   void AllocateHistograms(Context const* ctx, std::vector<bst_node_t> const& nodes_to_build,
+                           std::vector<bst_node_t> const& nodes_to_sub) {
+     this->monitor_.Start(__func__);
+     std::vector<bst_node_t> all_new = nodes_to_build;
+     all_new.insert(all_new.end(), nodes_to_sub.cbegin(), nodes_to_sub.cend());
+     // Allocate the histograms
+     // Guaranteed contiguous memory
+     this->AllocateHistograms(ctx, all_new);
+     this->monitor_.Stop(__func__);
+   }
+
+   void AllocateHistograms(Context const* ctx, std::vector<int> const& new_nidxs) {
+     this->hist_.AllocateHistograms(ctx, new_nidxs);
+   }
};
}  // namespace xgboost::tree
#endif  // HISTOGRAM_CUH_