@@ -15,17 +15,20 @@ namespace xgboost {
 namespace tree {
 
 // With constraints
-XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan,
-                                       const GradientPairPrecise &missing,
-                                       const GradientPairPrecise &parent_sum,
+XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan,
+                                       const GradientPairInt64 &missing,
+                                       const GradientPairInt64 &parent_sum,
                                        const GPUTrainingParam &param, bst_node_t nidx,
                                        bst_feature_t fidx,
                                        TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
-                                       bool &missing_left_out) {  // NOLINT
+                                       bool &missing_left_out, const GradientQuantiser &quantiser) {  // NOLINT
   const auto left_sum = scan + missing;
-  float missing_left_gain =
-      evaluator.CalcSplitGain(param, nidx, fidx, left_sum, parent_sum - left_sum);
-  float missing_right_gain = evaluator.CalcSplitGain(param, nidx, fidx, scan, parent_sum - scan);
+  float missing_left_gain = evaluator.CalcSplitGain(
+      param, nidx, fidx, quantiser.ToFloatingPoint(left_sum),
+      quantiser.ToFloatingPoint(parent_sum - left_sum));
+  float missing_right_gain = evaluator.CalcSplitGain(
+      param, nidx, fidx, quantiser.ToFloatingPoint(scan),
+      quantiser.ToFloatingPoint(parent_sum - scan));
 
   missing_left_out = missing_left_gain > missing_right_gain;
   return missing_left_out ? missing_left_gain : missing_right_gain;
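
Note on the hunk above: split statistics now stay in quantised integer form end to end and are converted to floating point only at the point of gain evaluation. A minimal standalone sketch of that idea with a toy quantiser (only the quantise/ToFloatingPoint round-trip is modelled; the scale choice and every name besides ToFloatingPoint are hypothetical, not the real GradientQuantiser API):

#include <cstdint>
#include <iostream>

struct GradF { double grad, hess; };
struct GradI64 {
  std::int64_t grad, hess;
  GradI64 operator+(GradI64 o) const { return {grad + o.grad, hess + o.hess}; }
  GradI64 operator-(GradI64 o) const { return {grad - o.grad, hess - o.hess}; }
};

// Toy fixed-point quantiser: a single global scale, chosen here arbitrarily.
struct ToyQuantiser {
  double scale{1e-9};
  GradI64 Quantise(GradF g) const {
    return {static_cast<std::int64_t>(g.grad / scale),
            static_cast<std::int64_t>(g.hess / scale)};
  }
  GradF ToFloatingPoint(GradI64 g) const { return {g.grad * scale, g.hess * scale}; }
};

int main() {
  ToyQuantiser q;
  GradI64 parent = q.Quantise({1.0, 4.0});
  GradI64 scan = q.Quantise({0.25, 1.0});
  GradI64 missing = q.Quantise({0.05, 0.5});
  // Sums and differences stay in integers (exact and associative)...
  GradI64 left = scan + missing;
  GradI64 right = parent - left;
  // ...and only the gain evaluation sees floating-point values.
  GradF l = q.ToFloatingPoint(left), r = q.ToFloatingPoint(right);
  std::cout << "left grad " << l.grad << ", right hess " << r.hess << "\n";
}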
@@ -42,9 +45,9 @@ template <int kBlockSize>
 class EvaluateSplitAgent {
  public:
   using ArgMaxT = cub::KeyValuePair<int, float>;
-  using BlockScanT = cub::BlockScan<GradientPairPrecise, kBlockSize>;
+  using BlockScanT = cub::BlockScan<GradientPairInt64, kBlockSize>;
   using MaxReduceT = cub::WarpReduce<ArgMaxT>;
-  using SumReduceT = cub::WarpReduce<GradientPairPrecise>;
+  using SumReduceT = cub::WarpReduce<GradientPairInt64>;
 
   struct TempStorage {
     typename BlockScanT::TempStorage scan;
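
The motivation for scanning GradientPairInt64 rather than GradientPairPrecise, as we understand it, is reproducibility: integer addition is associative, so a parallel scan or reduction gives the same bits regardless of the order in which threads combine partial sums, while floating-point addition does not. A small host-side demonstration:

#include <cstdio>

int main() {
  // Floating point: the result depends on association order.
  float a = 1e8f, b = -1e8f, c = 1.0f;
  std::printf("float: %g vs %g\n", (a + b) + c, a + (b + c));  // prints 1 vs 0
  // 64-bit integers: both orders agree exactly.
  long long x = 100000000LL, y = -100000000LL, z = 1LL;
  std::printf("int64: %lld vs %lld\n", (x + y) + z, x + (y + z));  // 1 vs 1
}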
@@ -59,67 +62,67 @@ class EvaluateSplitAgent {
   const uint32_t gidx_end;  // end bin for i^th feature
   const dh::LDGIterator<float> feature_values;
   const GradientPairInt64 *node_histogram;
-  const GradientQuantizer &rounding;
-  const GradientPairPrecise parent_sum;
-  const GradientPairPrecise missing;
+  const GradientQuantiser &rounding;
+  const GradientPairInt64 parent_sum;
+  const GradientPairInt64 missing;
   const GPUTrainingParam &param;
   const TreeEvaluator::SplitEvaluator<GPUTrainingParam> &evaluator;
   TempStorage *temp_storage;
-  SumCallbackOp<GradientPairPrecise> prefix_op;
+  SumCallbackOp<GradientPairInt64> prefix_op;
   static float constexpr kNullGain = -std::numeric_limits<bst_float>::infinity();
 
-  __device__ EvaluateSplitAgent(TempStorage *temp_storage, int fidx,
-                                const EvaluateSplitInputs &inputs,
-                                const EvaluateSplitSharedInputs &shared_inputs,
-                                const TreeEvaluator::SplitEvaluator<GPUTrainingParam> &evaluator)
-      : temp_storage(temp_storage),
-        nidx(inputs.nidx),
-        fidx(fidx),
+  __device__ EvaluateSplitAgent(
+      TempStorage *temp_storage, int fidx, const EvaluateSplitInputs &inputs,
+      const EvaluateSplitSharedInputs &shared_inputs,
+      const TreeEvaluator::SplitEvaluator<GPUTrainingParam> &evaluator)
+      : temp_storage(temp_storage), nidx(inputs.nidx), fidx(fidx),
         min_fvalue(__ldg(shared_inputs.min_fvalue.data() + fidx)),
         gidx_begin(__ldg(shared_inputs.feature_segments.data() + fidx)),
         gidx_end(__ldg(shared_inputs.feature_segments.data() + fidx + 1)),
         feature_values(shared_inputs.feature_values.data()),
         node_histogram(inputs.gradient_histogram.data()),
         rounding(shared_inputs.rounding),
-        parent_sum(dh::LDGIterator<GradientPairPrecise>(&inputs.parent_sum)[0]),
-        param(shared_inputs.param),
-        evaluator(evaluator),
+        parent_sum(dh::LDGIterator<GradientPairInt64>(&inputs.parent_sum)[0]),
+        param(shared_inputs.param), evaluator(evaluator),
         missing(parent_sum - ReduceFeature()) {
-    static_assert(kBlockSize == 32,
-                  "This kernel relies on the assumption block_size == warp_size");
+    static_assert(
+        kBlockSize == 32,
+        "This kernel relies on the assumption block_size == warp_size");
+    // There should be no missing value gradients for a dense matrix
+    KERNEL_CHECK(!shared_inputs.is_dense || missing.GetQuantisedHess() == 0);
   }
-  __device__ GradientPairPrecise ReduceFeature() {
-    GradientPairPrecise local_sum;
-    for (int idx = gidx_begin + threadIdx.x; idx < gidx_end; idx += kBlockSize) {
+  __device__ GradientPairInt64 ReduceFeature() {
+    GradientPairInt64 local_sum;
+    for (int idx = gidx_begin + threadIdx.x; idx < gidx_end;
+         idx += kBlockSize) {
       local_sum += LoadGpair(node_histogram + idx);
     }
     local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum);
     // Broadcast result from thread 0
-    return {__shfl_sync(0xffffffff, local_sum.GetGrad(), 0),
-            __shfl_sync(0xffffffff, local_sum.GetHess(), 0)};
+    return {__shfl_sync(0xffffffff, local_sum.GetQuantisedGrad(), 0),
+            __shfl_sync(0xffffffff, local_sum.GetQuantisedHess(), 0)};
   }
 
   // Load using efficient 128 vector load instruction
-  __device__ __forceinline__ GradientPairPrecise LoadGpair(const GradientPairInt64 *ptr) {
+  __device__ __forceinline__ GradientPairInt64 LoadGpair(const GradientPairInt64 *ptr) {
     float4 tmp = *reinterpret_cast<const float4 *>(ptr);
-    auto gpair_int = *reinterpret_cast<const GradientPairInt64 *>(&tmp);
-    static_assert(sizeof(decltype(gpair_int)) == sizeof(float4),
+    auto gpair = *reinterpret_cast<const GradientPairInt64 *>(&tmp);
+    static_assert(sizeof(decltype(gpair)) == sizeof(float4),
                   "Vector type size does not match gradient pair size.");
-    return rounding.ToFloatingPoint(gpair_int);
+    return gpair;
   }
 
   __device__ __forceinline__ void Numerical(DeviceSplitCandidate *__restrict__ best_split) {
     for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) {
       bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
-      GradientPairPrecise bin = thread_active ? LoadGpair(node_histogram + scan_begin + threadIdx.x)
-                                              : GradientPairPrecise();
+      GradientPairInt64 bin = thread_active ? LoadGpair(node_histogram + scan_begin + threadIdx.x)
+                                            : GradientPairInt64();
       BlockScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op);
       // Whether the gradient of missing values is put to the left side.
       bool missing_left = true;
       float gain = thread_active ? LossChangeMissing(bin, missing, parent_sum, param, nidx, fidx,
-                                                     evaluator, missing_left)
+                                                     evaluator, missing_left, rounding)
                                  : kNullGain;
-
       // Find thread with best gain
       auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
       // This reduce result is only valid in thread 0
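
ReduceFeature above now shuffles the two quantised 64-bit components from lane 0 instead of two doubles. A standalone CUDA sketch of the same reduce-then-broadcast pattern (a toy kernel, not the XGBoost code; it assumes block_size == warp_size == 32, the same assumption the static_assert enforces):

#include <cstdio>
#include <cub/cub.cuh>

__global__ void WarpSumBroadcast(const long long *in, long long *out) {
  using SumReduceT = cub::WarpReduce<long long>;
  __shared__ typename SumReduceT::TempStorage temp;
  long long sum = SumReduceT(temp).Sum(in[threadIdx.x]);  // valid in lane 0 only
  sum = __shfl_sync(0xffffffff, sum, 0);                  // broadcast to all lanes
  out[threadIdx.x] = sum;
}

int main() {
  long long h[32], *d_in, *d_out;
  for (int i = 0; i < 32; ++i) h[i] = i;
  cudaMalloc(&d_in, sizeof(h));
  cudaMalloc(&d_out, sizeof(h));
  cudaMemcpy(d_in, h, sizeof(h), cudaMemcpyHostToDevice);
  WarpSumBroadcast<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(h, d_out, sizeof(h), cudaMemcpyDeviceToHost);
  std::printf("every lane sees %lld\n", h[0]);  // 0 + 1 + ... + 31 = 496
  cudaFree(d_in);
  cudaFree(d_out);
}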
@@ -132,10 +135,10 @@ class EvaluateSplitAgent {
       int split_gidx = (scan_begin + threadIdx.x) - 1;
       float fvalue =
           split_gidx < static_cast<int>(gidx_begin) ? min_fvalue : feature_values[split_gidx];
-      GradientPairPrecise left = missing_left ? bin + missing : bin;
-      GradientPairPrecise right = parent_sum - left;
+      GradientPairInt64 left = missing_left ? bin + missing : bin;
+      GradientPairInt64 right = parent_sum - left;
       best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
-                         false, param);
+                         false, param, rounding);
     }
   }
 }
@@ -145,12 +148,12 @@ class EvaluateSplitAgent {
       bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
 
       auto rest = thread_active ? LoadGpair(node_histogram + scan_begin + threadIdx.x)
-                                : GradientPairPrecise();
-      GradientPairPrecise bin = parent_sum - rest - missing;
+                                : GradientPairInt64();
+      GradientPairInt64 bin = parent_sum - rest - missing;
       // Whether the gradient of missing values is put to the left side.
       bool missing_left = true;
       float gain = thread_active ? LossChangeMissing(bin, missing, parent_sum, param, nidx, fidx,
-                                                     evaluator, missing_left)
+                                                     evaluator, missing_left, rounding)
                                  : kNullGain;
 
       // Find thread with best gain
@@ -162,10 +165,10 @@ class EvaluateSplitAgent {
     if (threadIdx.x == best_thread) {
       int32_t split_gidx = (scan_begin + threadIdx.x);
       float fvalue = feature_values[split_gidx];
-      GradientPairPrecise left = missing_left ? bin + missing : bin;
-      GradientPairPrecise right = parent_sum - left;
+      GradientPairInt64 left = missing_left ? bin + missing : bin;
+      GradientPairInt64 right = parent_sum - left;
       best_split->UpdateCat(gain, missing_left ? kLeftDir : kRightDir,
-                            static_cast<bst_cat_t>(fvalue), fidx, left, right, param);
+                            static_cast<bst_cat_t>(fvalue), fidx, left, right, param, rounding);
     }
   }
 }
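
In the one-hot path covered by the two hunks above, each candidate split sends a single category one way and everything else the other, which is why the per-thread statistic is parent_sum - rest - missing. A host-side sketch of that enumeration (toy types, hypothetical Side() gain of g^2/(h + lambda); a plain max over the two missing-value directions stands in for LossChangeMissing):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct Grad { double g, h; };
Grad operator+(Grad a, Grad b) { return {a.g + b.g, a.h + b.h}; }
Grad operator-(Grad a, Grad b) { return {a.g - b.g, a.h - b.h}; }

// Hypothetical per-side gain: g^2 / (h + lambda).
double Side(Grad s, double lambda = 1.0) { return s.g * s.g / (s.h + lambda); }

// One-hot candidates: category c alone on one side, everything else on the other.
std::size_t BestOneHot(const std::vector<Grad> &bins, Grad parent, Grad missing) {
  std::size_t best = 0;
  double best_gain = -1e300;
  for (std::size_t c = 0; c < bins.size(); ++c) {
    Grad rest = bins[c];                   // statistics of category c itself
    Grad other = parent - rest - missing;  // all other categories, missing excluded
    double gain_right = Side(other) + Side(parent - other);  // missing goes with c
    double gain_left = Side(other + missing) + Side(rest);   // missing goes with the rest
    double gain = std::max(gain_left, gain_right);
    if (gain > best_gain) { best_gain = gain; best = c; }
  }
  return best;
}

int main() {
  std::vector<Grad> bins{{0.4, 1.0}, {-0.7, 2.0}, {0.1, 0.5}};
  Grad parent{-0.1, 4.0}, missing{0.1, 0.5};
  std::printf("best category: %zu\n", BestOneHot(bins, parent, missing));
}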
@@ -174,11 +177,13 @@ class EvaluateSplitAgent {
   */
   __device__ __forceinline__ void PartitionUpdate(bst_bin_t scan_begin, bool thread_active,
                                                   bool missing_left, bst_bin_t it,
-                                                  GradientPairPrecise const &left_sum,
-                                                  GradientPairPrecise const &right_sum,
+                                                  GradientPairInt64 const &left_sum,
+                                                  GradientPairInt64 const &right_sum,
                                                   DeviceSplitCandidate *__restrict__ best_split) {
-    auto gain =
-        thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) : kNullGain;
+    auto gain = thread_active
+                    ? evaluator.CalcSplitGain(param, nidx, fidx, rounding.ToFloatingPoint(left_sum),
+                                              rounding.ToFloatingPoint(right_sum))
+                    : kNullGain;
 
     // Find thread with best gain
     auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
@@ -191,7 +196,7 @@ class EvaluateSplitAgent {
       // index of best threshold inside a feature.
       auto best_thresh = it - gidx_begin;
       best_split->UpdateCat(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left_sum,
-                            right_sum, param);
+                            right_sum, param, rounding);
     }
   }
   /**
@@ -213,28 +218,28 @@ class EvaluateSplitAgent {
       bool thread_active = it < it_end;
 
       auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - node_offset)
-                                     : GradientPairPrecise();
+                                     : GradientPairInt64();
       // No min value for cat feature, use inclusive scan.
       BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op);
-      GradientPairPrecise left_sum = parent_sum - right_sum;
+      GradientPairInt64 left_sum = parent_sum - right_sum;
 
       PartitionUpdate(scan_begin, thread_active, true, it, left_sum, right_sum, best_split);
     }
 
     // backward
     it_begin = gidx_end - 1;
     it_end = it_begin - n_bins + 1;
-    prefix_op = SumCallbackOp<GradientPairPrecise>{};  // reset
+    prefix_op = SumCallbackOp<GradientPairInt64>{};  // reset
 
     for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) {
      auto it = scan_begin - static_cast<bst_bin_t>(threadIdx.x);
       bool thread_active = it > it_end;
 
       auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - node_offset)
-                                    : GradientPairPrecise();
+                                    : GradientPairInt64();
       // No min value for cat feature, use inclusive scan.
       BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, prefix_op);
-      GradientPairPrecise right_sum = parent_sum - left_sum;
+      GradientPairInt64 right_sum = parent_sum - left_sum;
 
       PartitionUpdate(scan_begin, thread_active, false, it, left_sum, right_sum, best_split);
     }
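
The partition-based path above scans the bins in sorted order twice, once forward and once backward, so every contiguous prefix of the sorted categories is evaluated as a candidate partition from both ends. A host-side sketch of the two passes, with plain prefix sums standing in for cub's InclusiveSum (toy types and gain function, as in the earlier sketches):

#include <cstddef>
#include <cstdio>
#include <vector>

struct Grad { double g, h; };
Grad operator+(Grad a, Grad b) { return {a.g + b.g, a.h + b.h}; }
Grad operator-(Grad a, Grad b) { return {a.g - b.g, a.h - b.h}; }
double Side(Grad s, double lambda = 1.0) { return s.g * s.g / (s.h + lambda); }

int main() {
  // Bins already permuted by sorted_idx (e.g. ascending grad/hess ratio).
  std::vector<Grad> sorted{{-0.7, 2.0}, {0.1, 0.5}, {0.4, 1.0}};
  Grad parent{-0.2, 3.5};

  // Forward pass: right_sum is an inclusive prefix, left_sum the complement.
  Grad right_sum{0, 0};
  for (std::size_t it = 0; it < sorted.size(); ++it) {
    right_sum = right_sum + sorted[it];
    Grad left_sum = parent - right_sum;
    std::printf("fwd thresh %zu gain %g\n", it, Side(left_sum) + Side(right_sum));
  }

  // Backward pass: the mirror-image prefixes, matching the second loop above.
  Grad left_sum{0, 0};
  for (std::size_t it = sorted.size(); it-- > 0;) {
    left_sum = left_sum + sorted[it];
    Grad rs = parent - left_sum;
    std::printf("bwd thresh %zu gain %g\n", it, Side(left_sum) + Side(rs));
  }
}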
@@ -399,22 +404,30 @@ void GPUHistEvaluator::EvaluateSplits(
     auto const input = d_inputs[i];
     auto &split = out_splits[i];
     // Subtract parent gain here
-    // As it is constant, this is more efficient than doing it during every split evaluation
-    float parent_gain = CalcGain(shared_inputs.param, input.parent_sum);
+    // As it is constant, this is more efficient than doing it during every
+    // split evaluation
+    float parent_gain =
+        CalcGain(shared_inputs.param,
+                 shared_inputs.rounding.ToFloatingPoint(input.parent_sum));
     split.loss_chg -= parent_gain;
     auto fidx = out_splits[i].findex;
 
     if (split.is_cat) {
       SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i,
-                          device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]);
+                          device_cats_accessor.GetNodeCatStorage(input.nidx),
+                          &out_splits[i]);
     }
 
-    float base_weight = evaluator.CalcWeight(input.nidx, shared_inputs.param,
-                                             GradStats{split.left_sum + split.right_sum});
-    float left_weight =
-        evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.left_sum});
-    float right_weight =
-        evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.right_sum});
+    float base_weight =
+        evaluator.CalcWeight(input.nidx, shared_inputs.param,
+                             shared_inputs.rounding.ToFloatingPoint(
+                                 split.left_sum + split.right_sum));
+    float left_weight = evaluator.CalcWeight(
+        input.nidx, shared_inputs.param,
+        shared_inputs.rounding.ToFloatingPoint(split.left_sum));
+    float right_weight = evaluator.CalcWeight(
+        input.nidx, shared_inputs.param,
+        shared_inputs.rounding.ToFloatingPoint(split.right_sum));
 
     d_entries[i] = GPUExpandEntry{input.nidx, input.depth, out_splits[i],
                                   base_weight, left_weight, right_weight};
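
On the host side, the quantised sums are now converted to floating point once per candidate before CalcGain/CalcWeight. A sketch of that final conversion feeding the standard leaf-weight formula w = -G/(H + lambda), using the toy quantiser from the first sketch (hypothetical names; the real CalcWeight also applies constraints such as max_delta_step, which this omits):

#include <cstdint>
#include <cstdio>

struct GradF { double grad, hess; };
struct GradI64 {
  std::int64_t grad, hess;
  GradI64 operator+(GradI64 o) const { return {grad + o.grad, hess + o.hess}; }
};
struct ToyQuantiser {
  double scale{1e-9};
  GradF ToFloatingPoint(GradI64 g) const { return {g.grad * scale, g.hess * scale}; }
};

// Plain L2-regularised leaf weight; the real evaluator adds further terms.
double CalcWeight(GradF sum, double lambda = 1.0) {
  return -sum.grad / (sum.hess + lambda);
}

int main() {
  ToyQuantiser q;
  GradI64 left{250000000, 1000000000}, right{-300000000, 2500000000};
  // Convert once per candidate, then work in floating point.
  double base = CalcWeight(q.ToFloatingPoint(left + right));
  double lw = CalcWeight(q.ToFloatingPoint(left));
  double rw = CalcWeight(q.ToFloatingPoint(right));
  std::printf("base %g left %g right %g\n", base, lw, rw);
}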