Commit 508ac13

Check cub errors. (dmlc#10721)
- Make sure the CUDA error returned by the cub scan is caught.
- Avoid a temporary buffer allocation in the thrust device vector.
1 parent b949a4b commit 508ac13

5 files changed, +27 -21 lines changed

src/data/ellpack_page.cu

Lines changed: 10 additions & 7 deletions
@@ -309,9 +309,9 @@ ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch)
 ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch)
 
 namespace {
-void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const> d_row_ptr,
-                        size_t row_stride, common::CompressedByteT* d_compressed_buffer,
-                        size_t null) {
+void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page,
+                        common::Span<size_t const> d_row_ptr, size_t row_stride,
+                        common::CompressedByteT* d_compressed_buffer, size_t null) {
   dh::device_vector<uint8_t> data(page.index.begin(), page.index.end());
   auto d_data = dh::ToSpan(data);
 
@@ -323,7 +323,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
   common::CompressedBufferWriter writer{page.cut.TotalBins() +
                                         static_cast<std::size_t>(1)};  // +1 for null value
 
-  dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable {
+  auto cuctx = ctx->CUDACtx();
+  dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable {
     auto ridx = idx / row_stride;
     auto ifeature = idx % row_stride;
 
@@ -336,7 +337,7 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span<size_t const>
       return;
     }
 
-    size_t offset = 0;
+    bst_idx_t offset = 0;
     if (!d_csc_indptr.empty()) {
       // is dense, ifeature is the actual feature index.
       offset = d_csc_indptr[ifeature];
@@ -362,7 +363,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
   row_stride = *std::max_element(it, it + page.Size());
 
   CHECK(ctx->IsCUDA());
-  InitCompressedData(ctx);
+  this->InitCompressedData(ctx);
 
   // copy gidx
   common::CompressedByteT* d_compressed_buffer = gidx_buffer.data();
@@ -373,7 +374,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
 
   auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
   auto null = accessor.NullValue();
-  CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Start("CopyGHistToEllpack");
+  CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, d_compressed_buffer, null);
+  this->monitor_.Stop("CopyGHistToEllpack");
 }
 
 // A functor that copies the data from one EllpackPage to another.
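The ellpack change above threads the Context through CopyGHistToEllpack so the copy kernel is launched on the context's CUDA stream rather than the legacy default stream, and brackets the copy with the timing monitor. A minimal sketch of the stream-passing idea in plain CUDA; FillKernel and Fill are illustrative names, not XGBoost code:

#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

__global__ void FillKernel(std::size_t n, float* out, float value) {
  auto idx = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < n) {
    out[idx] = value;
  }
}

// Launch on the caller-supplied stream so the kernel is ordered with the rest
// of that stream's work instead of with the default stream.
void Fill(cudaStream_t stream, std::size_t n, float* out, float value) {
  std::uint32_t const kBlockSize = 256;
  auto n_blocks = static_cast<std::uint32_t>((n + kBlockSize - 1) / kBlockSize);
  FillKernel<<<n_blocks, kBlockSize, 0, stream>>>(n, out, value);
}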

src/tree/gpu_hist/evaluate_splits.cu

Lines changed: 3 additions & 1 deletion
@@ -472,7 +472,9 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_
 
 GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
                                                      EvaluateSplitSharedInputs shared_inputs) {
-  dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
+  dh::device_vector<EvaluateSplitInputs> inputs(1);
+  dh::safe_cuda(cudaMemcpyAsync(inputs.data().get(), &input, sizeof(input), cudaMemcpyDefault));
+
   dh::TemporaryArray<GPUExpandEntry> out_entries(1);
   this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
                        shared_inputs, dh::ToSpan(out_entries));
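The removed line built a temporary host std::vector only to copy a single struct onto the device; the replacement sizes the device vector up front and issues one async copy straight from the argument. A simplified sketch of the same pattern with plain thrust types; SplitInput and UploadOne are illustrative stand-ins, not the XGBoost API:

#include <cuda_runtime.h>
#include <stdexcept>
#include <thrust/device_vector.h>

struct SplitInput {  // stand-in for a small trivially copyable struct such as EvaluateSplitInputs
  int nidx;
  float gain;
};

void UploadOne(SplitInput const& input, thrust::device_vector<SplitInput>* out) {
  out->resize(1);
  // One host-to-device copy directly from the caller's object; no intermediate
  // std::vector (and hence no extra host allocation) is created.
  auto status = cudaMemcpyAsync(out->data().get(), &input, sizeof(input), cudaMemcpyDefault);
  if (status != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(status));
  }
}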

src/tree/gpu_hist/histogram.cu

Lines changed: 2 additions & 2 deletions
@@ -325,7 +325,7 @@ class DeviceHistogramBuilderImpl {
   void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix,
                       FeatureGroupsAccessor const& feature_groups,
                       common::Span<GradientPair const> gpair,
-                      common::Span<const std::uint32_t> d_ridx,
+                      common::Span<const cuda_impl::RowIndexT> d_ridx,
                       common::Span<GradientPairInt64> histogram, GradientQuantiser rounding) {
     CHECK(kernel_);
     // Otherwise launch blocks such that each block has a minimum amount of work to do
@@ -369,7 +369,7 @@ void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx,
                                             EllpackDeviceAccessor const& matrix,
                                             FeatureGroupsAccessor const& feature_groups,
                                             common::Span<GradientPair const> gpair,
-                                            common::Span<const std::uint32_t> ridx,
+                                            common::Span<const cuda_impl::RowIndexT> ridx,
                                             common::Span<GradientPairInt64> histogram,
                                             GradientQuantiser rounding) {
   this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding);

src/tree/gpu_hist/row_partitioner.cuh

Lines changed: 8 additions & 6 deletions
@@ -132,7 +132,7 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                        common::Span<cuda_impl::RowIndexT> ridx,
                        common::Span<cuda_impl::RowIndexT> ridx_tmp,
                        common::Span<cuda_impl::RowIndexT> d_counts, bst_idx_t total_rows, OpT op,
-                       dh::device_vector<int8_t>* tmp) {
+                       dh::DeviceUVector<int8_t>* tmp) {
   dh::LDGIterator<PerNodeData<OpDataT>> batch_info_itr(d_batch_info.data());
   WriteResultsFunctor<OpDataT> write_results{batch_info_itr, ridx.data(), ridx_tmp.data(),
                                              d_counts.data()};
@@ -150,14 +150,16 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
                   go_left};
   });
   std::size_t temp_bytes = 0;
+  // Restriction imposed by cub.
+  CHECK_LE(total_rows, static_cast<bst_idx_t>(std::numeric_limits<std::int32_t>::max()));
   if (tmp->empty()) {
-    cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator,
-                                   IndexFlagOp{}, total_rows);
+    dh::safe_cuda(cub::DeviceScan::InclusiveScan(
+        nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp{}, total_rows));
     tmp->resize(temp_bytes);
   }
   temp_bytes = tmp->size();
-  cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator,
-                                 discard_write_iterator, IndexFlagOp{}, total_rows);
+  dh::safe_cuda(cub::DeviceScan::InclusiveScan(tmp->data(), temp_bytes, input_iterator,
+                                               discard_write_iterator, IndexFlagOp{}, total_rows));
 
   constexpr int kBlockSize = 256;
 
@@ -236,7 +238,7 @@ class RowPartitioner {
   dh::DeviceUVector<RowIndexT> ridx_;
   // Staging area for sorting ridx
   dh::DeviceUVector<RowIndexT> ridx_tmp_;
-  dh::device_vector<int8_t> tmp_;
+  dh::DeviceUVector<int8_t> tmp_;
   dh::PinnedMemory pinned_;
   dh::PinnedMemory pinned2_;
   bst_node_t n_nodes_{0};  // Counter for internal checks.
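Two things change in SortPositionBatch above: the cudaError_t returned by cub::DeviceScan::InclusiveScan is now routed through dh::safe_cuda instead of being discarded, and the scratch buffer is a dh::DeviceUVector that is sized once and reused. A minimal sketch of the same two-phase cub pattern with the status checked, assuming a hypothetical CheckCuda() helper in place of dh::safe_cuda:

#include <cstdint>
#include <stdexcept>
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>

// Hypothetical stand-in for dh::safe_cuda: surface the error instead of
// silently dropping cub's return value.
inline void CheckCuda(cudaError_t status) {
  if (status != cudaSuccess) {
    throw std::runtime_error(cudaGetErrorString(status));
  }
}

// Inclusive prefix sum with a caller-owned scratch buffer that is sized on the
// first call and reused afterwards, mirroring the `tmp` parameter above.
void InclusiveSum(thrust::device_vector<std::uint32_t> const& in,
                  thrust::device_vector<std::uint32_t>* out,
                  thrust::device_vector<std::int8_t>* tmp) {
  out->resize(in.size());
  std::size_t temp_bytes = 0;
  if (tmp->empty()) {
    // A null temp-storage pointer makes cub report the required scratch size.
    CheckCuda(cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, in.data().get(),
                                            out->data().get(), static_cast<int>(in.size())));
    tmp->resize(temp_bytes);
  }
  temp_bytes = tmp->size();
  // Second call runs the scan; cub only requires the buffer to be large enough.
  CheckCuda(cub::DeviceScan::InclusiveSum(tmp->data().get(), temp_bytes, in.data().get(),
                                          out->data().get(), static_cast<int>(in.size())));
}

The static_cast<int> mirrors the new CHECK_LE in the diff, which guards the 32-bit item-count restriction the commit attributes to cub. Switching the scratch space from dh::device_vector to dh::DeviceUVector presumably also avoids value-initializing bytes that cub immediately overwrites.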

tests/cpp/tree/gpu_hist/test_row_partitioner.cu

Lines changed: 4 additions & 5 deletions
@@ -49,9 +49,9 @@ void TestUpdatePositionBatch() {
 TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); }
 
 void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Segment>& segments) {
-  thrust::device_vector<uint32_t> ridx = ridx_in;
-  thrust::device_vector<uint32_t> ridx_tmp(ridx_in.size());
-  thrust::device_vector<bst_uint> counts(segments.size());
+  thrust::device_vector<cuda_impl::RowIndexT> ridx = ridx_in;
+  thrust::device_vector<cuda_impl::RowIndexT> ridx_tmp(ridx_in.size());
+  thrust::device_vector<cuda_impl::RowIndexT> counts(segments.size());
 
   auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
   std::vector<int> op_data(segments.size());
@@ -66,7 +66,7 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
   dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(),
                                 h_batch_info.size() * sizeof(PerNodeData<int>), cudaMemcpyDefault,
                                 nullptr));
-  dh::device_vector<int8_t> tmp;
+  dh::DeviceUVector<int8_t> tmp;
   SortPositionBatch<decltype(op), int>(dh::ToSpan(d_batch_info), dh::ToSpan(ridx),
                                        dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op,
                                        &tmp);
@@ -91,5 +91,4 @@ TEST(GpuHist, SortPositionBatch) {
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}});
 }
-
 }  // namespace xgboost::tree
