Skip to content

Commit 5556bc3

Browse files
authored
[BP] Fixes for large size clusters. (dmlc#10880) (dmlc#10899)
- Increase listener backlog. - Check for empty kernels.
1 parent 41c2680 commit 5556bc3

File tree

6 files changed

+31
-28
lines changed

6 files changed

+31
-28
lines changed

include/xgboost/collective/socket.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -548,13 +548,10 @@ class TCPSocket {
548548
[[nodiscard]] HandleT const &Handle() const { return handle_; }
549549
/**
550550
* @brief Listen to incoming requests. Should be called after bind.
551+
*
552+
* Both the default and minimum backlog is set to 256.
551553
*/
552-
[[nodiscard]] Result Listen(std::int32_t backlog = 16) {
553-
if (listen(handle_, backlog) != 0) {
554-
return system::FailWithCode("Failed to listen.");
555-
}
556-
return Success();
557-
}
554+
[[nodiscard]] Result Listen(std::int32_t backlog = 256);
558555
/**
559556
* @brief Bind socket to INADDR_ANY, return the port selected by the OS.
560557
*/

src/collective/socket.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
*/
44
#include "xgboost/collective/socket.h"
55

6+
#include <algorithm> // for max
67
#include <array> // for array
78
#include <cstddef> // std::size_t
89
#include <cstdint> // std::int32_t
@@ -58,6 +59,14 @@ SockAddrV4 SockAddrV4::InaddrAny() { return MakeSockAddress("0.0.0.0", 0).V4();
5859
SockAddrV6 SockAddrV6::Loopback() { return MakeSockAddress("::1", 0).V6(); }
5960
SockAddrV6 SockAddrV6::InaddrAny() { return MakeSockAddress("::", 0).V6(); }
6061

62+
[[nodiscard]] Result TCPSocket::Listen(std::int32_t backlog) {
63+
backlog = std::max(backlog, 256);
64+
if (listen(this->handle_, backlog) != 0) {
65+
return system::FailWithCode("Failed to listen.");
66+
}
67+
return Success();
68+
}
69+
6170
std::size_t TCPSocket::Send(StringView str) {
6271
CHECK(!this->IsClosed());
6372
CHECK_LT(str.size(), std::numeric_limits<std::int32_t>::max());

src/collective/tracker.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ RabitTracker::RabitTracker(Json const& config) : Tracker{config} {
120120
listener_ = TCPSocket::Create(addr.IsV4() ? SockDomain::kV4 : SockDomain::kV6);
121121
return listener_.Bind(host_, &this->port_);
122122
} << [&] {
123-
return listener_.Listen();
123+
CHECK_GT(this->n_workers_, 0);
124+
return listener_.Listen(this->n_workers_);
124125
};
125126
SafeColl(rc);
126127
}

src/common/device_helpers.cuh

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -224,13 +224,6 @@ __global__ void LaunchNKernel(size_t begin, size_t end, L lambda) {
224224
lambda(i);
225225
}
226226
}
227-
template <typename L>
228-
__global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
229-
L lambda) {
230-
for (auto i : GridStrideRange(begin, end)) {
231-
lambda(i, device_idx);
232-
}
233-
}
234227

235228
/* \brief A wrapper around kernel launching syntax, used to guard against empty input.
236229
*

src/tree/gpu_hist/row_partitioner.cuh

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,11 @@ void SortPositionBatch(common::Span<const PerNodeData<OpDataT>> d_batch_info,
146146

147147
// Value found by experimentation
148148
const int kItemsThread = 12;
149-
const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread);
150149

151-
SortPositionCopyKernel<kBlockSize, RowIndexT, OpDataT>
152-
<<<grid_size, kBlockSize, 0>>>(batch_info_itr, ridx, ridx_tmp, total_rows);
150+
std::uint32_t const kGridSize =
151+
xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread);
152+
dh::LaunchKernel{kGridSize, kBlockSize, 0}(SortPositionCopyKernel<kBlockSize, RowIndexT, OpDataT>,
153+
batch_info_itr, ridx, ridx_tmp, total_rows);
153154
}
154155

155156
struct NodePositionInfo {
@@ -328,11 +329,13 @@ class RowPartitioner {
328329
sizeof(NodePositionInfo) * ridx_segments_.size(),
329330
cudaMemcpyDefault));
330331

331-
constexpr int kBlockSize = 512;
332+
constexpr std::uint32_t kBlockSize = 512;
332333
const int kItemsThread = 8;
333-
const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread);
334+
const std::uint32_t grid_size =
335+
xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread);
334336
common::Span<const RowIndexT> d_ridx(ridx_.data().get(), ridx_.size());
335-
FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize, 0>>>(
337+
dh::LaunchKernel{grid_size, kBlockSize}(
338+
FinalisePositionKernel<kBlockSize, RowIndexT, FinalisePositionOpT>,
336339
dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op);
337340
}
338341
};

tests/cpp/tree/gpu_hist/test_row_partitioner.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,12 @@
66
#include <thrust/host_vector.h>
77
#include <thrust/sequence.h>
88

9-
#include <algorithm>
109
#include <vector>
1110

1211
#include "../../../../src/tree/gpu_hist/row_partitioner.cuh"
1312
#include "../../helpers.h"
1413
#include "xgboost/base.h"
15-
#include "xgboost/context.h"
16-
#include "xgboost/task.h"
17-
#include "xgboost/tree_model.h"
14+
#include "../../helpers.h" // for RandomDataGenerator
1815

1916
namespace xgboost::tree {
2017
void TestUpdatePositionBatch() {
@@ -55,7 +52,9 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
5552
thrust::device_vector<uint32_t> ridx_tmp(ridx_in.size());
5653
thrust::device_vector<bst_uint> counts(segments.size());
5754

58-
auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; };
55+
auto op = [=] __device__(auto ridx, int split_index, int data) {
56+
return ridx % 2 == 0;
57+
};
5958
std::vector<int> op_data(segments.size());
6059
std::vector<PerNodeData<int>> h_batch_info(segments.size());
6160
dh::TemporaryArray<PerNodeData<int>> d_batch_info(segments.size());
@@ -73,7 +72,9 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
7372
dh::ToSpan(ridx_tmp), dh::ToSpan(counts),
7473
total_rows, op, &tmp);
7574

76-
auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; };
75+
auto op_without_data = [=] __device__(auto ridx) {
76+
return ridx % 2 == 0;
77+
};
7778
for (size_t i = 0; i < segments.size(); i++) {
7879
auto begin = ridx.begin() + segments[i].begin;
7980
auto end = ridx.begin() + segments[i].end;
@@ -87,11 +88,10 @@ void TestSortPositionBatch(const std::vector<int>& ridx_in, const std::vector<Se
8788
}
8889
}
8990

90-
TEST(GpuHist, SortPositionBatch) {
91+
TEST(RowPartitioner, SortPositionBatch) {
9192
TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 3}, {3, 6}});
9293
TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 1}, {3, 6}});
9394
TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
9495
TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}});
9596
}
96-
9797
} // namespace xgboost::tree

0 commit comments

Comments
 (0)