Skip to content

Commit 8d28eb6

Browse files
committed
UCP/PERF: CR fixes
1 parent 59ea068 commit 8d28eb6

File tree

2 files changed

+6
-11
lines changed

2 files changed

+6
-11
lines changed

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,16 +88,11 @@ __host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index(size_t tid)
8888
{
8989
switch (level) {
9090
case UCS_DEVICE_LEVEL_THREAD: return tid;
91-
/* TODO: use UCS_DEVICE_NUM_THREADS_IN_WARP */
92-
case UCS_DEVICE_LEVEL_WARP: return tid / 32;
91+
case UCS_DEVICE_LEVEL_WARP: return tid / UCS_DEVICE_NUM_THREADS_IN_WARP;
9392
default: return 0;
9493
}
9594
}
9695

97-
__host__ UCS_F_DEVICE unsigned ucx_ceil_div(unsigned x, unsigned y) {
98-
return (x / y) + ((x % y) != 0);
99-
}
100-
10196
#define UCX_PERF_THREAD_INDEX_SET(_level, _tid, _outval) \
10297
(_outval) = ucx_perf_cuda_thread_index<_level>(_tid)
10398

@@ -136,8 +131,8 @@ __host__ UCS_F_DEVICE unsigned ucx_ceil_div(unsigned x, unsigned y) {
136131
do { \
137132
unsigned _blocks = _perf.params.device_block_count; \
138133
unsigned _threads = _perf.params.device_thread_count; \
139-
unsigned _reqs_count = ucx_ceil_div(_perf.params.max_outstanding, \
140-
_perf.params.device_fc_window); \
134+
unsigned _reqs_count = ucs_div_round_up(_perf.params.max_outstanding, \
135+
_perf.params.device_fc_window); \
141136
size_t _shared_size = _reqs_count * sizeof(ucp_device_request_t) * \
142137
ucx_perf_cuda_thread_index<_level>(_threads); \
143138
_kernel<_level, _cmd><<<_blocks, _threads, _shared_size>>>(__VA_ARGS__); \

src/tools/perf/cuda/ucp_cuda_kernel.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public:
2626
ucp_device_request_t *requests)
2727
: m_size(size),
2828
m_fc_window(fc_window),
29-
m_reqs_count(ucx_ceil_div(size, fc_window)),
29+
m_reqs_count(ucs_div_round_up(size, fc_window)),
3030
m_pending_count(0),
3131
m_requests(requests),
3232
m_pending_map(0)
@@ -357,8 +357,8 @@ ucp_perf_cuda_put_bw_kernel(ucx_perf_cuda_context &ctx,
357357
{
358358
extern __shared__ ucp_device_request_t shared_requests[];
359359
unsigned thread_index = ucx_perf_cuda_thread_index<level>(threadIdx.x);
360-
unsigned reqs_count = ucx_ceil_div(ctx.max_outstanding,
361-
ctx.device_fc_window);
360+
unsigned reqs_count = ucs_div_round_up(ctx.max_outstanding,
361+
ctx.device_fc_window);
362362
ucp_device_request_t *reqs = &shared_requests[reqs_count * thread_index];
363363

364364
ucp_perf_cuda_request_manager req_mgr(ctx.max_outstanding,

0 commit comments

Comments
 (0)