Skip to content

Commit e29e5f5

Browse files
authored
UCP/PERF: Support for WARP level and improvements (#10906)
* UCP/PERF: Addressed PR comments * UCP/PERF: Replaced popcount with int field to improve performance * UCP/PERF: Refactored dispatch macro * UCP/PERF: Support for warp level * UCP/PERF: Added warp tests in CI * UCP/PERF: Cleanup * UCP/PERF: Speed up intermediate report * UCP/PERF: Reduce progress latency by 2us * UCP/PERF: Fix for ULL * UCP/PERF: Allow NULL comp * UCP/PERF: Flow control window * UCP/PERF: Flow control * UCP/PERF: Added pending_map * UCP: Coverity fix * UCP/PERF: Removed unused var * PERF: Use __threadfence_system * UCP/PERF: Enabled jenkins tests
1 parent c5d2df5 commit e29e5f5

File tree

8 files changed

+314
-204
lines changed

8 files changed

+314
-204
lines changed
Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#
22
# UCP basic device cuda tests
33
#
4+
ucp_device_cuda_single_lat_8b_1thread -t ucp_put_single_lat -m cuda -s 8 -n 10000
45
ucp_device_cuda_single_bw_1k_1thread -t ucp_put_single_bw -m cuda -s 1024 -n 10000
56
ucp_device_cuda_single_lat_1k_1thread -t ucp_put_single_lat -m cuda -s 1024 -n 10000
67
ucp_device_cuda_multi_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000
@@ -11,10 +12,16 @@ ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 2
1112
# Increase number of threads after following fixes:
1213
# - Use thread-local memory instead of shared for requests (limit 48K)
1314
# - Fix WQE size limit of 1024
14-
# TODO - enable when wqe reserve is fixed.
15-
# ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
16-
# ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
17-
# ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
18-
# ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
19-
# ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
20-
# ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
15+
ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
16+
ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
17+
ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
18+
ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
19+
ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
20+
ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
21+
22+
ucp_device_cuda_single_bw_1k_1warp -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32 -L warp
23+
ucp_device_cuda_single_lat_1k_1warp -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32 -L warp
24+
ucp_device_cuda_multi_bw_1k_1warp -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -L warp
25+
ucp_device_cuda_multi_lat_1k_1warp -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -L warp
26+
ucp_device_cuda_partial_bw_1k_1warp -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -L warp
27+
ucp_device_cuda_partial_lat_1k_1warp -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -L warp

src/tools/perf/api/libperf.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ typedef enum {
166166

167167
#define UCX_PERF_MEM_DEV_DEFAULT -1
168168

169+
#define UCP_PERF_FC_WINDOW_DEFAULT 4
169170

170171
/**
171172
* Performance counter type.
@@ -266,9 +267,9 @@ typedef struct ucx_perf_params {
266267
ucx_perf_wait_mode_t wait_mode; /* How to wait */
267268
ucs_memory_type_t send_mem_type; /* Send memory type */
268269
ucs_memory_type_t recv_mem_type; /* Recv memory type */
269-
ucx_perf_accel_dev_t send_device; /* Send memory device for gdaki */
270-
ucx_perf_accel_dev_t recv_device; /* Recv memory device for gdaki */
271-
ucs_device_level_t device_level; /* Device level for gdaki */
270+
ucx_perf_accel_dev_t send_device; /* Send memory device */
271+
ucx_perf_accel_dev_t recv_device; /* Recv memory device */
272+
ucs_device_level_t device_level; /* Device level */
272273
unsigned flags; /* See ucx_perf_test_flags. */
273274

274275
size_t *msg_size_list; /* Test message sizes list. The size
@@ -289,6 +290,7 @@ typedef struct ucx_perf_params {
289290
in latency tests */
290291
unsigned device_thread_count; /* Number of device threads */
291292
unsigned device_block_count; /* Number of device blocks */
293+
unsigned device_fc_window; /* Flow control window size for device tests */
292294

293295
void *rte_group; /* Opaque RTE group handle */
294296
ucx_perf_rte_t *rte; /* RTE functions used to exchange data */

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 95 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ typedef unsigned long long ucx_perf_cuda_time_t;
1818

1919
struct ucx_perf_cuda_context {
2020
unsigned max_outstanding;
21+
unsigned device_fc_window;
2122
ucx_perf_counter_t max_iters;
2223
ucx_perf_cuda_time_t report_interval_ns;
2324
ucx_perf_counter_t completed_iters;
@@ -32,22 +33,46 @@ UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
3233
return globaltimer;
3334
}
3435

35-
UCS_F_DEVICE void
36-
ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
37-
ucx_perf_counter_t completed,
38-
ucx_perf_counter_t max_iters,
39-
ucx_perf_cuda_time_t &last_report_time)
40-
{
41-
if (threadIdx.x == 0) {
42-
ucx_perf_cuda_time_t current_time = ucx_perf_cuda_get_time_ns();
43-
if (((current_time - last_report_time) >= ctx.report_interval_ns) ||
44-
(completed >= max_iters)) {
45-
ctx.completed_iters = completed;
46-
last_report_time = current_time;
47-
__threadfence();
36+
class ucx_perf_cuda_reporter {
37+
public:
38+
/* Number of updates per report interval */
39+
static const unsigned UPDATES_PER_INTERVAL = 5;
40+
41+
__device__
42+
ucx_perf_cuda_reporter(ucx_perf_cuda_context &ctx) :
43+
m_ctx(ctx),
44+
m_max_iters(ctx.max_iters),
45+
m_next_report_iter(1),
46+
m_last_report_time(ucx_perf_cuda_get_time_ns()),
47+
m_report_interval_ns(ctx.report_interval_ns / UPDATES_PER_INTERVAL)
48+
{
49+
}
50+
51+
__device__ inline void
52+
update_report(ucx_perf_counter_t completed)
53+
{
54+
if ((threadIdx.x == 0) && ucs_unlikely(completed >= m_next_report_iter)) {
55+
assert(completed - m_ctx.completed_iters > 0);
56+
ucx_perf_cuda_time_t cur_time = ucx_perf_cuda_get_time_ns();
57+
ucx_perf_cuda_time_t iter_time = (cur_time - m_last_report_time) /
58+
(completed - m_ctx.completed_iters);
59+
assert(iter_time > 0);
60+
m_last_report_time = cur_time;
61+
m_ctx.completed_iters = completed;
62+
__threadfence_system();
63+
64+
m_next_report_iter = ucs_min(completed + (m_report_interval_ns / iter_time),
65+
m_max_iters);
4866
}
4967
}
50-
}
68+
69+
private:
70+
ucx_perf_cuda_context &m_ctx;
71+
ucx_perf_counter_t m_max_iters;
72+
ucx_perf_counter_t m_next_report_iter;
73+
ucx_perf_cuda_time_t m_last_report_time;
74+
ucx_perf_cuda_time_t m_report_interval_ns;
75+
};
5176

5277
static UCS_F_ALWAYS_INLINE uint64_t *
5378
ucx_perf_cuda_get_sn(const void *address, size_t length)
@@ -63,95 +88,83 @@ UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
6388
__syncthreads();
6489
}
6590

66-
/* Simple bitset */
67-
#define UCX_BIT_MASK(bit) (1 << ((bit) & (CHAR_BIT - 1)))
68-
#define UCX_BIT_SET(set, bit) (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
69-
#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
70-
#define UCX_BIT_GET(set, bit) (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
71-
#define UCX_BITSET_SIZE(bits) ((bits + CHAR_BIT - 1) / CHAR_BIT)
72-
73-
UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
74-
size_t count = 0;
75-
for (size_t i = 0; i < bits; i++) {
76-
if (UCX_BIT_GET(set, i)) {
77-
count++;
78-
}
79-
}
80-
return count;
81-
}
82-
83-
UCS_F_DEVICE size_t
84-
ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from)
91+
template<ucs_device_level_t level>
92+
__host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index(size_t tid)
8593
{
86-
for (size_t i = from; i < bits; i++) {
87-
if (!UCX_BIT_GET(set, i)) {
88-
return i;
89-
}
94+
switch (level) {
95+
case UCS_DEVICE_LEVEL_THREAD: return tid;
96+
case UCS_DEVICE_LEVEL_WARP: return tid / UCS_DEVICE_NUM_THREADS_IN_WARP;
97+
default: return 0;
9098
}
91-
return bits;
9299
}
93100

94-
#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
95-
do { \
96-
switch (cmd) { \
97-
case UCX_PERF_CMD_PUT_SINGLE: \
98-
func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
99-
break; \
100-
case UCX_PERF_CMD_PUT_MULTI: \
101-
func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
102-
break; \
103-
case UCX_PERF_CMD_PUT_PARTIAL: \
104-
func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
105-
break; \
106-
default: \
107-
ucs_error("Unsupported cmd: %d", cmd); \
108-
break; \
109-
} \
110-
} while (0)
101+
#define UCX_PERF_THREAD_INDEX_SET(_level, _tid, _outval) \
102+
(_outval) = ucx_perf_cuda_thread_index<_level>(_tid)
103+
104+
#define UCX_PERF_SWITCH_CMD(_cmd, _func, ...) \
105+
switch (_cmd) { \
106+
case UCX_PERF_CMD_PUT_SINGLE: \
107+
_func(UCX_PERF_CMD_PUT_SINGLE, __VA_ARGS__); \
108+
break; \
109+
case UCX_PERF_CMD_PUT_MULTI: \
110+
_func(UCX_PERF_CMD_PUT_MULTI, __VA_ARGS__); \
111+
break; \
112+
case UCX_PERF_CMD_PUT_PARTIAL: \
113+
_func(UCX_PERF_CMD_PUT_PARTIAL, __VA_ARGS__); \
114+
break; \
115+
default: \
116+
ucs_error("Unsupported cmd: %d", _cmd); \
117+
break; \
118+
}
111119

112-
#define UCX_KERNEL_DISPATCH(perf, func, ...) \
113-
do { \
114-
ucs_device_level_t _level = perf.params.device_level; \
115-
ucx_perf_cmd_t _cmd = perf.params.command; \
116-
unsigned _blocks = perf.params.device_block_count; \
117-
unsigned _threads = perf.params.device_thread_count; \
118-
size_t _shared_size = _threads * perf.params.max_outstanding * \
119-
sizeof(ucp_device_request_t); \
120-
switch (_level) { \
120+
#define UCX_PERF_SWITCH_LEVEL(_level, _func, ...) \
121+
switch (_level) { \
121122
case UCS_DEVICE_LEVEL_THREAD: \
122-
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\
123-
_shared_size, func, __VA_ARGS__); \
123+
_func(UCS_DEVICE_LEVEL_THREAD, __VA_ARGS__); \
124124
break; \
125125
case UCS_DEVICE_LEVEL_WARP: \
126-
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\
127-
_shared_size, func, __VA_ARGS__); \
126+
_func(UCS_DEVICE_LEVEL_WARP, __VA_ARGS__); \
128127
break; \
129128
case UCS_DEVICE_LEVEL_BLOCK: \
130-
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\
131-
_shared_size, func, __VA_ARGS__); \
132-
break; \
133129
case UCS_DEVICE_LEVEL_GRID: \
134-
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\
135-
_shared_size, func, __VA_ARGS__); \
136-
break; \
137130
default: \
138131
ucs_error("Unsupported level: %d", _level); \
139132
break; \
140-
} \
133+
}
134+
135+
#define UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL(_cmd, _level, _perf, _kernel, ...) \
136+
do { \
137+
unsigned _blocks = _perf.params.device_block_count; \
138+
unsigned _threads = _perf.params.device_thread_count; \
139+
unsigned _reqs_count = ucs_div_round_up(_perf.params.max_outstanding, \
140+
_perf.params.device_fc_window); \
141+
size_t _shared_size = _reqs_count * sizeof(ucp_device_request_t) * \
142+
ucx_perf_cuda_thread_index<_level>(_threads); \
143+
_kernel<_level, _cmd><<<_blocks, _threads, _shared_size>>>(__VA_ARGS__); \
141144
} while (0)
142145

146+
#define UCX_PERF_KERNEL_DISPATCH_CMD(_level, _perf, _kernel, ...) \
147+
UCX_PERF_SWITCH_CMD(_perf.params.command, UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL, \
148+
_level, _perf, _kernel, __VA_ARGS__);
149+
150+
#define UCX_PERF_KERNEL_DISPATCH(_perf, _kernel, ...) \
151+
UCX_PERF_SWITCH_LEVEL(_perf.params.device_level, UCX_PERF_KERNEL_DISPATCH_CMD, \
152+
_perf, _kernel, __VA_ARGS__);
153+
154+
143155
class ucx_perf_cuda_test_runner {
144156
public:
145157
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
146158
{
147159
init_ctx();
148160

149161
m_cpu_ctx->max_outstanding = perf.params.max_outstanding;
162+
m_cpu_ctx->device_fc_window = perf.params.device_fc_window;
150163
m_cpu_ctx->max_iters = perf.max_iter;
151164
m_cpu_ctx->completed_iters = 0;
152165
m_cpu_ctx->report_interval_ns = (perf.report_interval == ULONG_MAX) ?
153166
ULONG_MAX :
154-
ucs_time_to_nsec(perf.report_interval) / 100;
167+
ucs_time_to_nsec(perf.report_interval);
155168
m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
156169
}
157170

@@ -166,11 +179,16 @@ public:
166179
ucx_perf_counter_t last_completed = 0;
167180
ucx_perf_counter_t completed = m_cpu_ctx->completed_iters;
168181
unsigned thread_count = m_perf.params.device_thread_count;
182+
ucs_device_level_t level = m_perf.params.device_level;
183+
unsigned msgs_per_iter;
184+
UCX_PERF_SWITCH_LEVEL(level, UCX_PERF_THREAD_INDEX_SET, thread_count,
185+
msgs_per_iter);
186+
169187
while (true) {
170188
ucx_perf_counter_t delta = completed - last_completed;
171189
if (delta > 0) {
172190
// TODO: calculate latency percentile on kernel
173-
ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
191+
ucx_perf_update(&m_perf, delta, delta * msgs_per_iter, msg_length);
174192
} else if (completed >= m_perf.max_iter) {
175193
break;
176194
}

0 commit comments

Comments
 (0)