Skip to content

Commit 958423b

Browse files
committed
Merge branch '10906' into perf_chan_id
2 parents 87e88f7 + 9313ce3 commit 958423b

File tree

9 files changed

+310
-202
lines changed

9 files changed

+310
-202
lines changed

contrib/test_jenkins.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,7 @@ run_ucx_perftest_cuda_device() {
666666
for tls in "$gda_tls"
667667
do
668668
export UCX_TLS=${tls}
669+
# TODO: Enable these tests
669670
run_client_server_app "$ucx_perftest" "$ucp_test_args" "$ucp_client_args" 0 0
670671
done
671672
unset UCX_TLS
Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#
22
# UCP basic device cuda tests
33
#
4+
ucp_device_cuda_single_lat_8b_1thread -t ucp_put_single_lat -m cuda -s 8 -n 10000
45
ucp_device_cuda_single_bw_1k_1thread -t ucp_put_single_bw -m cuda -s 1024 -n 10000
56
ucp_device_cuda_single_lat_1k_1thread -t ucp_put_single_lat -m cuda -s 1024 -n 10000
67
ucp_device_cuda_multi_bw_1k_1thread -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000
@@ -11,10 +12,16 @@ ucp_device_cuda_partial_lat_1k_1thread -t ucp_put_partial_lat -m cuda -s 2
1112
# Increase number of threads after following fixes:
1213
# - Use thread-local memory instead of shared for requests (limit 48K)
1314
# - Fix WQE size limit of 1024
14-
# TODO - enable when wqe reserve is fixed.
15-
# ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
16-
# ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
17-
# ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
18-
# ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
19-
# ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
20-
# ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
15+
ucp_device_cuda_single_bw_1k_32threads -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32
16+
ucp_device_cuda_single_lat_1k_32threads -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32
17+
ucp_device_cuda_multi_bw_1k_32threads -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
18+
ucp_device_cuda_multi_lat_1k_32threads -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
19+
ucp_device_cuda_partial_bw_1k_32threads -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -O 2
20+
ucp_device_cuda_partial_lat_1k_32threads -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -O 2
21+
22+
ucp_device_cuda_single_bw_1k_1warp -t ucp_put_single_bw -m cuda -s 1024 -n 10000 -T 32 -L warp
23+
ucp_device_cuda_single_lat_1k_1warp -t ucp_put_single_lat -m cuda -s 1024 -n 10000 -T 32 -L warp
24+
ucp_device_cuda_multi_bw_1k_1warp -t ucp_put_multi_bw -m cuda -s 256:8 -n 10000 -T 32 -L warp
25+
ucp_device_cuda_multi_lat_1k_1warp -t ucp_put_multi_lat -m cuda -s 256:8 -n 10000 -T 32 -L warp
26+
ucp_device_cuda_partial_bw_1k_1warp -t ucp_put_partial_bw -m cuda -s 256:8 -n 10000 -T 32 -L warp
27+
ucp_device_cuda_partial_lat_1k_1warp -t ucp_put_partial_lat -m cuda -s 256:8 -n 10000 -T 32 -L warp

src/tools/perf/api/libperf.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ typedef enum {
166166

167167
#define UCX_PERF_MEM_DEV_DEFAULT -1
168168

169+
#define UCP_PERF_FC_WINDOW_DEFAULT 4
169170

170171
/**
171172
* Performance counter type.
@@ -266,9 +267,9 @@ typedef struct ucx_perf_params {
266267
ucx_perf_wait_mode_t wait_mode; /* How to wait */
267268
ucs_memory_type_t send_mem_type; /* Send memory type */
268269
ucs_memory_type_t recv_mem_type; /* Recv memory type */
269-
ucx_perf_accel_dev_t send_device; /* Send memory device for gdaki */
270-
ucx_perf_accel_dev_t recv_device; /* Recv memory device for gdaki */
271-
ucs_device_level_t device_level; /* Device level for gdaki */
270+
ucx_perf_accel_dev_t send_device; /* Send memory device */
271+
ucx_perf_accel_dev_t recv_device; /* Recv memory device */
272+
ucs_device_level_t device_level; /* Device level */
272273
unsigned flags; /* See ucx_perf_test_flags. */
273274

274275
size_t *msg_size_list; /* Test message sizes list. The size
@@ -289,6 +290,7 @@ typedef struct ucx_perf_params {
289290
in latency tests */
290291
unsigned device_thread_count; /* Number of device threads */
291292
unsigned device_block_count; /* Number of device blocks */
293+
unsigned device_fc_window; /* Flow control window size for device tests */
292294

293295
void *rte_group; /* Opaque RTE group handle */
294296
ucx_perf_rte_t *rte; /* RTE functions used to exchange data */

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 90 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ typedef unsigned long long ucx_perf_cuda_time_t;
1818

1919
struct ucx_perf_cuda_context {
2020
unsigned max_outstanding;
21+
unsigned device_fc_window;
2122
ucx_perf_counter_t max_iters;
2223
ucx_perf_cuda_time_t report_interval_ns;
2324
ucx_perf_counter_t completed_iters;
@@ -32,22 +33,41 @@ UCS_F_DEVICE ucx_perf_cuda_time_t ucx_perf_cuda_get_time_ns()
3233
return globaltimer;
3334
}
3435

35-
UCS_F_DEVICE void
36-
ucx_perf_cuda_update_report(ucx_perf_cuda_context &ctx,
37-
ucx_perf_counter_t completed,
38-
ucx_perf_counter_t max_iters,
39-
ucx_perf_cuda_time_t &last_report_time)
40-
{
41-
if (threadIdx.x == 0) {
42-
ucx_perf_cuda_time_t current_time = ucx_perf_cuda_get_time_ns();
43-
if (((current_time - last_report_time) >= ctx.report_interval_ns) ||
44-
(completed >= max_iters)) {
45-
ctx.completed_iters = completed;
46-
last_report_time = current_time;
47-
__threadfence();
36+
class ucx_perf_cuda_reporter {
37+
public:
38+
__device__
39+
ucx_perf_cuda_reporter(ucx_perf_cuda_context &ctx) :
40+
m_ctx(ctx),
41+
m_max_iters(ctx.max_iters),
42+
m_next_report_iter(1),
43+
m_last_report_time(ucx_perf_cuda_get_time_ns()),
44+
m_report_interval_ns(ctx.report_interval_ns / 5)
45+
{
46+
}
47+
48+
__device__ inline void
49+
update_report(ucx_perf_counter_t completed)
50+
{
51+
if ((threadIdx.x == 0) && ucs_unlikely(completed >= m_next_report_iter)) {
52+
ucx_perf_cuda_time_t cur_time = ucx_perf_cuda_get_time_ns();
53+
ucx_perf_cuda_time_t iter_time = (cur_time - m_last_report_time) /
54+
(completed - m_ctx.completed_iters);
55+
m_last_report_time = cur_time;
56+
m_ctx.completed_iters = completed;
57+
__threadfence_system();
58+
59+
m_next_report_iter = ucs_min(completed + (m_report_interval_ns / iter_time),
60+
m_max_iters);
4861
}
4962
}
50-
}
63+
64+
private:
65+
ucx_perf_cuda_context &m_ctx;
66+
ucx_perf_counter_t m_max_iters;
67+
ucx_perf_counter_t m_next_report_iter;
68+
ucx_perf_cuda_time_t m_last_report_time;
69+
ucx_perf_cuda_time_t m_report_interval_ns;
70+
};
5171

5272
static UCS_F_ALWAYS_INLINE uint64_t *
5373
ucx_perf_cuda_get_sn(const void *address, size_t length)
@@ -63,95 +83,83 @@ UCS_F_DEVICE void ucx_perf_cuda_wait_sn(const uint64_t *sn, uint64_t value)
6383
__syncthreads();
6484
}
6585

66-
/* Simple bitset */
67-
#define UCX_BIT_MASK(bit) (1 << ((bit) & (CHAR_BIT - 1)))
68-
#define UCX_BIT_SET(set, bit) (set[(bit)/CHAR_BIT] |= UCX_BIT_MASK(bit))
69-
#define UCX_BIT_RESET(set, bit) (set[(bit)/CHAR_BIT] &= ~UCX_BIT_MASK(bit))
70-
#define UCX_BIT_GET(set, bit) (set[(bit)/CHAR_BIT] & UCX_BIT_MASK(bit))
71-
#define UCX_BITSET_SIZE(bits) ((bits + CHAR_BIT - 1) / CHAR_BIT)
72-
73-
UCS_F_DEVICE size_t ucx_bitset_popcount(const uint8_t *set, size_t bits) {
74-
size_t count = 0;
75-
for (size_t i = 0; i < bits; i++) {
76-
if (UCX_BIT_GET(set, i)) {
77-
count++;
78-
}
79-
}
80-
return count;
81-
}
82-
83-
UCS_F_DEVICE size_t
84-
ucx_bitset_ffns(const uint8_t *set, size_t bits, size_t from)
86+
template<ucs_device_level_t level>
87+
__host__ UCS_F_DEVICE unsigned ucx_perf_cuda_thread_index(size_t tid)
8588
{
86-
for (size_t i = from; i < bits; i++) {
87-
if (!UCX_BIT_GET(set, i)) {
88-
return i;
89-
}
89+
switch (level) {
90+
case UCS_DEVICE_LEVEL_THREAD: return tid;
91+
case UCS_DEVICE_LEVEL_WARP: return tid / UCS_DEVICE_NUM_THREADS_IN_WARP;
92+
default: return 0;
9093
}
91-
return bits;
9294
}
9395

94-
#define UCX_KERNEL_CMD(level, cmd, blocks, threads, shared_size, func, ...) \
95-
do { \
96-
switch (cmd) { \
97-
case UCX_PERF_CMD_PUT_SINGLE: \
98-
func<level, UCX_PERF_CMD_PUT_SINGLE><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
99-
break; \
100-
case UCX_PERF_CMD_PUT_MULTI: \
101-
func<level, UCX_PERF_CMD_PUT_MULTI><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
102-
break; \
103-
case UCX_PERF_CMD_PUT_PARTIAL: \
104-
func<level, UCX_PERF_CMD_PUT_PARTIAL><<<blocks, threads, shared_size>>>(__VA_ARGS__); \
105-
break; \
106-
default: \
107-
ucs_error("Unsupported cmd: %d", cmd); \
108-
break; \
109-
} \
110-
} while (0)
96+
#define UCX_PERF_THREAD_INDEX_SET(_level, _tid, _outval) \
97+
(_outval) = ucx_perf_cuda_thread_index<_level>(_tid)
98+
99+
#define UCX_PERF_SWITCH_CMD(_cmd, _func, ...) \
100+
switch (_cmd) { \
101+
case UCX_PERF_CMD_PUT_SINGLE: \
102+
_func(UCX_PERF_CMD_PUT_SINGLE, __VA_ARGS__); \
103+
break; \
104+
case UCX_PERF_CMD_PUT_MULTI: \
105+
_func(UCX_PERF_CMD_PUT_MULTI, __VA_ARGS__); \
106+
break; \
107+
case UCX_PERF_CMD_PUT_PARTIAL: \
108+
_func(UCX_PERF_CMD_PUT_PARTIAL, __VA_ARGS__); \
109+
break; \
110+
default: \
111+
ucs_error("Unsupported cmd: %d", _cmd); \
112+
break; \
113+
}
111114

112-
#define UCX_KERNEL_DISPATCH(perf, func, ...) \
113-
do { \
114-
ucs_device_level_t _level = perf.params.device_level; \
115-
ucx_perf_cmd_t _cmd = perf.params.command; \
116-
unsigned _blocks = perf.params.device_block_count; \
117-
unsigned _threads = perf.params.device_thread_count; \
118-
size_t _shared_size = _threads * perf.params.max_outstanding * \
119-
sizeof(ucp_device_request_t); \
120-
switch (_level) { \
115+
#define UCX_PERF_SWITCH_LEVEL(_level, _func, ...) \
116+
switch (_level) { \
121117
case UCS_DEVICE_LEVEL_THREAD: \
122-
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_THREAD, _cmd, _blocks, _threads,\
123-
_shared_size, func, __VA_ARGS__); \
118+
_func(UCS_DEVICE_LEVEL_THREAD, __VA_ARGS__); \
124119
break; \
125120
case UCS_DEVICE_LEVEL_WARP: \
126-
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_WARP, _cmd, _blocks, _threads,\
127-
_shared_size, func, __VA_ARGS__); \
121+
_func(UCS_DEVICE_LEVEL_WARP, __VA_ARGS__); \
128122
break; \
129123
case UCS_DEVICE_LEVEL_BLOCK: \
130-
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_BLOCK, _cmd, _blocks, _threads,\
131-
_shared_size, func, __VA_ARGS__); \
132-
break; \
133124
case UCS_DEVICE_LEVEL_GRID: \
134-
UCX_KERNEL_CMD(UCS_DEVICE_LEVEL_GRID, _cmd, _blocks, _threads,\
135-
_shared_size, func, __VA_ARGS__); \
136-
break; \
137125
default: \
138126
ucs_error("Unsupported level: %d", _level); \
139127
break; \
140-
} \
128+
}
129+
130+
#define UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL(_cmd, _level, _perf, _kernel, ...) \
131+
do { \
132+
unsigned _blocks = _perf.params.device_block_count; \
133+
unsigned _threads = _perf.params.device_thread_count; \
134+
unsigned _reqs_count = ucs_div_round_up(_perf.params.max_outstanding, \
135+
_perf.params.device_fc_window); \
136+
size_t _shared_size = _reqs_count * sizeof(ucp_device_request_t) * \
137+
ucx_perf_cuda_thread_index<_level>(_threads); \
138+
_kernel<_level, _cmd><<<_blocks, _threads, _shared_size>>>(__VA_ARGS__); \
141139
} while (0)
142140

141+
#define UCX_PERF_KERNEL_DISPATCH_CMD(_level, _perf, _kernel, ...) \
142+
UCX_PERF_SWITCH_CMD(_perf.params.command, UCX_PERF_KERNEL_DISPATCH_CMD_LEVEL, \
143+
_level, _perf, _kernel, __VA_ARGS__);
144+
145+
#define UCX_PERF_KERNEL_DISPATCH(_perf, _kernel, ...) \
146+
UCX_PERF_SWITCH_LEVEL(_perf.params.device_level, UCX_PERF_KERNEL_DISPATCH_CMD, \
147+
_perf, _kernel, __VA_ARGS__);
148+
149+
143150
class ucx_perf_cuda_test_runner {
144151
public:
145152
ucx_perf_cuda_test_runner(ucx_perf_context_t &perf) : m_perf(perf)
146153
{
147154
init_ctx();
148155

149156
m_cpu_ctx->max_outstanding = perf.params.max_outstanding;
157+
m_cpu_ctx->device_fc_window = perf.params.device_fc_window;
150158
m_cpu_ctx->max_iters = perf.max_iter;
151159
m_cpu_ctx->completed_iters = 0;
152160
m_cpu_ctx->report_interval_ns = (perf.report_interval == ULONG_MAX) ?
153161
ULONG_MAX :
154-
ucs_time_to_nsec(perf.report_interval) / 100;
162+
ucs_time_to_nsec(perf.report_interval);
155163
m_cpu_ctx->status = UCS_ERR_NOT_IMPLEMENTED;
156164
}
157165

@@ -166,11 +174,16 @@ public:
166174
ucx_perf_counter_t last_completed = 0;
167175
ucx_perf_counter_t completed = m_cpu_ctx->completed_iters;
168176
unsigned thread_count = m_perf.params.device_thread_count;
177+
ucs_device_level_t level = m_perf.params.device_level;
178+
unsigned msgs_per_iter;
179+
UCX_PERF_SWITCH_LEVEL(level, UCX_PERF_THREAD_INDEX_SET, thread_count,
180+
msgs_per_iter);
181+
169182
while (true) {
170183
ucx_perf_counter_t delta = completed - last_completed;
171184
if (delta > 0) {
172185
// TODO: calculate latency percentile on kernel
173-
ucx_perf_update(&m_perf, delta, delta * thread_count, msg_length);
186+
ucx_perf_update(&m_perf, delta, delta * msgs_per_iter, msg_length);
174187
} else if (completed >= m_perf.max_iter) {
175188
break;
176189
}

0 commit comments

Comments
 (0)