Skip to content

Commit 7355ab4

Browse files
committed
TOOLS/DEVICE: support channel id in perftest
1 parent 958423b commit 7355ab4

File tree

5 files changed

+57
-41
lines changed

5 files changed

+57
-41
lines changed

src/tools/perf/api/libperf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ typedef struct ucx_perf_params {
288288
double report_interval; /* Interval at which to call the report callback */
289289
double percentile_rank; /* The percentile rank of the percentile reported
290290
in latency tests */
291+
unsigned device_ep_channel_count; /* Number of channels for each ucp device endpoint */
291292
unsigned device_thread_count; /* Number of device threads */
292293
unsigned device_block_count; /* Number of device blocks */
293294
unsigned device_fc_window; /* Flow control window size for device tests */

src/tools/perf/cuda/cuda_kernel.cuh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
typedef unsigned long long ucx_perf_cuda_time_t;
1818

1919
struct ucx_perf_cuda_context {
20+
unsigned num_channels;
2021
unsigned max_outstanding;
2122
unsigned device_fc_window;
2223
ucx_perf_counter_t max_iters;
@@ -153,6 +154,7 @@ public:
153154
{
154155
init_ctx();
155156

157+
m_cpu_ctx->num_channels = perf.params.device_ep_channel_count;
156158
m_cpu_ctx->max_outstanding = perf.params.max_outstanding;
157159
m_cpu_ctx->device_fc_window = perf.params.device_fc_window;
158160
m_cpu_ctx->max_iters = perf.max_iter;

src/tools/perf/cuda/ucp_cuda_kernel.cu

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ private:
110110
struct ucp_perf_cuda_params {
111111
ucp_device_mem_list_handle_h mem_list;
112112
size_t length;
113+
unsigned num_channels;
113114
unsigned *indices;
114115
size_t *local_offsets;
115116
size_t *remote_offsets;
@@ -122,6 +123,7 @@ class ucp_perf_cuda_params_handler {
122123
public:
123124
ucp_perf_cuda_params_handler(const ucx_perf_context_t &perf)
124125
{
126+
init_params(perf);
125127
init_mem_list(perf);
126128
init_elements(perf);
127129
init_counters(perf);
@@ -144,6 +146,11 @@ private:
144146
return (perf.params.command != UCX_PERF_CMD_PUT_SINGLE);
145147
}
146148

149+
void init_params(const ucx_perf_context_t &perf)
150+
{
151+
m_params.num_channels = perf.params.device_ep_channel_count;
152+
}
153+
147154
void init_mem_list(const ucx_perf_context_t &perf)
148155
{
149156
size_t data_count = perf.params.msg_size_cnt;
@@ -243,16 +250,19 @@ ucp_perf_cuda_send_async(const ucp_perf_cuda_params &params,
243250
ucx_perf_counter_t idx, ucp_device_request_t *req,
244251
ucp_device_flags_t flags = UCP_DEVICE_FLAG_NODELAY)
245252
{
253+
const unsigned channel_id = threadIdx.x % params.num_channels;
254+
246255
switch (cmd) {
247256
case UCX_PERF_CMD_PUT_SINGLE:
248257
/* TODO: Change to ucp_device_counter_write */
249258
*params.counter_send = idx + 1;
250259
return ucp_device_put_single<level>(params.mem_list, params.indices[0],
251260
0, 0,
252261
params.length + ONESIDED_SIGNAL_SIZE,
253-
0, flags, req);
262+
channel_id, flags, req);
254263
case UCX_PERF_CMD_PUT_MULTI:
255-
return ucp_device_put_multi<level>(params.mem_list, 1, 0, flags, req);
264+
return ucp_device_put_multi<level>(params.mem_list, 1, channel_id,
265+
flags, req);
256266
case UCX_PERF_CMD_PUT_PARTIAL: {
257267
unsigned counter_index = params.mem_list->mem_list_length - 1;
258268
return ucp_device_put_multi_partial<level>(params.mem_list,
@@ -261,8 +271,8 @@ ucp_perf_cuda_send_async(const ucp_perf_cuda_params &params,
261271
params.local_offsets,
262272
params.remote_offsets,
263273
params.lengths,
264-
counter_index, 1, 0, 0,
265-
flags, req);
274+
counter_index, 1, 0,
275+
channel_id, flags, req);
266276
}
267277
}
268278

src/tools/perf/perftest.c

Lines changed: 36 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -192,41 +192,42 @@ ucs_status_t init_test_params(perftest_params_t *params)
192192
{UCS_MEMORY_TYPE_LAST, UCX_PERF_MEM_DEV_DEFAULT};
193193

194194
memset(params, 0, sizeof(*params));
195-
params->super.api = UCX_PERF_API_LAST;
196-
params->super.command = UCX_PERF_CMD_LAST;
197-
params->super.test_type = UCX_PERF_TEST_TYPE_LAST;
198-
params->super.thread_mode = UCS_THREAD_MODE_SERIALIZED;
199-
params->super.thread_count = 1;
200-
params->super.async_mode = UCS_ASYNC_THREAD_LOCK_TYPE;
201-
params->super.wait_mode = UCX_PERF_WAIT_MODE_LAST;
202-
params->super.max_outstanding = 0;
203-
params->super.warmup_iter = 10000;
204-
params->super.warmup_time = 100e-3;
205-
params->super.alignment = ucs_get_page_size();
206-
params->super.max_iter = 1000000l;
207-
params->super.max_time = 0.0;
208-
params->super.report_interval = 1.0;
209-
params->super.percentile_rank = 50.0;
210-
params->super.flags = UCX_PERF_TEST_FLAG_VERBOSE;
211-
params->super.uct.fc_window = UCT_PERF_TEST_MAX_FC_WINDOW;
212-
params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT;
213-
params->super.uct.am_hdr_size = 8;
214-
params->super.send_mem_type = UCS_MEMORY_TYPE_HOST;
215-
params->super.recv_mem_type = UCS_MEMORY_TYPE_HOST;
216-
params->super.send_device = default_dev;
217-
params->super.recv_device = default_dev;
218-
params->super.device_level = UCS_DEVICE_LEVEL_THREAD;
219-
params->super.msg_size_cnt = 1;
220-
params->super.iov_stride = 0;
221-
params->super.ucp.send_datatype = UCP_PERF_DATATYPE_CONTIG;
222-
params->super.ucp.recv_datatype = UCP_PERF_DATATYPE_CONTIG;
223-
params->super.ucp.am_hdr_size = 0;
224-
params->super.device_thread_count = 1;
225-
params->super.device_block_count = 1;
226-
params->super.device_fc_window = UCP_PERF_FC_WINDOW_DEFAULT;
227-
params->super.ucp.is_daemon_mode = 0;
228-
params->super.ucp.dmn_local_addr = empty_addr;
229-
params->super.ucp.dmn_remote_addr = empty_addr;
195+
params->super.api = UCX_PERF_API_LAST;
196+
params->super.command = UCX_PERF_CMD_LAST;
197+
params->super.test_type = UCX_PERF_TEST_TYPE_LAST;
198+
params->super.thread_mode = UCS_THREAD_MODE_SERIALIZED;
199+
params->super.thread_count = 1;
200+
params->super.async_mode = UCS_ASYNC_THREAD_LOCK_TYPE;
201+
params->super.wait_mode = UCX_PERF_WAIT_MODE_LAST;
202+
params->super.max_outstanding = 0;
203+
params->super.warmup_iter = 10000;
204+
params->super.warmup_time = 100e-3;
205+
params->super.alignment = ucs_get_page_size();
206+
params->super.max_iter = 1000000l;
207+
params->super.max_time = 0.0;
208+
params->super.report_interval = 1.0;
209+
params->super.percentile_rank = 50.0;
210+
params->super.flags = UCX_PERF_TEST_FLAG_VERBOSE;
211+
params->super.uct.fc_window = UCT_PERF_TEST_MAX_FC_WINDOW;
212+
params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT;
213+
params->super.uct.am_hdr_size = 8;
214+
params->super.send_mem_type = UCS_MEMORY_TYPE_HOST;
215+
params->super.recv_mem_type = UCS_MEMORY_TYPE_HOST;
216+
params->super.send_device = default_dev;
217+
params->super.recv_device = default_dev;
218+
params->super.device_level = UCS_DEVICE_LEVEL_THREAD;
219+
params->super.msg_size_cnt = 1;
220+
params->super.iov_stride = 0;
221+
params->super.ucp.send_datatype = UCP_PERF_DATATYPE_CONTIG;
222+
params->super.ucp.recv_datatype = UCP_PERF_DATATYPE_CONTIG;
223+
params->super.ucp.am_hdr_size = 0;
224+
params->super.device_ep_channel_count = 1;
225+
params->super.device_thread_count = 1;
226+
params->super.device_block_count = 1;
227+
params->super.device_fc_window = UCP_PERF_FC_WINDOW_DEFAULT;
228+
params->super.ucp.is_daemon_mode = 0;
229+
params->super.ucp.dmn_local_addr = empty_addr;
230+
params->super.ucp.dmn_remote_addr = empty_addr;
230231
strcpy(params->super.uct.dev_name, TL_RESOURCE_NAME_NONE);
231232
strcpy(params->super.uct.tl_name, TL_RESOURCE_NAME_NONE);
232233

src/tools/perf/perftest_params.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -756,9 +756,11 @@ ucs_status_t adjust_test_params(perftest_params_t *params,
756756
}
757757

758758
if (params->super.send_device.mem_type != UCS_MEMORY_TYPE_LAST) {
759+
/* TODO: read number of channels from ucp config */
760+
params->super.device_ep_channel_count = 1;
759761
/* TODO: Add getter function for thread count */
760-
params->super.device_thread_count = params->super.thread_count;
761-
params->super.thread_count = 1;
762+
params->super.device_thread_count = params->super.thread_count;
763+
params->super.thread_count = 1;
762764
}
763765

764766
return UCS_OK;

0 commit comments

Comments
 (0)