Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/tools/perf/api/libperf.h
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ typedef struct ucx_perf_params {
double report_interval; /* Interval at which to call the report callback */
double percentile_rank; /* The percentile rank of the percentile reported
in latency tests */
unsigned device_ep_channel_count; /* Number of channels for each ucp device endpoint */
unsigned device_thread_count; /* Number of device threads */
unsigned device_block_count; /* Number of device blocks */
unsigned device_fc_window; /* Flow control window size for device tests */
Expand Down
2 changes: 2 additions & 0 deletions src/tools/perf/cuda/cuda_kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
typedef unsigned long long ucx_perf_cuda_time_t;

struct ucx_perf_cuda_context {
unsigned num_channels;
unsigned max_outstanding;
unsigned device_fc_window;
ucx_perf_counter_t max_iters;
Expand Down Expand Up @@ -158,6 +159,7 @@ public:
{
init_ctx();

m_cpu_ctx->num_channels = perf.params.device_ep_channel_count;
m_cpu_ctx->max_outstanding = perf.params.max_outstanding;
m_cpu_ctx->device_fc_window = perf.params.device_fc_window;
m_cpu_ctx->max_iters = perf.max_iter;
Expand Down
13 changes: 9 additions & 4 deletions src/tools/perf/cuda/ucp_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ private:
struct ucp_perf_cuda_params {
ucp_device_mem_list_handle_h mem_list;
size_t length;
unsigned num_channels;
unsigned *indices;
size_t *local_offsets;
size_t *remote_offsets;
Expand All @@ -122,6 +123,7 @@ class ucp_perf_cuda_params_handler {
public:
ucp_perf_cuda_params_handler(const ucx_perf_context_t &perf)
{
m_params.num_channels = perf.params.device_ep_channel_count;
init_mem_list(perf);
init_elements(perf);
init_counters(perf);
Expand Down Expand Up @@ -243,15 +245,18 @@ ucp_perf_cuda_send_async(const ucp_perf_cuda_params &params,
ucx_perf_counter_t idx, ucp_device_request_t *req,
ucp_device_flags_t flags = UCP_DEVICE_FLAG_NODELAY)
{
const unsigned channel_id = threadIdx.x % params.num_channels;

switch (cmd) {
case UCX_PERF_CMD_PUT_SINGLE:
*params.counter_send = idx + 1;
return ucp_device_put_single<level>(params.mem_list, params.indices[0],
0, 0,
params.length + ONESIDED_SIGNAL_SIZE,
0, flags, req);
channel_id, flags, req);
case UCX_PERF_CMD_PUT_MULTI:
return ucp_device_put_multi<level>(params.mem_list, 1, 0, flags, req);
return ucp_device_put_multi<level>(params.mem_list, 1, channel_id,
flags, req);
case UCX_PERF_CMD_PUT_PARTIAL: {
unsigned counter_index = params.mem_list->mem_list_length - 1;
return ucp_device_put_multi_partial<level>(params.mem_list,
Expand All @@ -260,8 +265,8 @@ ucp_perf_cuda_send_async(const ucp_perf_cuda_params &params,
params.local_offsets,
params.remote_offsets,
params.lengths,
counter_index, 1, 0, 0,
flags, req);
counter_index, 1, 0,
channel_id, flags, req);
}
}

Expand Down
71 changes: 36 additions & 35 deletions src/tools/perf/perftest.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,41 +192,42 @@ ucs_status_t init_test_params(perftest_params_t *params)
{UCS_MEMORY_TYPE_LAST, UCX_PERF_MEM_DEV_DEFAULT};

memset(params, 0, sizeof(*params));
params->super.api = UCX_PERF_API_LAST;
params->super.command = UCX_PERF_CMD_LAST;
params->super.test_type = UCX_PERF_TEST_TYPE_LAST;
params->super.thread_mode = UCS_THREAD_MODE_SERIALIZED;
params->super.thread_count = 1;
params->super.async_mode = UCS_ASYNC_THREAD_LOCK_TYPE;
params->super.wait_mode = UCX_PERF_WAIT_MODE_LAST;
params->super.max_outstanding = 0;
params->super.warmup_iter = 10000;
params->super.warmup_time = 100e-3;
params->super.alignment = ucs_get_page_size();
params->super.max_iter = 1000000l;
params->super.max_time = 0.0;
params->super.report_interval = 1.0;
params->super.percentile_rank = 50.0;
params->super.flags = UCX_PERF_TEST_FLAG_VERBOSE;
params->super.uct.fc_window = UCT_PERF_TEST_MAX_FC_WINDOW;
params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT;
params->super.uct.am_hdr_size = 8;
params->super.send_mem_type = UCS_MEMORY_TYPE_HOST;
params->super.recv_mem_type = UCS_MEMORY_TYPE_HOST;
params->super.send_device = default_dev;
params->super.recv_device = default_dev;
params->super.device_level = UCS_DEVICE_LEVEL_THREAD;
params->super.msg_size_cnt = 1;
params->super.iov_stride = 0;
params->super.ucp.send_datatype = UCP_PERF_DATATYPE_CONTIG;
params->super.ucp.recv_datatype = UCP_PERF_DATATYPE_CONTIG;
params->super.ucp.am_hdr_size = 0;
params->super.device_thread_count = 1;
params->super.device_block_count = 1;
params->super.device_fc_window = UCP_PERF_FC_WINDOW_DEFAULT;
params->super.ucp.is_daemon_mode = 0;
params->super.ucp.dmn_local_addr = empty_addr;
params->super.ucp.dmn_remote_addr = empty_addr;
params->super.api = UCX_PERF_API_LAST;
params->super.command = UCX_PERF_CMD_LAST;
params->super.test_type = UCX_PERF_TEST_TYPE_LAST;
params->super.thread_mode = UCS_THREAD_MODE_SERIALIZED;
params->super.thread_count = 1;
params->super.async_mode = UCS_ASYNC_THREAD_LOCK_TYPE;
params->super.wait_mode = UCX_PERF_WAIT_MODE_LAST;
params->super.max_outstanding = 0;
params->super.warmup_iter = 10000;
params->super.warmup_time = 100e-3;
params->super.alignment = ucs_get_page_size();
params->super.max_iter = 1000000l;
params->super.max_time = 0.0;
params->super.report_interval = 1.0;
params->super.percentile_rank = 50.0;
params->super.flags = UCX_PERF_TEST_FLAG_VERBOSE;
params->super.uct.fc_window = UCT_PERF_TEST_MAX_FC_WINDOW;
params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT;
params->super.uct.am_hdr_size = 8;
params->super.send_mem_type = UCS_MEMORY_TYPE_HOST;
params->super.recv_mem_type = UCS_MEMORY_TYPE_HOST;
params->super.send_device = default_dev;
params->super.recv_device = default_dev;
params->super.device_level = UCS_DEVICE_LEVEL_THREAD;
params->super.msg_size_cnt = 1;
params->super.iov_stride = 0;
params->super.ucp.send_datatype = UCP_PERF_DATATYPE_CONTIG;
params->super.ucp.recv_datatype = UCP_PERF_DATATYPE_CONTIG;
params->super.ucp.am_hdr_size = 0;
params->super.device_ep_channel_count = 1;
params->super.device_thread_count = 1;
params->super.device_block_count = 1;
params->super.device_fc_window = UCP_PERF_FC_WINDOW_DEFAULT;
params->super.ucp.is_daemon_mode = 0;
params->super.ucp.dmn_local_addr = empty_addr;
params->super.ucp.dmn_remote_addr = empty_addr;
strcpy(params->super.uct.dev_name, TL_RESOURCE_NAME_NONE);
strcpy(params->super.uct.tl_name, TL_RESOURCE_NAME_NONE);

Expand Down
6 changes: 4 additions & 2 deletions src/tools/perf/perftest_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -756,9 +756,11 @@ ucs_status_t adjust_test_params(perftest_params_t *params,
}

if (params->super.send_device.mem_type != UCS_MEMORY_TYPE_LAST) {
/* TODO: read number of channels from ucp config */
params->super.device_ep_channel_count = 1;
/* TODO: Add getter function for thread count */
params->super.device_thread_count = params->super.thread_count;
params->super.thread_count = 1;
params->super.device_thread_count = params->super.thread_count;
params->super.thread_count = 1;
}

return UCS_OK;
Expand Down
Loading