Skip to content

Commit fb479dd

Browse files
UCP/DEVICE: Make memh and local_addr optional for counter elements (#10945)
1 parent 5db80cf commit fb479dd

File tree

5 files changed

+317
-90
lines changed

5 files changed

+317
-90
lines changed

src/tools/perf/cuda/ucp_cuda_kernel.cu

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -109,28 +109,41 @@ public:
109109
const ucp_perf_cuda_params &get_params() const { return m_params; }
110110

111111
private:
112+
static bool has_counter(const ucx_perf_context_t &perf)
113+
{
114+
return (perf.params.command != UCX_PERF_CMD_PUT_SINGLE);
115+
}
116+
112117
void init_mem_list(const ucx_perf_context_t &perf)
113118
{
114-
/* +1 for the counter */
115-
size_t count = perf.params.msg_size_cnt + 1;
116-
size_t offset = 0;
119+
size_t data_count = perf.params.msg_size_cnt;
120+
size_t count = data_count + (has_counter(perf) ? 1 : 0);
121+
size_t offset = 0;
117122
ucp_device_mem_list_elem_t elems[count];
118123

119-
for (size_t i = 0; i < count; ++i) {
120-
elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
121-
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
122-
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR |
123-
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
124-
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
125-
elems[i].memh = perf.ucp.send_memh;
126-
elems[i].rkey = perf.ucp.rkey;
127-
elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer, offset);
124+
for (size_t i = 0; i < data_count; ++i) {
125+
elems[i].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH |
126+
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
127+
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR |
128+
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
129+
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
130+
elems[i].memh = perf.ucp.send_memh;
131+
elems[i].rkey = perf.ucp.rkey;
132+
elems[i].local_addr = UCS_PTR_BYTE_OFFSET(perf.send_buffer, offset);
128133
elems[i].remote_addr = perf.ucp.remote_addr + offset;
129-
elems[i].length = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
130-
perf.params.msg_size_list[i];
134+
elems[i].length = perf.params.msg_size_list[i];
131135
offset += elems[i].length;
132136
}
133137

138+
if (has_counter(perf)) {
139+
elems[data_count].field_mask = UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY |
140+
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR |
141+
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH;
142+
elems[data_count].rkey = perf.ucp.rkey;
143+
elems[data_count].remote_addr = perf.ucp.remote_addr + offset;
144+
elems[data_count].length = ONESIDED_SIGNAL_SIZE;
145+
}
146+
134147
ucp_device_mem_list_params_t params;
135148
params.field_mask = UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENTS |
136149
UCP_DEVICE_MEM_LIST_PARAMS_FIELD_ELEMENT_SIZE |
@@ -148,20 +161,22 @@ private:
148161

149162
void init_elements(const ucx_perf_context_t &perf)
150163
{
151-
/* +1 for the counter */
152-
size_t count = perf.params.msg_size_cnt + 1;
153-
size_t offset = 0;
164+
size_t data_count = perf.params.msg_size_cnt;
165+
size_t count = data_count + (has_counter(perf) ? 1 : 0);
154166

155167
std::vector<unsigned> indices(count);
156168
std::vector<size_t> local_offsets(count, 0);
157169
std::vector<size_t> remote_offsets(count, 0);
158170
std::vector<size_t> lengths(count);
159171

160-
for (unsigned i = 0; i < count; ++i) {
172+
for (unsigned i = 0; i < data_count; ++i) {
161173
indices[i] = i;
162-
lengths[i] = (i == count - 1) ? ONESIDED_SIGNAL_SIZE :
163-
perf.params.msg_size_list[i];
164-
offset += lengths[i];
174+
lengths[i] = perf.params.msg_size_list[i];
175+
}
176+
177+
if (has_counter(perf)) {
178+
indices[data_count] = data_count;
179+
lengths[data_count] = ONESIDED_SIGNAL_SIZE;
165180
}
166181

167182
device_clone(&m_params.indices, indices.data(), count);

src/ucp/api/device/ucp_device_impl.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -232,9 +232,11 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc(
232232
* This operation can be polled on the receiver to detect completion of all the
233233
* operations of the batch, started during the same routine call.
234234
*
235-
* The last entry in the descriptor list contains
236-
* the remote memory registration descriptors to be used for the increment
237-
* operation.
235+
* All the elements except the last one are data elements that must contain all
236+
* @ref ucp_device_mem_list_elem_field fields of @ref ucp_device_mem_list_elem_t.
237+
*
238+
* The last entry in the descriptor list contains the remote memory
239+
* registration descriptors to be used for the increment operation.
238240
*
239241
* The routine returns a request that can be progressed and checked for
240242
* completion with @ref ucp_device_progress_req.

src/ucp/api/device/ucp_host.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,16 @@ BEGIN_C_DECLS
3131
* The enumeration allows specifying which fields in @ref
3232
* ucp_device_mem_list_elem are present.
3333
*
34+
* @note Counter elements can omit the @a UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH
35+
* and @a UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR fields.
36+
*
3437
* It is used to enable backward compatibility support.
3538
*/
3639
enum ucp_device_mem_list_elem_field {
3740
UCP_DEVICE_MEM_LIST_ELEM_FIELD_MEMH = UCS_BIT(0), /**< Source memory handle */
38-
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY = UCS_BIT(1), /**< Unpacked remote memory key */
41+
UCP_DEVICE_MEM_LIST_ELEM_FIELD_RKEY = UCS_BIT(1), /**< Unpacked remote memory key (always required) */
3942
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LOCAL_ADDR = UCS_BIT(2), /**< Local address */
40-
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR = UCS_BIT(3), /**< Remote address */
43+
UCP_DEVICE_MEM_LIST_ELEM_FIELD_REMOTE_ADDR = UCS_BIT(3), /**< Remote address */
4144
UCP_DEVICE_MEM_LIST_ELEM_FIELD_LENGTH = UCS_BIT(4) /**< Length of the local buffer in bytes */
4245
};
4346

@@ -48,6 +51,8 @@ enum ucp_device_mem_list_elem_field {
4851
*
4952
* This describes a pair of local and remote memory for which a memory operation
5053
* can later be performed multiple times, possibly with varying memory offsets.
54+
*
55+
* @note Counter elements can omit the @a memh and @a local_addr fields.
5156
*/
5257
typedef struct ucp_device_mem_list_elem {
5358
/**
@@ -80,6 +85,7 @@ typedef struct ucp_device_mem_list_elem {
8085

8186
/**
8287
* Unpacked memory key for the remote memory endpoint.
88+
* Always required.
8389
*/
8490
ucp_rkey_h rkey;
8591
} ucp_device_mem_list_elem_t;

0 commit comments

Comments
 (0)