This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit f4a3c1f

PDeveze authored and tkordenbrock committed
mtl/portals4: Take into account the limitation of portals4 (max_msg_size) and split messages if necessary
1 parent 44822b2 commit f4a3c1f
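
The change splits any transfer larger than the Portals4 max_msg_size limit into several operations. Below is a minimal, self-contained sketch of that splitting arithmetic (the ceiling division and per-fragment offset/size that drive pending_reply and pending_get in the diffs that follow); the function name show_fragments and the concrete sizes are illustrative only, not part of the commit.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: count and size the fragments needed to move `length`
 * bytes when the interface reports a max_msg_size limit.  This mirrors the
 * ceiling division used for pending_reply/pending_get in the diffs below. */
static void show_fragments(uint64_t length, uint64_t max_msg_size)
{
    uint64_t nfrags = (length + max_msg_size - 1) / max_msg_size;  /* ceil(length / max_msg_size) */
    for (uint64_t i = 0; i < nfrags; i++) {
        uint64_t offset = i * max_msg_size;
        uint64_t frag_size = (length - offset > max_msg_size) ?
                             max_msg_size : (length - offset);
        printf("fragment %llu/%llu: offset=%llu size=%llu\n",
               (unsigned long long)(i + 1), (unsigned long long)nfrags,
               (unsigned long long)offset, (unsigned long long)frag_size);
    }
}

int main(void)
{
    /* 10 MiB transfer with a 4 MiB limit: prints three fragments of 4, 4 and 2 MiB. */
    show_fragments(10u * 1024 * 1024, 4u * 1024 * 1024);
    return 0;
}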

File tree: 5 files changed, +149 −25 lines

5 files changed

+149
-25
lines changed

ompi/mca/mtl/portals4/mtl_portals4.h

Lines changed: 2 additions & 0 deletions

@@ -73,6 +73,8 @@ struct mca_mtl_portals4_module_t {
 
     /** Network interface handle for matched interface */
     ptl_handle_ni_t ni_h;
+    /** Limit given by portals after NIInit */
+    uint64_t max_msg_size_mtl;
     /** Uid for current user */
     ptl_uid_t uid;

ompi/mca/mtl/portals4/mtl_portals4_component.c

Lines changed: 21 additions & 0 deletions

@@ -185,6 +185,19 @@ ompi_mtl_portals4_component_register(void)
                                            OPAL_INFO_LVL_5,
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &ompi_mtl_portals4.protocol);
+
+    ompi_mtl_portals4.max_msg_size_mtl = PTL_SIZE_MAX;
+    (void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
+                                           "max_msg_size",
+                                           "Max size supported by portals4 (above that, a message is cut into messages less than that size)",
+                                           MCA_BASE_VAR_TYPE_UNSIGNED_LONG,
+                                           NULL,
+                                           0,
+                                           0,
+                                           OPAL_INFO_LVL_5,
+                                           MCA_BASE_VAR_SCOPE_READONLY,
+                                           &ompi_mtl_portals4.max_msg_size_mtl);
+
     OBJ_RELEASE(new_enum);
     if (0 > ret) {
         return OMPI_ERR_NOT_SUPPORTED;

@@ -208,6 +221,9 @@ ompi_mtl_portals4_component_open(void)
                        "no"
 #endif
                        );
+    opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
+                        "Max message size: %lu", (unsigned long)
+                        ompi_mtl_portals4.max_msg_size_mtl);
     opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                         "Short limit: %d", (int)
                         ompi_mtl_portals4.short_limit);

@@ -329,6 +345,11 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
         goto error;
     }
 
+    if (actual_limits.max_msg_size < ompi_mtl_portals4.max_msg_size_mtl)
+        ompi_mtl_portals4.max_msg_size_mtl = actual_limits.max_msg_size;
+    OPAL_OUTPUT_VERBOSE((10, ompi_mtl_base_framework.framework_output,
+        "Due to portals4 and user configuration messages will not go over the size of %lu", ompi_mtl_portals4.max_msg_size_mtl));
+
     if (ompi_comm_rank(MPI_COMM_WORLD) == 0) {
         opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "max_entries=%d", actual_limits.max_entries);
         opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "max_unexpected_headers=%d", actual_limits.max_unexpected_headers);
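
Note on the new variable: it defaults to PTL_SIZE_MAX and is clamped in component_init() to the max_msg_size reported by the network interface, so the effective limit is the smaller of the user-supplied value and the hardware/library limit. Under the usual Open MPI naming convention for component-level variables it should surface as the MCA parameter mtl_portals4_max_msg_size (settable with --mca on the command line); that exact name is an assumption based on the registration call above, not something shown in this commit.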

ompi/mca/mtl/portals4/mtl_portals4_recv.c

Lines changed: 41 additions & 18 deletions

@@ -39,7 +39,9 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
          ptl_match_bits_t match_bits, ptl_size_t remote_offset,
          ompi_mtl_portals4_recv_request_t *request)
 {
-    int ret;
+    int ret, i;
+    ptl_size_t rest = length, asked = 0, frag_size;
+    int32_t pending_reply;
 
 #if OMPI_MTL_PORTALS4_FLOW_CONTROL
     while (OPAL_UNLIKELY(OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, -1) < 0)) {

@@ -48,19 +50,29 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
     }
 #endif
 
-    ret = PtlGet(ompi_mtl_portals4.send_md_h,
-                 (ptl_size_t) start,
-                 length,
-                 target,
-                 ompi_mtl_portals4.read_idx,
-                 match_bits,
-                 remote_offset,
-                 request);
-    if (OPAL_UNLIKELY(PTL_OK != ret)) {
-        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
-                            "%s:%d: PtlGet failed: %d",
-                            __FILE__, __LINE__, ret);
-        return OMPI_ERR_OUT_OF_RESOURCE;
+    request->pending_reply = (length + ompi_mtl_portals4.max_msg_size_mtl - 1) / ompi_mtl_portals4.max_msg_size_mtl;
+    pending_reply = request->pending_reply;
+
+    for (i = 0 ; i < pending_reply ; i++) {
+        OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d) send",
+                             i + 1, pending_reply));
+        frag_size = (OPAL_UNLIKELY(rest > ompi_mtl_portals4.max_msg_size_mtl)) ? ompi_mtl_portals4.max_msg_size_mtl : rest;
+        ret = PtlGet(ompi_mtl_portals4.send_md_h,
+                     (ptl_size_t) start + i * ompi_mtl_portals4.max_msg_size_mtl,
+                     frag_size,
+                     target,
+                     ompi_mtl_portals4.read_idx,
+                     match_bits,
+                     remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl,
+                     request);
+        if (OPAL_UNLIKELY(PTL_OK != ret)) {
+            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
+                                "%s:%d: PtlGet failed: %d",
+                                __FILE__, __LINE__, ret);
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+        rest -= frag_size;
+        asked += frag_size;
     }
 
     return OMPI_SUCCESS;

@@ -109,12 +121,16 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
             ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
         }
 
+        if (ev->mlength < msg_length)
+            OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "Truncated message, some PtlGet are required (protocol = %d)",
+                                 ompi_mtl_portals4.protocol));
+
 #if OPAL_ENABLE_DEBUG
         ptl_request->hdr_data = ev->hdr_data;
 #endif
 
         ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
-        if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv && msg_length != ev->mlength) {
+        if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && msg_length > ev->mlength) {
            /* If it's not a short message and we're doing rndv and the message is not complete, we
               only have the first part of the message.  Issue the get
               to pull the second part of the message. */

@@ -129,7 +145,6 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
                 if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
                 goto callback_error;
             }
-
         } else {
             /* If we're either using the eager protocol or were a
                short message, all data has been received, so complete

@@ -167,6 +182,12 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
            exactly how much data was sent. */
         ptl_request->super.super.ompi_req->req_status._ucount += ev->mlength;
 
+        ret = OPAL_THREAD_ADD32(&(ptl_request->pending_reply), -1);
+        if (ret > 0) {
+            return OMPI_SUCCESS;
+        }
+        assert(ptl_request->pending_reply == 0);
+
 #if OMPI_MTL_PORTALS4_FLOW_CONTROL
         OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1);
 #endif

@@ -187,8 +208,8 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
         }
 
         OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
-                             "Recv %lu (0x%lx) completed, reply",
-                             ptl_request->opcount, ptl_request->hdr_data));
+                             "Recv %lu (0x%lx) completed , reply (pending_reply: %d)",
+                             ptl_request->opcount, ptl_request->hdr_data, ptl_request->pending_reply));
         ptl_request->super.super.completion_callback(&ptl_request->super.super);
         break;
 
@@ -367,6 +388,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
     ptl_request->delivery_len = length;
     ptl_request->req_started = false;
     ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
+    ptl_request->pending_reply = 0;
 
     OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                          "Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx)\n",

@@ -448,6 +470,7 @@ ompi_mtl_portals4_imrecv(struct mca_mtl_base_module_t* mtl,
     ptl_request->delivery_ptr = start;
     ptl_request->delivery_len = length;
     ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
+    ptl_request->pending_reply = 0;
 
     OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                          "Mrecv %lu of length %ld (0x%lx)\n",

ompi/mca/mtl/portals4/mtl_portals4_request.h

Lines changed: 2 additions & 0 deletions

@@ -53,6 +53,7 @@ struct ompi_mtl_portals4_isend_request_t {
     struct ompi_mtl_portals4_pending_request_t *pending;
 #endif
     ptl_size_t length;
+    int32_t pending_get;
     uint32_t event_count;
 };
 typedef struct ompi_mtl_portals4_isend_request_t ompi_mtl_portals4_isend_request_t;

@@ -74,6 +75,7 @@ struct ompi_mtl_portals4_recv_request_t {
     void *delivery_ptr;
     size_t delivery_len;
     volatile bool req_started;
+    int32_t pending_reply;
 #if OPAL_ENABLE_DEBUG
     uint64_t opcount;
     ptl_hdr_data_t hdr_data;
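
These two counters track outstanding fragments per request: pending_get on the isend request counts the PTL_EVENT_GET events the sender still expects before it may unlink its ME, while pending_reply on the recv request counts the PtlGet fragments issued by read_msg() that have not yet produced a reply; the receive request completes only when it reaches zero. This summary is inferred from the send/recv changes in this commit.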

ompi/mca/mtl/portals4/mtl_portals4_send.c

Lines changed: 83 additions & 7 deletions

@@ -44,6 +44,29 @@ ompi_mtl_portals4_callback(ptl_event_t *ev,
     ompi_mtl_portals4_isend_request_t* ptl_request =
         (ompi_mtl_portals4_isend_request_t*) ptl_base_request;
 
+    if (PTL_EVENT_GET == ev->type) {
+        ret = OPAL_THREAD_ADD32(&(ptl_request->pending_get), -1);
+        if (ret > 0) {
+            /* wait for other gets */
+            OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "PTL_EVENT_GET received now pending_get=%d",ret));
+            return retval;
+        }
+        assert(ptl_request->pending_get == 0);
+
+        /* last get received */
+        OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "PTL_EVENT_GET: PtlMEUnlink is called ptl_request->me_h=%d (pending get=%d)", ptl_request->me_h, ret));
+
+        if (!PtlHandleIsEqual(ptl_request->me_h, PTL_INVALID_HANDLE)) {
+            ret = PtlMEUnlink(ptl_request->me_h);
+            if (PTL_OK != ret) {
+                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
+                                    "%s:%d: send callback PtlMEUnlink returned %d",
+                                    __FILE__, __LINE__, ret);
+            }
+            ptl_request->me_h = PTL_INVALID_HANDLE;
+        }
+    }
+
 #if OMPI_MTL_PORTALS4_FLOW_CONTROL
     if (OPAL_UNLIKELY(ev->ni_fail_type == PTL_NI_PT_DISABLED)) {
         ompi_mtl_portals4_pending_request_t *pending =

@@ -66,6 +89,7 @@ ompi_mtl_portals4_callback(ptl_event_t *ev,
                                 "%s:%d: send callback PtlMEUnlink returned %d",
                                 __FILE__, __LINE__, ret);
         }
+        ptl_request->me_h = PTL_INVALID_HANDLE;
     }
 
     opal_list_append(&ompi_mtl_portals4.flowctl.pending_sends,

@@ -89,6 +113,33 @@ ompi_mtl_portals4_callback(ptl_event_t *ev,
                          "send %lu got event of type %d",
                          ptl_request->opcount, ev->type));
 
+    /* First put achieved successfully (In the Priority List), so it may be necessary to decrement the number of pending get
+     * If the protocol is eager, just decrement pending_get
+     * Else (the protocol is rndv), decrement pending_get only if length % max_msg_size <= eager_limit
+     * (This is the case where the eager part allows to save one get)
+     */
+    if ((PTL_EVENT_ACK == ev->type) &&
+        (PTL_PRIORITY_LIST == ev->ptl_list) &&
+        (0 < ptl_request->pending_get)) {
+
+        if ((eager == ompi_mtl_portals4.protocol) ||
+            (ptl_request->length % ompi_mtl_portals4.max_msg_size_mtl <= ompi_mtl_portals4.eager_limit)) {
+            val = OPAL_THREAD_ADD32(&(ptl_request->pending_get), -1);
+        }
+        if (0 == val) {
+            add = 2; /* We haven't to wait for any get, so we have to add an extra count to cause the message to complete */
+            if (!PtlHandleIsEqual(ptl_request->me_h, PTL_INVALID_HANDLE)) {
+                ret = PtlMEUnlink(ptl_request->me_h);
+                if (PTL_OK != ret) {
+                    opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
+                                        "%s:%d: send callback PtlMEUnlink returned %d",
+                                        __FILE__, __LINE__, ret);
+                }
+                ptl_request->me_h = PTL_INVALID_HANDLE;
+            }
+        }
+    }
+
     if ((PTL_EVENT_ACK == ev->type) &&
         (PTL_PRIORITY_LIST == ev->ptl_list) &&
         (ev->mlength == ptl_request->length) &&

@@ -107,10 +158,10 @@ ompi_mtl_portals4_callback(ptl_event_t *ev,
                                     "%s:%d: send callback PtlMEUnlink returned %d",
                                     __FILE__, __LINE__, ret);
             }
+            ptl_request->me_h = PTL_INVALID_HANDLE;
             add++;
         }
         val = OPAL_THREAD_ADD32((int32_t*)&ptl_request->event_count, add);
-
         assert(val <= 3);
 
         if (val == 3) {

@@ -193,6 +244,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode,
 
     MTL_PORTALS4_SET_HDR_DATA(hdr_data, ptl_request->opcount, length,
                               (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) ? 1 : 0);
+    ptl_request->me_h = PTL_INVALID_HANDLE;
 
     if (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) {
         me.start = NULL;

@@ -219,6 +271,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode,
             opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                 "%s:%d: PtlMEAppend failed: %d",
                                 __FILE__, __LINE__, ret);
+            ptl_request->me_h = PTL_INVALID_HANDLE;
             return ompi_mtl_portals4_get_error(ret);
         }
 
@@ -227,7 +280,6 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode,
                              ptl_request->opcount, hdr_data, match_bits));
     } else {
         ptl_request->event_count = 1;
-        ptl_request->me_h = PTL_INVALID_HANDLE;
 
         OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
                              "Send %lu short send with hdr_data 0x%lx (0x%lx)",

@@ -238,6 +290,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode,
                          "Send %lu, start: %p",
                          ptl_request->opcount, start));
 
+    ptl_request->pending_get = 0;
     ret = PtlPut(ompi_mtl_portals4.send_md_h,
                  (ptl_size_t) start,
                  length,

@@ -254,6 +307,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode,
                             __FILE__, __LINE__, ret);
         if (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) {
             PtlMEUnlink(ptl_request->me_h);
+            ptl_request->me_h = PTL_INVALID_HANDLE;
         }
         return ompi_mtl_portals4_get_error(ret);
     }

@@ -285,7 +339,6 @@ ompi_mtl_portals4_long_isend(void *start, size_t length, int contextid, int tag,
     me.uid = ompi_mtl_portals4.uid;
     me.options =
         PTL_ME_OP_GET |
-        PTL_ME_USE_ONCE |
         PTL_ME_EVENT_LINK_DISABLE |
         PTL_ME_EVENT_UNLINK_DISABLE;
     me.match_id = ptl_proc;

@@ -309,10 +362,32 @@ ompi_mtl_portals4_long_isend(void *start, size_t length, int contextid, int tag,
                          "Send %lu long send with hdr_data 0x%lx (0x%lx)",
                          ptl_request->opcount, hdr_data, match_bits));
 
-    if ((rndv == ompi_mtl_portals4.protocol) && ((ptl_size_t) length > (ptl_size_t) ompi_mtl_portals4.eager_limit))
-        put_length = (ptl_size_t) ompi_mtl_portals4.eager_limit;
-    else put_length = (ptl_size_t) length;
+    if (rndv == ompi_mtl_portals4.protocol) {
+        ptl_size_t min = (OPAL_LIKELY (ompi_mtl_portals4.eager_limit < ompi_mtl_portals4.max_msg_size_mtl)) ?
+            ompi_mtl_portals4.eager_limit :
+            ompi_mtl_portals4.max_msg_size_mtl;
+        if ((ptl_size_t) length > (ptl_size_t) min) {
+            OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output,
+                                 "msg truncated by %ld", length - min));
+            put_length = (ptl_size_t) min;
+        }
+        else
+            put_length = (ptl_size_t) length;
+    } else { // eager protocol
+        if (length > ompi_mtl_portals4.max_msg_size_mtl)
+            put_length = (ptl_size_t) ompi_mtl_portals4.max_msg_size_mtl;
+        else
+            put_length = (ptl_size_t) length;
+    }
+
+    /* We have to wait for some GET events.
+       If the first put falls in overflow list, the number of GET event is egal to:
+           (length - 1) / ompi_mtl_portals4.max_msg_size_mtl + 1
+       else we will re-calculate this number when we received the first ACK event (with remote overflow list)
+     */
 
+    ptl_request->pending_get = (length - 1) / ompi_mtl_portals4.max_msg_size_mtl + 1;
+    OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "pending_get=%d", ptl_request->pending_get));
 
     ret = PtlPut(ompi_mtl_portals4.send_md_h,
                  (ptl_size_t) start,

@@ -328,7 +403,8 @@ ompi_mtl_portals4_long_isend(void *start, size_t length, int contextid, int tag,
         opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                             "%s:%d: PtlPut failed: %d",
                             __FILE__, __LINE__, ret);
-       PtlMEUnlink(ptl_request->me_h);
+        PtlMEUnlink(ptl_request->me_h);
+        ptl_request->me_h = PTL_INVALID_HANDLE;
         return ompi_mtl_portals4_get_error(ret);
     }
 
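
A worked example for the send side (hypothetical numbers, not from the commit): with the rndv protocol, length = 100 KiB, max_msg_size_mtl = 32 KiB and eager_limit = 4 KiB, the initial put carries min(eager_limit, max_msg_size_mtl) = 4 KiB and pending_get is set to (100 KiB - 1) / 32 KiB + 1 = 4. If that put is matched on the priority list, length % max_msg_size_mtl = 4 KiB <= eager_limit, so the ACK handler decrements pending_get to 3 and the remaining 96 KiB arrive as three 32 KiB gets; the ME is unlinked once the last PTL_EVENT_GET brings the counter to zero. If the put instead lands in the overflow list, no decrement happens and all four gets are expected, per the comment in the long-send path above.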

0 commit comments
