Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit 3102942

Browse files
authored
Merge pull request #1311 from tkordenbrock/topic/mtl-portals4-short_msg-split_msg
Mtl portals4 short msg split msg
2 parents 0452ba3 + f4a3c1f commit 3102942

File tree

9 files changed

+209
-55
lines changed

9 files changed

+209
-55
lines changed

ompi/mca/mtl/portals4/mtl_portals4.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,15 @@ struct mca_mtl_portals4_module_t {
4646

4747
/* Use the logical to physical table to accelerate portals4 adressing: 1 (true) : 0 (false) */
4848
int32_t use_logical;
49+
50+
/* Process_id */
51+
ptl_process_t ptl_process_id;
52+
4953
/* Use flow control: 1 (true) : 0 (false) */
5054
int32_t use_flowctl;
5155

56+
/** Short limit; Size limit for short messages */
57+
uint64_t short_limit;
5258
/** Eager limit; messages greater than this use a rendezvous protocol */
5359
uint64_t eager_limit;
5460
/** Size of short message blocks */
@@ -67,6 +73,8 @@ struct mca_mtl_portals4_module_t {
6773

6874
/** Network interface handle for matched interface */
6975
ptl_handle_ni_t ni_h;
76+
/** Limit given by portals after NIInit */
77+
uint64_t max_msg_size_mtl;
7078
/** Uid for current user */
7179
ptl_uid_t uid;
7280

ompi/mca/mtl/portals4/mtl_portals4_component.c

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,18 @@ ompi_mtl_portals4_component_register(void)
100100
OPAL_INFO_LVL_9,
101101
MCA_BASE_VAR_SCOPE_READONLY,
102102
&param_priority);
103+
ompi_mtl_portals4.short_limit = 2 * 1024;
104+
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
105+
"short_limit",
106+
"Size limit for short messages",
107+
MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG,
108+
NULL,
109+
0,
110+
0,
111+
OPAL_INFO_LVL_5,
112+
MCA_BASE_VAR_SCOPE_READONLY,
113+
&ompi_mtl_portals4.short_limit);
114+
103115

104116
ompi_mtl_portals4.eager_limit = 2 * 1024;
105117
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
@@ -173,6 +185,19 @@ ompi_mtl_portals4_component_register(void)
173185
OPAL_INFO_LVL_5,
174186
MCA_BASE_VAR_SCOPE_READONLY,
175187
&ompi_mtl_portals4.protocol);
188+
189+
ompi_mtl_portals4.max_msg_size_mtl = PTL_SIZE_MAX;
190+
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
191+
"max_msg_size",
192+
"Max size supported by portals4 (above that, a message is cut into messages less than that size)",
193+
MCA_BASE_VAR_TYPE_UNSIGNED_LONG,
194+
NULL,
195+
0,
196+
0,
197+
OPAL_INFO_LVL_5,
198+
MCA_BASE_VAR_SCOPE_READONLY,
199+
&ompi_mtl_portals4.max_msg_size_mtl);
200+
176201
OBJ_RELEASE(new_enum);
177202
if (0 > ret) {
178203
return OMPI_ERR_NOT_SUPPORTED;
@@ -196,6 +221,12 @@ ompi_mtl_portals4_component_open(void)
196221
"no"
197222
#endif
198223
);
224+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
225+
"Max message size: %lu", (unsigned long)
226+
ompi_mtl_portals4.max_msg_size_mtl);
227+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
228+
"Short limit: %d", (int)
229+
ompi_mtl_portals4.short_limit);
199230
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
200231
"Eager limit: %d", (int)
201232
ompi_mtl_portals4.eager_limit);
@@ -314,6 +345,11 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
314345
goto error;
315346
}
316347

348+
if (actual_limits.max_msg_size < ompi_mtl_portals4.max_msg_size_mtl)
349+
ompi_mtl_portals4.max_msg_size_mtl = actual_limits.max_msg_size;
350+
OPAL_OUTPUT_VERBOSE((10, ompi_mtl_base_framework.framework_output,
351+
"Due to portals4 and user configuration messages will not go over the size of %lu", ompi_mtl_portals4.max_msg_size_mtl));
352+
317353
if (ompi_comm_rank(MPI_COMM_WORLD) == 0) {
318354
opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "max_entries=%d", actual_limits.max_entries);
319355
opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "max_unexpected_headers=%d", actual_limits.max_unexpected_headers);
@@ -350,6 +386,10 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads,
350386
goto error;
351387
}
352388

389+
ompi_mtl_portals4.ptl_process_id = id;
390+
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output,
391+
"PtlGetPhysId rank=%x nid=%x pid=%x\n", id.rank, id.phys.nid, id.phys.pid));
392+
353393
OPAL_MODEX_SEND(ret, OPAL_PMIX_GLOBAL,
354394
&mca_mtl_portals4_component.mtl_version,
355395
&id, sizeof(id));

ompi/mca/mtl/portals4/mtl_portals4_flowctl.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,13 @@ ompi_mtl_portals4_flowctl_init(void)
7070
goto error;
7171
}
7272

73+
if (ompi_mtl_portals4.flowctl_idx != REQ_FLOWCTL_TABLE_ID) {
74+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
75+
"%s:%d: PtlPTAlloc did not allocate the requested PT: %d\n",
76+
__FILE__, __LINE__, ompi_mtl_portals4.flowctl_idx);
77+
goto error;
78+
}
79+
7380
ret = PtlCTAlloc(ompi_mtl_portals4.ni_h,
7481
&ompi_mtl_portals4.flowctl.trigger_ct_h);
7582
if (OPAL_UNLIKELY(PTL_OK != ret)) {
@@ -291,9 +298,7 @@ ompi_mtl_portals4_flowctl_trigger(void)
291298
{
292299
int ret;
293300

294-
if (false == ompi_mtl_portals4.flowctl.flowctl_active) {
295-
ompi_mtl_portals4.flowctl.flowctl_active = true;
296-
301+
if (true == OPAL_ATOMIC_CMPSET_32(&ompi_mtl_portals4.flowctl.flowctl_active, false, true)) {
297302
/* send trigger to root */
298303
ret = PtlPut(ompi_mtl_portals4.zero_md_h,
299304
0,

ompi/mca/mtl/portals4/mtl_portals4_flowctl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ OBJ_CLASS_DECLARATION(ompi_mtl_portals4_pending_request_t);
3434

3535

3636
struct ompi_mtl_portals4_flowctl_t {
37-
bool flowctl_active;
37+
int32_t flowctl_active;
3838

3939
int32_t send_slots;
4040
int32_t max_send_slots;

ompi/mca/mtl/portals4/mtl_portals4_probe.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ completion_fn(ptl_event_t *ev, ompi_mtl_portals4_base_request_t *ptl_base_reques
3232
ompi_mtl_portals4_probe_request_t *ptl_request =
3333
(ompi_mtl_portals4_probe_request_t*) ptl_base_request;
3434

35-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
35+
opal_output_verbose(10, ompi_mtl_base_framework.framework_output,
3636
"%s:%d: completion_fn: %d %d",
3737
__FILE__, __LINE__, ev->type, ev->ni_fail_type);
3838

ompi/mca/mtl/portals4/mtl_portals4_recv.c

Lines changed: 56 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
3939
ptl_match_bits_t match_bits, ptl_size_t remote_offset,
4040
ompi_mtl_portals4_recv_request_t *request)
4141
{
42-
int ret;
42+
int ret, i;
43+
ptl_size_t rest = length, asked = 0, frag_size;
44+
int32_t pending_reply;
4345

4446
#if OMPI_MTL_PORTALS4_FLOW_CONTROL
4547
while (OPAL_UNLIKELY(OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, -1) < 0)) {
@@ -48,19 +50,29 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
4850
}
4951
#endif
5052

51-
ret = PtlGet(ompi_mtl_portals4.send_md_h,
52-
(ptl_size_t) start,
53-
length,
54-
target,
55-
ompi_mtl_portals4.read_idx,
56-
match_bits,
57-
remote_offset,
58-
request);
59-
if (OPAL_UNLIKELY(PTL_OK != ret)) {
60-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
61-
"%s:%d: PtlGet failed: %d",
62-
__FILE__, __LINE__, ret);
63-
return OMPI_ERR_OUT_OF_RESOURCE;
53+
request->pending_reply = (length + ompi_mtl_portals4.max_msg_size_mtl - 1) / ompi_mtl_portals4.max_msg_size_mtl;
54+
pending_reply = request->pending_reply;
55+
56+
for (i = 0 ; i < pending_reply ; i++) {
57+
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d) send",
58+
i + 1, pending_reply));
59+
frag_size = (OPAL_UNLIKELY(rest > ompi_mtl_portals4.max_msg_size_mtl)) ? ompi_mtl_portals4.max_msg_size_mtl : rest;
60+
ret = PtlGet(ompi_mtl_portals4.send_md_h,
61+
(ptl_size_t) start + i * ompi_mtl_portals4.max_msg_size_mtl,
62+
frag_size,
63+
target,
64+
ompi_mtl_portals4.read_idx,
65+
match_bits,
66+
remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl,
67+
request);
68+
if (OPAL_UNLIKELY(PTL_OK != ret)) {
69+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
70+
"%s:%d: PtlGet failed: %d",
71+
__FILE__, __LINE__, ret);
72+
return OMPI_ERR_OUT_OF_RESOURCE;
73+
}
74+
rest -= frag_size;
75+
asked += frag_size;
6476
}
6577

6678
return OMPI_SUCCESS;
@@ -109,26 +121,30 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
109121
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE;
110122
}
111123

124+
if (ev->mlength < msg_length)
125+
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "Truncated message, some PtlGet are required (protocol = %d)",
126+
ompi_mtl_portals4.protocol));
127+
112128
#if OPAL_ENABLE_DEBUG
113129
ptl_request->hdr_data = ev->hdr_data;
114130
#endif
115131

116-
if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) {
117-
/* If it's not a short message and we're doing rndv, we
132+
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
133+
if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && msg_length > ev->mlength) {
134+
/* If it's not a short message and we're doing rndv and the message is not complete, we
118135
only have the first part of the message. Issue the get
119136
to pull the second part of the message. */
120-
ret = read_msg((char*) ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit,
137+
ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength,
121138
((msg_length > ptl_request->delivery_len) ?
122-
ptl_request->delivery_len : msg_length) - ompi_mtl_portals4.eager_limit,
139+
ptl_request->delivery_len : msg_length) - ev->mlength,
123140
ev->initiator,
124141
ev->hdr_data,
125-
ompi_mtl_portals4.eager_limit,
142+
ev->mlength,
126143
ptl_request);
127144
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
128145
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
129146
goto callback_error;
130147
}
131-
132148
} else {
133149
/* If we're either using the eager protocol or were a
134150
short message, all data has been received, so complete
@@ -142,8 +158,6 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
142158
__FILE__, __LINE__, ret);
143159
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret;
144160
}
145-
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
146-
147161
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
148162
"Recv %lu (0x%lx) completed, expected",
149163
ptl_request->opcount, ptl_request->hdr_data));
@@ -165,12 +179,14 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
165179
}
166180

167181
/* set the received length in the status, now that we know
168-
excatly how much data was sent. */
169-
ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength;
170-
if (ompi_mtl_portals4.protocol == rndv) {
171-
ptl_request->super.super.ompi_req->req_status._ucount +=
172-
ompi_mtl_portals4.eager_limit;
182+
exactly how much data was sent. */
183+
ptl_request->super.super.ompi_req->req_status._ucount += ev->mlength;
184+
185+
ret = OPAL_THREAD_ADD32(&(ptl_request->pending_reply), -1);
186+
if (ret > 0) {
187+
return OMPI_SUCCESS;
173188
}
189+
assert(ptl_request->pending_reply == 0);
174190

175191
#if OMPI_MTL_PORTALS4_FLOW_CONTROL
176192
OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1);
@@ -192,8 +208,8 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
192208
}
193209

194210
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
195-
"Recv %lu (0x%lx) completed, reply",
196-
ptl_request->opcount, ptl_request->hdr_data));
211+
"Recv %lu (0x%lx) completed , reply (pending_reply: %d)",
212+
ptl_request->opcount, ptl_request->hdr_data, ptl_request->pending_reply));
197213
ptl_request->super.super.completion_callback(&ptl_request->super.super);
198214
break;
199215

@@ -281,17 +297,16 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev,
281297
ptl_request->super.super.completion_callback(&ptl_request->super.super);
282298

283299
} else {
284-
if (ev->mlength > 0) {
285-
/* if rndv or triggered, copy the eager part to the right place */
286-
memcpy(ptl_request->delivery_ptr, ev->start, ev->mlength);
287-
}
288300

289-
ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength,
290-
((msg_length > ptl_request->delivery_len) ?
291-
ptl_request->delivery_len : msg_length) - ev->mlength,
301+
/* For long messages in the overflow list, ev->mlength = 0 */
302+
ptl_request->super.super.ompi_req->req_status._ucount = 0;
303+
304+
ret = read_msg((char*) ptl_request->delivery_ptr,
305+
(msg_length > ptl_request->delivery_len) ?
306+
ptl_request->delivery_len : msg_length,
292307
ev->initiator,
293308
ev->hdr_data,
294-
ev->mlength,
309+
0,
295310
ptl_request);
296311
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
297312
if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr);
@@ -373,6 +388,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
373388
ptl_request->delivery_len = length;
374389
ptl_request->req_started = false;
375390
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
391+
ptl_request->pending_reply = 0;
376392

377393
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
378394
"Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx)\n",
@@ -389,7 +405,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
389405
PTL_ME_OP_PUT |
390406
PTL_ME_USE_ONCE |
391407
PTL_ME_EVENT_UNLINK_DISABLE;
392-
if (length <= ompi_mtl_portals4.eager_limit) {
408+
if (length <= ompi_mtl_portals4.short_limit) {
393409
me.options |= PTL_ME_EVENT_LINK_DISABLE;
394410
}
395411
me.match_id = remote_proc;
@@ -413,7 +429,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl,
413429
/* if a long message, spin until we either have a comm event or a
414430
link event, guaranteeing progress for long unexpected
415431
messages. */
416-
if (length > ompi_mtl_portals4.eager_limit) {
432+
if (length > ompi_mtl_portals4.short_limit) {
417433
while (true != ptl_request->req_started) {
418434
ompi_mtl_portals4_progress();
419435
}
@@ -454,6 +470,7 @@ ompi_mtl_portals4_imrecv(struct mca_mtl_base_module_t* mtl,
454470
ptl_request->delivery_ptr = start;
455471
ptl_request->delivery_len = length;
456472
ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS;
473+
ptl_request->pending_reply = 0;
457474

458475
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
459476
"Mrecv %lu of length %ld (0x%lx)\n",

ompi/mca/mtl/portals4/mtl_portals4_recv_short.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ ompi_mtl_portals4_activate_block(ompi_mtl_portals4_recv_short_block_t *block)
208208
me.start = block->start;
209209
me.length = ompi_mtl_portals4.recv_short_size;
210210
me.ct_handle = PTL_CT_NONE;
211-
me.min_free = ompi_mtl_portals4.eager_limit;
211+
me.min_free = ompi_mtl_portals4.short_limit;
212212
me.uid = ompi_mtl_portals4.uid;
213213
me.options =
214214
PTL_ME_OP_PUT |

ompi/mca/mtl/portals4/mtl_portals4_request.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ struct ompi_mtl_portals4_isend_request_t {
5252
#if OMPI_MTL_PORTALS4_FLOW_CONTROL
5353
struct ompi_mtl_portals4_pending_request_t *pending;
5454
#endif
55+
ptl_size_t length;
56+
int32_t pending_get;
5557
uint32_t event_count;
5658
};
5759
typedef struct ompi_mtl_portals4_isend_request_t ompi_mtl_portals4_isend_request_t;
@@ -73,6 +75,7 @@ struct ompi_mtl_portals4_recv_request_t {
7375
void *delivery_ptr;
7476
size_t delivery_len;
7577
volatile bool req_started;
78+
int32_t pending_reply;
7679
#if OPAL_ENABLE_DEBUG
7780
uint64_t opcount;
7881
ptl_hdr_data_t hdr_data;

0 commit comments

Comments
 (0)