From bd3b1cf7beb88a801432149f2b258a468edaf1d0 Mon Sep 17 00:00:00 2001 From: Pascal Deveze Date: Tue, 26 Jul 2016 08:41:31 +0200 Subject: [PATCH 1/5] mtl-portals4: Control that flowctl_idx is egal to REQ_FLOWCTL_TABLE_ID and use OPAL_ATOMIC_CMPSET_32 to test and set flowctl_active flag to true --- ompi/mca/mtl/portals4/mtl_portals4_flowctl.c | 11 ++++++++--- ompi/mca/mtl/portals4/mtl_portals4_flowctl.h | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/ompi/mca/mtl/portals4/mtl_portals4_flowctl.c b/ompi/mca/mtl/portals4/mtl_portals4_flowctl.c index 0464015dd2c..ee9d055d8ac 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_flowctl.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_flowctl.c @@ -70,6 +70,13 @@ ompi_mtl_portals4_flowctl_init(void) goto error; } + if (ompi_mtl_portals4.flowctl_idx != REQ_FLOWCTL_TABLE_ID) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: PtlPTAlloc did not allocate the requested PT: %d\n", + __FILE__, __LINE__, ompi_mtl_portals4.flowctl_idx); + goto error; + } + ret = PtlCTAlloc(ompi_mtl_portals4.ni_h, &ompi_mtl_portals4.flowctl.trigger_ct_h); if (OPAL_UNLIKELY(PTL_OK != ret)) { @@ -291,9 +298,7 @@ ompi_mtl_portals4_flowctl_trigger(void) { int ret; - if (false == ompi_mtl_portals4.flowctl.flowctl_active) { - ompi_mtl_portals4.flowctl.flowctl_active = true; - + if (true == OPAL_ATOMIC_CMPSET_32(&ompi_mtl_portals4.flowctl.flowctl_active, false, true)) { /* send trigger to root */ ret = PtlPut(ompi_mtl_portals4.zero_md_h, 0, diff --git a/ompi/mca/mtl/portals4/mtl_portals4_flowctl.h b/ompi/mca/mtl/portals4/mtl_portals4_flowctl.h index 102659a8a23..7cc634b669b 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_flowctl.h +++ b/ompi/mca/mtl/portals4/mtl_portals4_flowctl.h @@ -34,7 +34,7 @@ OBJ_CLASS_DECLARATION(ompi_mtl_portals4_pending_request_t); struct ompi_mtl_portals4_flowctl_t { - bool flowctl_active; + int32_t flowctl_active; int32_t send_slots; int32_t max_send_slots; From 3ca194f10adeffd04ee8712a118ad4564905f089 Mon Sep 17 00:00:00 2001 From: Pascal Deveze Date: Tue, 26 Jul 2016 08:42:08 +0200 Subject: [PATCH 2/5] mtl-portals4: Store ptl_process_id (from PtlGetPhysId) and display it. --- ompi/mca/mtl/portals4/mtl_portals4.h | 4 ++++ ompi/mca/mtl/portals4/mtl_portals4_component.c | 4 ++++ ompi/mca/mtl/portals4/mtl_portals4_probe.c | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ompi/mca/mtl/portals4/mtl_portals4.h b/ompi/mca/mtl/portals4/mtl_portals4.h index e350d571159..e79000bc9a0 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4.h +++ b/ompi/mca/mtl/portals4/mtl_portals4.h @@ -46,6 +46,10 @@ struct mca_mtl_portals4_module_t { /* Use the logical to physical table to accelerate portals4 adressing: 1 (true) : 0 (false) */ int32_t use_logical; + + /* Process_id */ + ptl_process_t ptl_process_id; + /* Use flow control: 1 (true) : 0 (false) */ int32_t use_flowctl; diff --git a/ompi/mca/mtl/portals4/mtl_portals4_component.c b/ompi/mca/mtl/portals4/mtl_portals4_component.c index 892bb67b7f5..1c0851c3f6a 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_component.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_component.c @@ -350,6 +350,10 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads, goto error; } + ompi_mtl_portals4.ptl_process_id = id; + OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, + "PtlGetPhysId rank=%x nid=%x pid=%x\n", id.rank, id.phys.nid, id.phys.pid)); + OPAL_MODEX_SEND(ret, OPAL_PMIX_GLOBAL, &mca_mtl_portals4_component.mtl_version, &id, sizeof(id)); diff --git a/ompi/mca/mtl/portals4/mtl_portals4_probe.c b/ompi/mca/mtl/portals4/mtl_portals4_probe.c index fbeda2124e7..5f2a991cfe7 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_probe.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_probe.c @@ -32,7 +32,7 @@ completion_fn(ptl_event_t *ev, ompi_mtl_portals4_base_request_t *ptl_base_reques ompi_mtl_portals4_probe_request_t *ptl_request = (ompi_mtl_portals4_probe_request_t*) ptl_base_request; - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "%s:%d: completion_fn: %d %d", __FILE__, __LINE__, ev->type, ev->ni_fail_type); From 9e58b4842ffca12f40f4a14482a7c8df5c4d73f8 Mon Sep 17 00:00:00 2001 From: Pascal Deveze Date: Tue, 26 Jul 2016 08:42:48 +0200 Subject: [PATCH 3/5] mtl-portals4: Correct how the request_status._ucount is set --- ompi/mca/mtl/portals4/mtl_portals4_recv.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv.c b/ompi/mca/mtl/portals4/mtl_portals4_recv.c index 077f1f88661..92b9b4797b8 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv.c @@ -113,6 +113,7 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, ptl_request->hdr_data = ev->hdr_data; #endif + ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) { /* If it's not a short message and we're doing rndv, we only have the first part of the message. Issue the get @@ -142,8 +143,6 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, __FILE__, __LINE__, ret); ptl_request->super.super.ompi_req->req_status.MPI_ERROR = ret; } - ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; - OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "Recv %lu (0x%lx) completed, expected", ptl_request->opcount, ptl_request->hdr_data)); @@ -166,11 +165,7 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, /* set the received length in the status, now that we know excatly how much data was sent. */ - ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; - if (ompi_mtl_portals4.protocol == rndv) { - ptl_request->super.super.ompi_req->req_status._ucount += - ompi_mtl_portals4.eager_limit; - } + ptl_request->super.super.ompi_req->req_status._ucount += ev->mlength; #if OMPI_MTL_PORTALS4_FLOW_CONTROL OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1); @@ -281,10 +276,9 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, ptl_request->super.super.completion_callback(&ptl_request->super.super); } else { - if (ev->mlength > 0) { - /* if rndv or triggered, copy the eager part to the right place */ - memcpy(ptl_request->delivery_ptr, ev->start, ev->mlength); - } + + /* For long messages in the overflow list, ev->mlength = 0 */ + ptl_request->super.super.ompi_req->req_status._ucount = 0; ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength, ((msg_length > ptl_request->delivery_len) ? From 724801b0189e8a9f01a6a03a3bd2ee89b1485e48 Mon Sep 17 00:00:00 2001 From: Pascal Deveze Date: Tue, 26 Jul 2016 08:43:24 +0200 Subject: [PATCH 4/5] mtl-portals4: Introduce a "short_limit" for the short message size. "eager_limit" will only be used for the limit of the eager part of the messages sent with the rndv protocol --- ompi/mca/mtl/portals4/mtl_portals4.h | 2 ++ .../mca/mtl/portals4/mtl_portals4_component.c | 15 ++++++++++++ ompi/mca/mtl/portals4/mtl_portals4_recv.c | 24 +++++++++---------- .../mtl/portals4/mtl_portals4_recv_short.c | 2 +- ompi/mca/mtl/portals4/mtl_portals4_request.h | 1 + ompi/mca/mtl/portals4/mtl_portals4_send.c | 17 ++++++++----- 6 files changed, 42 insertions(+), 19 deletions(-) diff --git a/ompi/mca/mtl/portals4/mtl_portals4.h b/ompi/mca/mtl/portals4/mtl_portals4.h index e79000bc9a0..b8f312804c9 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4.h +++ b/ompi/mca/mtl/portals4/mtl_portals4.h @@ -53,6 +53,8 @@ struct mca_mtl_portals4_module_t { /* Use flow control: 1 (true) : 0 (false) */ int32_t use_flowctl; + /** Short limit; Size limit for short messages */ + uint64_t short_limit; /** Eager limit; messages greater than this use a rendezvous protocol */ uint64_t eager_limit; /** Size of short message blocks */ diff --git a/ompi/mca/mtl/portals4/mtl_portals4_component.c b/ompi/mca/mtl/portals4/mtl_portals4_component.c index 1c0851c3f6a..eef04c55dc5 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_component.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_component.c @@ -100,6 +100,18 @@ ompi_mtl_portals4_component_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, ¶m_priority); + ompi_mtl_portals4.short_limit = 2 * 1024; + (void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version, + "short_limit", + "Size limit for short messages", + MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, + NULL, + 0, + 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mtl_portals4.short_limit); + ompi_mtl_portals4.eager_limit = 2 * 1024; (void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version, @@ -196,6 +208,9 @@ ompi_mtl_portals4_component_open(void) "no" #endif ); + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "Short limit: %d", (int) + ompi_mtl_portals4.short_limit); opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "Eager limit: %d", (int) ompi_mtl_portals4.eager_limit); diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv.c b/ompi/mca/mtl/portals4/mtl_portals4_recv.c index 92b9b4797b8..ca7f506bde8 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv.c @@ -114,16 +114,16 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, #endif ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; - if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv) { - /* If it's not a short message and we're doing rndv, we + if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv && msg_length != ev->mlength) { + /* If it's not a short message and we're doing rndv and the message is not complete, we only have the first part of the message. Issue the get to pull the second part of the message. */ - ret = read_msg((char*) ptl_request->delivery_ptr + ompi_mtl_portals4.eager_limit, + ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength, ((msg_length > ptl_request->delivery_len) ? - ptl_request->delivery_len : msg_length) - ompi_mtl_portals4.eager_limit, + ptl_request->delivery_len : msg_length) - ev->mlength, ev->initiator, ev->hdr_data, - ompi_mtl_portals4.eager_limit, + ev->mlength, ptl_request); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); @@ -164,7 +164,7 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, } /* set the received length in the status, now that we know - excatly how much data was sent. */ + exactly how much data was sent. */ ptl_request->super.super.ompi_req->req_status._ucount += ev->mlength; #if OMPI_MTL_PORTALS4_FLOW_CONTROL @@ -280,12 +280,12 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, /* For long messages in the overflow list, ev->mlength = 0 */ ptl_request->super.super.ompi_req->req_status._ucount = 0; - ret = read_msg((char*) ptl_request->delivery_ptr + ev->mlength, - ((msg_length > ptl_request->delivery_len) ? - ptl_request->delivery_len : msg_length) - ev->mlength, + ret = read_msg((char*) ptl_request->delivery_ptr, + (msg_length > ptl_request->delivery_len) ? + ptl_request->delivery_len : msg_length, ev->initiator, ev->hdr_data, - ev->mlength, + 0, ptl_request); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); @@ -383,7 +383,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, PTL_ME_OP_PUT | PTL_ME_USE_ONCE | PTL_ME_EVENT_UNLINK_DISABLE; - if (length <= ompi_mtl_portals4.eager_limit) { + if (length <= ompi_mtl_portals4.short_limit) { me.options |= PTL_ME_EVENT_LINK_DISABLE; } me.match_id = remote_proc; @@ -407,7 +407,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, /* if a long message, spin until we either have a comm event or a link event, guaranteeing progress for long unexpected messages. */ - if (length > ompi_mtl_portals4.eager_limit) { + if (length > ompi_mtl_portals4.short_limit) { while (true != ptl_request->req_started) { ompi_mtl_portals4_progress(); } diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv_short.c b/ompi/mca/mtl/portals4/mtl_portals4_recv_short.c index 936b92e1ec5..23cd049022b 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv_short.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv_short.c @@ -191,7 +191,7 @@ ompi_mtl_portals4_activate_block(ompi_mtl_portals4_recv_short_block_t *block) me.start = block->start; me.length = ompi_mtl_portals4.recv_short_size; me.ct_handle = PTL_CT_NONE; - me.min_free = ompi_mtl_portals4.eager_limit; + me.min_free = ompi_mtl_portals4.short_limit; me.uid = ompi_mtl_portals4.uid; me.options = PTL_ME_OP_PUT | diff --git a/ompi/mca/mtl/portals4/mtl_portals4_request.h b/ompi/mca/mtl/portals4/mtl_portals4_request.h index f76846115cb..eb814dafa7f 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_request.h +++ b/ompi/mca/mtl/portals4/mtl_portals4_request.h @@ -52,6 +52,7 @@ struct ompi_mtl_portals4_isend_request_t { #if OMPI_MTL_PORTALS4_FLOW_CONTROL struct ompi_mtl_portals4_pending_request_t *pending; #endif + ptl_size_t length; uint32_t event_count; }; typedef struct ompi_mtl_portals4_isend_request_t ompi_mtl_portals4_isend_request_t; diff --git a/ompi/mca/mtl/portals4/mtl_portals4_send.c b/ompi/mca/mtl/portals4/mtl_portals4_send.c index 4ee2e775322..d6b39a994b2 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_send.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_send.c @@ -91,9 +91,11 @@ ompi_mtl_portals4_callback(ptl_event_t *ev, if ((PTL_EVENT_ACK == ev->type) && (PTL_PRIORITY_LIST == ev->ptl_list) && - (eager == ompi_mtl_portals4.protocol) && + (ev->mlength == ptl_request->length) && (!PtlHandleIsEqual(ptl_request->me_h, PTL_INVALID_HANDLE))) { - /* long expected messages with the eager protocol won't see a + /* long expected messages with the eager protocol + (and also with the rndv protocol if the length + is less or egal to eager_limit) won't see a get event to complete the message. Give them an extra count to cause the message to complete with just the SEND and ACK events and remove the ME. (we wait for the counter @@ -307,8 +309,10 @@ ompi_mtl_portals4_long_isend(void *start, size_t length, int contextid, int tag, "Send %lu long send with hdr_data 0x%lx (0x%lx)", ptl_request->opcount, hdr_data, match_bits)); - put_length = (rndv == ompi_mtl_portals4.protocol) ? - (ptl_size_t) ompi_mtl_portals4.eager_limit : (ptl_size_t) length; + if ((rndv == ompi_mtl_portals4.protocol) && ((ptl_size_t) length > (ptl_size_t) ompi_mtl_portals4.eager_limit)) + put_length = (ptl_size_t) ompi_mtl_portals4.eager_limit; + else put_length = (ptl_size_t) length; + ret = PtlPut(ompi_mtl_portals4.send_md_h, (ptl_size_t) start, @@ -355,7 +359,7 @@ ompi_mtl_portals4_pending_list_progress() } pending = (ompi_mtl_portals4_pending_request_t*) item; - if (pending->length <= ompi_mtl_portals4.eager_limit) { + if (pending->length <= ompi_mtl_portals4.short_limit) { ret = ompi_mtl_portals4_short_isend(pending->mode, pending->start, pending->length, @@ -414,6 +418,7 @@ ompi_mtl_portals4_send_start(struct mca_mtl_base_module_t* mtl, ptl_request->opcount = OPAL_THREAD_ADD64((int64_t*)&ompi_mtl_portals4.opcount, 1); ptl_request->buffer_ptr = (free_after) ? start : NULL; + ptl_request->length = length; ptl_request->event_count = 0; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, @@ -461,7 +466,7 @@ ompi_mtl_portals4_send_start(struct mca_mtl_base_module_t* mtl, return OMPI_SUCCESS; } #endif - if (length <= ompi_mtl_portals4.eager_limit) { + if (length <= ompi_mtl_portals4.short_limit) { ret = ompi_mtl_portals4_short_isend(mode, start, length, From 10763f5abc9464dcd124b3059ea4f840552e97b4 Mon Sep 17 00:00:00 2001 From: Pascal Deveze Date: Tue, 26 Jul 2016 08:44:07 +0200 Subject: [PATCH 5/5] mtl/portals4: Take into account the limitation of portals4 (max_msg_size) and split messages if necessary --- ompi/mca/mtl/portals4/mtl_portals4.h | 2 + .../mca/mtl/portals4/mtl_portals4_component.c | 21 +++++ ompi/mca/mtl/portals4/mtl_portals4_recv.c | 59 ++++++++---- ompi/mca/mtl/portals4/mtl_portals4_request.h | 2 + ompi/mca/mtl/portals4/mtl_portals4_send.c | 90 +++++++++++++++++-- 5 files changed, 149 insertions(+), 25 deletions(-) diff --git a/ompi/mca/mtl/portals4/mtl_portals4.h b/ompi/mca/mtl/portals4/mtl_portals4.h index b8f312804c9..82975f6219d 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4.h +++ b/ompi/mca/mtl/portals4/mtl_portals4.h @@ -73,6 +73,8 @@ struct mca_mtl_portals4_module_t { /** Network interface handle for matched interface */ ptl_handle_ni_t ni_h; + /** Limit given by portals after NIInit */ + uint64_t max_msg_size_mtl; /** Uid for current user */ ptl_uid_t uid; diff --git a/ompi/mca/mtl/portals4/mtl_portals4_component.c b/ompi/mca/mtl/portals4/mtl_portals4_component.c index eef04c55dc5..3509efa03be 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_component.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_component.c @@ -185,6 +185,19 @@ ompi_mtl_portals4_component_register(void) OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &ompi_mtl_portals4.protocol); + + ompi_mtl_portals4.max_msg_size_mtl = PTL_SIZE_MAX; + (void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version, + "max_msg_size", + "Max size supported by portals4 (above that, a message is cut into messages less than that size)", + MCA_BASE_VAR_TYPE_UNSIGNED_LONG, + NULL, + 0, + 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mtl_portals4.max_msg_size_mtl); + OBJ_RELEASE(new_enum); if (0 > ret) { return OMPI_ERR_NOT_SUPPORTED; @@ -208,6 +221,9 @@ ompi_mtl_portals4_component_open(void) "no" #endif ); + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "Max message size: %lu", (unsigned long) + ompi_mtl_portals4.max_msg_size_mtl); opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "Short limit: %d", (int) ompi_mtl_portals4.short_limit); @@ -329,6 +345,11 @@ ompi_mtl_portals4_component_init(bool enable_progress_threads, goto error; } + if (actual_limits.max_msg_size < ompi_mtl_portals4.max_msg_size_mtl) + ompi_mtl_portals4.max_msg_size_mtl = actual_limits.max_msg_size; + OPAL_OUTPUT_VERBOSE((10, ompi_mtl_base_framework.framework_output, + "Due to portals4 and user configuration messages will not go over the size of %lu", ompi_mtl_portals4.max_msg_size_mtl)); + if (ompi_comm_rank(MPI_COMM_WORLD) == 0) { opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "max_entries=%d", actual_limits.max_entries); opal_output_verbose(10, ompi_mtl_base_framework.framework_output, "max_unexpected_headers=%d", actual_limits.max_unexpected_headers); diff --git a/ompi/mca/mtl/portals4/mtl_portals4_recv.c b/ompi/mca/mtl/portals4/mtl_portals4_recv.c index ca7f506bde8..387aa53be02 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_recv.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_recv.c @@ -39,7 +39,9 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target, ptl_match_bits_t match_bits, ptl_size_t remote_offset, ompi_mtl_portals4_recv_request_t *request) { - int ret; + int ret, i; + ptl_size_t rest = length, asked = 0, frag_size; + int32_t pending_reply; #if OMPI_MTL_PORTALS4_FLOW_CONTROL while (OPAL_UNLIKELY(OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, -1) < 0)) { @@ -48,19 +50,29 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target, } #endif - ret = PtlGet(ompi_mtl_portals4.send_md_h, - (ptl_size_t) start, - length, - target, - ompi_mtl_portals4.read_idx, - match_bits, - remote_offset, - request); - if (OPAL_UNLIKELY(PTL_OK != ret)) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: PtlGet failed: %d", - __FILE__, __LINE__, ret); - return OMPI_ERR_OUT_OF_RESOURCE; + request->pending_reply = (length + ompi_mtl_portals4.max_msg_size_mtl - 1) / ompi_mtl_portals4.max_msg_size_mtl; + pending_reply = request->pending_reply; + + for (i = 0 ; i < pending_reply ; i++) { + OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d) send", + i + 1, pending_reply)); + frag_size = (OPAL_UNLIKELY(rest > ompi_mtl_portals4.max_msg_size_mtl)) ? ompi_mtl_portals4.max_msg_size_mtl : rest; + ret = PtlGet(ompi_mtl_portals4.send_md_h, + (ptl_size_t) start + i * ompi_mtl_portals4.max_msg_size_mtl, + frag_size, + target, + ompi_mtl_portals4.read_idx, + match_bits, + remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl, + request); + if (OPAL_UNLIKELY(PTL_OK != ret)) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: PtlGet failed: %d", + __FILE__, __LINE__, ret); + return OMPI_ERR_OUT_OF_RESOURCE; + } + rest -= frag_size; + asked += frag_size; } return OMPI_SUCCESS; @@ -109,12 +121,16 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, ptl_request->super.super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE; } + if (ev->mlength < msg_length) + OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "Truncated message, some PtlGet are required (protocol = %d)", + ompi_mtl_portals4.protocol)); + #if OPAL_ENABLE_DEBUG ptl_request->hdr_data = ev->hdr_data; #endif ptl_request->super.super.ompi_req->req_status._ucount = ev->mlength; - if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && ompi_mtl_portals4.protocol == rndv && msg_length != ev->mlength) { + if (!MTL_PORTALS4_IS_SHORT_MSG(ev->match_bits) && msg_length > ev->mlength) { /* If it's not a short message and we're doing rndv and the message is not complete, we only have the first part of the message. Issue the get to pull the second part of the message. */ @@ -129,7 +145,6 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, if (NULL != ptl_request->buffer_ptr) free(ptl_request->buffer_ptr); goto callback_error; } - } else { /* If we're either using the eager protocol or were a short message, all data has been received, so complete @@ -167,6 +182,12 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, exactly how much data was sent. */ ptl_request->super.super.ompi_req->req_status._ucount += ev->mlength; + ret = OPAL_THREAD_ADD32(&(ptl_request->pending_reply), -1); + if (ret > 0) { + return OMPI_SUCCESS; + } + assert(ptl_request->pending_reply == 0); + #if OMPI_MTL_PORTALS4_FLOW_CONTROL OPAL_THREAD_ADD32(&ompi_mtl_portals4.flowctl.send_slots, 1); #endif @@ -187,8 +208,8 @@ ompi_mtl_portals4_recv_progress(ptl_event_t *ev, } OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, - "Recv %lu (0x%lx) completed, reply", - ptl_request->opcount, ptl_request->hdr_data)); + "Recv %lu (0x%lx) completed , reply (pending_reply: %d)", + ptl_request->opcount, ptl_request->hdr_data, ptl_request->pending_reply)); ptl_request->super.super.completion_callback(&ptl_request->super.super); break; @@ -367,6 +388,7 @@ ompi_mtl_portals4_irecv(struct mca_mtl_base_module_t* mtl, ptl_request->delivery_len = length; ptl_request->req_started = false; ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS; + ptl_request->pending_reply = 0; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "Recv %lu from %x,%x of length %ld (0x%lx, 0x%lx, 0x%lx)\n", @@ -448,6 +470,7 @@ ompi_mtl_portals4_imrecv(struct mca_mtl_base_module_t* mtl, ptl_request->delivery_ptr = start; ptl_request->delivery_len = length; ptl_request->super.super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS; + ptl_request->pending_reply = 0; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "Mrecv %lu of length %ld (0x%lx)\n", diff --git a/ompi/mca/mtl/portals4/mtl_portals4_request.h b/ompi/mca/mtl/portals4/mtl_portals4_request.h index eb814dafa7f..e187bce765e 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_request.h +++ b/ompi/mca/mtl/portals4/mtl_portals4_request.h @@ -53,6 +53,7 @@ struct ompi_mtl_portals4_isend_request_t { struct ompi_mtl_portals4_pending_request_t *pending; #endif ptl_size_t length; + int32_t pending_get; uint32_t event_count; }; typedef struct ompi_mtl_portals4_isend_request_t ompi_mtl_portals4_isend_request_t; @@ -74,6 +75,7 @@ struct ompi_mtl_portals4_recv_request_t { void *delivery_ptr; size_t delivery_len; volatile bool req_started; + int32_t pending_reply; #if OPAL_ENABLE_DEBUG uint64_t opcount; ptl_hdr_data_t hdr_data; diff --git a/ompi/mca/mtl/portals4/mtl_portals4_send.c b/ompi/mca/mtl/portals4/mtl_portals4_send.c index d6b39a994b2..6393b9a465b 100644 --- a/ompi/mca/mtl/portals4/mtl_portals4_send.c +++ b/ompi/mca/mtl/portals4/mtl_portals4_send.c @@ -44,6 +44,29 @@ ompi_mtl_portals4_callback(ptl_event_t *ev, ompi_mtl_portals4_isend_request_t* ptl_request = (ompi_mtl_portals4_isend_request_t*) ptl_base_request; + if (PTL_EVENT_GET == ev->type) { + ret = OPAL_THREAD_ADD32(&(ptl_request->pending_get), -1); + if (ret > 0) { + /* wait for other gets */ + OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "PTL_EVENT_GET received now pending_get=%d",ret)); + return retval; + } + assert(ptl_request->pending_get == 0); + + /* last get received */ + OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "PTL_EVENT_GET: PtlMEUnlink is called ptl_request->me_h=%d (pending get=%d)", ptl_request->me_h, ret)); + + if (!PtlHandleIsEqual(ptl_request->me_h, PTL_INVALID_HANDLE)) { + ret = PtlMEUnlink(ptl_request->me_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: send callback PtlMEUnlink returned %d", + __FILE__, __LINE__, ret); + } + ptl_request->me_h = PTL_INVALID_HANDLE; + } + } + #if OMPI_MTL_PORTALS4_FLOW_CONTROL if (OPAL_UNLIKELY(ev->ni_fail_type == PTL_NI_PT_DISABLED)) { ompi_mtl_portals4_pending_request_t *pending = @@ -66,6 +89,7 @@ ompi_mtl_portals4_callback(ptl_event_t *ev, "%s:%d: send callback PtlMEUnlink returned %d", __FILE__, __LINE__, ret); } + ptl_request->me_h = PTL_INVALID_HANDLE; } opal_list_append(&ompi_mtl_portals4.flowctl.pending_sends, @@ -89,6 +113,33 @@ ompi_mtl_portals4_callback(ptl_event_t *ev, "send %lu got event of type %d", ptl_request->opcount, ev->type)); + /* First put achieved successfully (In the Priority List), so it may be necessary to decrement the number of pending get + * If the protocol is eager, just decrement pending_get + * Else (the protocol is rndv), decrement pending_get only if length % max_msg_size <= eager_limit + * (This is the case where the eager part allows to save one get) + */ + if ((PTL_EVENT_ACK == ev->type) && + (PTL_PRIORITY_LIST == ev->ptl_list) && + (0 < ptl_request->pending_get)) { + + if ((eager == ompi_mtl_portals4.protocol) || + (ptl_request->length % ompi_mtl_portals4.max_msg_size_mtl <= ompi_mtl_portals4.eager_limit)) { + val = OPAL_THREAD_ADD32(&(ptl_request->pending_get), -1); + } + if (0 == val) { + add = 2; /* We haven't to wait for any get, so we have to add an extra count to cause the message to complete */ + if (!PtlHandleIsEqual(ptl_request->me_h, PTL_INVALID_HANDLE)) { + ret = PtlMEUnlink(ptl_request->me_h); + if (PTL_OK != ret) { + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: send callback PtlMEUnlink returned %d", + __FILE__, __LINE__, ret); + } + ptl_request->me_h = PTL_INVALID_HANDLE; + } + } + } + if ((PTL_EVENT_ACK == ev->type) && (PTL_PRIORITY_LIST == ev->ptl_list) && (ev->mlength == ptl_request->length) && @@ -107,10 +158,10 @@ ompi_mtl_portals4_callback(ptl_event_t *ev, "%s:%d: send callback PtlMEUnlink returned %d", __FILE__, __LINE__, ret); } + ptl_request->me_h = PTL_INVALID_HANDLE; add++; } val = OPAL_THREAD_ADD32((int32_t*)&ptl_request->event_count, add); - assert(val <= 3); if (val == 3) { @@ -193,6 +244,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode, MTL_PORTALS4_SET_HDR_DATA(hdr_data, ptl_request->opcount, length, (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) ? 1 : 0); + ptl_request->me_h = PTL_INVALID_HANDLE; if (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) { me.start = NULL; @@ -219,6 +271,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode, opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: PtlMEAppend failed: %d", __FILE__, __LINE__, ret); + ptl_request->me_h = PTL_INVALID_HANDLE; return ompi_mtl_portals4_get_error(ret); } @@ -227,7 +280,6 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode, ptl_request->opcount, hdr_data, match_bits)); } else { ptl_request->event_count = 1; - ptl_request->me_h = PTL_INVALID_HANDLE; OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output, "Send %lu short send with hdr_data 0x%lx (0x%lx)", @@ -238,6 +290,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode, "Send %lu, start: %p", ptl_request->opcount, start)); + ptl_request->pending_get = 0; ret = PtlPut(ompi_mtl_portals4.send_md_h, (ptl_size_t) start, length, @@ -254,6 +307,7 @@ ompi_mtl_portals4_short_isend(mca_pml_base_send_mode_t mode, __FILE__, __LINE__, ret); if (MCA_PML_BASE_SEND_SYNCHRONOUS == mode) { PtlMEUnlink(ptl_request->me_h); + ptl_request->me_h = PTL_INVALID_HANDLE; } return ompi_mtl_portals4_get_error(ret); } @@ -285,7 +339,6 @@ ompi_mtl_portals4_long_isend(void *start, size_t length, int contextid, int tag, me.uid = ompi_mtl_portals4.uid; me.options = PTL_ME_OP_GET | - PTL_ME_USE_ONCE | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE; me.match_id = ptl_proc; @@ -309,10 +362,32 @@ ompi_mtl_portals4_long_isend(void *start, size_t length, int contextid, int tag, "Send %lu long send with hdr_data 0x%lx (0x%lx)", ptl_request->opcount, hdr_data, match_bits)); - if ((rndv == ompi_mtl_portals4.protocol) && ((ptl_size_t) length > (ptl_size_t) ompi_mtl_portals4.eager_limit)) - put_length = (ptl_size_t) ompi_mtl_portals4.eager_limit; - else put_length = (ptl_size_t) length; + if (rndv == ompi_mtl_portals4.protocol) { + ptl_size_t min = (OPAL_LIKELY (ompi_mtl_portals4.eager_limit < ompi_mtl_portals4.max_msg_size_mtl)) ? + ompi_mtl_portals4.eager_limit : + ompi_mtl_portals4.max_msg_size_mtl; + if ((ptl_size_t) length > (ptl_size_t) min) { + OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, + "msg truncated by %ld", length - min)); + put_length = (ptl_size_t) min; + } + else + put_length = (ptl_size_t) length; + } else { // eager protocol + if (length > ompi_mtl_portals4.max_msg_size_mtl) + put_length = (ptl_size_t) ompi_mtl_portals4.max_msg_size_mtl; + else + put_length = (ptl_size_t) length; + } + + /* We have to wait for some GET events. + If the first put falls in overflow list, the number of GET event is egal to: + (length - 1) / ompi_mtl_portals4.max_msg_size_mtl + 1 + else we will re-calculate this number when we received the first ACK event (with remote overflow list) + */ + ptl_request->pending_get = (length - 1) / ompi_mtl_portals4.max_msg_size_mtl + 1; + OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "pending_get=%d", ptl_request->pending_get)); ret = PtlPut(ompi_mtl_portals4.send_md_h, (ptl_size_t) start, @@ -328,7 +403,8 @@ ompi_mtl_portals4_long_isend(void *start, size_t length, int contextid, int tag, opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: PtlPut failed: %d", __FILE__, __LINE__, ret); - PtlMEUnlink(ptl_request->me_h); + PtlMEUnlink(ptl_request->me_h); + ptl_request->me_h = PTL_INVALID_HANDLE; return ompi_mtl_portals4_get_error(ret); }