diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c index b9d133ee507..a1b9546bedb 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c +++ b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c @@ -492,7 +492,7 @@ PMIX_EXPORT pmix_status_t PMIx_Abort(int flag, const char msg[], pmix_buffer_t *bfr; pmix_cmd_t cmd = PMIX_ABORT_CMD; pmix_status_t rc; - pmix_ptl_sr_t cb; + volatile bool active; pmix_output_verbose(2, pmix_globals.debug_output, "pmix:client abort called"); @@ -541,23 +541,15 @@ PMIX_EXPORT pmix_status_t PMIx_Abort(int flag, const char msg[], } } - /* create a callback object as we need to pass it to the - * recv routine so we know which callback to use when - * the return message is recvd */ - PMIX_CONSTRUCT(&cb, pmix_ptl_sr_t); - cb.active = true; - cb.cbfunc = wait_cbfunc; - /* send to the server */ + active = true; if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, bfr, - wait_cbfunc, &cb))){ - PMIX_DESTRUCT(&cb); + wait_cbfunc, (void*)&active))){ return rc; } /* wait for the release */ - PMIX_WAIT_FOR_COMPLETION(cb.active); - PMIX_DESTRUCT(&cb); + PMIX_WAIT_FOR_COMPLETION(active); return PMIX_SUCCESS; } diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c index b51c7902051..88f9bca94a1 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c @@ -62,6 +62,9 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err) pmix_regevents_info_t *reginfoptr, *regnext; pmix_peer_events_info_t *pr, *pnext; pmix_rank_info_t *info, *pinfo; + pmix_ptl_posted_recv_t *rcv; + pmix_buffer_t buf; + pmix_ptl_hdr_t hdr; /* stop all events */ if (peer->recv_ev_active) { @@ -143,6 +146,23 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err) pmix_globals.connected = false; /* set the public error status */ err = PMIX_ERR_LOST_CONNECTION_TO_SERVER; + /* it is possible that we have sendrecv's in progress where + * we are waiting for a response to arrive. Since we have + * lost connection to the server, that will never happen. + * Thus, to preclude any chance of hanging, cycle thru + * the list of posted recvs and complete any that are + * the return call from a sendrecv - i.e., any that are + * waiting on dynamic tags */ + PMIX_CONSTRUCT(&buf, pmix_buffer_t); + hdr.nbytes = 0; // initialize the hdr to something safe + PMIX_LIST_FOREACH(rcv, &pmix_ptl_globals.posted_recvs, pmix_ptl_posted_recv_t) { + if (UINT_MAX != rcv->tag && NULL != rcv->cbfunc) { + /* construct and load the buffer */ + hdr.tag = rcv->tag; + rcv->cbfunc(pmix_globals.mypeer, &hdr, &buf, rcv->cbdata); + } + } + PMIX_DESTRUCT(&buf); } PMIX_REPORT_EVENT(err, _notify_complete); }