From 55022b428330d0b94b221e3312fdfebb5fab3ea9 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 23 Mar 2017 02:53:21 -0700 Subject: [PATCH 1/2] If we lose connection to the server after initiating a send/recv in PMIx (e.g., in PMIx_Abort), then we need to "resolve" all pending recvs to avoid hanging. Fixes #3225 Signed-off-by: Ralph Castain (cherry picked from commit 55e4fba5f5633d08b283c247301c17926c4f5b6b) --- .../pmix/src/mca/ptl/base/ptl_base_sendrecv.c | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c index b51c7902051..88f9bca94a1 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c +++ b/opal/mca/pmix/pmix2x/pmix/src/mca/ptl/base/ptl_base_sendrecv.c @@ -62,6 +62,9 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err) pmix_regevents_info_t *reginfoptr, *regnext; pmix_peer_events_info_t *pr, *pnext; pmix_rank_info_t *info, *pinfo; + pmix_ptl_posted_recv_t *rcv; + pmix_buffer_t buf; + pmix_ptl_hdr_t hdr; /* stop all events */ if (peer->recv_ev_active) { @@ -143,6 +146,23 @@ static void lost_connection(pmix_peer_t *peer, pmix_status_t err) pmix_globals.connected = false; /* set the public error status */ err = PMIX_ERR_LOST_CONNECTION_TO_SERVER; + /* it is possible that we have sendrecv's in progress where + * we are waiting for a response to arrive. Since we have + * lost connection to the server, that will never happen. + * Thus, to preclude any chance of hanging, cycle thru + * the list of posted recvs and complete any that are + * the return call from a sendrecv - i.e., any that are + * waiting on dynamic tags */ + PMIX_CONSTRUCT(&buf, pmix_buffer_t); + hdr.nbytes = 0; // initialize the hdr to something safe + PMIX_LIST_FOREACH(rcv, &pmix_ptl_globals.posted_recvs, pmix_ptl_posted_recv_t) { + if (UINT_MAX != rcv->tag && NULL != rcv->cbfunc) { + /* construct and load the buffer */ + hdr.tag = rcv->tag; + rcv->cbfunc(pmix_globals.mypeer, &hdr, &buf, rcv->cbdata); + } + } + PMIX_DESTRUCT(&buf); } PMIX_REPORT_EVENT(err, _notify_complete); } From a358c7b8d7722b2be6105fbed4c835bc8d2c4d25 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 28 Mar 2017 17:21:47 -0700 Subject: [PATCH 2/2] Use the correct callback data - the callback function was expecting a bool*, not a pmix_ptl_sr_t*. Signed-off-by: Ralph Castain (cherry picked from commit 7dd34d0c9a58c50f47295d866e675dcda91a5e07) --- .../pmix/pmix2x/pmix/src/client/pmix_client.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c index b9d133ee507..a1b9546bedb 100644 --- a/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c +++ b/opal/mca/pmix/pmix2x/pmix/src/client/pmix_client.c @@ -492,7 +492,7 @@ PMIX_EXPORT pmix_status_t PMIx_Abort(int flag, const char msg[], pmix_buffer_t *bfr; pmix_cmd_t cmd = PMIX_ABORT_CMD; pmix_status_t rc; - pmix_ptl_sr_t cb; + volatile bool active; pmix_output_verbose(2, pmix_globals.debug_output, "pmix:client abort called"); @@ -541,23 +541,15 @@ PMIX_EXPORT pmix_status_t PMIx_Abort(int flag, const char msg[], } } - /* create a callback object as we need to pass it to the - * recv routine so we know which callback to use when - * the return message is recvd */ - PMIX_CONSTRUCT(&cb, pmix_ptl_sr_t); - cb.active = true; - cb.cbfunc = wait_cbfunc; - /* send to the server */ + active = true; if (PMIX_SUCCESS != (rc = pmix_ptl.send_recv(&pmix_client_globals.myserver, bfr, - wait_cbfunc, &cb))){ - PMIX_DESTRUCT(&cb); + wait_cbfunc, (void*)&active))){ return rc; } /* wait for the release */ - PMIX_WAIT_FOR_COMPLETION(cb.active); - PMIX_DESTRUCT(&cb); + PMIX_WAIT_FOR_COMPLETION(active); return PMIX_SUCCESS; }