diff --git a/ompi/mca/coll/portals4/coll_portals4_allreduce.c b/ompi/mca/coll/portals4/coll_portals4_allreduce.c
index 935ce6cd9d3..ec94c428175 100644
--- a/ompi/mca/coll/portals4/coll_portals4_allreduce.c
+++ b/ompi/mca/coll/portals4/coll_portals4_allreduce.c
@@ -343,15 +343,38 @@ allreduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
 static int
 allreduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
 {
+    int ret;
+
     if (request->u.allreduce.is_optim) {
         PtlAtomicSync();
 
         if (request->u.allreduce.child_nb) {
-            PtlCTFree(request->u.allreduce.ack_ct_h);
+            ret = PtlCTFree(request->u.allreduce.ack_ct_h);
+            if (PTL_OK != ret) {
+                opal_output_verbose(1, ompi_coll_base_framework.framework_output,
+                                    "%s:%d: PtlCTFree failed: %d\n",
+                                    __FILE__, __LINE__, ret);
+                return OMPI_ERROR;
+            }
         }
 
-        PtlMEUnlink(request->u.allreduce.data_me_h);
-        PtlCTFree(request->u.allreduce.trig_ct_h);
+        do {
+            ret = PtlMEUnlink(request->u.allreduce.data_me_h);
+        } while (PTL_IN_USE == ret);
+        if (PTL_OK != ret) {
+            opal_output_verbose(1, ompi_coll_base_framework.framework_output,
+                                "%s:%d: PtlMEUnlink failed: %d\n",
+                                __FILE__, __LINE__, ret);
+            return OMPI_ERROR;
+        }
+
+        ret = PtlCTFree(request->u.allreduce.trig_ct_h);
+        if (PTL_OK != ret) {
+            opal_output_verbose(1, ompi_coll_base_framework.framework_output,
+                                "%s:%d: PtlCTFree failed: %d\n",
+                                __FILE__, __LINE__, ret);
+            return OMPI_ERROR;
+        }
     }
 
     return (OMPI_SUCCESS);
diff --git a/ompi/mca/coll/portals4/coll_portals4_barrier.c b/ompi/mca/coll/portals4/coll_portals4_barrier.c
index 9d5c4f3c164..58294dedf4b 100644
--- a/ompi/mca/coll/portals4/coll_portals4_barrier.c
+++ b/ompi/mca/coll/portals4/coll_portals4_barrier.c
@@ -206,7 +206,9 @@ barrier_hypercube_bottom(ompi_coll_portals4_request_t *request)
     int ret;
 
     /* cleanup */
-    ret = PtlMEUnlink(request->u.barrier.data_me_h);
+    do {
+        ret = PtlMEUnlink(request->u.barrier.data_me_h);
+    } while (PTL_IN_USE == ret);
     if (PTL_OK != ret) {
         opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                             "%s:%d: PtlMEUnlink failed: %d\n",
diff --git a/ompi/mca/coll/portals4/coll_portals4_component.c b/ompi/mca/coll/portals4/coll_portals4_component.c
index 1be495861c0..d632340ee26 100644
--- a/ompi/mca/coll/portals4/coll_portals4_component.c
+++ b/ompi/mca/coll/portals4/coll_portals4_component.c
@@ -285,7 +285,9 @@ portals4_close(void)
     mca_coll_portals4_component.data_md_h = PTL_INVALID_HANDLE;
 
     if (!PtlHandleIsEqual(mca_coll_portals4_component.finish_me_h, PTL_INVALID_HANDLE)) {
-        ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
+        do {
+            ret = PtlMEUnlink(mca_coll_portals4_component.finish_me_h);
+        } while (PTL_IN_USE == ret);
         if (PTL_OK != ret) {
             opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                                 "%s:%d: PtlMEUnlink failed: %d\n",
@@ -293,7 +295,9 @@ portals4_close(void)
         }
     }
     if (!PtlHandleIsEqual(mca_coll_portals4_component.unex_me_h, PTL_INVALID_HANDLE)) {
-        ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
+        do {
+            ret = PtlMEUnlink(mca_coll_portals4_component.unex_me_h);
+        } while (PTL_IN_USE == ret);
         if (PTL_OK != ret) {
             opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                                 "%s:%d: PtlMEUnlink failed: %d\n",
diff --git a/ompi/mca/coll/portals4/coll_portals4_gather.c b/ompi/mca/coll/portals4/coll_portals4_gather.c
index 45ff4c07728..5bd60b7613e 100644
--- a/ompi/mca/coll/portals4/coll_portals4_gather.c
+++ b/ompi/mca/coll/portals4/coll_portals4_gather.c
@@ -460,7 +460,9 @@ cleanup_gather_handles(ompi_coll_portals4_request_t *request)
     /**********************************/
     /* Cleanup Gather Handles */
     /**********************************/
-    ret = PtlMEUnlink(request->u.gather.gather_meh);
+    do {
+        ret = PtlMEUnlink(request->u.gather.gather_meh);
+    } while (PTL_IN_USE == ret);
     if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 
     ret = PtlCTFree(request->u.gather.gather_cth);
@@ -484,7 +486,9 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
     /**********************************/
     /* Cleanup Sync Handles */
     /**********************************/
-    ret = PtlMEUnlink(request->u.gather.sync_meh);
+    do {
+        ret = PtlMEUnlink(request->u.gather.sync_meh);
+    } while (PTL_IN_USE == ret);
     if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 
     ret = PtlCTFree(request->u.gather.sync_cth);
diff --git a/ompi/mca/coll/portals4/coll_portals4_reduce.c b/ompi/mca/coll/portals4/coll_portals4_reduce.c
index 1a55a5c3f70..e9b6b159a94 100644
--- a/ompi/mca/coll/portals4/coll_portals4_reduce.c
+++ b/ompi/mca/coll/portals4/coll_portals4_reduce.c
@@ -340,24 +340,38 @@ reduce_kary_tree_top(const void *sendbuf, void *recvbuf, int count,
 static int
 reduce_kary_tree_bottom(ompi_coll_portals4_request_t *request)
 {
+    int ret, line;
+
     if (request->u.reduce.is_optim) {
         PtlAtomicSync();
 
         if (request->u.reduce.use_ack_ct_h) {
-            PtlCTFree(request->u.reduce.ack_ct_h);
+            ret = PtlCTFree(request->u.reduce.ack_ct_h);
+            if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
         }
 
         if (request->u.reduce.child_nb) {
-            PtlMEUnlink(request->u.reduce.data_me_h);
+            do {
+                ret = PtlMEUnlink(request->u.reduce.data_me_h);
+            } while (PTL_IN_USE == ret);
+            if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
         }
 
-        PtlCTFree(request->u.reduce.trig_ct_h);
+        ret = PtlCTFree(request->u.reduce.trig_ct_h);
+        if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 
         if (request->u.reduce.free_buffer) {
             free(request->u.reduce.free_buffer);
         }
     }
     return (OMPI_SUCCESS);
+
+err_hdlr:
+    opal_output(ompi_coll_base_framework.framework_output,
+                "%s:%4d:%4d\tError occurred ret=%d",
+                __FILE__, __LINE__, line, ret);
+
+    return ret;
 }
 
 
diff --git a/ompi/mca/coll/portals4/coll_portals4_scatter.c b/ompi/mca/coll/portals4/coll_portals4_scatter.c
index d1cfbbaa0d2..94262c13598 100644
--- a/ompi/mca/coll/portals4/coll_portals4_scatter.c
+++ b/ompi/mca/coll/portals4/coll_portals4_scatter.c
@@ -253,14 +253,8 @@ cleanup_scatter_handles(ompi_coll_portals4_request_t *request)
     /**********************************/
     do {
         ret = PtlMEUnlink(request->u.scatter.scatter_meh);
-        if (PTL_IN_USE == ret) {
-            opal_output(ompi_coll_base_framework.framework_output,
-                        "%s:%4d: scatter_meh still in use (ret=%d, rank %2d)",
-                        __FILE__, __LINE__, ret, request->u.scatter.my_rank);
-            continue;
-        }
-        if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
-    } while (ret == PTL_IN_USE);
+    } while (PTL_IN_USE == ret);
+    if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 
     ret = PtlCTFree(request->u.scatter.scatter_cth);
     if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
@@ -292,14 +286,8 @@ cleanup_sync_handles(ompi_coll_portals4_request_t *request)
     /**********************************/
     do {
         ret = PtlMEUnlink(request->u.scatter.sync_meh);
-        if (PTL_IN_USE == ret) {
-            opal_output(ompi_coll_base_framework.framework_output,
-                        "%s:%4d: sync_meh still in use (ret=%d, rank %2d)",
-                        __FILE__, __LINE__, ret, request->u.scatter.my_rank);
-            continue;
-        }
-        if (PTL_OK != ret) { ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
-    } while (ret == PTL_IN_USE);
+    } while (PTL_IN_USE == ret);
+    if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }
 
     ret = PtlCTFree(request->u.scatter.sync_cth);
     if (PTL_OK != ret) { ptl_ret = ret; ret = OMPI_ERROR; line = __LINE__; goto err_hdlr; }