diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index e858a9f12bf..1f555646fe9 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -158,8 +158,7 @@ ompi_mtl_ofi_context_progress(int ctxt_id) "%s:%d: Error returned by request (type: %d) event callback: %zd.\n" "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", __FILE__, __LINE__, req_type, ret); - fflush(stderr); - exit(1); + goto bail; } } } @@ -181,16 +180,23 @@ ompi_mtl_ofi_context_progress(int ctxt_id) * thread fetches the entry while others get -FI_EAGAIN * indicating an empty queue, which is not erroneous. */ - if (ret == -FI_EAGAIN) + if (ret == -FI_EAGAIN) { return count; + } opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n" "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", __FILE__, __LINE__, fi_strerror(-ret), ret); - fflush(stderr); - exit(1); + goto bail; + } + + if (!error.op_context) { + opal_output(0, "%s:%d: Error returned from fi_cq_readerr with null context. " + "Completion flags: %016lx\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, error.flags); + goto bail; } - assert(error.op_context); ofi_req = TO_OFI_REQ(error.op_context); assert(ofi_req); req_type = ofi_req->type; @@ -200,18 +206,20 @@ ompi_mtl_ofi_context_progress(int ctxt_id) "%s:%d: Error returned by request (type: %d) error callback: %zd.\n" "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", __FILE__, __LINE__, req_type, ret); - fflush(stderr); - exit(1); + goto bail; } } else if (ret != -FI_EAGAIN && ret != -EINTR) { opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n" "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", __FILE__, __LINE__, fi_strerror(-ret), ret); - fflush(stderr); - exit(1); + goto bail; } return count; + +bail: + fflush(stderr); + exit(1); } __opal_attribute_always_inline__ static inline int