Skip to content

Commit 84ad6b7

Browse files
authored
Merge pull request #12218 from wenduwan/v5.0.x_backport_pr12175
[v5.0.x] backport pr12175
2 parents (5939b25 + fe113e2), commit 84ad6b7

File tree

1 file changed

21 additions and 12 deletions (+21 / -12 lines changed)

1 file changed

21 additions and 12 deletions (+21 / -12 lines changed)

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ opal_mutex_atomic_unlock(&ompi_mtl_ofi.ofi_ctxt[ctxt_id].context_lock)
131131
__opal_attribute_always_inline__ static inline int
132132
ompi_mtl_ofi_context_progress(int ctxt_id)
133133
{
134-
int count = 0, i, events_read;
134+
int count = 0, i, events_read, req_type = -1;
135135
ompi_mtl_ofi_request_t *ofi_req = NULL;
136136
struct fi_cq_err_entry error = { 0 };
137137
ssize_t ret;
@@ -151,11 +151,13 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
151151
if (NULL != ompi_mtl_ofi_wc[i].op_context) {
152152
ofi_req = TO_OFI_REQ(ompi_mtl_ofi_wc[i].op_context);
153153
assert(ofi_req);
154+
req_type = ofi_req->type;
154155
ret = ofi_req->event_callback(&ompi_mtl_ofi_wc[i], ofi_req);
155156
if (OMPI_SUCCESS != ret) {
156-
opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n"
157-
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
158-
__FILE__, __LINE__, ret);
157+
opal_output(0,
158+
"%s:%d: Error returned by request (type: %d) event callback: %zd.\n"
159+
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
160+
__FILE__, __LINE__, req_type, ret);
159161
fflush(stderr);
160162
exit(1);
161163
}
@@ -191,11 +193,13 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
191193
assert(error.op_context);
192194
ofi_req = TO_OFI_REQ(error.op_context);
193195
assert(ofi_req);
196+
req_type = ofi_req->type;
194197
ret = ofi_req->error_callback(&error, ofi_req);
195198
if (OMPI_SUCCESS != ret) {
196-
opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n"
197-
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
198-
__FILE__, __LINE__, ret);
199+
opal_output(0,
200+
"%s:%d: Error returned by request (type: %d) error callback: %zd.\n"
201+
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
202+
__FILE__, __LINE__, req_type, ret);
199203
fflush(stderr);
200204
exit(1);
201205
}
@@ -666,6 +670,7 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
666670
assert(ack_req);
667671

668672
ack_req->parent = ofi_req;
673+
ack_req->type = OMPI_MTL_OFI_ACK;
669674
ack_req->event_callback = ompi_mtl_ofi_send_ack_callback;
670675
ack_req->error_callback = ompi_mtl_ofi_send_ack_error_callback;
671676

@@ -877,6 +882,7 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
877882
/**
878883
* Create a send request, start it and wait until it completes.
879884
*/
885+
ofi_req.type = OMPI_MTL_OFI_SEND;
880886
ofi_req.event_callback = ompi_mtl_ofi_send_callback;
881887
ofi_req.error_callback = ompi_mtl_ofi_send_error_callback;
882888

@@ -1125,6 +1131,7 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
11251131
}
11261132
set_thread_context(ctxt_id);
11271133

1134+
ofi_req->type = OMPI_MTL_OFI_SEND;
11281135
ofi_req->event_callback = ompi_mtl_ofi_isend_callback;
11291136
ofi_req->error_callback = ompi_mtl_ofi_send_error_callback;
11301137

@@ -1137,7 +1144,6 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
11371144
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
11381145
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) return ompi_ret;
11391146

1140-
ofi_req->type = OMPI_MTL_OFI_SEND;
11411147
ofi_req->buffer = (free_after) ? start : NULL;
11421148
ofi_req->length = length;
11431149
ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
@@ -1252,7 +1258,7 @@ __opal_attribute_always_inline__ static inline int
12521258
ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
12531259
ompi_mtl_ofi_request_t *ofi_req)
12541260
{
1255-
int ompi_ret;
1261+
int ompi_ret = OMPI_SUCCESS;
12561262
int src = mtl_ofi_get_source(wc);
12571263
ompi_status_public_t *status = NULL;
12581264

@@ -1312,9 +1318,11 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
13121318
}
13131319
}
13141320

1321+
ompi_ret = status->MPI_ERROR;
1322+
13151323
ofi_req->super.completion_callback(&ofi_req->super);
13161324

1317-
return status->MPI_ERROR;
1325+
return ompi_ret;
13181326
}
13191327

13201328
/**
@@ -1454,13 +1462,13 @@ __opal_attribute_always_inline__ static inline int
14541462
ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc,
14551463
ompi_mtl_ofi_request_t *ofi_req)
14561464
{
1465+
int ompi_ret = OMPI_SUCCESS;
14571466
struct mca_mtl_request_t *mrecv_req = ofi_req->mrecv_req;
14581467
ompi_status_public_t *status = &mrecv_req->ompi_req->req_status;
14591468
status->MPI_SOURCE = mtl_ofi_get_source(wc);
14601469
status->MPI_TAG = MTL_OFI_GET_TAG(wc->tag);
14611470
status->MPI_ERROR = MPI_SUCCESS;
14621471
status->_ucount = wc->len;
1463-
int ompi_ret;
14641472

14651473
ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
14661474

@@ -1475,11 +1483,12 @@ ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc,
14751483
}
14761484
}
14771485

1486+
ompi_ret = status->MPI_ERROR;
14781487
free(ofi_req);
14791488

14801489
mrecv_req->completion_callback(mrecv_req);
14811490

1482-
return status->MPI_ERROR;
1491+
return ompi_ret;
14831492
}
14841493

14851494
/**

0 commit comments

Comments
 (0)