Skip to content

Commit fe113e2

Browse files
committed
mtl/ofi: do not access request object after completion callback
The request completion callback function can potentially invalidate the request object, so we must avoid accessing the object after the callback returns.
Signed-off-by: Wenduo Wang <[email protected]>
(cherry picked from commit 6d79aae)
1 parent 89c0dde commit fe113e2

File tree

1 file changed

+15
-9
lines changed

1 file changed

+15
-9
lines changed

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 15 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ opal_mutex_atomic_unlock(&ompi_mtl_ofi.ofi_ctxt[ctxt_id].context_lock)
131131
__opal_attribute_always_inline__ static inline int
132132
ompi_mtl_ofi_context_progress(int ctxt_id)
133133
{
134-
int count = 0, i, events_read;
134+
int count = 0, i, events_read, req_type = -1;
135135
ompi_mtl_ofi_request_t *ofi_req = NULL;
136136
struct fi_cq_err_entry error = { 0 };
137137
ssize_t ret;
@@ -151,12 +151,13 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
151151
if (NULL != ompi_mtl_ofi_wc[i].op_context) {
152152
ofi_req = TO_OFI_REQ(ompi_mtl_ofi_wc[i].op_context);
153153
assert(ofi_req);
154+
req_type = ofi_req->type;
154155
ret = ofi_req->event_callback(&ompi_mtl_ofi_wc[i], ofi_req);
155156
if (OMPI_SUCCESS != ret) {
156157
opal_output(0,
157158
"%s:%d: Error returned by request (type: %d) event callback: %zd.\n"
158159
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
159-
__FILE__, __LINE__, ofi_req->type, ret);
160+
__FILE__, __LINE__, req_type, ret);
160161
fflush(stderr);
161162
exit(1);
162163
}
@@ -192,11 +193,13 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
192193
assert(error.op_context);
193194
ofi_req = TO_OFI_REQ(error.op_context);
194195
assert(ofi_req);
196+
req_type = ofi_req->type;
195197
ret = ofi_req->error_callback(&error, ofi_req);
196198
if (OMPI_SUCCESS != ret) {
197-
opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n"
198-
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
199-
__FILE__, __LINE__, ret);
199+
opal_output(0,
200+
"%s:%d: Error returned by request (type: %d) error callback: %zd.\n"
201+
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
202+
__FILE__, __LINE__, req_type, ret);
200203
fflush(stderr);
201204
exit(1);
202205
}
@@ -1255,7 +1258,7 @@ __opal_attribute_always_inline__ static inline int
12551258
ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
12561259
ompi_mtl_ofi_request_t *ofi_req)
12571260
{
1258-
int ompi_ret;
1261+
int ompi_ret = OMPI_SUCCESS;
12591262
int src = mtl_ofi_get_source(wc);
12601263
ompi_status_public_t *status = NULL;
12611264

@@ -1315,9 +1318,11 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
13151318
}
13161319
}
13171320

1321+
ompi_ret = status->MPI_ERROR;
1322+
13181323
ofi_req->super.completion_callback(&ofi_req->super);
13191324

1320-
return status->MPI_ERROR;
1325+
return ompi_ret;
13211326
}
13221327

13231328
/**
@@ -1457,13 +1462,13 @@ __opal_attribute_always_inline__ static inline int
14571462
ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc,
14581463
ompi_mtl_ofi_request_t *ofi_req)
14591464
{
1465+
int ompi_ret = OMPI_SUCCESS;
14601466
struct mca_mtl_request_t *mrecv_req = ofi_req->mrecv_req;
14611467
ompi_status_public_t *status = &mrecv_req->ompi_req->req_status;
14621468
status->MPI_SOURCE = mtl_ofi_get_source(wc);
14631469
status->MPI_TAG = MTL_OFI_GET_TAG(wc->tag);
14641470
status->MPI_ERROR = MPI_SUCCESS;
14651471
status->_ucount = wc->len;
1466-
int ompi_ret;
14671472

14681473
ompi_mtl_ofi_deregister_and_free_buffer(ofi_req);
14691474

@@ -1478,11 +1483,12 @@ ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc,
14781483
}
14791484
}
14801485

1486+
ompi_ret = status->MPI_ERROR;
14811487
free(ofi_req);
14821488

14831489
mrecv_req->completion_callback(mrecv_req);
14841490

1485-
return status->MPI_ERROR;
1491+
return ompi_ret;
14861492
}
14871493

14881494
/**

0 commit comments

Comments
 (0)