Skip to content

Commit 39f8a86

Browse files
committed
mtl/ofi: Fix erroneous FI_PEEK/FI_CLAIM usage
The current iprobe/improbe implementations merely check the return code on the posted receive operation to tell if there is a match or not. This commit moves the check to the probe's error callback instead. Per the semantics defined in libfabric, the peek operation is asynchronous and the results are to be fetched from the completion queue. If no message is found matching the tags specified in the peek request, then a completion queue error entry with err field set to FI_ENOMSG will be available. Signed-off-by: Raghu Raja <[email protected]>
1 parent 9afcb8e commit 39f8a86

File tree

1 file changed

+14
-19
lines changed

1 file changed

+14
-19
lines changed

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -991,10 +991,20 @@ __opal_attribute_always_inline__ static inline int
991991
ompi_mtl_ofi_probe_error_callback(struct fi_cq_err_entry *error,
992992
ompi_mtl_ofi_request_t *ofi_req)
993993
{
994-
ofi_req->status.MPI_ERROR = MPI_ERR_INTERN;
995994
ofi_req->completion_count--;
996995

997-
return OMPI_SUCCESS;
996+
/*
997+
* Receives posted with FI_PEEK and friends will get an error
998+
* completion with FI_ENOMSG. This just indicates the lack of a match for
999+
* the probe and is not an error case. All other error cases are
1000+
* provider-internal errors and should be flagged as such.
1001+
*/
1002+
if (error->err == FI_ENOMSG)
1003+
return OMPI_SUCCESS;
1004+
1005+
ofi_req->status.MPI_ERROR = MPI_ERR_INTERN;
1006+
1007+
return OMPI_ERROR;
9981008
}
9991009

10001010
__opal_attribute_always_inline__ static inline int
@@ -1039,7 +1049,6 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
10391049
/**
10401050
* fi_trecvmsg with FI_PEEK:
10411051
* Initiate a search for a match in the hardware or software queue.
1042-
* The search can complete immediately with -ENOMSG.
10431052
* If successful, libfabric will enqueue a context entry into the completion
10441053
* queue to make the search nonblocking. This code will poll until the
10451054
* entry is enqueued.
@@ -1060,13 +1069,7 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
10601069
ofi_req.match_state = 0;
10611070

10621071
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
1063-
if (-FI_ENOMSG == ret) {
1064-
/**
1065-
* The search request completed but no matching message was found.
1066-
*/
1067-
*flag = 0;
1068-
return OMPI_SUCCESS;
1069-
} else if (OPAL_UNLIKELY(0 > ret)) {
1072+
if (OPAL_UNLIKELY(0 > ret)) {
10701073
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
10711074
return ompi_mtl_ofi_get_error(ret);
10721075
}
@@ -1136,7 +1139,6 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
11361139
/**
11371140
* fi_trecvmsg with FI_PEEK and FI_CLAIM:
11381141
* Initiate a search for a match in the hardware or software queue.
1139-
* The search can complete immediately with -ENOMSG.
11401142
* If successful, libfabric will enqueue a context entry into the completion
11411143
* queue to make the search nonblocking. This code will poll until the
11421144
* entry is enqueued.
@@ -1158,14 +1160,7 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
11581160
ofi_req->mask_bits = mask_bits;
11591161

11601162
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
1161-
if (-FI_ENOMSG == ret) {
1162-
/**
1163-
* The search request completed but no matching message was found.
1164-
*/
1165-
*matched = 0;
1166-
free(ofi_req);
1167-
return OMPI_SUCCESS;
1168-
} else if (OPAL_UNLIKELY(0 > ret)) {
1163+
if (OPAL_UNLIKELY(0 > ret)) {
11691164
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
11701165
free(ofi_req);
11711166
return ompi_mtl_ofi_get_error(ret);

0 commit comments

Comments
 (0)