Skip to content

Commit 09856c7

Browse files
authored
Merge pull request #5621 from aravindksg/ofi_race_fix_31x
MTL OFI: Fix race condition due to global progress entries array
2 parents 664650e + 1eb1b99 commit 09856c7

File tree

3 files changed: 5 additions and 28 deletions

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -64,23 +64,23 @@ ompi_mtl_ofi_progress(void)
6464
int count = 0, i, events_read;
6565
struct fi_cq_err_entry error = { 0 };
6666
ompi_mtl_ofi_request_t *ofi_req = NULL;
67+
struct fi_cq_tagged_entry wc[ompi_mtl_ofi.ofi_progress_event_count];
6768

6869
/**
6970
* Read the work completions from the CQ.
7071
* From the completion's op_context, we get the associated OFI request.
7172
* Call the request's callback.
7273
*/
7374
while (true) {
74-
ret = fi_cq_read(ompi_mtl_ofi.cq, ompi_mtl_ofi.progress_entries,
75-
ompi_mtl_ofi.ofi_progress_event_count);
75+
ret = fi_cq_read(ompi_mtl_ofi.cq, (void *)&wc, ompi_mtl_ofi.ofi_progress_event_count);
7676
if (ret > 0) {
7777
count+= ret;
7878
events_read = ret;
7979
for (i = 0; i < events_read; i++) {
80-
if (NULL != ompi_mtl_ofi.progress_entries[i].op_context) {
81-
ofi_req = TO_OFI_REQ(ompi_mtl_ofi.progress_entries[i].op_context);
80+
if (NULL != wc[i].op_context) {
81+
ofi_req = TO_OFI_REQ(wc[i].op_context);
8282
assert(ofi_req);
83-
ret = ofi_req->event_callback(&ompi_mtl_ofi.progress_entries[i], ofi_req);
83+
ret = ofi_req->event_callback(&wc[i], ofi_req);
8484
if (OMPI_SUCCESS != ret) {
8585
opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n"
8686
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 0 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -499,21 +499,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
499499
goto error;
500500
}
501501

502-
/**
503-
* Allocate memory for storing the CQ events read in OFI progress.
504-
*/
505-
ompi_mtl_ofi.progress_entries = calloc(ompi_mtl_ofi.ofi_progress_event_count, sizeof(struct fi_cq_tagged_entry));
506-
if (OPAL_UNLIKELY(!ompi_mtl_ofi.progress_entries)) {
507-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
508-
"%s:%d: alloc of CQ event storage failed: %s\n",
509-
__FILE__, __LINE__, strerror(errno));
510-
goto error;
511-
}
512-
513-
/**
514-
* The remote fi_addr will be stored in the ofi_endpoint struct.
515-
*/
516-
517502
av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
518503

519504
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
@@ -632,9 +617,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
632617
if (ompi_mtl_ofi.fabric) {
633618
(void) fi_close((fid_t)ompi_mtl_ofi.fabric);
634619
}
635-
if (ompi_mtl_ofi.progress_entries) {
636-
free(ompi_mtl_ofi.progress_entries);
637-
}
638620

639621
return NULL;
640622
}
@@ -667,8 +649,6 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
667649
goto finalize_err;
668650
}
669651

670-
free(ompi_mtl_ofi.progress_entries);
671-
672652
return OMPI_SUCCESS;
673653

674654
finalize_err:

ompi/mca/mtl/ofi/mtl_ofi_types.h

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -52,9 +52,6 @@ typedef struct mca_mtl_ofi_module_t {
5252
/** Maximum number of CQ events to read in OFI Progress */
5353
int ofi_progress_event_count;
5454

55-
/** CQ event storage */
56-
struct fi_cq_tagged_entry *progress_entries;
57-
5855
} mca_mtl_ofi_module_t;
5956

6057
extern mca_mtl_ofi_module_t ompi_mtl_ofi;

0 commit comments

Comments
 (0)