From 1eb1b998085c5db73fb4aa0391bd289f26e1949c Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Tue, 31 Jul 2018 14:41:27 -0700 Subject: [PATCH] MTL OFI: Fix race condition due to global progress entries array Since progress entries array is globally allocated, it is susceptible to race conditions when using multi-threaded applications. Allocating it on the stack resolves any potential races as it is thread local by default. Signed-off-by: Aravind Gopalakrishnan (cherry picked from commit ed2343034d09b33eb44a0a727bef97a108edc8aa) Conflicts: ompi/mca/mtl/ofi/mtl_ofi_component.c ompi/mca/mtl/ofi/mtl_ofi_types.h --- ompi/mca/mtl/ofi/mtl_ofi.h | 10 +++++----- ompi/mca/mtl/ofi/mtl_ofi_component.c | 20 -------------------- ompi/mca/mtl/ofi/mtl_ofi_types.h | 3 --- 3 files changed, 5 insertions(+), 28 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 9f5ca214be9..7f30a6ff93d 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -64,6 +64,7 @@ ompi_mtl_ofi_progress(void) int count = 0, i, events_read; struct fi_cq_err_entry error = { 0 }; ompi_mtl_ofi_request_t *ofi_req = NULL; + struct fi_cq_tagged_entry wc[ompi_mtl_ofi.ofi_progress_event_count]; /** * Read the work completions from the CQ. @@ -71,16 +72,15 @@ ompi_mtl_ofi_progress(void) * Call the request's callback. */ while (true) { - ret = fi_cq_read(ompi_mtl_ofi.cq, ompi_mtl_ofi.progress_entries, - ompi_mtl_ofi.ofi_progress_event_count); + ret = fi_cq_read(ompi_mtl_ofi.cq, (void *)&wc, ompi_mtl_ofi.ofi_progress_event_count); if (ret > 0) { count+= ret; events_read = ret; for (i = 0; i < events_read; i++) { - if (NULL != ompi_mtl_ofi.progress_entries[i].op_context) { - ofi_req = TO_OFI_REQ(ompi_mtl_ofi.progress_entries[i].op_context); + if (NULL != wc[i].op_context) { + ofi_req = TO_OFI_REQ(wc[i].op_context); assert(ofi_req); - ret = ofi_req->event_callback(&ompi_mtl_ofi.progress_entries[i], ofi_req); + ret = ofi_req->event_callback(&wc[i], ofi_req); if (OMPI_SUCCESS != ret) { opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n" "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 662fb38e796..3963373c4ce 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -499,21 +499,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, goto error; } - /** - * Allocate memory for storing the CQ events read in OFI progress. - */ - ompi_mtl_ofi.progress_entries = calloc(ompi_mtl_ofi.ofi_progress_event_count, sizeof(struct fi_cq_tagged_entry)); - if (OPAL_UNLIKELY(!ompi_mtl_ofi.progress_entries)) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: alloc of CQ event storage failed: %s\n", - __FILE__, __LINE__, strerror(errno)); - goto error; - } - - /** - * The remote fi_addr will be stored in the ofi_endpoint struct. - */ - av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP; ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL); @@ -632,9 +617,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, if (ompi_mtl_ofi.fabric) { (void) fi_close((fid_t)ompi_mtl_ofi.fabric); } - if (ompi_mtl_ofi.progress_entries) { - free(ompi_mtl_ofi.progress_entries); - } return NULL; } @@ -667,8 +649,6 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) goto finalize_err; } - free(ompi_mtl_ofi.progress_entries); - return OMPI_SUCCESS; finalize_err: diff --git a/ompi/mca/mtl/ofi/mtl_ofi_types.h b/ompi/mca/mtl/ofi/mtl_ofi_types.h index 0b6a1fcc715..e8c3f21b53e 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_types.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_types.h @@ -52,9 +52,6 @@ typedef struct mca_mtl_ofi_module_t { /** Maximum number of CQ events to read in OFI Progress */ int ofi_progress_event_count; - /** CQ event storage */ - struct fi_cq_tagged_entry *progress_entries; - } mca_mtl_ofi_module_t; extern mca_mtl_ofi_module_t ompi_mtl_ofi;