From d2308be3fa78bd750c3759c8ee831a19ead7b7f0 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 30 Oct 2017 12:13:44 -0700 Subject: [PATCH 1/6] Fix OFI MTL to recognize correct CQ empty scenario Currently, the progress function is incorrectly interpreting any error value other than a positive value or -FI_EAVAIL to mean CQ is empty. CQ is empty only if fi_cq_read() call returned -EAGAIN error code. Fix that here. While at it, fix help text output for calls made to OFI API. Signed-off-by: Aravind Gopalakrishnan (cherry picked from commit 285fc42b4ea9c16c8e7b9caa1c5def453f80e557) --- ompi/mca/mtl/ofi/help-mtl-ofi.txt | 8 ++- ompi/mca/mtl/ofi/mtl_ofi.h | 44 ++++++++------ ompi/mca/mtl/ofi/mtl_ofi_component.c | 87 +++++++++++++++------------- 3 files changed, 81 insertions(+), 58 deletions(-) diff --git a/ompi/mca/mtl/ofi/help-mtl-ofi.txt b/ompi/mca/mtl/ofi/help-mtl-ofi.txt index 2338d548f01..97c355123c1 100644 --- a/ompi/mca/mtl/ofi/help-mtl-ofi.txt +++ b/ompi/mca/mtl/ofi/help-mtl-ofi.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2013-2015 Intel, Inc. All rights reserved +# Copyright (c) 2013-2017 Intel, Inc. All rights reserved # # $COPYRIGHT$ # @@ -8,3 +8,9 @@ # # $HEADER$ # +[OFI call fail] +Open MPI failed an OFI Libfabric library call (%s).This is highly unusual; +your job may behave unpredictably (and/or abort) after this. + Local host: %s + Location: %s:%d + Error: %s (%zd) diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 1128aca3d26..0ee125c7966 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved * * $COPYRIGHT$ * @@ -14,6 +14,7 @@ #include "ompi/mca/mtl/mtl.h" #include "ompi/mca/mtl/base/base.h" #include "opal/datatype/opal_convertor.h" +#include "opal/util/show_help.h" #include #include @@ -79,13 +80,14 @@ ompi_mtl_ofi_progress(void) assert(ofi_req); ret = ofi_req->event_callback(&wc, ofi_req); if (OMPI_SUCCESS != ret) { - opal_output(ompi_mtl_base_framework.framework_output, - "Error returned by request event callback: %zd", - ret); - abort(); + opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, ret); + fflush(stderr); + exit(1); } } - } else if (ret == -FI_EAVAIL) { + } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) { /** * An error occured and is being reported via the CQ. * Read the error and forward it to the upper layer. @@ -94,9 +96,11 @@ ompi_mtl_ofi_progress(void) &error, 0); if (0 > ret) { - opal_output(ompi_mtl_base_framework.framework_output, - "Error returned from fi_cq_readerr: %zd", ret); - abort(); + opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, fi_strerror(-ret), ret); + fflush(stderr); + exit(1); } assert(error.op_context); @@ -104,16 +108,22 @@ ompi_mtl_ofi_progress(void) assert(ofi_req); ret = ofi_req->error_callback(&error, ofi_req); if (OMPI_SUCCESS != ret) { - opal_output(ompi_mtl_base_framework.framework_output, - "Error returned by request error callback: %zd", - ret); - abort(); + opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, ret); + fflush(stderr); + exit(1); } } else { - /** - * The CQ is empty. Return. - */ - break; + if (ret == -FI_EAGAIN) { + break; + } else { + opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n" + "*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n", + __FILE__, __LINE__, fi_strerror(-ret), ret); + fflush(stderr); + exit(1); + } } } return count; diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 6b7058434df..2dd0090ad18 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved * * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights @@ -14,6 +14,7 @@ #include "mtl_ofi.h" #include "opal/util/argv.h" +#include "opal/util/show_help.h" static int ompi_mtl_ofi_component_open(void); static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority); @@ -364,9 +365,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, hints, /* In: Hints to filter providers */ &providers); /* Out: List of matching providers */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_getinfo failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_getinfo", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); goto error; } @@ -392,9 +394,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, &ompi_mtl_ofi.fabric, /* Out: Fabric handle */ NULL); /* Optional context for fabric events */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_fabric failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_fabric", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); goto error; } @@ -408,9 +411,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, &ompi_mtl_ofi.domain, /* Out: Domain oject */ NULL); /* Optional context for domain events */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_domain failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_domain", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); goto error; } @@ -426,9 +430,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, &ompi_mtl_ofi.ep, /* Out: Endpoint object */ NULL); /* Optional context */ if (0 != ret) { - opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_endpoint failed: %s\n", - __FILE__, __LINE__, fi_strerror(-ret)); + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_endpoint", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); goto error; } @@ -581,38 +586,40 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, int ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) { + ssize_t ret; + opal_progress_unregister(ompi_mtl_ofi_progress_no_inline); - /** - * * Close all the OFI objects - * */ - if (fi_close((fid_t)ompi_mtl_ofi.ep)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); - } - if (fi_close((fid_t)ompi_mtl_ofi.cq)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); - } - if (fi_close((fid_t)ompi_mtl_ofi.av)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); - } - if (fi_close((fid_t)ompi_mtl_ofi.domain)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); - } - if (fi_close((fid_t)ompi_mtl_ofi.fabric)) { - opal_output(ompi_mtl_base_framework.framework_output, - "fi_close failed: %s", strerror(errno)); - abort(); + /* Close all the OFI objects */ + if (ret = fi_close((fid_t)ompi_mtl_ofi.ep)) { + goto finalize_err; + } + + if (ret = fi_close((fid_t)ompi_mtl_ofi.cq)) { + goto finalize_err; + } + + if (ret = fi_close((fid_t)ompi_mtl_ofi.av)) { + goto finalize_err; + } + + if (ret = fi_close((fid_t)ompi_mtl_ofi.domain)) { + goto finalize_err; + } + + if (ret = fi_close((fid_t)ompi_mtl_ofi.fabric)) { + goto finalize_err; } return OMPI_SUCCESS; + +finalize_err: + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_close", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), ret); + + return OMPI_ERROR; } From 8cd5b62d56ac1dc861d706004221402d60c4ac69 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 11 Nov 2017 04:58:08 -0800 Subject: [PATCH 2/6] mtl ofi: fix formatting of help message No code or logic changes. Signed-off-by: Jeff Squyres (cherry picked from commit bed1930df817ba632f0ed365f634748ca4706a45) --- ompi/mca/mtl/ofi/help-mtl-ofi.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ompi/mca/mtl/ofi/help-mtl-ofi.txt b/ompi/mca/mtl/ofi/help-mtl-ofi.txt index 97c355123c1..8131766ae00 100644 --- a/ompi/mca/mtl/ofi/help-mtl-ofi.txt +++ b/ompi/mca/mtl/ofi/help-mtl-ofi.txt @@ -2,6 +2,7 @@ # # Copyright (c) 2013-2017 Intel, Inc. All rights reserved # +# Copyright (c) 2017 Cisco Systems, Inc. All rights reserved # $COPYRIGHT$ # # Additional copyrights may follow @@ -9,8 +10,9 @@ # $HEADER$ # [OFI call fail] -Open MPI failed an OFI Libfabric library call (%s).This is highly unusual; -your job may behave unpredictably (and/or abort) after this. +Open MPI failed an OFI Libfabric library call (%s). This is highly +unusual; your job may behave unpredictably (and/or abort) after this. + Local host: %s Location: %s:%d Error: %s (%zd) From 62f2b278efa539060e8c41a6000ade6b4052928b Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 11 Nov 2017 04:58:31 -0800 Subject: [PATCH 3/6] mtl ofi: fix trivial comment whitespace No code or logic changes. Signed-off-by: Jeff Squyres (cherry picked from commit e8c13ef286702a514e54681eaeee3e60c32c7800) --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 2dd0090ad18..e05a6127fab 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2013-2017 Intel, Inc. All rights reserved * - * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2017 Cisco Systems, Inc. All rights reserved * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -362,7 +362,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, NULL, /* Optional name or fabric to resolve */ NULL, /* Optional service name or port to request */ 0ULL, /* Optional flag */ - hints, /* In: Hints to filter providers */ + hints, /* In: Hints to filter providers */ &providers); /* Out: List of matching providers */ if (0 != ret) { opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, From 20e883fdabcbb415920c852a7f2c11f08156ccab Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 11 Nov 2017 04:58:59 -0800 Subject: [PATCH 4/6] mtl ofi: show the positive value of the error The value of ret is negative (e.g., -61), but it is displayed in the help message as `%zd`, which renders as unsigned (i.e., a giant positive value). So make sure to negate the negative value before rendering it (e.g., so we display "61", not "4294967235"). Signed-off-by: Jeff Squyres (cherry picked from commit f910f554f78cdc163eb2315832b805417885cf52) --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index e05a6127fab..3825e286bce 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -368,7 +368,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_getinfo", ompi_process_info.nodename, __FILE__, __LINE__, - fi_strerror(-ret), ret); + fi_strerror(-ret), -ret); goto error; } @@ -397,7 +397,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_fabric", ompi_process_info.nodename, __FILE__, __LINE__, - fi_strerror(-ret), ret); + fi_strerror(-ret), -ret); goto error; } @@ -414,7 +414,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_domain", ompi_process_info.nodename, __FILE__, __LINE__, - fi_strerror(-ret), ret); + fi_strerror(-ret), -ret); goto error; } @@ -433,7 +433,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_endpoint", ompi_process_info.nodename, __FILE__, __LINE__, - fi_strerror(-ret), ret); + fi_strerror(-ret), -ret); goto error; } @@ -617,7 +617,7 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_close", ompi_process_info.nodename, __FILE__, __LINE__, - fi_strerror(-ret), ret); + fi_strerror(-ret), -ret); return OMPI_ERROR; } From 418df2bd2a40d3f2b1583334c0265139196f866d Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 11 Nov 2017 05:01:15 -0800 Subject: [PATCH 5/6] mtl ofi: it is not an error to return no data from fi_getinfo() Before this commit, the presence of usNIC devices -- which will (currently) return no data when fi_getinfo() is queried for tagged matching providers -- would cause an error message to be displayed. Signed-off-by: Jeff Squyres (cherry picked from commit 5a6ddf42d6f94842c9b81bde1b077e5e73681de8) --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 3825e286bce..96f8f6abbff 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -364,7 +364,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, 0ULL, /* Optional flag */ hints, /* In: Hints to filter providers */ &providers); /* Out: List of matching providers */ - if (0 != ret) { + if (FI_ENODATA == -ret) { + // It is not an error if no information is returned. + goto error; + } else if (0 != ret) { opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, "fi_getinfo", ompi_process_info.nodename, __FILE__, __LINE__, From 7c1f1e4030bba4ee10dc8ed7026d5417c4ef5229 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 11 Nov 2017 05:04:23 -0800 Subject: [PATCH 6/6] mtl ofi: squelch compiler warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 5.2 complains: ``` mtl_ofi_component.c: In function ‘ompi_mtl_ofi_finalize’: mtl_ofi_component.c:613:5: warning: suggest parentheses around assignment used as truth value [-Wparentheses] if (ret = fi_close((fid_t)ompi_mtl_ofi.fabric)) { ^ ``` Signed-off-by: Jeff Squyres (cherry picked from commit a8686a681361177da28aac6e61963600fbd30d8a) --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 96f8f6abbff..89a71ed7b63 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -594,23 +594,23 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl) opal_progress_unregister(ompi_mtl_ofi_progress_no_inline); /* Close all the OFI objects */ - if (ret = fi_close((fid_t)ompi_mtl_ofi.ep)) { + if ((ret = fi_close((fid_t)ompi_mtl_ofi.ep))) { goto finalize_err; } - if (ret = fi_close((fid_t)ompi_mtl_ofi.cq)) { + if ((ret = fi_close((fid_t)ompi_mtl_ofi.cq))) { goto finalize_err; } - if (ret = fi_close((fid_t)ompi_mtl_ofi.av)) { + if ((ret = fi_close((fid_t)ompi_mtl_ofi.av))) { goto finalize_err; } - if (ret = fi_close((fid_t)ompi_mtl_ofi.domain)) { + if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) { goto finalize_err; } - if (ret = fi_close((fid_t)ompi_mtl_ofi.fabric)) { + if ((ret = fi_close((fid_t)ompi_mtl_ofi.fabric))) { goto finalize_err; }