Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion ompi/mca/mtl/ofi/help-mtl-ofi.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
# -*- text -*-
#
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved
# Copyright (c) 2013-2017 Intel, Inc. All rights reserved
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
[OFI call fail]
Open MPI failed an OFI Libfabric library call (%s). This is highly unusual;
your job may behave unpredictably (and/or abort) after this.
Local host: %s
Location: %s:%d
Error: %s (%zd)
44 changes: 27 additions & 17 deletions ompi/mca/mtl/ofi/mtl_ofi.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
Expand All @@ -14,6 +14,7 @@
#include "ompi/mca/mtl/mtl.h"
#include "ompi/mca/mtl/base/base.h"
#include "opal/datatype/opal_convertor.h"
#include "opal/util/show_help.h"

#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
Expand Down Expand Up @@ -79,13 +80,14 @@ ompi_mtl_ofi_progress(void)
assert(ofi_req);
ret = ofi_req->event_callback(&wc, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request event callback: %zd",
ret);
abort();
opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, ret);
fflush(stderr);
exit(1);
}
}
} else if (ret == -FI_EAVAIL) {
} else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
/**
* An error occurred and is being reported via the CQ.
* Read the error and forward it to the upper layer.
Expand All @@ -94,26 +96,34 @@ ompi_mtl_ofi_progress(void)
&error,
0);
if (0 > ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned from fi_cq_readerr: %zd", ret);
abort();
opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, fi_strerror(-ret), ret);
fflush(stderr);
exit(1);
}

assert(error.op_context);
ofi_req = TO_OFI_REQ(error.op_context);
assert(ofi_req);
ret = ofi_req->error_callback(&error, ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(ompi_mtl_base_framework.framework_output,
"Error returned by request error callback: %zd",
ret);
abort();
opal_output(0, "%s:%d: Error returned by request error callback: %zd.\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, ret);
fflush(stderr);
exit(1);
}
} else {
/**
* The CQ is empty. Return.
*/
break;
if (ret == -FI_EAGAIN) {
break;
} else {
opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
__FILE__, __LINE__, fi_strerror(-ret), ret);
fflush(stderr);
exit(1);
}
}
}
return count;
Expand Down
87 changes: 47 additions & 40 deletions ompi/mca/mtl/ofi/mtl_ofi_component.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved
*
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
Expand All @@ -14,6 +14,7 @@

#include "mtl_ofi.h"
#include "opal/util/argv.h"
#include "opal/util/show_help.h"

static int ompi_mtl_ofi_component_open(void);
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
Expand Down Expand Up @@ -364,9 +365,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
hints, /* In: Hints to filter providers */
&providers); /* Out: List of matching providers */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_getinfo failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_getinfo",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error;
}

Expand All @@ -392,9 +394,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.fabric, /* Out: Fabric handle */
NULL); /* Optional context for fabric events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_fabric failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_fabric",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error;
}

Expand All @@ -408,9 +411,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.domain, /* Out: Domain oject */
NULL); /* Optional context for domain events */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_domain failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_domain",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error;
}

Expand All @@ -426,9 +430,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
&ompi_mtl_ofi.ep, /* Out: Endpoint object */
NULL); /* Optional context */
if (0 != ret) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: fi_endpoint failed: %s\n",
__FILE__, __LINE__, fi_strerror(-ret));
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_endpoint",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ret);
goto error;
}

Expand Down Expand Up @@ -581,38 +586,40 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
/**
 * Tear down the OFI MTL.
 *
 * Unregisters the MTL progress callback and then closes the OFI objects
 * held in ompi_mtl_ofi, in this order: endpoint, completion queue,
 * address vector, domain, fabric.
 *
 * Each object is closed exactly once.  The first fi_close() that returns
 * non-zero stops the sequence: the failure is reported through the
 * "OFI call fail" entry of help-mtl-ofi.txt and the remaining objects are
 * left open.
 *
 * @param mtl  MTL module handle (not referenced by this function).
 *
 * @return OMPI_SUCCESS if every fi_close() returned 0, OMPI_ERROR otherwise.
 */
int
ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
{
    /* Kept as ssize_t (although fi_close() is assigned into it as a plain
     * status code) because the "OFI call fail" help message formats the
     * error code with %zd. */
    ssize_t ret;

    opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);

    /* Close all the OFI objects */
    if (0 != (ret = fi_close((fid_t)ompi_mtl_ofi.ep))) {
        goto finalize_err;
    }

    if (0 != (ret = fi_close((fid_t)ompi_mtl_ofi.cq))) {
        goto finalize_err;
    }

    if (0 != (ret = fi_close((fid_t)ompi_mtl_ofi.av))) {
        goto finalize_err;
    }

    if (0 != (ret = fi_close((fid_t)ompi_mtl_ofi.domain))) {
        goto finalize_err;
    }

    if (0 != (ret = fi_close((fid_t)ompi_mtl_ofi.fabric))) {
        goto finalize_err;
    }

    return OMPI_SUCCESS;

finalize_err:
    /* The sign of ret is flipped before fi_strerror(), matching the other
     * OFI error reports in this component. */
    opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
                   "fi_close",
                   ompi_process_info.nodename, __FILE__, __LINE__,
                   fi_strerror(-ret), ret);

    return OMPI_ERROR;
}


Expand Down