diff --git a/orte/mca/errmgr/base/help-errmgr-base.txt b/orte/mca/errmgr/base/help-errmgr-base.txt index c7e3051bb95..4aec50c04d4 100644 --- a/orte/mca/errmgr/base/help-errmgr-base.txt +++ b/orte/mca/errmgr/base/help-errmgr-base.txt @@ -11,6 +11,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2014 Intel, Inc. All rights reserved. +# Copyright (c) 2017 IBM Corporation. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -61,9 +62,10 @@ route found between them. Please check network connectivity (including firewalls and network routing requirements). # [node-died] -ORTE has lost communication with its daemon located on node: +ORTE has lost communication with a remote daemon. - hostname: %s + HNP daemon : %s on node %s + Remote daemon: %s on node %s This is usually due to either a failure of the TCP network connection to the node, or possibly an internal failure of diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c index a29d80e9b0a..fcdbe3acc30 100644 --- a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c +++ b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c @@ -10,6 +10,7 @@ * Copyright (c) 2011-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -369,7 +370,11 @@ static void proc_errors(int fd, short args, void *cbdata) /* record the first one to fail */ if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { /* output an error message so the user knows what happened */ - orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name); + orte_show_help("help-errmgr-base.txt", "node-died", true, + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_process_info.nodename, + ORTE_NAME_PRINT(proc), + pptr->node->name); /* mark the daemon job as failed */ jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 45580a61b90..85cad6c83d6 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -16,6 +16,7 @@ * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -400,8 +401,8 @@ static int tcp_component_register(void) &mca_oob_tcp_component.disable_ipv6_family); #endif // OPAL_ENABLE_IPV6 - // Default to keepalives every 60 seconds - mca_oob_tcp_component.keepalive_time = 60; + // Idle time, in seconds, to wait before sending the first keepalive probe + mca_oob_tcp_component.keepalive_time = 300; (void)mca_base_component_var_register(component, "keepalive_time", "Idle time in seconds before starting to send keepalives (keepalive_time <= 0 disables keepalive functionality)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -409,8 +410,8 @@ static int tcp_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_oob_tcp_component.keepalive_time); - // Default to keepalive retry interval time of 5 seconds - mca_oob_tcp_component.keepalive_intvl = 5; + // Resend an unanswered keepalive probe every keepalive_intvl seconds + mca_oob_tcp_component.keepalive_intvl = 20; (void)mca_base_component_var_register(component, "keepalive_intvl", "Time between successive keepalive pings when peer has not responded, in seconds (ignored if keepalive_time <= 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -418,9 +419,8 @@ static int tcp_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_oob_tcp_component.keepalive_intvl); - // Default to retrying a keepalive 3 times before declaring the - // peer kaput - mca_oob_tcp_component.keepalive_probes = 3; + // Declare the connection dead after keepalive_probes unanswered probes, sent keepalive_intvl seconds apart + mca_oob_tcp_component.keepalive_probes = 9; (void)mca_base_component_var_register(component, "keepalive_probes", "Number of keepalives that can be missed before declaring error (ignored if keepalive_time <= 0)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,