Skip to content

Commit 42df51d

Browse files
authored
Merge pull request #3046 from jjhursey/fix/ibm/v2.x/tcp-timeout
Adjust TCP Keepalive and Errmgr message
2 parents 97e48bf + 353f720 commit 42df51d

File tree

3 files changed

+17
-10
lines changed

3 files changed

+17
-10
lines changed

orte/mca/errmgr/base/help-errmgr-base.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
1313
# Copyright (c) 2014 Intel, Inc. All rights reserved.
14+
# Copyright (c) 2017 IBM Corporation. All rights reserved.
1415
# $COPYRIGHT$
1516
#
1617
# Additional copyrights may follow
@@ -61,9 +62,10 @@ route found between them. Please check network connectivity
6162
(including firewalls and network routing requirements).
6263
#
6364
[node-died]
64-
ORTE has lost communication with its daemon located on node:
65+
ORTE has lost communication with a remote daemon.
6566

66-
hostname: %s
67+
HNP daemon : %s on node %s
68+
Remote daemon: %s on node %s
6769

6870
This is usually due to either a failure of the TCP network
6971
connection to the node, or possibly an internal failure of

orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
* Copyright (c) 2011-2015 Los Alamos National Security, LLC.
1111
* All rights reserved.
1212
* Copyright (c) 2014 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1314
* $COPYRIGHT$
1415
*
1516
* Additional copyrights may follow
@@ -369,7 +370,11 @@ static void proc_errors(int fd, short args, void *cbdata)
369370
/* record the first one to fail */
370371
if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
371372
/* output an error message so the user knows what happened */
372-
orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
373+
orte_show_help("help-errmgr-base.txt", "node-died", true,
374+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
375+
orte_process_info.nodename,
376+
ORTE_NAME_PRINT(proc),
377+
pptr->node->name);
373378
/* mark the daemon job as failed */
374379
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
375380
/* point to the lowest rank to cause the problem */

orte/mca/oob/tcp/oob_tcp_component.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
1717
* Copyright (c) 2013-2014 Intel, Inc. All rights reserved.
1818
* Copyright (c) 2014 NVIDIA Corporation. All rights reserved.
19+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1920
* $COPYRIGHT$
2021
*
2122
* Additional copyrights may follow
@@ -400,27 +401,26 @@ static int tcp_component_register(void)
400401
&mca_oob_tcp_component.disable_ipv6_family);
401402
#endif // OPAL_ENABLE_IPV6
402403

403-
// Default to keepalives every 60 seconds
404-
mca_oob_tcp_component.keepalive_time = 60;
404+
// Wait for this amount of time before sending the first keepalive probe
405+
mca_oob_tcp_component.keepalive_time = 300;
405406
(void)mca_base_component_var_register(component, "keepalive_time",
406407
"Idle time in seconds before starting to send keepalives (keepalive_time <= 0 disables keepalive functionality)",
407408
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
408409
OPAL_INFO_LVL_5,
409410
MCA_BASE_VAR_SCOPE_READONLY,
410411
&mca_oob_tcp_component.keepalive_time);
411412

412-
// Default to keepalive retry interval time of 5 seconds
413-
mca_oob_tcp_component.keepalive_intvl = 5;
413+
// While the peer has not responded, resend the keepalive probe every keepalive_intvl seconds
414+
mca_oob_tcp_component.keepalive_intvl = 20;
414415
(void)mca_base_component_var_register(component, "keepalive_intvl",
415416
"Time between successive keepalive pings when peer has not responded, in seconds (ignored if keepalive_time <= 0)",
416417
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
417418
OPAL_INFO_LVL_5,
418419
MCA_BASE_VAR_SCOPE_READONLY,
419420
&mca_oob_tcp_component.keepalive_intvl);
420421

421-
// Default to retrying a keepalive 3 times before declaring the
422-
// peer kaput
423-
mca_oob_tcp_component.keepalive_probes = 3;
422+
// Consider the connection dead after keepalive_probes probes, sent keepalive_intvl seconds apart, have gone unanswered
423+
mca_oob_tcp_component.keepalive_probes = 9;
424424
(void)mca_base_component_var_register(component, "keepalive_probes",
425425
"Number of keepalives that can be missed before declaring error (ignored if keepalive_time <= 0)",
426426
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,

0 commit comments

Comments
 (0)