Skip to content

Commit c452f68

Browse files
committed
orte/errmgr: Improve help message on connection lost
Signed-off-by: Joshua Hursey <[email protected]>
1 parent 578d881 commit c452f68

File tree

3 files changed

+16
-4
lines changed

3 files changed

+16
-4
lines changed

orte/mca/errmgr/base/help-errmgr-base.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
 # Copyright (c) 2004-2005 The Regents of the University of California.
 # All rights reserved.
 # Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
+# Copyright (c) 2017 IBM Corporation. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -61,9 +62,10 @@ route found between them. Please check network connectivity
 (including firewalls and network routing requirements).
 #
 [node-died]
-ORTE has lost communication with its daemon located on node:
+ORTE has lost communication with a remote daemon.
 
-hostname: %s
+HNP daemon : %s on node %s
+Remote daemon: %s on node %s
 
 This is usually due to either a failure of the TCP network
 connection to the node, or possibly an internal failure of

orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
 * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
 * All rights reserved.
 * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
+* Copyright (c) 2017 IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -379,7 +380,11 @@ static void proc_errors(int fd, short args, void *cbdata)
 /* record the first one to fail */
 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 /* output an error message so the user knows what happened */
-orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
+orte_show_help("help-errmgr-base.txt", "node-died", true,
+ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+orte_process_info.nodename,
+ORTE_NAME_PRINT(proc),
+pptr->node->name);
 /* mark the daemon job as failed */
 jdata->state = ORTE_JOB_STATE_COMM_FAILED;
 /* point to the lowest rank to cause the problem */

orte/mca/errmgr/dvm/errmgr_dvm.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
 * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
 * All rights reserved.
 * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
+* Copyright (c) 2017 IBM Corporation. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -381,7 +382,11 @@ static void proc_errors(int fd, short args, void *cbdata)
 /* record the first one to fail */
 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
 /* output an error message so the user knows what happened */
-orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
+orte_show_help("help-errmgr-base.txt", "node-died", true,
+ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+orte_process_info.nodename,
+ORTE_NAME_PRINT(proc),
+pptr->node->name);
 /* mark the daemon job as failed */
 jdata->state = ORTE_JOB_STATE_COMM_FAILED;
 /* point to the lowest rank to cause the problem */

0 commit comments

Comments
 (0)