Skip to content

Commit 0f8077a

Browse files
committed
oob/tcp: add show_help message about version mismatch
Be more explicit about version mismatch between ORTE processes. Signed-off-by: Jeff Squyres <[email protected]>
1 parent 40afd52 commit 0f8077a

File tree

3 files changed

+36
-8
lines changed

3 files changed

+36
-8
lines changed

opal/util/fd.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
#ifdef HAVE_ARPA_INET_H
2525
#include <arpa/inet.h>
2626
#endif
27+
#ifdef HAVE_NETINET_IN_H
28+
#include <netinet/in.h>
29+
#endif
2730
#ifdef HAVE_UNISTD_H
2831
#include <unistd.h>
2932
#endif

orte/mca/oob/tcp/help-oob-tcp.txt

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
1313
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
14-
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
14+
# Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
1515
# $COPYRIGHT$
1616
#
1717
# Additional copyrights may follow
@@ -106,10 +106,29 @@ levels.
106106
Remote host: %s
107107
Remote port: %d
108108

109-
110109
The connection was rejected.
111110
#
112111
[static-fwd]
113112
Static ports were requested while orte_fwd_mpirun_port was set.
114113
Both options cannot be simultaneously set. Please either set
115114
orte_fwd_mpirun_port=false or remove any static port directives.
115+
#
116+
[version mismatch]
117+
Open MPI detected a mismatch in versions between two processes. This
118+
typically means that you executed "mpirun" (or "mpiexec") from one
119+
version of Open MPI on one node, but your default path on one of the
120+
other nodes upon which you launched found a different version of Open
121+
MPI.
122+
123+
Open MPI only supports running exactly the same version between all
124+
processes in a single job.
125+
126+
This will almost certainly cause unpredictable behavior, and may end
127+
up aborting your job.
128+
129+
Local host: %s
130+
Local process name: %s
131+
Local Open MPI version: %s
132+
Peer host: %s
133+
Peer process name: %s
134+
Peer Open MPI version: %s

orte/mca/oob/tcp/oob_tcp_connection.c

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
1313
* All rights reserved.
14-
* Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2009-2018 Cisco Systems, Inc. All rights reserved
1515
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
1616
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1717
* Copyright (c) 2014-2015 Research Organization for Information Science
@@ -58,6 +58,7 @@
5858
#include "opal/util/net.h"
5959
#include "opal/util/fd.h"
6060
#include "opal/util/error.h"
61+
#include "opal/util/show_help.h"
6162
#include "opal/class/opal_hash_table.h"
6263
#include "opal/mca/event/event.h"
6364

@@ -701,6 +702,7 @@ static bool retry(mca_oob_tcp_peer_t* peer, int sd, bool fatal)
701702
}
702703
}
703704

705+
704706
int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr,
705707
int sd, mca_oob_tcp_hdr_t *dhdr)
706708
{
@@ -890,11 +892,15 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr,
890892
version = (char*)((char*)msg + offset);
891893
offset += strlen(version) + 1;
892894
if (0 != strcmp(version, orte_version_string)) {
893-
opal_output(0, "%s tcp_peer_recv_connect_ack: "
894-
"received different version from %s: %s instead of %s\n",
895-
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
896-
ORTE_NAME_PRINT(&(peer->name)),
897-
version, orte_version_string);
895+
opal_show_help("help-oob-tcp.txt", "version mismatch",
896+
true,
897+
opal_process_info.nodename,
898+
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
899+
orte_version_string,
900+
opal_fd_get_peer_name(peer->sd),
901+
ORTE_NAME_PRINT(&(peer->name)),
902+
version);
903+
898904
peer->state = MCA_OOB_TCP_FAILED;
899905
mca_oob_tcp_peer_close(peer);
900906
free(msg);

0 commit comments

Comments
 (0)