Skip to content

Commit 023a4a8

Browse files
authored
Merge pull request #4942 from jsquyres/pr/tcp-btl-help-message-updates
TCP help message updates
2 parents a15d823 + 0f8077a commit 023a4a8

File tree

6 files changed

+188
-62
lines changed

6 files changed

+188
-62
lines changed

opal/mca/btl/tcp/btl_tcp_component.c

Lines changed: 40 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
1515
* Copyright (c) 2009 Oak Ridge National Laboratory
1616
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
@@ -1363,7 +1363,6 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
13631363
mca_btl_tcp_endpoint_hs_msg_t hs_msg;
13641364
struct timeval save, tv;
13651365
socklen_t rcvtimeo_save_len = sizeof(save);
1366-
char str[128];
13671366

13681367
/* Note, Socket will be in blocking mode during initial handshake
13691368
* hence setting SO_RCVTIMEO to say 2 seconds here to avoid waiting
@@ -1376,20 +1375,22 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
13761375
if (ENOPROTOOPT == errno) {
13771376
sockopt = false;
13781377
} else {
1379-
opal_output_verbose(20, opal_btl_base_framework.framework_output,
1380-
"Cannot get current recv timeout value of the socket"
1381-
"Local_host:%s PID:%d",
1382-
opal_process_info.nodename, getpid());
1378+
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
1379+
true, opal_process_info.nodename,
1380+
getpid(),
1381+
"getsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
1382+
strerror(opal_socket_errno), opal_socket_errno);
13831383
return;
13841384
}
13851385
} else {
13861386
tv.tv_sec = 2;
13871387
tv.tv_usec = 0;
13881388
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) {
1389-
opal_output_verbose(20, opal_btl_base_framework.framework_output,
1390-
"Cannot set new recv timeout value of the socket"
1391-
"Local_host:%s PID:%d",
1392-
opal_process_info.nodename, getpid());
1389+
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
1390+
true, opal_process_info.nodename,
1391+
getpid(),
1392+
"setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
1393+
strerror(opal_socket_errno), opal_socket_errno);
13931394
return;
13941395
}
13951396
}
@@ -1408,14 +1409,16 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
14081409
* This attempted connection will be ignored; your MPI job may or may not
14091410
* continue properly.
14101411
*/
1411-
if (sizeof(hs_msg) != retval) {
1412-
opal_output_verbose(20, opal_btl_base_framework.framework_output,
1413-
"process did not receive full connect ACK "
1414-
"Local_host:%s PID:%d String_received:%s Test_fail:%s",
1415-
opal_process_info.nodename,
1416-
getpid(),
1417-
(retval > 0) ? hs_msg.magic_id : "<nothing>",
1418-
"handshake message length");
1412+
if (sizeof(hs_msg) != retval) {
1413+
const char *peer = opal_fd_get_peer_name(sd);
1414+
opal_show_help("help-mpi-btl-tcp.txt",
1415+
"did not receive full magic id string",
1416+
true,
1417+
opal_process_info.nodename,
1418+
getpid(),
1419+
opal_version_string,
1420+
peer);
1421+
free((char*) peer);
14191422

14201423
/* The other side probably isn't OMPI, so just hang up */
14211424
CLOSE_THE_SOCKET(sd);
@@ -1424,12 +1427,18 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
14241427

14251428
guid = hs_msg.guid;
14261429
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
1427-
opal_output_verbose(20, opal_btl_base_framework.framework_output,
1428-
"process did not receive right magic string. "
1429-
"Local_host:%s PID:%d String_received:%s Test_fail:%s",
1430-
opal_process_info.nodename,
1431-
getpid(), hs_msg.magic_id,
1432-
"string value");
1430+
const char *peer = opal_fd_get_peer_name(sd);
1431+
opal_show_help("help-mpi-btl-tcp.txt",
1432+
"received incorrect magic id string",
1433+
true,
1434+
opal_process_info.nodename,
1435+
getpid(),
1436+
opal_version_string,
1437+
peer,
1438+
hs_msg.magic_id,
1439+
mca_btl_tcp_magic_id_string);
1440+
free((char*) peer);
1441+
14331442
/* The other side probably isn't OMPI, so just hang up */
14341443
CLOSE_THE_SOCKET(sd);
14351444
return;
@@ -1438,10 +1447,11 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
14381447
if (sockopt) {
14391448
/* reset RECVTIMEO option to its original state */
14401449
if (0 != setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, &save, sizeof(save))) {
1441-
opal_output_verbose(20, opal_btl_base_framework.framework_output,
1442-
"Cannot reset recv timeout value"
1443-
"Local_host:%s PID:%d",
1444-
opal_process_info.nodename, getpid());
1450+
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
1451+
true, opal_process_info.nodename,
1452+
getpid(),
1453+
"setsockopt(sd, SOL_SOCKET, SO_RCVTIMEO, ...)",
1454+
strerror(opal_socket_errno), opal_socket_errno);
14451455
return;
14461456
}
14471457
}
@@ -1492,24 +1502,9 @@ static void mca_btl_tcp_component_recv_handler(int sd, short flags, void* user)
14921502
/* are there any existing peer instances willing to accept this connection */
14931503
(void)mca_btl_tcp_proc_accept(btl_proc, (struct sockaddr*)&addr, sd);
14941504

1495-
switch (addr.ss_family) {
1496-
case AF_INET:
1497-
inet_ntop(AF_INET, &(((struct sockaddr_in*) &addr)->sin_addr), str, sizeof(str));
1498-
break;
1499-
1500-
#if OPAL_ENABLE_IPV6
1501-
case AF_INET6:
1502-
inet_ntop(AF_INET6, &(((struct sockaddr_in6*) &addr)->sin6_addr), str, sizeof(str));
1503-
break;
1504-
#endif
1505-
1506-
default:
1507-
BTL_ERROR(("Got an accept() from an unknown address family -- this shouldn't happen"));
1508-
CLOSE_THE_SOCKET(sd);
1509-
return;
1510-
1511-
}
1505+
const char *str = opal_fd_get_peer_name(sd);
15121506
opal_output_verbose(10, opal_btl_base_framework.framework_output,
15131507
"btl:tcp: now connected to %s, process %s", str,
15141508
OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name));
1509+
free((char*) str);
15151510
}

opal/mca/btl/tcp/help-mpi-btl-tcp.txt

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ values are in the range [1 .. 2^16-1]. This value will be ignored
3535
WARNING: Open MPI failed to TCP connect to a peer MPI process. This
3636
should not happen.
3737

38-
Your Open MPI job may now fail.
38+
Your Open MPI job may now hang or fail.
3939

4040
Local host: %s
4141
PID: %d
@@ -46,7 +46,7 @@ Your Open MPI job may now fail.
4646
WARNING: Open MPI failed to handshake with a connecting peer MPI
4747
process over TCP. This should not happen.
4848

49-
Your Open MPI job may now fail.
49+
Your Open MPI job may now hang or fail.
5050

5151
Local host: %s
5252
PID: %d
@@ -102,8 +102,11 @@ hopefully be able to continue).
102102
Known IPs of peer: %s
103103
#
104104
[socket flag fail]
105-
WARNING: Open MPI failed to set flags on a TCP socket. This should
106-
not happen. It is likely that your MPI job will now fail.
105+
WARNING: Open MPI failed to get or set flags on a TCP socket. This
106+
should not happen.
107+
108+
This may cause unpredictable behavior, and may end up hanging or
109+
aborting your job.
107110

108111
Local host: %s
109112
PID: %d
@@ -164,4 +167,43 @@ Your Open MPI job may now fail.
164167
PID: %d
165168
Message: %s
166169
Error: %s (%d)
167-
#
170+
#
171+
[did not receive full magic id string]
172+
The TCP BTL received an inbound socket connection from an unidentified
173+
peer. This typically means one of two things:
174+
175+
1. A non-Open MPI process tried to connect to this Open MPI process.
176+
2. An Open MPI process compiled against a different version of Open
177+
MPI tried to connect to this Open MPI process.
178+
179+
Open MPI only supports running exactly the same version between all
180+
processes in a single job.
181+
182+
This may cause unpredictable behavior, and may end up aborting your
183+
job.
184+
185+
Local host: %s
186+
Local PID: %d
187+
Local Open MPI version: %s
188+
Peer IP address: %s
189+
#
190+
[received incorrect magic id string]
191+
The TCP BTL received an inbound socket connection from a peer that did
192+
not identify itself correctly as an Open MPI process. This typically
193+
means one of two things:
194+
195+
1. A non-Open MPI process tried to connect to this Open MPI process.
196+
2. An Open MPI process compiled against a different version of Open
197+
MPI tried to connect to this Open MPI process.
198+
199+
Open MPI only supports running exactly the same version between all
200+
processes in a single job.
201+
202+
This may cause unpredictable behavior, and may end up hanging or
203+
aborting your job.
204+
205+
Local host: %s
206+
Local PID: %d
207+
Local Open MPI version: %s
208+
Peer IP address: %s
209+
Peer identifier: %s (expected %s)

opal/util/fd.c

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
2+
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
33
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
44
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
55
*
@@ -18,13 +18,22 @@
1818
#ifdef HAVE_SYS_STAT_H
1919
#include <sys/stat.h>
2020
#endif
21-
22-
21+
#ifdef HAVE_SYS_SOCKET_H
22+
#include <sys/socket.h>
23+
#endif
24+
#ifdef HAVE_ARPA_INET_H
25+
#include <arpa/inet.h>
26+
#endif
27+
#ifdef HAVE_NETINET_IN_H
28+
#include <netinet/in.h>
29+
#endif
2330
#ifdef HAVE_UNISTD_H
2431
#include <unistd.h>
2532
#endif
2633
#include <errno.h>
2734
#include <fcntl.h>
35+
#include <stdlib.h>
36+
#include <string.h>
2837

2938
#include "opal/util/fd.h"
3039
#include "opal/constants.h"
@@ -126,3 +135,49 @@ bool opal_fd_is_blkdev(int fd)
126135
return S_ISBLK(buf.st_mode);
127136
}
128137

138+
/*
 * Return a heap-allocated, NUL-terminated string describing the peer
 * connected to the socket `fd`.
 *
 * The result is the peer's numeric address as printed by inet_ntop()
 * for AF_INET (and AF_INET6 when OPAL_ENABLE_IPV6 is set).  If the peer
 * cannot be determined -- getpeername() fails, the address family is
 * not one of the above, or inet_ntop() fails -- the string "Unknown"
 * is returned instead.
 *
 * Returns NULL only when memory cannot be allocated.  The caller owns
 * the returned string and must free() it.
 */
const char *opal_fd_get_peer_name(int fd)
{
    static const char unknown[] = "Unknown";

    /* Size the buffer for the longest address text we may produce.
     * INET_ADDRSTRLEN is also large enough to hold "Unknown". */
    size_t len = INET_ADDRSTRLEN;
#if OPAL_ENABLE_IPV6
    len = INET6_ADDRSTRLEN;
#endif

    /* One buffer serves every outcome, so no path can leak it. */
    char *str = calloc(1, len);
    if (NULL == str) {
        return NULL;
    }

    /* sockaddr_storage is large enough for any address family; a bare
     * struct sockaddr would truncate an IPv6 peer address. */
    struct sockaddr_storage ss;
    socklen_t slt = (socklen_t) sizeof(ss);
    memset(&ss, 0, sizeof(ss));

    const char *ret = NULL;
    if (0 == getpeername(fd, (struct sockaddr*) &ss, &slt)) {
        if (AF_INET == ss.ss_family) {
            struct sockaddr_in *si = (struct sockaddr_in*) &ss;
            ret = inet_ntop(AF_INET, &(si->sin_addr), str, (socklen_t) len);
        }
#if OPAL_ENABLE_IPV6
        else if (AF_INET6 == ss.ss_family) {
            struct sockaddr_in6 *si6 = (struct sockaddr_in6*) &ss;
            ret = inet_ntop(AF_INET6, &(si6->sin6_addr), str, (socklen_t) len);
        }
#endif
    }

    /* Any failure above leaves ret NULL: report "Unknown" rather than
     * handing callers a NULL that they would pass to a "%s" format. */
    if (NULL == ret) {
        memcpy(str, unknown, sizeof(unknown));
    }

    return str;
}

opal/util/fd.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
2+
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
33
* Copyright (c) 2009 Sandia National Laboratories. All rights reserved.
44
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
55
*
@@ -94,6 +94,15 @@ OPAL_DECLSPEC bool opal_fd_is_chardev(int fd);
9494
*/
9595
OPAL_DECLSPEC bool opal_fd_is_blkdev(int fd);
9696

97+
/**
98+
* Convenience function to get a string name of the peer on the other
99+
* end of this internet socket.
100+
*
101+
* @param fd File descriptor of an AF_INET/AF_INET6 socket
102+
*
103+
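* @returns the peer's numeric IP address as a string (e.g., "a.b.c.d"), "Unknown" if the peer address cannot be determined, or NULL on failure. A non-NULL string must be freed by the caller.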
104+
*/
105+
OPAL_DECLSPEC const char *opal_fd_get_peer_name(int fd);
97106

98107
END_C_DECLS
99108

orte/mca/oob/tcp/help-oob-tcp.txt

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
1313
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
14-
# Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
14+
# Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
1515
# $COPYRIGHT$
1616
#
1717
# Additional copyrights may follow
@@ -106,10 +106,29 @@ levels.
106106
Remote host: %s
107107
Remote port: %d
108108

109-
110109
The connection was rejected.
111110
#
112111
[static-fwd]
113112
Static ports were requested while orte_fwd_mpirun_port was set.
114113
Both options cannot be simultaneously set. Please either set
115114
orte_fwd_mpirun_port=false or remove any static port directives.
115+
#
116+
[version mismatch]
117+
Open MPI detected a mismatch in versions between two processes. This
118+
typically means that you executed "mpirun" (or "mpiexec") from one
119+
version of Open MPI on one node, but your default path on one of the
120+
other nodes upon which you launched found a different version of Open
121+
MPI.
122+
123+
Open MPI only supports running exactly the same version between all
124+
processes in a single job.
125+
126+
This will almost certainly cause unpredictable behavior, and may end
127+
up aborting your job.
128+
129+
Local host: %s
130+
Local process name: %s
131+
Local Open MPI version: %s
132+
Peer host: %s
133+
Peer process name: %s
134+
Peer Open MPI version: %s

0 commit comments

Comments
 (0)