Commit 0741fad
Btl tcp: BTL_ERROR to show_help & update func behaviour
As part of improving TCP BTL debugging, convert several BTL_ERROR messages to opal_show_help, and change mca_btl_tcp_endpoint_complete_connect to return OPAL_SUCCESS or an error code instead of void.

Signed-off-by: Mohan Gandhi <[email protected]>
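As context for the second change, a minimal sketch (not part of this commit) of how a caller could consume the new integer return value of mca_btl_tcp_endpoint_complete_connect(); the wrapper function here is hypothetical and only illustrates the OPAL_SUCCESS / error contract:

/* Hypothetical caller, for illustration only: complete_connect() now
 * reports its outcome instead of returning void. */
static void handle_connect_event(mca_btl_base_endpoint_t *btl_endpoint)
{
    int rc = mca_btl_tcp_endpoint_complete_connect(btl_endpoint);
    if (OPAL_SUCCESS != rc) {
        /* the endpoint has already been closed inside complete_connect();
         * the caller only needs to note the failure */
        opal_output_verbose(10, opal_btl_base_framework.framework_output,
                            "btl:tcp: connect completion failed (rc=%d)", rc);
    }
}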
1 parent 368f9f0 commit 0741fad

3 files changed (+118, -19 lines)
opal/mca/btl/tcp/btl_tcp_component.c

Lines changed: 12 additions & 5 deletions
@@ -729,7 +729,9 @@ static int mca_btl_tcp_component_create_instances(void)
             char* if_name = *argv;
             int if_index = opal_ifnametokindex(if_name);
             if(if_index < 0) {
-                BTL_ERROR(("invalid interface \"%s\"", if_name));
+                opal_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude",
+                               true, "include", opal_process_info.nodename,
+                               if_name, "Unknown interface name");
                 ret = OPAL_ERR_NOT_FOUND;
                 goto cleanup;
             }
@@ -960,15 +962,20 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)

    /* set socket up to be non-blocking, otherwise accept could block */
    if((flags = fcntl(sd, F_GETFL, 0)) < 0) {
-        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
-                   strerror(opal_socket_errno), opal_socket_errno));
+        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                       true, opal_process_info.nodename,
+                       getpid(), "fcntl(sd, F_GETFL, 0)",
+                       strerror(opal_socket_errno), opal_socket_errno);
        CLOSE_THE_SOCKET(sd);
        return OPAL_ERROR;
    } else {
        flags |= O_NONBLOCK;
        if(fcntl(sd, F_SETFL, flags) < 0) {
-            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
-                       strerror(opal_socket_errno), opal_socket_errno));
+            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                           true, opal_process_info.nodename,
+                           getpid(),
+                           "fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
+                           strerror(opal_socket_errno), opal_socket_errno);
            CLOSE_THE_SOCKET(sd);
            return OPAL_ERROR;
        }
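For reference, opal_show_help() substitutes its trailing arguments into the %s/%d placeholders of the named topic in order; an annotated sketch of the "socket flag fail" case from this hunk (the comments only restate the template fields added in help-mpi-btl-tcp.txt below):

/* Annotated restatement of the call above; the comments map each
 * argument onto the "[socket flag fail]" template fields. */
opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail", true,
               opal_process_info.nodename,          /* Local host: %s  */
               getpid(),                            /* PID:        %d  */
               "fcntl(sd, F_GETFL, 0)",             /* Flag:       %s  */
               strerror(opal_socket_errno),         /* Error:      %s  */
               opal_socket_errno);                  /*             (%d) */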

opal/mca/btl/tcp/btl_tcp_endpoint.c

Lines changed: 41 additions & 14 deletions
@@ -721,13 +721,23 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpoint)

    /* setup the socket as non-blocking */
    if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
-        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
-                   strerror(opal_socket_errno), opal_socket_errno));
+        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                       true, opal_process_info.nodename,
+                       getpid(), "fcntl(sd, F_GETFL, 0)",
+                       strerror(opal_socket_errno), opal_socket_errno);
+        /* Upper layer will handle the error */
+        return OPAL_ERR_UNREACH;
    } else {
        flags |= O_NONBLOCK;
-        if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
-            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
-                       strerror(opal_socket_errno), opal_socket_errno));
+        if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0) {
+            opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                           true, opal_process_info.nodename,
+                           getpid(),
+                           "fcntl(sd, F_SETFL, flags & O_NONBLOCK)",
+                           strerror(opal_socket_errno), opal_socket_errno);
+            /* Upper layer will handle the error */
+            return OPAL_ERR_UNREACH;
+        }
    }

    /* start the connect - will likely fail with EINPROGRESS */
@@ -778,7 +788,7 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpoint)
 * later. Otherwise, send this processes identifier to the endpoint on the
 * newly connected socket.
 */
-static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
+static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
 {
    int so_error = 0;
    opal_socklen_t so_length = sizeof(so_error);
@@ -794,32 +804,49 @@ static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)

    /* check connect completion status */
    if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
-        BTL_ERROR(("getsockopt() to %s failed: %s (%d)",
+        opal_show_help("help-mpi-btl-tcp.txt", "socket flag fail",
+                       true, opal_process_info.nodename,
+                       getpid(), "fcntl(sd, F_GETFL, 0)",
+                       strerror(opal_socket_errno), opal_socket_errno);
+        BTL_ERROR(("getsockopt() to %s:%d failed: %s (%d)",
                   opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
+                   ((struct sockaddr_in*) &endpoint_addr)->sin_port,
                   strerror(opal_socket_errno), opal_socket_errno));
        mca_btl_tcp_endpoint_close(btl_endpoint);
-        return;
+        return OPAL_ERROR;
    }
    if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) {
-        return;
+        return OPAL_SUCCESS;
    }
    if(so_error != 0) {
-        BTL_ERROR(("connect() to %s failed: %s (%d)",
-                   opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
-                   strerror(so_error), so_error));
+        char *msg;
+        asprintf(&msg, "connect() to %s:%d failed",
+                 opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
+                 ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
+        opal_show_help("help-mpi-btl-tcp.txt", "client connect fail",
+                       true, opal_process_info.nodename,
+                       getpid(), msg,
+                       strerror(opal_socket_errno), opal_socket_errno);
+        free(msg);
        mca_btl_tcp_endpoint_close(btl_endpoint);
-        return;
+        return OPAL_ERROR;
    }

+    opal_output_verbose(10, opal_btl_base_framework.framework_output,
+                        "btl:tcp: connect() to %s:%d completed (complete_connect), sending connect ACK",
+                        opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
+                        ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port));
+
    if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OPAL_SUCCESS) {
        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
        MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, false, "event_add(recv) [complete_connect]");
-        return;
+        return OPAL_SUCCESS;
    }
    MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, false, " [complete_connect]");
    btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
    mca_btl_tcp_endpoint_close(btl_endpoint);
+    return OPAL_ERROR;
 }

opal/mca/btl/tcp/help-mpi-btl-tcp.txt

Lines changed: 65 additions & 0 deletions
@@ -100,3 +100,68 @@ hopefully be able to continue).
   Peer hostname: %s (%s)
   Source IP of socket: %s
   Known IPs of peer: %s
+#
+[socket flag fail]
+WARNING: Open MPI failed to set flags on a TCP socket.  This should
+not happen.  It is likely that your MPI job will now fail.
+
+  Local host: %s
+  PID:        %d
+  Flag:       %s
+  Error:      %s (%d)
+#
+[server did not get guid]
+WARNING: Open MPI accepted a TCP connection from what appears to be
+another Open MPI process but the peer process did not complete the
+initial handshake properly.  This should not happen.
+
+This attempted connection will be ignored; your MPI job may or may not
+continue properly.
+
+  Local host: %s
+  PID:        %d
+#
+[server accept cannot find guid]
+WARNING: Open MPI accepted a TCP connection from what appears to be
+another Open MPI process but cannot find a corresponding process
+entry for that peer.
+
+This attempted connection will be ignored; your MPI job may or may not
+continue properly.
+
+  Local host: %s
+  PID:        %d
+#
+[server getpeername failed]
+WARNING: Open MPI failed to look up the peer IP address information of
+a TCP connection that it just accepted.  This should not happen.
+
+This attempted connection will be ignored; your MPI job may or may not
+continue properly.
+
+  Local host: %s
+  PID:        %d
+  Error:      %s (%d)
+#
+[server cannot find endpoint]
+WARNING: Open MPI accepted a TCP connection from what appears to be a
+valid peer Open MPI process but cannot find a corresponding endpoint
+entry for that peer.  This should not happen.
+
+This attempted connection will be ignored; your MPI job may or may not
+continue properly.
+
+  Local host: %s
+  PID:        %d
+#
+[client connect fail]
+WARNING: Open MPI failed to connect to a peer MPI process via TCP.
+This should not happen.
+
+Your Open MPI job may now fail.
+
+  Local host: %s
+  PID:        %d
+  Message:    %s
+  Error:      %s (%d)
+#
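The "[client connect fail]" topic above is fed by the asprintf()-built message in btl_tcp_endpoint.c. Since asprintf() leaves the output pointer undefined on failure on some platforms, a slightly more defensive variant of that call site might look like the following sketch (not what the commit does; the fallback string is illustrative only):

        char *msg = NULL;
        if (asprintf(&msg, "connect() to %s:%d failed",
                     opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                     ntohs(((struct sockaddr_in*) &endpoint_addr)->sin_port)) < 0) {
            msg = NULL;   /* asprintf failed; fall back to a generic message */
        }
        opal_show_help("help-mpi-btl-tcp.txt", "client connect fail",
                       true, opal_process_info.nodename, getpid(),
                       (NULL != msg) ? msg : "connect() failed",
                       strerror(opal_socket_errno), opal_socket_errno);
        free(msg);   /* free(NULL) is a no-op */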
