Skip to content

Commit 10f6a77

Browse files
authored
Merge pull request #7315 from abouteiller/export/tcp_errors_v2
Handle error cases in TCP BTL (v2)
2 parents 969eb02 + 76021e3 commit 10f6a77

File tree

5 files changed

+84
-15
lines changed

5 files changed

+84
-15
lines changed

ompi/mca/pml/ob1/pml_ob1.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2018 The University of Tennessee and The University
6+
* Copyright (c) 2004-2020 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -817,6 +817,14 @@ void mca_pml_ob1_error_handler(
817817
return;
818818
}
819819
#endif /* OPAL_CUDA_SUPPORT */
820+
/* Some BTLs report unreachable errors during normal MPI_Finalize
821+
* termination. Let's simply ignore such errors, since MPI is not supposed to
822+
* be operational anyway.
823+
*/
824+
if(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
825+
return;
826+
}
827+
820828
ompi_rte_abort(-1, btlinfo);
821829
}
822830

opal/mca/btl/tcp/btl_tcp.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2014 The University of Tennessee and The University
6+
* Copyright (c) 2004-2017 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -38,6 +38,8 @@
3838
#include "btl_tcp_proc.h"
3939
#include "btl_tcp_endpoint.h"
4040

41+
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
42+
mca_btl_base_module_error_cb_fn_t cbfunc);
4143

4244
mca_btl_tcp_module_t mca_btl_tcp_module = {
4345
.super = {
@@ -51,11 +53,20 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
5153
.btl_send = mca_btl_tcp_send,
5254
.btl_put = mca_btl_tcp_put,
5355
.btl_dump = mca_btl_base_dump,
56+
.btl_register_error = mca_btl_tcp_register_error_cb, /* register error */
5457
.btl_ft_event = mca_btl_tcp_ft_event
5558
},
5659
.tcp_endpoints_mutex = OPAL_MUTEX_STATIC_INIT
5760
};
5861

62+
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
63+
mca_btl_base_module_error_cb_fn_t cbfunc)
64+
{
65+
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*)btl;
66+
tcp_btl->tcp_error_cb = cbfunc;
67+
return OPAL_SUCCESS;
68+
}
69+
5970
/**
6071
*
6172
*/

opal/mca/btl/tcp/btl_tcp_endpoint.c

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2016 The University of Tennessee and The University
5+
* Copyright (c) 2004-2020 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -388,6 +388,7 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
388388
{
389389
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
390390
if (ret < 0) {
391+
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
391392
mca_btl_tcp_endpoint_close(btl_endpoint);
392393
}
393394
return ret;
@@ -534,24 +535,47 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
534535
btl_endpoint->endpoint_cache_length = 0;
535536
#endif /* MCA_BTL_TCP_ENDPOINT_CACHE */
536537

538+
/* send a message before closing to differentiate between failures and
539+
* clean disconnect during finalize */
540+
if( MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state ) {
541+
mca_btl_tcp_hdr_t fin_msg = {
542+
.base.tag = 0,
543+
.type = MCA_BTL_TCP_HDR_TYPE_FIN,
544+
.count = 0,
545+
.size = 0,
546+
};
547+
mca_btl_tcp_endpoint_send_blocking(btl_endpoint,
548+
&fin_msg, sizeof(fin_msg));
549+
}
550+
537551
CLOSE_THE_SOCKET(btl_endpoint->endpoint_sd);
538552
btl_endpoint->endpoint_sd = -1;
539553
/**
540554
* If we keep failing to connect to the peer let the caller know about
541-
* this situation by triggering all the pending fragments callback and
542-
* reporting the error.
555+
* this situation by triggering the callback on all pending fragments and
556+
* reporting the error. The upper layer then has the opportunity to
557+
* re-route or re-schedule the fragments.
543558
*/
544559
if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) {
545560
mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
546561
if( NULL == frag )
547562
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
548563
while(NULL != frag) {
549564
frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, OPAL_ERR_UNREACH);
550-
565+
if( frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ) {
566+
MCA_BTL_TCP_FRAG_RETURN(frag);
567+
}
551568
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
552569
}
570+
btl_endpoint->endpoint_send_frag = NULL;
571+
/* Let's report the error upstream */
572+
if(NULL != btl_endpoint->endpoint_btl->tcp_error_cb) {
573+
btl_endpoint->endpoint_btl->tcp_error_cb((mca_btl_base_module_t*)btl_endpoint->endpoint_btl, 0,
574+
btl_endpoint->endpoint_proc->proc_opal, "Socket closed");
575+
}
576+
} else {
577+
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
553578
}
554-
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
555579
}
556580

557581
/*
@@ -608,7 +632,6 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
608632
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
609633
true, opal_process_info.nodename,
610634
getpid(), "did not receive entire connect ACK from peer");
611-
612635
return OPAL_ERR_BAD_PARAM;
613636
}
614637
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
@@ -628,6 +651,7 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
628651
if (0 != opal_compare_proc(btl_proc->proc_opal->proc_name, guid)) {
629652
BTL_ERROR(("received unexpected process identifier %s",
630653
OPAL_NAME_PRINT(guid)));
654+
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
631655
mca_btl_tcp_endpoint_close(btl_endpoint);
632656
return OPAL_ERR_UNREACH;
633657
}
@@ -834,6 +858,7 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
834858
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
835859
((struct sockaddr_in*) &endpoint_addr)->sin_port,
836860
strerror(opal_socket_errno), opal_socket_errno));
861+
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
837862
mca_btl_tcp_endpoint_close(btl_endpoint);
838863
return OPAL_ERROR;
839864
}
@@ -850,6 +875,7 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
850875
getpid(), msg,
851876
strerror(so_error), so_error);
852877
free(msg);
878+
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
853879
mca_btl_tcp_endpoint_close(btl_endpoint);
854880
return OPAL_ERROR;
855881
}
@@ -921,12 +947,15 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
921947
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
922948
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected");
923949
}
924-
else if (OPAL_ERR_BAD_PARAM == rc) {
950+
else if (OPAL_ERR_BAD_PARAM == rc
951+
|| OPAL_ERROR == rc) {
925952
/* If we get a BAD_PARAM, it means that it probably wasn't
926953
an OMPI process on the other end of the socket (e.g.,
927-
the magic string ID failed). So we can probably just
928-
close the socket and ignore this connection. */
929-
CLOSE_THE_SOCKET(sd);
954+
the magic string ID failed). recv_connect_ack already cleaned
955+
up the socket. */
956+
/* If we get OPAL_ERROR, the other end closed the connection
957+
* because it has initiated a symmetrical connection on its end.
958+
* recv_connect_ack already cleaned up the socket. */
930959
}
931960
else {
932961
/* Otherwise, it probably *was* an OMPI peer process on
@@ -1065,6 +1094,10 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
10651094
opal_event_del(&btl_endpoint->endpoint_send_event);
10661095
}
10671096
break;
1097+
case MCA_BTL_TCP_FAILED:
1098+
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "event_del(send) [endpoint_send_handler:error]");
1099+
opal_event_del(&btl_endpoint->endpoint_send_event);
1100+
break;
10681101
default:
10691102
BTL_ERROR(("invalid connection state (%d)", btl_endpoint->endpoint_state));
10701103
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "event_del(send) [endpoint_send_handler:error]");

opal/mca/btl/tcp/btl_tcp_frag.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2020 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -212,7 +212,8 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
212212
cnt = readv(sd, frag->iov_ptr, num_vecs);
213213
if( 0 < cnt ) goto advance_iov_position;
214214
if( cnt == 0 ) {
215-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
215+
if(MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state)
216+
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
216217
mca_btl_tcp_endpoint_close(btl_endpoint);
217218
return false;
218219
}
@@ -272,6 +273,10 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
272273
if(frag->iov_cnt == 0) {
273274
if (btl_endpoint->endpoint_nbo && frag->iov_idx == 1) MCA_BTL_TCP_HDR_NTOH(frag->hdr);
274275
switch(frag->hdr.type) {
276+
case MCA_BTL_TCP_HDR_TYPE_FIN:
277+
frag->endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
278+
mca_btl_tcp_endpoint_close(frag->endpoint);
279+
break;
275280
case MCA_BTL_TCP_HDR_TYPE_SEND:
276281
if(frag->iov_idx == 1 && frag->hdr.size) {
277282
frag->segments[0].seg_addr.pval = frag+1;

opal/mca/btl/tcp/btl_tcp_hdr.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2005 The University of Tennessee and The University
5+
* Copyright (c) 2004-2020 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -33,6 +33,18 @@ BEGIN_C_DECLS
3333
#define MCA_BTL_TCP_HDR_TYPE_SEND 1
3434
#define MCA_BTL_TCP_HDR_TYPE_PUT 2
3535
#define MCA_BTL_TCP_HDR_TYPE_GET 3
36+
#define MCA_BTL_TCP_HDR_TYPE_FIN 4
37+
/* The MCA_BTL_TCP_HDR_TYPE_FIN is a special kind of message sent during normal
38+
* connection closing. Before the endpoint closes the socket, it performs a
39+
* 1-way handshake by sending a FIN message in the socket. This lets the other
40+
* end of the connection discriminate between the case in which the peer has
41+
* closed intentionally (e.g., during MPI_FINALIZE), or unintentionally (e.g.,
42+
* as the result of some transmission or process failure).
43+
* The process initiating the close sends the FIN message but does not wait
44+
* for a 2-way handshake and closes the socket immediately. Thus, the recipient
45+
* of a FIN message can simply close the socket and mark the endpoint as closed
46+
* without error, and without answering a FIN message itself.
47+
*/
3648

3749
struct mca_btl_tcp_hdr_t {
3850
mca_btl_base_header_t base;

0 commit comments

Comments
 (0)