2
2
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3
3
* University Research and Technology
4
4
* Corporation. All rights reserved.
5
- * Copyright (c) 2004-2016 The University of Tennessee and The University
5
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
6
6
* of Tennessee Research Foundation. All rights
7
7
* reserved.
8
8
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -388,6 +388,7 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
388
388
{
389
389
int ret = mca_btl_tcp_send_blocking (btl_endpoint -> endpoint_sd , data , size );
390
390
if (ret < 0 ) {
391
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_FAILED ;
391
392
mca_btl_tcp_endpoint_close (btl_endpoint );
392
393
}
393
394
return ret ;
@@ -534,24 +535,47 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
534
535
btl_endpoint -> endpoint_cache_length = 0 ;
535
536
#endif /* MCA_BTL_TCP_ENDPOINT_CACHE */
536
537
538
+ /* send a message before closing to differentiate between failures and
539
+ * clean disconnect during finalize */
540
+ if ( MCA_BTL_TCP_CONNECTED == btl_endpoint -> endpoint_state ) {
541
+ mca_btl_tcp_hdr_t fin_msg = {
542
+ .base .tag = 0 ,
543
+ .type = MCA_BTL_TCP_HDR_TYPE_FIN ,
544
+ .count = 0 ,
545
+ .size = 0 ,
546
+ };
547
+ mca_btl_tcp_endpoint_send_blocking (btl_endpoint ,
548
+ & fin_msg , sizeof (fin_msg ));
549
+ }
550
+
537
551
CLOSE_THE_SOCKET (btl_endpoint -> endpoint_sd );
538
552
btl_endpoint -> endpoint_sd = -1 ;
539
553
/**
540
554
* If we keep failing to connect to the peer let the caller know about
541
- * this situation by triggering all the pending fragments callback and
542
- * reporting the error.
555
+ * this situation by triggering the callback on all pending fragments and
556
+ * reporting the error. The upper layer then has the opportunity to
557
+ * re-route or re-schedule the fragments.
543
558
*/
544
559
if ( MCA_BTL_TCP_FAILED == btl_endpoint -> endpoint_state ) {
545
560
mca_btl_tcp_frag_t * frag = btl_endpoint -> endpoint_send_frag ;
546
561
if ( NULL == frag )
547
562
frag = (mca_btl_tcp_frag_t * )opal_list_remove_first (& btl_endpoint -> endpoint_frags );
548
563
while (NULL != frag ) {
549
564
frag -> base .des_cbfunc (& frag -> btl -> super , frag -> endpoint , & frag -> base , OPAL_ERR_UNREACH );
550
-
565
+ if ( frag -> base .des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ) {
566
+ MCA_BTL_TCP_FRAG_RETURN (frag );
567
+ }
551
568
frag = (mca_btl_tcp_frag_t * )opal_list_remove_first (& btl_endpoint -> endpoint_frags );
552
569
}
570
+ btl_endpoint -> endpoint_send_frag = NULL ;
571
+ /* Let's report the error upstream */
572
+ if (NULL != btl_endpoint -> endpoint_btl -> tcp_error_cb ) {
573
+ btl_endpoint -> endpoint_btl -> tcp_error_cb ((mca_btl_base_module_t * )btl_endpoint -> endpoint_btl , 0 ,
574
+ btl_endpoint -> endpoint_proc -> proc_opal , "Socket closed" );
575
+ }
576
+ } else {
577
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_CLOSED ;
553
578
}
554
- btl_endpoint -> endpoint_state = MCA_BTL_TCP_CLOSED ;
555
579
}
556
580
557
581
/*
@@ -608,7 +632,6 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
608
632
opal_show_help ("help-mpi-btl-tcp.txt" , "client handshake fail" ,
609
633
true, opal_process_info .nodename ,
610
634
getpid (), "did not receive entire connect ACK from peer" );
611
-
612
635
return OPAL_ERR_BAD_PARAM ;
613
636
}
614
637
if (0 != strncmp (hs_msg .magic_id , mca_btl_tcp_magic_id_string , len )) {
@@ -628,6 +651,7 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
628
651
if (0 != opal_compare_proc (btl_proc -> proc_opal -> proc_name , guid )) {
629
652
BTL_ERROR (("received unexpected process identifier %s" ,
630
653
OPAL_NAME_PRINT (guid )));
654
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_FAILED ;
631
655
mca_btl_tcp_endpoint_close (btl_endpoint );
632
656
return OPAL_ERR_UNREACH ;
633
657
}
@@ -834,6 +858,7 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
834
858
opal_net_get_hostname ((struct sockaddr * ) & endpoint_addr ),
835
859
((struct sockaddr_in * ) & endpoint_addr )-> sin_port ,
836
860
strerror (opal_socket_errno ), opal_socket_errno ));
861
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_FAILED ;
837
862
mca_btl_tcp_endpoint_close (btl_endpoint );
838
863
return OPAL_ERROR ;
839
864
}
@@ -850,6 +875,7 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
850
875
getpid (), msg ,
851
876
strerror (so_error ), so_error );
852
877
free (msg );
878
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_FAILED ;
853
879
mca_btl_tcp_endpoint_close (btl_endpoint );
854
880
return OPAL_ERROR ;
855
881
}
@@ -921,12 +947,15 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
921
947
OPAL_THREAD_UNLOCK (& btl_endpoint -> endpoint_send_lock );
922
948
MCA_BTL_TCP_ENDPOINT_DUMP (10 , btl_endpoint , true, "connected" );
923
949
}
924
- else if (OPAL_ERR_BAD_PARAM == rc ) {
950
+ else if (OPAL_ERR_BAD_PARAM == rc
951
+ || OPAL_ERROR == rc ) {
925
952
/* If we get a BAD_PARAM, it means that it probably wasn't
926
953
an OMPI process on the other end of the socket (e.g.,
927
- the magic string ID failed). So we can probably just
928
- close the socket and ignore this connection. */
929
- CLOSE_THE_SOCKET (sd );
954
+ the magic string ID failed). recv_connect_ack already cleaned
955
+ up the socket. */
956
+ /* If we get OPAL_ERROR, the other end closed the connection
957
+ * because it has initiated a symmetrical connection on its end.
958
+ * recv_connect_ack already cleaned up the socket. */
930
959
}
931
960
else {
932
961
/* Otherwise, it probably *was* an OMPI peer process on
@@ -1065,6 +1094,10 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
1065
1094
opal_event_del (& btl_endpoint -> endpoint_send_event );
1066
1095
}
1067
1096
break ;
1097
+ case MCA_BTL_TCP_FAILED :
1098
+ MCA_BTL_TCP_ENDPOINT_DUMP (1 , btl_endpoint , true, "event_del(send) [endpoint_send_handler:error]" );
1099
+ opal_event_del (& btl_endpoint -> endpoint_send_event );
1100
+ break ;
1068
1101
default :
1069
1102
BTL_ERROR (("invalid connection state (%d)" , btl_endpoint -> endpoint_state ));
1070
1103
MCA_BTL_TCP_ENDPOINT_DUMP (1 , btl_endpoint , true, "event_del(send) [endpoint_send_handler:error]" );
0 commit comments