2
2
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3
3
* University Research and Technology
4
4
* Corporation. All rights reserved.
5
- * Copyright (c) 2004-2016 The University of Tennessee and The University
5
+ * Copyright (c) 2004-2017 The University of Tennessee and The University
6
6
* of Tennessee Research Foundation. All rights
7
7
* reserved.
8
8
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -388,6 +388,7 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
388
388
{
389
389
int ret = mca_btl_tcp_send_blocking (btl_endpoint -> endpoint_sd , data , size );
390
390
if (ret < 0 ) {
391
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_FAILED ;
391
392
mca_btl_tcp_endpoint_close (btl_endpoint );
392
393
}
393
394
return ret ;
@@ -538,20 +539,30 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
538
539
btl_endpoint -> endpoint_sd = -1 ;
539
540
/**
540
541
* If we keep failing to connect to the peer let the caller know about
541
- * this situation by triggering all the pending fragments callback and
542
- * reporting the error.
542
+ * this situation by triggering the callback on all pending fragments and
543
+ * reporting the error. The upper layer then has the opportunity to
544
+ * re-route or re-schedule the fragments.
543
545
*/
544
546
if ( MCA_BTL_TCP_FAILED == btl_endpoint -> endpoint_state ) {
545
547
mca_btl_tcp_frag_t * frag = btl_endpoint -> endpoint_send_frag ;
546
548
if ( NULL == frag )
547
549
frag = (mca_btl_tcp_frag_t * )opal_list_remove_first (& btl_endpoint -> endpoint_frags );
548
550
while (NULL != frag ) {
549
551
frag -> base .des_cbfunc (& frag -> btl -> super , frag -> endpoint , & frag -> base , OPAL_ERR_UNREACH );
550
-
552
+ if ( frag -> base .des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ) {
553
+ MCA_BTL_TCP_FRAG_RETURN (frag );
554
+ }
551
555
frag = (mca_btl_tcp_frag_t * )opal_list_remove_first (& btl_endpoint -> endpoint_frags );
552
556
}
557
+ btl_endpoint -> endpoint_send_frag = NULL ;
558
+ /* Let's report the error upstream */
559
+ if (NULL != btl_endpoint -> endpoint_btl -> tcp_error_cb ) {
560
+ btl_endpoint -> endpoint_btl -> tcp_error_cb ((mca_btl_base_module_t * )btl_endpoint -> endpoint_btl , 0 ,
561
+ btl_endpoint -> endpoint_proc -> proc_opal , "Socket closed" );
562
+ }
563
+ } else {
564
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_CLOSED ;
553
565
}
554
- btl_endpoint -> endpoint_state = MCA_BTL_TCP_CLOSED ;
555
566
}
556
567
557
568
/*
@@ -608,7 +619,6 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
608
619
opal_show_help ("help-mpi-btl-tcp.txt" , "client handshake fail" ,
609
620
true, opal_process_info .nodename ,
610
621
getpid (), "did not receive entire connect ACK from peer" );
611
-
612
622
return OPAL_ERR_BAD_PARAM ;
613
623
}
614
624
if (0 != strncmp (hs_msg .magic_id , mca_btl_tcp_magic_id_string , len )) {
@@ -628,6 +638,7 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
628
638
if (0 != opal_compare_proc (btl_proc -> proc_opal -> proc_name , guid )) {
629
639
BTL_ERROR (("received unexpected process identifier %s" ,
630
640
OPAL_NAME_PRINT (guid )));
641
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_FAILED ;
631
642
mca_btl_tcp_endpoint_close (btl_endpoint );
632
643
return OPAL_ERR_UNREACH ;
633
644
}
@@ -834,6 +845,7 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
834
845
opal_net_get_hostname ((struct sockaddr * ) & endpoint_addr ),
835
846
((struct sockaddr_in * ) & endpoint_addr )-> sin_port ,
836
847
strerror (opal_socket_errno ), opal_socket_errno ));
848
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_FAILED ;
837
849
mca_btl_tcp_endpoint_close (btl_endpoint );
838
850
return OPAL_ERROR ;
839
851
}
@@ -850,6 +862,7 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
850
862
getpid (), msg ,
851
863
strerror (so_error ), so_error );
852
864
free (msg );
865
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_FAILED ;
853
866
mca_btl_tcp_endpoint_close (btl_endpoint );
854
867
return OPAL_ERROR ;
855
868
}
@@ -921,12 +934,15 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
921
934
OPAL_THREAD_UNLOCK (& btl_endpoint -> endpoint_send_lock );
922
935
MCA_BTL_TCP_ENDPOINT_DUMP (10 , btl_endpoint , true, "connected" );
923
936
}
924
- else if (OPAL_ERR_BAD_PARAM == rc ) {
937
+ else if (OPAL_ERR_BAD_PARAM == rc
938
+ || OPAL_ERROR == rc ) {
925
939
/* If we get a BAD_PARAM, it means that it probably wasn't
926
940
an OMPI process on the other end of the socket (e.g.,
927
- the magic string ID failed). So we can probably just
928
- close the socket and ignore this connection. */
929
- CLOSE_THE_SOCKET (sd );
941
+ the magic string ID failed). recv_connect_ack already cleaned
942
+ up the socket. */
943
+ /* If we get OPAL_ERROR, the other end closed the connection
944
+ * because it has initiated a symmetrical connection on its end.
945
+ * recv_connect_ack already cleaned up the socket. */
930
946
}
931
947
else {
932
948
/* Otherwise, it probably *was* an OMPI peer process on
@@ -1065,6 +1081,8 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
1065
1081
opal_event_del (& btl_endpoint -> endpoint_send_event );
1066
1082
}
1067
1083
break ;
1084
+ case MCA_BTL_TCP_FAILED :
1085
+ break ;
1068
1086
default :
1069
1087
BTL_ERROR (("invalid connection state (%d)" , btl_endpoint -> endpoint_state ));
1070
1088
MCA_BTL_TCP_ENDPOINT_DUMP (1 , btl_endpoint , true, "event_del(send) [endpoint_send_handler:error]" );
0 commit comments