@@ -420,6 +420,7 @@ static const char *era_status_to_string(era_proc_status_t s) {
420420 }
421421 return "UNDEFINED STATUS" ;
422422}
423+ #endif /* OPAL_ENABLE_DEBUG */
423424
424425static const char * era_msg_type_to_string (int type ) {
425426 switch (type ) {
@@ -432,7 +433,6 @@ static const char *era_msg_type_to_string(int type) {
432433 }
433434 return "UNDEFINED MESSAGE TYPE" ;
434435}
435- #endif /* OPAL_ENABLE_DEBUG */
436436
437437static ompi_coll_ftagree_era_agreement_info_t * era_lookup_agreement_info (era_identifier_t agreement_id )
438438{
@@ -808,12 +808,12 @@ static void era_update_new_dead_list(ompi_coll_ftagree_era_agreement_info_t *ci)
808808 }
809809
810810 OPAL_OUTPUT_VERBOSE ((30 , ompi_ftmpi_output_handle ,
811- "%s ftagree:agreement (ERA) agreement (%d.%d).%d -- adding %d procs to the list of newly dead processes" ,
811+ "%s ftagree:agreement (ERA) agreement (%d.%d).%d -- adding %d procs to the list of newly dead processes (%d currently; AFR size is %d) " ,
812812 OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
813813 ci -> agreement_id .ERAID_FIELDS .contextid ,
814814 ci -> agreement_id .ERAID_FIELDS .epoch ,
815815 ci -> agreement_id .ERAID_FIELDS .agreementid ,
816- r ));
816+ r , ci -> current_value -> new_dead_array , ags -> afr_size ));
817817
818818#if OPAL_ENABLE_DEBUG
819819 {
@@ -1372,16 +1372,14 @@ static void era_build_tree_structure(ompi_coll_ftagree_era_agreement_info_t *ci)
13721372
13731373 era_call_tree_fn (ci );
13741374
1375- if ( ompi_comm_rank (ci -> comm ) == 0 ) {
1376- OPAL_OUTPUT_VERBOSE ((4 , ompi_ftmpi_output_handle ,
1377- "%s ftagree:agreement (ERA) Agreement (%d.%d).%d: re-built the tree structure with size %d: %s\n" ,
1378- OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
1379- ci -> agreement_id .ERAID_FIELDS .contextid ,
1380- ci -> agreement_id .ERAID_FIELDS .epoch ,
1381- ci -> agreement_id .ERAID_FIELDS .agreementid ,
1382- AGS (ci -> comm )-> tree_size ,
1383- era_debug_tree (ci -> ags -> tree , ci -> ags -> tree_size )));
1384- }
1375+ OPAL_OUTPUT_VERBOSE (((ompi_comm_rank (ci -> comm ) == 0 )? 4 : 50 , ompi_ftmpi_output_handle ,
1376+ "%s ftagree:agreement (ERA) Agreement (%d.%d).%d: re-built the tree structure with size %d: %s\n" ,
1377+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
1378+ ci -> agreement_id .ERAID_FIELDS .contextid ,
1379+ ci -> agreement_id .ERAID_FIELDS .epoch ,
1380+ ci -> agreement_id .ERAID_FIELDS .agreementid ,
1381+ AGS (ci -> comm )-> tree_size ,
1382+ era_debug_tree (ci -> ags -> tree , ci -> ags -> tree_size )));
13851383
13861384#if OPAL_ENABLE_DEBUG
13871385 era_tree_check (ci -> ags -> tree , ci -> ags -> tree_size , 0 );
@@ -2184,7 +2182,21 @@ static void send_msg(ompi_communicator_t *comm,
21842182 }
21852183 assert (NULL != peer );
21862184 endpoint = mca_bml_base_get_endpoint (peer );
2187- assert (NULL != endpoint );
2185+ if (NULL == endpoint ) {
2186+ opal_output_verbose (5 , ompi_ftmpi_output_handle ,
2187+ "%s ftagree:agreement (ERA) CANNOT send message [(%d.%d).%d, %s, %08x.%d.%d..] to %d/%s (no endpoint)\n" ,
2188+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
2189+ agreement_id .ERAID_FIELDS .contextid ,
2190+ agreement_id .ERAID_FIELDS .epoch ,
2191+ agreement_id .ERAID_FIELDS .agreementid ,
2192+ era_msg_type_to_string (type ),
2193+ (NULL != value -> bytes )? * (int * )value -> bytes : 0 ,
2194+ value -> header .ret ,
2195+ value -> header .nb_new_dead ,
2196+ dst ,
2197+ NULL != proc_name ? OMPI_NAME_PRINT (proc_name ) : "(null)" );
2198+ return ; /* bail out: the algorithm should reconnect when the failed proc is detected */
2199+ }
21882200 bml_btl = mca_bml_base_btl_array_get_index (& endpoint -> btl_eager , 0 );
21892201 assert (NULL != bml_btl );
21902202 btl_endpoint = bml_btl -> btl_endpoint ;
@@ -2570,7 +2582,7 @@ static void msg_down(era_msg_header_t *msg_header, uint8_t *bytes, int *new_dead
25702582 */
25712583 return ;
25722584 }
2573- /** if I receive a down message on an agreement I know about, I already participated.
2585+ /** if I receive a down message on an agreement I know about, I already participated.
25742586 * There is a non-erroneous code; erroneous execution that may also trigger this assert:
25752587 * consider the following case with false detection:
25762588 * 1. some ancestor A has detected the current process C as failed
0 commit comments