@@ -420,6 +420,7 @@ static const char *era_status_to_string(era_proc_status_t s) {
420420 }
421421 return "UNDEFINED STATUS" ;
422422}
423+ #endif /* OPAL_ENABLE_DEBUG */
423424
424425static const char * era_msg_type_to_string (int type ) {
425426 switch (type ) {
@@ -432,7 +433,6 @@ static const char *era_msg_type_to_string(int type) {
432433 }
433434 return "UNDEFINED MESSAGE TYPE" ;
434435}
435- #endif /* OPAL_ENABLE_DEBUG */
436436
437437static ompi_coll_ftagree_era_agreement_info_t * era_lookup_agreement_info (era_identifier_t agreement_id )
438438{
@@ -2184,7 +2184,21 @@ static void send_msg(ompi_communicator_t *comm,
21842184 }
21852185 assert (NULL != peer );
21862186 endpoint = mca_bml_base_get_endpoint (peer );
2187- assert (NULL != endpoint );
2187+ if (NULL == endpoint ) {
2188+ opal_output_verbose (5 , ompi_ftmpi_output_handle ,
2189+ "%s ftagree:agreement (ERA) CANNOT send message [(%d.%d).%d, %s, %08x.%d.%d..] to %d/%s (no endpoint)\n" ,
2190+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
2191+ agreement_id .ERAID_FIELDS .contextid ,
2192+ agreement_id .ERAID_FIELDS .epoch ,
2193+ agreement_id .ERAID_FIELDS .agreementid ,
2194+ era_msg_type_to_string (type ),
2195+ (NULL != value -> bytes )? * (int * )value -> bytes : 0 ,
2196+ value -> header .ret ,
2197+ value -> header .nb_new_dead ,
2198+ dst ,
2199+ NULL != proc_name ? OMPI_NAME_PRINT (proc_name ) : "(null)" );
2200+ return ; /* bail out: the algorithm should reconnect when the failed proc is detected */
2201+ }
21882202 bml_btl = mca_bml_base_btl_array_get_index (& endpoint -> btl_eager , 0 );
21892203 assert (NULL != bml_btl );
21902204 btl_endpoint = bml_btl -> btl_endpoint ;
@@ -2570,7 +2584,7 @@ static void msg_down(era_msg_header_t *msg_header, uint8_t *bytes, int *new_dead
25702584 */
25712585 return ;
25722586 }
2573- /** if I receive a down message on an agreement I know about, I already participated.
2587+ /** if I receive a down message on an agreement I know about, I already participated.
25742588 * There is a non-erroneous code; erroneous execution that may also trigger this assert:
25752589 * consider the following case with false detection:
25762590 * 1. some ancestor A has detected the current process C as failed
0 commit comments