33 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44 * University Research and Technology
55 * Corporation. All rights reserved.
6- * Copyright (c) 2004-2020 The University of Tennessee and The University
6+ * Copyright (c) 2004-2022 The University of Tennessee and The University
77 * of Tennessee Research Foundation. All rights
88 * reserved.
99 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -343,10 +343,10 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
343343 opal_mutex_unlock (& errhandler_ftmpi_lock );
344344
345345 opal_output_verbose (1 , ompi_ftmpi_output_handle ,
346- "%s ompi: Process %s failed (state = %d)." ,
346+ "%s ompi: Process %s failed (state = %d %s )." ,
347347 OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
348348 OMPI_NAME_PRINT (& ompi_proc -> super .proc_name ),
349- status );
349+ status , PMIx_Error_string ( status ) );
350350
351351 if (90 < opal_output_get_verbosity (ompi_ftmpi_output_handle )) {
352352 /* how did we get there? */
@@ -413,7 +413,7 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
413413 * The wait function has a check, so all we need to do here is
414414 * signal it so it will check again.
415415 */
416- wait_sync_global_wakeup (MPI_ERR_PROC_FAILED );
416+ wait_sync_global_wakeup (PMIX_ERR_PROC_ABORTED == status ? MPI_ERR_PROC_ABORTED : MPI_ERR_PROC_FAILED );
417417
418418 /* Collectives:
419419 * Propagate the error (this has been selected rather than the "roll
@@ -430,12 +430,11 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
430430 pmix_info_t pmix_info [1 ];
431431 pmix_status_t prc ;
432432
433- assert (OPAL_ERR_PROC_ABORTED == status );
434433 OPAL_PMIX_CONVERT_NAME (& pmix_source , OMPI_PROC_MY_NAME );
435434 OPAL_PMIX_CONVERT_NAME (& pmix_proc , & ompi_proc -> super .proc_name );
436435 PMIX_INFO_CONSTRUCT (& pmix_info [0 ]);
437436 PMIX_INFO_LOAD (& pmix_info [0 ], PMIX_EVENT_AFFECTED_PROC , & pmix_proc , PMIX_PROC );
438- prc = PMIx_Notify_event (PMIX_ERR_PROC_ABORTED , & pmix_source , PMIX_RANGE_LOCAL ,
437+ prc = PMIx_Notify_event (PMIX_ERR_PROC_TERM_WO_SYNC , & pmix_source , PMIX_RANGE_LOCAL ,
439438 pmix_info , 1 , NULL , & active );
440439 if ( PMIX_SUCCESS != prc &&
441440 PMIX_OPERATION_SUCCEEDED != prc ) {
@@ -468,10 +467,16 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
468467 opal_process_name_t prc ;
469468 int rc ;
470469#if OPAL_ENABLE_FT_MPI
471- if ( PMIX_ERR_PROC_ABORTED == status ) {
472- int i ;
473- for (i = 0 ; i < event -> nvalue ; i ++ ) {
470+ switch ( status ) {
471+ case PMIX_ERR_PROC_TERM_WO_SYNC :
472+ case PMIX_ERR_PROC_ABORTED_BY_SIG :
473+ case PMIX_ERR_PROC_ABORTED : /* that is, proc aborted by pmix_abort */
474+ for (int i = 0 ; i < event -> nvalue ; i ++ ) {
474475 if (PMIX_PROC != event -> info [i ].value .type ) {
476+ OPAL_OUTPUT_VERBOSE ((70 , ompi_ftmpi_output_handle ,
477+ "%s ompi: ignoring the following key for a PMIx fault event: %s" ,
478+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
479+ event -> info [i ].key ));
475480 continue ;
476481 }
477482 OPAL_PMIX_CONVERT_PROCT (rc , & prc , event -> info [i ].value .data .proc );
@@ -484,20 +489,26 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
484489 continue ; /* we are not 'MPI connected' with this proc. */
485490 }
486491 assert ( !ompi_proc_is_sentinel (proc ) );
487- ompi_errhandler_proc_failed_internal (proc , OPAL_ERR_PROC_ABORTED , false);
492+ ompi_errhandler_proc_failed_internal (proc , status , false);
488493 }
489494 opal_event_del (& event -> super );
490495 free (event );
491496 return NULL ;
497+ case PMIX_ERR_LOST_CONNECTION :
498+ opal_output_verbose (1 , ompi_ftmpi_output_handle ,
499+ "%s ompi: Error event PMIX_ERR_LOST_CONNECTION reported, that usually means that my daemon died thus I need to go away." ,
500+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ));
501+ break ;
502+ default :
503+ /* An unmanaged type of failure, let it do its thing. */
504+ opal_output_verbose (1 , ompi_ftmpi_output_handle ,
505+ "%s ompi: Error event reported through PMIx from %s (state = %d). "
506+ "This error type is not handled by the fault tolerant layer "
507+ "and the application will now presumably abort." ,
508+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
509+ OPAL_NAME_PRINT (source ),
510+ status );
492511 }
493- /* An unmanaged type of failure, let it do its thing. */
494- opal_output_verbose (1 , ompi_ftmpi_output_handle ,
495- "%s ompi: Error event reported through PMIx from %s (state = %d). "
496- "This error type is not handled by the fault tolerant layer "
497- "and the application will now presumably abort." ,
498- OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
499- OPAL_NAME_PRINT (source ),
500- status );
501512#endif /* OPAL_ENABLE_FT_MPI */
502513 opal_event_del (& event -> super );
503514 free (event );
0 commit comments