@@ -48,6 +48,7 @@ static const char *dash_line
4848 = "--------------------------------------------------------------------------\n" ;
4949static int output_stream = -1 ;
5050static char * * search_dirs = NULL ;
51+ static bool opal_help_want_aggregate = true;
5152
5253/*
5354 * Local functions
@@ -58,13 +59,28 @@ static int opal_show_help_internal(const char *filename, const char *topic, int
5859 ...);
5960static void opal_show_help_finalize (void );
6061
/* Context passed as cbdata to PMIx_Log_nb() by local_delivery() and released in opal_show_help_cbfunc(). */
62+ typedef struct {
63+ pmix_info_t * info ; /* log payload array created in local_delivery() */
64+ pmix_info_t * dirs ; /* logging directives; stays NULL (from calloc) when aggregation is disabled */
65+ char * msg ; /* formatted message; printed by the callback on failure, then freed */
66+ } opal_log_info_t ;
67+
6168opal_show_help_fn_t opal_show_help = opal_show_help_internal ;
6269opal_show_vhelp_fn_t opal_show_vhelp = opal_show_vhelp_internal ;
6370
6471int opal_show_help_init (void )
6572{
6673 opal_output_stream_t lds ;
6774
75+ opal_help_want_aggregate = true;
76+ mca_base_var_register ("opal" , NULL , "base" , "help_aggregate" ,
77+ "If opal_base_help_aggregate is true, duplicate help messages will be aggregated rather "
78+ "than displayed individually. This can be helpful for parallel jobs that experience "
79+ "multiple identical failures; rather than print out the same help/failure message N times, "
80+ "display it once with a count of how many processes sent the same message. Default: true." ,
81+ MCA_BASE_VAR_TYPE_BOOL , NULL , 0 , 0 , OPAL_INFO_LVL_9 ,
82+ MCA_BASE_VAR_SCOPE_LOCAL , & opal_help_want_aggregate );
83+
6884 OBJ_CONSTRUCT (& lds , opal_output_stream_t );
6985 lds .lds_want_stderr = true;
7086 output_stream = opal_output_open (& lds );
@@ -88,6 +104,58 @@ static void opal_show_help_finalize(void)
88104 }
89105}
90106
107+ static void opal_show_help_cbfunc (pmix_status_t status , void * cbdata )
108+ {
109+ opal_log_info_t * info = (opal_log_info_t * ) cbdata ;
110+ if (PMIX_SUCCESS != status && PMIX_OPERATION_SUCCEEDED != status ) {
111+ // Aggregation/de-duplication functionality is *probably* lost,
112+ // but let's print the error anyway since duplicate error messages
113+ // is better than hiding it.
114+ opal_output (output_stream , "%s" , info -> msg );
115+ }
116+ PMIX_INFO_DESTRUCT (info -> info );
117+ if (info -> dirs ) {
118+ PMIX_INFO_DESTRUCT (info -> dirs );
119+ }
120+ free (info -> msg );
121+ free (info );
122+ }
123+
124+ static void local_delivery (const char * file , const char * topic , char * msg ) {
125+ pmix_info_t * info , * dirs ;
126+ int ninfo = 0 , ndirs = 0 ;
127+ PMIX_INFO_CREATE (info , 1 );
128+ PMIX_INFO_LOAD (& info [ninfo ++ ], PMIX_LOG_STDERR , msg , PMIX_STRING );
129+
130+ opal_log_info_t * cbdata = calloc (1 , sizeof (opal_log_info_t ));
131+ if (opal_help_want_aggregate ) {
132+ PMIX_INFO_CREATE (dirs , 3 );
133+ PMIX_INFO_LOAD (& dirs [ndirs ++ ], PMIX_LOG_AGG , & opal_help_want_aggregate , PMIX_BOOL );
134+ PMIX_INFO_LOAD (& dirs [ndirs ++ ], PMIX_LOG_KEY , file , PMIX_STRING );
135+ PMIX_INFO_LOAD (& dirs [ndirs ++ ], PMIX_LOG_VAL , topic , PMIX_STRING );
136+ cbdata -> dirs = dirs ;
137+ }
138+
139+ cbdata -> info = info ;
140+ cbdata -> msg = msg ;
141+
142+ // PMIx and the runtime will aggregate, de-duplicate, and print this
143+ // message to stderr.
144+ pmix_status_t rc = PMIx_Log_nb (info , ninfo , dirs , ndirs , opal_show_help_cbfunc , cbdata );
145+ if (PMIX_SUCCESS != rc ) {
146+ // Aggregation/de-duplication functionality is *definitely* lost,
147+ // but let's print the error anyway since duplicate error messages
148+ // is better than hiding it.
149+ opal_output (output_stream , "%s" , msg );
150+ PMIX_INFO_DESTRUCT (info );
151+ if (opal_help_want_aggregate ) {
152+ PMIX_INFO_DESTRUCT (dirs );
153+ }
154+ free (msg );
155+ free (cbdata );
156+ }
157+ }
158+
91159/*
92160 * Make one big string with all the lines. This isn't the most
93161 * efficient method in the world, but we're going for clarity here --
@@ -180,10 +248,12 @@ static int open_file(const char *base, const char *topic)
180248
181249 /* If we still couldn't open it, then something is wrong */
182250 if (NULL == opal_show_help_yyin ) {
183- opal_output (output_stream ,
251+ char * tmp ;
252+ opal_asprintf (& tmp ,
184253 "%sSorry! You were supposed to get help about:\n %s\nBut I couldn't open "
185254 "the help file:\n %s. Sorry!\n%s" ,
186255 dash_line , topic , err_msg , dash_line );
256+ local_delivery (topic , err_msg , tmp );
187257 free (err_msg );
188258 return OPAL_ERR_NOT_FOUND ;
189259 }
@@ -231,14 +301,15 @@ static int find_topic(const char *base, const char *topic)
231301 case OPAL_SHOW_HELP_PARSE_MESSAGE :
232302 break ;
233303
234- case OPAL_SHOW_HELP_PARSE_DONE :
235- opal_output (output_stream ,
304+ case OPAL_SHOW_HELP_PARSE_DONE : {
305+ char * msg ;
306+ opal_asprintf (& msg ,
236307 "%sSorry! You were supposed to get help about:\n %s\nfrom the file:\n "
237308 " %s\nBut I couldn't find that topic in the file. Sorry!\n%s" ,
238309 dash_line , topic , base , dash_line );
310+ local_delivery (topic , base , msg );
239311 return OPAL_ERR_NOT_FOUND ;
240- break ;
241-
312+ }
242313 default :
243314 break ;
244315 }
@@ -344,8 +415,7 @@ static int opal_show_vhelp_internal(const char *filename, const char *topic, int
344415
345416 /* If we got a single string, output it with formatting */
346417 if (NULL != output ) {
347- opal_output (output_stream , "%s" , output );
348- free (output );
418+ local_delivery (filename , topic , output );
349419 }
350420
351421 return (NULL == output ) ? OPAL_ERROR : OPAL_SUCCESS ;
0 commit comments