55 * Copyright (c) 2004-2011 The University of Tennessee and The University
66 * of Tennessee Research Foundation. All rights
77 * reserved.
8- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
8+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
99 * University of Stuttgart. All rights reserved.
1010 * Copyright (c) 2004-2005 The Regents of the University of California.
1111 * All rights reserved.
1212 * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
1313 * Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
14- * reserved.
14+ * reserved.
1515 * Copyright (c) 2013 Intel, Inc. All rights reserved.
1616 * Copyright (c) 2014 Research Organization for Information Science
1717 * and Technology (RIST). All rights reserved.
1818 * $COPYRIGHT$
19- *
19+ *
2020 * Additional copyrights may follow
21- *
21+ *
2222 * $HEADER$
2323 */
2424
7373
7474static void * orte_progress_thread_engine (opal_object_t * obj );
7575static bool progress_thread_running = false;
/*
 * Timeout used whenever the placeholder event on the ORTE event base is
 * armed: 3600 seconds (one hour), with no sub-second component.
 */
static struct timeval long_timeout = {
    .tv_sec = 3600,
    .tv_usec = 0
};
80+ static opal_event_t block ;
81+ /*
82+ * If this event is fired, just restart it so that this event base
83+ * continues to have something to block on.
84+ */
85+ static void dummy_timeout_cb (int fd , short args , void * cbdata )
86+ {
87+ opal_event_t * block = (opal_event_t * )cbdata ;
88+
89+ opal_event_add (block , & long_timeout );
90+ }
91+
7692
7793int orte_ess_base_app_setup (bool db_restrict_local )
7894{
@@ -101,7 +117,12 @@ int orte_ess_base_app_setup(bool db_restrict_local)
101117
102118 /* get a separate orte event base */
103119 orte_event_base = opal_event_base_create ();
104-
120+ /* add an event to the new event base (if there are no events,
121+ * opal_event_loop() will return immediately) */
122+ opal_event_set (orte_event_base , & block , -1 , OPAL_EV_PERSIST ,
123+ dummy_timeout_cb , & block );
124+ opal_event_add (& block , & long_timeout );
125+
105126 /* open and setup the state machine */
106127 if (ORTE_SUCCESS != (ret = mca_base_framework_open (& orte_state_base_framework , 0 ))) {
107128 ORTE_ERROR_LOG (ret );
@@ -135,7 +156,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
135156 error = "orte_oob_base_select" ;
136157 goto error ;
137158 }
138-
159+
139160 /* Runtime Messaging Layer */
140161 if (ORTE_SUCCESS != (ret = mca_base_framework_open (& orte_rml_base_framework , 0 ))) {
141162 ORTE_ERROR_LOG (ret );
@@ -147,7 +168,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
147168 error = "orte_rml_base_select" ;
148169 goto error ;
149170 }
150-
171+
151172 /* setup the errmgr */
152173 if (ORTE_SUCCESS != (ret = orte_errmgr_base_select ())) {
153174 ORTE_ERROR_LOG (ret );
@@ -166,7 +187,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
166187 error = "orte_routed_base_select" ;
167188 goto error ;
168189 }
169-
190+
170191 /* database */
171192 if (ORTE_SUCCESS != (ret = mca_base_framework_open (& opal_db_base_framework , 0 ))) {
172193 ORTE_ERROR_LOG (ret );
@@ -194,7 +215,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
194215 error = "orte_grpcomm_base_select" ;
195216 goto error ;
196217 }
197-
218+
198219 /* non-daemon/HNP apps can only have the default proxy PLM
199220 * module open - provide a chance for it to initialize
200221 */
@@ -203,7 +224,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
203224 error = "orte_plm_init" ;
204225 goto error ;
205226 }
206-
227+
207228 /* construct the thread object */
208229 OBJ_CONSTRUCT (& orte_progress_thread , opal_thread_t );
209230 /* fork off a thread to progress it */
@@ -221,15 +242,15 @@ int orte_ess_base_app_setup(bool db_restrict_local)
221242 error = "orte_rml.enable_comm" ;
222243 goto error ;
223244 }
224-
245+
225246 /* setup my session directory */
226247 if (orte_create_session_dirs ) {
227248 OPAL_OUTPUT_VERBOSE ((2 , orte_ess_base_framework .framework_output ,
228249 "%s setting up session dir with\n\ttmpdir: %s\n\thost %s" ,
229250 ORTE_NAME_PRINT (ORTE_PROC_MY_NAME ),
230251 (NULL == orte_process_info .tmpdir_base ) ? "UNDEF" : orte_process_info .tmpdir_base ,
231252 orte_process_info .nodename ));
232-
253+
233254 if (ORTE_SUCCESS != (ret = orte_session_dir (true,
234255 orte_process_info .tmpdir_base ,
235256 orte_process_info .nodename , NULL ,
@@ -238,22 +259,22 @@ int orte_ess_base_app_setup(bool db_restrict_local)
238259 error = "orte_session_dir" ;
239260 goto error ;
240261 }
241-
262+
242263 /* Once the session directory location has been established, set
243264 the opal_output env file location to be in the
244265 proc-specific session directory. */
245266 opal_output_set_output_file_info (orte_process_info .proc_session_dir ,
246267 "output-" , NULL , NULL );
247268 }
248-
269+
249270 /* setup the routed info */
250271 if (ORTE_SUCCESS != (ret = orte_routed .init_routes (ORTE_PROC_MY_NAME -> jobid , NULL ))) {
251272 ORTE_ERROR_LOG (ret );
252273 error = "orte_routed.init_routes" ;
253274 goto error ;
254275 }
255-
256-
276+
277+
257278#if OPAL_ENABLE_FT_CR == 1
258279 /*
259280 * Setup the SnapC
@@ -284,7 +305,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
284305#else
285306 opal_cr_set_enabled (false);
286307#endif
287-
308+
288309 /* Initialize the CR setup
289310 * Note: Always do this, even in non-FT builds.
290311 * If we don't some user level tools may hang.
@@ -308,7 +329,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
308329 }
309330
310331 return ORTE_SUCCESS ;
311-
332+
312333 error :
313334 if (!progress_thread_running ) {
314335 /* can't send the help message, so ensure it
@@ -319,7 +340,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
319340 orte_show_help ("help-orte-runtime.txt" ,
320341 "orte_init:startup:internal-failure" ,
321342 true, error , ORTE_ERROR_NAME (ret ), ret );
322-
343+
323344 return ret ;
324345}
325346
@@ -362,11 +383,12 @@ int orte_ess_base_app_finalize(void)
362383 (void ) mca_base_framework_close (& orte_state_base_framework );
363384
364385 /* release the event base */
386+ opal_event_del (& block );
365387 opal_event_base_free (orte_event_base );
366388
367389 orte_session_dir_finalize (ORTE_PROC_MY_NAME );
368-
369- return ORTE_SUCCESS ;
390+
391+ return ORTE_SUCCESS ;
370392}
371393
372394/*
@@ -396,16 +418,16 @@ void orte_ess_base_app_abort(int status, bool report)
396418
397419 /* Exit - do NOT do a normal finalize as this will very likely
398420 * hang the process. We are aborting due to an abnormal condition
399- * that precludes normal cleanup
421+ * that precludes normal cleanup
400422 *
401- * We do need to do the following bits to make sure we leave a
423+ * We do need to do the following bits to make sure we leave a
402424 * clean environment. Taken from orte_finalize():
403425 * - Assume errmgr cleans up child processes before we exit.
404426 */
405-
427+
406428 /* CRS cleanup since it may have a named pipe and thread active */
407429 orte_cr_finalize ();
408-
430+
409431 /* If we were asked to report this termination, do so.
410432 * Since singletons don't start an HNP unless necessary, and
411433 * direct-launched procs don't have daemons at all, only send
@@ -425,7 +447,7 @@ void orte_ess_base_app_abort(int status, bool report)
425447 * have a chance to be sent */
426448 nanosleep (& tp , NULL ); \
427449 }
428-
450+
429451 /* Now Exit */
430452 _exit (status );
431453}
0 commit comments