Skip to content

Commit e0cd931

Browse files
author
Ralph Castain
committed
This is a 1.10-only fix: ensure we always have an event on the async progress thread so that it doesn't consume CPU for no purpose. In the case of singletons, the OOB has no event it can place on the thread because no HNP is being started, and so the thread spins.
1 parent 4c4941d commit e0cd931

File tree

1 file changed

+48
-26
lines changed

1 file changed

+48
-26
lines changed

orte/mca/ess/base/ess_base_std_app.c

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,20 @@
55
* Copyright (c) 2004-2011 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
8-
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
8+
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
1313
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights
14-
* reserved.
14+
* reserved.
1515
* Copyright (c) 2013 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2014 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
1818
* $COPYRIGHT$
19-
*
19+
*
2020
* Additional copyrights may follow
21-
*
21+
*
2222
* $HEADER$
2323
*/
2424

@@ -73,6 +73,22 @@
7373

7474
static void* orte_progress_thread_engine(opal_object_t *obj);
7575
static bool progress_thread_running = false;
76+
static struct timeval long_timeout = {
77+
.tv_sec = 3600,
78+
.tv_usec = 0
79+
};
80+
static opal_event_t block;
81+
/*
82+
* If this event is fired, just restart it so that this event base
83+
* continues to have something to block on.
84+
*/
85+
static void dummy_timeout_cb(int fd, short args, void *cbdata)
86+
{
87+
opal_event_t *block = (opal_event_t*)cbdata;
88+
89+
opal_event_add(block, &long_timeout);
90+
}
91+
7692

7793
int orte_ess_base_app_setup(bool db_restrict_local)
7894
{
@@ -101,7 +117,12 @@ int orte_ess_base_app_setup(bool db_restrict_local)
101117

102118
/* get a separate orte event base */
103119
orte_event_base = opal_event_base_create();
104-
120+
/* add an event to the new event base (if there are no events,
121+
* opal_event_loop() will return immediately) */
122+
opal_event_set(orte_event_base, &block, -1, OPAL_EV_PERSIST,
123+
dummy_timeout_cb, &block);
124+
opal_event_add(&block, &long_timeout);
125+
105126
/* open and setup the state machine */
106127
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
107128
ORTE_ERROR_LOG(ret);
@@ -135,7 +156,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
135156
error = "orte_oob_base_select";
136157
goto error;
137158
}
138-
159+
139160
/* Runtime Messaging Layer */
140161
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
141162
ORTE_ERROR_LOG(ret);
@@ -147,7 +168,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
147168
error = "orte_rml_base_select";
148169
goto error;
149170
}
150-
171+
151172
/* setup the errmgr */
152173
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
153174
ORTE_ERROR_LOG(ret);
@@ -166,7 +187,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
166187
error = "orte_routed_base_select";
167188
goto error;
168189
}
169-
190+
170191
/* database */
171192
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_db_base_framework, 0))) {
172193
ORTE_ERROR_LOG(ret);
@@ -194,7 +215,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
194215
error = "orte_grpcomm_base_select";
195216
goto error;
196217
}
197-
218+
198219
/* non-daemon/HNP apps can only have the default proxy PLM
199220
* module open - provide a chance for it to initialize
200221
*/
@@ -203,7 +224,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
203224
error = "orte_plm_init";
204225
goto error;
205226
}
206-
227+
207228
/* construct the thread object */
208229
OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
209230
/* fork off a thread to progress it */
@@ -221,15 +242,15 @@ int orte_ess_base_app_setup(bool db_restrict_local)
221242
error = "orte_rml.enable_comm";
222243
goto error;
223244
}
224-
245+
225246
/* setup my session directory */
226247
if (orte_create_session_dirs) {
227248
OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
228249
"%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
229250
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
230251
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
231252
orte_process_info.nodename));
232-
253+
233254
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
234255
orte_process_info.tmpdir_base,
235256
orte_process_info.nodename, NULL,
@@ -238,22 +259,22 @@ int orte_ess_base_app_setup(bool db_restrict_local)
238259
error = "orte_session_dir";
239260
goto error;
240261
}
241-
262+
242263
/* Once the session directory location has been established, set
243264
the opal_output env file location to be in the
244265
proc-specific session directory. */
245266
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
246267
"output-", NULL, NULL);
247268
}
248-
269+
249270
/* setup the routed info */
250271
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
251272
ORTE_ERROR_LOG(ret);
252273
error = "orte_routed.init_routes";
253274
goto error;
254275
}
255-
256-
276+
277+
257278
#if OPAL_ENABLE_FT_CR == 1
258279
/*
259280
* Setup the SnapC
@@ -284,7 +305,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
284305
#else
285306
opal_cr_set_enabled(false);
286307
#endif
287-
308+
288309
/* Initialize the CR setup
289310
* Note: Always do this, even in non-FT builds.
290311
* If we don't, some user-level tools may hang.
@@ -308,7 +329,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
308329
}
309330

310331
return ORTE_SUCCESS;
311-
332+
312333
error:
313334
if (!progress_thread_running) {
314335
/* can't send the help message, so ensure it
@@ -319,7 +340,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
319340
orte_show_help("help-orte-runtime.txt",
320341
"orte_init:startup:internal-failure",
321342
true, error, ORTE_ERROR_NAME(ret), ret);
322-
343+
323344
return ret;
324345
}
325346

@@ -362,11 +383,12 @@ int orte_ess_base_app_finalize(void)
362383
(void) mca_base_framework_close(&orte_state_base_framework);
363384

364385
/* release the event base */
386+
opal_event_del(&block);
365387
opal_event_base_free(orte_event_base);
366388

367389
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
368-
369-
return ORTE_SUCCESS;
390+
391+
return ORTE_SUCCESS;
370392
}
371393

372394
/*
@@ -396,16 +418,16 @@ void orte_ess_base_app_abort(int status, bool report)
396418

397419
/* Exit - do NOT do a normal finalize as this will very likely
398420
* hang the process. We are aborting due to an abnormal condition
399-
* that precludes normal cleanup
421+
* that precludes normal cleanup
400422
*
401-
* We do need to do the following bits to make sure we leave a
423+
* We do need to do the following bits to make sure we leave a
402424
* clean environment. Taken from orte_finalize():
403425
* - Assume errmgr cleans up child processes before we exit.
404426
*/
405-
427+
406428
/* CRS cleanup since it may have a named pipe and thread active */
407429
orte_cr_finalize();
408-
430+
409431
/* If we were asked to report this termination, do so.
410432
* Since singletons don't start an HNP unless necessary, and
411433
* direct-launched procs don't have daemons at all, only send
@@ -425,7 +447,7 @@ void orte_ess_base_app_abort(int status, bool report)
425447
* have a chance to be sent */
426448
nanosleep(&tp, NULL); \
427449
}
428-
450+
429451
/* Now Exit */
430452
_exit(status);
431453
}

0 commit comments

Comments
 (0)