Skip to content

Commit 31faf0a

Browse files
authored
Merge pull request #2861 from jjhursey/topic/ibm/master/orted-timeout-improv
orterun: Add parameter to control when we give up on stack traces
2 parents 0b82252 + 3c47432 commit 31faf0a

File tree

4 files changed

+26
-8
lines changed

4 files changed

+26
-8
lines changed

orte/orted/orted_submit.c

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
1818
* Copyright (c) 2015-2017 Research Organization for Information Science
1919
* and Technology (RIST). All rights reserved.
20+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
2021
* $COPYRIGHT$
2122
*
2223
* Additional copyrights may follow
@@ -3071,8 +3072,10 @@ static void stack_trace_recv(int status, orte_process_name_t* sender,
30713072
}
30723073
++ntraces;
30733074
if (orte_process_info.num_procs == ntraces) {
3074-
/* cancel the timeout */
3075-
OBJ_DESTRUCT(&stack_trace_timer);
3075+
if( orte_stack_trace_wait_timeout > 0 ) {
3076+
/* cancel the timeout */
3077+
OBJ_DESTRUCT(&stack_trace_timer);
3078+
}
30763079
/* abort the job */
30773080
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
30783081
/* set the global abnormal exit flag */
@@ -3173,12 +3176,14 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
31733176
OBJ_RELEASE(sig);
31743177
/* we will terminate after we get the stack_traces, but set a timeout
31753178
* just in case we never hear back from everyone */
3176-
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
3177-
opal_event_evtimer_set(orte_event_base,
3178-
stack_trace_timer.ev, stack_trace_timeout, NULL);
3179-
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
3180-
stack_trace_timer.tv.tv_sec = 30;
3181-
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
3179+
if( orte_stack_trace_wait_timeout > 0 ) {
3180+
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
3181+
opal_event_evtimer_set(orte_event_base,
3182+
stack_trace_timer.ev, stack_trace_timeout, NULL);
3183+
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
3184+
stack_trace_timer.tv.tv_sec = orte_stack_trace_wait_timeout;
3185+
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
3186+
}
31823187
return;
31833188
}
31843189
giveup:

orte/runtime/orte_globals.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ orte_timer_t *orte_mpiexec_timeout = NULL;
133133

134134
opal_buffer_t *orte_tree_launch_cmd = NULL;
135135

136+
int orte_stack_trace_wait_timeout = 30;
137+
136138
/* global arrays for data storage */
137139
opal_hash_table_t *orte_job_data = NULL;
138140
opal_pointer_array_t *orte_node_pool = NULL;

orte/runtime/orte_globals.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,9 @@ ORTE_DECLSPEC extern char *orte_base_user_debugger;
578578
*/
579579
ORTE_DECLSPEC extern char *orte_daemon_cores;
580580

581+
/* Max time (in seconds) to wait for stack traces to return; <= 0 means wait forever */
582+
ORTE_DECLSPEC extern int orte_stack_trace_wait_timeout;
583+
581584
END_C_DECLS
582585

583586
#endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */

orte/runtime/orte_mca_params.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,5 +765,13 @@ int orte_register_params(void)
765765
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9,
766766
MCA_BASE_VAR_SCOPE_READONLY, &orte_mgmt_transport);
767767

768+
/* Amount of time to wait for a stack trace to return from the daemons */
769+
orte_stack_trace_wait_timeout = 30;
770+
(void) mca_base_var_register ("orte", "orte", NULL, "timeout_for_stack_trace",
771+
"Seconds to wait for stack traces to return before terminating the job (<= 0 wait forever)",
772+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
773+
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
774+
&orte_stack_trace_wait_timeout);
775+
768776
return ORTE_SUCCESS;
769777
}

0 commit comments

Comments
 (0)