Skip to content

Commit a2c2bd1

Browse files
author
rhc54
committed
Merge pull request open-mpi#760 from igor-ivanov/pr/oshmem_new_mca_vars
oshmem: Add new mca variables oshmem_abort_delay and oshmem_abort_print_stack
2 parents 741c12c + b988a59 commit a2c2bd1

File tree

7 files changed

+109
-45
lines changed

7 files changed

+109
-45
lines changed

ompi/runtime/ompi_mpi_abort.c

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#endif
3535

3636
#include "opal/mca/backtrace/backtrace.h"
37+
#include "opal/runtime/opal_params.h"
3738

3839
#include "ompi/communicator/communicator.h"
3940
#include "ompi/runtime/mpiruntime.h"
@@ -72,11 +73,11 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
7273

7374
/* Should we print a stack trace? Not aggregated because they
7475
might be different on all processes. */
75-
if (ompi_mpi_abort_print_stack) {
76+
if (opal_abort_print_stack) {
7677
char **messages;
7778
int len, i;
7879

79-
if (OMPI_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
80+
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
8081
for (i = 0; i < len; ++i) {
8182
fprintf(stderr, "[%s:%d] [%d] func:%s\n", host, (int) pid,
8283
i, messages[i]);
@@ -96,7 +97,7 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
9697
if (errcode < 0 ||
9798
asprintf(&msg, "[%s:%d] aborting with MPI error %s%s",
9899
host, (int) pid, ompi_mpi_errnum_get_string(errcode),
99-
ompi_mpi_abort_print_stack ?
100+
opal_abort_print_stack ?
100101
" (stack trace available on stderr)" : "") < 0) {
101102
msg = NULL;
102103
}
@@ -107,20 +108,20 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
107108

108109
/* Should we wait for a while before aborting? */
109110

110-
if (0 != ompi_mpi_abort_delay) {
111-
if (ompi_mpi_abort_delay < 0) {
112-
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter mpi_abort_delay is < 0)\n",
111+
if (0 != opal_abort_delay) {
112+
if (opal_abort_delay < 0) {
113+
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
113114
host, (int) pid);
114115
fflush(stderr);
115116
while (1) {
116117
sleep(5);
117118
}
118119
} else {
119120
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
120-
host, (int) pid, ompi_mpi_abort_delay);
121+
host, (int) pid, opal_abort_delay);
121122
do {
122123
sleep(1);
123-
} while (--ompi_mpi_abort_delay > 0);
124+
} while (--opal_abort_delay > 0);
124125
}
125126
}
126127

ompi/runtime/ompi_mpi_params.c

Lines changed: 8 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "ompi/runtime/params.h"
3939
#include "ompi/mca/rte/rte.h"
4040

41+
#include "opal/runtime/opal_params.h"
4142
#include "opal/mca/base/mca_base_param.h"
4243
#include "opal/util/argv.h"
4344
#include "opal/util/output.h"
@@ -58,8 +59,6 @@ int ompi_debug_show_mpi_alloc_mem_leaks = 0;
5859
bool ompi_debug_no_free_handles = false;
5960
bool ompi_mpi_show_mca_params = false;
6061
char *ompi_mpi_show_mca_params_file = NULL;
61-
bool ompi_mpi_abort_print_stack = false;
62-
int ompi_mpi_abort_delay = 0;
6362
bool ompi_mpi_keep_fqdn_hostnames = false;
6463
int ompi_mpi_leave_pinned = -1;
6564
bool ompi_mpi_leave_pinned_pipeline = false;
@@ -214,33 +213,6 @@ int ompi_mpi_register_params(void)
214213

215214
/* User-level process pinning controls */
216215

217-
/* MPI_ABORT controls */
218-
ompi_mpi_abort_delay = 0;
219-
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",
220-
"If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
221-
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
222-
OPAL_INFO_LVL_9,
223-
MCA_BASE_VAR_SCOPE_READONLY,
224-
&ompi_mpi_abort_delay);
225-
226-
ompi_mpi_abort_print_stack = false;
227-
(void) mca_base_var_register("ompi", "mpi", NULL, "abort_print_stack",
228-
"If nonzero, print out a stack trace when MPI_ABORT is invoked",
229-
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
230-
/* If we do not have stack trace
231-
capability, make this a constant
232-
MCA variable */
233-
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE && defined(HAVE_BACKTRACE)
234-
0,
235-
OPAL_INFO_LVL_9,
236-
MCA_BASE_VAR_SCOPE_READONLY,
237-
#else
238-
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
239-
OPAL_INFO_LVL_9,
240-
MCA_BASE_VAR_SCOPE_CONSTANT,
241-
#endif
242-
&ompi_mpi_abort_print_stack);
243-
244216
ompi_mpi_preconnect_mpi = false;
245217
value = mca_base_var_register("ompi", "mpi", NULL, "preconnect_mpi",
246218
"Whether to force MPI processes to fully "
@@ -345,6 +317,13 @@ int ompi_mpi_register_params(void)
345317
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
346318
&ompi_hostname_cutoff);
347319

320+
(void) mca_base_var_register_synonym(opal_abort_delay_var_index, "ompi", "mpi", NULL,
321+
"abort_delay",
322+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
323+
324+
(void) mca_base_var_register_synonym(opal_abort_print_stack_var_index, "ompi", "mpi", NULL,
325+
"abort_print_stack",
326+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
348327

349328
return OMPI_SUCCESS;
350329
}

opal/runtime/opal_params.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@
4444
char *opal_signal_string = NULL;
4545
char *opal_net_private_ipv4 = NULL;
4646
char *opal_set_max_sys_limits = NULL;
47+
bool opal_abort_print_stack = false;
48+
int opal_abort_print_stack_var_index = -1;
49+
int opal_abort_delay = 0;
50+
int opal_abort_delay_var_index = -1;
4751

4852
static bool opal_register_done = false;
4953

@@ -153,6 +157,40 @@ int opal_register_params(void)
153157
return ret;
154158
}
155159

160+
opal_abort_delay = 0;
161+
ret = mca_base_var_register("opal", "opal", NULL, "abort_delay",
162+
"If nonzero, print out an identifying message when abort operation is invoked (hostname, PID of the process that called abort) and delay for that many seconds before exiting (a negative delay value means to never abort). This allows attaching of a debugger before quitting the job.",
163+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
164+
OPAL_INFO_LVL_5,
165+
MCA_BASE_VAR_SCOPE_READONLY,
166+
&opal_abort_delay);
167+
if (0 > ret) {
168+
return ret;
169+
}
170+
opal_abort_delay_var_index = ret;
171+
172+
opal_abort_print_stack = false;
173+
ret = mca_base_var_register("opal", "opal", NULL, "abort_print_stack",
174+
"If nonzero, print out a stack trace when abort is invoked",
175+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
176+
/* If we do not have stack trace
177+
capability, make this a constant
178+
MCA variable */
179+
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
180+
0,
181+
OPAL_INFO_LVL_5,
182+
MCA_BASE_VAR_SCOPE_READONLY,
183+
#else
184+
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
185+
OPAL_INFO_LVL_5,
186+
MCA_BASE_VAR_SCOPE_CONSTANT,
187+
#endif
188+
&opal_abort_print_stack);
189+
if (0 > ret) {
190+
return ret;
191+
}
192+
opal_abort_print_stack_var_index = ret;
193+
156194
/* The ddt engine has a few parameters */
157195
ret = opal_datatype_register_params();
158196
if (OPAL_SUCCESS != ret) {

opal/runtime/opal_params.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,22 @@ extern char *opal_set_max_sys_limits;
3333
extern bool opal_progress_debug;
3434
#endif
3535

36+
/**
37+
* Whether an abort operation should print out a stack trace or not.
38+
*/
39+
OPAL_DECLSPEC extern bool opal_abort_print_stack;
40+
OPAL_DECLSPEC extern int opal_abort_print_stack_var_index;
41+
42+
/**
43+
* Whether an abort operation should print out an identifying message
44+
* (e.g., hostname and PID) and loop waiting for a debugger to
45+
* attach. The value of the integer is how many seconds to wait:
46+
*
47+
* 0 = do not print the message and do not loop
48+
* negative value = print the message and loop forever
49+
* positive value = print the message and delay for that many seconds
50+
*/
51+
OPAL_DECLSPEC extern int opal_abort_delay;
52+
OPAL_DECLSPEC extern int opal_abort_delay_var_index;
53+
3654
#endif

oshmem/runtime/oshmem_shmem_abort.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#endif
2525

2626
#include "opal/mca/backtrace/backtrace.h"
27+
#include "opal/runtime/opal_params.h"
2728

2829
#include "orte/util/proc_info.h"
2930
#include "orte/runtime/runtime.h"
@@ -71,11 +72,11 @@ int oshmem_shmem_abort(int errcode)
7172

7273
/* Should we print a stack trace? Not aggregated because they
7374
might be different on all processes. */
74-
if (ompi_mpi_abort_print_stack) {
75+
if (opal_abort_print_stack) {
7576
char **messages;
7677
int len, i;
7778

78-
if (OSHMEM_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
79+
if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
7980
for (i = 0; i < len; ++i) {
8081
fprintf(stderr,
8182
"[%s:%d] [%d] func:%s\n",
@@ -94,6 +95,25 @@ int oshmem_shmem_abort(int errcode)
9495
}
9596
}
9697

98+
/* Should we wait for a while before aborting? */
99+
100+
if (0 != opal_abort_delay) {
101+
if (opal_abort_delay < 0) {
102+
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
103+
host, (int) pid);
104+
fflush(stderr);
105+
while (1) {
106+
sleep(5);
107+
}
108+
} else {
109+
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
110+
host, (int) pid, opal_abort_delay);
111+
do {
112+
sleep(1);
113+
} while (--opal_abort_delay > 0);
114+
}
115+
}
116+
97117
if (!orte_initialized || !oshmem_shmem_initialized) {
98118
if (orte_show_help_is_available()) {
99119
/* TODO: a help message from SHMEM, not from MPI, is needed */

oshmem/runtime/oshmem_shmem_params.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,12 @@
88
* $HEADER$
99
*/
1010

11-
#include "params.h"
12-
#include "runtime.h"
11+
#include "oshmem_config.h"
12+
13+
#include "opal/runtime/opal_params.h"
14+
15+
#include "oshmem/runtime/params.h"
16+
#include "oshmem/runtime/runtime.h"
1317
#include "oshmem/constants.h"
1418

1519

@@ -63,5 +67,13 @@ int oshmem_shmem_register_params(void)
6367
MCA_BASE_VAR_SCOPE_READONLY,
6468
&oshmem_preconnect_all);
6569

70+
(void) mca_base_var_register_synonym(opal_abort_delay_var_index, "oshmem", "oshmem", NULL,
71+
"abort_delay",
72+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
73+
74+
(void) mca_base_var_register_synonym(opal_abort_print_stack_var_index, "oshmem", "oshmem", NULL,
75+
"abort_print_stack",
76+
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
77+
6678
return OSHMEM_SUCCESS;
6779
}

oshmem/runtime/params.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,6 @@ BEGIN_C_DECLS
1919
* Global variables
2020
*/
2121

22-
/**
23-
* Whether an MPI_ABORT should print out a stack trace or not.
24-
*/
25-
OSHMEM_DECLSPEC extern bool ompi_mpi_abort_print_stack;
2622

2723
/**
2824
* Whether or not the lock routines are recursive

0 commit comments

Comments
 (0)