Skip to content

Commit 6b91edd

Browse files
committed
Apply opal_abort_delay to the signal handler
This commit extends the effect of the MCA parameter `opal_abort_delay` to the OPAL signal handler. This makes it possible to attach a debugger after a segmentation fault (or similar fatal signal) before the job quits. For code cleanup, the sleep code is moved out of the `ompi_mpi_abort` and `oshmem_shmem_abort` functions into the new `opal_delay_abort` function. Signed-off-by: KAWASHIMA Takahiro <[email protected]>
1 parent 7002535 commit 6b91edd

File tree

5 files changed

+59
-36
lines changed

5 files changed

+59
-36
lines changed

ompi/runtime/ompi_mpi_abort.c

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
* reserved.
1919
* Copyright (c) 2015 Mellanox Technologies, Inc.
2020
* All rights reserved.
21+
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
2122
* $COPYRIGHT$
2223
*
2324
* Additional copyrights may follow
@@ -42,6 +43,7 @@
4243
#include <errno.h>
4344

4445
#include "opal/mca/backtrace/backtrace.h"
46+
#include "opal/util/error.h"
4547
#include "opal/runtime/opal_params.h"
4648

4749
#include "ompi/communicator/communicator.h"
@@ -159,24 +161,8 @@ ompi_mpi_abort(struct ompi_communicator_t* comm,
159161
}
160162
}
161163

162-
/* Should we wait for a while before aborting? */
163-
164-
if (0 != opal_abort_delay) {
165-
if (opal_abort_delay < 0) {
166-
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
167-
host, (int) pid);
168-
fflush(stderr);
169-
while (1) {
170-
sleep(5);
171-
}
172-
} else {
173-
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
174-
host, (int) pid, opal_abort_delay);
175-
do {
176-
sleep(1);
177-
} while (--opal_abort_delay > 0);
178-
}
179-
}
164+
/* Wait for a while before aborting */
165+
opal_delay_abort();
180166

181167
/* If the RTE isn't setup yet/any more, then don't even try
182168
killing everyone. Sorry, Charlie... */

opal/util/error.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* reserved.
1515
* Copyright (c) 2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
17+
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -27,9 +28,12 @@
2728
#include <errno.h>
2829
#include <stdio.h>
2930
#include <stdlib.h>
31+
#include <unistd.h>
3032

3133
#include "opal/util/error.h"
3234
#include "opal/constants.h"
35+
#include "opal/util/proc.h"
36+
#include "opal/runtime/opal_params.h"
3337

3438
#define MAX_CONVERTERS 5
3539
#define MAX_CONVERTER_PROJECT_LEN 10
@@ -208,3 +212,36 @@ opal_error_register(const char *project, int err_base, int err_max,
208212

209213
return OPAL_ERR_OUT_OF_RESOURCE;
210214
}
215+
216+
217+
void
218+
opal_delay_abort(void)
219+
{
220+
// Though snprintf and strlen are not guaranteed to be async-signal-safe
221+
// in POSIX, it is async-signal-safe on many implementations probably.
222+
223+
if (0 != opal_abort_delay) {
224+
int delay = opal_abort_delay;
225+
pid_t pid = getpid();
226+
char msg[100 + OPAL_MAXHOSTNAMELEN];
227+
228+
if (delay < 0) {
229+
snprintf(msg, sizeof(msg),
230+
"[%s:%05d] Looping forever "
231+
"(MCA parameter opal_abort_delay is < 0)\n",
232+
opal_process_info.nodename, (int) pid);
233+
write(STDERR_FILENO, msg, strlen(msg) + 1);
234+
while (1) {
235+
sleep(5);
236+
}
237+
} else {
238+
snprintf(msg, sizeof(msg),
239+
"[%s:%05d] Delaying for %d seconds before aborting\n",
240+
opal_process_info.nodename, (int) pid, delay);
241+
write(STDERR_FILENO, msg, strlen(msg) + 1);
242+
do {
243+
sleep(1);
244+
} while (--delay > 0);
245+
}
246+
}
247+
}

opal/util/error.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12+
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
1213
* $COPYRIGHT$
1314
*
1415
* Additional copyrights may follow
@@ -89,6 +90,14 @@ OPAL_DECLSPEC int opal_error_register(const char *project,
8990
int err_base, int err_max,
9091
opal_err2str_fn_t converter);
9192

93+
/**
94+
* Print a message and sleep in accordance with the opal_abort_delay value
95+
*
96+
* This function is (almost) async-signal-safe so it can be called from
97+
* a signal handler.
98+
*/
99+
OPAL_DECLSPEC void opal_delay_abort(void);
100+
92101
END_C_DECLS
93102

94103
#endif /* OPAL_UTIL_ERROR_H */

opal/util/stacktrace.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
1313
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2017 IBM Corporation. All rights reserved.
15+
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -45,6 +46,7 @@
4546
#include "opal/util/show_help.h"
4647
#include "opal/util/argv.h"
4748
#include "opal/util/proc.h"
49+
#include "opal/util/error.h"
4850
#include "opal/runtime/opal_params.h"
4951

5052
#ifndef _NSIG
@@ -412,6 +414,9 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
412414
opal_stacktrace_output_fileno = -1;
413415
}
414416

417+
/* wait for a while before aborting for debugging */
418+
opal_delay_abort();
419+
415420
/* Raise the signal again, so we don't accidentally mask critical signals.
416421
* For critical signals, it is preferred that we call 'raise' instead of
417422
* 'exit' or 'abort' so that the return status is set properly for this

oshmem/runtime/oshmem_shmem_abort.c

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/*
22
* Copyright (c) 2013 Mellanox Technologies, Inc.
33
* All rights reserved.
4+
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
45
* $COPYRIGHT$
56
*
67
* Additional copyrights may follow
@@ -24,6 +25,7 @@
2425
#endif
2526

2627
#include "opal/mca/backtrace/backtrace.h"
28+
#include "opal/util/error.h"
2729
#include "opal/runtime/opal_params.h"
2830

2931
#include "orte/util/proc_info.h"
@@ -95,24 +97,8 @@ int oshmem_shmem_abort(int errcode)
9597
}
9698
}
9799

98-
/* Should we wait for a while before aborting? */
99-
100-
if (0 != opal_abort_delay) {
101-
if (opal_abort_delay < 0) {
102-
fprintf(stderr ,"[%s:%d] Looping forever (MCA parameter opal_abort_delay is < 0)\n",
103-
host, (int) pid);
104-
fflush(stderr);
105-
while (1) {
106-
sleep(5);
107-
}
108-
} else {
109-
fprintf(stderr, "[%s:%d] Delaying for %d seconds before aborting\n",
110-
host, (int) pid, opal_abort_delay);
111-
do {
112-
sleep(1);
113-
} while (--opal_abort_delay > 0);
114-
}
115-
}
100+
/* Wait for a while before aborting */
101+
opal_delay_abort();
116102

117103
if (!orte_initialized || !oshmem_shmem_initialized) {
118104
if (orte_show_help_is_available()) {

0 commit comments

Comments
 (0)