Skip to content

Commit 83d21e3

Browse files
author
Ralph Castain
authored
Merge pull request #3930 from rhc54/topic/signal
Fix signal forwarding on ORTE daemons
2 parents 8688219 + 8a98aab commit 83d21e3

File tree

3 files changed

+100
-106
lines changed

3 files changed

+100
-106
lines changed

orte/mca/ess/base/ess_base_frame.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,13 +161,34 @@ static struct known_signal known_signals[] = {
161161
{SIGHUP, "SIGHUP", false},
162162
{SIGINT, "SIGINT", false},
163163
{SIGKILL, "SIGKILL", false},
164+
{SIGPIPE, "SIGPIPE", false},
165+
#ifdef SIGQUIT
166+
{SIGQUIT, "SIGQUIT", false},
167+
#endif
168+
#ifdef SIGTRAP
169+
{SIGTRAP, "SIGTRAP", true},
170+
#endif
171+
#ifdef SIGTSTP
172+
{SIGTSTP, "SIGTSTP", true},
173+
#endif
174+
#ifdef SIGABRT
175+
{SIGABRT, "SIGABRT", true},
176+
#endif
177+
#ifdef SIGCONT
178+
{SIGCONT, "SIGCONT", true},
179+
#endif
164180
#ifdef SIGSYS
165181
{SIGSYS, "SIGSYS", true},
166182
#endif
167183
#ifdef SIGXCPU
168184
{SIGXCPU, "SIGXCPU", true},
169185
#endif
186+
#ifdef SIGXFSZ
170187
{SIGXFSZ, "SIGXFSZ", true},
188+
#endif
189+
#ifdef SIGALRM
190+
{SIGALRM, "SIGALRM", true},
191+
#endif
171192
#ifdef SIGVTALRM
172193
{SIGVTALRM, "SIGVTALRM", true},
173194
#endif

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 79 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,11 @@ static bool signals_set=false;
8888
static opal_event_t term_handler;
8989
static opal_event_t int_handler;
9090
static opal_event_t epipe_handler;
91-
static opal_event_t sigusr1_handler;
92-
static opal_event_t sigusr2_handler;
9391
static char *log_path = NULL;
9492
static void shutdown_signal(int fd, short flags, void *arg);
95-
static void signal_callback(int fd, short flags, void *arg);
9693
static void epipe_signal_callback(int fd, short flags, void *arg);
94+
static void signal_forward_callback(int fd, short event, void *arg);
95+
static opal_event_t *forward_signals_events = NULL;
9796

9897
static void setup_sighandler(int signal, opal_event_t *ev,
9998
opal_event_cbfunc_t cbfunc)
@@ -119,6 +118,8 @@ int orte_ess_base_orted_setup(void)
119118
unsigned i, j;
120119
orte_topology_t *t;
121120
opal_list_t transports;
121+
orte_ess_base_signal_t *sig;
122+
int idx;
122123

123124
/* my name is set, xfer it to the OPAL layer */
124125
orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
@@ -128,18 +129,31 @@ int orte_ess_base_orted_setup(void)
128129
opal_proc_local_set(&orte_process_info.super);
129130

130131
plm_in_use = false;
132+
131133
/* setup callback for SIGPIPE */
132134
setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
133135
/* Set signal handlers to catch kill signals so we can properly clean up
134136
* after ourselves.
135137
*/
136138
setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
137139
setup_sighandler(SIGINT, &int_handler, shutdown_signal);
138-
/** setup callbacks for signals we should ignore */
139-
setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
140-
setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
140+
/** setup callbacks for signals we should forward */
141+
if (0 < (idx = opal_list_get_size(&orte_ess_base_signals))) {
142+
forward_signals_events = (opal_event_t*)malloc(sizeof(opal_event_t) * idx);
143+
if (NULL == forward_signals_events) {
144+
ret = ORTE_ERR_OUT_OF_RESOURCE;
145+
error = "unable to malloc";
146+
goto error;
147+
}
148+
idx = 0;
149+
OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) {
150+
setup_sighandler(sig->signal, forward_signals_events + idx, signal_forward_callback);
151+
++idx;
152+
}
153+
}
141154
signals_set = true;
142155

156+
143157
/* get the local topology */
144158
if (NULL == opal_hwloc_topology) {
145159
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
@@ -653,14 +667,24 @@ int orte_ess_base_orted_setup(void)
653667

654668
int orte_ess_base_orted_finalize(void)
655669
{
670+
orte_ess_base_signal_t *sig;
671+
unsigned int i;
672+
656673
if (signals_set) {
657-
/* Release all local signal handlers */
658674
opal_event_del(&epipe_handler);
659675
opal_event_del(&term_handler);
660676
opal_event_del(&int_handler);
661-
opal_event_signal_del(&sigusr1_handler);
662-
opal_event_signal_del(&sigusr2_handler);
677+
/** Remove the handlers for the forwarded signals */
678+
i = 0;
679+
OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) {
680+
opal_event_signal_del(forward_signals_events + i);
681+
++i;
682+
}
683+
free (forward_signals_events);
684+
forward_signals_events = NULL;
685+
signals_set = false;
663686
}
687+
664688
/* cleanup */
665689
if (NULL != log_path) {
666690
unlink(log_path);
@@ -717,7 +741,51 @@ static void epipe_signal_callback(int fd, short flags, void *arg)
717741
return;
718742
}
719743

720-
static void signal_callback(int fd, short event, void *arg)
744+
/* Pass user signals to the local application processes */
745+
static void signal_forward_callback(int fd, short event, void *arg)
721746
{
722-
/* just ignore these signals */
747+
opal_event_t *signal = (opal_event_t*)arg;
748+
int32_t signum, rc;
749+
opal_buffer_t *cmd;
750+
orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS;
751+
orte_jobid_t job = ORTE_JOBID_WILDCARD;
752+
753+
signum = OPAL_EVENT_SIGNAL(signal);
754+
if (!orte_execute_quiet){
755+
fprintf(stderr, "%s: Forwarding signal %d to job\n",
756+
orte_basename, signum);
757+
}
758+
759+
cmd = OBJ_NEW(opal_buffer_t);
760+
761+
/* pack the command */
762+
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
763+
ORTE_ERROR_LOG(rc);
764+
OBJ_RELEASE(cmd);
765+
return;
766+
}
767+
768+
/* pack the jobid */
769+
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
770+
ORTE_ERROR_LOG(rc);
771+
OBJ_RELEASE(cmd);
772+
return;
773+
}
774+
775+
/* pack the signal */
776+
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &signum, 1, OPAL_INT32))) {
777+
ORTE_ERROR_LOG(rc);
778+
OBJ_RELEASE(cmd);
779+
return;
780+
}
781+
782+
/* send it to ourselves */
783+
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
784+
ORTE_PROC_MY_NAME, cmd,
785+
ORTE_RML_TAG_DAEMON,
786+
NULL, NULL))) {
787+
ORTE_ERROR_LOG(rc);
788+
OBJ_RELEASE(cmd);
789+
}
790+
723791
}

orte/mca/ess/slurm/ess_slurm_module.c

Lines changed: 0 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -59,24 +59,10 @@ orte_ess_base_module_t orte_ess_slurm_module = {
5959
NULL /* ft_event */
6060
};
6161

62-
static void signal_forward_callback(int fd, short event, void *arg);
63-
static opal_event_t *forward_signals_events = NULL;
64-
static bool signals_set=false;
65-
66-
static void setup_sighandler(int signal, opal_event_t *ev,
67-
opal_event_cbfunc_t cbfunc)
68-
{
69-
opal_event_signal_set(orte_event_base, ev, signal, cbfunc, ev);
70-
opal_event_set_priority(ev, ORTE_ERROR_PRI);
71-
opal_event_signal_add(ev, NULL);
72-
}
73-
7462
static int rte_init(void)
7563
{
7664
int ret;
7765
char *error = NULL;
78-
orte_ess_base_signal_t *sig;
79-
int idx;
8066

8167
/* run the prolog */
8268
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
@@ -91,29 +77,11 @@ static int rte_init(void)
9177
* default procedure
9278
*/
9379
if (ORTE_PROC_IS_DAEMON) {
94-
/** setup callbacks for signals we should forward */
95-
if (0 < (idx = opal_list_get_size(&orte_ess_base_signals))) {
96-
forward_signals_events = (opal_event_t*)malloc(sizeof(opal_event_t) * idx);
97-
if (NULL == forward_signals_events) {
98-
ret = ORTE_ERR_OUT_OF_RESOURCE;
99-
error = "unable to malloc";
100-
goto error;
101-
}
102-
idx = 0;
103-
OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) {
104-
setup_sighandler(sig->signal, forward_signals_events + idx, signal_forward_callback);
105-
++idx;
106-
}
107-
}
108-
signals_set = true;
109-
11080
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) {
11181
ORTE_ERROR_LOG(ret);
11282
error = "orte_ess_base_orted_setup";
11383
goto error;
11484
}
115-
/* setup the signal handlers */
116-
11785
return ORTE_SUCCESS;
11886
}
11987

@@ -145,23 +113,9 @@ static int rte_init(void)
145113
static int rte_finalize(void)
146114
{
147115
int ret;
148-
orte_ess_base_signal_t *sig;
149-
unsigned int i;
150116

151117
/* if I am a daemon, finalize using the default procedure */
152118
if (ORTE_PROC_IS_DAEMON) {
153-
if (signals_set) {
154-
/** Remove the USR signal handlers */
155-
i = 0;
156-
OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) {
157-
opal_event_signal_del(forward_signals_events + i);
158-
++i;
159-
}
160-
free (forward_signals_events);
161-
forward_signals_events = NULL;
162-
signals_set = false;
163-
}
164-
165119
if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) {
166120
ORTE_ERROR_LOG(ret);
167121
return ret;
@@ -246,52 +200,3 @@ static int slurm_set_name(void)
246200

247201
return ORTE_SUCCESS;
248202
}
249-
250-
/* Pass user signals to the local application processes */
251-
static void signal_forward_callback(int fd, short event, void *arg)
252-
{
253-
opal_event_t *signal = (opal_event_t*)arg;
254-
int32_t signum, rc;
255-
opal_buffer_t *cmd;
256-
orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS;
257-
orte_jobid_t job = ORTE_JOBID_WILDCARD;
258-
259-
signum = OPAL_EVENT_SIGNAL(signal);
260-
if (!orte_execute_quiet){
261-
fprintf(stderr, "%s: Forwarding signal %d to job\n",
262-
orte_basename, signum);
263-
}
264-
265-
cmd = OBJ_NEW(opal_buffer_t);
266-
267-
/* pack the command */
268-
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
269-
ORTE_ERROR_LOG(rc);
270-
OBJ_RELEASE(cmd);
271-
return;
272-
}
273-
274-
/* pack the jobid */
275-
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
276-
ORTE_ERROR_LOG(rc);
277-
OBJ_RELEASE(cmd);
278-
return;
279-
}
280-
281-
/* pack the signal */
282-
if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &signum, 1, OPAL_INT32))) {
283-
ORTE_ERROR_LOG(rc);
284-
OBJ_RELEASE(cmd);
285-
return;
286-
}
287-
288-
/* send it to ourselves */
289-
if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
290-
ORTE_PROC_MY_NAME, cmd,
291-
ORTE_RML_TAG_DAEMON,
292-
NULL, NULL))) {
293-
ORTE_ERROR_LOG(rc);
294-
OBJ_RELEASE(cmd);
295-
}
296-
297-
}

0 commit comments

Comments
 (0)