Skip to content

Commit 2414244

Browse files
author
rhc54
authored
Merge pull request #1872 from rhc54/topic/continuous
Add support for continuously operating applications
2 parents 0d1afba + 20a91c2 commit 2414244

File tree

16 files changed

+275
-20
lines changed

opal/include/opal/constants.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,15 @@ enum {
8181
OPAL_ERR_COMM_FAILURE = (OPAL_ERR_BASE - 51),
8282
OPAL_ERR_SERVER_NOT_AVAIL = (OPAL_ERR_BASE - 52),
8383
OPAL_ERR_IN_PROCESS = (OPAL_ERR_BASE - 53),
84+
/* PMIx equivalents for notification support */
8485
OPAL_ERR_DEBUGGER_RELEASE = (OPAL_ERR_BASE - 54),
8586
OPAL_ERR_HANDLERS_COMPLETE = (OPAL_ERR_BASE - 55),
86-
OPAL_ERR_PARTIAL_SUCCESS = (OPAL_ERR_BASE - 56)
87+
OPAL_ERR_PARTIAL_SUCCESS = (OPAL_ERR_BASE - 56),
88+
OPAL_ERR_PROC_ABORTED = (OPAL_ERR_BASE - 57),
89+
OPAL_ERR_PROC_REQUESTED_ABORT = (OPAL_ERR_BASE - 58),
90+
OPAL_ERR_PROC_ABORTING = (OPAL_ERR_BASE - 59),
91+
OPAL_ERR_NODE_DOWN = (OPAL_ERR_BASE - 60),
92+
OPAL_ERR_NODE_OFFLINE = (OPAL_ERR_BASE - 61)
8793
};
8894

8995
#define OPAL_ERR_MAX (OPAL_ERR_BASE - 100)

opal/mca/pmix/pmix2x/pmix/include/pmix/pmix_common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@ BEGIN_C_DECLS
204204
#define PMIX_EVENT_ENVIRO_LEVEL "pmix.evenv" // (bool) register for environment events only
205205
#define PMIX_EVENT_ORDER_PREPEND "pmix.evprepend" // (bool) prepend this handler to the precedence list
206206
#define PMIX_EVENT_CUSTOM_RANGE "pmix.evrange" // (pmix_proc_t*) array of pmix_proc_t defining range of event notification
207+
#define PMIX_EVENT_AFFECTED_PROC "pmix.evproc" // (pmix_proc_t) single proc that was affected
207208
#define PMIX_EVENT_AFFECTED_PROCS "pmix.evaffected" // (pmix_proc_t*) array of pmix_proc_t defining affected procs
208209
#define PMIX_EVENT_NON_DEFAULT "pmix.evnondef" // (bool) event is not to be delivered to default event handlers
209210
/* fault tolerance-related events */
@@ -462,6 +463,7 @@ typedef struct pmix_value {
462463
double dval;
463464
struct timeval tv;
464465
pmix_status_t status;
466+
pmix_proc_t proc;
465467
pmix_info_array_t array;
466468
pmix_byte_object_t bo;
467469
void *ptr;

opal/mca/pmix/pmix2x/pmix/src/buffer_ops/pack.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,11 @@ static pmix_status_t pack_val(pmix_buffer_t *buffer,
544544
return ret;
545545
}
546546
break;
547+
case PMIX_PROC:
548+
if (PMIX_SUCCESS != (ret = pmix_bfrop_pack_buffer(buffer, &p->data.proc, 1, PMIX_PROC))) {
549+
return ret;
550+
}
551+
break;
547552
default:
548553
pmix_output(0, "PACK-PMIX-VALUE: UNSUPPORTED TYPE %d", (int)p->type);
549554
return PMIX_ERROR;

opal/mca/pmix/pmix2x/pmix/src/buffer_ops/unpack.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -634,8 +634,13 @@ pmix_status_t pmix_bfrop_unpack_status(pmix_buffer_t *buffer, void *dest,
634634
return ret;
635635
}
636636
break;
637+
case PMIX_PROC:
638+
if (PMIX_SUCCESS != (ret = pmix_bfrop_unpack_buffer(buffer, &val->data.proc, &m, PMIX_PROC))) {
639+
return ret;
640+
}
641+
break;
637642
default:
638-
pmix_output(0, "UNPACK-PMIX-VALUE: UNSUPPORTED TYPE");
643+
pmix_output(0, "UNPACK-PMIX-VALUE: UNSUPPORTED TYPE %d", (int)val->type);
639644
return PMIX_ERROR;
640645
}
641646

opal/mca/pmix/pmix2x/pmix2x.c

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,24 @@ pmix_status_t pmix2x_convert_opalrc(int rc)
422422
case OPAL_ERR_DEBUGGER_RELEASE:
423423
return PMIX_ERR_DEBUGGER_RELEASE;
424424

425+
case OPAL_ERR_HANDLERS_COMPLETE:
426+
return PMIX_EVENT_ACTION_COMPLETE;
427+
428+
case OPAL_ERR_PROC_ABORTED:
429+
return PMIX_ERR_PROC_ABORTED;
430+
431+
case OPAL_ERR_PROC_REQUESTED_ABORT:
432+
return PMIX_ERR_PROC_REQUESTED_ABORT;
433+
434+
case OPAL_ERR_PROC_ABORTING:
435+
return PMIX_ERR_PROC_ABORTING;
436+
437+
case OPAL_ERR_NODE_DOWN:
438+
return PMIX_ERR_NODE_DOWN;
439+
440+
case OPAL_ERR_NODE_OFFLINE:
441+
return PMIX_ERR_NODE_OFFLINE;
442+
425443
case OPAL_ERR_NOT_IMPLEMENTED:
426444
case OPAL_ERR_NOT_SUPPORTED:
427445
return PMIX_ERR_NOT_SUPPORTED;
@@ -452,6 +470,9 @@ pmix_status_t pmix2x_convert_opalrc(int rc)
452470
case OPAL_EXISTS:
453471
return PMIX_EXISTS;
454472

473+
case OPAL_ERR_PARTIAL_SUCCESS:
474+
return PMIX_QUERY_PARTIAL_SUCCESS;
475+
455476
case OPAL_ERROR:
456477
return PMIX_ERROR;
457478
case OPAL_SUCCESS:
@@ -467,6 +488,24 @@ int pmix2x_convert_rc(pmix_status_t rc)
467488
case PMIX_ERR_DEBUGGER_RELEASE:
468489
return OPAL_ERR_DEBUGGER_RELEASE;
469490

491+
case PMIX_EVENT_ACTION_COMPLETE:
492+
return OPAL_ERR_HANDLERS_COMPLETE;
493+
494+
case PMIX_ERR_PROC_ABORTED:
495+
return OPAL_ERR_PROC_ABORTED;
496+
497+
case PMIX_ERR_PROC_REQUESTED_ABORT:
498+
return OPAL_ERR_PROC_REQUESTED_ABORT;
499+
500+
case PMIX_ERR_PROC_ABORTING:
501+
return OPAL_ERR_PROC_ABORTING;
502+
503+
case PMIX_ERR_NODE_DOWN:
504+
return OPAL_ERR_NODE_DOWN;
505+
506+
case PMIX_ERR_NODE_OFFLINE:
507+
return OPAL_ERR_NODE_OFFLINE;
508+
470509
case PMIX_ERR_NOT_SUPPORTED:
471510
return OPAL_ERR_NOT_SUPPORTED;
472511

@@ -500,6 +539,9 @@ int pmix2x_convert_rc(pmix_status_t rc)
500539
case PMIX_EXISTS:
501540
return OPAL_EXISTS;
502541

542+
case PMIX_QUERY_PARTIAL_SUCCESS:
543+
return OPAL_ERR_PARTIAL_SUCCESS;
544+
503545
case PMIX_ERROR:
504546
return OPAL_ERROR;
505547
case PMIX_SUCCESS:
@@ -671,6 +713,11 @@ void pmix2x_value_load(pmix_value_t *v,
671713
}
672714
}
673715
break;
716+
case OPAL_NAME:
717+
v->type = PMIX_PROC;
718+
(void)opal_snprintf_jobid(v->data.proc.nspace, PMIX_MAX_NSLEN, kv->data.name.jobid);
719+
v->data.proc.rank = kv->data.name.vpid;
720+
break;
674721
default:
675722
/* silence warnings */
676723
break;
@@ -772,6 +819,13 @@ int pmix2x_value_unload(opal_value_t *kv,
772819
kv->data.bo.size = 0;
773820
}
774821
break;
822+
case PMIX_PROC:
823+
kv->type = OPAL_NAME;
824+
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&kv->data.name.jobid, v->data.proc.nspace))) {
825+
return pmix2x_convert_opalrc(rc);
826+
}
827+
kv->data.name.vpid = v->data.proc.rank;
828+
break;
775829
default:
776830
/* silence warnings */
777831
rc = OPAL_ERROR;

opal/mca/pmix/pmix_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ BEGIN_C_DECLS
143143
#define OPAL_PMIX_EVENT_ENVIRO_LEVEL "pmix.evenv" // (bool) register for environment events only
144144
#define OPAL_PMIX_EVENT_ORDER_PREPEND "pmix.evprepend" // (bool) prepend this handler to the precedence list
145145
#define OPAL_PMIX_EVENT_CUSTOM_RANGE "pmix.evrange" // (pmix_proc_t*) array of pmix_proc_t defining range of event notification
146+
#define OPAL_PMIX_EVENT_AFFECTED_PROC "pmix.evproc" // (pmix_proc_t) single proc that was affected
146147
#define OPAL_PMIX_EVENT_AFFECTED_PROCS "pmix.evaffected" // (pmix_proc_t*) array of pmix_proc_t defining affected procs
147148
#define OPAL_PMIX_EVENT_NON_DEFAULT "opal.evnondef" // (bool) event is not to be delivered to default event handlers
148149
/* fault tolerance-related events */

opal/runtime/opal_init.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,11 +257,26 @@ opal_err2str(int errnum, const char **errmsg)
257257
retval = "Release debugger";
258258
break;
259259
case OPAL_ERR_HANDLERS_COMPLETE:
260-
retval = "Event handler processing complete";
260+
retval = "Event handlers complete";
261261
break;
262262
case OPAL_ERR_PARTIAL_SUCCESS:
263263
retval = "Partial success";
264264
break;
265+
case OPAL_ERR_PROC_ABORTED:
266+
retval = "Process abnormally terminated";
267+
break;
268+
case OPAL_ERR_PROC_REQUESTED_ABORT:
269+
retval = "Process requested abort";
270+
break;
271+
case OPAL_ERR_PROC_ABORTING:
272+
retval = "Process is aborting";
273+
break;
274+
case OPAL_ERR_NODE_DOWN:
275+
retval = "Node has gone down";
276+
break;
277+
case OPAL_ERR_NODE_OFFLINE:
278+
retval = "Node has gone offline";
279+
break;
265280
default:
266281
retval = "UNRECOGNIZED";
267282
}

orte/include/orte/constants.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@ enum {
8888
ORTE_ERR_COMM_FAILURE = OPAL_ERR_COMM_FAILURE,
8989
ORTE_ERR_DEBUGGER_RELEASE = OPAL_ERR_DEBUGGER_RELEASE,
9090
ORTE_ERR_PARTIAL_SUCCESS = OPAL_ERR_PARTIAL_SUCCESS,
91+
ORTE_ERR_PROC_ABORTED = OPAL_ERR_PROC_ABORTED,
92+
ORTE_ERR_PROC_REQUESTED_ABORT = OPAL_ERR_PROC_REQUESTED_ABORT,
93+
ORTE_ERR_PROC_ABORTING = OPAL_ERR_PROC_ABORTING,
94+
ORTE_ERR_NODE_DOWN = OPAL_ERR_NODE_DOWN,
95+
ORTE_ERR_NODE_OFFLINE = OPAL_ERR_NODE_OFFLINE,
9196

9297
/* error codes specific to ORTE - don't forget to update
9398
orte/util/error_strings.c when adding new error codes!!

orte/mca/errmgr/default_hnp/errmgr_default_hnp.c

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -339,8 +339,8 @@ static void proc_errors(int fd, short args, void *cbdata)
339339
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
340340
/* remove from dependent routes, if it is one */
341341
orte_routed.route_lost(proc);
342-
/* if all my routes and local children are gone, then terminate ourselves */
343-
if (0 == orte_routed.num_routes()) {
342+
/* if all my routes and local children are gone, then terminate ourselves */
343+
if (0 == orte_routed.num_routes()) {
344344
for (i=0; i < orte_local_children->size; i++) {
345345
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
346346
ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
@@ -357,7 +357,7 @@ static void proc_errors(int fd, short args, void *cbdata)
357357
"%s errmgr_hnp: all routes and children gone - ordering exit",
358358
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
359359
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
360-
} else {
360+
} else {
361361
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
362362
"%s Comm failure: %d routes remain alive",
363363
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -398,7 +398,7 @@ static void proc_errors(int fd, short args, void *cbdata)
398398
}
399399

400400
/* if we were ordered to terminate, mark this proc as dead and see if
401-
* any of our routes or local children remain alive - if not, then
401+
* any of our routes or local children remain alive - if not, then
402402
* terminate ourselves. */
403403
if (orte_orteds_term_ordered) {
404404
for (i=0; i < orte_local_children->size; i++) {
@@ -419,6 +419,14 @@ static void proc_errors(int fd, short args, void *cbdata)
419419
}
420420

421421
keep_going:
422+
/* if this is a continuously operating job, then there is nothing more
423+
* to do - we let the job continue to run */
424+
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) {
425+
/* always mark the waitpid as having fired */
426+
ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
427+
goto cleanup;
428+
}
429+
422430
/* ensure we record the failed proc properly so we can report
423431
* the error once we terminate
424432
*/
@@ -490,7 +498,7 @@ static void proc_errors(int fd, short args, void *cbdata)
490498
/* this job has terminated */
491499
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
492500
}
493-
}
501+
}
494502
break;
495503

496504
case ORTE_PROC_STATE_TERM_WO_SYNC:

orte/mca/schizo/ompi/schizo_ompi.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -411,13 +411,17 @@ static opal_cmd_line_init_t cmd_line_init[] = {
411411
"Report events to a tool listening at the specified URI" },
412412

413413
{ "orte_enable_recovery", '\0', "enable-recovery", "enable-recovery", 0,
414-
&orte_cmd_options.enable_recovery, OPAL_CMD_LINE_TYPE_BOOL,
414+
NULL, OPAL_CMD_LINE_TYPE_BOOL,
415415
"Enable recovery from process failure [Default = disabled]" },
416416

417417
{ "orte_max_restarts", '\0', "max-restarts", "max-restarts", 1,
418418
NULL, OPAL_CMD_LINE_TYPE_INT,
419419
"Max number of times to restart a failed process" },
420420

421+
{ NULL, '\0', "continuous", "continuous", 0,
422+
&orte_cmd_options.continuous, OPAL_CMD_LINE_TYPE_BOOL,
423+
"Job is to run until explicitly terminated" },
424+
421425
{ "orte_hetero_nodes", '\0', NULL, "hetero-nodes", 0,
422426
NULL, OPAL_CMD_LINE_TYPE_BOOL,
423427
"Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" },

0 commit comments

Comments
 (0)