Skip to content

Commit 1fe452d

Browse files
author
Ralph Castain
authored
Merge pull request #3607 from rhc54/topic/server
Ensure that data from a job that was stored in ompi-server is purged once that job completes. Clean up a few typos. Silence a Coverity warning.
2 parents 2263183 + 5d990b5 commit 1fe452d

File tree

13 files changed

+205
-83
lines changed

13 files changed

+205
-83
lines changed

orte/mca/odls/base/odls_base_default_fns.c

Lines changed: 53 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -485,8 +485,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
485485
}
486486
}
487487

488-
if (!ORTE_PROC_IS_HNP &&
489-
!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
488+
if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
490489
/* compute and save bindings of local children */
491490
if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
492491
ORTE_ERROR_LOG(rc);
@@ -535,11 +534,8 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
535534

536535
static int setup_path(orte_app_context_t *app, char **wdir)
537536
{
538-
int rc;
537+
int rc=ORTE_SUCCESS;
539538
char dir[MAXPATHLEN];
540-
char **argvptr;
541-
char *pathenv = NULL, *mpiexec_pathenv = NULL;
542-
char *full_search;
543539

544540
if (!orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
545541
/* Try to change to the app's cwd and check that the app
@@ -573,40 +569,6 @@ static int setup_path(orte_app_context_t *app, char **wdir)
573569
*wdir = NULL;
574570
}
575571

576-
/* Search for the OMPI_exec_path and PATH settings in the environment. */
577-
for (argvptr = app->env; *argvptr != NULL; argvptr++) {
578-
if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
579-
mpiexec_pathenv = *argvptr + 15;
580-
}
581-
if (0 == strncmp("PATH=", *argvptr, 5)) {
582-
pathenv = *argvptr + 5;
583-
}
584-
}
585-
586-
/* If OMPI_exec_path is set (meaning --path was used), then create a
587-
temporary environment to be used in the search for the executable.
588-
The PATH setting in this temporary environment is a combination of
589-
the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
590-
then just use existing environment with PATH in it. */
591-
if (NULL != mpiexec_pathenv) {
592-
argvptr = NULL;
593-
if (pathenv != NULL) {
594-
asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
595-
} else {
596-
asprintf(&full_search, "%s", mpiexec_pathenv);
597-
}
598-
opal_setenv("PATH", full_search, true, &argvptr);
599-
free(full_search);
600-
} else {
601-
argvptr = app->env;
602-
}
603-
604-
rc = orte_util_check_context_app(app, argvptr);
605-
/* do not ERROR_LOG - it will be reported elsewhere */
606-
if (NULL != mpiexec_pathenv) {
607-
opal_argv_free(argvptr);
608-
}
609-
610572
CLEANUP:
611573
return rc;
612574
}
@@ -663,6 +625,9 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
663625
int rc, i;
664626
bool found;
665627
orte_proc_state_t state;
628+
char **argvptr;
629+
char *pathenv = NULL, *mpiexec_pathenv = NULL;
630+
char *full_search;
666631

667632
/* thread-protect common values */
668633
cd->env = opal_argv_copy(app->env);
@@ -694,6 +659,54 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
694659
child->rml_uri = NULL;
695660
}
696661

662+
/* setup the rest of the environment with the proc-specific items - these
663+
* will be overwritten for each child
664+
*/
665+
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
666+
ORTE_ERROR_LOG(rc);
667+
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
668+
goto errorout;
669+
}
670+
671+
/* Search for the OMPI_exec_path and PATH settings in the environment. */
672+
for (argvptr = app->env; *argvptr != NULL; argvptr++) {
673+
if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
674+
mpiexec_pathenv = *argvptr + 15;
675+
}
676+
if (0 == strncmp("PATH=", *argvptr, 5)) {
677+
pathenv = *argvptr + 5;
678+
}
679+
}
680+
681+
/* If OMPI_exec_path is set (meaning --path was used), then create a
682+
temporary environment to be used in the search for the executable.
683+
The PATH setting in this temporary environment is a combination of
684+
the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
685+
then just use existing environment with PATH in it. */
686+
if (NULL != mpiexec_pathenv) {
687+
argvptr = NULL;
688+
if (pathenv != NULL) {
689+
asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
690+
} else {
691+
asprintf(&full_search, "%s", mpiexec_pathenv);
692+
}
693+
opal_setenv("PATH", full_search, true, &argvptr);
694+
free(full_search);
695+
} else {
696+
argvptr = app->env;
697+
}
698+
699+
rc = orte_util_check_context_app(app, argvptr);
700+
/* do not ERROR_LOG - it will be reported elsewhere */
701+
if (NULL != mpiexec_pathenv) {
702+
opal_argv_free(argvptr);
703+
}
704+
if (ORTE_SUCCESS != rc) {
705+
opal_output(0, "%s:%d", __FILE__, __LINE__);
706+
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
707+
goto errorout;
708+
}
709+
697710
/* did the user request we display output in xterms? */
698711
if (NULL != orte_xterm && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
699712
opal_list_item_t *nmitem;
@@ -754,15 +767,6 @@ void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
754767
cd->argv = opal_argv_copy(app->argv);
755768
}
756769

757-
/* setup the rest of the environment with the proc-specific items - these
758-
* will be overwritten for each child
759-
*/
760-
if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
761-
ORTE_ERROR_LOG(rc);
762-
state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
763-
goto errorout;
764-
}
765-
766770
/* if we are indexing the argv by rank, do so now */
767771
if (cd->index_argv && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
768772
char *param;

orte/mca/rmaps/round_robin/rmaps_rr_mappers.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ int orte_rmaps_rr_bynode(orte_job_t *jdata,
359359
return ORTE_ERR_OUT_OF_RESOURCE;
360360
}
361361
nprocs_mapped++;
362-
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
362+
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
363363
}
364364
/* not all nodes are equal, so only set oversubscribed for
365365
* this node if it is in that state

orte/mca/schizo/ompi/schizo_ompi.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,6 +1207,11 @@ static int setup_child(orte_job_t *jdata,
12071207
opal_setenv("PWD", param, true, env);
12081208
/* update the initial wdir value too */
12091209
opal_setenv("OMPI_MCA_initial_wdir", param, true, env);
1210+
} else if (NULL != app->cwd) {
1211+
/* change to it */
1212+
if (0 != chdir(app->cwd)) {
1213+
return ORTE_ERROR;
1214+
}
12101215
}
12111216
return ORTE_SUCCESS;
12121217
}

orte/mca/state/base/state_base_fns.c

Lines changed: 70 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
#include "opal/mca/event/event.h"
2525
#include "opal/mca/pmix/pmix.h"
2626

27+
#include "orte/orted/pmix/pmix_server_internal.h"
28+
#include "orte/runtime/orte_data_server.h"
2729
#include "orte/runtime/orte_globals.h"
2830
#include "orte/runtime/orte_wait.h"
2931
#include "orte/mca/errmgr/errmgr.h"
@@ -466,6 +468,50 @@ void orte_state_base_report_progress(int fd, short argc, void *cbdata)
466468
OBJ_RELEASE(caddy);
467469
}
468470

471+
void orte_state_base_notify_data_server(orte_process_name_t *target)
472+
{
473+
opal_buffer_t *buf;
474+
int rc, room = -1;
475+
uint8_t cmd = ORTE_PMIX_PURGE_PROC_CMD;
476+
477+
/* if nobody local to us published anything, then we can ignore this */
478+
if (ORTE_JOBID_INVALID == orte_pmix_server_globals.server.jobid) {
479+
return;
480+
}
481+
482+
buf = OBJ_NEW(opal_buffer_t);
483+
484+
/* pack the room number */
485+
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &room, 1, OPAL_INT))) {
486+
ORTE_ERROR_LOG(rc);
487+
OBJ_RELEASE(buf);
488+
return;
489+
}
490+
491+
/* load the command */
492+
if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &cmd, 1, OPAL_UINT8))) {
493+
ORTE_ERROR_LOG(rc);
494+
OBJ_RELEASE(buf);
495+
return;
496+
}
497+
498+
/* provide the target */
499+
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, target, 1, ORTE_NAME))) {
500+
ORTE_ERROR_LOG(rc);
501+
OBJ_RELEASE(buf);
502+
return;
503+
}
504+
505+
/* send the request to the server */
506+
rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
507+
&orte_pmix_server_globals.server, buf,
508+
ORTE_RML_TAG_DATA_SERVER,
509+
orte_rml_send_callback, NULL);
510+
if (ORTE_SUCCESS != rc) {
511+
OBJ_RELEASE(buf);
512+
}
513+
}
514+
469515
static void _send_notification(int status,
470516
orte_proc_state_t state,
471517
orte_process_name_t *proc,
@@ -725,6 +771,13 @@ void orte_state_base_track_procs(int fd, short argc, void *cbdata)
725771
if (orte_state_base_run_fdcheck) {
726772
orte_state_base_check_fds(jdata);
727773
}
774+
/* if ompi-server is around, then notify it to purge
775+
* any session-related info */
776+
if (NULL != orte_data_server_uri) {
777+
target.jobid = jdata->jobid;
778+
target.vpid = ORTE_VPID_WILDCARD;
779+
orte_state_base_notify_data_server(&target);
780+
}
728781
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
729782
/* if they requested notification upon completion, provide it */
730783
if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
@@ -1035,6 +1088,7 @@ void orte_state_base_check_fds(orte_job_t *jdata)
10351088
char path[1024], info[256], **list=NULL, *status, *result, *r2;
10361089
ssize_t rc;
10371090
struct flock fl;
1091+
bool flk;
10381092
int cnt = 0;
10391093

10401094
/* get the number of available file descriptors
@@ -1066,7 +1120,11 @@ void orte_state_base_check_fds(orte_job_t *jdata)
10661120
fl.l_whence = 0;
10671121
fl.l_start = 0;
10681122
fl.l_len = 0;
1069-
fcntl(i, F_GETLK, &fl);
1123+
if (-1 == fcntl(i, F_GETLK, &fl)) {
1124+
flk = false;
1125+
} else {
1126+
flk = true;
1127+
}
10701128
/* construct the list of capabilities */
10711129
if (fdflags & FD_CLOEXEC) {
10721130
opal_argv_append_nosize(&list, "cloexec");
@@ -1077,14 +1135,18 @@ void orte_state_base_check_fds(orte_job_t *jdata)
10771135
if (flflags & O_NONBLOCK) {
10781136
opal_argv_append_nosize(&list, "nonblock");
10791137
}
1080-
if (flflags & O_RDONLY) {
1138+
/* from the man page:
1139+
* Unlike the other values that can be specified in flags,
1140+
* the access mode values O_RDONLY, O_WRONLY, and O_RDWR,
1141+
* do not specify individual bits. Rather, they define
1142+
* the low order two bits of flags, and defined respectively
1143+
* as 0, 1, and 2. */
1144+
if (O_RDONLY == (flflags & 3)) {
10811145
opal_argv_append_nosize(&list, "rdonly");
1082-
}
1083-
if (flflags & O_RDWR) {
1084-
opal_argv_append_nosize(&list, "rdwr");
1085-
}
1086-
if (flflags & O_WRONLY) {
1146+
} else if (O_WRONLY == (flflags & 3)) {
10871147
opal_argv_append_nosize(&list, "wronly");
1148+
} else {
1149+
opal_argv_append_nosize(&list, "rdwr");
10881150
}
10891151
if (flflags & O_DSYNC) {
10901152
opal_argv_append_nosize(&list, "dsync");
@@ -1095,7 +1157,7 @@ void orte_state_base_check_fds(orte_job_t *jdata)
10951157
if (flflags & O_SYNC) {
10961158
opal_argv_append_nosize(&list, "sync");
10971159
}
1098-
if (F_UNLCK != fl.l_type) {
1160+
if (flk && F_UNLCK != fl.l_type) {
10991161
if (F_WRLCK == fl.l_type) {
11001162
opal_argv_append_nosize(&list, "wrlock");
11011163
} else {

orte/mca/state/base/state_private.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ ORTE_DECLSPEC void orte_state_base_report_progress(int fd, short argc, void *cbd
7878
ORTE_DECLSPEC void orte_state_base_track_procs(int fd, short argc, void *cbdata);
7979
ORTE_DECLSPEC void orte_state_base_check_all_complete(int fd, short args, void *cbdata);
8080
ORTE_DECLSPEC void orte_state_base_check_fds(orte_job_t *jdata);
81+
ORTE_DECLSPEC void orte_state_base_notify_data_server(orte_process_name_t *target);
8182

8283
END_C_DECLS
8384
#endif

orte/mca/state/dvm/state_dvm.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ static orte_job_state_t launch_states[] = {
8080
ORTE_JOB_STATE_DAEMONS_LAUNCHED,
8181
ORTE_JOB_STATE_DAEMONS_REPORTED,
8282
ORTE_JOB_STATE_VM_READY,
83+
ORTE_JOB_STATE_MAP,
84+
ORTE_JOB_STATE_MAP_COMPLETE,
8385
ORTE_JOB_STATE_SYSTEM_PREP,
8486
ORTE_JOB_STATE_LAUNCH_APPS,
8587
ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE,
@@ -98,6 +100,8 @@ static orte_state_cbfunc_t launch_callbacks[] = {
98100
orte_plm_base_daemons_launched,
99101
orte_plm_base_daemons_reported,
100102
vm_ready,
103+
orte_rmaps_base_map_job,
104+
orte_plm_base_mapping_complete,
101105
orte_plm_base_complete_setup,
102106
orte_plm_base_launch_apps,
103107
orte_state_base_local_launch_complete,
@@ -211,7 +215,7 @@ static void files_ready(int status, void *cbdata)
211215
ORTE_FORCED_TERMINATE(status);
212216
return;
213217
} else {
214-
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP);
218+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
215219
}
216220
}
217221

orte/mca/state/orted/state_orted.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
#include "orte/mca/rml/rml.h"
2828
#include "orte/mca/routed/routed.h"
2929
#include "orte/util/session_dir.h"
30+
#include "orte/orted/pmix/pmix_server_internal.h"
31+
#include "orte/runtime/orte_data_server.h"
3032
#include "orte/runtime/orte_quit.h"
3133

3234
#include "orte/mca/state/state.h"
@@ -260,6 +262,7 @@ static void track_procs(int fd, short argc, void *cbdata)
260262
orte_std_cntr_t index;
261263
orte_job_map_t *map;
262264
orte_node_t *node;
265+
orte_process_name_t target;
263266

264267
OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
265268
"%s state:orted:track_procs called for proc %s state %s",
@@ -489,6 +492,14 @@ static void track_procs(int fd, short argc, void *cbdata)
489492
orte_state_base_check_fds(jdata);
490493
}
491494

495+
/* if ompi-server is around, then notify it to purge
496+
* any session-related info */
497+
if (NULL != orte_data_server_uri) {
498+
target.jobid = jdata->jobid;
499+
target.vpid = ORTE_VPID_WILDCARD;
500+
orte_state_base_notify_data_server(&target);
501+
}
502+
492503
/* cleanup the job info */
493504
opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
494505
OBJ_RELEASE(jdata);

orte/orted/orted_submit.c

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,6 @@ int orte_submit_init(int argc, char *argv[],
370370
} else {
371371
orte_process_info.proc_type = ORTE_PROC_TOOL;
372372
}
373-
374373
if (ORTE_PROC_IS_TOOL) {
375374
if (0 == strncasecmp(orte_cmd_options.hnp, "file", strlen("file"))) {
376375
char input[1024], *filename;
@@ -1629,22 +1628,17 @@ static int create_app(int argc, char* argv[],
16291628
app->num_procs = (orte_std_cntr_t)orte_cmd_options.num_procs;
16301629
total_num_apps++;
16311630

1632-
/* Capture any preload flags */
1633-
if (orte_cmd_options.preload_binaries) {
1634-
orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1635-
}
1636-
/* if we were told to cwd to the session dir and the app was given in
1637-
* relative syntax, then we need to preload the binary to
1631+
/* see if we need to preload the binary to
16381632
* find the app - don't do this for java apps, however, as we
16391633
* can't easily find the class on the cmd line. Java apps have to
16401634
* preload their binary via the preload_files option
16411635
*/
1642-
if (!opal_path_is_absolute(app->argv[0]) &&
1643-
NULL == strstr(app->argv[0], "java")) {
1636+
if (NULL == strstr(app->argv[0], "java")) {
16441637
if (orte_cmd_options.preload_binaries) {
16451638
orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1646-
} else if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
16471639
orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
1640+
/* no harm in setting this attribute twice as the function will simply ignore it */
1641+
orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL);
16481642
}
16491643
}
16501644
if (NULL != orte_cmd_options.preload_files) {

0 commit comments

Comments
 (0)