Skip to content

Commit 9eba1b0

Browse files
authored
Merge pull request #2042 from artpol84/pmix_sdirs
Several fixes related to session directories:
2 parents be41b12 + a9a7f39 commit 9eba1b0

File tree

18 files changed

+360
-545
lines changed

18 files changed

+360
-545
lines changed

opal/mca/pmix/pmix_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ BEGIN_C_DECLS
7575
#define OPAL_PMIX_TMPDIR "pmix.tmpdir" // (char*) top-level tmp dir assigned to session
7676
#define OPAL_PMIX_NSDIR "pmix.nsdir" // (char*) sub-tmpdir assigned to namespace
7777
#define OPAL_PMIX_PROCDIR "pmix.pdir" // (char*) sub-nsdir assigned to proc
78+
#define OPAL_PMIX_TDIR_RMCLEAN "pmix.tdir.rmclean" // (bool) Resource Manager will clean session directories
7879

7980
/* information about relative ranks as assigned by the RM */
8081
#define OPAL_PMIX_JOBID "pmix.jobid" // (uint32_t) jobid assigned by scheduler

orte/mca/ess/base/ess_base_std_app.c

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
136136
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
137137
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
138138
orte_process_info.nodename));
139-
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
140-
orte_process_info.tmpdir_base,
141-
orte_process_info.nodename,
142-
ORTE_PROC_MY_NAME))) {
139+
if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
143140
ORTE_ERROR_LOG(ret);
144141
error = "orte_session_dir";
145142
goto error;
@@ -149,29 +146,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
149146
proc-specific session directory. */
150147
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
151148
"output-", NULL, NULL);
152-
/* store the session directory location */
153-
OBJ_CONSTRUCT(&kv, opal_value_t);
154-
kv.key = strdup(OPAL_PMIX_NSDIR);
155-
kv.type = OPAL_STRING;
156-
kv.data.string = strdup(orte_process_info.job_session_dir);
157-
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &kv))) {
158-
ORTE_ERROR_LOG(ret);
159-
OBJ_DESTRUCT(&kv);
160-
error = "opal pmix put job sessiondir";
161-
goto error;
162-
}
163-
OBJ_DESTRUCT(&kv);
164-
OBJ_CONSTRUCT(&kv, opal_value_t);
165-
kv.key = strdup(OPAL_PMIX_PROCDIR);
166-
kv.type = OPAL_STRING;
167-
kv.data.string = strdup(orte_process_info.proc_session_dir);
168-
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &kv))) {
169-
ORTE_ERROR_LOG(ret);
170-
OBJ_DESTRUCT(&kv);
171-
error = "opal pmix put proc sessiondir";
172-
goto error;
173-
}
174-
OBJ_DESTRUCT(&kv);
175149
}
176150
/* Setup the communication infrastructure */
177151
/*

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -237,10 +237,7 @@ int orte_ess_base_orted_setup(char **hosts)
237237
/* take a pass thru the session directory code to fill in the
238238
* tmpdir names - don't create anything yet
239239
*/
240-
if (ORTE_SUCCESS != (ret = orte_session_dir(false,
241-
orte_process_info.tmpdir_base,
242-
orte_process_info.nodename,
243-
ORTE_PROC_MY_NAME))) {
240+
if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
244241
ORTE_ERROR_LOG(ret);
245242
error = "orte_session_dir define";
246243
goto error;
@@ -250,10 +247,7 @@ int orte_ess_base_orted_setup(char **hosts)
250247
*/
251248
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
252249
/* now actually create the directory tree */
253-
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
254-
orte_process_info.tmpdir_base,
255-
orte_process_info.nodename,
256-
ORTE_PROC_MY_NAME))) {
250+
if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
257251
ORTE_ERROR_LOG(ret);
258252
error = "orte_session_dir";
259253
goto error;
@@ -277,11 +271,8 @@ int orte_ess_base_orted_setup(char **hosts)
277271
/* define a log file name in the session directory */
278272
snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
279273
jobidstring, orte_process_info.nodename);
280-
log_path = opal_os_path(false,
281-
orte_process_info.tmpdir_base,
282-
orte_process_info.top_session_dir,
283-
log_file,
284-
NULL);
274+
log_path = opal_os_path(false, orte_process_info.top_session_dir,
275+
log_file, NULL);
285276

286277
fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
287278
if (fd < 0) {

orte/mca/ess/base/ess_base_std_tool.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -145,10 +145,9 @@ int orte_ess_base_tool_setup(void)
145145
* tmp base where any other session directories on
146146
* this node might be located
147147
*/
148-
if (ORTE_SUCCESS != (ret = orte_session_dir_get_name(NULL,
149-
&orte_process_info.tmpdir_base,
150-
&orte_process_info.top_session_dir,
151-
orte_process_info.nodename, NULL))) {
148+
149+
ret = orte_session_setup_base(NULL);
150+
if (ORTE_SUCCESS != ret ) {
152151
ORTE_ERROR_LOG(ret);
153152
error = "define session dir names";
154153
goto error;

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ static int rte_init(void)
138138
{
139139
int ret;
140140
char *error = NULL;
141-
char *contact_path, *jobfam_dir;
141+
char *contact_path;
142142
orte_job_t *jdata;
143143
orte_node_t *node;
144144
orte_proc_t *proc;
@@ -294,10 +294,7 @@ static int rte_init(void)
294294
/* take a pass thru the session directory code to fill in the
295295
* tmpdir names - don't create anything yet
296296
*/
297-
if (ORTE_SUCCESS != (ret = orte_session_dir(false,
298-
orte_process_info.tmpdir_base,
299-
orte_process_info.nodename,
300-
ORTE_PROC_MY_NAME))) {
297+
if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
301298
error = "orte_session_dir define";
302299
goto error;
303300
}
@@ -307,10 +304,7 @@ static int rte_init(void)
307304
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
308305

309306
/* now actually create the directory tree */
310-
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
311-
orte_process_info.tmpdir_base,
312-
orte_process_info.nodename,
313-
ORTE_PROC_MY_NAME))) {
307+
if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
314308
error = "orte_session_dir";
315309
goto error;
316310
}
@@ -586,9 +580,12 @@ static int rte_init(void)
586580
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
587581
"output-", NULL, NULL);
588582
/* save my contact info in a file for others to find */
589-
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
590-
contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
591-
free(jobfam_dir);
583+
if( NULL == orte_process_info.jobfam_session_dir ){
584+
/* has to be set here! */
585+
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
586+
goto error;
587+
}
588+
contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir, "contact.txt", NULL);
592589
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
593590
"%s writing contact file %s",
594591
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -758,10 +755,9 @@ static int rte_init(void)
758755
true, error, ORTE_ERROR_NAME(ret), ret);
759756
}
760757
/* remove my contact info file, if we have session directories */
761-
if (NULL != orte_process_info.job_session_dir) {
762-
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
763-
contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
764-
free(jobfam_dir);
758+
if (NULL != orte_process_info.jobfam_session_dir) {
759+
contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir,
760+
"contact.txt", NULL);
765761
unlink(contact_path);
766762
free(contact_path);
767763
}
@@ -775,7 +771,6 @@ static int rte_init(void)
775771
static int rte_finalize(void)
776772
{
777773
char *contact_path;
778-
char *jobfam_dir;
779774

780775
if (signals_set) {
781776
/* Remove the epipe handler */
@@ -816,10 +811,9 @@ static int rte_finalize(void)
816811
(void) mca_base_framework_close(&opal_pstat_base_framework);
817812

818813
/* remove my contact info file, if we have session directories */
819-
if (NULL != orte_process_info.job_session_dir) {
820-
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
821-
contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
822-
free(jobfam_dir);
814+
if (NULL != orte_process_info.jobfam_session_dir) {
815+
contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir,
816+
"contact.txt", NULL);
823817
unlink(contact_path);
824818
free(contact_path);
825819
}

orte/mca/ess/pmi/ess_pmi_module.c

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ static int rte_init(void)
9494
uint16_t u16, *u16ptr;
9595
char **peers=NULL, *mycpuset, **cpusets=NULL;
9696
opal_process_name_t wildcard_rank, pname;
97+
bool bool_val, *bool_ptr = &bool_val, tdir_mca_override = false;
9798
size_t i;
9899

99100
/* run the prolog */
@@ -242,6 +243,63 @@ static int rte_init(void)
242243
free(string_key);
243244
}
244245

246+
/* retrieve temp directories info */
247+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TMPDIR, &wildcard_rank, &val, OPAL_STRING);
248+
if (OPAL_SUCCESS == ret && NULL != val) {
249+
/* We want to provide the user with the ability
250+
* to override RM settings at his own risk
251+
*/
252+
if( NULL == orte_process_info.top_session_dir ){
253+
orte_process_info.top_session_dir = val;
254+
} else {
255+
/* keep the MCA setting */
256+
tdir_mca_override = true;
257+
free(val);
258+
}
259+
val = NULL;
260+
}
261+
262+
if( !tdir_mca_override ){
263+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &wildcard_rank, &val, OPAL_STRING);
264+
if (OPAL_SUCCESS == ret && NULL != val) {
265+
/* We want to provide the user with the ability
266+
* to override RM settings at his own risk
267+
*/
268+
if( NULL == orte_process_info.job_session_dir ){
269+
orte_process_info.job_session_dir = val;
270+
} else {
271+
/* keep the MCA setting */
272+
free(val);
273+
tdir_mca_override = true;
274+
}
275+
val = NULL;
276+
}
277+
}
278+
279+
if( !tdir_mca_override ){
280+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_PROCDIR, &wildcard_rank, &val, OPAL_STRING);
281+
if (OPAL_SUCCESS == ret && NULL != val) {
282+
/* We want to provide the user with the ability
283+
* to override RM settings at his own risk
284+
*/
285+
if( NULL == orte_process_info.proc_session_dir ){
286+
orte_process_info.proc_session_dir = val;
287+
} else {
288+
/* keep the MCA setting */
289+
tdir_mca_override = true;
290+
free(val);
291+
}
292+
val = NULL;
293+
}
294+
}
295+
296+
if( !tdir_mca_override ){
297+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TDIR_RMCLEAN, &wildcard_rank, &bool_ptr, OPAL_BOOL);
298+
if (OPAL_SUCCESS == ret ) {
299+
orte_process_info.rm_session_dirs = bool_val;
300+
}
301+
}
302+
245303
/* retrieve our topology */
246304
val = NULL;
247305
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,

0 commit comments

Comments
 (0)