@@ -114,7 +114,16 @@ static int orte_create_dir(char *directory)
114114
115115/* 
116116 * Construct the fullpath to the session directory - it 
117-  * will consist of "ompi.<hostname>.<pid>" 
117+  * will consist of "ompi.<hostname>.<effective-uid>", and 
118+  * have subdirs: 
119+  * 
120+  * pid - named "pid.<pid>", using the pid of the mpirun that oversees 
121+  *       this job. Note that direct-launched processes have no mpirun and 
122+  *       manufacture this value ("jf.<job-family>", or "jobfam" if no proc) 
123+  * 
124+  * jobid - jobid of the application being executed 
125+  * 
126+  * vpid - vpid of the process 
118127 */ 
119128int 
120129orte_session_dir_get_name (char  * * fulldirpath ,
@@ -132,10 +141,14 @@ orte_session_dir_get_name(char **fulldirpath,
132141    bool  prefix_provided  =  false;
133142    int  exit_status  =  ORTE_SUCCESS ;
134143    size_t  len ;
144+     uid_t  uid ;
135145
136146    /* Ensure that system info is set */ 
137147    orte_proc_info ();
138148
149+     /* get the effective uid */ 
150+     uid  =  geteuid ();
151+ 
139152    /* 
140153     * set the 'hostname' 
141154     */ 
@@ -156,30 +169,48 @@ orte_session_dir_get_name(char **fulldirpath,
156169    /* construct the frontend of the session directory*/ 
157170    if  (NULL  !=  orte_process_info .top_session_dir ) {
158171        frontend  =  strdup (orte_process_info .top_session_dir );
172+     } else  { /* If not set then construct it */ 
173+         if  (0  >  asprintf (& frontend , "ompi.%s.%lu" , hostname , (unsigned long )uid )) {
174+             ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
175+             exit_status  =  ORTE_ERR_OUT_OF_RESOURCE ;
176+             goto cleanup ;
177+         }
159178    }
160-     else  { /* If not set then construct it */ 
161-         if  (0  >  asprintf (& frontend , "ompi.%s.%lu" , hostname , (unsigned long )orte_process_info .pid )) {
179+ 
180+     /* construct the next level down, which belongs to the 
181+      * job family. This is related to the mpirun that launched 
182+      * the job, or is an arbitrary (agreed upon) value if 
183+      * direct launched */ 
184+     if  (ORTE_PROC_IS_HNP ) {
185+         if  (0  >  asprintf (& jobfam , "pid.%lu" , (unsigned long )orte_process_info .pid )) {
162186            ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
163187            exit_status  =  ORTE_ERR_OUT_OF_RESOURCE ;
164188            goto cleanup ;
165189        }
190+         orte_process_info .jobfam_session_dir  =  strdup (jobfam );
191+     } else  if  (NULL  !=  orte_process_info .jobfam_session_dir ) {
192+         /* we had a job family session dir passed down to us by mpirun */ 
193+         jobfam  =  strdup (orte_process_info .jobfam_session_dir );
194+     } else  {
195+         /* we were not given one, so define it */ 
196+         if  (NULL  ==  proc ) {
197+             jobfam  =  strdup ("jobfam" );
198+         } else  {
199+             if  (0  >  asprintf (& jobfam , "jf.%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
200+                 ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
201+                 exit_status  =  ORTE_ERR_OUT_OF_RESOURCE ;
202+                 goto cleanup ;
203+             }
204+         }
205+         orte_process_info .jobfam_session_dir  =  strdup (jobfam );
166206    }
167207
168208    /* 
169209     * Construct the session directory 
170210     */ 
171-     /* If we were given a valid vpid then we can construct it fully into: 
172-      *   openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID 
173-      */ 
211+     /* If we were given a valid vpid then we can construct it fully */ 
174212    if ( NULL  !=  proc ) {
175213        if  (ORTE_VPID_INVALID  !=  proc -> vpid ) {
176- 
177-             if  (0  >  asprintf (& jobfam , "%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
178-                 ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
179-                 exit_status  =  ORTE_ERR_OUT_OF_RESOURCE ;
180-                 goto cleanup ;
181-             }
182- 
183214            if  (0  >  asprintf (& job , "%d" , ORTE_LOCAL_JOBID (proc -> jobid ))) {
184215                ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
185216                exit_status  =  ORTE_ERR_OUT_OF_RESOURCE ;
@@ -192,23 +223,13 @@ orte_session_dir_get_name(char **fulldirpath,
192223                goto cleanup ;
193224            }
194225
195-             sessions  =  opal_os_path (  false, frontend , jobfam , job , vpidstr , NULL   );
226+             sessions  =  opal_os_path (false, frontend , jobfam , job , vpidstr , NULL );
196227            if ( NULL  ==  sessions  ) {
197228                ORTE_ERROR_LOG (ORTE_ERROR );
198229                exit_status  =  ORTE_ERROR ;
199230                goto cleanup ;
200231            }
201-         }
202-         /* If we were given a valid jobid then we can construct it partially into: 
203-          *   openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID 
204-          */ 
205-         else  if  (ORTE_JOBID_INVALID  !=  proc -> jobid ) {
206-             if  (0  >  asprintf (& jobfam , "%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
207-                 ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
208-                 exit_status  =  ORTE_ERR_OUT_OF_RESOURCE ;
209-                 goto cleanup ;
210-             }
211- 
232+         } else  if  (ORTE_JOBID_INVALID  !=  proc -> jobid ) {
212233            if  (0  >  asprintf (& job , "%d" , ORTE_LOCAL_JOBID (proc -> jobid ))) {
213234                ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
214235                exit_status  =  ORTE_ERR_OUT_OF_RESOURCE ;
@@ -221,14 +242,12 @@ orte_session_dir_get_name(char **fulldirpath,
221242                exit_status  =  ORTE_ERROR ;
222243                goto cleanup ;
223244            }
224-         } /* if both are invalid */ 
225-         else  {
245+         } else  {
226246            sessions  =  strdup (frontend ); /* must dup this to avoid double-free later */ 
227247        }
228248
229-     }    /* If we were not given a proc at all, then we just set it to frontend 
230-           */ 
231-     else  {
249+     } else  {
250+         /* If we were not given a proc at all, then we just set it to frontend */ 
232251        sessions  =  strdup (frontend ); /* must dup this to avoid double-free later */ 
233252    }
234253
@@ -666,14 +685,8 @@ static char *orte_build_job_session_dir(char *top_dir,
666685                                        orte_process_name_t  * proc ,
667686                                        orte_jobid_t  jobid )
668687{
669-     char  * jobfam  =  NULL ;
670688    char  * job_session_dir ;
671689
672-     if  (0  >  asprintf (& jobfam , "%d" , ORTE_JOB_FAMILY (proc -> jobid ))) {
673-         ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
674-         return  NULL ;
675-     }
676- 
677690    if  (ORTE_JOBID_WILDCARD  !=  jobid ) {
678691        char  * job  =  NULL ;
679692
@@ -682,19 +695,18 @@ static char *orte_build_job_session_dir(char *top_dir,
682695            job_session_dir  =  NULL ;
683696            goto out ;
684697        }
685-         job_session_dir  =  opal_os_path (false, top_dir , jobfam , job , NULL );
698+         job_session_dir  =  opal_os_path (false, top_dir , orte_process_info . jobfam_session_dir , job , NULL );
686699        free (job );
687700        if  (NULL  ==  job_session_dir ) {
688701            ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
689702        }
690703    } else  {
691-         job_session_dir  =  opal_os_path (false, top_dir , jobfam , NULL );
704+         job_session_dir  =  opal_os_path (false, top_dir , orte_process_info . jobfam_session_dir , NULL );
692705        if ( NULL  ==  job_session_dir ) {
693706            ORTE_ERROR_LOG (ORTE_ERR_OUT_OF_RESOURCE );
694707        }
695708    }
696709
697710out :
698-     free (jobfam );
699711    return  job_session_dir ;
700712}
0 commit comments