Skip to content

Commit b967646

Browse files
authored
Merge pull request #5610 from edgargabriel/pr/sharedfp-naming-conflict-v3.0
sharedfp/sm and lockedfile: fix naming bug
2 parents 2a4450d + 2eefa27 commit b967646

File tree

2 files changed

+42
-14
lines changed

2 files changed

+42
-14
lines changed

ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile_file_open.c

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@
3434
#include <sys/stat.h>
3535
#endif
3636
#include <fcntl.h>
37+
#include <unistd.h>
3738

3839
int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
3940
const char* filename,
@@ -49,6 +50,9 @@ int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
4950
mca_io_ompio_file_t * shfileHandle, *ompio_fh;
5051
mca_io_ompio_data_t *data;
5152

53+
pid_t my_pid;
54+
int int_pid;
55+
5256
/*------------------------------------------------------------*/
5357
/*Open the same file again without shared file pointer support*/
5458
/*------------------------------------------------------------*/
@@ -109,15 +113,27 @@ int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
109113
comm->c_coll->coll_bcast ( &masterjobid, 1, MPI_UNSIGNED, 0, comm,
110114
comm->c_coll->coll_bcast_module );
111115

112-
size_t filenamelen = strlen(filename) + 16;
116+
if ( 0 == fh->f_rank ) {
117+
my_pid = getpid();
118+
int_pid = (int) my_pid;
119+
}
120+
err = comm->c_coll->coll_bcast (&int_pid, 1, MPI_INT, 0, comm, comm->c_coll->coll_bcast_module );
121+
if ( OMPI_SUCCESS != err ) {
122+
opal_output(0, "[%d]mca_sharedfp_lockedfile_file_open: Error in bcast operation\n", fh->f_rank);
123+
free (sh);
124+
free(module_data);
125+
return err;
126+
}
127+
128+
size_t filenamelen = strlen(filename) + 24;
113129
lockedfilename = (char*)malloc(sizeof(char) * filenamelen);
114130
if ( NULL == lockedfilename ) {
115131
free (shfileHandle);
116132
free (sh);
117133
free (module_data);
118134
return OMPI_ERR_OUT_OF_RESOURCE;
119135
}
120-
snprintf(lockedfilename, filenamelen, "%s-%u%s",filename,masterjobid,".lock");
136+
snprintf(lockedfilename, filenamelen, "%s-%u-%d%s",filename,masterjobid,int_pid,".lock");
121137
module_data->filename = lockedfilename;
122138

123139
/*-------------------------------------------------*/

ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c

Lines changed: 24 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@
4343
#include <semaphore.h>
4444
#include <sys/mman.h>
4545
#include <libgen.h>
46-
46+
#include <unistd.h>
4747

4848
int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm,
4949
const char* filename,
@@ -57,12 +57,16 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm,
5757
mca_io_ompio_file_t * shfileHandle, *ompio_fh;
5858
char * filename_basename;
5959
char * sm_filename;
60+
int sm_filename_length;
6061
struct mca_sharedfp_sm_offset * sm_offset_ptr;
6162
struct mca_sharedfp_sm_offset sm_offset;
6263
mca_io_ompio_data_t *data;
6364
int sm_fd;
6465
int rank;
65-
66+
uint32_t comm_cid;
67+
int int_pid;
68+
pid_t my_pid;
69+
6670
/*----------------------------------------------------*/
6771
/*Open the same file again without shared file pointer*/
6872
/*----------------------------------------------------*/
@@ -132,25 +136,33 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm,
132136
** TODO: properly name the file so that different jobs can run on the same system w/o
133137
** overwriting each other, e.g. orte_process_info.proc_session_dir
134138
*/
135-
/*sprintf(sm_filename,"%s%s",filename,".sm");*/
136-
filename_basename = basename((void *)filename);
137-
sm_filename = (char*) malloc( sizeof(char) * (strlen(filename_basename)+64) );
139+
filename_basename = basename((char *)filename);
140+
/* format is "%s/%s_cid-%d-%d.sm", see below */
141+
sm_filename_length = strlen(ompi_process_info.job_session_dir) + 1 + strlen(filename_basename) + 5 + (3*sizeof(uint32_t)+1) + 4;
142+
sm_filename = (char*) malloc( sizeof(char) * sm_filename_length);
138143
if (NULL == sm_filename) {
139144
free(sm_data);
140145
free(sh);
141146
free(shfileHandle);
142147
return OMPI_ERR_OUT_OF_RESOURCE;
143148
}
144149

145-
opal_jobid_t masterjobid;
146-
if ( 0 == comm->c_my_rank ) {
147-
ompi_proc_t *masterproc = ompi_group_peer_lookup(comm->c_local_group, 0 );
148-
masterjobid = OMPI_CAST_RTE_NAME(&masterproc->super.proc_name)->jobid;
150+
comm_cid = ompi_comm_get_cid(comm);
151+
if ( 0 == fh->f_rank ) {
152+
my_pid = getpid();
153+
int_pid = (int) my_pid;
154+
}
155+
err = comm->c_coll->coll_bcast (&int_pid, 1, MPI_INT, 0, comm, comm->c_coll->coll_bcast_module );
156+
if ( OMPI_SUCCESS != err ) {
157+
opal_output(0,"mca_sharedfp_sm_file_open: Error in bcast operation \n");
158+
free(sm_filename);
159+
free(sm_data);
160+
free(sh);
161+
return err;
149162
}
150-
comm->c_coll->coll_bcast ( &masterjobid, 1, MPI_UNSIGNED, 0, comm,
151-
comm->c_coll->coll_bcast_module );
163+
snprintf(sm_filename, sm_filename_length, "%s/%s_cid-%d-%d.sm", ompi_process_info.job_session_dir,
164+
filename_basename, comm_cid, int_pid);
152165

153-
sprintf(sm_filename,"/tmp/OMPIO_%s_%d_%s",filename_basename, masterjobid, ".sm");
154166
/* open shared memory file, initialize to 0, map into memory */
155167
sm_fd = open(sm_filename, O_RDWR | O_CREAT,
156168
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);

0 commit comments

Comments
 (0)