Skip to content

Commit 524029d

Browse files
authored
Merge pull request #9945 from edgargabriel/topic/lockedfile-fixes
Topic/lockedfile fixes
2 parents 64b84e6 + 6178e43 commit 524029d

File tree

4 files changed

+75 −13 lines changed

4 files changed

+75 −13 lines changed

ompi/mca/common/ompio/common_ompio.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -158,6 +158,7 @@ struct ompio_file_t {
158158
int f_perm;
159159
ompi_communicator_t *f_comm;
160160
const char *f_filename;
161+
char *f_fullfilename;
161162
char *f_datarep;
162163
opal_convertor_t *f_mem_convertor;
163164
opal_convertor_t *f_file_convertor;

ompi/mca/common/ompio/common_ompio_file_open.c

Lines changed: 23 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,9 @@
4141
#include <math.h>
4242
#include "common_ompio.h"
4343
#include "ompi/mca/topo/topo.h"
44+
#include "opal/util/opal_getcwd.h"
45+
#include "opal/util/path.h"
46+
#include "opal/util/os_path.h"
4447

4548
static mca_common_ompio_generate_current_file_view_fn_t generate_current_file_view_fn;
4649
static mca_common_ompio_get_mca_parameter_value_fn_t get_mca_parameter_value_fn;
@@ -100,6 +103,22 @@ int mca_common_ompio_file_open (ompi_communicator_t *comm,
100103
ompio_fh->f_get_mca_parameter_value=get_mca_parameter_value_fn;
101104

102105
ompio_fh->f_filename = filename;
106+
if (opal_path_is_absolute(filename) ) {
107+
ompio_fh->f_fullfilename = strdup(filename);
108+
}
109+
else {
110+
char path[OPAL_PATH_MAX];
111+
ret = opal_getcwd(path, OPAL_PATH_MAX);
112+
if (OPAL_SUCCESS != ret) {
113+
goto fn_fail;
114+
}
115+
ompio_fh->f_fullfilename = opal_os_path(0, path, filename, NULL);
116+
if (NULL == ompio_fh->f_fullfilename){
117+
ret = OMPI_ERROR;
118+
goto fn_fail;
119+
}
120+
}
121+
103122
mca_common_ompio_set_file_defaults (ompio_fh);
104123

105124
ompio_fh->f_split_coll_req = NULL;
@@ -285,7 +304,7 @@ int mca_common_ompio_file_close (ompio_file_t *ompio_fh)
285304
ret = ompio_fh->f_fs->fs_file_close (ompio_fh);
286305
}
287306
if ( delete_flag ) {
288-
ret = mca_common_ompio_file_delete ( ompio_fh->f_filename, &(MPI_INFO_NULL->super) );
307+
ret = mca_common_ompio_file_delete ( ompio_fh->f_fullfilename, &(MPI_INFO_NULL->super) );
289308
}
290309

291310
if ( NULL != ompio_fh->f_fs ) {
@@ -350,7 +369,8 @@ int mca_common_ompio_file_close (ompio_file_t *ompio_fh)
350369
free ( ompio_fh->f_coll_write_time );
351370
ompio_fh->f_coll_write_time = NULL;
352371
}
353-
372+
free (ompio_fh->f_fullfilename);
373+
354374
if ( NULL != ompio_fh->f_coll_read_time ) {
355375
free ( ompio_fh->f_coll_read_time );
356376
ompio_fh->f_coll_read_time = NULL;
@@ -371,8 +391,7 @@ int mca_common_ompio_file_close (ompio_file_t *ompio_fh)
371391
if ( MPI_DATATYPE_NULL != ompio_fh->f_orig_filetype ){
372392
ompi_datatype_destroy (&ompio_fh->f_orig_filetype);
373393
}
374-
375-
394+
376395
if (MPI_COMM_NULL != ompio_fh->f_comm && !(ompio_fh->f_flags & OMPIO_SHAREDFP_IS_SET) ) {
377396
ompi_comm_free (&ompio_fh->f_comm);
378397
}

ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile_file_open.c

Lines changed: 35 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,11 @@
3737
#include <fcntl.h>
3838
#include <unistd.h>
3939

40+
#include "opal/util/output.h"
41+
#include "opal/util/opal_getcwd.h"
42+
#include "opal/util/path.h"
43+
#include "opal/util/os_path.h"
44+
4045
int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
4146
const char* filename,
4247
int amode,
@@ -110,8 +115,26 @@ int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
110115
return OMPI_ERR_OUT_OF_RESOURCE;
111116
}
112117
snprintf(lockedfilename, filenamelen, "%s-%u-%d%s",filename,masterjobid,int_pid,".lock");
113-
module_data->filename = lockedfilename;
114-
118+
if (opal_path_is_absolute(lockedfilename) ) {
119+
module_data->filename = lockedfilename;
120+
} else {
121+
char path[OPAL_PATH_MAX];
122+
err = opal_getcwd(path, OPAL_PATH_MAX);
123+
if (OPAL_SUCCESS != err) {
124+
free (sh);
125+
free (module_data);
126+
free (lockedfilename);
127+
return err;
128+
}
129+
module_data->filename = opal_os_path(0, path, lockedfilename, NULL);
130+
if (NULL == module_data->filename){
131+
free (sh);
132+
free (module_data);
133+
free (lockedfilename);
134+
return OMPI_ERROR;
135+
}
136+
}
137+
115138
/*-------------------------------------------------*/
116139
/*Open the lockedfile without shared file pointer */
117140
/*-------------------------------------------------*/
@@ -125,12 +148,19 @@ int mca_sharedfp_lockedfile_file_open (struct ompi_communicator_t *comm,
125148
opal_output(0, "[%d]mca_sharedfp_lockedfile_file_open: Error during file open\n",
126149
fh->f_rank);
127150
free (sh);
128-
free(module_data);
151+
free (module_data);
129152
free (lockedfilename);
130153
return OMPI_ERROR;
131154
}
132-
write ( handle, &position, sizeof(OMPI_MPI_OFFSET_TYPE) );
133-
close ( handle );
155+
err = opal_best_effort_write (handle, &position, sizeof(OMPI_MPI_OFFSET_TYPE));
156+
if (OPAL_SUCCESS != err) {
157+
free (sh);
158+
free (module_data);
159+
free (lockedfilename);
160+
close (handle);
161+
return err;
162+
}
163+
close (handle);
134164
}
135165
err = comm->c_coll->coll_barrier ( comm, comm->c_coll->coll_barrier_module );
136166
if ( OMPI_SUCCESS != err ) {

ompi/mca/sharedfp/lockedfile/sharedfp_lockedfile_request_position.c

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,9 @@
2525
#include "ompi/constants.h"
2626
#include "ompi/mca/sharedfp/sharedfp.h"
2727
#include "ompi/mca/sharedfp/base/base.h"
28+
#include "opal/util/output.h"
29+
#include "opal/util/fd.h"
30+
2831

2932
/*Use fcntl to lock the hidden file which stores the current position*/
3033
#include <fcntl.h>
@@ -76,7 +79,10 @@ int mca_sharedfp_lockedfile_request_position(struct mca_sharedfp_base_data_t * s
7679

7780
/* read from the file */
7881
lseek ( fd, 0, SEEK_SET );
79-
read ( fd, &buf, sizeof(OMPI_MPI_OFFSET_TYPE));
82+
ret = opal_fd_read ( fd, sizeof(OMPI_MPI_OFFSET_TYPE), &buf);
83+
if (OPAL_SUCCESS != ret ) {
84+
goto exit;
85+
}
8086
if ( mca_sharedfp_lockedfile_verbose ) {
8187
opal_output(ompi_sharedfp_base_framework.framework_output,
8288
"sharedfp_lockedfile_request_position: Read last_offset=%lld! ret=%d\n",buf, ret);
@@ -92,8 +98,11 @@ int mca_sharedfp_lockedfile_request_position(struct mca_sharedfp_base_data_t * s
9298

9399
/* write to the file */
94100
lseek ( fd, 0, SEEK_SET );
95-
write ( fd, &position, sizeof(OMPI_MPI_OFFSET_TYPE));
96-
101+
ret = opal_best_effort_write ( fd, &position, sizeof(OMPI_MPI_OFFSET_TYPE));
102+
/* No need to handle error case here, the subsequent steps are identical
103+
in case of ret != OPAL_SUCCESS, namely release lock and return ret */
104+
105+
exit:
97106
/* unlock the file */
98107
if ( mca_sharedfp_lockedfile_verbose ) {
99108
opal_output(ompi_sharedfp_base_framework.framework_output,
@@ -115,7 +124,10 @@ int mca_sharedfp_lockedfile_request_position(struct mca_sharedfp_base_data_t * s
115124
if (fcntl(fd, F_SETLK, &fl) == -1) {
116125
opal_output(0,"sharedfp_lockedfile_request_position:failed to release lock for fd: %d\n",fd);
117126
opal_output(0,"error(%i): %s", errno, strerror(errno));
118-
return OMPI_ERROR;
127+
/* Only overwrite error code if it was OPAL_SUCCESS previously */
128+
if (OPAL_SUCCESS == ret ) {
129+
ret = OMPI_ERROR;
130+
}
119131
}
120132
else {
121133
if ( mca_sharedfp_lockedfile_verbose ) {

0 commit comments

Comments
 (0)