Skip to content

Commit df6e3e5

Browse files
committed
sharedfp/individual: defer error when not being able to open datafile
This commit changes the behavior of the individual sharedfp component. If the component cannot create either the datafile or the metadata file during File_open, it no longer raises an error at that point; the error is deferred. This allows applications that do not use shared file pointer operations to continue execution without any issue. If the user subsequently calls MPI_File_write_shared or a similar operation, however, an error will be raised. Fixes issue #7429. Signed-off-by: Edgar Gabriel <[email protected]>
1 parent e9a54e8 commit df6e3e5

File tree

2 files changed

+49
-22
lines changed

2 files changed

+49
-22
lines changed

ompi/mca/sharedfp/individual/sharedfp_individual_file_open.c

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2013-2018 University of Houston. All rights reserved.
12+
* Copyright (c) 2013-2019 University of Houston. All rights reserved.
1313
* Copyright (c) 2015-2018 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
@@ -92,11 +92,18 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm,
9292
MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_DELETE_ON_CLOSE,
9393
&(MPI_INFO_NULL->super), datafilehandle, false);
9494
if ( OMPI_SUCCESS != err) {
95-
opal_output(0, "mca_sharedfp_individual_file_open: Error during datafile file open\n");
95+
opal_output(ompi_sharedfp_base_framework.framework_output,
96+
"mca_sharedfp_individual_file_open: Error during datafile file open. Continuing anyway. \n");
9697
free (sh);
9798
free (datafilename);
9899
free (datafilehandle);
99-
return err;
100+
101+
// We reset the error code here to OMPI_SUCCESS since the individual component can act as
102+
// a dummy component, in case no sharedfp operations are used by the code. Invoking any write/read
103+
// operations will however lead to an error, since the sharedfp_data pointer will be NULL.
104+
sh = NULL;
105+
err = OMPI_SUCCESS;
106+
goto exit;
100107
}
101108

102109
/*----------------------------------------------------------*/
@@ -113,32 +120,48 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm,
113120
if ( NULL == metadatafilename ) {
114121
free (sh);
115122
free (datafilename);
123+
mca_common_ompio_file_close ( datafilehandle);
116124
free (datafilehandle);
117125
opal_output(0, "mca_sharedfp_individual_file_open: Error during memory allocation\n");
118-
return OMPI_ERR_OUT_OF_RESOURCE;
126+
127+
sh=NULL;
128+
err = OMPI_ERR_OUT_OF_RESOURCE;
129+
goto exit;
119130
}
120131
snprintf ( metadatafilename, len, "%s%s%d", filename, ".metadata.",fh->f_rank);
121132

122133
metadatafilehandle = (ompio_file_t *)malloc(sizeof(ompio_file_t));
123134
if ( NULL == metadatafilehandle ) {
124135
free (sh);
125136
free (datafilename);
137+
mca_common_ompio_file_close ( datafilehandle);
126138
free (datafilehandle);
127139
free (metadatafilename);
128140
opal_output(0, "mca_sharedfp_individual_file_open: Error during memory allocation\n");
129-
return OMPI_ERR_OUT_OF_RESOURCE;
141+
142+
sh = NULL;
143+
err = OMPI_ERR_OUT_OF_RESOURCE;
144+
goto exit;
130145
}
131146
err = mca_common_ompio_file_open ( MPI_COMM_SELF,metadatafilename,
132147
MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_DELETE_ON_CLOSE,
133148
&(MPI_INFO_NULL->super), metadatafilehandle, false);
134149
if ( OMPI_SUCCESS != err) {
135-
opal_output(0, "mca_sharedfp_individual_file_open: Error during metadatafile file open\n");
150+
opal_output(ompi_sharedfp_base_framework.framework_output,
151+
"mca_sharedfp_individual_file_open: Error during metadatafile file open. Continuing anyway. \n");
136152
free (sh);
137153
free (datafilename);
154+
mca_common_ompio_file_close ( datafilehandle);
138155
free (datafilehandle);
139156
free (metadatafilename);
140157
free (metadatafilehandle);
141-
return err;
158+
159+
// We reset the error code here to OMPI_SUCCESS since the individual component can act as
160+
// a dummy component, in case no sharedfp operations are used by the code. Invoking any write/read
161+
// operations will however lead to an error, since the sharedfp_data pointer will be NULL.
162+
sh = NULL;
163+
err = OMPI_SUCCESS;
164+
goto exit;
142165
}
143166

144167
/*save the datafilehandle and metadatahandle in the sharedfp individual module data structure*/
@@ -150,6 +173,8 @@ int mca_sharedfp_individual_file_open (struct ompi_communicator_t *comm,
150173
headnode->metadatafilename = metadatafilename;
151174
}
152175

176+
177+
exit:
153178
/*save the sharedfp individual module data structure in the ompio filehandle structure*/
154179
fh->f_sharedfp_data = sh;
155180

ompi/mca/sharedfp/individual/sharedfp_individual_write.c

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -54,24 +54,26 @@ int mca_sharedfp_individual_write (ompio_file_t *fh,
5454
/*Retrieve data structure for shared file pointer operations*/
5555
sh = fh->f_sharedfp_data;
5656
headnode = (mca_sharedfp_individual_header_record*)sh->selected_module_data;
57+
if ( NULL == headnode) {
58+
opal_output (0, "sharedfp_individual_write_ordered: headnode is NULL but file is open\n");
59+
return OMPI_ERROR;
60+
}
5761

58-
if (headnode) {
59-
/*Insert metadata record into a queue*/
60-
mca_sharedfp_individual_insert_metadata(OMPI_FILE_WRITE_SHARED, totalbytes, sh);
61-
62-
/*Write the data into individual file*/
63-
ret = mca_common_ompio_file_write_at ( headnode->datafilehandle,
64-
headnode->datafile_offset,
65-
buf, count, datatype, status);
66-
if ( OMPI_SUCCESS != ret ) {
67-
opal_output(0,"mca_sharedfp_individual_write: Error while writing the datafile \n");
68-
return -1;
69-
}
70-
71-
/* Update the datafileoffset*/
72-
headnode->datafile_offset = headnode->datafile_offset + totalbytes;
62+
/*Insert metadata record into a queue*/
63+
mca_sharedfp_individual_insert_metadata(OMPI_FILE_WRITE_SHARED, totalbytes, sh);
64+
65+
/*Write the data into individual file*/
66+
ret = mca_common_ompio_file_write_at ( headnode->datafilehandle,
67+
headnode->datafile_offset,
68+
buf, count, datatype, status);
69+
if ( OMPI_SUCCESS != ret ) {
70+
opal_output(0,"mca_sharedfp_individual_write: Error while writing the datafile \n");
71+
return -1;
7372
}
7473

74+
/* Update the datafileoffset*/
75+
headnode->datafile_offset = headnode->datafile_offset + totalbytes;
76+
7577
return ret;
7678
}
7779

0 commit comments

Comments
 (0)