@@ -178,8 +178,20 @@ def _get_de_dir_from_file_path(file_path):
178
178
de_dir = self ._get_de_variable_folder_dir (file_path , global_step )
179
179
return file_prefix_pattern , global_step , de_dir
180
180
181
+ def _rank0_delete_files_and_return_de_dir (file_path ):
182
+ file_prefix_pattern , global_step , de_dir = _get_de_dir_from_file_path (
183
+ file_path )
184
+ if global_step is not None :
185
+ ckpt_index_list = file_io .get_matching_files (file_prefix_pattern +
186
+ '-*.index' )
187
+ self ._delete_redundant_de_dir (
188
+ ckpt_index_list
189
+ ) # Compatible with automatic sweep function of checkpointmanager
190
+ return de_dir
191
+
181
192
if self ._hvd is None :
182
193
file_path = tf_write_func ()
194
+ de_dir = _rank0_delete_files_and_return_de_dir (file_path )
183
195
self ._de_handle_root_and_var_with_func (de_dir = de_dir ,
184
196
func = self ._de_var_fs_save_funtion )
185
197
else :
@@ -189,14 +201,7 @@ def _get_de_dir_from_file_path(file_path):
189
201
self ._hvd .broadcast_object (file_path ,
190
202
root_rank = 0 ,
191
203
name = 'de_hvd_broadcast_file_path' )
192
- file_prefix_pattern , global_step , de_dir = _get_de_dir_from_file_path (
193
- file_path )
194
- if global_step is not None :
195
- ckpt_index_list = file_io .get_matching_files (file_prefix_pattern +
196
- '-*.index' )
197
- self ._delete_redundant_de_dir (
198
- ckpt_index_list
199
- ) # Compatible with automatic sweep function of checkpointmanager
204
+ de_dir = _rank0_delete_files_and_return_de_dir (file_path )
200
205
self ._hvd .join () # Sync for avoiding files conflict
201
206
self ._de_handle_root_and_var_with_func (
202
207
de_dir = de_dir , func = self ._de_var_fs_save_funtion )
@@ -205,8 +210,7 @@ def _get_de_dir_from_file_path(file_path):
205
210
else :
206
211
file_path = self ._hvd .broadcast_object (
207
212
None , root_rank = 0 , name = 'de_hvd_broadcast_file_path' )
208
- file_prefix_pattern , global_step , de_dir = _get_de_dir_from_file_path (
209
- file_path )
213
+ _ , _ , de_dir = _get_de_dir_from_file_path (file_path )
210
214
self ._hvd .join () # Sync for avoiding files conflict
211
215
self ._de_handle_root_and_var_with_func (
212
216
de_dir = de_dir , func = self ._de_var_fs_save_funtion )
0 commit comments