@@ -280,6 +280,48 @@ def _remove_incomplete_files(path: epath.Path) -> None:
280
280
logging .info ('Removed %d incomplete files.' , num_incomplete_files )
281
281
282
282
283
def _get_info_for_dirs_to_convert(
    from_dir: epath.Path,
    to_dir: epath.Path,
    out_file_format: file_adapters.FileFormat,
    overwrite: bool,
) -> dataset_info.DatasetInfo | None:
  """Returns the info of the dataset in `from_dir` if it needs converting.

  Args:
    from_dir: Directory of the dataset to convert; a builder is loaded from it.
    to_dir: Directory the converted data would be written to.
    out_file_format: File format to convert the dataset to.
    overwrite: Whether shards that already exist in `out_file_format` should be
      converted again.

  Returns:
    The dataset info of the builder loaded from `from_dir` when the dataset
    should be converted, or `None` when the conversion should be skipped
    because `out_file_format` is already an alternative file format, the
    output goes to the same directory, and `overwrite` is not set.

  Raises:
    ValueError: If `out_file_format` is the same as the dataset's file format.
  """
  builder = read_only_builder_lib.builder_from_directory(from_dir)
  info = builder.info
  if out_file_format == info.file_format:
    raise ValueError(
        f'The file format of the dataset ({info.file_format}) is the'
        f' same as the specified out file format! ({out_file_format})'
    )
  if out_file_format not in info.alternative_file_formats:
    # Nothing has been converted to this format yet: always convert.
    return info

  from_path = os.fspath(from_dir)
  to_path = os.fspath(to_dir)
  if overwrite:
    logging.warning(
        'The file format to convert to (%s) is already an alternative file'
        ' format. Overwriting the shards!',
        out_file_format.value,
    )
    return info
  if from_path == to_path:
    # Argument order must match the placeholders: file format first, then the
    # dataset directory.
    logging.info(
        'The file format to convert to (%s) is already an alternative file'
        ' format of the dataset in %s. Skipping conversion.',
        out_file_format.value,
        from_path,
    )
    # TODO(weide) add check whether data files are actually present.
    return None
  logging.warning(
      'The file format to convert to (%s) is already an alternative file'
      ' format, but the converted output is being written to a different'
      ' folder, so the shards will be converted anyway. From: %s, to: %s',
      out_file_format.value,
      from_path,
      to_path,
  )
  return info
323
+
324
+
283
325
def _convert_dataset_dirs (
284
326
from_to_dirs : Mapping [epath .Path , epath .Path ],
285
327
out_file_format : file_adapters .FileFormat ,
@@ -303,36 +345,16 @@ def _convert_dataset_dirs(
303
345
logging .info ('Converting %d datasets.' , len (from_to_dirs ))
304
346
305
347
found_dataset_versions : dict [epath .Path , dataset_info .DatasetInfo ] = {}
348
+ # TODO(weide) parallelize this, because it's slow for dirs with many datasets.
306
349
for from_dir , to_dir in from_to_dirs .items ():
307
- builder = read_only_builder_lib .builder_from_directory (from_dir )
308
- if out_file_format == builder .info .file_format :
309
- raise ValueError (
310
- f'The file format of the dataset ({ builder .info .file_format } ) is the'
311
- f' same as the specified out file format! ({ out_file_format } )'
312
- )
313
- if out_file_format in builder .info .alternative_file_formats :
314
- if overwrite :
315
- logging .warning (
316
- 'The file format to convert to (%s) is already an alternative file'
317
- ' format. Overwriting the shards!' ,
318
- out_file_format .value ,
319
- )
320
- elif from_dir == to_dir :
321
- logging .info (
322
- 'The file format to convert to (%s) is already an alternative file'
323
- ' format of the dataset in %s. Skipping conversion.' ,
324
- os .fspath (from_dir ),
325
- out_file_format .value ,
326
- )
327
- continue
328
- else :
329
- logging .warning (
330
- 'The file format to convert to (%s) is already an alternative file'
331
- ' format, but the converted output is being written to a different'
332
- ' folder, so the shards will be converted anyway.' ,
333
- out_file_format .value ,
334
- )
335
- found_dataset_versions [from_dir ] = builder .info
350
+ info = _get_info_for_dirs_to_convert (
351
+ from_dir = from_dir ,
352
+ to_dir = to_dir ,
353
+ out_file_format = out_file_format ,
354
+ overwrite = overwrite ,
355
+ )
356
+ if info is not None :
357
+ found_dataset_versions [from_dir ] = info
336
358
337
359
convert_dataset_fn = functools .partial (
338
360
_convert_dataset ,
0 commit comments