Skip to content

Commit 5e83418

Browse files
tomvdw, The TensorFlow Datasets Authors
authored and committed
Add more detailed logging in convert_format_utils for when from and to dirs are the same
PiperOrigin-RevId: 686272333
1 parent fc8cb72 commit 5e83418

File tree

1 file changed

+51
-29
lines changed

1 file changed

+51
-29
lines changed

tensorflow_datasets/scripts/cli/convert_format_utils.py

Lines changed: 51 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -280,6 +280,48 @@ def _remove_incomplete_files(path: epath.Path) -> None:
280280
logging.info('Removed %d incomplete files.', num_incomplete_files)
281281

282282

283+
def _get_info_for_dirs_to_convert(
    from_dir: epath.Path,
    to_dir: epath.Path,
    out_file_format: file_adapters.FileFormat,
    overwrite: bool,
) -> dataset_info.DatasetInfo | None:
  """Returns the dataset info of `from_dir` if it should be converted.

  Loads a builder from `from_dir` and decides, based on the file formats the
  dataset already has, whether a conversion to `out_file_format` is needed.

  Args:
    from_dir: Directory of the dataset to convert.
    to_dir: Directory the converted dataset is written to. It is only used to
      detect an in-place conversion (same path as `from_dir`) and for logging.
    out_file_format: The file format to convert the dataset to.
    overwrite: Whether shards may be overwritten when `out_file_format` is
      already listed as an alternative file format of the dataset.

  Returns:
    The dataset info of the dataset in `from_dir` when it should be converted,
    or `None` when the conversion is skipped, i.e. `out_file_format` is already
    an alternative file format, `overwrite` is false and `from_dir` and
    `to_dir` are the same path.

  Raises:
    ValueError: If `out_file_format` equals the file format of the dataset.
  """
  builder = read_only_builder_lib.builder_from_directory(from_dir)
  if out_file_format == builder.info.file_format:
    raise ValueError(
        f'The file format of the dataset ({builder.info.file_format}) is the'
        f' same as the specified out file format! ({out_file_format})'
    )
  if out_file_format in builder.info.alternative_file_formats:
    if overwrite:
      logging.warning(
          'The file format to convert to (%s) is already an alternative file'
          ' format. Overwriting the shards!',
          out_file_format.value,
      )
      return builder.info
    elif os.fspath(from_dir) == os.fspath(to_dir):
      # The arguments follow the placeholder order of the message: first the
      # file format, then the dataset directory.
      logging.info(
          'The file format to convert to (%s) is already an alternative file'
          ' format of the dataset in %s. Skipping conversion.',
          out_file_format.value,
          os.fspath(from_dir),
      )
      # TODO(weide) add check whether data files are actually present.
      return None
    else:
      logging.warning(
          'The file format to convert to (%s) is already an alternative file'
          ' format, but the converted output is being written to a different'
          ' folder, so the shards will be converted anyway. From: %s, to: %s',
          out_file_format.value,
          os.fspath(from_dir),
          os.fspath(to_dir),
      )
  return builder.info
323+
324+
283325
def _convert_dataset_dirs(
284326
from_to_dirs: Mapping[epath.Path, epath.Path],
285327
out_file_format: file_adapters.FileFormat,
@@ -303,36 +345,16 @@ def _convert_dataset_dirs(
303345
logging.info('Converting %d datasets.', len(from_to_dirs))
304346

305347
found_dataset_versions: dict[epath.Path, dataset_info.DatasetInfo] = {}
348+
# TODO(weide) parallelize this, because it's slow for dirs with many datasets.
306349
for from_dir, to_dir in from_to_dirs.items():
307-
builder = read_only_builder_lib.builder_from_directory(from_dir)
308-
if out_file_format == builder.info.file_format:
309-
raise ValueError(
310-
f'The file format of the dataset ({builder.info.file_format}) is the'
311-
f' same as the specified out file format! ({out_file_format})'
312-
)
313-
if out_file_format in builder.info.alternative_file_formats:
314-
if overwrite:
315-
logging.warning(
316-
'The file format to convert to (%s) is already an alternative file'
317-
' format. Overwriting the shards!',
318-
out_file_format.value,
319-
)
320-
elif from_dir == to_dir:
321-
logging.info(
322-
'The file format to convert to (%s) is already an alternative file'
323-
' format of the dataset in %s. Skipping conversion.',
324-
os.fspath(from_dir),
325-
out_file_format.value,
326-
)
327-
continue
328-
else:
329-
logging.warning(
330-
'The file format to convert to (%s) is already an alternative file'
331-
' format, but the converted output is being written to a different'
332-
' folder, so the shards will be converted anyway.',
333-
out_file_format.value,
334-
)
335-
found_dataset_versions[from_dir] = builder.info
350+
info = _get_info_for_dirs_to_convert(
351+
from_dir=from_dir,
352+
to_dir=to_dir,
353+
out_file_format=out_file_format,
354+
overwrite=overwrite,
355+
)
356+
if info is not None:
357+
found_dataset_versions[from_dir] = info
336358

337359
convert_dataset_fn = functools.partial(
338360
_convert_dataset,

0 commit comments

Comments
 (0)