Skip to content

Commit 5c2bd15

Browse files
fineguy and The TensorFlow Datasets Authors
authored and committed
Simplify file_utils.list_dataset_variants().
PiperOrigin-RevId: 707571864
1 parent 6b93631 commit 5c2bd15

File tree

3 files changed

+66
-181
lines changed

3 files changed

+66
-181
lines changed

tensorflow_datasets/core/utils/file_utils.py

Lines changed: 65 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -343,113 +343,6 @@ def _find_files_with_glob(
343343
yield from _find_files_without_glob(folder, globs, file_names)
344344

345345

346-
def _find_references_with_glob(
347-
folder: epath.Path,
348-
is_data_dir: bool,
349-
is_dataset_dir: bool,
350-
namespace: str | None = None,
351-
include_old_tfds_version: bool = True,
352-
glob_suffixes: Sequence[str] = ('json',),
353-
) -> Iterator[naming.DatasetReference]:
354-
"""Yields all dataset references in the given folder.
355-
356-
Args:
357-
folder: the folder where to look for datasets. Can be either a root data
358-
dir, or a dataset folder.
359-
is_data_dir: Whether `folder` is a root TFDS data dir.
360-
is_dataset_dir: Whether `folder` is the folder of one specific dataset.
361-
namespace: Optional namespace to which the found datasets belong to.
362-
include_old_tfds_version: include datasets that have been generated with
363-
TFDS before 4.0.0.
364-
glob_suffixes: list of file suffixes to use to create the glob for
365-
interesting TFDS files. Defaults to json files.
366-
"""
367-
if is_dataset_dir and is_data_dir:
368-
raise ValueError('Folder cannot be both a data dir and dataset dir!')
369-
if not is_data_dir and not is_dataset_dir:
370-
raise ValueError('Folder must be either a data dir or a dataset dir!')
371-
372-
if is_data_dir:
373-
data_dir = folder
374-
dataset_name = None
375-
stars = ['*/*/*/*', '*/*/*']
376-
else:
377-
data_dir = folder.parent
378-
dataset_name = folder.name
379-
stars = ['*/*/*', '*/*']
380-
381-
globs: list[str] = []
382-
for star in stars:
383-
if glob_suffixes:
384-
globs.extend([f'{star}.{suffix}' for suffix in glob_suffixes])
385-
else:
386-
globs.append(star)
387-
388-
# Check files matching the globs and are files we are interested in.
389-
matched_files_per_folder = collections.defaultdict(set)
390-
for file in _find_files_with_glob(
391-
folder,
392-
globs=globs,
393-
file_names=_INFO_FILE_NAMES,
394-
):
395-
matched_files_per_folder[file.parent].add(file.name)
396-
397-
for data_folder, matched_files in matched_files_per_folder.items():
398-
if constants.DATASET_INFO_FILENAME not in matched_files:
399-
logging.warning(
400-
'Ignoring dataset folder %s, which has no dataset_info.json',
401-
os.fspath(data_folder),
402-
)
403-
continue
404-
if (
405-
not include_old_tfds_version
406-
and constants.FEATURES_FILENAME not in matched_files
407-
):
408-
logging.info(
409-
'Ignoring dataset folder %s, which has no features.json',
410-
os.fspath(data_folder),
411-
)
412-
continue
413-
414-
version = data_folder.name
415-
if not version_lib.Version.is_valid(version):
416-
logging.warning(
417-
'Ignoring dataset folder %s, which has invalid version %s',
418-
os.fspath(data_folder),
419-
version,
420-
)
421-
continue
422-
423-
config = None
424-
if is_data_dir:
425-
if data_folder.parent.parent == folder:
426-
dataset_name = data_folder.parent.name
427-
elif data_folder.parent.parent.parent == folder:
428-
dataset_name = data_folder.parent.parent.name
429-
config = data_folder.parent.name
430-
else:
431-
raise ValueError(
432-
f'Could not detect dataset and config from path {data_folder} in'
433-
f' {folder}'
434-
)
435-
else:
436-
if data_folder.parent != folder:
437-
config = data_folder.parent.name
438-
439-
if not naming.is_valid_dataset_name(dataset_name):
440-
logging.warning('Invalid dataset name: %s', dataset_name)
441-
continue
442-
443-
yield naming.DatasetReference(
444-
namespace=namespace,
445-
data_dir=data_dir,
446-
dataset_name=dataset_name,
447-
config=config,
448-
version=version,
449-
info_filenames=matched_files,
450-
)
451-
452-
453346
def list_dataset_versions(
454347
dataset_config_dir: epath.PathLike,
455348
) -> list[version_lib.Version]:
@@ -476,45 +369,77 @@ def list_dataset_versions(
476369

477370

478371
def list_dataset_variants(
479-
dataset_dir: epath.PathLike,
372+
dataset_dir: Path,
480373
namespace: str | None = None,
481-
include_versions: bool = True,
482374
include_old_tfds_version: bool = False,
483-
glob_suffixes: Sequence[str] = ('json',),
484375
) -> Iterator[naming.DatasetReference]:
485376
"""Yields all variants (config + version) found in `dataset_dir`.
486377
487-
Arguments:
378+
Args:
488379
dataset_dir: the folder of the dataset.
489380
namespace: optional namespace to which this data dir belongs.
490-
include_versions: whether to list what versions are available.
491381
include_old_tfds_version: include datasets that have been generated with
492382
TFDS before 4.0.0.
493-
glob_suffixes: list of file suffixes to use to create the glob for
494-
interesting TFDS files. Defaults to json files.
495383
496384
Yields:
497385
all variants of the given dataset.
498386
""" # fmt: skip
499-
dataset_dir = epath.Path(dataset_dir)
500-
references = {}
501-
for reference in _find_references_with_glob(
502-
folder=dataset_dir,
503-
is_data_dir=False,
504-
is_dataset_dir=True,
505-
namespace=namespace,
506-
include_old_tfds_version=include_old_tfds_version,
507-
glob_suffixes=glob_suffixes,
387+
data_dir = dataset_dir.parent
388+
dataset_name = dataset_dir.name
389+
globs = [
390+
'*/*/*.json', # with nested config directory
391+
'*/*.json', # without nested config directory
392+
]
393+
394+
# Check files matching the globs and are files we are interested in.
395+
matched_files_by_variant_dir = collections.defaultdict(set)
396+
for file in _find_files_with_glob(
397+
dataset_dir,
398+
globs=globs,
399+
file_names=_INFO_FILE_NAMES,
508400
):
509-
if include_versions:
510-
key = f'{reference.dataset_name}/{reference.config}:{reference.version}'
511-
else:
512-
key = f'{reference.dataset_name}/{reference.config}'
513-
reference = reference.replace(version=None)
514-
references[key] = reference
401+
matched_files_by_variant_dir[file.parent].add(file.name)
402+
403+
for variant_dir, matched_files in matched_files_by_variant_dir.items():
404+
if constants.DATASET_INFO_FILENAME not in matched_files:
405+
logging.warning(
406+
'Ignoring variant folder %s, which has no %s',
407+
variant_dir,
408+
constants.DATASET_INFO_FILENAME,
409+
)
410+
continue
411+
412+
if (
413+
not include_old_tfds_version
414+
and constants.FEATURES_FILENAME not in matched_files
415+
):
416+
logging.info(
417+
'Ignoring variant folder %s, which has no %s',
418+
variant_dir,
419+
constants.FEATURES_FILENAME,
420+
)
421+
continue
515422

516-
for reference in references.values():
517-
yield reference
423+
version = variant_dir.name
424+
if not version_lib.Version.is_valid(version):
425+
logging.warning(
426+
'Ignoring variant folder %s, which has invalid version %s',
427+
variant_dir,
428+
version,
429+
)
430+
continue
431+
432+
config_dir = variant_dir.parent
433+
config = config_dir.name if config_dir != dataset_dir else None
434+
435+
yield naming.DatasetReference(
436+
namespace=namespace,
437+
data_dir=data_dir,
438+
dataset_name=dataset_name,
439+
config=config,
440+
version=version,
441+
info_filenames=matched_files,
442+
)
518443

519444

520445
def list_datasets_in_data_dir(
@@ -547,22 +472,27 @@ def list_datasets_in_data_dir(
547472
for dataset_dir in epath.Path(data_dir).iterdir():
548473
if not dataset_dir.is_dir():
549474
continue
550-
if not naming.is_valid_dataset_name(dataset_dir.name):
475+
dataset_name = dataset_dir.name
476+
if not naming.is_valid_dataset_name(dataset_name):
477+
logging.warning('Invalid dataset name: %s', dataset_name)
551478
continue
552479
num_datasets += 1
553480
if include_configs:
554481
for variant in list_dataset_variants(
555482
dataset_dir=dataset_dir,
556483
namespace=namespace,
557-
include_versions=include_versions,
558484
include_old_tfds_version=include_old_tfds_version,
559485
):
560486
num_variants += 1
561-
yield variant
487+
if include_versions:
488+
yield variant
489+
else:
490+
yield variant.replace(version=None)
491+
break
562492
else:
563493
num_variants += 1
564494
yield naming.DatasetReference(
565-
dataset_name=dataset_dir.name, namespace=namespace, data_dir=data_dir
495+
dataset_name=dataset_name, namespace=namespace, data_dir=data_dir
566496
)
567497
logging.info(
568498
'Found %d datasets and %d variants in %s',

tensorflow_datasets/core/utils/file_utils_test.py

Lines changed: 1 addition & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -200,18 +200,13 @@ def test_list_dataset_variants_with_configs(mock_fs: testing.MockFs):
200200
constants.FEATURES_FILENAME,
201201
constants.DATASET_INFO_FILENAME,
202202
}
203-
glob_suffixes = [
204-
'json',
205-
]
206203
for config, versions in configs_and_versions.items():
207204
for version in versions:
208205
for info_filename in info_filenames:
209206
mock_fs.add_file(_DATASET_DIR / config / version / info_filename)
210207

211208
references = sorted(
212-
file_utils.list_dataset_variants(
213-
dataset_dir=_DATASET_DIR, glob_suffixes=glob_suffixes
214-
)
209+
file_utils.list_dataset_variants(dataset_dir=_DATASET_DIR)
215210
)
216211
assert references == [
217212
naming.DatasetReference(
@@ -238,43 +233,6 @@ def test_list_dataset_variants_with_configs(mock_fs: testing.MockFs):
238233
]
239234

240235

241-
def test_list_dataset_variants_with_configs_no_versions(
242-
mock_fs: testing.MockFs,
243-
):
244-
configs_and_versions = {
245-
'x': [_VERSION, '1.0.1'],
246-
'y': ['2.0.0'],
247-
}
248-
info_filenames = {
249-
constants.DATASET_INFO_FILENAME,
250-
constants.FEATURES_FILENAME,
251-
}
252-
for config, versions in configs_and_versions.items():
253-
for version in versions:
254-
for filename in info_filenames:
255-
mock_fs.add_file(_DATASET_DIR / config / version / filename)
256-
257-
references = sorted(
258-
file_utils.list_dataset_variants(
259-
dataset_dir=_DATASET_DIR, include_versions=False
260-
)
261-
)
262-
assert references == [
263-
naming.DatasetReference(
264-
dataset_name=_DATASET_NAME,
265-
config='x',
266-
data_dir=_DATA_DIR,
267-
info_filenames=info_filenames,
268-
),
269-
naming.DatasetReference(
270-
dataset_name=_DATASET_NAME,
271-
config='y',
272-
data_dir=_DATA_DIR,
273-
info_filenames=info_filenames,
274-
),
275-
]
276-
277-
278236
def test_list_dataset_variants_without_configs(mock_fs: testing.MockFs):
279237
# Version 1.0.0 doesn't have features.json, because it was generated with an
280238
# old version of TFDS.
@@ -286,7 +244,6 @@ def test_list_dataset_variants_without_configs(mock_fs: testing.MockFs):
286244
references = sorted(
287245
file_utils.list_dataset_variants(
288246
dataset_dir=_DATASET_DIR,
289-
include_versions=True,
290247
include_old_tfds_version=True,
291248
)
292249
)
@@ -312,7 +269,6 @@ def test_list_dataset_variants_without_configs(mock_fs: testing.MockFs):
312269
references = sorted(
313270
file_utils.list_dataset_variants(
314271
dataset_dir=_DATASET_DIR,
315-
include_versions=True,
316272
include_old_tfds_version=False,
317273
)
318274
)

tensorflow_datasets/scripts/cli/convert_format_utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -740,7 +740,6 @@ def convert_dataset_dir(
740740

741741
references = file_utils.list_dataset_variants(
742742
dataset_dir=dataset_dir,
743-
include_versions=True,
744743
include_old_tfds_version=True,
745744
)
746745
from_to_dirs = _create_from_to_dirs(

0 commit comments

Comments (0)