Skip to content

Commit 28b2e91

Browse files
fineguy authored and The TensorFlow Datasets Authors committed
Fix DatasetBuilder.is_prepared()
PiperOrigin-RevId: 708238521
1 parent 0940a54 commit 28b2e91

File tree

2 files changed

+65
-26
lines changed

2 files changed

+65
-26
lines changed

tensorflow_datasets/core/dataset_builder.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,10 @@ def get_file_spec(self, split: str) -> str | None:
570570

571571
def is_prepared(self) -> bool:
572572
"""Returns whether this dataset is already downloaded and prepared."""
573-
return self.data_path.exists()
573+
return file_utils.is_valid_variant_dir(
574+
variant_dir=self.data_path,
575+
include_old_tfds_version=True,
576+
)
574577

575578
def is_blocked(self) -> utils.IsBlocked:
576579
"""Returns whether this builder (version, config) is blocked."""

tensorflow_datasets/core/utils/file_utils.py

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,61 @@ def list_dataset_versions(
368368
return sorted(found_versions)
369369

370370

371+
def is_valid_variant_dir(
    variant_dir: Path,
    matched_files: set[str] | None = None,
    include_old_tfds_version: bool = False,
) -> bool:
  """Checks whether `variant_dir` looks like a usable dataset variant folder.

  The checks run in this order and stop at the first failure, logging a
  warning that names the offending folder:
    1. The folder name must be a valid version string.
    2. `constants.DATASET_INFO_FILENAME` must be among the folder's files.
    3. `constants.FEATURES_FILENAME` must be among the folder's files, unless
       `include_old_tfds_version` is true, in which case this check is skipped.

  Args:
    variant_dir: The variant directory to check. Its last path component is
      interpreted as the version.
    matched_files: Names of the files already known to be in `variant_dir`.
      If None, the names of all `*.json` files found directly in the
      directory are used instead.
    include_old_tfds_version: Whether to also accept datasets generated with
      TFDS before 4.0.0, i.e. folders without the features file.

  Returns:
    True if every applicable check passes, False otherwise.
  """
  folder_name = variant_dir.name
  if not version_lib.Version.is_valid(folder_name):
    logging.warning(
        'Variant folder %s has invalid version %s',
        variant_dir,
        folder_name,
    )
    return False

  # Only list the directory when the caller did not supply the file names.
  json_names = (
      {json_path.name for json_path in variant_dir.glob('*.json')}
      if matched_files is None
      else matched_files
  )

  # The dataset info file is always mandatory; the features file is only
  # mandatory when old-style (pre-4.0.0) folders are not accepted.
  required_files = [constants.DATASET_INFO_FILENAME]
  if not include_old_tfds_version:
    required_files.append(constants.FEATURES_FILENAME)

  for required_file in required_files:
    if required_file not in json_names:
      logging.warning(
          'Variant folder %s has no %s',
          variant_dir,
          required_file,
      )
      return False

  return True
424+
425+
371426
def list_dataset_variants(
372427
dataset_dir: Path,
373428
namespace: str | None = None,
@@ -401,36 +456,17 @@ def list_dataset_variants(
401456
matched_files_by_variant_dir[file.parent].add(file.name)
402457

403458
for variant_dir, matched_files in matched_files_by_variant_dir.items():
404-
if constants.DATASET_INFO_FILENAME not in matched_files:
405-
logging.warning(
406-
'Ignoring variant folder %s, which has no %s',
407-
variant_dir,
408-
constants.DATASET_INFO_FILENAME,
409-
)
410-
continue
411-
412-
if (
413-
not include_old_tfds_version
414-
and constants.FEATURES_FILENAME not in matched_files
459+
if not is_valid_variant_dir(
460+
variant_dir=variant_dir,
461+
matched_files=matched_files,
462+
include_old_tfds_version=include_old_tfds_version,
415463
):
416-
logging.info(
417-
'Ignoring variant folder %s, which has no %s',
418-
variant_dir,
419-
constants.FEATURES_FILENAME,
420-
)
421-
continue
422-
423-
version = variant_dir.name
424-
if not version_lib.Version.is_valid(version):
425-
logging.warning(
426-
'Ignoring variant folder %s, which has invalid version %s',
427-
variant_dir,
428-
version,
429-
)
464+
logging.warning('Skipping invalid variant directory: %s', variant_dir)
430465
continue
431466

432467
config_dir = variant_dir.parent
433468
config = config_dir.name if config_dir != dataset_dir else None
469+
version = variant_dir.name
434470

435471
yield naming.DatasetReference(
436472
namespace=namespace,

0 commit comments

Comments
 (0)