@@ -343,113 +343,6 @@ def _find_files_with_glob(
343
343
yield from _find_files_without_glob (folder , globs , file_names )
344
344
345
345
346
def _find_references_with_glob(
    folder: epath.Path,
    is_data_dir: bool,
    is_dataset_dir: bool,
    namespace: str | None = None,
    include_old_tfds_version: bool = True,
    glob_suffixes: Sequence[str] = ('json',),
) -> Iterator[naming.DatasetReference]:
  """Yields a reference for every dataset variant found under `folder`.

  Args:
    folder: the folder to scan. Either a root data dir or the folder of a
      single dataset.
    is_data_dir: whether `folder` is a root TFDS data dir.
    is_dataset_dir: whether `folder` is the folder of one specific dataset.
    namespace: optional namespace set on every yielded reference.
    include_old_tfds_version: when False, folders without a features file are
      skipped (datasets generated with TFDS before 4.0.0).
    glob_suffixes: file suffixes appended to the directory globs. If empty, the
      bare directory globs are used. Defaults to json files.

  Yields:
    One `naming.DatasetReference` per valid version folder.

  Raises:
    ValueError: if `is_data_dir` and `is_dataset_dir` are both set or both
      unset, or if a matched folder is not nested two or three levels below a
      data dir.
  """
  if is_data_dir and is_dataset_dir:
    raise ValueError('Folder cannot be both a data dir and dataset dir!')
  if not (is_data_dir or is_dataset_dir):
    raise ValueError('Folder must be either a data dir or a dataset dir!')

  # A data dir has one more directory level (the dataset name) than a dataset
  # dir. In both cases the config level is optional, hence two depths each.
  if is_data_dir:
    data_dir, dataset_name = folder, None
    depth_patterns = ['*/*/*/*', '*/*/*']
  else:
    data_dir, dataset_name = folder.parent, folder.name
    depth_patterns = ['*/*/*', '*/*']

  if glob_suffixes:
    globs: list[str] = [
        f'{pattern}.{suffix}'
        for pattern in depth_patterns
        for suffix in glob_suffixes
    ]
  else:
    globs = list(depth_patterns)

  # Group the names of the matched info files by the folder containing them.
  info_files_by_folder = collections.defaultdict(set)
  for info_file in _find_files_with_glob(
      folder,
      globs=globs,
      file_names=_INFO_FILE_NAMES,
  ):
    info_files_by_folder[info_file.parent].add(info_file.name)

  for version_dir, found_names in info_files_by_folder.items():
    if constants.DATASET_INFO_FILENAME not in found_names:
      logging.warning(
          'Ignoring dataset folder %s, which has no dataset_info.json',
          os.fspath(version_dir),
      )
      continue

    lacks_features = constants.FEATURES_FILENAME not in found_names
    if lacks_features and not include_old_tfds_version:
      logging.info(
          'Ignoring dataset folder %s, which has no features.json',
          os.fspath(version_dir),
      )
      continue

    # The innermost folder name is the version string.
    version = version_dir.name
    if not version_lib.Version.is_valid(version):
      logging.warning(
          'Ignoring dataset folder %s, which has invalid version %s',
          os.fspath(version_dir),
          version,
      )
      continue

    parent_dir = version_dir.parent
    config = None
    if is_data_dir:
      grandparent_dir = parent_dir.parent
      if grandparent_dir == folder:
        # Layout: <data_dir>/<dataset>/<version>
        dataset_name = parent_dir.name
      elif grandparent_dir.parent == folder:
        # Layout: <data_dir>/<dataset>/<config>/<version>
        dataset_name = grandparent_dir.name
        config = parent_dir.name
      else:
        raise ValueError(
            f'Could not detect dataset and config from path {version_dir} in'
            f' {folder}'
        )
    elif parent_dir != folder:
      # Layout: <dataset_dir>/<config>/<version>
      config = parent_dir.name

    if not naming.is_valid_dataset_name(dataset_name):
      logging.warning('Invalid dataset name: %s', dataset_name)
      continue

    yield naming.DatasetReference(
        namespace=namespace,
        data_dir=data_dir,
        dataset_name=dataset_name,
        config=config,
        version=version,
        info_filenames=found_names,
    )
451
-
452
-
453
346
def list_dataset_versions (
454
347
dataset_config_dir : epath .PathLike ,
455
348
) -> list [version_lib .Version ]:
@@ -476,45 +369,77 @@ def list_dataset_versions(
476
369
477
370
478
371
def list_dataset_variants(
    dataset_dir: epath.PathLike,
    namespace: str | None = None,
    include_old_tfds_version: bool = False,
) -> Iterator[naming.DatasetReference]:
  """Yields all variants (config + version) found in `dataset_dir`.

  A variant is a version folder, optionally nested inside a config folder,
  that contains the dataset info file.

  Args:
    dataset_dir: the folder of the dataset. Strings and other path-like values
      are converted to a path object.
    namespace: optional namespace to which this data dir belongs.
    include_old_tfds_version: include datasets that have been generated with
      TFDS before 4.0.0.

  Yields:
    all variants of the given dataset.
  """  # fmt: skip
  # Normalize first so that callers passing a plain string keep working; the
  # code below needs `.parent` and `.name`.
  dataset_dir = epath.Path(dataset_dir)
  data_dir = dataset_dir.parent
  dataset_name = dataset_dir.name
  globs = [
      '*/*/*.json',  # with nested config directory
      '*/*.json',  # without nested config directory
  ]

  # Check files matching the globs and are files we are interested in.
  matched_files_by_variant_dir = collections.defaultdict(set)
  for file in _find_files_with_glob(
      dataset_dir,
      globs=globs,
      file_names=_INFO_FILE_NAMES,
  ):
    matched_files_by_variant_dir[file.parent].add(file.name)

  for variant_dir, matched_files in matched_files_by_variant_dir.items():
    if constants.DATASET_INFO_FILENAME not in matched_files:
      logging.warning(
          'Ignoring variant folder %s, which has no %s',
          variant_dir,
          constants.DATASET_INFO_FILENAME,
      )
      continue

    if (
        not include_old_tfds_version
        and constants.FEATURES_FILENAME not in matched_files
    ):
      logging.info(
          'Ignoring variant folder %s, which has no %s',
          variant_dir,
          constants.FEATURES_FILENAME,
      )
      continue

    # The innermost folder name is the version string.
    version = variant_dir.name
    if not version_lib.Version.is_valid(version):
      logging.warning(
          'Ignoring variant folder %s, which has invalid version %s',
          variant_dir,
          version,
      )
      continue

    # If the version folder sits directly in the dataset folder, there is no
    # config level.
    config_dir = variant_dir.parent
    config = config_dir.name if config_dir != dataset_dir else None

    yield naming.DatasetReference(
        namespace=namespace,
        data_dir=data_dir,
        dataset_name=dataset_name,
        config=config,
        version=version,
        info_filenames=matched_files,
    )
518
443
519
444
520
445
def list_datasets_in_data_dir (
@@ -547,22 +472,27 @@ def list_datasets_in_data_dir(
547
472
for dataset_dir in epath .Path (data_dir ).iterdir ():
548
473
if not dataset_dir .is_dir ():
549
474
continue
550
- if not naming .is_valid_dataset_name (dataset_dir .name ):
475
+ dataset_name = dataset_dir .name
476
+ if not naming .is_valid_dataset_name (dataset_name ):
477
+ logging .warning ('Invalid dataset name: %s' , dataset_name )
551
478
continue
552
479
num_datasets += 1
553
480
if include_configs :
554
481
for variant in list_dataset_variants (
555
482
dataset_dir = dataset_dir ,
556
483
namespace = namespace ,
557
- include_versions = include_versions ,
558
484
include_old_tfds_version = include_old_tfds_version ,
559
485
):
560
486
num_variants += 1
561
- yield variant
487
+ if include_versions :
488
+ yield variant
489
+ else :
490
+ yield variant .replace (version = None )
491
+ break
562
492
else :
563
493
num_variants += 1
564
494
yield naming .DatasetReference (
565
- dataset_name = dataset_dir . name , namespace = namespace , data_dir = data_dir
495
+ dataset_name = dataset_name , namespace = namespace , data_dir = data_dir
566
496
)
567
497
logging .info (
568
498
'Found %d datasets and %d variants in %s' ,
0 commit comments