Commit 27e89c3

Author: The TensorFlow Datasets Authors
Commit message: Internal change
PiperOrigin-RevId: 640120829
Parent: 953b351

6 files changed: 126 additions, 27 deletions

tensorflow_datasets/core/dataset_info.py

Lines changed: 20 additions & 4 deletions
@@ -336,10 +336,7 @@ def as_proto(self) -> dataset_info_pb2.DatasetInfo:

   @property
   def as_proto_with_features(self) -> dataset_info_pb2.DatasetInfo:
-    info_proto = dataset_info_pb2.DatasetInfo()
-    info_proto.CopyFrom(self._info_proto)
-    info_proto.features.CopyFrom(self.features.to_proto())  # pytype: disable=attribute-error  # always-use-property-annotation
-    return info_proto
+    return update_info_proto_with_features(self._info_proto, self.features)

   @property
   def name(self) -> str:

@@ -1196,6 +1193,25 @@ def _create_redistribution_info_proto(
   return None


+def update_info_proto_with_features(
+    info_proto: dataset_info_pb2.DatasetInfo,
+    features: feature_lib.FeatureConnector,
+) -> dataset_info_pb2.DatasetInfo:
+  """Update the info proto with the given features, if any.
+
+  Args:
+    info_proto: the info proto to update.
+    features: the features to use.
+
+  Returns:
+    the updated info proto.
+  """
+  completed_info_proto = dataset_info_pb2.DatasetInfo()
+  completed_info_proto.CopyFrom(info_proto)
+  completed_info_proto.features.CopyFrom(features.to_proto())
+  return completed_info_proto
+
+
 class MetadataDict(Metadata, dict):
   """A `tfds.core.Metadata` object that acts as a `dict`.

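The refactor extracts the copy-then-fill logic into a reusable module-level helper that returns a copy rather than mutating its input. A minimal sketch of the intended call pattern, assuming an existing `DatasetInfo` object named `info` (the feature spec below is made up for illustration):

# Sketch only: derive a features-complete proto without mutating the original.
from tensorflow_datasets.core import dataset_info
from tensorflow_datasets.core import features as features_lib

new_features = features_lib.FeaturesDict({'text': features_lib.Text()})
completed = dataset_info.update_info_proto_with_features(
    info.as_proto,  # `info` is assumed to be an existing DatasetInfo
    new_features,
)
# `completed` is a copy: `info.as_proto` is left untouched, while
# `completed.features` now holds the serialized feature spec.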
tensorflow_datasets/core/dataset_info_test.py

Lines changed: 8 additions & 0 deletions
@@ -422,6 +422,14 @@ def test_set_file_format_override(self):
     info.set_file_format(file_adapters.FileFormat.RIEGELI, override=True)
     self.assertEqual(info.file_format, file_adapters.FileFormat.RIEGELI)

+  def test_update_info_proto_with_features(self):
+    info_proto = dataset_info.DatasetInfo(builder=self._builder).as_proto
+    new_features = features.FeaturesDict({"text": features.Text()})
+    new_info = dataset_info.update_info_proto_with_features(
+        info_proto, new_features
+    )
+    self.assertEqual(new_info.features, new_features.to_proto())
+

 @pytest.mark.parametrize(
     "file_format",

tensorflow_datasets/core/features/feature.py

Lines changed: 1 addition & 1 deletion
@@ -1068,7 +1068,7 @@ def load_metadata(
     will restore the feature metadata from the saved file.

     Args:
-      data_dir: path to the dataset folder to which save the info (ex:
+      data_dir: path to the dataset folder where the info is saved (ex:
         `~/datasets/cifar10/1.2.0/`)
       feature_name: the name of the feature (from the FeaturesDict key)
     """

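For context, `load_metadata` is the counterpart of `save_metadata`; a rough sketch of a call, with the path and feature name purely illustrative:

import tensorflow_datasets as tfds

# Sketch only: restore metadata for a single feature from a dataset folder.
feature = tfds.features.Image()
feature.load_metadata(
    data_dir='~/datasets/cifar10/1.2.0/',  # folder where the info is saved
    feature_name='image',  # the key of this feature in its FeaturesDict
)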
tensorflow_datasets/core/naming.py

Lines changed: 5 additions & 1 deletion
@@ -208,6 +208,9 @@ class DatasetReference:
       can be used. For example, if the collection uses the split `valid`, but
       this dataset uses the split `validation`, then the `split_mapping` should
       be `{'validation': 'valid'}`.
+    info_filenames: Filenames which are used to describe the dataset. They might
+      include, for example, `dataset_info.json`, `features.json`, etc. If None,
+      then it wasn't checked which info files exist on disk.
   """  # fmt: skip

   dataset_name: str

@@ -216,6 +219,7 @@ class DatasetReference:
   version: None | str | version_lib.Version = None
   data_dir: None | str | os.PathLike = None  # pylint: disable=g-bare-generic
   split_mapping: None | Mapping[str, str] = None
+  info_filenames: set[str] | None = None

   def __post_init__(self):
     if isinstance(self.version, version_lib.Version):

@@ -302,7 +306,7 @@ def from_tfds_name(


 def references_for(
-    name_to_tfds_name: Mapping[str, str]
+    name_to_tfds_name: Mapping[str, str],
 ) -> Mapping[str, DatasetReference]:
   """Constructs dataset references.


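To make the new field concrete, here is a hand-built reference (a sketch with illustrative values; in practice references come out of the discovery helpers in `file_utils` rather than being constructed by hand):

from tensorflow_datasets.core import naming

ref = naming.DatasetReference(
    dataset_name='my_ds',
    config='x',
    version='1.0.0',
    data_dir='/path/to/data_dir',
    info_filenames={'dataset_info.json', 'features.json'},
)
# info_filenames=None (the default) means the info files on disk were
# never checked, not that none exist.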
tensorflow_datasets/core/utils/file_utils.py

Lines changed: 30 additions & 7 deletions
@@ -177,15 +177,20 @@ def _find_files_without_glob(


 def _find_files_with_glob(
-    folder: epath.Path, globs: list[str], file_names: list[str]
+    folder: epath.Path,
+    globs: list[str],
+    file_names: list[str],
 ) -> Iterator[epath.Path]:
   """Finds files matching any of the given globs and given file names."""
   for glob in globs:
+    found_files = folder.glob(glob)
     try:
-      for file in folder.glob(glob):
+      for file in found_files:
         if file.name in file_names:
           yield file
-    except OSError:
+    except (
+        OSError,
+    ):
       # If permission was denied on any subfolder, then the glob fails. Manually
       # iterate through the subfolders instead to be more robust against this.
       yield from _find_files_without_glob(folder, globs, file_names)

@@ -197,6 +202,7 @@ def _find_references_with_glob(
     is_dataset_dir: bool,
     namespace: str | None = None,
     include_old_tfds_version: bool = True,
+    glob_suffixes: Sequence[str] = ('json',),
 ) -> Iterator[naming.DatasetReference]:
   """Yields all dataset references in the given folder.

@@ -208,6 +214,8 @@ def _find_references_with_glob(
     namespace: Optional namespace to which the found datasets belong to.
     include_old_tfds_version: include datasets that have been generated with
       TFDS before 4.0.0.
+    glob_suffixes: list of file suffixes to use to create the glob for
+      interesting TFDS files. Defaults to json files.

   Yields:
     all dataset references in the given folder.

@@ -220,16 +228,26 @@ def _find_references_with_glob(
   if is_data_dir:
     data_dir = folder
     dataset_name = None
-    globs = ['*/*/*/*.json', '*/*/*.json']
+    stars = ['*/*/*/*', '*/*/*']
   else:
     data_dir = folder.parent
     dataset_name = folder.name
-    globs = ['*/*/*.json', '*/*.json']
+    stars = ['*/*/*', '*/*']
+
+  globs = [f'{star}.{suffix}' for star in stars for suffix in glob_suffixes]  # pylint:disable=g-complex-comprehension

   # Check files matching the globs and are files we are interested in.
   matched_files_per_folder = collections.defaultdict(set)
-  file_names = [constants.FEATURES_FILENAME, constants.DATASET_INFO_FILENAME]
-  for file in _find_files_with_glob(folder, globs=globs, file_names=file_names):
+  file_names = [
+      constants.FEATURES_FILENAME,
+      constants.DATASET_INFO_FILENAME,
+  ]
+
+  for file in _find_files_with_glob(
+      folder,
+      globs=globs,
+      file_names=file_names,
+  ):
     matched_files_per_folder[file.parent].add(file.name)

   for data_folder, matched_files in matched_files_per_folder.items():

@@ -284,6 +302,7 @@ def _find_references_with_glob(
         dataset_name=dataset_name,
         config=config,
         version=version,
+        info_filenames=matched_files,
     )

@@ -292,6 +311,7 @@ def list_dataset_variants(
     namespace: str | None = None,
     include_versions: bool = True,
     include_old_tfds_version: bool = False,
+    glob_suffixes: Sequence[str] = ('json',),
 ) -> Iterator[naming.DatasetReference]:
   """Yields all variants (config + version) found in `dataset_dir`.

@@ -301,6 +321,8 @@ def list_dataset_variants(
     include_versions: whether to list what versions are available.
     include_old_tfds_version: include datasets that have been generated with
       TFDS before 4.0.0.
+    glob_suffixes: list of file suffixes to use to create the glob for
+      interesting TFDS files. Defaults to json files.

   Yields:
     all variants of the given dataset.

@@ -313,6 +335,7 @@ def list_dataset_variants(
       is_dataset_dir=True,
       namespace=namespace,
       include_old_tfds_version=include_old_tfds_version,
+      glob_suffixes=glob_suffixes,
   ):
     if include_versions:
       key = f'{reference.dataset_name}/{reference.config}:{reference.version}'

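Putting the new parameter to work, a hedged sketch of listing variants while setting `glob_suffixes` explicitly (the directory layout and path are assumptions):

from tensorflow_datasets.core.utils import file_utils

# Assumed layout: /data/my_ds/<config>/<version>/dataset_info.json
for reference in file_utils.list_dataset_variants(
    dataset_dir='/data/my_ds',
    glob_suffixes=('json',),  # the default; other suffixes widen the glob
):
  print(reference.version, reference.info_filenames)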
tensorflow_datasets/core/utils/file_utils_test.py

Lines changed: 62 additions & 14 deletions
@@ -41,21 +41,44 @@ def test_list_dataset_variants_with_configs(mock_fs: testing.MockFs):
       'x': ['1.0.0', '1.0.1'],
       'y': ['2.0.0'],
   }
+  info_filenames = {
+      'features.json',
+      'dataset_info.json',
+  }
+  glob_suffixes = [
+      'json',
+  ]
   for config, versions in configs_and_versions.items():
     for version in versions:
-      mock_fs.add_file(dataset_dir / config / version / 'dataset_info.json')
-      mock_fs.add_file(dataset_dir / config / version / 'features.json')
+      for info_filename in info_filenames:
+        mock_fs.add_file(dataset_dir / config / version / info_filename)

-  references = sorted(file_utils.list_dataset_variants(dataset_dir=dataset_dir))
+  references = sorted(
+      file_utils.list_dataset_variants(
+          dataset_dir=dataset_dir, glob_suffixes=glob_suffixes
+      )
+  )
   assert references == [
       naming.DatasetReference(
-          dataset_name='my_ds', config='x', version='1.0.0', data_dir=data_dir
+          dataset_name='my_ds',
+          config='x',
+          version='1.0.0',
+          data_dir=data_dir,
+          info_filenames=info_filenames,
       ),
       naming.DatasetReference(
-          dataset_name='my_ds', config='x', version='1.0.1', data_dir=data_dir
+          dataset_name='my_ds',
+          config='x',
+          version='1.0.1',
+          data_dir=data_dir,
+          info_filenames=info_filenames,
       ),
       naming.DatasetReference(
-          dataset_name='my_ds', config='y', version='2.0.0', data_dir=data_dir
+          dataset_name='my_ds',
+          config='y',
+          version='2.0.0',
+          data_dir=data_dir,
+          info_filenames=info_filenames,
       ),
   ]

@@ -69,10 +92,12 @@ def test_list_dataset_variants_with_configs_no_versions(
       'x': ['1.0.0', '1.0.1'],
       'y': ['2.0.0'],
   }
+  info_filenames = {'dataset_info.json', 'features.json'}
   for config, versions in configs_and_versions.items():
     for version in versions:
-      mock_fs.add_file(dataset_dir / config / version / 'dataset_info.json')
-      mock_fs.add_file(dataset_dir / config / version / 'features.json')
+      for filename in info_filenames:
+        mock_fs.add_file(dataset_dir / config / version / filename)
+        mock_fs.add_file(dataset_dir / config / version / filename)

   references = sorted(
       file_utils.list_dataset_variants(

@@ -81,10 +106,16 @@
   )
   assert references == [
       naming.DatasetReference(
-          dataset_name='my_ds', config='x', data_dir=data_dir
+          dataset_name='my_ds',
+          config='x',
+          data_dir=data_dir,
+          info_filenames=info_filenames,
       ),
       naming.DatasetReference(
-          dataset_name='my_ds', config='y', data_dir=data_dir
+          dataset_name='my_ds',
+          config='y',
+          data_dir=data_dir,
+          info_filenames=info_filenames,
       ),
   ]

@@ -108,10 +139,16 @@ def test_list_dataset_variants_without_configs(mock_fs: testing.MockFs):
   )
   assert references == [
       naming.DatasetReference(
-          dataset_name='my_ds', version='1.0.0', data_dir=data_dir
+          dataset_name='my_ds',
+          version='1.0.0',
+          data_dir=data_dir,
+          info_filenames={'dataset_info.json'},
       ),
       naming.DatasetReference(
-          dataset_name='my_ds', version='1.0.1', data_dir=data_dir
+          dataset_name='my_ds',
+          version='1.0.1',
+          data_dir=data_dir,
+          info_filenames={'dataset_info.json', 'features.json'},
       ),
   ]

@@ -125,7 +162,10 @@ def test_list_dataset_variants_without_configs(mock_fs: testing.MockFs):
   )
   assert references == [
       naming.DatasetReference(
-          dataset_name='my_ds', version='1.0.1', data_dir=data_dir
+          dataset_name='my_ds',
+          version='1.0.1',
+          data_dir=data_dir,
+          info_filenames={'dataset_info.json', 'features.json'},
       )
   ]

@@ -140,6 +180,7 @@ def test_list_datasets_in_data_dir(mock_fs: testing.MockFs):
   mock_fs.add_file(data_dir / 'ds1/config2/1.0.0/features.json')
   mock_fs.add_file(data_dir / 'ds2/1.0.0/dataset_info.json')
   mock_fs.add_file(data_dir / 'ds2/1.0.0/features.json')
+  info_filenames = {'dataset_info.json', 'features.json'}

   # The following are problematic and should thus be ignored.
   mock_fs.add_file(

@@ -164,21 +205,27 @@ def test_list_datasets_in_data_dir(mock_fs: testing.MockFs):
           config='config1',
           version='1.0.0',
           data_dir=data_dir,
+          info_filenames=info_filenames,
       ),
       naming.DatasetReference(
           dataset_name='ds1',
           config='config1',
           version='2.0.0',
           data_dir=data_dir,
+          info_filenames=info_filenames,
       ),
       naming.DatasetReference(
           dataset_name='ds1',
           config='config2',
           version='1.0.0',
           data_dir=data_dir,
+          info_filenames=info_filenames,
       ),
       naming.DatasetReference(
-          dataset_name='ds2', version='1.0.0', data_dir=data_dir
+          dataset_name='ds2',
+          version='1.0.0',
+          data_dir=data_dir,
+          info_filenames=info_filenames,
       ),
   ]

@@ -205,6 +252,7 @@ def test_list_datasets_in_data_dir_with_namespace(mock_fs: testing.MockFs):
           config='config1',
           version='1.0.0',
           data_dir=data_dir,
+          info_filenames={'dataset_info.json', 'features.json'},
       ),
   ]
