Add support for storing a dataset in multiple formats

tomvdw · The TensorFlow Datasets Authors · commit 58ae072a4db2 · 2024-06-21T01:39:17.000-07:00
Add alternative file formats to dataset info and make `tfds.data_source` pick a supported file format when the default one doesn't support random access.

PiperOrigin-RevId: 645307409
diff --git a/tensorflow_datasets/core/dataset_builder.py b/tensorflow_datasets/core/dataset_builder.py
@@ -783,33 +783,58 @@ def as_data_source(
     if split is None:
       split = {s: s for s in self.info.splits}
 
-    # Create a dataset for each of the given splits
-    def build_single_data_source(
-        split: str,
-    ) -> Sequence[Any]:
-      file_format = self.info.file_format
-      if file_format == file_adapters.FileFormat.ARRAY_RECORD:
-        return array_record.ArrayRecordDataSource(
-            self.info,
-            split=split,
-            decoders=decoders,
+    info = self.info
+
+    random_access_formats = file_adapters.FileFormat.with_random_access()
+    random_access_formats_msg = " or ".join(
+        [f.value for f in random_access_formats]
+    )
+    unsupported_format_msg = (
+        f"Random access data source for file format {info.file_format} is"
+        " not supported. Can you try to run download_and_prepare with"
+        f" file_format set to one of: {random_access_formats_msg}?"
+    )
+
+    if info.file_format is None and not info.alternative_file_formats:
+      raise ValueError(
+          "Dataset info file format is not set! For random access, one of the"
+          f" following formats is required: {random_access_formats_msg}"
+      )
+
+    if (
+        info.file_format is None
+        or info.file_format not in random_access_formats
+    ):
+      available_formats = set(info.alternative_file_formats)
+      suitable_formats = available_formats.intersection(random_access_formats)
+      if suitable_formats:
+        # Pick without mutating the set so the log lists every candidate, and
+        # pick by value so the choice does not depend on set iteration order.
+        chosen_format = min(suitable_formats, key=lambda f: f.value)
+        logging.info(
+            "Found random access formats: %s. Chose to use %s. Overriding file"
+            " format in the dataset info.",
+            ", ".join(sorted(f.name for f in suitable_formats)),
+            chosen_format,
         )
-      elif file_format == file_adapters.FileFormat.PARQUET:
-        return parquet.ParquetDataSource(
-            self.info,
-            split=split,
-            decoders=decoders,
+        # Change the dataset info to read from a random access format.
+        info.set_file_format(
+            chosen_format, override=True, override_if_initialized=True
         )
       else:
-        args = [
-            f"`file_format='{file_format.value}'`"
-            for file_format in file_adapters.FileFormat.with_random_access()
-        ]
-        raise NotImplementedError(
-            f"Random access data source for file format {file_format} is not"
-            " supported. Can you try to run download_and_prepare with"
-            f" {' or '.join(args)}?"
-        )
+        raise NotImplementedError(unsupported_format_msg)
+
+    # Builds the data source of a single split; mapped over `split` below.
+    def build_single_data_source(split: str) -> Sequence[Any]:
+      match info.file_format:
+        case file_adapters.FileFormat.ARRAY_RECORD:
+          return array_record.ArrayRecordDataSource(
+              info, split=split, decoders=decoders
+          )
+        case file_adapters.FileFormat.PARQUET:
+          return parquet.ParquetDataSource(info, split=split, decoders=decoders)
+        case _:
+          raise NotImplementedError(unsupported_format_msg)
 
     all_ds = tree.map_structure(build_single_data_source, split)
     return all_ds
diff --git a/tensorflow_datasets/core/dataset_builder_test.py b/tensorflow_datasets/core/dataset_builder_test.py
@@ -578,6 +578,32 @@ def test_load_as_data_source(self):
     assert len(data_source) == 10
     assert data_source[0]["x"] == 28
 
+  def test_load_as_data_source_alternative_file_format(self):
+    data_dir = self.get_temp_dir()
+    builder = DummyDatasetWithConfigs(
+        data_dir=data_dir,
+        config="plus1",
+        file_format=file_adapters.FileFormat.ARRAY_RECORD,
+    )
+    builder.download_and_prepare()
+    # Make tfrecord the default and array_record (as written) an alternative.
+    builder.info.as_proto.file_format = "tfrecord"
+    builder.info.add_alternative_file_format("array_record")
+
+    data_source = builder.as_data_source()
+    assert isinstance(data_source, dict)
+    assert isinstance(data_source["train"], array_record.ArrayRecordDataSource)
+    assert isinstance(data_source["test"], array_record.ArrayRecordDataSource)
+    assert len(data_source["test"]) == 10
+    assert data_source["test"][0]["x"] == 28
+    assert len(data_source["train"]) == 20
+    assert data_source["train"][0]["x"] == 7
+
+    data_source = builder.as_data_source(split="test")
+    assert isinstance(data_source, array_record.ArrayRecordDataSource)
+    assert len(data_source) == 10
+    assert data_source[0]["x"] == 28
+
   @parameterized.named_parameters(
       *[
           {"file_format": file_format, "testcase_name": file_format.value}
diff --git a/tensorflow_datasets/core/dataset_info.py b/tensorflow_datasets/core/dataset_info.py
@@ -33,7 +33,7 @@
 from __future__ import annotations
 
 import abc
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
 import dataclasses
 import json
 import os
@@ -194,6 +194,9 @@ def __init__(
       license: str | None = None,  # pylint: disable=redefined-builtin
       redistribution_info: Optional[dict[str, str]] = None,
       split_dict: Optional[splits_lib.SplitDict] = None,
+      alternative_file_formats: (
+          Sequence[str | file_adapters.FileFormat] | None
+      ) = None,
       # LINT.ThenChange(:setstate)
   ):
     # pyformat: disable
@@ -238,6 +241,8 @@ def __init__(
         subfield will automatically be written to a LICENSE file stored with the
         dataset.
       split_dict: information about the splits in this dataset.
+      alternative_file_formats: alternative file formats that are available for
+        this dataset.
     """
     # pyformat: enable
     self._builder_or_identity = builder
@@ -246,6 +251,13 @@ def __init__(
     else:
       self._identity = DatasetIdentity.from_builder(builder)
 
+    # Collected directly rather than via `add_alternative_file_format`, which
+    # also appends to the proto (`self.as_proto`) that is only created below.
+    self._alternative_file_formats: list[file_adapters.FileFormat] = [
+        file_adapters.FileFormat.from_value(f) if isinstance(f, str) else f
+        for f in alternative_file_formats or []
+    ]
+
     self._info_proto = dataset_info_pb2.DatasetInfo(
         name=self._identity.name,
         description=utils.dedent(description),
@@ -260,6 +272,9 @@ def __init__(
         redistribution_info=_create_redistribution_info_proto(
             license=license, redistribution_info=redistribution_info
         ),
+        alternative_file_formats=[
+            f.value for f in self._alternative_file_formats
+        ],
     )
 
     if homepage:
@@ -328,6 +343,7 @@ def from_proto(
             repeated_split_infos=proto.splits,
             filename_template=filename_template,
         ),
+        alternative_file_formats=proto.alternative_file_formats,
     )
 
   @property
@@ -415,6 +431,10 @@ def download_size(self, size):
   def features(self):
     return self._features
 
+  @property
+  def alternative_file_formats(self) -> Sequence[file_adapters.FileFormat]:
+    return self._alternative_file_formats  # The internal list, not a copy.
+
   @property
   def metadata(self) -> Metadata | None:
     return self._metadata
@@ -444,6 +464,7 @@ def set_file_format(
       self,
       file_format: None | str | file_adapters.FileFormat,
       override: bool = False,
+      override_if_initialized: bool = False,
   ) -> None:
     """Internal function to define the file format.
 
@@ -454,6 +475,8 @@ def set_file_format(
       file_format: The file format.
       override: Whether the file format should be overridden if it is already
         set.
+      override_if_initialized: Whether the file format should be overridden if
+        the DatasetInfo is already fully initialized.
 
     Raises:
       ValueError: if the file format was already set and the `override`
@@ -474,12 +497,39 @@ def set_file_format(
       raise ValueError(
           f"File format is already set to {self.file_format}. Got {file_format}"
       )
-    if override and self._fully_initialized:
+    if override and self._fully_initialized and not override_if_initialized:
       raise RuntimeError(
-          "Cannot override the file format "
-          "when the DatasetInfo is already fully initialized!"
+          "Cannot override the file format when the DatasetInfo is already"
+          " fully initialized!"
       )
     self._info_proto.file_format = file_format.value
+    if override_if_initialized:
+      # Point splits at the new format. NOTE: template-less splits are dropped.
+      updated_split_infos = []
+      for split_info in self.splits.values():
+        if split_info.filename_template is None:
+          continue
+        updated_split_info = split_info.replace(
+            filename_template=split_info.filename_template.replace(
+                filetype_suffix=file_format.value
+            )
+        )
+        updated_split_infos.append(updated_split_info)
+      self._splits = splits_lib.SplitDict(updated_split_infos)
+
+  def add_alternative_file_format(
+      self,
+      file_format: str | file_adapters.FileFormat,
+  ) -> None:
+    """Registers an extra file format; raises ValueError if already there."""
+    if isinstance(file_format, str):
+      file_format = file_adapters.FileFormat.from_value(file_format)
+    already_present = file_format in self.alternative_file_formats
+    if already_present:
+      msg = f"Alternative file format {file_format} is already present."
+      raise ValueError(msg)
+    self._alternative_file_formats.append(file_format)
+    self.as_proto.alternative_file_formats.append(file_format.value)
 
   @property
   def splits(self) -> splits_lib.SplitDict:
@@ -882,6 +932,7 @@ def __getstate__(self):
         "metadata": self.metadata,
         "license": self.redistribution_info.license,
         "split_dict": self.splits,
+        "alternative_file_formats": self.alternative_file_formats,
     }
   def __setstate__(self, state):
     # LINT.IfChange(setstate)
@@ -896,6 +947,7 @@ def __setstate__(self, state):
         metadata=state["metadata"],
         license=state["license"],
         split_dict=state["split_dict"],
+        alternative_file_formats=state["alternative_file_formats"],
     )
     # LINT.ThenChange(:dataset_info_args)
 
diff --git a/tensorflow_datasets/core/dataset_info_test.py b/tensorflow_datasets/core/dataset_info_test.py
@@ -422,6 +422,32 @@ def test_set_file_format_override(self):
     info.set_file_format(file_adapters.FileFormat.RIEGELI, override=True)
     self.assertEqual(info.file_format, file_adapters.FileFormat.RIEGELI)
 
+  def test_set_file_format_override_failes_when_fully_initialized(self):
+    info = dataset_info.DatasetInfo(builder=self._builder)
+    info.set_file_format(file_adapters.FileFormat.TFRECORD)
+    info._fully_initialized = True
+    self.assertEqual(info.file_format, file_adapters.FileFormat.TFRECORD)
+    # Without `override`, a conflicting format is rejected outright.
+    with pytest.raises(ValueError, match="File format is already set to"):
+      info.set_file_format(file_adapters.FileFormat.RIEGELI)
+    # `override=True` alone must not bypass the fully-initialized guard.
+    with pytest.raises(RuntimeError, match="Cannot override the file format"):
+      info.set_file_format(file_adapters.FileFormat.RIEGELI, override=True)
+    # Neither failed call may have changed the stored format.
+    self.assertEqual(info.file_format, file_adapters.FileFormat.TFRECORD)
+
+  def test_set_file_format_override_fully_initialized(self):
+    """Both override flags together replace the format after init."""
+    old_format = file_adapters.FileFormat.TFRECORD
+    new_format = file_adapters.FileFormat.RIEGELI
+    info = dataset_info.DatasetInfo(builder=self._builder)
+    info.set_file_format(old_format)
+    info._fully_initialized = True
+    self.assertEqual(info.file_format, old_format)
+    force = dict(override=True, override_if_initialized=True)
+    info.set_file_format(new_format, **force)
+    self.assertEqual(info.file_format, new_format)
+
   def test_update_info_proto_with_features(self):
     info_proto = dataset_info.DatasetInfo(builder=self._builder).as_proto
     new_features = features.FeaturesDict({"text": features.Text()})
diff --git a/tensorflow_datasets/core/proto/dataset_info.proto b/tensorflow_datasets/core/proto/dataset_info.proto
@@ -222,10 +222,16 @@ message DatasetInfo {
   // Specifies whether examples should be shuffled.
   bool disable_shuffling = 16;
 
-  // File format used.
-  // Use string to allow format extension without regenerating the proto.
+  // Default file format to use. Note that alternative file formats may be
+  // available too and that depending on how the dataset is loaded, the default
+  // file format may be ignored.
   string file_format = 17;
 
+  // Alternative file formats available for this dataset. Note that the number
+  // of shards and the number of examples per shard must be the same for all
+  // file formats.
+  repeated string alternative_file_formats = 22;
+
   // The data that was used to generate this dataset.
   repeated DataSourceAccess data_source_accesses = 20;
 
diff --git a/tensorflow_datasets/core/proto/dataset_info_generated_pb2.py b/tensorflow_datasets/core/proto/dataset_info_generated_pb2.py
@@ -67,7 +67,7 @@
     b' \x01(\t\x12\x10\n\x08\x64\x61ta_dir\x18\x04'
     b' \x01(\t\x12\x14\n\x0c\x64s_namespace\x18\x05'
     b' \x01(\t\x12\r\n\x05split\x18\x06'
-    b' \x01(\t"\xb4\x07\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01'
+    b' \x01(\t"\xd6\x07\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01'
     b' \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02'
     b' \x01(\t\x12\x0f\n\x07version\x18\t \x01(\t\x12I\n\rrelease_notes\x18\x12'
     b' \x03(\x0b\x32\x32.tensorflow_datasets.DatasetInfo.ReleaseNotesEntry\x12\x13\n\x0b\x63onfig_name\x18\r'
@@ -85,8 +85,9 @@
     b' \x01(\x0b\x32#.tensorflow_datasets.SupervisedKeys\x12\x44\n\x13redistribution_info\x18\x0b'
     b" \x01(\x0b\x32'.tensorflow_datasets.RedistributionInfo\x12\x13\n\x0bmodule_name\x18\x0f"
     b' \x01(\t\x12\x19\n\x11\x64isable_shuffling\x18\x10'
-    b' \x01(\x08\x12\x13\n\x0b\x66ile_format\x18\x11'
-    b' \x01(\t\x12\x43\n\x14\x64\x61ta_source_accesses\x18\x14'
+    b' \x01(\x08\x12\x13\n\x0b\x66ile_format\x18\x11 \x01(\t\x12'
+    b' \n\x18\x61lternative_file_formats\x18\x16'
+    b' \x03(\t\x12\x43\n\x14\x64\x61ta_source_accesses\x18\x14'
     b' \x03(\x0b\x32%.tensorflow_datasets.DataSourceAccess\x1a\x33\n\x11ReleaseNotesEntry\x12\x0b\n\x03key\x18\x01'
     b' \x01(\t\x12\r\n\x05value\x18\x02'
     b' \x01(\t:\x02\x38\x01\x1a\x38\n\x16\x44ownloadChecksumsEntry\x12\x0b\n\x03key\x18\x01'
@@ -145,9 +146,9 @@
   _TFDSDATASETREFERENCE._serialized_start = 1280
   _TFDSDATASETREFERENCE._serialized_end = 1404
   _DATASETINFO._serialized_start = 1407
-  _DATASETINFO._serialized_end = 2355
-  _DATASETINFO_RELEASENOTESENTRY._serialized_start = 2246
-  _DATASETINFO_RELEASENOTESENTRY._serialized_end = 2297
-  _DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_start = 2299
-  _DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_end = 2355
+  _DATASETINFO._serialized_end = 2389
+  _DATASETINFO_RELEASENOTESENTRY._serialized_start = 2280
+  _DATASETINFO_RELEASENOTESENTRY._serialized_end = 2331
+  _DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_start = 2333
+  _DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_end = 2389
 # @@protoc_insertion_point(module_scope)