Skip to content

Commit acdccd4

Browse files
tomvdw and The TensorFlow Datasets Authors
authored and committed
Add support for specifying how data should be deserialized in tfds.data_source
If you're interested in the raw bytes or the deserialized but not decoded examples, then you can set the deserialize_method parameter accordingly. PiperOrigin-RevId: 674218072
1 parent f62f596 commit acdccd4

File tree

8 files changed

+110
-17
lines changed

8 files changed

+110
-17
lines changed

tensorflow_datasets/core/data_sources/array_record.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ class ArrayRecordDataSource(base.BaseDataSource):
4747
decoders: Optional[type_utils.TreeDict[decode.partial_decode.DecoderArg]] = (
4848
None
4949
)
50+
deserialize_method: decode.DeserializeMethod = (
51+
decode.DeserializeMethod.DESERIALIZE_AND_DECODE
52+
)
5053
# In order to lazy load array_record, we don't load
5154
# `array_record_data_source.ArrayRecordDataSource` here.
5255
data_source: Any = dataclasses.field(init=False)

tensorflow_datasets/core/data_sources/base.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -68,18 +68,34 @@ class BaseDataSource(MappingView, Sequence):
6868
split: The split to load in the data source.
6969
decoders: Optional decoders for decoding.
7070
data_source: The underlying data source to initialize in the __post_init__.
71+
deserialize_method: How to deserialize the bytes that are read before
72+
returning.
7173
"""
7274

7375
dataset_info: dataset_info_lib.DatasetInfo
7476
split: splits_lib.Split | None = None
7577
decoders: type_utils.TreeDict[decode.partial_decode.DecoderArg] | None = None
7678
data_source: DataSource[Any] = dataclasses.field(init=False)
79+
deserialize_method: decode.DeserializeMethod = (
80+
decode.DeserializeMethod.DESERIALIZE_AND_DECODE
81+
)
82+
83+
def _deserialize(self, record: Any) -> Any:
84+
match self.deserialize_method:
85+
case decode.DeserializeMethod.RAW_BYTES:
86+
return record
87+
case decode.DeserializeMethod.DESERIALIZE_NO_DECODE:
88+
if file_format := self.dataset_info.file_format:
89+
return file_format.deserialize(record)
90+
raise ValueError('No file format set, cannot deserialize bytes!')
91+
case decode.DeserializeMethod.DESERIALIZE_AND_DECODE:
92+
if features := self.dataset_info.features:
93+
return features.deserialize_example_np(record, decoders=self.decoders) # pylint: disable=attribute-error
94+
raise ValueError('No features set, cannot decode example!')
7795

7896
def __getitem__(self, key: SupportsIndex) -> Any:
7997
record = self.data_source[key.__index__()]
80-
return self.dataset_info.features.deserialize_example_np(
81-
record, decoders=self.decoders
82-
)
98+
return self._deserialize(record)
8399

84100
def __getitems__(self, keys: Sequence[int]) -> Sequence[Any]:
85101
"""Retrieves items by batch.
@@ -98,17 +114,12 @@ def __getitems__(self, keys: Sequence[int]) -> Sequence[Any]:
98114
if not keys:
99115
return []
100116
records = self.data_source.__getitems__(keys)
101-
features = self.dataset_info.features
102117
if len(keys) != len(records):
103118
raise IndexError(
104-
f'Requested {len(keys)} records but got'
105-
f' {len(records)} records.'
119+
f'Requested {len(keys)} records but got {len(records)} records.'
106120
f'{keys=}, {records=}'
107121
)
108-
return [
109-
features.deserialize_example_np(record, decoders=self.decoders)
110-
for record in records
111-
]
122+
return [self._deserialize(record) for record in records]
112123

113124
def __repr__(self) -> str:
114125
decoders_repr = (

tensorflow_datasets/core/dataset_builder.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,7 @@ def as_data_source(
799799
split: Optional[Tree[splits_lib.SplitArg]] = None,
800800
*,
801801
decoders: Optional[TreeDict[decode.partial_decode.DecoderArg]] = None,
802+
deserialize_method: decode.DeserializeMethod = decode.DeserializeMethod.DESERIALIZE_AND_DECODE,
802803
) -> ListOrTreeOrElem[Sequence[Any]]:
803804
"""Constructs an `ArrayRecordDataSource`.
804805
@@ -812,6 +813,11 @@ def as_data_source(
812813
customized feature keys need to be present. See [the
813814
guide](https://github.com/tensorflow/datasets/blob/master/docs/decode.md)
814815
for more info.
816+
deserialize_method: Whether the read examples should be deserialized
817+
and/or decoded. If not specified, it'll deserialize the data and decode
818+
the features. Decoding is only supported if the examples are tf
819+
examples. Note that if the deserialize_method is other than
 820+
DESERIALIZE_AND_DECODE, then the `decoders` argument is ignored.
815821
816822
Returns:
817823
`Sequence` if `split`,
@@ -866,13 +872,27 @@ def as_data_source(
866872

867873
# Create a dataset for each of the given splits
868874
def build_single_data_source(split: str) -> Sequence[Any]:
875+
if info.file_format is None:
876+
raise ValueError(
877+
"Dataset info file format is not set! For random access, one of the"
878+
f" following formats is required: {random_access_formats_msg}"
879+
)
880+
869881
match info.file_format:
870882
case file_adapters.FileFormat.ARRAY_RECORD:
871883
return array_record.ArrayRecordDataSource(
872-
info, split=split, decoders=decoders
884+
info,
885+
split=split,
886+
decoders=decoders,
887+
deserialize_method=deserialize_method,
873888
)
874889
case file_adapters.FileFormat.PARQUET:
875-
return parquet.ParquetDataSource(info, split=split, decoders=decoders)
890+
return parquet.ParquetDataSource(
891+
info,
892+
split=split,
893+
decoders=decoders,
894+
deserialize_method=deserialize_method,
895+
)
876896
case _:
877897
raise NotImplementedError(unsupported_format_msg)
878898

tensorflow_datasets/core/decode/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,15 @@
1616
"""Decoder public API."""
1717

1818
from tensorflow_datasets.core.decode.base import Decoder
19+
from tensorflow_datasets.core.decode.base import DeserializeMethod
1920
from tensorflow_datasets.core.decode.base import make_decoder
2021
from tensorflow_datasets.core.decode.base import SkipDecoding
2122
from tensorflow_datasets.core.decode.partial_decode import PartialDecoding
2223

2324
__all__ = [
2425
'Decoder',
2526
'make_decoder',
27+
'DeserializeMethod',
2628
'PartialDecoding',
2729
'SkipDecoding',
2830
]

tensorflow_datasets/core/decode/base.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""Base decoders."""
1717

1818
import abc
19+
import enum
1920
import functools
2021

2122
from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf
@@ -210,3 +211,30 @@ def decorated(*args, **kwargs):
210211
return decorated
211212

212213
return decorator
214+
215+
216+
class DeserializeMethod(enum.Enum):
217+
"""How to deserialize the bytes that are read before returning.
218+
219+
When reading examples from a source (e.g., a file), we consider 2 phases in
220+
parsing the raw data:
221+
222+
1. Deserialize: deserializes raw bytes into an object. Typically it will be
223+
deserialized into a `tf.train.Example`.
224+
225+
2. Decode: A `tf.train.Example` might encode information (e.g., a bytes
226+
feature encodes an image or a int64 list encodes a tensor). The second
227+
phase decodes the encoded information.
228+
229+
DESERIALIZE_AND_DECODE: deserialize the raw bytes to tf example (if file
230+
format doesn't have a custom encoding) and then decode the features. Note
231+
that how and what is decoded can typically be overridden with `decoders`.
232+
DESERIALIZE_NO_DECODE: deserialize the raw bytes to tf example (if file format
 233+
doesn't have a custom encoding). If this deserialize method is used, then all
 234+
decoders are ignored.
 235+
RAW_BYTES: don't deserialize nor decode, but return the raw bytes.
236+
"""
237+
238+
DESERIALIZE_AND_DECODE = 'deserialize_and_decode'
239+
DESERIALIZE_NO_DECODE = 'deserialize_no_decode'
240+
RAW_BYTES = 'raw_bytes'

tensorflow_datasets/core/features/feature.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -765,7 +765,7 @@ def decode_example(self, tfexample_data):
765765
def decode_example_np(
766766
self, example_data: type_utils.NpArrayOrScalar
767767
) -> type_utils.NpArrayOrScalar | None:
768-
"""Encode the feature dict into NumPy-compatible input.
768+
"""Decode the example data into NumPy-compatible input.
769769
770770
Args:
771771
example_data: Value to convert to NumPy.

tensorflow_datasets/core/file_adapters.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,20 @@
2525
import re
2626
from typing import Any, ClassVar, Type, TypeVar
2727

28-
from etils import epath
29-
from tensorflow_datasets.core.utils import file_utils
30-
from tensorflow_datasets.core.utils import type_utils
28+
from etils import epy
3129
from tensorflow_datasets.core.utils.lazy_imports_utils import array_record_module
3230
from tensorflow_datasets.core.utils.lazy_imports_utils import parquet as pq
3331
from tensorflow_datasets.core.utils.lazy_imports_utils import pyarrow as pa
3432
from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf
3533

34+
with epy.lazy_imports():
35+
# pylint: disable=g-import-not-at-top
36+
from etils import epath
37+
from tensorflow_datasets.core.utils import file_utils
38+
from tensorflow_datasets.core.utils import type_utils
39+
40+
# pylint: enable=g-import-not-at-top
41+
3642
ExamplePositions = list[Any]
3743
T = TypeVar('T')
3844

@@ -52,6 +58,10 @@ class FileFormat(enum.Enum):
5258
def file_suffix(self) -> str:
5359
return ADAPTER_FOR_FORMAT[self].FILE_SUFFIX
5460

61+
def deserialize(self, raw_example: bytes) -> Any:
62+
"""Deserializes bytes into an object, but does not decode features."""
63+
return ADAPTER_FOR_FORMAT[self].deserialize(raw_example)
64+
5565
@classmethod
5666
def with_random_access(cls) -> set[FileFormat]:
5767
"""File formats with random access."""
@@ -146,6 +156,17 @@ def write_examples(
146156
"""
147157
raise NotImplementedError()
148158

159+
@classmethod
160+
def deserialize(cls, raw_example: bytes) -> Any:
161+
"""Returns the deserialized example, but does not decode features.
162+
163+
If custom serialization is used, override this method in the file adapter.
164+
165+
Args:
166+
raw_example: the bytes read from the source that should be deserialized.
167+
"""
168+
return tf.train.Example.FromString(raw_example)
169+
149170

150171
class TfRecordFileAdapter(FileAdapter):
151172
"""File adapter for TFRecord file format."""

tensorflow_datasets/core/load.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,7 @@ def data_source(
705705
data_dir: Union[None, str, os.PathLike] = None, # pylint: disable=g-bare-generic
706706
download: bool = True,
707707
decoders: Optional[TreeDict[decode.partial_decode.DecoderArg]] = None,
708+
deserialize_method: decode.DeserializeMethod = decode.DeserializeMethod.DESERIALIZE_AND_DECODE,
708709
builder_kwargs: Optional[Dict[str, Any]] = None,
709710
download_and_prepare_kwargs: Optional[Dict[str, Any]] = None,
710711
try_gcs: bool = False,
@@ -777,6 +778,11 @@ def data_source(
777778
customized feature keys need to be present. See [the
778779
guide](https://github.com/tensorflow/datasets/blob/master/docs/decode.md)
779780
for more info.
781+
deserialize_method: Whether the read examples should be deserialized and/or
782+
decoded. If not specified, it'll deserialize the data and decode the
783+
features. Decoding is only supported if the examples are tf examples.
784+
Note that if the deserialize_method is other than DESERIALIZE_AND_DECODE,
 785+
then the `decoders` argument is ignored.
780786
builder_kwargs: `dict` (optional), keyword arguments to be passed to the
781787
`tfds.core.DatasetBuilder` constructor. `data_dir` will be passed through
782788
by default.
@@ -807,7 +813,9 @@ def data_source(
807813
try_gcs,
808814
)
809815
_download_and_prepare_builder(dbuilder, download, download_and_prepare_kwargs)
810-
return dbuilder.as_data_source(split=split, decoders=decoders)
816+
return dbuilder.as_data_source(
817+
split=split, decoders=decoders, deserialize_method=deserialize_method
818+
)
811819

812820

813821
def _get_all_versions(

0 commit comments

Comments
 (0)