Commit 5de600f

Merge branch 'tensorflow:master' into master
2 parents (f3bdd12 + 30a1ad0), commit 5de600f

16 files changed: +307 -116 lines

docs/_index.yaml

Lines changed: 0 additions & 5 deletions
@@ -41,11 +41,6 @@ landing_page:
       {% dynamic endif %}
   - classname: devsite-landing-row-cards
     items:
-    - heading: "Explore datasets with Know Your Data"
-      image_path: /resources/images/kyd-screenshot.jpg
-      buttons:
-      - label: Go to Know Your Data
-        path: https://knowyourdata.withgoogle.com
    - heading: Introducing TensorFlow Datasets
      image_path: /resources/images/tf-logo-card-16x9.png
      path: https://blog.tensorflow.org/2019/02/introducing-tensorflow-datasets.html

tensorflow_datasets/core/dataset_builder.py

Lines changed: 90 additions & 51 deletions
@@ -29,37 +29,42 @@
 from typing import Any, ClassVar, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
 
 from absl import logging
-from etils import epath
-import importlib_resources
-from tensorflow_datasets.core import constants
-from tensorflow_datasets.core import dataset_info
-from tensorflow_datasets.core import dataset_metadata
-from tensorflow_datasets.core import decode
-from tensorflow_datasets.core import download
-from tensorflow_datasets.core import file_adapters
-from tensorflow_datasets.core import lazy_imports_lib
-from tensorflow_datasets.core import logging as tfds_logging
-from tensorflow_datasets.core import naming
-from tensorflow_datasets.core import reader as reader_lib
-from tensorflow_datasets.core import registered
-from tensorflow_datasets.core import split_builder as split_builder_lib
-from tensorflow_datasets.core import splits as splits_lib
-from tensorflow_datasets.core import tf_compat
-from tensorflow_datasets.core import units
-from tensorflow_datasets.core import utils
-from tensorflow_datasets.core import writer as writer_lib
-from tensorflow_datasets.core.data_sources import array_record
-from tensorflow_datasets.core.data_sources import parquet
-from tensorflow_datasets.core.proto import dataset_info_pb2
-from tensorflow_datasets.core.utils import file_utils
-from tensorflow_datasets.core.utils import gcs_utils
-from tensorflow_datasets.core.utils import read_config as read_config_lib
-from tensorflow_datasets.core.utils import type_utils
+from etils import epy
 from tensorflow_datasets.core.utils.lazy_imports_utils import apache_beam as beam
 from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf
 from tensorflow_datasets.core.utils.lazy_imports_utils import tree
-import termcolor
 
+with epy.lazy_imports():
+  # pylint: disable=g-import-not-at-top
+  from etils import epath
+  import importlib_resources
+  import termcolor
+
+  from tensorflow_datasets.core import constants
+  from tensorflow_datasets.core import dataset_info
+  from tensorflow_datasets.core import dataset_metadata
+  from tensorflow_datasets.core import decode
+  from tensorflow_datasets.core import download
+  from tensorflow_datasets.core import file_adapters
+  from tensorflow_datasets.core import lazy_imports_lib
+  from tensorflow_datasets.core import logging as tfds_logging
+  from tensorflow_datasets.core import naming
+  from tensorflow_datasets.core import reader as reader_lib
+  from tensorflow_datasets.core import registered
+  from tensorflow_datasets.core import split_builder as split_builder_lib
+  from tensorflow_datasets.core import splits as splits_lib
+  from tensorflow_datasets.core import tf_compat
+  from tensorflow_datasets.core import units
+  from tensorflow_datasets.core import utils
+  from tensorflow_datasets.core import writer as writer_lib
+  from tensorflow_datasets.core.data_sources import array_record
+  from tensorflow_datasets.core.data_sources import parquet
+  from tensorflow_datasets.core.proto import dataset_info_pb2
+  from tensorflow_datasets.core.utils import file_utils
+  from tensorflow_datasets.core.utils import gcs_utils
+  from tensorflow_datasets.core.utils import read_config as read_config_lib
+  from tensorflow_datasets.core.utils import type_utils
+  # pylint: enable=g-import-not-at-top
 
 ListOrTreeOrElem = type_utils.ListOrTreeOrElem
 Tree = type_utils.Tree
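
The switch to `epy.lazy_imports()` defers the cost of importing the heavy `tensorflow_datasets.core` submodules until an attribute on them is first accessed. A minimal sketch of the pattern, assuming only that `etils` is installed (the `json` module here is just a stand-in for an expensive import):

from etils import epy

with epy.lazy_imports():
  # pylint: disable=g-import-not-at-top
  import json  # Registered lazily; the module body has not executed yet.
  # pylint: enable=g-import-not-at-top

# The real import happens here, on first attribute access.
print(json.dumps({'lazy': True}))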
@@ -726,6 +731,17 @@ def download_and_prepare(
 
     self._log_download_done()
 
+    # Execute post download and prepare hook if it exists.
+    self._post_download_and_prepare_hook()
+
+
+  def _post_download_and_prepare_hook(self) -> None:
+    """Hook to be executed after download and prepare.
+
+    Override this in custom dataset builders to execute custom logic after
+    download and prepare.
+    """
+    pass
 
   def _update_dataset_info(self) -> None:
     """Updates the `dataset_info.json` file in the dataset dir."""
@@ -767,33 +783,56 @@ def as_data_source(
     if split is None:
       split = {s: s for s in self.info.splits}
 
-    # Create a dataset for each of the given splits
-    def build_single_data_source(
-        split: str,
-    ) -> Sequence[Any]:
-      file_format = self.info.file_format
-      if file_format == file_adapters.FileFormat.ARRAY_RECORD:
-        return array_record.ArrayRecordDataSource(
-            self.info,
-            split=split,
-            decoders=decoders,
+    info = self.info
+
+    random_access_formats = file_adapters.FileFormat.with_random_access()
+    random_access_formats_msg = " or ".join(
+        [f.value for f in random_access_formats]
+    )
+    unsupported_format_msg = (
+        f"Random access data source for file format {info.file_format} is"
+        " not supported. Can you try to run download_and_prepare with"
+        f" file_format set to one of: {random_access_formats_msg}?"
+    )
+
+    if info.file_format is None and not info.alternative_file_formats:
+      raise ValueError(
+          "Dataset info file format is not set! For random access, one of the"
+          f" following formats is required: {random_access_formats_msg}"
+      )
+
+    if (
+        info.file_format is None
+        or info.file_format not in random_access_formats
+    ):
+      available_formats = set(info.alternative_file_formats)
+      suitable_formats = available_formats.intersection(random_access_formats)
+      if suitable_formats:
+        chosen_format = suitable_formats.pop()
+        logging.info(
+            "Found random access formats: %s. Chose to use %s. Overriding file"
+            " format in the dataset info.",
+            ", ".join([f.name for f in suitable_formats]),
+            chosen_format,
         )
-      elif file_format == file_adapters.FileFormat.PARQUET:
-        return parquet.ParquetDataSource(
-            self.info,
-            split=split,
-            decoders=decoders,
+        # Change the dataset info to read from a random access format.
+        info.set_file_format(
+            chosen_format, override=True, override_if_initialized=True
         )
       else:
-        args = [
-            f"`file_format='{file_format.value}'`"
-            for file_format in file_adapters.FileFormat.with_random_access()
-        ]
-        raise NotImplementedError(
-            f"Random access data source for file format {file_format} is not"
-            " supported. Can you try to run download_and_prepare with"
-            f" {' or '.join(args)}?"
-        )
+        raise NotImplementedError(unsupported_format_msg)
+
+    # Create a dataset for each of the given splits
+    def build_single_data_source(split: str) -> Sequence[Any]:
+      match info.file_format:
+        case file_adapters.FileFormat.ARRAY_RECORD:
+          return array_record.ArrayRecordDataSource(
+              info, split=split, decoders=decoders
+          )
+        case file_adapters.FileFormat.PARQUET:
+          return parquet.ParquetDataSource(info, split=split, decoders=decoders)
+        case _:
+          raise NotImplementedError(unsupported_format_msg)
 
     all_ds = tree.map_structure(build_single_data_source, split)
     return all_ds
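
After this refactoring, `as_data_source` first normalizes the file format, falling back to a random-access alternative recorded in the dataset info, then dispatches on it with a single `match`. A hedged usage sketch, assuming a TFDS setup where the illustrative 'mnist' dataset can be prepared in ARRAY_RECORD format:

import tensorflow_datasets as tfds

builder = tfds.builder('mnist', file_format='array_record')
builder.download_and_prepare()
# Random access: records are fetched by index instead of streamed.
train = builder.as_data_source(split='train')
print(len(train), train[0])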

tensorflow_datasets/core/dataset_builder_test.py

Lines changed: 26 additions & 0 deletions
@@ -578,6 +578,32 @@ def test_load_as_data_source(self):
     assert len(data_source) == 10
     assert data_source[0]["x"] == 28
 
+  def test_load_as_data_source_alternative_file_format(self):
+    data_dir = self.get_temp_dir()
+    builder = DummyDatasetWithConfigs(
+        data_dir=data_dir,
+        config="plus1",
+        file_format=file_adapters.FileFormat.ARRAY_RECORD,
+    )
+    builder.download_and_prepare()
+    # Change the default file format and add alternative file format.
+    builder.info.as_proto.file_format = "tfrecord"
+    builder.info.add_alternative_file_format("array_record")
+
+    data_source = builder.as_data_source()
+    assert isinstance(data_source, dict)
+    assert isinstance(data_source["train"], array_record.ArrayRecordDataSource)
+    assert isinstance(data_source["test"], array_record.ArrayRecordDataSource)
+    assert len(data_source["test"]) == 10
+    assert data_source["test"][0]["x"] == 28
+    assert len(data_source["train"]) == 20
+    assert data_source["train"][0]["x"] == 7
+
+    data_source = builder.as_data_source(split="test")
+    assert isinstance(data_source, array_record.ArrayRecordDataSource)
+    assert len(data_source) == 10
+    assert data_source[0]["x"] == 28
+
   @parameterized.named_parameters(
       *[
          {"file_format": file_format, "testcase_name": file_format.value}

tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py

Lines changed: 43 additions & 22 deletions
@@ -42,14 +42,14 @@
 from tensorflow_datasets.core import example_serializer
 from tensorflow_datasets.core import features as feature_lib
 from tensorflow_datasets.core import file_adapters
-from tensorflow_datasets.core import lazy_imports_lib
 from tensorflow_datasets.core import split_builder as split_builder_lib
 from tensorflow_datasets.core import splits as splits_lib
 from tensorflow_datasets.core.utils import huggingface_utils
 from tensorflow_datasets.core.utils import shard_utils
 from tensorflow_datasets.core.utils import tqdm_utils
 from tensorflow_datasets.core.utils import version as version_lib
 from tensorflow_datasets.core.utils.lazy_imports_utils import datasets as hf_datasets
+from tensorflow_datasets.core.utils.lazy_imports_utils import huggingface_hub
 
 
 def _extract_supervised_keys(hf_info):
@@ -198,6 +198,7 @@ def __init__(
       hf_num_proc: Optional[int] = None,
       tfds_num_proc: Optional[int] = None,
       ignore_hf_errors: bool = False,
+      overwrite_version: str | None = None,
      **config_kwargs,
  ):
    self._hf_repo_id = hf_repo_id
@@ -216,23 +217,28 @@
         f' hf_repo_id={self._hf_repo_id}, hf_config={self._hf_config},'
         f' config_kwargs={self.config_kwargs}'
     ) from e
-    version = str(self._hf_info.version or self._hf_builder.VERSION or '1.0.0')
+    version = str(
+        overwrite_version
+        or self._hf_info.version
+        or self._hf_builder.VERSION
+        or '1.0.0'
+    )
     self.VERSION = version_lib.Version(version)  # pylint: disable=invalid-name
-    if self._hf_config:
-      self._converted_builder_config = dataset_builder.BuilderConfig(
-          name=tfds_config,
-          version=self.VERSION,
-          description=self._hf_info.description,
-      )
-    else:
-      self._converted_builder_config = None
     self.name = huggingface_utils.convert_hf_name(hf_repo_id)
     self._hf_hub_token = hf_hub_token
     self._hf_num_proc = hf_num_proc
     self._tfds_num_proc = tfds_num_proc
     self._verification_mode = (
         'no_checks' if ignore_verifications else 'all_checks'
     )
+    if self._hf_config:
+      self._converted_builder_config = dataset_builder.BuilderConfig(
+          name=tfds_config,
+          version=self.VERSION,
+          description=self._get_text_field('description'),
+      )
+    else:
+      self._converted_builder_config = None
     super().__init__(
         file_format=file_format, config=tfds_config, data_dir=data_dir
     )
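
`overwrite_version` now takes precedence over both the version advertised by the Hugging Face repo and the builder's own `VERSION`, which is useful when regenerating a dataset under a new TFDS version number. A hedged sketch (the repo id and version string are illustrative, not from this commit):

from tensorflow_datasets.core.dataset_builders import huggingface_dataset_builder

builder = huggingface_dataset_builder.HuggingfaceDatasetBuilder(
    hf_repo_id='username/my_dataset',  # Hypothetical repo.
    overwrite_version='2.0.0',
)
assert str(builder.version) == '2.0.0'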
@@ -260,8 +266,16 @@ def _hf_download_and_prepare(self):
 
   @property
   def _hf_info(self) -> hf_datasets.DatasetInfo:
+    """Retrieves the dataset info from the HuggingFace Datasets."""
     return self._hf_builder.info
 
+  @functools.cached_property
+  def _hf_hub_info(self) -> huggingface_hub.hf_api.DatasetInfo:
+    """Retrieves the dataset info from the HuggingFace Hub and caches it."""
+    return huggingface_hub.dataset_info(
+        self._hf_repo_id, token=self._hf_hub_token
+    )
+
   def _hf_features(self) -> hf_datasets.Features:
     if not self._hf_info.features:
       # We need to download and prepare the data to know its features.
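
`functools.cached_property` runs the Hub request once per builder instance and memoizes the result, so the repeated `_hf_hub_info` reads in `_get_license` and `_get_text_field` cost a single network call. A minimal, self-contained sketch of the behavior:

import functools

class HubClient:
  calls = 0

  @functools.cached_property
  def info(self) -> str:
    # The body executes only once; later accesses return the cached value.
    HubClient.calls += 1
    return 'dataset-info'

client = HubClient()
assert client.info == client.info
assert HubClient.calls == 1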
@@ -272,9 +286,9 @@ def _hf_features(self) -> hf_datasets.Features:
   def _info(self) -> dataset_info_lib.DatasetInfo:
     return dataset_info_lib.DatasetInfo(
         builder=self,
-        description=self._hf_info.description,
+        description=self._get_text_field('description'),
         features=huggingface_utils.convert_hf_features(self._hf_features()),
-        citation=self._hf_info.citation,
+        citation=self._get_text_field('citation'),
         license=self._get_license(),
         supervised_keys=_extract_supervised_keys(self._hf_info),
     )
@@ -411,24 +425,32 @@ def _write_shards(
 
   def _get_license(self) -> str | None:
     """Implements heuristics to get the license from HuggingFace."""
-    # First heuristic: check the DatasetInfo from Hugging Face datasets.
-    if self._hf_info.license:
-      return self._hf_info.license
-    huggingface_hub = lazy_imports_lib.lazy_imports.huggingface_hub
-    # Retrieve the dataset info from the HuggingFace Hub.
-    repo_id, token = self._hf_repo_id, self._hf_hub_token
-    dataset_info = huggingface_hub.dataset_info(repo_id, token=token)
-    # Second heuristic: check the card data.
+    # Heuristic #1: check the DatasetInfo from Hugging Face Hub/Datasets.
+    if info_license := self._get_text_field('license'):
+      return info_license
+    dataset_info = self._hf_hub_info
+    # Heuristic #2: check the card data.
     if dataset_info.card_data:
       if card_data_license := dataset_info.card_data.get('license'):
         return card_data_license
-    # Third heuristic: check the tags.
+    # Heuristic #3: check the tags.
     if dataset_info.tags:
       for tag in dataset_info.tags:
         if tag.startswith('license:'):
           return tag.removeprefix('license:')
     return None
 
+  def _get_text_field(self, field: str) -> str | None:
+    """Get the field from either HF Hub or HF Datasets."""
+    # The information retrieved from the Hub has priority over the one in the
+    # builder, because the Hub is allegedly the new source of truth.
+    for dataset_info in [self._hf_hub_info, self._hf_info]:
+      # `description` and `citation` are not official fields in the Hugging
+      # Face Hub API but they're still exposed in its __dict__.
+      if value := getattr(dataset_info, field, None):
+        return value
+    return None
+
 
 def builder(
     name: str, config: Optional[str] = None, **builder_kwargs
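
`_get_text_field` is a priority chain: the first metadata object exposing a truthy attribute of the requested name wins, and `getattr` with a `None` default absorbs objects that lack the field entirely. A standalone sketch of the pattern, with illustrative stand-in classes rather than the real HF metadata types:

import dataclasses

@dataclasses.dataclass
class HubInfo:  # Stand-in for the Hub metadata.
  citation: str = 'citation from the hub'

@dataclasses.dataclass
class LocalInfo:  # Stand-in for the local datasets metadata.
  citation: str = 'local citation'
  license: str = 'apache-2.0'

def get_text_field(field: str) -> str | None:
  # Earlier sources win; missing or empty fields fall through.
  for info in (HubInfo(), LocalInfo()):
    if value := getattr(info, field, None):
      return value
  return None

assert get_text_field('citation') == 'citation from the hub'
assert get_text_field('license') == 'apache-2.0'
assert get_text_field('homepage') is None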
@@ -443,5 +465,4 @@ def login_to_hf(hf_hub_token: Optional[str] = None):
   """Logs in to Hugging Face Hub with the token as arg or env variable."""
   hf_hub_token = hf_hub_token or os.environ.get('HUGGING_FACE_HUB_TOKEN')
   if hf_hub_token is not None:
-    huggingface_hub = lazy_imports_lib.lazy_imports.huggingface_hub
     huggingface_hub.login(token=hf_hub_token)

tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py

Lines changed: 18 additions & 1 deletion
@@ -20,6 +20,7 @@
 import pytest
 from tensorflow_datasets.core import lazy_imports_lib
 from tensorflow_datasets.core.dataset_builders import huggingface_dataset_builder
+from tensorflow_datasets.core.utils.lazy_imports_utils import huggingface_hub
 
 PIL_Image = lazy_imports_lib.lazy_imports.PIL_Image
 
@@ -72,6 +73,22 @@ def mock_login_to_hf():
   yield login_to_hf
 
 
+@pytest.fixture(autouse=True)
+def mock_hub_dataset_info():
+  fake_dataset_info = huggingface_hub.hf_api.DatasetInfo(
+      id='foo/bar',
+      citation='citation from the hub',
+      private=False,
+      downloads=123,
+      likes=456,
+      tags=[],
+  )
+  with mock.patch.object(
+      huggingface_hub, 'dataset_info', return_value=fake_dataset_info
+  ) as dataset_info:
+    yield dataset_info
+
+
 @pytest.fixture(name='builder')
 def mock_huggingface_dataset_builder(
     tmp_path, load_dataset_builder, login_to_hf
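
Because the fixture above is declared with `autouse=True`, every test in the module sees `huggingface_hub.dataset_info` patched to return the fake `DatasetInfo`, without requesting the fixture by name. A minimal sketch of the mechanism (the fixture and env variable are illustrative):

import os
import pytest

@pytest.fixture(autouse=True)
def patched_env(monkeypatch):
  # Applied around every test in this module automatically.
  monkeypatch.setenv('TZ', 'UTC')
  yield

def test_sees_patched_env():
  assert os.environ['TZ'] == 'UTC'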
@@ -91,7 +108,7 @@ def mock_huggingface_dataset_builder(
   )
   login_to_hf.assert_called_once_with('SECRET_TOKEN')
   assert builder.info.description == 'description'
-  assert builder.info.citation == 'citation'
+  assert builder.info.citation == 'citation from the hub'
   assert builder.info.redistribution_info.license == 'test-license'
   yield builder
 
