Use a single unified function to convert Hugging Face names.

marcenacp · The TensorFlow Datasets Authors · commit 89440d4e5791 · 2024-03-14T10:38:49.000-07:00
PiperOrigin-RevId: 615827787
diff --git a/tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py b/tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py
@@ -107,7 +107,7 @@ def __init__(
     self._hf_repo_id = hf_repo_id
     self._hf_config = hf_config
     self.config_kwargs = config_kwargs
-    tfds_config = huggingface_utils.convert_hf_config_name(hf_config)
+    tfds_config = huggingface_utils.convert_hf_name(hf_config)
     try:
       self._hf_builder = hf_datasets.load_dataset_builder(
           self._hf_repo_id, self._hf_config, **self.config_kwargs
@@ -128,7 +128,7 @@ def __init__(
       )
     else:
       self._converted_builder_config = None
-    self.name = huggingface_utils.convert_hf_dataset_name(hf_repo_id)
+    self.name = huggingface_utils.convert_hf_name(hf_repo_id)
     self._hf_hub_token = hf_hub_token
     self._hf_num_proc = hf_num_proc
     self._tfds_num_proc = tfds_num_proc
@@ -189,7 +189,8 @@ def _split_generators(
     self._hf_download_and_prepare()
     ds = self._hf_builder.as_dataset(verification_mode=self._verification_mode)
     splits = {
-        split: self._generate_examples(data) for split, data in ds.items()
+        huggingface_utils.convert_hf_name(split): self._generate_examples(data)
+        for split, data in ds.items()
     }
     return _remove_empty_splits(splits)
 
diff --git a/tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py b/tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py
@@ -45,7 +45,7 @@ def _info(self):
     )
 
   def _split_generators(self, dl_manager):
-    return [hf_datasets.SplitGenerator(name=hf_datasets.Split.TRAIN)]
+    return [hf_datasets.SplitGenerator(name='train.clean')]
 
   def _generate_examples(self):
     for i in range(2):
@@ -101,7 +101,8 @@ def mock_huggingface_dataset_builder(
 def test_download_and_prepare(builder):
   builder.download_and_prepare()
   ds = builder.as_data_source()
-  assert list(ds['train']) == [{'feature': 0}, {'feature': 1}]
+  # Split names are sanitized, e.g. train.clean -> train_clean.
+  assert list(ds['train_clean']) == [{'feature': 0}, {'feature': 1}]
 
 
 def test_all_parameters_are_passed_down_to_hf(builder):
diff --git a/tensorflow_datasets/core/utils/huggingface_utils.py b/tensorflow_datasets/core/utils/huggingface_utils.py
@@ -17,7 +17,8 @@
 
 from collections.abc import Mapping, Sequence
 import datetime
-from typing import Any, Type
+import re
+from typing import Any, Type, TypeVar
 
 from etils import epath
 import immutabledict
@@ -39,6 +40,9 @@
     'string': np.object_,
 })
 _IMAGE_ENCODING_FORMAT = 'png'
+# Regular expression matching characters not valid in Python/TFDS names:
+_INVALID_TFDS_NAME_CHARACTER = re.compile(r'[^a-zA-Z0-9_]')
+_StrOrNone = TypeVar('_StrOrNone', str, None)
 
 
 def _convert_to_np_dtype(hf_dtype: str) -> Type[np.generic]:
@@ -229,29 +233,27 @@ def convert_hf_value(
   )
 
 
-def convert_hf_dataset_name(hf_dataset_name: str) -> str:
-  """Converts Huggingface dataset name to a TFDS compatible dataset name.
+def convert_hf_name(hf_name: _StrOrNone) -> _StrOrNone:
+  """Converts a Huggingface name to a TFDS compatible name.
 
-  Huggingface dataset names can contain characters that are not supported in
+  Huggingface names can contain characters that are not supported in
   TFDS. For example, in Huggingface a dataset name like `a/b` is supported,
   while in TFDS `b` would be parsed as the config.
 
   Examples:
-  - `hf_dataset_name='codeparrot/github-code'` becomes
-  `codeparrot__github_code`.
+  - `hf_name='codeparrot/github-code'` becomes `codeparrot__github_code`.
 
   Args:
-    hf_dataset_name: Huggingface dataset name.
+    hf_name: Huggingface name.
 
   Returns:
-    The TFDS compatible dataset name.
+    The TFDS compatible name (usable as a dataset name, config name or split
+    name).
   """
-  return (
-      hf_dataset_name.replace('-', '_')
-      .replace('.', '_')
-      .replace('/', '__')
-      .lower()
-  )
+  if hf_name is None:
+    return hf_name
+  hf_name = hf_name.lower().replace('/', '__')
+  return re.sub(_INVALID_TFDS_NAME_CHARACTER, '_', hf_name)
 
 
 def convert_tfds_dataset_name(tfds_dataset_name: str) -> str:
@@ -271,22 +273,8 @@ def convert_tfds_dataset_name(tfds_dataset_name: str) -> str:
     existing Huggingface dataset.
   """
   for hf_dataset_name in hf_datasets.list_datasets():
-    if convert_hf_dataset_name(hf_dataset_name) == tfds_dataset_name.lower():
+    if convert_hf_name(hf_dataset_name) == tfds_dataset_name.lower():
       return hf_dataset_name
   raise registered.DatasetNotFoundError(
       f'"{tfds_dataset_name}" is not listed in Huggingface datasets.'
   )
-
-
-def convert_hf_config_name(hf_config_name: str | None) -> str | None:
-  """Converts Huggingface config name to a TFDS compatible config name.
-
-  Args:
-    hf_config_name: Optional Huggingface config name.
-
-  Returns:
-    The TFDS compatible config name.
-  """
-  if hf_config_name is None:
-    return hf_config_name
-  return hf_config_name.lower().replace(',', '_')
diff --git a/tensorflow_datasets/core/utils/huggingface_utils_test.py b/tensorflow_datasets/core/utils/huggingface_utils_test.py
@@ -212,20 +212,22 @@ def test_convert_value(hf_value, feature, expected_value):
 
 
 @pytest.mark.parametrize(
-    'hf_dataset_name,tfds_dataset_name',
+    'hf_name,tfds_name',
     [
+        # Dataset names
         ('x', 'x'),
         ('X', 'x'),
         ('x-y', 'x_y'),
         ('x/y', 'x__y'),
+        ('x/Y-z', 'x__y_z'),
+        # Config and split names
+        ('x.y', 'x_y'),
         ('x_v1.0', 'x_v1_0'),
+        (None, None),
     ],
 )
-def test_from_hf_to_tfds(hf_dataset_name, tfds_dataset_name):
-  assert (
-      huggingface_utils.convert_hf_dataset_name(hf_dataset_name)
-      == tfds_dataset_name
-  )
+def test_from_hf_to_tfds(hf_name, tfds_name):
+  assert huggingface_utils.convert_hf_name(hf_name) == tfds_name
 
 
 @pytest.fixture(name='mock_list_datasets')
@@ -263,14 +265,3 @@ def test_convert_tfds_dataset_name(
       huggingface_utils.convert_tfds_dataset_name(tfds_dataset_name)
       == hf_dataset_name
   )
-
-
-@pytest.mark.parametrize(
-    'hf_config_name,tfds_config_name',
-    [(None, None), ('x', 'x'), ('X', 'x'), ('X,y', 'x_y')],
-)
-def test_convert_config_name(hf_config_name, tfds_config_name):
-  assert (
-      huggingface_utils.convert_hf_config_name(hf_config_name)
-      == tfds_config_name
-  )
diff --git a/tensorflow_datasets/scripts/documentation/build_community_catalog.py b/tensorflow_datasets/scripts/documentation/build_community_catalog.py
@@ -255,11 +255,11 @@ def format_template(
         config_name: str, info: dataset_info_pb2.DatasetInfo
     ) -> str:
       if self.namespace and self.namespace == 'huggingface':
-        tfds_id = huggingface_utils.convert_hf_dataset_name(self.tfds_id)
+        tfds_id = huggingface_utils.convert_hf_name(self.tfds_id)
       else:
         tfds_id = self.tfds_id
       if config_name != 'default':
-        config_name = huggingface_utils.convert_hf_config_name(config_name)
+        config_name = huggingface_utils.convert_hf_name(config_name)
         tfds_id = f'{tfds_id}/{config_name}'
       if keep_short:
         features = ''