42
42
from tensorflow_datasets .core import example_serializer
43
43
from tensorflow_datasets .core import features as feature_lib
44
44
from tensorflow_datasets .core import file_adapters
45
- from tensorflow_datasets .core import lazy_imports_lib
46
45
from tensorflow_datasets .core import split_builder as split_builder_lib
47
46
from tensorflow_datasets .core import splits as splits_lib
48
47
from tensorflow_datasets .core .utils import huggingface_utils
49
48
from tensorflow_datasets .core .utils import shard_utils
50
49
from tensorflow_datasets .core .utils import tqdm_utils
51
50
from tensorflow_datasets .core .utils import version as version_lib
52
51
from tensorflow_datasets .core .utils .lazy_imports_utils import datasets as hf_datasets
52
+ from tensorflow_datasets .core .utils .lazy_imports_utils import huggingface_hub
53
53
54
54
55
55
def _extract_supervised_keys (hf_info ):
@@ -224,21 +224,21 @@ def __init__(
224
224
or '1.0.0'
225
225
)
226
226
self .VERSION = version_lib .Version (version ) # pylint: disable=invalid-name
227
- if self ._hf_config :
228
- self ._converted_builder_config = dataset_builder .BuilderConfig (
229
- name = tfds_config ,
230
- version = self .VERSION ,
231
- description = self ._hf_info .description ,
232
- )
233
- else :
234
- self ._converted_builder_config = None
235
227
self .name = huggingface_utils .convert_hf_name (hf_repo_id )
236
228
self ._hf_hub_token = hf_hub_token
237
229
self ._hf_num_proc = hf_num_proc
238
230
self ._tfds_num_proc = tfds_num_proc
239
231
self ._verification_mode = (
240
232
'no_checks' if ignore_verifications else 'all_checks'
241
233
)
234
+ if self ._hf_config :
235
+ self ._converted_builder_config = dataset_builder .BuilderConfig (
236
+ name = tfds_config ,
237
+ version = self .VERSION ,
238
+ description = self ._get_text_field ('description' ),
239
+ )
240
+ else :
241
+ self ._converted_builder_config = None
242
242
super ().__init__ (
243
243
file_format = file_format , config = tfds_config , data_dir = data_dir
244
244
)
@@ -266,8 +266,16 @@ def _hf_download_and_prepare(self):
266
266
267
267
@property
def _hf_info(self) -> hf_datasets.DatasetInfo:
  """Dataset info exposed by the underlying Hugging Face Datasets builder."""
  builder_info = self._hf_builder.info
  return builder_info
270
271
272
@functools.cached_property
def _hf_hub_info(self) -> huggingface_hub.hf_api.DatasetInfo:
  """Dataset info fetched from the Hugging Face Hub.

  Wrapped in `functools.cached_property`, so the Hub is queried at most once
  per builder instance; later accesses reuse the stored result.
  """
  repo_id = self._hf_repo_id
  hub_token = self._hf_hub_token
  return huggingface_hub.dataset_info(repo_id, token=hub_token)
278
+
271
279
def _hf_features (self ) -> hf_datasets .Features :
272
280
if not self ._hf_info .features :
273
281
# We need to download and prepare the data to know its features.
@@ -278,9 +286,9 @@ def _hf_features(self) -> hf_datasets.Features:
278
286
def _info(self) -> dataset_info_lib.DatasetInfo:
  """Builds the TFDS `DatasetInfo` from the Hugging Face metadata.

  Returns:
    A `DatasetInfo` whose description and citation are resolved through
    `self._get_text_field`, whose features are converted from
    `self._hf_features()`, whose license comes from `self._get_license()` and
    whose supervised keys are extracted from `self._hf_info`.
  """
  return dataset_info_lib.DatasetInfo(
      builder=self,
      # The field names must match the attribute names exactly: a stray
      # leading space would make the lookup miss and silently yield None.
      description=self._get_text_field('description'),
      features=huggingface_utils.convert_hf_features(self._hf_features()),
      citation=self._get_text_field('citation'),
      license=self._get_license(),
      supervised_keys=_extract_supervised_keys(self._hf_info),
  )
@@ -417,24 +425,32 @@ def _write_shards(
417
425
418
426
def _get_license (self ) -> str | None :
419
427
"""Implements heuristics to get the license from HuggingFace."""
420
- # First heuristic: check the DatasetInfo from Hugging Face datasets.
421
- if self ._hf_info .license :
422
- return self ._hf_info .license
423
- huggingface_hub = lazy_imports_lib .lazy_imports .huggingface_hub
424
- # Retrieve the dataset info from the HuggingFace Hub.
425
- repo_id , token = self ._hf_repo_id , self ._hf_hub_token
426
- dataset_info = huggingface_hub .dataset_info (repo_id , token = token )
427
- # Second heuristic: check the card data.
428
+ # Heuristic #1: check the DatasetInfo from Hugging Face Hub/Datasets.
429
+ if info_license := self ._get_text_field ('license' ):
430
+ return info_license
431
+ dataset_info = self ._hf_hub_info
432
+ # Heuristic #2: check the card data.
428
433
if dataset_info .card_data :
429
434
if card_data_license := dataset_info .card_data .get ('license' ):
430
435
return card_data_license
431
- # Third heuristic : check the tags.
436
+ # Heuristic #3 : check the tags.
432
437
if dataset_info .tags :
433
438
for tag in dataset_info .tags :
434
439
if tag .startswith ('license:' ):
435
440
return tag .removeprefix ('license:' )
436
441
return None
437
442
443
+ def _get_text_field (self , field : str ) -> str | None :
444
+ """Get the field from either HF Hub or HF Datasets."""
445
+ # The information retrieved from the Hub has priority over the one in the
446
+ # builder, because the Hub which is allegedly the new source of truth.
447
+ for dataset_info in [self ._hf_hub_info , self ._hf_info ]:
448
+ # `description` and `citation` are not official fields in the Hugging Face
449
+ # Hub API but they're still exposed in its __dict__.
450
+ if value := getattr (dataset_info , field , None ):
451
+ return value
452
+ return None
453
+
438
454
439
455
def builder (
440
456
name : str , config : Optional [str ] = None , ** builder_kwargs
@@ -449,5 +465,4 @@ def login_to_hf(hf_hub_token: Optional[str] = None):
449
465
"""Logs in to Hugging Face Hub with the token as arg or env variable."""
450
466
hf_hub_token = hf_hub_token or os .environ .get ('HUGGING_FACE_HUB_TOKEN' )
451
467
if hf_hub_token is not None :
452
- huggingface_hub = lazy_imports_lib .lazy_imports .huggingface_hub
453
468
huggingface_hub .login (token = hf_hub_token )
0 commit comments