Skip to content

Commit d5401a0

Browse files
marcenacp authored and The TensorFlow Datasets Authors committed
Heuristics to retrieve the license from Hugging Face.
PiperOrigin-RevId: 630328078
1 parent 751053f commit d5401a0

File tree

2 files changed

+23
-1
lines changed

2 files changed

+23
-1
lines changed

tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -273,7 +273,7 @@ def _info(self) -> dataset_info_lib.DatasetInfo:
273273
description=self._hf_info.description,
274274
features=huggingface_utils.convert_hf_features(self._hf_features()),
275275
citation=self._hf_info.citation,
276-
license=self._hf_info.license,
276+
license=self._get_license(),
277277
supervised_keys=_extract_supervised_keys(self._hf_info),
278278
)
279279

@@ -407,6 +407,24 @@ def _write_shards(
407407
)
408408
return shard_infos_by_split
409409

410+
def _get_license(self) -> str | None:
411+
"""Implements heuristics to get the license from HuggingFace."""
412+
# First heuristic: check the DatasetInfo from Hugging Face datasets.
413+
if self._hf_info.license:
414+
return self._hf_info.license
415+
huggingface_hub = lazy_imports_lib.lazy_imports.huggingface_hub
416+
# Retrieve the dataset info from the HuggingFace Hub.
417+
repo_id, token = self._hf_repo_id, self._hf_hub_token
418+
dataset_info = huggingface_hub.dataset_info(repo_id, token=token)
419+
# Second heuristic: check the card data.
420+
if 'license' in dataset_info.card_data:
421+
return dataset_info.card_data['license']
422+
# Third heuristic: check the tags.
423+
for tag in dataset_info.tags:
424+
if tag.startswith('license:'):
425+
return tag[len('license:') :]
426+
return None
427+
410428

411429
def builder(
412430
name: str, config: Optional[str] = None, **builder_kwargs

tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ def _info(self):
3030
return hf_datasets.DatasetInfo(
3131
description='description',
3232
citation='citation',
33+
license='test-license',
3334
features=None,
3435
version='1.0.0',
3536
)
@@ -89,6 +90,9 @@ def mock_huggingface_dataset_builder(
8990
'foo/bar', 'config', other_arg='this is another arg'
9091
)
9192
login_to_hf.assert_called_once_with('SECRET_TOKEN')
93+
assert builder.info.description == 'description'
94+
assert builder.info.citation == 'citation'
95+
assert builder.info.redistribution_info.license == 'test-license'
9296
yield builder
9397

9498

0 commit comments

Comments
 (0)