@@ -273,7 +273,7 @@ def _info(self) -> dataset_info_lib.DatasetInfo:
273
273
description = self ._hf_info .description ,
274
274
features = huggingface_utils .convert_hf_features (self ._hf_features ()),
275
275
citation = self ._hf_info .citation ,
276
- license = self ._hf_info . license ,
276
+ license = self ._get_license () ,
277
277
supervised_keys = _extract_supervised_keys (self ._hf_info ),
278
278
)
279
279
@@ -407,6 +407,24 @@ def _write_shards(
407
407
)
408
408
return shard_infos_by_split
409
409
410
def _get_license(self) -> str | None:
    """Implements heuristics to get the license from HuggingFace.

    The heuristics are tried in order and the first non-empty result wins:

    1. ``self._hf_info.license``.
    2. The ``'license'`` entry of the card data returned by
       ``huggingface_hub.dataset_info`` for ``self._hf_repo_id``.
    3. The first tag of that same Hub info starting with ``'license:'``
       (the prefix is stripped).

    Returns:
        The license, or ``None`` when no heuristic yields one.
    """
    # First heuristic: check the DatasetInfo from Hugging Face datasets.
    if self._hf_info.license:
        return self._hf_info.license

    # Retrieve the dataset info from the HuggingFace Hub.
    huggingface_hub = lazy_imports_lib.lazy_imports.huggingface_hub
    dataset_info = huggingface_hub.dataset_info(
        self._hf_repo_id, token=self._hf_hub_token
    )

    # Second heuristic: check the card data. The Hub may return no card
    # data at all, so guard against None before the membership test.
    card_data = dataset_info.card_data
    if card_data is not None and 'license' in card_data:
        card_license = card_data['license']
        # Card metadata may declare several licenses as a list; flatten it
        # so the declared `str | None` return type is honored.
        if isinstance(card_license, (list, tuple)):
            card_license = ', '.join(str(item) for item in card_license)
        # An empty/None entry is not an answer: fall through to the tags.
        if card_license:
            return card_license

    # Third heuristic: check the tags (which may also be missing).
    for tag in dataset_info.tags or ():
        if tag.startswith('license:'):
            return tag[len('license:'):]
    return None
427
+
410
428
411
429
def builder (
412
430
name : str , config : Optional [str ] = None , ** builder_kwargs
0 commit comments