Skip to content

Commit d3db9e3

Browse files
author
The TensorFlow Datasets Authors
committed
For HuggingFace builders, add the gated text to the description and license of gated datasets. Also add the homepage to the dataset info.
PiperOrigin-RevId: 657593377
1 parent 2123db7 commit d3db9e3

File tree

2 files changed

+56
-3
lines changed

2 files changed

+56
-3
lines changed

tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -226,17 +226,21 @@ def __init__(
226226
)
227227
self.VERSION = version_lib.Version(version) # pylint: disable=invalid-name
228228
self.name = conversion_utils.to_tfds_name(hf_repo_id)
229+
self.homepage = f'https://huggingface.co/datasets/{hf_repo_id}'
229230
self._hf_hub_token = hf_hub_token
230231
self._hf_num_proc = hf_num_proc
231232
self._tfds_num_proc = tfds_num_proc
232233
self._verification_mode = (
233234
'no_checks' if ignore_verifications else 'all_checks'
234235
)
235236
if self._hf_config:
237+
description = self._get_text_field('description')
238+
if self._is_gated():
239+
description = self._gated_text + '\n' + description
236240
self._converted_builder_config = dataset_builder.BuilderConfig(
237241
name=tfds_config,
238242
version=self.VERSION,
239-
description=self._get_text_field('description'),
243+
description=description,
240244
)
241245
else:
242246
self._converted_builder_config = None
@@ -277,6 +281,48 @@ def _hf_hub_info(self) -> huggingface_hub.hf_api.DatasetInfo:
277281
self._hf_repo_id, token=self._hf_hub_token
278282
)
279283

284+
def _is_gated(self) -> bool:
  """Tells whether the dataset is gated.

  Returns:
    True when the `gated` field of the Hub dataset info is a string, False
    for any other value.
  """
  # Gated datasets return a string ('manual' or 'automatic').
  gated_field = self._hf_hub_info.gated
  return isinstance(gated_field, str)
290+
291+
@property
def _gated_dataset_warning(self) -> str:
  """Warning shown to users of a gated dataset.

  Returns:
    A single-line message that embeds `self.homepage` as the place where the
    access conditions must be signed.
  """
  conditions_url = self.homepage
  warning = (
      'WARNING: This dataset is gated. Before using it, make sure to sign'
      f' the conditions at: {conditions_url}. Important: access requests are'
      ' always granted to individual users rather than to entire'
      ' organizations.'
  )
  return warning
300+
301+
@property
302+
def _gated_text(self) -> str | None:
303+
"""Returns the conditions for a dataset, if it is gated.
304+
305+
All datasets share the same default conditions. Extra conditions are stored
306+
in the dataset card:
307+
https://huggingface.co/docs/hub/en/datasets-gated
308+
309+
Returns:
310+
The gated text if the dataset is gated. None otherwise.
311+
"""
312+
if self._is_gated():
313+
# This condition is the same for all gated datasets.
314+
conditions = (
315+
'The conditions consist of:\nBy agreeing you accept to share your'
316+
' contact information (email and username) with the repository'
317+
' authors.'
318+
)
319+
if dataset_card := self._hf_hub_info.card_data:
320+
gated_text = dataset_card.get('extra_gated_prompt', None)
321+
if gated_text:
322+
conditions = conditions + '\n' + gated_text
323+
return self._gated_dataset_warning + '\n' + conditions
324+
return None
325+
280326
def _hf_features(self) -> hf_datasets.Features:
281327
if not self._hf_info.features:
282328
# We need to download and prepare the data to know its features.
@@ -285,13 +331,19 @@ def _hf_features(self) -> hf_datasets.Features:
285331
return self._hf_info.features
286332

287333
def _info(self) -> dataset_info_lib.DatasetInfo:
  """Builds the `DatasetInfo` describing this dataset.

  For gated datasets, the gated text is prepended to the description and the
  gated-dataset warning is appended to the license.

  Returns:
    The dataset info, with description, features, citation, license,
    supervised keys and homepage filled in.
  """
  ds_description = self._get_text_field('description')
  ds_license = self._get_license()
  if self._is_gated():
    # NOTE(review): `_get_text_field` and `_get_license` are defined outside
    # this view; presumably either may return None or '' when the field is
    # missing. Join only the parts that are present so a gated dataset
    # without a description or license cannot raise TypeError on `+`.
    ds_description = '\n'.join(
        part for part in (self._gated_text, ds_description) if part
    )
    ds_license = ' '.join(
        part for part in (ds_license, self._gated_dataset_warning) if part
    )
  return dataset_info_lib.DatasetInfo(
      builder=self,
      description=ds_description,
      features=huggingface_utils.convert_hf_features(self._hf_features()),
      citation=self._get_text_field('citation'),
      license=ds_license,
      supervised_keys=_extract_supervised_keys(self._hf_info),
      homepage=self.homepage,
  )
296348

297349
def _split_generators(

tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ def test_dataset_info(builder):
114114
assert builder.info.description == 'description'
115115
assert builder.info.citation == 'citation from the hub'
116116
assert builder.info.redistribution_info.license == 'test-license'
117+
assert builder.info.homepage == 'https://huggingface.co/datasets/foo/bar'
117118

118119

119120
def test_download_and_prepare(builder):

0 commit comments

Comments
 (0)