Skip to content

Commit 551e9d2

Browse files
author
The TensorFlow Datasets Authors
committed
Add information about blocked versions and configs to dataset_info and restore this information in our ReadOnlyBuilder.
PiperOrigin-RevId: 678169155
1 parent 8223a15 commit 551e9d2

File tree

6 files changed

+84
-8
lines changed

6 files changed

+84
-8
lines changed

tensorflow_datasets/core/dataset_builder.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,18 @@ def __init__(
294294
self.info.read_from_directory(self._data_dir)
295295
else: # Use the code version (do not restore data)
296296
self.info.initialize_from_bucket()
297+
if self.BLOCKED_VERSIONS is not None:
298+
config_name = self._builder_config.name if self._builder_config else None
299+
if is_blocked := self.BLOCKED_VERSIONS.is_blocked(
300+
version=self._version, config=config_name
301+
):
302+
default_msg = (
303+
f"Dataset {self.name} is blocked at version {self._version} and"
304+
f" config {config_name}."
305+
)
306+
self.info.set_is_blocked(
307+
is_blocked.blocked_msg if is_blocked.blocked_msg else default_msg
308+
)
297309

298310
@utils.classproperty
299311
@classmethod

tensorflow_datasets/core/dataset_info.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ def __init__(
197197
alternative_file_formats: (
198198
Sequence[str | file_adapters.FileFormat] | None
199199
) = None,
200+
is_blocked: str | None = None,
200201
# LINT.ThenChange(:setstate)
201202
):
202203
# pyformat: disable
@@ -243,6 +244,8 @@ def __init__(
243244
split_dict: information about the splits in this dataset.
244245
alternative_file_formats: alternative file formats that are available for
245246
this dataset.
247+
is_blocked: A message explaining why the dataset, in its version and
248+
config, is blocked. If empty or None, the dataset is not blocked.
246249
"""
247250
# pyformat: enable
248251
self._builder_or_identity = builder
@@ -259,6 +262,8 @@ def __init__(
259262
f = file_adapters.FileFormat.from_value(f)
260263
self._alternative_file_formats.append(f)
261264

265+
self._is_blocked = is_blocked
266+
262267
self._info_proto = dataset_info_pb2.DatasetInfo(
263268
name=self._identity.name,
264269
description=utils.dedent(description),
@@ -276,6 +281,7 @@ def __init__(
276281
alternative_file_formats=[
277282
f.value for f in self._alternative_file_formats
278283
],
284+
is_blocked=self._is_blocked,
279285
)
280286

281287
if homepage:
@@ -440,6 +446,13 @@ def alternative_file_formats(self) -> Sequence[file_adapters.FileFormat]:
440446
def metadata(self) -> Metadata | None:
441447
return self._metadata
442448

449+
@property
450+
def is_blocked(self) -> str | None:
451+
return self._is_blocked
452+
453+
def set_is_blocked(self, is_blocked: str) -> None:
454+
self._is_blocked = is_blocked
455+
443456
@property
444457
def supervised_keys(self) -> Optional[SupervisedKeysType]:
445458
if not self.as_proto.HasField("supervised_keys"):
@@ -941,6 +954,7 @@ def __getstate__(self):
941954
"license": self.redistribution_info.license,
942955
"split_dict": self.splits,
943956
"alternative_file_formats": self.alternative_file_formats,
957+
"is_blocked": self.is_blocked,
944958
}
945959
def __setstate__(self, state):
946960
# LINT.IfChange(setstate)
@@ -956,6 +970,7 @@ def __setstate__(self, state):
956970
license=state["license"],
957971
split_dict=state["split_dict"],
958972
alternative_file_formats=state["alternative_file_formats"],
973+
is_blocked=state["is_blocked"],
959974
)
960975
# LINT.ThenChange(:dataset_info_args)
961976

tensorflow_datasets/core/proto/dataset_info.proto

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,5 +231,9 @@ message DatasetInfo {
231231
// The data that was used to generate this dataset.
232232
repeated DataSourceAccess data_source_accesses = 20;
233233

234-
// Next available: 22
234+
// A message explaining why the dataset is blocked. If empty, it means that
235+
// the dataset is not blocked.
236+
string is_blocked = 23;
237+
238+
// Next available: 24
235239
}

tensorflow_datasets/core/proto/dataset_info_generated_pb2.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
b' \x01(\t\x12\x10\n\x08\x64\x61ta_dir\x18\x04'
6868
b' \x01(\t\x12\x14\n\x0c\x64s_namespace\x18\x05'
6969
b' \x01(\t\x12\r\n\x05split\x18\x06'
70-
b' \x01(\t"\xd6\x07\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01'
70+
b' \x01(\t"\xea\x07\n\x0b\x44\x61tasetInfo\x12\x0c\n\x04name\x18\x01'
7171
b' \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02'
7272
b' \x01(\t\x12\x0f\n\x07version\x18\t \x01(\t\x12I\n\rrelease_notes\x18\x12'
7373
b' \x03(\x0b\x32\x32.tensorflow_datasets.DatasetInfo.ReleaseNotesEntry\x12\x13\n\x0b\x63onfig_name\x18\r'
@@ -88,7 +88,8 @@
8888
b' \x01(\x08\x12\x13\n\x0b\x66ile_format\x18\x11 \x01(\t\x12'
8989
b' \n\x18\x61lternative_file_formats\x18\x16'
9090
b' \x03(\t\x12\x43\n\x14\x64\x61ta_source_accesses\x18\x14'
91-
b' \x03(\x0b\x32%.tensorflow_datasets.DataSourceAccess\x1a\x33\n\x11ReleaseNotesEntry\x12\x0b\n\x03key\x18\x01'
91+
b' \x03(\x0b\x32%.tensorflow_datasets.DataSourceAccess\x12\x12\n\nis_blocked\x18\x17'
92+
b' \x01(\t\x1a\x33\n\x11ReleaseNotesEntry\x12\x0b\n\x03key\x18\x01'
9293
b' \x01(\t\x12\r\n\x05value\x18\x02'
9394
b' \x01(\t:\x02\x38\x01\x1a\x38\n\x16\x44ownloadChecksumsEntry\x12\x0b\n\x03key\x18\x01'
9495
b' \x01(\t\x12\r\n\x05value\x18\x02'
@@ -146,9 +147,9 @@
146147
_TFDSDATASETREFERENCE._serialized_start = 1280
147148
_TFDSDATASETREFERENCE._serialized_end = 1404
148149
_DATASETINFO._serialized_start = 1407
149-
_DATASETINFO._serialized_end = 2389
150-
_DATASETINFO_RELEASENOTESENTRY._serialized_start = 2280
151-
_DATASETINFO_RELEASENOTESENTRY._serialized_end = 2331
152-
_DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_start = 2333
153-
_DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_end = 2389
150+
_DATASETINFO._serialized_end = 2409
151+
_DATASETINFO_RELEASENOTESENTRY._serialized_start = 2300
152+
_DATASETINFO_RELEASENOTESENTRY._serialized_end = 2351
153+
_DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_start = 2353
154+
_DATASETINFO_DOWNLOADCHECKSUMSENTRY._serialized_end = 2409
154155
# @@protoc_insertion_point(module_scope)

tensorflow_datasets/core/read_only_builder.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def __init__(
7878
self.name = info_proto.name
7979
self.VERSION = version_lib.Version(info_proto.version) # pylint: disable=invalid-name
8080
self.RELEASE_NOTES = info_proto.release_notes or {} # pylint: disable=invalid-name
81+
self.BLOCKED_VERSIONS = self._restore_blocked_versions(info_proto) # pylint: disable=invalid-name
8182

8283
if info_proto.module_name:
8384
# Overwrite the module so documenting `ReadOnlyBuilder` point to the
@@ -92,6 +93,7 @@ def __init__(
9293
config=builder_config,
9394
version=info_proto.version,
9495
)
96+
self.assert_is_not_blocked()
9597

9698
# For pickling, should come after super.__init__ which is setting that same
9799
# _original_state attribute.
@@ -103,6 +105,25 @@ def __init__(
103105
'was generated with an old TFDS version (<=3.2.1).'
104106
)
105107

108+
def _restore_blocked_versions(
    self, info_proto: dataset_info_pb2.DatasetInfo
) -> version_lib.BlockedVersions | None:
  """Restores the blocked version information from the dataset info proto.

  Args:
    info_proto: DatasetInfo describing the name, config, etc of the requested
      dataset.

  Returns:
    None if the dataset is not blocked, or a populated BlockedVersions object.
    When the proto carries a config name the block is registered for that
    (version, config) pair; otherwise it is registered for the version itself.
  """
  blocked_msg = info_proto.is_blocked
  if not blocked_msg:
    return None
  if info_proto.config_name:
    return version_lib.BlockedVersions(
        configs={info_proto.version: {info_proto.config_name: blocked_msg}}
    )
  # A dataset without builder config has an empty `config_name`, while the
  # blocked check is queried with `config=None`. An entry keyed by "" would
  # therefore never match, so block at the version level instead.
  return version_lib.BlockedVersions(
      versions={info_proto.version: blocked_msg}
  )
126+
106127
def _create_builder_config(
107128
self,
108129
builder_config: str | dataset_builder.BuilderConfig | None,

tensorflow_datasets/core/read_only_builder_test.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,29 @@ def test_builder_from_metadata(
246246
assert str(builder.info.features) == str(dummy_features)
247247

248248

249+
def test_restore_blocked_versions(
    code_builder: dataset_builder.DatasetBuilder,
    dummy_features: features_dict.FeaturesDict,
):
  """Building from a proto flagged as blocked must raise with its message."""
  blocked_reason = 'some reason for blocking'
  proto_fields = dict(
      name='abcd',
      description='efgh',
      config_name='en',
      config_description='something',
      version='0.1.0',
      release_notes={'0.1.0': 'release description'},
      citation='some citation',
      features=dummy_features.to_proto(),
      is_blocked=blocked_reason,
  )
  blocked_proto = dataset_info_pb2.DatasetInfo(**proto_fields)

  # The blocking message stored in the proto is expected in the raised error.
  expect_blocked = pytest.raises(
      utils.DatasetVariantBlockedError, match=blocked_reason
  )
  with expect_blocked:
    read_only_builder.builder_from_metadata(
        code_builder.data_dir, info_proto=blocked_proto
    )
270+
271+
249272
def test_builder_from_directory_dir_not_exists(tmp_path: pathlib.Path):
250273
with pytest.raises(FileNotFoundError, match='Could not load dataset info'):
251274
read_only_builder.builder_from_directory(tmp_path)

0 commit comments

Comments
 (0)