Skip to content

Commit 978884e

Browse files
tomvdwThe TensorFlow Datasets Authors
authored andcommitted
Fix bug where the builder config was not retrieved in the right manner since the config name also includes the version
PiperOrigin-RevId: 694039995
1 parent 69e0840 commit 978884e

File tree

3 files changed

+64
-46
lines changed

3 files changed

+64
-46
lines changed

tensorflow_datasets/core/dataset_builder.py

Lines changed: 42 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import collections
2222
from collections.abc import Iterable, Iterator, Mapping, Sequence
2323
import dataclasses
24+
import difflib
2425
import functools
2526
import inspect
2627
import json
@@ -1347,39 +1348,50 @@ def _create_builder_config(
13471348
) -> BuilderConfig | None:
13481349
"""Create and validate BuilderConfig object."""
13491350
if builder_config is None:
1350-
builder_config = self.get_default_builder_config()
1351-
if builder_config is not None:
1351+
if builder_config := self.get_default_builder_config():
13521352
logging.info(
13531353
"No config specified, defaulting to config: %s/%s",
13541354
self.name,
13551355
builder_config.name,
13561356
)
1357-
if not builder_config:
1358-
return None
1357+
return builder_config
1358+
else:
1359+
return None
1360+
13591361
if isinstance(builder_config, str):
1360-
name = builder_config
1361-
builder_config = self.builder_configs.get(name)
1362-
if builder_config is None and version is not None:
1363-
builder_config = self.builder_configs.get(f"{name}:{version}")
1364-
if builder_config is None:
1365-
raise ValueError(
1366-
"BuilderConfig %s not found with version %s. Available: %s"
1367-
% (name, version, list(self.builder_configs.keys()))
1362+
config = self.get_builder_config(name=builder_config, version=version)
1363+
if config is not None:
1364+
return config
1365+
else:
1366+
close_matches = difflib.get_close_matches(
1367+
builder_config, self.builder_configs.keys(), n=10
13681368
)
1369-
name = builder_config.name
1370-
if not name:
1371-
raise ValueError("BuilderConfig must have a name, got %s" % name)
1372-
is_custom = name not in self.builder_configs
1373-
if is_custom:
1374-
logging.warning("Using custom data configuration %s", name)
1375-
else:
1376-
if builder_config is not self.builder_configs[name]:
13771369
raise ValueError(
1378-
"Cannot name a custom BuilderConfig the same as an available "
1379-
"BuilderConfig. Change the name. Available BuilderConfigs: %s"
1380-
% (list(self.builder_configs.keys()))
1370+
f"BuilderConfig {builder_config} not found with version {version}."
1371+
" Here are 10 BuilderConfigs whose name closely match:"
1372+
f" {close_matches}"
13811373
)
1382-
return builder_config
1374+
1375+
if not isinstance(builder_config, BuilderConfig):
1376+
raise ValueError(
1377+
f"BuilderConfig {builder_config} is not a BuilderConfig or string"
1378+
)
1379+
1380+
cls_builder_config = self.get_builder_config(
1381+
name=builder_config.name, version=version
1382+
)
1383+
if cls_builder_config is None:
1384+
logging.warning("Using custom data configuration: %s", builder_config)
1385+
return builder_config
1386+
elif builder_config is not cls_builder_config:
1387+
raise ValueError(
1388+
"Cannot name a custom BuilderConfig the same as an available"
1389+
" BuilderConfig. Change the name.\n"
1390+
f"Requested: {builder_config}\n"
1391+
f"BuilderConfig in class: {cls_builder_config}"
1392+
)
1393+
else:
1394+
return builder_config
13831395

13841396
@utils.classproperty
13851397
@classmethod
@@ -2019,7 +2031,7 @@ def canonical_version_for_config(
20192031
) -> utils.Version:
20202032
"""Get the canonical version for the given config.
20212033
2022-
This allow to get the version without instanciating the class.
2034+
This allows getting the version without instantiating the class.
20232035
The version can be stored either at the class or in the config object.
20242036
20252037
Args:
@@ -2030,14 +2042,15 @@ def canonical_version_for_config(
20302042
Returns:
20312043
version: The extracted version.
20322044
"""
2033-
if instance_or_cls.BUILDER_CONFIGS and config is None:
2045+
if config and config.version:
2046+
return utils.Version(config.version)
2047+
2048+
if config is None and instance_or_cls.BUILDER_CONFIGS:
20342049
raise ValueError(
20352050
f"Cannot infer version on {instance_or_cls.name}. Unknown config."
20362051
)
20372052

2038-
if config and config.version:
2039-
return utils.Version(config.version)
2040-
elif instance_or_cls.VERSION:
2053+
if instance_or_cls.VERSION:
20412054
return utils.Version(instance_or_cls.VERSION)
20422055
else:
20432056
raise ValueError(

tensorflow_datasets/core/utils/file_utils.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -247,10 +247,14 @@ def get_data_dir_and_dataset_dir(
247247
if all_versions:
248248
logging.warning(
249249
(
250-
'Found a different version of the requested dataset:\n'
251-
'%s\n'
252-
'Using %s instead.'
250+
'Found a different version of the requested dataset'
251+
' (given_data_dir=%s,dataset=%s, config=%s, version=%s):\n'
252+
'%s\nUsing %s instead.'
253253
),
254+
given_data_dir,
255+
builder_name,
256+
config_name,
257+
version,
254258
'\n'.join(str(v) for v in sorted(all_versions)),
255259
dataset_dir,
256260
)

tensorflow_datasets/scripts/cli/build.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,14 @@
1616
"""`tfds build` command."""
1717

1818
import argparse
19+
from collections.abc import Iterator
1920
import functools
2021
import importlib
2122
import itertools
2223
import json
2324
import multiprocessing
2425
import os
25-
from typing import Any, Dict, Iterator, Optional, Tuple, Type, Union
26+
from typing import Any, Type
2627

2728
from absl import logging
2829
import tensorflow_datasets as tfds
@@ -132,7 +133,7 @@ def _build_datasets(args: argparse.Namespace) -> None:
132133
def _make_builders(
133134
args: argparse.Namespace,
134135
builder_cls: Type[tfds.core.DatasetBuilder],
135-
builder_kwargs: Dict[str, Any],
136+
builder_kwargs: dict[str, Any],
136137
) -> Iterator[tfds.core.DatasetBuilder]:
137138
"""Yields builders to generate.
138139
@@ -185,7 +186,7 @@ def _get_builder_cls_and_kwargs(
185186
ds_to_build: str,
186187
*,
187188
has_imports: bool,
188-
) -> Tuple[Type[tfds.core.DatasetBuilder], Dict[str, Any]]:
189+
) -> tuple[Type[tfds.core.DatasetBuilder], dict[str, Any]]:
189190
"""Infer the builder class to build and its kwargs.
190191
191192
Args:
@@ -232,7 +233,7 @@ def _get_builder_cls_and_kwargs(
232233
return builder_cls, builder_kwargs
233234

234235

235-
def _search_script_path(ds_to_build: str) -> Optional[tfds.core.Path]:
236+
def _search_script_path(ds_to_build: str) -> tfds.core.Path | None:
236237
"""Check whether the requested dataset match a file on disk."""
237238
# If the dataset file exists, use it. Valid values are:
238239
# * Empty string (use default directory)
@@ -274,7 +275,7 @@ def _search_script_path(ds_to_build: str) -> Optional[tfds.core.Path]:
274275

275276
def _validate_script_path(
276277
path: tfds.core.Path,
277-
) -> Optional[tfds.core.Path]:
278+
) -> tfds.core.Path | None:
278279
"""Validates and returns the `dataset_builder.py` generation script path."""
279280
if path.suffix != '.py':
280281
raise ValueError(f'Expected `.py` file. Invalid dataset path: {path}.')
@@ -292,17 +293,17 @@ def _make_builder(
292293
**builder_kwargs,
293294
) -> tfds.core.DatasetBuilder:
294295
"""Builder factory, eventually deleting pre-existing dataset."""
295-
builder = builder_cls(**builder_kwargs) # pytype: disable=not-instantiable
296-
data_exists = builder.data_path.exists()
297-
if fail_if_exists and data_exists:
296+
builder = builder_cls(**builder_kwargs)
297+
is_prepared = builder.is_prepared()
298+
if fail_if_exists and is_prepared:
298299
raise RuntimeError(
299300
'The `fail_if_exists` flag was True and '
300301
f'the data already exists in {builder.data_path}'
301302
)
302-
if overwrite and data_exists:
303+
if overwrite and is_prepared:
303304
builder.data_path.rmtree() # Delete pre-existing data
304305
# Re-create the builder with clean state
305-
builder = builder_cls(**builder_kwargs) # pytype: disable=not-instantiable
306+
builder = builder_cls(**builder_kwargs)
306307
return builder
307308

308309

@@ -404,10 +405,10 @@ def _make_download_config(
404405

405406
def _get_config_name(
406407
builder_cls: Type[tfds.core.DatasetBuilder],
407-
config_kwarg: Optional[str],
408-
config_name: Optional[str],
409-
config_idx: Optional[int],
410-
) -> Optional[Union[str, tfds.core.BuilderConfig]]:
408+
config_kwarg: str | None,
409+
config_name: str | None,
410+
config_idx: int | None,
411+
) -> str | tfds.core.BuilderConfig | None:
411412
"""Extract the config name.
412413
413414
Args:

0 commit comments

Comments
 (0)