Skip to content

Commit fc8cb72

Browse files
fineguy and The TensorFlow Datasets Authors
authored and committed
Add overwrite option to utils.incomplete_dir().
PiperOrigin-RevId: 686088351
1 parent 46b867b commit fc8cb72

File tree

4 files changed

+43
-22
lines changed

4 files changed

+43
-22
lines changed

tensorflow_datasets/core/dataset_builder.py

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -627,15 +627,16 @@ def download_and_prepare(
627627
self._update_dataset_info()
628628
return
629629

630-
if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
631-
logging.info("Reusing dataset %s (%s)", self.name, self.data_dir)
632-
return
633-
elif data_exists and download_config.download_mode == REUSE_CACHE_IF_EXISTS:
634-
logging.info(
635-
"Deleting pre-existing dataset %s (%s)", self.name, self.data_dir
636-
)
637-
data_path.rmtree() # Delete pre-existing data.
638-
data_exists = data_path.exists()
630+
if data_exists:
631+
if download_config.download_mode.overwrite_dataset:
632+
logging.info(
633+
"Deleting pre-existing dataset %s (%s)", self.name, self.data_dir
634+
)
635+
data_path.rmtree() # Delete pre-existing data.
636+
data_exists = data_path.exists()
637+
else:
638+
logging.info("Reusing dataset %s (%s)", self.name, self.data_dir)
639+
return
639640

640641
if self.version.tfds_version_to_prepare:
641642
available_to_prepare = ", ".join(
@@ -734,7 +735,9 @@ def download_and_prepare(
734735

735736
# Create a tmp dir and rename to self.data_dir on successful exit.
736737
with utils.incomplete_dir(
737-
dirname=self.data_dir, permissions=permissions
738+
dirname=self.data_dir,
739+
permissions=permissions,
740+
overwrite=download_config.download_mode.overwrite_dataset,
738741
) as tmp_data_dir:
739742
# Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
740743
# it to every sub function.
@@ -1297,8 +1300,8 @@ def _make_download_manager(
12971300
manual_dir=manual_dir,
12981301
url_infos=self.url_infos,
12991302
manual_dir_instructions=self.MANUAL_DOWNLOAD_INSTRUCTIONS,
1300-
force_download=(download_config.download_mode == FORCE_REDOWNLOAD),
1301-
force_extraction=(download_config.download_mode == FORCE_REDOWNLOAD),
1303+
force_download=download_config.download_mode.force_download,
1304+
force_extraction=download_config.download_mode.force_download,
13021305
force_checksums_validation=download_config.force_checksums_validation,
13031306
register_checksums=download_config.register_checksums,
13041307
register_checksums_path=register_checksums_path,

tensorflow_datasets/core/download/util.py

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -16,13 +16,14 @@
1616
"""Utils functions."""
1717

1818
import enum
19+
from etils import epy
1920

2021

2122
class DownloadError(Exception):
2223
pass
2324

2425

25-
class GenerateMode(enum.Enum):
26+
class GenerateMode(epy.StrEnum):
2627
"""`Enum` for how to treat pre-existing downloads and data.
2728
2829
The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both
@@ -46,10 +47,21 @@ class GenerateMode(enum.Enum):
4647
UPDATE_DATASET_INFO will fail if the data has never been prepared.
4748
"""
4849

49-
REUSE_DATASET_IF_EXISTS = 'reuse_dataset_if_exists'
50-
UPDATE_DATASET_INFO = 'update_dataset_info'
51-
REUSE_CACHE_IF_EXISTS = 'reuse_cache_if_exists'
52-
FORCE_REDOWNLOAD = 'force_redownload'
50+
REUSE_DATASET_IF_EXISTS = enum.auto()
51+
UPDATE_DATASET_INFO = enum.auto()
52+
REUSE_CACHE_IF_EXISTS = enum.auto()
53+
FORCE_REDOWNLOAD = enum.auto()
54+
55+
@property
56+
def force_download(self) -> bool:
57+
return self == GenerateMode.FORCE_REDOWNLOAD
58+
59+
@property
60+
def overwrite_dataset(self) -> bool:
61+
return self in [
62+
GenerateMode.REUSE_CACHE_IF_EXISTS,
63+
GenerateMode.FORCE_REDOWNLOAD,
64+
]
5365

5466

5567
class ComputeStatsMode(enum.Enum):

tensorflow_datasets/core/utils/file_utils.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ def as_path(path: PathLike) -> Path:
7676
"""
7777
)
7878
logging.warning(msg)
79-
return epath.Path(path)
79+
return Path(path)
8080

8181

8282
def add_data_dir(data_dir: PathLike) -> None:
@@ -94,7 +94,7 @@ def add_data_dir(data_dir: PathLike) -> None:
9494
Args:
9595
data_dir: New data_dir to register.
9696
"""
97-
_REGISTERED_DATA_DIRS.add(epath.Path(data_dir))
97+
_REGISTERED_DATA_DIRS.add(Path(data_dir))
9898

9999

100100
def clear_registered_data_dirs() -> None:
@@ -107,21 +107,25 @@ def _get_incomplete_dir(dir_name: str) -> str:
107107
random_suffix = ''.join(
108108
random.choice(string.ascii_uppercase + string.digits) for _ in range(6)
109109
)
110-
dir_name = epath.Path(dir_name)
110+
dir_name = Path(dir_name)
111111
return f'{dir_name.parent}/{constants.INCOMPLETE_PREFIX}{random_suffix}_{dir_name.name}/'
112112

113113

114114
@contextlib.contextmanager
115115
def incomplete_dir(
116-
dirname: epath.PathLike, permissions: Permissions | None = None
116+
dirname: PathLike,
117+
permissions: Permissions | None = None,
118+
overwrite: bool = False,
117119
) -> Iterator[str]:
118120
"""Create temporary dir for dirname and rename on exit."""
119121
dirname = os.fspath(dirname)
120122
tmp_dir = _get_incomplete_dir(dirname)
121-
tmp_path = epath.Path(tmp_dir)
123+
tmp_path = Path(tmp_dir)
122124
tmp_path.mkdir(parents=True, exist_ok=True)
123125
try:
124126
yield tmp_dir
127+
if overwrite:
128+
Path(dirname).rmtree(missing_ok=True)
125129
tmp_path.rename(dirname)
126130
finally:
127131
if tmp_path.exists():

tensorflow_datasets/scripts/cli/cli_utils.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -315,6 +315,8 @@ def download_and_prepare(
315315

316316
if not download_config:
317317
download_config = download.DownloadConfig()
318+
if overwrite and not download_config.download_mode.overwrite_dataset:
319+
download_config.download_mode = download.GenerateMode.REUSE_CACHE_IF_EXISTS
318320

319321
# Add Apache Beam options to download config
320322
try:

0 commit comments

Comments
 (0)