Skip to content

Commit c229c24

Browse files
tomvdw and The TensorFlow Datasets Authors
authored and committed
Lazily load various libraries
PiperOrigin-RevId: 627961527
1 parent a49dbba commit c229c24

File tree

13 files changed

+210
-162
lines changed

13 files changed

+210
-162
lines changed

tensorflow_datasets/core/constants.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
# IMPORTANT: when changing values here, update docstrings.
1919

2020
import os
21-
from etils import epath
2221

2322
# Directory in which datasets are declared within TFDS sources.
2423
DATASETS_TFDS_SRC_DIR = 'datasets'
@@ -48,7 +47,4 @@
4847

4948
# Filepath for mapping between TFDS datasets and PapersWithCode entries.
5049
PWC_FILENAME = 'tfds_to_pwc_links.json'
51-
PWC_LINKS_PATH = (
52-
epath.resource_path('tensorflow_datasets')
53-
/ f'scripts/documentation/{PWC_FILENAME}'
54-
)
50+
PWC_LINKS_PATH = f'scripts/documentation/{PWC_FILENAME}'

tensorflow_datasets/core/download/checksums.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,27 @@
1515

1616
"""Methods to retrieve and store size/checksums associated to URLs."""
1717

18+
from collections.abc import Iterable
1819
import dataclasses
1920
import hashlib
2021
import io
21-
from typing import Any, Dict, Iterable, Optional
22+
from typing import Any
2223

2324
from absl import logging
2425
from etils import epath
2526
from tensorflow_datasets.core import utils
2627

27-
_CHECKSUM_DIRS = [
28-
utils.tfds_path() / 'url_checksums',
29-
]
28+
_CUSTOM_CHECKSUM_DIRS = []
3029
_CHECKSUM_SUFFIX = '.txt'
3130

3231

32+
@utils.memoize(maxsize=1)
33+
def _default_checksum_dirs() -> list[epath.Path]:
34+
return [
35+
utils.tfds_path() / 'url_checksums',
36+
]
37+
38+
3339
@dataclasses.dataclass(eq=True)
3440
class UrlInfo:
3541
"""Small wrapper around the url metadata (checksum, size).
@@ -44,9 +50,9 @@ class UrlInfo:
4450
checksum: str
4551
# We exclude the filename from `__eq__` for backward compatibility
4652
# Two checksums are equals even if filename is unknown or different.
47-
filename: Optional[str] = dataclasses.field(compare=False)
53+
filename: str | None = dataclasses.field(compare=False)
4854

49-
def asdict(self) -> Dict[str, Any]:
55+
def asdict(self) -> dict[str, Any]:
5056
"""Returns the dict representation of the dataclass."""
5157
return dataclasses.asdict(self)
5258

@@ -107,16 +113,19 @@ class MyDataset(tfds.core.DatasetBuilder):
107113
'The checksum file will be automatically detected. More info at: '
108114
'https://www.tensorflow.org/datasets/add_dataset'
109115
)
110-
if checksums_dir in _CHECKSUM_DIRS: # Avoid duplicate
116+
if (
117+
checksums_dir in _CUSTOM_CHECKSUM_DIRS
118+
or checksums_dir in _default_checksum_dirs()
119+
): # Avoid duplicates
111120
return
112-
_CHECKSUM_DIRS.append(checksums_dir)
121+
_CUSTOM_CHECKSUM_DIRS.append(checksums_dir)
113122

114123

115124
@utils.memoize()
116-
def _checksum_paths() -> Dict[str, epath.Path]:
125+
def _checksum_paths() -> dict[str, epath.Path]:
117126
"""Returns dict {'dataset_name': 'path/to/checksums/file'}."""
118127
dataset2path = {}
119-
for dir_path in _CHECKSUM_DIRS:
128+
for dir_path in _CUSTOM_CHECKSUM_DIRS + _default_checksum_dirs():
120129
if isinstance(dir_path, str):
121130
dir_path = epath.Path(dir_path)
122131
if not dir_path.exists():
@@ -129,7 +138,7 @@ def _checksum_paths() -> Dict[str, epath.Path]:
129138
return dataset2path
130139

131140

132-
def _parse_url_infos(checksums_file: Iterable[str]) -> Dict[str, UrlInfo]:
141+
def _parse_url_infos(checksums_file: Iterable[str]) -> dict[str, UrlInfo]:
133142
"""Returns {URL: (size, checksum)}s stored within given file."""
134143
url_infos = {}
135144
for line in checksums_file:
@@ -156,7 +165,7 @@ def _parse_url_infos(checksums_file: Iterable[str]) -> Dict[str, UrlInfo]:
156165

157166

158167
@utils.memoize()
159-
def get_all_url_infos() -> Dict[str, UrlInfo]:
168+
def get_all_url_infos() -> dict[str, UrlInfo]:
160169
"""Returns dict associating URL to UrlInfo."""
161170
url_infos = {}
162171
for path in _checksum_paths().values():
@@ -171,14 +180,14 @@ def get_all_url_infos() -> Dict[str, UrlInfo]:
171180
return url_infos
172181

173182

174-
def load_url_infos(path: epath.PathLike) -> Dict[str, UrlInfo]:
183+
def load_url_infos(path: epath.PathLike) -> dict[str, UrlInfo]:
175184
"""Loads the checksums."""
176185
return _parse_url_infos(epath.Path(path).read_text().splitlines())
177186

178187

179188
def save_url_infos(
180189
path: epath.Path,
181-
url_infos: Dict[str, UrlInfo],
190+
url_infos: dict[str, UrlInfo],
182191
) -> None:
183192
"""Store given checksums and sizes for specific dataset.
184193
@@ -211,8 +220,8 @@ def save_url_infos(
211220

212221

213222
def _filenames_equal(
214-
left: Dict[str, UrlInfo],
215-
right: Dict[str, UrlInfo],
223+
left: dict[str, UrlInfo],
224+
right: dict[str, UrlInfo],
216225
) -> bool:
217226
"""Compare filenames."""
218227
return all(

0 commit comments

Comments (0)