15
15
16
16
"""Methods to retrieve and store size/checksums associated to URLs."""
17
17
18
+ from collections .abc import Iterable
18
19
import dataclasses
19
20
import hashlib
20
21
import io
21
- from typing import Any , Dict , Iterable , Optional
22
+ from typing import Any
22
23
23
24
from absl import logging
24
25
from etils import epath
25
26
from tensorflow_datasets .core import utils
26
27
27
- _CHECKSUM_DIRS = [
28
- utils .tfds_path () / 'url_checksums' ,
29
- ]
28
# Checksum directories added on top of the built-in ones. Entries are appended
# by the registration code after its duplicate check, and `_checksum_paths`
# iterates over them before the directories from `_default_checksum_dirs()`.
_CUSTOM_CHECKSUM_DIRS = []
# Presumably the extension of the checksum files; its usage is not visible in
# this excerpt -- TODO confirm.
_CHECKSUM_SUFFIX = '.txt'
31
30
32
31
32
@utils.memoize(maxsize=1)
def _default_checksum_dirs() -> list[epath.Path]:
  """Returns the built-in checksum directories.

  The list holds a single entry: the `url_checksums` directory located under
  `utils.tfds_path()`. Because the path is built inside a function rather than
  in a module-level constant, `utils.tfds_path()` is not called at import
  time. The function is wrapped in `utils.memoize(maxsize=1)`, so repeated
  calls presumably reuse the first result (confirm against `utils.memoize`).

  Returns:
    List containing the default `url_checksums` directory.
  """
  return [
      utils.tfds_path() / 'url_checksums',
  ]
37
+
38
+
33
39
@dataclasses .dataclass (eq = True )
34
40
class UrlInfo :
35
41
"""Small wrapper around the url metadata (checksum, size).
@@ -44,9 +50,9 @@ class UrlInfo:
44
50
checksum : str
45
51
# We exclude the filename from `__eq__` for backward compatibility
46
52
# Two checksums are equal even if the filename is unknown or different.
47
- filename : Optional [ str ] = dataclasses .field (compare = False )
53
+ filename : str | None = dataclasses .field (compare = False )
48
54
49
- def asdict (self ) -> Dict [str , Any ]:
55
def asdict(self) -> dict[str, Any]:
  """Returns the dict representation of the dataclass.

  Delegates to `dataclasses.asdict`, which maps every field name to its value
  and converts nested dataclass instances recursively.

  Returns:
    Dict mapping each field name to its value.
  """
  return dataclasses.asdict(self)
52
58
@@ -107,16 +113,19 @@ class MyDataset(tfds.core.DatasetBuilder):
107
113
'The checksum file will be automatically detected. More info at: '
108
114
'https://www.tensorflow.org/datasets/add_dataset'
109
115
)
110
- if checksums_dir in _CHECKSUM_DIRS : # Avoid duplicate
116
+ if (
117
+ checksums_dir in _CUSTOM_CHECKSUM_DIRS
118
+ or checksums_dir in _default_checksum_dirs ()
119
+ ): # Avoid duplicates
111
120
return
112
- _CHECKSUM_DIRS .append (checksums_dir )
121
+ _CUSTOM_CHECKSUM_DIRS .append (checksums_dir )
113
122
114
123
115
124
@utils .memoize ()
116
- def _checksum_paths () -> Dict [str , epath .Path ]:
125
+ def _checksum_paths () -> dict [str , epath .Path ]:
117
126
"""Returns dict {'dataset_name': 'path/to/checksums/file'}."""
118
127
dataset2path = {}
119
- for dir_path in _CHECKSUM_DIRS :
128
+ for dir_path in _CUSTOM_CHECKSUM_DIRS + _default_checksum_dirs () :
120
129
if isinstance (dir_path , str ):
121
130
dir_path = epath .Path (dir_path )
122
131
if not dir_path .exists ():
@@ -129,7 +138,7 @@ def _checksum_paths() -> Dict[str, epath.Path]:
129
138
return dataset2path
130
139
131
140
132
- def _parse_url_infos (checksums_file : Iterable [str ]) -> Dict [str , UrlInfo ]:
141
+ def _parse_url_infos (checksums_file : Iterable [str ]) -> dict [str , UrlInfo ]:
133
142
"""Returns {URL: (size, checksum)}s stored within given file."""
134
143
url_infos = {}
135
144
for line in checksums_file :
@@ -156,7 +165,7 @@ def _parse_url_infos(checksums_file: Iterable[str]) -> Dict[str, UrlInfo]:
156
165
157
166
158
167
@utils .memoize ()
159
- def get_all_url_infos () -> Dict [str , UrlInfo ]:
168
+ def get_all_url_infos () -> dict [str , UrlInfo ]:
160
169
"""Returns dict associating URL to UrlInfo."""
161
170
url_infos = {}
162
171
for path in _checksum_paths ().values ():
@@ -171,14 +180,14 @@ def get_all_url_infos() -> Dict[str, UrlInfo]:
171
180
return url_infos
172
181
173
182
174
- def load_url_infos (path : epath .PathLike ) -> Dict [str , UrlInfo ]:
183
def load_url_infos(path: epath.PathLike) -> dict[str, UrlInfo]:
  """Loads the checksums stored in the file at `path`.

  Args:
    path: Location of the checksums file. It is wrapped in `epath.Path` before
      being read.

  Returns:
    The dict produced by `_parse_url_infos` from the lines of the file.
  """
  # Read the whole file as text and hand its lines (without line endings) to
  # the parser.
  lines = epath.Path(path).read_text().splitlines()
  return _parse_url_infos(lines)
177
186
178
187
179
188
def save_url_infos (
180
189
path : epath .Path ,
181
- url_infos : Dict [str , UrlInfo ],
190
+ url_infos : dict [str , UrlInfo ],
182
191
) -> None :
183
192
"""Store given checksums and sizes for specific dataset.
184
193
@@ -211,8 +220,8 @@ def save_url_infos(
211
220
212
221
213
222
def _filenames_equal (
214
- left : Dict [str , UrlInfo ],
215
- right : Dict [str , UrlInfo ],
223
+ left : dict [str , UrlInfo ],
224
+ right : dict [str , UrlInfo ],
216
225
) -> bool :
217
226
"""Compare filenames."""
218
227
return all (
0 commit comments