17
17
18
18
import base64
19
19
import codecs
20
- from collections .abc import Mapping
21
20
import enum
22
21
import itertools
23
22
import json
30
29
from tensorflow_datasets .core import utils
31
30
from tensorflow_datasets .core .download import checksums as checksums_lib
32
31
33
# Conceptually `Union[int, float, bool, str, dict[str, Json], list[Json]]`,
# but recursive type aliases are not supported, so values are typed as `Any`.
Json = dict[str, Any]
35
33
36
34
_hex_codec = codecs .getdecoder ('hex_codec' )
37
35
@@ -205,9 +203,7 @@ def is_locally_cached(path: epath.Path) -> bool:
205
203
206
204
def _read_info (info_path : epath .Path ) -> Json :
207
205
"""Returns info dict."""
208
- if not info_path .exists ():
209
- return {}
210
- return json .loads (info_path .read_text ())
206
+ return json .loads (info_path .read_text ()) if info_path .exists () else {}
211
207
212
208
213
209
# TODO(pierrot): one lock per info path instead of locking everything.
@@ -223,6 +219,22 @@ def read_info_file(info_path: epath.Path) -> Json:
223
219
return _read_info (get_info_path (info_path ))
224
220
225
221
222
+ def _add_value_to_list (info : Json , key : str , value : str ) -> None :
223
+ """Adds `value` to list `key` in `info` dict."""
224
+ if value and value not in (stored_values := info .get (key , [])):
225
+ info [key ] = stored_values + [value ]
226
+
227
+
228
+ def _set_value (info_path : epath .Path , info : Json , key : str , value : Any ) -> None :
229
+ """Sets `value` to `key` in `info` dict."""
230
+ if (stored_value := info .get (key )) and stored_value != value :
231
+ raise ValueError (
232
+ f'File info "{ info_path } " contains a different "{ key } " than the'
233
+ f' downloaded one: Stored: { stored_value } ; Expected: { value } '
234
+ )
235
+ info [key ] = value
236
+
237
+
226
238
@synchronize_decorator
227
239
def write_info_file (
228
240
url : str ,
@@ -244,40 +256,21 @@ def write_info_file(
244
256
original_fname: name of file as downloaded.
245
257
url_info: checksums/size info of the url
246
258
"""
247
- url_info_dict = url_info .asdict ()
248
259
info_path = get_info_path (path )
249
260
info = _read_info (info_path )
250
- urls = set (info .get ('urls' , []) + [url ])
251
- dataset_names = info .get ('dataset_names' , [])
252
- if dataset_name :
253
- dataset_names .append (dataset_name )
254
- if info .get ('original_fname' , original_fname ) != original_fname :
255
- raise ValueError (
256
- '`original_fname` "{}" stored in {} does NOT match "{}".' .format (
257
- info ['original_fname' ], info_path , original_fname
258
- )
259
- )
260
- if info .get ('url_info' , url_info_dict ) != url_info_dict :
261
- raise ValueError (
262
- 'File info {} contains a different checksum that the downloaded one: '
263
- 'Stored: {}; Expected: {}' .format (
264
- info_path , info ['url_info' ], url_info_dict
265
- )
266
- )
267
- info = dict (
268
- urls = list (urls ),
269
- dataset_names = list (set (dataset_names )),
270
- original_fname = original_fname ,
271
- url_info = url_info_dict ,
272
- )
261
+
262
+ _add_value_to_list (info , 'urls' , url )
263
+ _add_value_to_list (info , 'dataset_names' , dataset_name )
264
+ _set_value (info_path , info , 'original_fname' , original_fname )
265
+ _set_value (info_path , info , 'url_info' , url_info .asdict ())
266
+
273
267
with utils .atomic_write (info_path , 'w' ) as info_f :
274
268
json .dump (info , info_f , sort_keys = True )
275
269
276
270
277
271
def _get_extract_method(path: epath.Path) -> ExtractMethod:
  """Infers the `ExtractMethod` for the resource at `path`. Never returns None."""
  # Prefer the original downloaded filename recorded in the .INFO file;
  # fall back to the on-disk path when no info was recorded.
  recorded = _read_info(get_info_path(path))
  guess_from = recorded.get('original_fname', os.fspath(path))
  return guess_extract_method(guess_from)
283
276
0 commit comments