
Commit 43c7381

fineguy authored and The TensorFlow Datasets Authors committed
Simplify resource.write_info_file
* Use `dict` instead of `Mapping` for `Json` type.
* Use `_add_value_to_list()` and `_set_value()` to update `info` dict.
* Use `epath.Path.replace()` instead of `unlink()` and `rename()`.

PiperOrigin-RevId: 683157254
1 parent 7b33592 commit 43c7381
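An editorial note on the first bullet (my reading, not stated in the commit): the new helpers mutate `info` in place, and `collections.abc.Mapping` does not declare `__setitem__`, so `dict` is the annotation that type-checks. A minimal sketch, with a hypothetical URL:

    from collections.abc import Mapping
    from typing import Any

    Json = dict[str, Any]  # the new alias: mutable, so item assignment type-checks

    def record_url(info: Json) -> None:
      info['urls'] = ['https://example.com/a.zip']  # fine: dict has __setitem__

    info: Json = {}
    record_url(info)

    read_only: Mapping[str, Any] = {}
    # read_only['urls'] = []  # rejected by type checkers: Mapping has no __setitem__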

3 files changed: 27 additions & 36 deletions

tensorflow_datasets/core/download/download_manager_test.py

Lines changed: 1 addition & 1 deletion

@@ -553,7 +553,7 @@ def test_download_url_info_in_info_file_missmatch(self):
         register_checksums=False,
         force_download=True,
     )
-    with self.assertRaisesRegex(ValueError, 'contains a different checksum'):
+    with self.assertRaisesRegex(ValueError, 'contains a different "url_info"'):
       dl_manager.download(a.url)

     # If the url is re-downloaded with the same hash, no error is raised

tensorflow_datasets/core/download/resource.py

Lines changed: 25 additions & 32 deletions

@@ -17,7 +17,6 @@

 import base64
 import codecs
-from collections.abc import Mapping
 import enum
 import itertools
 import json
@@ -30,8 +29,7 @@
 from tensorflow_datasets.core import utils
 from tensorflow_datasets.core.download import checksums as checksums_lib

-# Should be `Union[int, float, bool, str, Dict[str, Json], List[Json]]`
-Json = Mapping[str, Any]
+Json = dict[str, Any]

 _hex_codec = codecs.getdecoder('hex_codec')

@@ -205,9 +203,7 @@ def is_locally_cached(path: epath.Path) -> bool:

 def _read_info(info_path: epath.Path) -> Json:
   """Returns info dict."""
-  if not info_path.exists():
-    return {}
-  return json.loads(info_path.read_text())
+  return json.loads(info_path.read_text()) if info_path.exists() else {}


 # TODO(pierrot): one lock per info path instead of locking everything.
@@ -223,6 +219,22 @@ def read_info_file(info_path: epath.Path) -> Json:
   return _read_info(get_info_path(info_path))


+def _add_value_to_list(info: Json, key: str, value: str) -> None:
+  """Adds `value` to list `key` in `info` dict."""
+  if value and value not in (stored_values := info.get(key, [])):
+    info[key] = stored_values + [value]
+
+
+def _set_value(info_path: epath.Path, info: Json, key: str, value: Any) -> None:
+  """Sets `value` to `key` in `info` dict."""
+  if (stored_value := info.get(key)) and stored_value != value:
+    raise ValueError(
+        f'File info "{info_path}" contains a different "{key}" than the'
+        f' downloaded one: Stored: {stored_value}; Expected: {value}'
+    )
+  info[key] = value
+
+
 @synchronize_decorator
 def write_info_file(
     url: str,
@@ -244,40 +256,21 @@ def write_info_file(
     original_fname: name of file as downloaded.
     url_info: checksums/size info of the url
   """
-  url_info_dict = url_info.asdict()
   info_path = get_info_path(path)
   info = _read_info(info_path)
-  urls = set(info.get('urls', []) + [url])
-  dataset_names = info.get('dataset_names', [])
-  if dataset_name:
-    dataset_names.append(dataset_name)
-  if info.get('original_fname', original_fname) != original_fname:
-    raise ValueError(
-        '`original_fname` "{}" stored in {} does NOT match "{}".'.format(
-            info['original_fname'], info_path, original_fname
-        )
-    )
-  if info.get('url_info', url_info_dict) != url_info_dict:
-    raise ValueError(
-        'File info {} contains a different checksum that the downloaded one: '
-        'Stored: {}; Expected: {}'.format(
-            info_path, info['url_info'], url_info_dict
-        )
-    )
-  info = dict(
-      urls=list(urls),
-      dataset_names=list(set(dataset_names)),
-      original_fname=original_fname,
-      url_info=url_info_dict,
-  )
+
+  _add_value_to_list(info, 'urls', url)
+  _add_value_to_list(info, 'dataset_names', dataset_name)
+  _set_value(info_path, info, 'original_fname', original_fname)
+  _set_value(info_path, info, 'url_info', url_info.asdict())
+
   with utils.atomic_write(info_path, 'w') as info_f:
     json.dump(info, info_f, sort_keys=True)


 def _get_extract_method(path: epath.Path) -> ExtractMethod:
   """Returns `ExtractMethod` to use on resource at path. Cannot be None."""
-  info_path = get_info_path(path)
-  info = _read_info(info_path)
+  info = _read_info(get_info_path(path))
   fname = info.get('original_fname', os.fspath(path))
   return guess_extract_method(fname)
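For illustration, a self-contained sketch of how the two new helpers behave. The bodies are copied from the hunk above; `info_path` is typed loosely to avoid the `etils.epath` dependency, and the file and URL names are hypothetical:

    from typing import Any

    Json = dict[str, Any]

    def _add_value_to_list(info: Json, key: str, value: str) -> None:
      """Adds `value` to list `key` in `info` dict (skips falsy or duplicate values)."""
      if value and value not in (stored_values := info.get(key, [])):
        info[key] = stored_values + [value]

    def _set_value(info_path: Any, info: Json, key: str, value: Any) -> None:
      """Sets `value` to `key` in `info` dict, rejecting conflicting updates."""
      if (stored_value := info.get(key)) and stored_value != value:
        raise ValueError(
            f'File info "{info_path}" contains a different "{key}" than the'
            f' downloaded one: Stored: {stored_value}; Expected: {value}'
        )
      info[key] = value

    info: Json = {}
    _add_value_to_list(info, 'urls', 'https://example.com/a.zip')
    _add_value_to_list(info, 'urls', 'https://example.com/a.zip')  # duplicate: skipped
    _add_value_to_list(info, 'dataset_names', '')                  # falsy: skipped
    _set_value('info.json', info, 'original_fname', 'a.zip')
    _set_value('info.json', info, 'original_fname', 'a.zip')       # same value: ok
    assert info == {'urls': ['https://example.com/a.zip'], 'original_fname': 'a.zip'}
    try:
      _set_value('info.json', info, 'original_fname', 'b.zip')     # conflict
    except ValueError as e:
      print(e)  # File info "info.json" contains a different "original_fname" ...

This is also why the test above changes its expected message: the per-key checks that used to raise bespoke errors in `write_info_file` now all go through `_set_value`, whose message names the conflicting key (here `"url_info"`) instead of saying "checksum".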

tensorflow_datasets/core/utils/py_utils.py

Lines changed: 1 addition & 3 deletions
@@ -361,12 +361,10 @@ def is_incomplete_file(path: epath.Path) -> bool:
 @contextlib.contextmanager
 def atomic_write(path: epath.PathLike, mode: str):
   """Writes to path atomically, by writing to temp file and renaming it."""
-  path = epath.Path(path)
   tmp_path = _tmp_file_name(path)
   with tmp_path.open(mode=mode) as file_:
     yield file_
-  path.unlink(missing_ok=True)
-  tmp_path.rename(path)
+  tmp_path.replace(path)


 def reraise(
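The `atomic_write` simplification leans on rename-over semantics: `Path.replace()` moves the temp file over an existing destination in one call, whereas the old `unlink()` + `rename()` pair briefly left no file at `path`. A minimal sketch using stdlib `pathlib`, whose API `epath.Path` mirrors here (the file names are made up):

    import pathlib
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
      path = pathlib.Path(tmp_dir) / 'info.json'
      path.write_text('{"old": true}')

      # Write the new content to a sibling temp file first...
      tmp_path = path.with_name(path.name + '.incomplete')
      tmp_path.write_text('{"new": true}')

      # ...then move it into place. replace() succeeds even though `path`
      # already exists, and on POSIX the rename is atomic within a single
      # filesystem, so readers see the old file or the new one, never neither.
      tmp_path.replace(path)
      assert path.read_text() == '{"new": true}'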
