Support registering checksums for manually downloaded files.

fineguy · The TensorFlow Datasets Authors · commit 0015e969c3a9 · 2024-09-25T01:21:53.000-07:00
PiperOrigin-RevId: 678584247
diff --git a/tensorflow_datasets/core/download/download_manager.py b/tensorflow_datasets/core/download/download_manager.py
@@ -315,8 +315,12 @@ def downloaded_size(self) -> int:
     """Returns the total size of downloaded files."""
     return sum(url_info.size for url_info in self._recorded_url_infos.values())
 
-  def _get_dl_path(self, url: str, checksum: str | None = None) -> epath.Path:
-    return self._download_dir / resource_lib.get_dl_fname(url, checksum)
+  def _get_dl_path(
+      self, resource: resource_lib.Resource, checksum: str | None = None
+  ) -> epath.Path:
+    return self._download_dir / resource_lib.get_dl_fname(
+        resource.url, checksum
+    )
 
   @property
   def register_checksums(self):
@@ -352,7 +356,7 @@ def _get_manually_downloaded_path(
   @utils.build_synchronize_decorator()
   @utils.memoize()
   def _download(self, resource: Url) -> promise.Promise[epath.Path]:
-    """Download resource, returns Promise->path to downloaded file.
+    """Downloads resource or gets downloaded cache.
 
     This function:
 
@@ -364,13 +368,12 @@ def _download(self, resource: Url) -> promise.Promise[epath.Path]:
       resource: The URL to download.
 
     Returns:
-      path: The path to the downloaded resource.
+      Promise of the path to the downloaded resource.
     """
     # Normalize the input
-    if isinstance(resource, str):
-      url = resource
-    else:
-      url = resource.url
+    if not isinstance(resource, resource_lib.Resource):
+      resource = resource_lib.Resource(url=resource)
+    url = resource.url
     assert url is not None, 'URL is undefined from resource.'
 
     expected_url_info = self._url_infos.get(url)
@@ -382,9 +385,9 @@ def _download(self, resource: Url) -> promise.Promise[epath.Path]:
     manually_downloaded_path = self._get_manually_downloaded_path(
         expected_url_info=expected_url_info
     )
-    url_path = self._get_dl_path(url)
+    url_path = self._get_dl_path(resource)
     checksum_path = (
-        self._get_dl_path(url, expected_url_info.checksum)
+        self._get_dl_path(resource, expected_url_info.checksum)
         if expected_url_info
         else None
     )
@@ -396,12 +399,12 @@ def _download(self, resource: Url) -> promise.Promise[epath.Path]:
         url_path=url_path,
         expected_url_info=expected_url_info,
     )
-    if dl_result.path and not self._force_download:  # Download was cached
+    if dl_result and not self._force_download:  # Download was cached
       logging.info(
           f'Skipping download of {url}: File cached in {dl_result.path}'
       )
       # Still update the progression bar to indicate the file was downloaded
-      self._downloader.increase_tqdm(dl_result)
+      self._downloader.increase_tqdm(dl_result.url_info)
       future = promise.Promise.resolve(dl_result)
     else:
       # Download in a tmp directory next to url_path (to avoid name collisions)
@@ -418,7 +421,7 @@ def _download(self, resource: Url) -> promise.Promise[epath.Path]:
     # Post-process the result
     return future.then(
         lambda dl_result: self._register_or_validate_checksums(  # pylint: disable=g-long-lambda
-            url=url,
+            resource=resource,
             path=dl_result.path,
             computed_url_info=dl_result.url_info,
             expected_url_info=expected_url_info,
@@ -429,10 +432,10 @@ def _download(self, resource: Url) -> promise.Promise[epath.Path]:
 
   def _register_or_validate_checksums(
       self,
+      resource: resource_lib.Resource,
       path: epath.Path,
-      url: str,
       expected_url_info: checksums.UrlInfo | None,
-      computed_url_info: checksums.UrlInfo | None,
+      computed_url_info: checksums.UrlInfo,
       checksum_path: epath.Path | None,
       url_path: epath.Path,
   ) -> epath.Path:
@@ -443,16 +446,11 @@ def _register_or_validate_checksums(
     # * (cached) url_path
     # * `tmp_dir/file` (downloaded path)
 
-    if computed_url_info:
-      # Used both in `.downloaded_size` and `_record_url_infos()`
-      self._recorded_url_infos[url] = computed_url_info
+    url: str = resource.url  # pytype: disable=annotation-type-mismatch
+    # Used both in `.downloaded_size` and `_record_url_infos()`
+    self._recorded_url_infos[url] = computed_url_info
 
     if self._register_checksums:
-      if not computed_url_info:
-        raise ValueError(
-            f'Cannot register checksums for {url}: no computed checksum. '
-            '--register_checksums with manually downloaded data not supported.'
-        )
       # Note:
       # * We save even if `expected_url_info == computed_url_info` as
       #   `expected_url_info` might have been loaded from another dataset.
@@ -463,7 +461,7 @@ def _register_or_validate_checksums(
       # Checksum path should now match the new registered checksum (even if
       # checksums were previously registered)
       expected_url_info = computed_url_info
-      checksum_path = self._get_dl_path(url, computed_url_info.checksum)
+      checksum_path = self._get_dl_path(resource, computed_url_info.checksum)
     else:
       # Eventually validate checksums
       # Note:
@@ -476,9 +474,9 @@ def _register_or_validate_checksums(
       #   was corrupted. Note: The tmp file isn't deleted to allow inspection.
       self._validate_checksums(
           url=url,
-          path=path,
           expected_url_info=expected_url_info,
           computed_url_info=computed_url_info,
+          path=path,
       )
 
     return self._rename_and_get_final_dl_path(
@@ -493,17 +491,14 @@ def _register_or_validate_checksums(
   def _validate_checksums(
       self,
       url: str,
-      path: epath.Path,
-      computed_url_info: checksums.UrlInfo | None,
       expected_url_info: checksums.UrlInfo | None,
+      computed_url_info: checksums.UrlInfo,
+      path: epath.Path,
   ) -> None:
     """Validate computed_url_info match expected_url_info."""
     # If force-checksums validations, both expected and computed url_info
     # should exists
     if self._force_checksums_validation:
-      # Checksum of the downloaded file unknown (for manually downloaded file)
-      if not computed_url_info:
-        computed_url_info = checksums.compute_url_info(path)
       # Checksums have not been registered
       if not expected_url_info:
         raise ValueError(
@@ -512,11 +507,7 @@ def _validate_checksums(
             'Did you forget to register checksums?'
         )
 
-    if (
-        expected_url_info
-        and computed_url_info
-        and expected_url_info != computed_url_info
-    ):
+    if expected_url_info and expected_url_info != computed_url_info:
       msg = (
           f'Artifact {url}, downloaded to {path}, has wrong checksum:\n'
           f'* Expected: {expected_url_info}\n'
diff --git a/tensorflow_datasets/core/download/download_manager_test.py b/tensorflow_datasets/core/download/download_manager_test.py
@@ -46,20 +46,23 @@
 class Artifact:
   # For testing only.
 
-  def __init__(self, name, url=None, content=None):
-    url = url or f'http://foo-bar.ch/{name}'
-    content = content or f'content of {name}'
-    self.url = url
+  def __init__(
+      self, name: str, url: str | None = None, content: str | None = None
+  ):
+    self.name = name
+    self.url = url or f'http://foo-bar.ch/{self.name}'
+    self.content = content or f'content of {self.name}'
+
     self.url_info = checksums_lib.UrlInfo(
-        size=len(content),
-        checksum=checksums_lib.sha256(content),
-        filename=name,
+        size=len(self.content),
+        checksum=checksums_lib.sha256(self.content),
+        filename=self.name,
     )
 
-    self.file_name = resource_lib.get_dl_fname(url, self.url_info.checksum)
+    self.file_name = resource_lib.get_dl_fname(self.url, self.url_info.checksum)
     self.file_path = _DOWNLOAD_DIR / self.file_name
 
-    self.url_name = resource_lib.get_dl_fname(url)
+    self.url_name = resource_lib.get_dl_fname(self.url)
     self.url_path = _DOWNLOAD_DIR / self.url_name
 
     self.manual_path = _MANUAL_DIR / name
@@ -91,17 +94,17 @@ class DownloadManagerTest(testing.TestCase, parameterized.TestCase):
   def _make_downloader_mock(self):
     """`downloader.download` patch which creates the returns the path."""
 
-    def _download(url, tmpdir_path, verify):
+    def _download(url: str, tmpdir_path: epath.Path, verify: bool):
       del verify
       self.downloaded_urls.append(url)  # Record downloader.download() calls
       # If the name isn't explicitly provided, then it is extracted from the
       # url.
       filename = self.dl_fnames.get(url, os.path.basename(url))
       # Save the file in the tmp_dir
-      path = os.path.join(tmpdir_path, filename)
+      path = tmpdir_path / filename
       self.fs.add_file(path)
       dl_result = downloader.DownloadResult(
-          path=epath.Path(path),
+          path=path,
           url_info=self.dl_results[url],
       )
       return promise.Promise.resolve(dl_result)
@@ -224,7 +227,7 @@ def test_manually_downloaded(self):
     a, b = [Artifact(i) for i in 'ab']
 
     # File a is manually downloaded
-    self.fs.add_file(a.manual_path)
+    self.fs.add_file(a.manual_path, content=a.content)
     self.fs.add_file(b.file_path)
 
     self.dl_results[b.url] = b.url_info
@@ -298,8 +301,8 @@ def test_download_and_extract(self):
             b.url: b.url_info,
         }
     )
-    res = manager.download_and_extract({'a': a.url, 'b': b.url})
-    self.assertEqual(res, {'a': a.extract_path, 'b': b.file_path})
+    res = manager.download_and_extract({a.name: a.url, b.name: b.url})
+    self.assertEqual(res, {a.name: a.extract_path, b.name: b.file_path})
 
   def test_download_and_extract_no_manual_dir(self):
     a, b = Artifact('a.zip'), Artifact('b')
@@ -316,8 +319,8 @@ def test_download_and_extract_no_manual_dir(self):
             b.url: b.url_info,
         },
     )
-    res = manager.download_and_extract({'a': a.url, 'b': b.url})
-    self.assertEqual(res, {'a': a.extract_path, 'b': b.file_path})
+    res = manager.download_and_extract({a.name: a.url, b.name: b.url})
+    self.assertEqual(res, {a.name: a.extract_path, b.name: b.file_path})
 
   def test_download_and_extract_archive_ext_in_fname(self):
     # Make sure extraction method is properly deduced from original fname, and
@@ -582,7 +585,7 @@ def test_register_checksums_url_info_already_exists(self):
 
   def test_download_cached_url_path_checksum_updated(self):
     old_a = Artifact('a.tar.gz')
-    new_a = Artifact('a.tar.gz', content='New a content')  # New checksums
+    new_a = Artifact('a.tar.gz', content='New content')  # New checksums
 
     # Urls are equals, but not checksums
     self.assertEqual(old_a.url, new_a.url)
diff --git a/tensorflow_datasets/core/download/downloader.py b/tensorflow_datasets/core/download/downloader.py
@@ -52,8 +52,8 @@
 
 @dataclasses.dataclass(eq=False, frozen=True)
 class DownloadResult:
-  path: epath.Path | None
-  url_info: checksums_lib.UrlInfo | None
+  path: epath.Path
+  url_info: checksums_lib.UrlInfo
 
 
 @utils.memoize()
@@ -80,7 +80,7 @@ def get_cached_path(
     checksum_path: epath.Path | None,
     url_path: epath.Path,
     expected_url_info: checksums_lib.UrlInfo | None,
-) -> DownloadResult:
+) -> DownloadResult | None:
   """Returns the downloaded path and computed url-info.
 
   If the path is not cached, or that `url_path` does not match checksums,
@@ -96,7 +96,10 @@ def get_cached_path(
   """
   # User has manually downloaded the file.
   if manually_downloaded_path and manually_downloaded_path.exists():
-    return DownloadResult(path=manually_downloaded_path, url_info=None)
+    computed_url_info = checksums_lib.compute_url_info(manually_downloaded_path)
+    return DownloadResult(
+        path=manually_downloaded_path, url_info=computed_url_info
+    )
 
   # Download has been cached (checksum known)
   elif checksum_path and resource_lib.Resource.exists_locally(checksum_path):
@@ -110,13 +113,13 @@ def get_cached_path(
     # If checksums are now registered but do not match, trigger a new
     # download (e.g. previous file corrupted, checksums updated)
     if expected_url_info and computed_url_info != expected_url_info:
-      return DownloadResult(path=None, url_info=None)
+      return None
     else:
       return DownloadResult(path=url_path, url_info=computed_url_info)
 
   # Else file not found (or has bad checksums). (re)download.
   else:
-    return DownloadResult(path=None, url_info=None)
+    return None
 
 
 def _filename_from_content_disposition(
@@ -216,13 +219,12 @@ def tqdm(self) -> Iterator[None]:
         self._pbar_dl_size = pbar_dl_size
         yield
 
-  def increase_tqdm(self, dl_result: DownloadResult) -> None:
-    """Update the tqdm bars to visually indicate the dl_result is downloaded."""
+  def increase_tqdm(self, url_info: checksums_lib.UrlInfo) -> None:
+    """Update the tqdm bars to visually indicate the url_info is downloaded."""
     self._pbar_url.update_total(1)
     self._pbar_url.update(1)
-    if dl_result.url_info:  # Info unknown for manually downloaded files
-      self._pbar_dl_size.update_total(dl_result.url_info.size)
-      self._pbar_dl_size.update(dl_result.url_info.size)
+    self._pbar_dl_size.update_total(url_info.size)
+    self._pbar_dl_size.update(url_info.size)
 
   def download(
       self, url: str, destination_path: epath.Path, verify: bool = True
diff --git a/tensorflow_datasets/testing/test_utils.py b/tensorflow_datasets/testing/test_utils.py
@@ -202,7 +202,7 @@ def _validate_out(self, out):
   def add_file(self, path, content=None) -> None:
     """Add a file, creating all parent directories."""
     path = os.fspath(path)
-    content = f'Content of {path}' if content is None else content
+    content = content or f'Content of {path}'
     fpath = self._to_tmp(path)
     fpath.parent.mkdir(parents=True, exist_ok=True)  # pytype: disable=attribute-error
     fpath.write_text(content)  # pytype: disable=attribute-error