Commit f68e7c9

Merge pull request #13441 from notatallshaw/cache-final-file-from-resume-retry

Cache final file from resume retry process

2 parents: 98ac4a2 + 67d6639

4 files changed: +120, −5

news/13441.bugfix.rst (+1, −0)

@@ -0,0 +1 @@
+Resumed downloads are saved to the HTTP cache like any other normal download.

src/pip/_internal/network/cache.py (+16, −3)

@@ -3,10 +3,11 @@
 from __future__ import annotations
 
 import os
+import shutil
 from collections.abc import Generator
 from contextlib import contextmanager
 from datetime import datetime
-from typing import BinaryIO
+from typing import Any, BinaryIO, Callable
 
 from pip._vendor.cachecontrol.cache import SeparateBodyBaseCache
 from pip._vendor.cachecontrol.caches import SeparateBodyFileCache
@@ -72,12 +73,13 @@ def get(self, key: str) -> bytes | None:
         with open(metadata_path, "rb") as f:
             return f.read()
 
-    def _write(self, path: str, data: bytes) -> None:
+    def _write_to_file(self, path: str, writer_func: Callable[[BinaryIO], Any]) -> None:
+        """Common file writing logic with proper permissions and atomic replacement."""
         with suppressed_cache_errors():
             ensure_dir(os.path.dirname(path))
 
             with adjacent_tmp_file(path) as f:
-                f.write(data)
+                writer_func(f)
                 # Inherit the read/write permissions of the cache directory
                 # to enable multi-user cache use-cases.
                 mode = (
@@ -93,6 +95,12 @@ def _write(self, path: str, data: bytes) -> None:
 
             replace(f.name, path)
 
+    def _write(self, path: str, data: bytes) -> None:
+        self._write_to_file(path, lambda f: f.write(data))
+
+    def _write_from_io(self, path: str, source_file: BinaryIO) -> None:
+        self._write_to_file(path, lambda f: shutil.copyfileobj(source_file, f))
+
     def set(
         self, key: str, value: bytes, expires: int | datetime | None = None
     ) -> None:
@@ -118,3 +126,8 @@ def get_body(self, key: str) -> BinaryIO | None:
     def set_body(self, key: str, body: bytes) -> None:
         path = self._get_cache_path(key) + ".body"
         self._write(path, body)
+
+    def set_body_from_io(self, key: str, body_file: BinaryIO) -> None:
+        """Set the body of the cache entry from a file object."""
+        path = self._get_cache_path(key) + ".body"
+        self._write_from_io(path, body_file)
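
The refactor above funnels both byte-buffer writes and streaming writes through one temp-file-plus-atomic-replace path, so a crashed write can never leave a truncated cache entry behind. Below is a simplified, stdlib-only sketch of that pattern. It is hedged: it uses tempfile.mkstemp and os.replace rather than pip's adjacent_tmp_file and replace helpers, skips the permission handling, and the name atomic_stream_write is illustrative, not pip's.

import os
import shutil
import tempfile
from typing import BinaryIO


def atomic_stream_write(path: str, source: BinaryIO) -> None:
    """Stream `source` into `path` without ever exposing a partial file."""
    directory = os.path.dirname(path) or "."
    os.makedirs(directory, exist_ok=True)
    # Create the temp file next to the destination so the final rename
    # stays on the same filesystem (a cross-device rename is not atomic).
    fd, tmp_path = tempfile.mkstemp(dir=directory)
    try:
        with os.fdopen(fd, "wb") as tmp:
            # copyfileobj copies in chunks, so a multi-gigabyte body is
            # never held in memory -- the motivation for _write_from_io.
            shutil.copyfileobj(source, tmp)
        os.replace(tmp_path, path)  # atomic on both POSIX and Windows
    except BaseException:
        os.unlink(tmp_path)
        raise

Routing the old _write through the same helper via a lambda keeps one copy of the error-suppression and permission logic, which is the point of the _write_to_file extraction.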

src/pip/_internal/network/download.py (+66, −2)

@@ -11,15 +11,18 @@
 from http import HTTPStatus
 from typing import BinaryIO
 
+from pip._vendor.requests import PreparedRequest
 from pip._vendor.requests.models import Response
+from pip._vendor.urllib3 import HTTPResponse as URLlib3Response
+from pip._vendor.urllib3._collections import HTTPHeaderDict
 from pip._vendor.urllib3.exceptions import ReadTimeoutError
 
 from pip._internal.cli.progress_bars import get_download_progress_renderer
 from pip._internal.exceptions import IncompleteDownloadError, NetworkConnectionError
 from pip._internal.models.index import PyPI
 from pip._internal.models.link import Link
-from pip._internal.network.cache import is_from_cache
-from pip._internal.network.session import PipSession
+from pip._internal.network.cache import SafeFileCache, is_from_cache
+from pip._internal.network.session import CacheControlAdapter, PipSession
 from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
 from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext
 
@@ -250,6 +253,67 @@ def _attempt_resumes_or_redownloads(
             os.remove(download.output_file.name)
             raise IncompleteDownloadError(download)
 
+        # If we successfully completed the download via resume, manually cache it
+        # as a complete response to enable future caching
+        if download.reattempts > 0:
+            self._cache_resumed_download(download, first_resp)
+
+    def _cache_resumed_download(
+        self, download: _FileDownload, original_response: Response
+    ) -> None:
+        """
+        Manually cache a file that was successfully downloaded via resume retries.
+
+        cachecontrol doesn't cache 206 (Partial Content) responses, since they
+        are not complete files. This method manually adds the final file to the
+        cache as though it was downloaded in a single request, so that future
+        requests can use the cache.
+        """
+        url = download.link.url_without_fragment
+        adapter = self._session.get_adapter(url)
+
+        # Check if the adapter is the CacheControlAdapter (i.e. caching is enabled)
+        if not isinstance(adapter, CacheControlAdapter):
+            logger.debug(
+                "Skipping resume download caching: no cache controller for %s", url
+            )
+            return
+
+        # Check SafeFileCache is being used
+        assert isinstance(
+            adapter.cache, SafeFileCache
+        ), "separate body cache not in use!"
+
+        synthetic_request = PreparedRequest()
+        synthetic_request.prepare(method="GET", url=url, headers={})
+
+        synthetic_response_headers = HTTPHeaderDict()
+        for key, value in original_response.headers.items():
+            if key.lower() not in ["content-range", "content-length"]:
+                synthetic_response_headers[key] = value
+        synthetic_response_headers["content-length"] = str(download.size)
+
+        synthetic_response = URLlib3Response(
+            body="",
+            headers=synthetic_response_headers,
+            status=200,
+            preload_content=False,
+        )
+
+        # Save metadata and then stream the file contents to cache.
+        cache_url = adapter.controller.cache_url(url)
+        metadata_blob = adapter.controller.serializer.dumps(
+            synthetic_request, synthetic_response, b""
+        )
+        adapter.cache.set(cache_url, metadata_blob)
+        download.output_file.flush()
+        with open(download.output_file.name, "rb") as f:
+            adapter.cache.set_body_from_io(cache_url, f)
+
+        logger.debug(
+            "Cached resumed download as complete response for future use: %s", url
+        )
+
     def _http_get_resume(
         self, download: _FileDownload, should_match: Response
     ) -> Response:
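
The core trick in _cache_resumed_download is header surgery: the transfer-specific headers from the partial (206) exchange are dropped and content-length is rewritten to the full file size, so the cached entry looks like an ordinary complete 200 response. Here is a minimal standalone sketch of just that step, using a plain dict instead of urllib3's HTTPHeaderDict; the function name and inputs are illustrative, not part of the PR.

def synthesize_complete_headers(
    original_headers: dict[str, str], total_size: int
) -> dict[str, str]:
    # Keep validators and caching metadata (ETag, Last-Modified,
    # Cache-Control): they describe the resource, not the partial transfer.
    headers = {
        key: value
        for key, value in original_headers.items()
        if key.lower() not in ("content-range", "content-length")
    }
    # Advertise the size of the fully reassembled file.
    headers["content-length"] = str(total_size)
    return headers


# Example: a 206 covering the last 12 bytes of a 36-byte file becomes a
# synthetic header set for the whole file.
assert synthesize_complete_headers(
    {"etag": '"abc123"', "content-range": "bytes 24-35/36", "content-length": "12"},
    36,
) == {"etag": '"abc123"', "content-length": "36"}

Note also that the metadata is serialized with an empty body (b"") while the actual bytes go through set_body_from_io: that split is exactly what SeparateBodyBaseCache exists for, and it lets the completed download be streamed from disk into the cache rather than loaded into memory.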

tests/unit/test_network_download.py (+37, −0)

@@ -350,3 +350,40 @@ def test_downloader(
 
     # Make sure that the downloader makes additional requests for resumption
     _http_get_mock.assert_has_calls(calls)
+
+
+def test_resumed_download_caching(tmpdir: Path) -> None:
+    """Test that resumed downloads are cached properly for future use."""
+    cache_dir = tmpdir / "cache"
+    session = PipSession(cache=str(cache_dir))
+    link = Link("https://example.com/foo.tgz")
+    downloader = Downloader(session, "on", resume_retries=5)
+
+    # Mock an incomplete download followed by a successful resume
+    incomplete_resp = MockResponse(b"0cfa7e9d-1868-4dd7-9fb3-")
+    incomplete_resp.headers = {"content-length": "36"}
+    incomplete_resp.status_code = 200
+
+    resume_resp = MockResponse(b"f2561d5dfd89")
+    resume_resp.headers = {"content-length": "12"}
+    resume_resp.status_code = 206
+
+    responses = [incomplete_resp, resume_resp]
+    _http_get_mock = MagicMock(side_effect=responses)
+
+    with patch.object(Downloader, "_http_get", _http_get_mock):
+        # Perform the download (incomplete then resumed)
+        filepath, _ = downloader(link, str(tmpdir))
+
+        # Verify the file was downloaded correctly
+        with open(filepath, "rb") as downloaded_file:
+            downloaded_bytes = downloaded_file.read()
+        expected_bytes = b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89"
+        assert downloaded_bytes == expected_bytes
+
+        # Verify that the cache directory was created and contains cache files
+        # The resumed download should have been cached for future use
+        assert cache_dir.exists()
+        cache_files = list(cache_dir.rglob("*"))
+        # Should have cache files (both metadata and body files)
+        assert len([f for f in cache_files if f.is_file()]) == 2
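
The test asserts that exactly two cache files exist (the metadata file and the separate ".body" file). A natural follow-on check, not part of this PR, would assert that the cached body round-trips through SafeFileCache. The sketch below assumes PipSession keeps its HTTP cache under an "http-v2" subdirectory of the given cache path and that the cache key is the bare URL (cachecontrol's CacheController.cache_url returns the URL unchanged); both assumptions should be verified against the pip version in use.

import os

from pip._internal.network.cache import SafeFileCache


def assert_cached_body_round_trips(cache_dir: str, url: str, expected: bytes) -> None:
    # Hypothetical helper: read the body back through the same cache class
    # the downloader wrote it with.
    cache = SafeFileCache(os.path.join(cache_dir, "http-v2"))  # assumed layout
    body_file = cache.get_body(url)  # BinaryIO | None
    assert body_file is not None, "resumed download was not cached"
    with body_file:
        assert body_file.read() == expected

Called at the end of the test above as, e.g., assert_cached_body_round_trips(str(cache_dir), link.url_without_fragment, expected_bytes), this would confirm the streamed bytes match what a future request would be served from cache.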
