Commit f68e7c9

Merge pull request #13441 from notatallshaw/cache-final-file-from-resume-retry

Cache final file from resume retry process

2 parents: 98ac4a2 + 67d6639

4 files changed: +120, −5

news/13441.bugfix.rst (+1, −0)

@@ -0,0 +1 @@
+Resumed downloads are saved to the HTTP cache like any other normal download.

src/pip/_internal/network/cache.py (+16, −3)

@@ -3,10 +3,11 @@
 from __future__ import annotations
 
 import os
+import shutil
 from collections.abc import Generator
 from contextlib import contextmanager
 from datetime import datetime
-from typing import BinaryIO
+from typing import Any, BinaryIO, Callable
 
 from pip._vendor.cachecontrol.cache import SeparateBodyBaseCache
 from pip._vendor.cachecontrol.caches import SeparateBodyFileCache
@@ -72,12 +73,13 @@ def get(self, key: str) -> bytes | None:
         with open(metadata_path, "rb") as f:
             return f.read()
 
-    def _write(self, path: str, data: bytes) -> None:
+    def _write_to_file(self, path: str, writer_func: Callable[[BinaryIO], Any]) -> None:
+        """Common file writing logic with proper permissions and atomic replacement."""
         with suppressed_cache_errors():
             ensure_dir(os.path.dirname(path))
 
             with adjacent_tmp_file(path) as f:
-                f.write(data)
+                writer_func(f)
                 # Inherit the read/write permissions of the cache directory
                 # to enable multi-user cache use-cases.
                 mode = (
@@ -93,6 +95,12 @@ def _write(self, path: str, data: bytes) -> None:
 
             replace(f.name, path)
 
+    def _write(self, path: str, data: bytes) -> None:
+        self._write_to_file(path, lambda f: f.write(data))
+
+    def _write_from_io(self, path: str, source_file: BinaryIO) -> None:
+        self._write_to_file(path, lambda f: shutil.copyfileobj(source_file, f))
+
     def set(
         self, key: str, value: bytes, expires: int | datetime | None = None
     ) -> None:
@@ -118,3 +126,8 @@ def get_body(self, key: str) -> BinaryIO | None:
     def set_body(self, key: str, body: bytes) -> None:
         path = self._get_cache_path(key) + ".body"
         self._write(path, body)
+
+    def set_body_from_io(self, key: str, body_file: BinaryIO) -> None:
+        """Set the body of the cache entry from a file object."""
+        path = self._get_cache_path(key) + ".body"
+        self._write_from_io(path, body_file)
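
The refactor above funnels both byte-buffer writes and streaming writes through one temp-file-plus-atomic-replace path, so a crashed write can never leave a truncated cache entry behind. Below is a simplified, stdlib-only sketch of that pattern. It is hedged: it uses tempfile.mkstemp and os.replace rather than pip's adjacent_tmp_file and replace helpers, skips the permission handling, and the name atomic_stream_write is illustrative, not pip's.

import os
import shutil
import tempfile
from typing import BinaryIO


def atomic_stream_write(path: str, source: BinaryIO) -> None:
    """Stream `source` into `path` without ever exposing a partial file."""
    directory = os.path.dirname(path) or "."
    os.makedirs(directory, exist_ok=True)
    # Create the temp file next to the destination so the final rename
    # stays on the same filesystem (a cross-device rename is not atomic).
    fd, tmp_path = tempfile.mkstemp(dir=directory)
    try:
        with os.fdopen(fd, "wb") as tmp:
            # copyfileobj copies in chunks, so a multi-gigabyte body is
            # never held in memory -- the motivation for _write_from_io.
            shutil.copyfileobj(source, tmp)
        os.replace(tmp_path, path)  # atomic on both POSIX and Windows
    except BaseException:
        os.unlink(tmp_path)
        raise

Routing the old _write through the same helper via a lambda keeps one copy of the error-suppression and permission logic, which is the point of the _write_to_file extraction.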

src/pip/_internal/network/download.py (+66, −2)

@@ -11,15 +11,18 @@
 from http import HTTPStatus
 from typing import BinaryIO
 
+from pip._vendor.requests import PreparedRequest
 from pip._vendor.requests.models import Response
+from pip._vendor.urllib3 import HTTPResponse as URLlib3Response
+from pip._vendor.urllib3._collections import HTTPHeaderDict
 from pip._vendor.urllib3.exceptions import ReadTimeoutError
 
 from pip._internal.cli.progress_bars import get_download_progress_renderer
 from pip._internal.exceptions import IncompleteDownloadError, NetworkConnectionError
 from pip._internal.models.index import PyPI
 from pip._internal.models.link import Link
-from pip._internal.network.cache import is_from_cache
-from pip._internal.network.session import PipSession
+from pip._internal.network.cache import SafeFileCache, is_from_cache
+from pip._internal.network.session import CacheControlAdapter, PipSession
 from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
 from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext
 
@@ -250,6 +253,67 @@ def _attempt_resumes_or_redownloads(
             os.remove(download.output_file.name)
             raise IncompleteDownloadError(download)
 
+        # If we successfully completed the download via resume, manually cache it
+        # as a complete response to enable future caching
+        if download.reattempts > 0:
+            self._cache_resumed_download(download, first_resp)
+
+    def _cache_resumed_download(
+        self, download: _FileDownload, original_response: Response
+    ) -> None:
+        """
+        Manually cache a file that was successfully downloaded via resume retries.
+
+        cachecontrol doesn't cache 206 (Partial Content) responses, since they
+        are not complete files. This method manually adds the final file to the
+        cache as though it was downloaded in a single request, so that future
+        requests can use the cache.
+        """
+        url = download.link.url_without_fragment
+        adapter = self._session.get_adapter(url)
+
+        # Check if the adapter is the CacheControlAdapter (i.e. caching is enabled)
+        if not isinstance(adapter, CacheControlAdapter):
+            logger.debug(
+                "Skipping resume download caching: no cache controller for %s", url
+            )
+            return
+
+        # Check SafeFileCache is being used
+        assert isinstance(
+            adapter.cache, SafeFileCache
+        ), "separate body cache not in use!"
+
+        synthetic_request = PreparedRequest()
+        synthetic_request.prepare(method="GET", url=url, headers={})
+
+        synthetic_response_headers = HTTPHeaderDict()
+        for key, value in original_response.headers.items():
+            if key.lower() not in ["content-range", "content-length"]:
+                synthetic_response_headers[key] = value
+        synthetic_response_headers["content-length"] = str(download.size)
+
+        synthetic_response = URLlib3Response(
+            body="",
+            headers=synthetic_response_headers,
+            status=200,
+            preload_content=False,
+        )
+
+        # Save metadata and then stream the file contents to cache.
+        cache_url = adapter.controller.cache_url(url)
+        metadata_blob = adapter.controller.serializer.dumps(
+            synthetic_request, synthetic_response, b""
+        )
+        adapter.cache.set(cache_url, metadata_blob)
+        download.output_file.flush()
+        with open(download.output_file.name, "rb") as f:
+            adapter.cache.set_body_from_io(cache_url, f)
+
+        logger.debug(
+            "Cached resumed download as complete response for future use: %s", url
+        )
+
     def _http_get_resume(
         self, download: _FileDownload, should_match: Response
     ) -> Response:
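
The core trick in _cache_resumed_download is header surgery: the transfer-specific headers from the partial (206) exchange are dropped and content-length is rewritten to the full file size, so the cached entry looks like an ordinary complete 200 response. Here is a minimal standalone sketch of just that step, using a plain dict instead of urllib3's HTTPHeaderDict; the function name and inputs are illustrative, not part of the PR.

def synthesize_complete_headers(
    original_headers: dict[str, str], total_size: int
) -> dict[str, str]:
    # Keep validators and caching metadata (ETag, Last-Modified,
    # Cache-Control): they describe the resource, not the partial transfer.
    headers = {
        key: value
        for key, value in original_headers.items()
        if key.lower() not in ("content-range", "content-length")
    }
    # Advertise the size of the fully reassembled file.
    headers["content-length"] = str(total_size)
    return headers


# Example: a 206 covering the last 12 bytes of a 36-byte file becomes a
# synthetic header set for the whole file.
assert synthesize_complete_headers(
    {"etag": '"abc123"', "content-range": "bytes 24-35/36", "content-length": "12"},
    36,
) == {"etag": '"abc123"', "content-length": "36"}

Note also that the metadata is serialized with an empty body (b"") while the actual bytes go through set_body_from_io: that split is exactly what SeparateBodyBaseCache exists for, and it lets the completed download be streamed from disk into the cache rather than loaded into memory.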

tests/unit/test_network_download.py (+37, −0)

@@ -350,3 +350,40 @@ def test_downloader(
 
     # Make sure that the downloader makes additional requests for resumption
     _http_get_mock.assert_has_calls(calls)
+
+
+def test_resumed_download_caching(tmpdir: Path) -> None:
+    """Test that resumed downloads are cached properly for future use."""
+    cache_dir = tmpdir / "cache"
+    session = PipSession(cache=str(cache_dir))
+    link = Link("https://example.com/foo.tgz")
+    downloader = Downloader(session, "on", resume_retries=5)
+
+    # Mock an incomplete download followed by a successful resume
+    incomplete_resp = MockResponse(b"0cfa7e9d-1868-4dd7-9fb3-")
+    incomplete_resp.headers = {"content-length": "36"}
+    incomplete_resp.status_code = 200
+
+    resume_resp = MockResponse(b"f2561d5dfd89")
+    resume_resp.headers = {"content-length": "12"}
+    resume_resp.status_code = 206
+
+    responses = [incomplete_resp, resume_resp]
+    _http_get_mock = MagicMock(side_effect=responses)
+
+    with patch.object(Downloader, "_http_get", _http_get_mock):
+        # Perform the download (incomplete then resumed)
+        filepath, _ = downloader(link, str(tmpdir))
+
+        # Verify the file was downloaded correctly
+        with open(filepath, "rb") as downloaded_file:
+            downloaded_bytes = downloaded_file.read()
+        expected_bytes = b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89"
+        assert downloaded_bytes == expected_bytes
+
+        # Verify that the cache directory was created and contains cache files
+        # The resumed download should have been cached for future use
+        assert cache_dir.exists()
+        cache_files = list(cache_dir.rglob("*"))
+        # Should have cache files (both metadata and body files)
+        assert len([f for f in cache_files if f.is_file()]) == 2
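
The test asserts that exactly two cache files exist (the metadata file and the separate ".body" file). A natural follow-on check, not part of this PR, would assert that the cached body round-trips through SafeFileCache. The sketch below assumes PipSession keeps its HTTP cache under an "http-v2" subdirectory of the given cache path and that the cache key is the bare URL (cachecontrol's CacheController.cache_url returns the URL unchanged); both assumptions should be verified against the pip version in use.

import os

from pip._internal.network.cache import SafeFileCache


def assert_cached_body_round_trips(cache_dir: str, url: str, expected: bytes) -> None:
    # Hypothetical helper: read the body back through the same cache class
    # the downloader wrote it with.
    cache = SafeFileCache(os.path.join(cache_dir, "http-v2"))  # assumed layout
    body_file = cache.get_body(url)  # BinaryIO | None
    assert body_file is not None, "resumed download was not cached"
    with body_file:
        assert body_file.read() == expected

Called at the end of the test above as, e.g., assert_cached_body_round_trips(str(cache_dir), link.url_without_fragment, expected_bytes), this would confirm the streamed bytes match what a future request would be served from cache.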
