Skip to content

Commit 17f3703

Browse files
committed
Cache final file from resume retry process
1 parent a5cd068 commit 17f3703

File tree

2 files changed

+125
-2
lines changed

2 files changed

+125
-2
lines changed

src/pip/_internal/network/download.py

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,18 @@
1111
from http import HTTPStatus
1212
from typing import BinaryIO
1313

14+
from pip._vendor.requests import PreparedRequest
1415
from pip._vendor.requests.models import Response
16+
from pip._vendor.urllib3 import HTTPResponse as URLlib3Response
17+
from pip._vendor.urllib3._collections import HTTPHeaderDict
1518
from pip._vendor.urllib3.exceptions import ReadTimeoutError
1619

1720
from pip._internal.cli.progress_bars import get_download_progress_renderer
1821
from pip._internal.exceptions import IncompleteDownloadError, NetworkConnectionError
1922
from pip._internal.models.index import PyPI
2023
from pip._internal.models.link import Link
2124
from pip._internal.network.cache import is_from_cache
22-
from pip._internal.network.session import PipSession
25+
from pip._internal.network.session import CacheControlAdapter, PipSession
2326
from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
2427
from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext
2528

@@ -250,6 +253,67 @@ def _attempt_resumes_or_redownloads(
250253
os.remove(download.output_file.name)
251254
raise IncompleteDownloadError(download)
252255

256+
# If we successfully completed the download via resume, manually cache it
257+
# as a complete response to enable future caching
258+
if download.reattempts > 0:
259+
self._cache_resumed_download(download, first_resp)
260+
261+
def _cache_resumed_download(
    self, download: _FileDownload, original_response: Response
) -> None:
    """
    Add a file that was completed through resume retries to the HTTP cache.

    cachecontrol doesn't cache 206 (Partial Content) responses, since they
    are not complete files. To make the finished file available to future
    requests, it is handed to the adapter's cache controller as a synthetic
    ``200`` response to a plain ``GET`` of the link URL, as though it had
    arrived in a single request.

    :param download: The finished download. Its link URL, ``size`` and
        output file are used to build the cache entry.
    :param original_response: Response whose headers are copied onto the
        synthetic response, except ``Content-Range`` and ``Content-Length``.

    Nothing is cached when the URL is neither ``https://`` nor ``http://``,
    or when the adapter registered for that scheme is not a
    ``CacheControlAdapter``.
    """
    url = download.link.url_without_fragment

    # NOTE(review): the adapter is fetched by its bare scheme key rather than
    # through the session's own URL-to-adapter resolution, so any adapter
    # mounted on a longer prefix is not consulted here -- confirm intended.
    for scheme in ("https://", "http://"):
        if url.startswith(scheme):
            adapter = self._session.adapters[scheme]
            break
    else:
        # Neither HTTP scheme matched: there is no adapter to cache through.
        return

    # Only a CacheControlAdapter carries the controller used below.
    if not isinstance(adapter, CacheControlAdapter):
        logger.debug(
            "Skipping resume download caching: no cache controller for %s", url
        )
        return

    request = PreparedRequest()
    request.prepare(method="GET", url=url, headers={})

    # Reuse the original headers, but describe the whole file rather than
    # whatever range/length the original response advertised.
    headers = HTTPHeaderDict()
    for name, value in original_response.headers.items():
        if name.lower() in ["content-range", "content-length"]:
            continue
        headers[name] = value
    headers["content-length"] = str(download.size)

    response = URLlib3Response(
        body="",
        headers=headers,
        status=200,
        preload_content=False,
    )

    # Flush pending writes so that re-opening the file by name sees all of
    # the downloaded bytes.
    download.output_file.flush()
    with open(download.output_file.name, "rb") as completed_file:
        payload = completed_file.read()
        adapter.controller.cache_response(
            request,
            response,
            body=payload,
            status_codes=(200, 203, 300, 301, 308),
        )

    logger.debug(
        "Cached resumed download as complete response for future use: %s", url
    )
316+
253317
def _http_get_resume(
254318
self, download: _FileDownload, should_match: Response
255319
) -> Response:

tests/unit/test_network_download.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
parse_content_disposition,
1717
sanitize_content_filename,
1818
)
19-
from pip._internal.network.session import PipSession
19+
from pip._internal.network.session import CacheControlAdapter, PipSession
2020
from pip._internal.network.utils import HEADERS
2121

2222
from tests.lib.requests_mocks import MockResponse
@@ -350,3 +350,62 @@ def test_downloader(
350350

351351
# Make sure that the downloader makes additional requests for resumption
352352
_http_get_mock.assert_has_calls(calls)
353+
354+
355+
def test_resumed_download_caching(tmpdir: Path) -> None:
    """A download finished by a resume request is passed to the cache controller."""
    link = Link("http://example.com/foo.tgz")
    session = PipSession()
    downloader = Downloader(session, "on", resume_retries=5)

    # The first response advertises 36 bytes but only carries the first 24.
    truncated = MockResponse(b"0cfa7e9d-1868-4dd7-9fb3-")
    truncated.headers = {"content-length": "36"}
    truncated.status_code = 200

    # The follow-up partial-content response supplies the remaining 12 bytes.
    remainder = MockResponse(b"f2561d5dfd89")
    remainder.headers = {"content-length": "12"}
    remainder.status_code = 206

    http_get = MagicMock(side_effect=[truncated, remainder])

    # Stand-in adapter that passes the CacheControlAdapter isinstance check
    # and exposes a controller whose calls can be inspected.
    controller = MagicMock()
    caching_adapter = MagicMock(spec=CacheControlAdapter)
    caching_adapter.controller = controller

    # Any key looked up in session.adapters yields the stand-in adapter.
    adapters = MagicMock()
    adapters.__getitem__ = MagicMock(return_value=caching_adapter)

    with patch.object(Downloader, "_http_get", http_get), patch.object(
        session, "adapters", adapters
    ):
        filepath, _ = downloader(link, str(tmpdir))

    # The file on disk must be the two pieces joined together.
    complete = b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89"
    with open(filepath, "rb") as fh:
        on_disk = fh.read()
    assert on_disk == complete

    # Exactly one cache write is expected for the resumed download.
    controller.cache_response.assert_called_once()
    recorded = controller.cache_response.call_args
    assert recorded is not None

    # Request and response are passed positionally; body and status codes
    # are passed by keyword.
    (request, response), kwargs = recorded
    body = kwargs.get("body")
    status_codes = kwargs.get("status_codes")

    assert body == complete, "Cached body should match complete file content"
    assert response.status == 200, "Cached response should have status 200"
    assert request.url == link.url_without_fragment
    assert 200 in status_codes

0 commit comments

Comments
 (0)