66import logging
77import mimetypes
88import os
9- from collections .abc import Iterable
9+ from collections .abc import Iterable , Mapping
10+ from dataclasses import dataclass
1011from http import HTTPStatus
1112from typing import BinaryIO
1213
@@ -40,7 +41,7 @@ def _get_http_response_etag_or_last_modified(resp: Response) -> str | None:
4041 return resp .headers .get ("etag" , resp .headers .get ("last-modified" ))
4142
4243
43- def _prepare_download (
44+ def _log_download (
4445 resp : Response ,
4546 link : Link ,
4647 progress_bar : str ,
@@ -134,28 +135,28 @@ def _get_http_response_filename(resp: Response, link: Link) -> str:
134135 return filename
135136
136137
137- def _http_get_download (
138- session : PipSession ,
139- link : Link ,
140- range_start : int | None = 0 ,
141- if_range : str | None = None ,
142- ) -> Response :
143- target_url = link . url . split ( "#" , 1 )[ 0 ]
144- headers = HEADERS . copy ()
145- # request a partial download
146- if range_start :
147- headers [ "Range" ] = f"bytes= { range_start } -"
148- # make sure the file hasn't changed
149- if if_range :
150- headers [ "If-Range" ] = if_range
151- try :
152- resp = session . get ( target_url , headers = headers , stream = True )
153- raise_for_status ( resp )
154- except NetworkConnectionError as e :
155- assert e . response is not None
156- logger . critical ( "HTTP error %s while getting %s" , e . response . status_code , link )
157- raise
158- return resp
@dataclass
class _FileDownload:
    """Stores the state of a single link download.

    Keeps the running byte count next to the output file so that an
    interrupted transfer can be detected and then resumed or restarted.
    """

    link: Link
    output_file: BinaryIO
    # Expected total size in bytes; None when the size is not known.
    size: int | None
    # Number of bytes written to output_file so far.
    bytes_received: int = 0
    # Number of resume/restart attempts made for this download.
    reattempts: int = 0

    def is_incomplete(self) -> bool:
        """Return True if fewer bytes than the known size were received.

        A download whose size is unknown is never reported as incomplete,
        since there is nothing to compare the byte count against.
        """
        return self.size is not None and self.bytes_received < self.size

    def write_chunk(self, data: bytes) -> None:
        """Write data to the output file and add its length to the progress."""
        # Write before counting: if the write raises, bytes_received must not
        # claim bytes that never reached the file, because that counter is
        # used as the offset when the download is resumed.
        self.output_file.write(data)
        self.bytes_received += len(data)

    def reset_file(self) -> None:
        """Delete any saved data and reset progress to zero."""
        self.output_file.seek(0)
        self.output_file.truncate()
        self.bytes_received = 0
159160
160161
161162class Downloader :
@@ -172,146 +173,106 @@ def __init__(
172173 self ._progress_bar = progress_bar
173174 self ._resume_retries = resume_retries
174175
175- def __call__ (self , link : Link , location : str ) -> tuple [str , str ]:
176- """Download the file given by link into location."""
177- resp = _http_get_download (self ._session , link )
178- # NOTE: The original download size needs to be passed down everywhere
179- # so if the download is resumed (with a HTTP Range request) the progress
180- # bar will report the right size.
181- total_length = _get_http_response_size (resp )
182- content_type = resp .headers .get ("Content-Type" , "" )
def batch(
    self, links: Iterable[Link], location: str
) -> Iterable[tuple[Link, tuple[str, str]]]:
    """Convenience method to download multiple links.

    Downloads are performed lazily, one link at a time and in input order,
    by calling this downloader on each link. Yields
    ``(link, (filepath, content_type))`` after each download returns.
    """
    for link in links:
        filepath, content_type = self(link, location)
        yield link, (filepath, content_type)
183183
184- filename = _get_http_response_filename (resp , link )
185- filepath = os .path .join (location , filename )
def __call__(self, link: Link, location: str) -> tuple[str, str]:
    """Download a link and save it under location.

    Returns a ``(filepath, content_type)`` tuple, where ``content_type`` is
    the first response's Content-Type header, or "" if the header is absent.
    """
    resp = self._http_get(link)
    download_size = _get_http_response_size(resp)

    filepath = os.path.join(location, _get_http_response_filename(resp, link))
    with open(filepath, "wb") as content_file:
        download = _FileDownload(link, content_file, download_size)
        self._process_response(download, resp)
        # A short download can only be detected when the size is known; in
        # that case try to resume or restart before giving up.
        if download.is_incomplete():
            self._attempt_resumes_or_redownloads(download, resp)

    content_type = resp.headers.get("Content-Type", "")
    return filepath, content_type
198198
199- def _process_response (
200- self ,
201- resp : Response ,
202- link : Link ,
203- content_file : BinaryIO ,
204- bytes_received : int ,
205- total_length : int | None ,
206- ) -> int :
207- """Process the response and write the chunks to the file."""
208- chunks = _prepare_download (
209- resp , link , self ._progress_bar , total_length , range_start = bytes_received
210- )
211- return self ._write_chunks_to_file (
212- chunks , content_file , allow_partial = bool (total_length )
def _process_response(self, download: _FileDownload, resp: Response) -> None:
    """Download and save chunks from a response.

    Chunks are appended to the download's output file. The current
    ``bytes_received`` is passed as ``range_start`` to the logging helper.

    A ReadTimeoutError is re-raised when the download size is unknown. When
    the size is known, the timeout is only logged and the method returns
    with the download left partially written.
    """
    chunks = _log_download(
        resp,
        download.link,
        self._progress_bar,
        download.size,
        range_start=download.bytes_received,
    )
    try:
        for chunk in chunks:
            download.write_chunk(chunk)
    except ReadTimeoutError:
        # If the download size is not known, then give up downloading the file.
        if download.size is None:
            raise

        logger.warning("Connection timed out while downloading.")
231217
232- return bytes_received
233-
234- def _attempt_resume (
235- self ,
236- resp : Response ,
237- link : Link ,
238- content_file : BinaryIO ,
239- total_length : int | None ,
240- bytes_received : int ,
def _attempt_resumes_or_redownloads(
    self, download: _FileDownload, first_resp: Response
) -> None:
    """Attempt to resume/restart the download if connection was dropped.

    Retries while the download is incomplete and fewer than
    ``self._resume_retries`` reattempts have been made. ``first_resp`` is
    the response that the resume request should match; it is replaced
    whenever the download has to be restarted from scratch.

    Raises IncompleteDownloadError (after deleting the partial file) if the
    download is still incomplete once the loop ends.
    """

    while download.reattempts < self._resume_retries and download.is_incomplete():
        # is_incomplete() is only true when the size is known.
        assert download.size is not None
        download.reattempts += 1
        logger.warning(
            "Attempting to resume incomplete download (%s/%s, attempt %d)",
            format_size(download.bytes_received),
            format_size(download.size),
            download.reattempts,
        )

        try:
            resume_resp = self._http_get_resume(download, should_match=first_resp)
            # Fallback: if the server responded with 200 (i.e., the file has
            # since been modified or range requests are unsupported) or any
            # other unexpected status, restart the download from the beginning.
            must_restart = resume_resp.status_code != HTTPStatus.PARTIAL_CONTENT
            if must_restart:
                download.reset_file()
                download.size = _get_http_response_size(resume_resp)
                # Later resume requests must match this new full response.
                first_resp = resume_resp

            self._process_response(download, resume_resp)
        except (ConnectionError, ReadTimeoutError, OSError):
            # A failed attempt still counts; move on to the next one.
            continue

    # No more resume attempts. Raise an error if the download is still incomplete.
    if download.is_incomplete():
        os.remove(download.output_file.name)
        raise IncompleteDownloadError(download)
252+
def _http_get_resume(
    self, download: _FileDownload, should_match: Response
) -> Response:
    """Issue a HTTP range request to resume the download.

    The request asks for the bytes from ``download.bytes_received`` onwards.
    ``should_match`` is an earlier response for the same file; its ETag or
    Last-Modified value, if any, is sent as the ``If-Range`` condition.
    """
    # To better understand the download resumption logic, see the mdn web docs:
    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/Range_requests
    headers = HEADERS.copy()
    headers["Range"] = f"bytes={download.bytes_received}-"
    # If possible, use a conditional range request to avoid corrupted
    # downloads caused by the remote file changing in-between.
    if identifier := _get_http_response_etag_or_last_modified(should_match):
        headers["If-Range"] = identifier
    return self._http_get(download.link, headers)
266+
def _http_get(self, link: Link, headers: Mapping[str, str] = HEADERS) -> Response:
    """Send a streaming GET request for link and return the response.

    The request goes to ``link.url_without_fragment`` with the given
    headers (the module-level HEADERS by default; they are passed through
    unmodified).

    A NetworkConnectionError raised while getting or status-checking the
    response is logged at critical level and then re-raised.
    """
    target_url = link.url_without_fragment
    try:
        resp = self._session.get(target_url, headers=headers, stream=True)
        raise_for_status(resp)
    except NetworkConnectionError as e:
        # NOTE(review): assumes this error always carries a response here.
        assert e.response is not None
        logger.critical(
            "HTTP error %s while getting %s", e.response.status_code, link
        )
        raise
    return resp
0 commit comments