6
6
import logging
7
7
import mimetypes
8
8
import os
9
- from collections .abc import Iterable
9
+ from collections .abc import Iterable , Mapping
10
+ from dataclasses import dataclass
10
11
from http import HTTPStatus
11
12
from typing import BinaryIO
12
13
@@ -40,7 +41,7 @@ def _get_http_response_etag_or_last_modified(resp: Response) -> str | None:
40
41
return resp .headers .get ("etag" , resp .headers .get ("last-modified" ))
41
42
42
43
43
- def _prepare_download (
44
+ def _log_download (
44
45
resp : Response ,
45
46
link : Link ,
46
47
progress_bar : str ,
@@ -134,28 +135,28 @@ def _get_http_response_filename(resp: Response, link: Link) -> str:
134
135
return filename
135
136
136
137
137
- def _http_get_download (
138
- session : PipSession ,
139
- link : Link ,
140
- range_start : int | None = 0 ,
141
- if_range : str | None = None ,
142
- ) -> Response :
143
- target_url = link . url . split ( "#" , 1 )[ 0 ]
144
- headers = HEADERS . copy ()
145
- # request a partial download
146
- if range_start :
147
- headers [ "Range" ] = f"bytes= { range_start } -"
148
- # make sure the file hasn't changed
149
- if if_range :
150
- headers [ "If-Range" ] = if_range
151
- try :
152
- resp = session . get ( target_url , headers = headers , stream = True )
153
- raise_for_status ( resp )
154
- except NetworkConnectionError as e :
155
- assert e . response is not None
156
- logger . critical ( "HTTP error %s while getting %s" , e . response . status_code , link )
157
- raise
158
- return resp
138
@dataclass
class _FileDownload:
    """Stores the state of a single link download.

    Keeps the destination file together with the expected size and the
    number of bytes written so far, so an interrupted transfer can be
    resumed from the correct offset or restarted from scratch.
    """

    # The link being downloaded.
    link: Link
    # Open binary file object the payload is written into.
    output_file: BinaryIO
    # Expected total size in bytes, or None when the size is not known.
    size: int | None
    # Number of bytes successfully handed to ``output_file`` so far.
    bytes_received: int = 0
    # Number of reattempts made so far (maintained by the caller).
    reattempts: int = 0

    def is_incomplete(self) -> bool:
        """Return True when the size is known and not all bytes have arrived.

        A download of unknown size is never reported as incomplete.
        """
        return self.size is not None and self.bytes_received < self.size

    def write_chunk(self, data: bytes) -> None:
        """Write ``data`` to the output file and record it as received."""
        # Write before counting: if the write raises, ``bytes_received`` must
        # not advance, otherwise a later resume would start at an offset past
        # data that was never stored.
        self.output_file.write(data)
        self.bytes_received += len(data)

    def reset_file(self) -> None:
        """Delete any saved data and reset progress to zero."""
        self.output_file.seek(0)
        self.output_file.truncate()
        self.bytes_received = 0
159
160
160
161
161
162
class Downloader :
@@ -172,146 +173,106 @@ def __init__(
172
173
self ._progress_bar = progress_bar
173
174
self ._resume_retries = resume_retries
174
175
175
- def __call__ (self , link : Link , location : str ) -> tuple [str , str ]:
176
- """Download the file given by link into location."""
177
- resp = _http_get_download (self ._session , link )
178
- # NOTE: The original download size needs to be passed down everywhere
179
- # so if the download is resumed (with a HTTP Range request) the progress
180
- # bar will report the right size.
181
- total_length = _get_http_response_size (resp )
182
- content_type = resp .headers .get ("Content-Type" , "" )
176
def batch(
    self, links: Iterable[Link], location: str
) -> Iterable[tuple[Link, tuple[str, str]]]:
    """Download several links into location, one after another.

    Lazily yields ``(link, (filepath, content_type))`` for each link, in
    the order the links are supplied. Each download is performed by calling
    this object with the link and the location.
    """
    for link in links:
        saved_path, mime_type = self(link, location)
        yield link, (saved_path, mime_type)
183
183
184
- filename = _get_http_response_filename (resp , link )
185
- filepath = os .path .join (location , filename )
184
def __call__(self, link: Link, location: str) -> tuple[str, str]:
    """Download a link and save it under location.

    Returns a ``(filepath, content_type)`` pair, where ``content_type`` is
    the Content-Type header of the first response, or an empty string when
    that header is absent.
    """
    response = self._http_get(link)
    expected_size = _get_http_response_size(response)

    filename = _get_http_response_filename(response, link)
    filepath = os.path.join(location, filename)
    with open(filepath, "wb") as content_file:
        download = _FileDownload(link, content_file, expected_size)
        self._process_response(download, response)
        # Fewer bytes than the known size means the transfer was cut short;
        # hand over to the resume/restart logic.
        if download.is_incomplete():
            self._attempt_resumes_or_redownloads(download, response)

    return filepath, response.headers.get("Content-Type", "")
198
198
199
- def _process_response (
200
- self ,
201
- resp : Response ,
202
- link : Link ,
203
- content_file : BinaryIO ,
204
- bytes_received : int ,
205
- total_length : int | None ,
206
- ) -> int :
207
- """Process the response and write the chunks to the file."""
208
- chunks = _prepare_download (
209
- resp , link , self ._progress_bar , total_length , range_start = bytes_received
210
- )
211
- return self ._write_chunks_to_file (
212
- chunks , content_file , allow_partial = bool (total_length )
199
def _process_response(self, download: _FileDownload, resp: Response) -> None:
    """Download and save chunks from a response.

    Every chunk is appended to ``download``. A read timeout is re-raised
    when the download size is unknown; otherwise it is only logged, and
    the caller is left to notice that the download is incomplete.
    """
    chunk_iter = _log_download(
        resp,
        download.link,
        self._progress_bar,
        download.size,
        range_start=download.bytes_received,
    )
    try:
        for chunk in chunk_iter:
            download.write_chunk(chunk)
    except ReadTimeoutError as e:
        if download.size is not None:
            # The size is known, so a partial file can still be detected
            # and retried later; do not bail out here.
            logger.warning("Connection timed out while downloading.")
            return
        # If the download size is not known, then give up downloading the file.
        raise e
231
217
232
- return bytes_received
233
-
234
- def _attempt_resume (
235
- self ,
236
- resp : Response ,
237
- link : Link ,
238
- content_file : BinaryIO ,
239
- total_length : int | None ,
240
- bytes_received : int ,
218
def _attempt_resumes_or_redownloads(
    self, download: _FileDownload, first_resp: Response
) -> None:
    """Attempt to resume/restart the download if connection was dropped.

    Retries while the download is incomplete and fewer than
    ``self._resume_retries`` reattempts have been made. Each attempt sends
    a range request; when the server does not answer with 206 Partial
    Content, the saved data is discarded and the download starts over
    from that response.

    Raises:
        IncompleteDownloadError: if the download is still incomplete once
            the attempts are used up. The partial file is removed first.
    """
    while download.reattempts < self._resume_retries and download.is_incomplete():
        assert download.size is not None
        download.reattempts += 1
        logger.warning(
            "Attempting to resume incomplete download (%s/%s, attempt %d)",
            format_size(download.bytes_received),
            format_size(download.size),
            download.reattempts,
        )

        try:
            resume_resp = self._http_get_resume(download, should_match=first_resp)
            # Fallback: if the server responded with 200 (i.e., the file has
            # since been modified or range requests are unsupported) or any
            # other unexpected status, restart the download from the beginning.
            must_restart = resume_resp.status_code != HTTPStatus.PARTIAL_CONTENT
            if must_restart:
                download.reset_file()
                download.size = _get_http_response_size(resume_resp)
                # Later conditional range requests must match the new file.
                first_resp = resume_resp

            self._process_response(download, resume_resp)
        except (ConnectionError, ReadTimeoutError, OSError):
            continue

    # No more resume attempts. Raise an error if the download is still incomplete.
    if download.is_incomplete():
        partial_path = download.output_file.name
        # Close the handle before deleting: on Windows, removing a file that
        # is still open fails, which would mask the error raised below.
        # Closing an already closed file later is a no-op.
        download.output_file.close()
        os.remove(partial_path)
        raise IncompleteDownloadError(download)
252
+
253
def _http_get_resume(
    self, download: _FileDownload, should_match: Response
) -> Response:
    """Send an HTTP range request for the bytes not yet received.

    The request starts at ``download.bytes_received``. When ``should_match``
    carries an ETag or Last-Modified value, that value is sent as
    ``If-Range``.
    """
    # To better understand the download resumption logic, see the MDN web docs:
    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/Range_requests
    request_headers = HEADERS.copy()
    request_headers["Range"] = f"bytes={download.bytes_received}-"

    # If possible, use a conditional range request to avoid corrupted
    # downloads caused by the remote file changing in-between.
    validator = _get_http_response_etag_or_last_modified(should_match)
    if validator:
        request_headers["If-Range"] = validator

    return self._http_get(download.link, request_headers)
266
+
267
def _http_get(self, link: Link, headers: Mapping[str, str] = HEADERS) -> Response:
    """GET the link's URL (without its fragment) with ``stream=True``.

    The response is passed through ``raise_for_status``. A
    ``NetworkConnectionError`` is logged at critical level with the HTTP
    status code and then re-raised unchanged.
    """
    url = link.url_without_fragment
    try:
        response = self._session.get(url, headers=headers, stream=True)
        raise_for_status(response)
    except NetworkConnectionError as exc:
        assert exc.response is not None
        logger.critical(
            "HTTP error %s while getting %s", exc.response.status_code, link
        )
        raise
    else:
        return response
0 commit comments