@@ -158,6 +158,30 @@ def _http_get_download(
158
158
return resp
159
159
160
160
161
+ @dataclass
162
+ class _FileDownload :
163
+ """Stores the state of a single file download."""
164
+
165
+ link : Link
166
+ output_file : BinaryIO
167
+ size : Optional [int ]
168
+ bytes_received : int = 0
169
+ reattempts : int = 0
170
+
171
+ def is_incomplete (self ) -> bool :
172
+ return bool (self .size is not None and self .bytes_received < self .size )
173
+
174
+ def write_chunk (self , data : bytes ) -> None :
175
+ self .bytes_received += len (data )
176
+ self .output_file .write (data )
177
+
178
+ def reset_download (self ) -> None :
179
+ """Delete any saved data and reset progress to zero."""
180
+ self .output_file .seek (0 )
181
+ self .output_file .truncate ()
182
+ self .bytes_received = 0
183
+
184
+
161
185
class Downloader :
162
186
def __init__ (
163
187
self ,
@@ -183,90 +207,63 @@ def batch(
183
207
def __call__ (self , link : Link , location : str ) -> tuple [str , str ]:
184
208
"""Download the file given by link into location."""
185
209
resp = _http_get_download (self ._session , link )
186
- # NOTE: The original download size needs to be passed down everywhere
187
- # so if the download is resumed (with a HTTP Range request) the progress
188
- # bar will report the right size.
189
- total_length = _get_http_response_size (resp )
190
- content_type = resp .headers .get ("Content-Type" , "" )
191
-
192
- filename = _get_http_response_filename (resp , link )
193
- filepath = os .path .join (location , filename )
210
+ download_size = _get_http_response_size (resp )
194
211
212
+ filepath = os .path .join (location , _get_http_response_filename (resp , link ))
195
213
with open (filepath , "wb" ) as content_file :
196
- bytes_received = self ._process_response (
197
- resp , link , content_file , 0 , total_length
198
- )
199
- # If possible, check for an incomplete download and attempt resuming.
200
- if total_length and bytes_received < total_length :
201
- self ._attempt_resume (
202
- resp , link , content_file , total_length , bytes_received
203
- )
214
+ download = _FileDownload (link , content_file , download_size )
215
+ self ._process_response (download , resp )
216
+ if download .is_incomplete ():
217
+ self ._attempt_resume (download , resp )
204
218
219
+ content_type = resp .headers .get ("Content-Type" , "" )
205
220
return filepath , content_type
206
221
207
- def _process_response (
208
- self ,
209
- resp : Response ,
210
- link : Link ,
211
- content_file : BinaryIO ,
212
- bytes_received : int ,
213
- total_length : int | None ,
214
- ) -> int :
222
+ def _process_response (self , download : _FileDownload , resp : Response ) -> None :
215
223
"""Process the response and write the chunks to the file."""
216
224
chunks = _prepare_download (
217
- resp , link , self ._progress_bar , total_length , range_start = bytes_received
218
- )
219
- return self ._write_chunks_to_file (
220
- chunks , content_file , allow_partial = bool (total_length )
225
+ resp ,
226
+ download .link ,
227
+ self ._progress_bar ,
228
+ download .size ,
229
+ range_start = download .bytes_received ,
221
230
)
231
+ self ._write_chunks_to_file (download , chunks )
222
232
223
233
def _write_chunks_to_file (
224
- self , chunks : Iterable [bytes ], content_file : BinaryIO , * , allow_partial : bool
225
- ) -> int :
234
+ self , download : _FileDownload , chunks : Iterable [bytes ]
235
+ ) -> None :
226
236
"""Write the chunks to the file and return the number of bytes received."""
227
- bytes_received = 0
228
237
try :
229
238
for chunk in chunks :
230
- bytes_received += len (chunk )
231
- content_file .write (chunk )
239
+ download .write_chunk (chunk )
232
240
except ReadTimeoutError as e :
233
- # If partial downloads are OK (the download will be retried), don't bail .
234
- if not allow_partial :
241
+ # If the download size is not known, then give up downloading the file .
242
+ if download . size is None :
235
243
raise e
236
244
237
- # Ensuring bytes_received is returned to attempt resume
238
245
logger .warning ("Connection timed out while downloading." )
239
246
240
- return bytes_received
241
-
242
- def _attempt_resume (
243
- self ,
244
- resp : Response ,
245
- link : Link ,
246
- content_file : BinaryIO ,
247
- total_length : int | None ,
248
- bytes_received : int ,
249
- ) -> None :
247
+ def _attempt_resume (self , download : _FileDownload , resp : Response ) -> None :
250
248
"""Attempt to resume the download if connection was dropped."""
251
249
etag_or_last_modified = _get_http_response_etag_or_last_modified (resp )
252
250
253
- attempts_left = self ._resume_retries
254
- while total_length and attempts_left and bytes_received < total_length :
255
- attempts_left -= 1
256
-
251
+ while download .reattempts < self ._resume_retries and download .is_incomplete ():
252
+ assert download .size is not None
253
+ download .reattempts += 1
257
254
logger .warning (
258
255
"Attempting to resume incomplete download (%s/%s, attempt %d)" ,
259
- format_size (bytes_received ),
260
- format_size (total_length ),
261
- ( self . _resume_retries - attempts_left ) ,
256
+ format_size (download . bytes_received ),
257
+ format_size (download . size ),
258
+ download . reattempts ,
262
259
)
263
260
264
261
try :
265
262
# Try to resume the download using a HTTP range request.
266
263
resume_resp = _http_get_download (
267
264
self ._session ,
268
- link ,
269
- range_start = bytes_received ,
265
+ download . link ,
266
+ range_start = download . bytes_received ,
270
267
if_range = etag_or_last_modified ,
271
268
)
272
269
@@ -275,33 +272,25 @@ def _attempt_resume(
275
272
# other unexpected status, restart the download from the beginning.
276
273
must_restart = resume_resp .status_code != HTTPStatus .PARTIAL_CONTENT
277
274
if must_restart :
278
- bytes_received , total_length , etag_or_last_modified = (
279
- self . _reset_download_state ( resume_resp , content_file )
275
+ download . size , etag_or_last_modified = self . _reset_download_state (
276
+ download , resume_resp
280
277
)
281
278
282
- bytes_received += self ._process_response (
283
- resume_resp , link , content_file , bytes_received , total_length
284
- )
279
+ self ._process_response (download , resume_resp )
285
280
except (ConnectionError , ReadTimeoutError , OSError ):
286
281
continue
287
282
288
283
# No more resume attempts. Raise an error if the download is still incomplete.
289
- if total_length and bytes_received < total_length :
290
- os .remove (content_file .name )
291
- raise IncompleteDownloadError (
292
- link , bytes_received , total_length , retries = self ._resume_retries
293
- )
284
+ if download .is_incomplete ():
285
+ os .remove (download .output_file .name )
286
+ raise IncompleteDownloadError (download )
294
287
295
288
def _reset_download_state (
296
- self ,
297
- resp : Response ,
298
- content_file : BinaryIO ,
299
- ) -> tuple [int , int | None , str | None ]:
289
+ self , download : _FileDownload , resp : Response
290
+ ) -> tuple [int | None , str | None ]:
300
291
"""Reset the download state to restart downloading from the beginning."""
301
- content_file .seek (0 )
302
- content_file .truncate ()
303
- bytes_received = 0
292
+ download .reset_download ()
304
293
total_length = _get_http_response_size (resp )
305
294
etag_or_last_modified = _get_http_response_etag_or_last_modified (resp )
306
295
307
- return bytes_received , total_length , etag_or_last_modified
296
+ return total_length , etag_or_last_modified
0 commit comments