|
| 1 | +"""Utilities for fetching things from external endpoints. Vendored from hyp3lib""" |
| 2 | + |
| 3 | +import logging |
| 4 | +from email.message import Message |
| 5 | +from os.path import basename |
| 6 | +from pathlib import Path |
| 7 | +from urllib.parse import urlparse |
| 8 | + |
| 9 | +import requests |
| 10 | +from requests.adapters import HTTPAdapter |
| 11 | +from urllib3.util.retry import Retry |
| 12 | + |
| 13 | + |
| 14 | +def _get_download_path(url: str, content_disposition: str | None = None, directory: Path | str = '.'): |
| 15 | + filename = None |
| 16 | + if content_disposition is not None: |
| 17 | + message = Message() |
| 18 | + message['content-type'] = content_disposition |
| 19 | + filename = message.get_param('filename') |
| 20 | + if not filename: |
| 21 | + filename = basename(urlparse(url).path) |
| 22 | + if not filename: |
| 23 | + raise ValueError(f'could not determine download path for: {url}') |
| 24 | + assert isinstance(filename, str) |
| 25 | + return Path(directory) / filename |
| 26 | + |
| 27 | + |
def download_file(
    url: str,
    directory: Path | str = '.',
    chunk_size=None,
    retries=2,
    backoff_factor=1,
    auth: tuple[str, str] | None = None,
    token: str | None = None,
) -> str:
    """Download a file

    The request is streamed and written to disk chunk by chunk. The file name is taken
    from the response's Content-Disposition header when present, otherwise from the
    final (post-redirect) URL.

    Args:
        url: URL of the file to download
        directory: Directory location to place files into
        chunk_size: Size to chunk the download into
        retries: Number of retries to attempt
        backoff_factor: Factor for calculating time between retries
        auth: Username and password for HTTP Basic Auth
        token: Token for HTTP Bearer authentication

    Returns:
        download_path: The path to the downloaded file

    Raises:
        requests.HTTPError: If the server responds with an error status
        ValueError: If a file name cannot be determined for the download
    """
    logging.info(f'Downloading {url}')

    retry_strategy = Retry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    # Use the session as a context manager so it is closed even when the request,
    # the status check, or the file write raises.
    with requests.Session() as session:
        session.auth = auth
        if token:
            session.headers.update({'Authorization': f'Bearer {token}'})

        session.mount('https://', HTTPAdapter(max_retries=retry_strategy))
        session.mount('http://', HTTPAdapter(max_retries=retry_strategy))

        with session.get(url, stream=True) as response:
            # Check the status before working out the file name so an HTTP error is
            # reported as such rather than being masked by a ValueError for the path.
            response.raise_for_status()
            download_path = _get_download_path(response.url, response.headers.get('content-disposition'), directory)
            with open(download_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)

    return str(download_path)
0 commit comments