|
| 1 | +# odkcore - Ontology Development Kit Core |
| 2 | +# Copyright © 2025 ODK Developers |
| 3 | +# |
| 4 | +# This file is part of the ODK Core project and distributed under the |
| 5 | +# terms of a 3-clause BSD license. See the LICENSE file in that project |
| 6 | +# for the detailed conditions. |
| 7 | + |
| 8 | +import logging |
| 9 | +import os |
| 10 | +from bz2 import BZ2File |
| 11 | +from dataclasses import dataclass |
| 12 | +from datetime import UTC, datetime |
| 13 | +from enum import Enum |
| 14 | +from gzip import GzipFile |
| 15 | +from hashlib import sha256 |
| 16 | +from io import BufferedIOBase, BytesIO |
| 17 | +from pathlib import Path |
| 18 | +from time import sleep |
| 19 | +from typing import Dict, Iterator, Optional, Union |
| 20 | +from urllib.parse import urlparse |
| 21 | + |
| 22 | +import requests |
| 23 | + |
| 24 | +RFC5322_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S GMT" |
| 25 | +# Those are the HTTP errors that Curl considers as "transient" and |
| 26 | +# eligible for retrying when the --retry option is used. |
| 27 | +RETRIABLE_HTTP_ERRORS = (408, 429, 500, 502, 503, 504) |
| 28 | + |
| 29 | + |
class Compression(Enum):
    """Compression schemes a remote file may be downloaded in."""

    NONE = (0, None)
    GZIP = (1, ".gz")
    BZIP2 = (2, ".bz2")

    # File name suffix associated with the scheme (None for no compression).
    extension: Optional[str]

    def __new__(cls, value: int, extension: Optional[str] = None):
        member = object.__new__(cls)
        member._value_ = value
        member.extension = extension
        return member

    @classmethod
    def from_extension(cls, path: Path) -> "Compression":
        """Infers the compression scheme from a file name's suffix."""
        suffix = path.suffix
        return next(
            (m for m in cls if m.extension is not None and m.extension == suffix),
            cls.NONE,
        )
| 49 | + |
| 50 | + |
class DownloadError(Exception):
    """Raised when a remote file cannot be retrieved."""
| 53 | + |
| 54 | + |
| 55 | +@dataclass |
| 56 | +class RemoteFileInfo: |
| 57 | + """Informations about a downloaded file. |
| 58 | +
|
| 59 | + This class is a simple structure holding the informations we need in |
| 60 | + order to decide whether to update a file that has already been |
| 61 | + downloaded previously. |
| 62 | + """ |
| 63 | + |
| 64 | + sha256: Optional[str] = None |
| 65 | + """The SHA-256 checksum of the downloaded file.""" |
| 66 | + |
| 67 | + etag: Optional[str] = None |
| 68 | + """The tag returned by the server for the file.""" |
| 69 | + |
| 70 | + time: Optional[datetime] = None |
| 71 | + """The time the file was last downloaded.""" |
| 72 | + |
| 73 | + def to_file(self, dest: Path) -> None: |
| 74 | + """Writes the information to a cache file.""" |
| 75 | + with dest.open("w") as fd: |
| 76 | + if self.sha256 is not None: |
| 77 | + fd.write(f"sha256: {self.sha256}\n") |
| 78 | + if self.etag is not None: |
| 79 | + fd.write(f"etag: {self.etag}\n") |
| 80 | + if self.time is not None: |
| 81 | + fd.write(f"time: {self.time.strftime(RFC5322_DATE_FORMAT)}\n") |
| 82 | + |
| 83 | + def from_cache_file(self, source: Path) -> "RemoteFileInfo": |
| 84 | + """Reads the information from a cache file.""" |
| 85 | + if source.exists(): |
| 86 | + with source.open("r") as fd: |
| 87 | + for line in fd: |
| 88 | + if line.startswith("#"): |
| 89 | + continue |
| 90 | + items = line.strip().split(": ", maxsplit=1) |
| 91 | + if len(items) != 2: |
| 92 | + continue |
| 93 | + if items[0] == "sha256": |
| 94 | + self.sha256 = items[1] |
| 95 | + elif items[0] == "etag": |
| 96 | + self.etag = items[1] |
| 97 | + elif items[0] == "time": |
| 98 | + self.time = datetime.strptime(items[1], RFC5322_DATE_FORMAT) |
| 99 | + return self |
| 100 | + |
| 101 | + @classmethod |
| 102 | + def from_file(cls, source: Path) -> "RemoteFileInfo": |
| 103 | + """Gets the information from an existing file.""" |
| 104 | + info = RemoteFileInfo() |
| 105 | + if source.exists(): |
| 106 | + info.time = datetime.fromtimestamp(source.stat().st_mtime, UTC) |
| 107 | + h = sha256() |
| 108 | + with source.open("rb") as fd: |
| 109 | + while True: |
| 110 | + chunk = fd.read(512) |
| 111 | + if not chunk: |
| 112 | + break |
| 113 | + h.update(chunk) |
| 114 | + info.sha256 = h.hexdigest() |
| 115 | + return info |
| 116 | + |
| 117 | + |
def download_file(
    url: str,
    output: Path,
    info: RemoteFileInfo,
    max_retry: int = 0,
    compression: Compression = Compression.NONE,
) -> int:
    """Downloads a remote file.

    This function will avoid needlessly downloading the file if the
    remote server can tell us that the remote file has not changed since
    the last time it was downloaded. In addition, even if the file is
    downloaded, if it is found to be identical to the locally available
    version, the existing file is not touched at all.

    :param url: The URL to download the file from.
    :param output: Where the downloaded file should be written.
    :param info: Informations about the last time the file was
        downloaded. If the fields of that structure are set to None,
        this means there is no local version of the file, and the
        remote file should always be downloaded. If the download is
        successful, the structure will be updated with informations from
        the newly downloaded file.
    :param max_retry: Number of download attempts to perform.
    :param compression: How the remote file is compressed (if at all).
        The file will be automatically uncompressed after being
        downloaded.

    :returns: The HTTP status code.

    :raises DownloadError: If a connection, timeout, or HTTP error
        prevents the download after any retries are exhausted.
    """
    # Conditional request headers, so the server can answer with
    # 304 Not Modified instead of re-sending an unchanged file.
    headers: Dict[str, str] = {}
    if info.time:
        headers["If-Modified-Since"] = info.time.strftime(RFC5322_DATE_FORMAT)
    if info.etag:
        headers["If-None-Match"] = info.etag

    n_try = 0
    hostname = urlparse(url).hostname
    while True:
        try:
            response = requests.get(url, timeout=5, headers=headers)
            if response.status_code == 200:
                return _handle_successful_download(response, output, info, compression)
            elif response.status_code == 304:
                logging.info(f"{output.name}: Not modified at {url}")
                return 304
            elif response.status_code == 404:
                logging.warning(f"{output.name}: Not found at {url}")
                return 404
            elif response.status_code in RETRIABLE_HTTP_ERRORS and n_try < max_retry:
                n_try += 1
                logging.warning(
                    f"{output.name}: Transient HTTP error, retrying ({n_try}/{max_retry})"
                )
                sleep(1)
            else:
                response.raise_for_status()
        except requests.exceptions.ConnectTimeout as e:
            # `curl --retry` retries on timeout errors, and so do we
            if n_try < max_retry:
                n_try += 1
                logging.warning(
                    f"{output.name}: Timeout when connecting to {hostname}, retrying ({n_try}/{max_retry})"
                )
                sleep(1)
            else:
                raise DownloadError(f"Timeout when connecting to {hostname}") from e
        except requests.exceptions.ConnectionError as e:
            raise DownloadError(f"Cannot connect to {hostname}") from e
        except requests.exceptions.HTTPError as e:
            raise DownloadError(f"HTTP error when downloading {url}") from e
        except requests.exceptions.ReadTimeout as e:
            raise DownloadError(f"Timeout when downloading {url}") from e
| 191 | + |
| 192 | + |
def _handle_successful_download(
    response: requests.Response,
    output: Path,
    info: RemoteFileInfo,
    comp: Compression,
) -> int:
    """Writes a successfully downloaded file to disk, unless unchanged.

    :param response: The successful (200) HTTP response.
    :param output: Where the downloaded file should be written.
    :param info: Structure that is updated with the new file's checksum,
        download time, and ETag when the file is actually written.
    :param comp: How the downloaded payload is compressed; it is
        uncompressed on the fly while being written out.
    :returns: 200 if the output file was (re)written, 304 if the
        downloaded content is identical to the previous download.
    """
    h = sha256()

    # We download into a temporary file so that we do not touch the
    # output file until (1) the download is complete and (2) we have
    # verified that the downloaded file is different from the output
    # file, if it already exists
    tmpfile = output.with_suffix(output.suffix + ".tmp")
    try:
        with tmpfile.open("wb") as fd:
            for chunk in _ResponseWrapper.maybe_wrap(response, comp).iter_content(512):
                h.update(chunk)
                fd.write(chunk)
    except BaseException:
        # Do not leave a stale, partially-written .tmp file behind if
        # the transfer or the decompression fails midway.
        tmpfile.unlink(missing_ok=True)
        raise
    checksum = h.hexdigest()
    if info.sha256 == checksum:
        logging.info(
            f"{output.name}: File newly downloaded is identical to previously downloaded file"
        )
        # Remove the file we just downloaded, and report to caller as a
        # 304 Not-Modified status
        tmpfile.unlink()
        return 304
    else:
        logging.info(f"{output.name}: Download OK, file is new")
        # Atomic on POSIX: the old output is never observable half-replaced
        os.replace(tmpfile, output)
        info.sha256 = checksum
        info.time = datetime.now(tz=UTC)
        info.etag = response.headers.get("ETag", None)
        return 200
| 226 | + |
| 227 | + |
class _ResponseWrapper:
    """Helper class to handle compressed files.

    This class allows to use the same `iter_content` method (as found on
    a requests.Response object) to get the content of a downloaded file,
    regardless of how the file has been compressed (if at all).
    """

    _stream: BufferedIOBase

    def __init__(self, stream: BufferedIOBase):
        self._stream = stream

    def iter_content(self, size: int = 512) -> Iterator[bytes]:
        """Yields the decompressed content in chunks of at most `size` bytes."""
        while chunk := self._stream.read(size):
            yield chunk
        self._stream.close()

    @classmethod
    def maybe_wrap(
        cls, response: requests.Response, compression: Compression
    ) -> Union[requests.Response, "_ResponseWrapper"]:
        """Wraps the response in a decompressing reader when needed."""
        if compression == Compression.GZIP:
            stream: BufferedIOBase = GzipFile(fileobj=BytesIO(response.content))
        elif compression == Compression.BZIP2:
            stream = BZ2File(BytesIO(response.content))
        else:
            # Uncompressed: the response already provides iter_content
            return response
        return cls(stream)
0 commit comments