
Commit 199a29f

Merge pull request #2 from INCATools/download-helper
Refactor downloading rules using a "download helper" command
2 parents f81f527 + 07ecf28 commit 199a29f

7 files changed: +492 −71 lines

.github/workflows/code-quality.yml

Lines changed: 8 additions & 4 deletions
@@ -1,4 +1,4 @@
-name: Static code quality tests
+name: Static code quality tests and unit tests
 
 on:
   push:
@@ -35,8 +35,12 @@ jobs:
       - name: Install Python dependencies
         run: uv sync --dev --all-extras
       - name: Code linting
-        run: uv run ruff check src
+        run: uv run ruff check src tests
       - name: Code formatting
-        run: uv run ruff format --check src
+        run: uv run ruff format --check src tests
       - name: Static type checking
-        run: uv run mypy src
+        run: |
+          uv run mypy src
+          uv run mypy tests
+      - name: Unit tests
+        run: uv run pytest tests

pyproject.toml

Lines changed: 8 additions & 0 deletions
@@ -46,10 +46,18 @@ source-include = [
     "tests/**",
 ]
 
+[tool.ruff.lint]
+select = [ "E4", "E7", "E9", "F", "I" ]
+
+[[tool.mypy.overrides]]
+module = ["incatools.odk.*"]
+follow_untyped_imports = true
+
 [dependency-groups]
 dev = [
     "ruff",
     "mypy",
+    "pytest",
     "types-PyYAML",
     "types-defusedxml",
     "types-requests",

src/incatools/odk/download.py

Lines changed: 258 additions & 0 deletions
@@ -0,0 +1,258 @@
# odkcore - Ontology Development Kit Core
# Copyright © 2025 ODK Developers
#
# This file is part of the ODK Core project and distributed under the
# terms of a 3-clause BSD license. See the LICENSE file in that project
# for the detailed conditions.

import logging
import os
from bz2 import BZ2File
from dataclasses import dataclass
from datetime import UTC, datetime
from enum import Enum
from gzip import GzipFile
from hashlib import sha256
from io import BufferedIOBase, BytesIO
from pathlib import Path
from time import sleep
from typing import Dict, Iterator, Optional, Union
from urllib.parse import urlparse

import requests

RFC5322_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S GMT"
# These are the HTTP errors that curl considers "transient" and
# eligible for retrying when the --retry option is used.
RETRIABLE_HTTP_ERRORS = (408, 429, 500, 502, 503, 504)


class Compression(Enum):
    NONE = (0, None)
    GZIP = (1, ".gz")
    BZIP2 = (2, ".bz2")

    extension: Optional[str]

    def __new__(cls, value: int, extension: Optional[str] = None):
        self = object.__new__(cls)
        self._value_ = value
        self.extension = extension
        return self

    @classmethod
    def from_extension(cls, path: Path) -> "Compression":
        for v in Compression:
            if v.extension is not None and v.extension == path.suffix:
                return v
        return Compression.NONE
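
# A quick illustration (hypothetical file names): from_extension() keys
# off the last suffix of the path only.
#
#     Compression.from_extension(Path("go.owl.gz"))   # -> Compression.GZIP
#     Compression.from_extension(Path("go.owl.bz2"))  # -> Compression.BZIP2
#     Compression.from_extension(Path("go.owl"))      # -> Compression.NONE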

class DownloadError(Exception):
    pass


@dataclass
class RemoteFileInfo:
    """Information about a downloaded file.

    This class is a simple structure holding the information we need
    in order to decide whether to update a file that has already been
    downloaded previously.
    """

    sha256: Optional[str] = None
    """The SHA-256 checksum of the downloaded file."""

    etag: Optional[str] = None
    """The tag returned by the server for the file."""

    time: Optional[datetime] = None
    """The time the file was last downloaded."""

    def to_file(self, dest: Path) -> None:
        """Writes the information to a cache file."""
        with dest.open("w") as fd:
            if self.sha256 is not None:
                fd.write(f"sha256: {self.sha256}\n")
            if self.etag is not None:
                fd.write(f"etag: {self.etag}\n")
            if self.time is not None:
                fd.write(f"time: {self.time.strftime(RFC5322_DATE_FORMAT)}\n")

    def from_cache_file(self, source: Path) -> "RemoteFileInfo":
        """Reads the information from a cache file."""
        if source.exists():
            with source.open("r") as fd:
                for line in fd:
                    if line.startswith("#"):
                        continue
                    items = line.strip().split(": ", maxsplit=1)
                    if len(items) != 2:
                        continue
                    if items[0] == "sha256":
                        self.sha256 = items[1]
                    elif items[0] == "etag":
                        self.etag = items[1]
                    elif items[0] == "time":
                        self.time = datetime.strptime(items[1], RFC5322_DATE_FORMAT)
        return self

    @classmethod
    def from_file(cls, source: Path) -> "RemoteFileInfo":
        """Gets the information from an existing file."""
        info = RemoteFileInfo()
        if source.exists():
            info.time = datetime.fromtimestamp(source.stat().st_mtime, UTC)
            h = sha256()
            with source.open("rb") as fd:
                while True:
                    chunk = fd.read(512)
                    if not chunk:
                        break
                    h.update(chunk)
            info.sha256 = h.hexdigest()
        return info
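
# For reference, a cache file written by to_file() is a handful of
# "key: value" lines (the values below are hypothetical):
#
#     sha256: 9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08
#     etag: "5e1c-63f9a3c2d41e0"
#     time: Wed, 01 Jan 2025 12:00:00 GMT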

def download_file(
    url: str,
    output: Path,
    info: RemoteFileInfo,
    max_retry: int = 0,
    compression: Compression = Compression.NONE,
) -> int:
    """Downloads a remote file.

    This function will avoid needlessly downloading the file if the
    remote server can tell us that the remote file has not changed
    since the last time it was downloaded. In addition, even if the
    file is downloaded, if it is found to be identical to the locally
    available version, the existing file is not touched at all.

    :param url: The URL to download the file from.
    :param output: Where the downloaded file should be written.
    :param info: Information about the last time the file was
        downloaded. If the fields of that structure are set to None,
        this means there is no local version of the file, and the
        remote file should always be downloaded. If the download is
        successful, the structure will be updated with information
        from the newly downloaded file.
    :param max_retry: Maximum number of times to retry the download
        after a transient error.
    :param compression: How the remote file is compressed (if at all).
        The file will be automatically uncompressed after being
        downloaded.

    :returns: The HTTP status code.
    """
    headers: Dict[str, str] = {}
    if info.time:
        headers["If-Modified-Since"] = info.time.strftime(RFC5322_DATE_FORMAT)
    if info.etag:
        headers["If-None-Match"] = info.etag

    n_try = 0
    hostname = urlparse(url).hostname
    while True:
        try:
            response = requests.get(url, timeout=5, headers=headers)
            if response.status_code == 200:
                return _handle_successful_download(response, output, info, compression)
            elif response.status_code == 304:
                logging.info(f"{output.name}: Not modified at {url}")
                return 304
            elif response.status_code == 404:
                logging.warning(f"{output.name}: Not found at {url}")
                return 404
            elif response.status_code in RETRIABLE_HTTP_ERRORS and n_try < max_retry:
                n_try += 1
                logging.warning(
                    f"{output.name}: Transient HTTP error, retrying ({n_try}/{max_retry})"
                )
                sleep(1)
            else:
                response.raise_for_status()
        except requests.exceptions.ConnectTimeout:
            # `curl --retry` retries on timeout errors, and so do we
            if n_try < max_retry:
                n_try += 1
                logging.warning(
                    f"{output.name}: Timeout when connecting to {hostname}, retrying ({n_try}/{max_retry})"
                )
                sleep(1)
            else:
                raise DownloadError(f"Timeout when connecting to {hostname}")
        except requests.exceptions.ConnectionError:
            raise DownloadError(f"Cannot connect to {hostname}")
        except requests.exceptions.HTTPError:
            raise DownloadError(f"HTTP error when downloading {url}")
        except requests.exceptions.ReadTimeout:
            raise DownloadError(f"Timeout when downloading {url}")

def _handle_successful_download(
    response: requests.Response,
    output: Path,
    info: RemoteFileInfo,
    comp: Compression,
) -> int:
    h = sha256()

    # We download into a temporary file so that we do not touch the
    # output file until (1) the download is complete and (2) we have
    # verified that the downloaded file is different from the output
    # file, if it already exists.
    tmpfile = output.with_suffix(output.suffix + ".tmp")
    with tmpfile.open("wb") as fd:
        for chunk in _ResponseWrapper.maybe_wrap(response, comp).iter_content(512):
            h.update(chunk)
            fd.write(chunk)
    checksum = h.hexdigest()
    if info.sha256 == checksum:
        logging.info(
            f"{output.name}: File newly downloaded is identical to previously downloaded file"
        )
        # Remove the file we just downloaded, and report to the caller
        # as a 304 Not-Modified status.
        tmpfile.unlink()
        return 304
    else:
        logging.info(f"{output.name}: Download OK, file is new")
        os.replace(tmpfile, output)
        info.sha256 = checksum
        info.time = datetime.now(tz=UTC)
        info.etag = response.headers.get("ETag", None)
        return 200

class _ResponseWrapper:
    """Helper class to handle compressed files.

    This class makes it possible to use the same `iter_content` method
    (as found on a requests.Response object) to get the content of a
    downloaded file, regardless of how the file has been compressed
    (if at all).
    """

    _stream: BufferedIOBase

    def __init__(self, stream: BufferedIOBase):
        self._stream = stream

    def iter_content(self, size: int = 512) -> Iterator[bytes]:
        while True:
            chunk = self._stream.read(size)
            if not chunk:
                break
            yield chunk
        self._stream.close()

    @classmethod
    def maybe_wrap(
        cls, response: requests.Response, compression: Compression
    ) -> Union[requests.Response, "_ResponseWrapper"]:
        if compression == Compression.GZIP:
            return _ResponseWrapper(GzipFile(fileobj=BytesIO(response.content)))
        elif compression == Compression.BZIP2:
            return _ResponseWrapper(BZ2File(BytesIO(response.content)))
        else:
            return response
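
A minimal usage sketch (not part of the commit; the URL, paths, and cache-file convention below are illustrative assumptions) showing how RemoteFileInfo and download_file fit together:

    from pathlib import Path

    from incatools.odk.download import Compression, RemoteFileInfo, download_file

    output = Path("imports/ro.owl")                       # hypothetical target
    cache = output.with_suffix(output.suffix + ".cache")  # hypothetical metadata file

    # Seed the conditional-download headers from the existing file and
    # from the metadata saved by a previous run, if any.
    info = RemoteFileInfo.from_file(output).from_cache_file(cache)

    status = download_file(
        "https://example.org/ro.owl.gz",  # hypothetical URL
        output,
        info,
        max_retry=3,
        compression=Compression.GZIP,
    )
    if status == 200:
        # The file changed; persist the new checksum/ETag/timestamp so the
        # next run can send If-None-Match / If-Modified-Since headers.
        info.to_file(cache)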
