diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml
index 15dc21b..1adda54 100644
--- a/.github/workflows/code-quality.yml
+++ b/.github/workflows/code-quality.yml
@@ -1,4 +1,4 @@
-name: Static code quality tests
+name: Static code quality tests and unit tests
 
 on:
   push:
@@ -35,8 +35,12 @@ jobs:
       - name: Install Python dependencies
         run: uv sync --dev --all-extras
       - name: Code linting
-        run: uv run ruff check src
+        run: uv run ruff check src tests
       - name: Code formatting
-        run: uv run ruff format --check src
+        run: uv run ruff format --check src tests
       - name: Static type checking
-        run: uv run mypy src
+        run: |
+          uv run mypy src
+          uv run mypy tests
+      - name: Unit tests
+        run: uv run pytest tests
diff --git a/pyproject.toml b/pyproject.toml
index 160cb25..9a72d21 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,10 +46,18 @@ source-include = [
     "tests/**",
 ]
 
+[tool.ruff.lint]
+select = [ "E4", "E7", "E9", "F", "I" ]
+
+[[tool.mypy.overrides]]
+module = ["incatools.odk.*"]
+follow_untyped_imports = true
+
 [dependency-groups]
 dev = [
     "ruff",
     "mypy",
+    "pytest",
     "types-PyYAML",
     "types-defusedxml",
     "types-requests",
diff --git a/src/incatools/odk/download.py b/src/incatools/odk/download.py
new file mode 100644
index 0000000..0098599
--- /dev/null
+++ b/src/incatools/odk/download.py
@@ -0,0 +1,258 @@
+# odkcore - Ontology Development Kit Core
+# Copyright © 2025 ODK Developers
+#
+# This file is part of the ODK Core project and distributed under the
+# terms of a 3-clause BSD license. See the LICENSE file in that project
+# for the detailed conditions.
+
+import logging
+import os
+from bz2 import BZ2File
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from enum import Enum
+from gzip import GzipFile
+from hashlib import sha256
+from io import BufferedIOBase, BytesIO
+from pathlib import Path
+from time import sleep
+from typing import Dict, Iterator, Optional, Union
+from urllib.parse import urlparse
+
+import requests
+
+RFC5322_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S GMT"
+# These are the HTTP errors that curl considers "transient" and
+# eligible for retrying when the --retry option is used.
+RETRIABLE_HTTP_ERRORS = (408, 429, 500, 502, 503, 504)
+
+
+class Compression(Enum):
+    NONE = (0, None)
+    GZIP = (1, ".gz")
+    BZIP2 = (2, ".bz2")
+
+    extension: Optional[str]
+
+    def __new__(cls, value: int, extension: Optional[str] = None):
+        self = object.__new__(cls)
+        self._value_ = value
+        self.extension = extension
+        return self
+
+    @classmethod
+    def from_extension(cls, path: Path) -> "Compression":
+        for v in Compression:
+            if v.extension is not None and v.extension == path.suffix:
+                return v
+        return Compression.NONE
+
+
+class DownloadError(Exception):
+    pass
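Reviewer note: the `Compression` enum overloads `__new__` so that each member carries the file extension it maps to, which is what `from_extension` relies on. A minimal sketch of the intended lookup behavior (the file names are hypothetical):

```python
from pathlib import Path

from incatools.odk.download import Compression

# ".gz" and ".bz2" suffixes map to their compression schemes...
assert Compression.from_extension(Path("go.owl.gz")) is Compression.GZIP
assert Compression.from_extension(Path("go.owl.bz2")) is Compression.BZIP2
# ...and anything else falls back to NONE. Note that only the last
# suffix is considered, since Path.suffix returns the final extension.
assert Compression.from_extension(Path("go.owl")) is Compression.NONE
```

This is also the hook the helper's --try-gzip option builds on: it appends ".gz" to the URL and passes Compression.GZIP explicitly.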
+ """ + + sha256: Optional[str] = None + """The SHA-256 checksum of the downloaded file.""" + + etag: Optional[str] = None + """The tag returned by the server for the file.""" + + time: Optional[datetime] = None + """The time the file was last downloaded.""" + + def to_file(self, dest: Path) -> None: + """Writes the information to a cache file.""" + with dest.open("w") as fd: + if self.sha256 is not None: + fd.write(f"sha256: {self.sha256}\n") + if self.etag is not None: + fd.write(f"etag: {self.etag}\n") + if self.time is not None: + fd.write(f"time: {self.time.strftime(RFC5322_DATE_FORMAT)}\n") + + def from_cache_file(self, source: Path) -> "RemoteFileInfo": + """Reads the information from a cache file.""" + if source.exists(): + with source.open("r") as fd: + for line in fd: + if line.startswith("#"): + continue + items = line.strip().split(": ", maxsplit=1) + if len(items) != 2: + continue + if items[0] == "sha256": + self.sha256 = items[1] + elif items[0] == "etag": + self.etag = items[1] + elif items[0] == "time": + self.time = datetime.strptime(items[1], RFC5322_DATE_FORMAT) + return self + + @classmethod + def from_file(cls, source: Path) -> "RemoteFileInfo": + """Gets the information from an existing file.""" + info = RemoteFileInfo() + if source.exists(): + info.time = datetime.fromtimestamp(source.stat().st_mtime, UTC) + h = sha256() + with source.open("rb") as fd: + while True: + chunk = fd.read(512) + if not chunk: + break + h.update(chunk) + info.sha256 = h.hexdigest() + return info + + +def download_file( + url: str, + output: Path, + info: RemoteFileInfo, + max_retry: int = 0, + compression: Compression = Compression.NONE, +) -> int: + """Downloads a remote file. + + This function will avoid needlessly downloading the file if the + remote server can tell us that the remote file has not changed since + the last time it was downloaded. In addition, even if the file is + downloaded, if it is found to be identical to the locally available + version, the existing file is not touched at all. + + :param url: The URL to download the file from. + :param output: Where the downloaded file should be written. + :param info: Informations about the last time the file was + downloaded. If the fields of that structure are set to None, + this means there is no local version of the file, and the + remote file should always be downloaded. If the download is + successful, the structure will be updated with informations from + the newly downloaded file. + :param max_retry: Number of download attempts to perform. + :param compression: How the remote file is compressed (if at all). + The file will be automatically uncompressed after being + downloaded. + + :returns: The HTTP status code. 
+ """ + headers: Dict[str, str] = {} + if info.time: + headers["If-Modified-Since"] = info.time.strftime(RFC5322_DATE_FORMAT) + if info.etag: + headers["If-None-Match"] = info.etag + + n_try = 0 + hostname = urlparse(url).hostname + while True: + try: + response = requests.get(url, timeout=5, headers=headers) + if response.status_code == 200: + return _handle_successful_download(response, output, info, compression) + elif response.status_code == 304: + logging.info(f"{output.name}: Not modified at {url}") + return 304 + elif response.status_code == 404: + logging.warn(f"{output.name}: Not found at {url}") + return 404 + elif response.status_code in RETRIABLE_HTTP_ERRORS and n_try < max_retry: + n_try += 1 + logging.warn( + f"{output.name}: Transient HTTP error, retrying ({n_try}/{max_retry}" + ) + sleep(1) + else: + response.raise_for_status() + except requests.exceptions.ConnectTimeout: + # `curl --retry` retries on timeout errors, and so do we + if n_try < max_retry: + n_try += 1 + logging.warn( + f"{output.name}: Timeout when connecting to {hostname}, retrying ({n_try}/{max_retry})" + ) + sleep(1) + else: + raise DownloadError(f"Timeout when connecting to {hostname}") + except requests.exceptions.ConnectionError: + raise DownloadError(f"Cannot connect to {hostname}") + except requests.exceptions.HTTPError: + raise DownloadError(f"HTTP error when downloading {url}") + except requests.exceptions.ReadTimeout: + raise DownloadError(f"Timeout when downloading {url}") + + +def _handle_successful_download( + response: requests.Response, + output: Path, + info: RemoteFileInfo, + comp: Compression, +) -> int: + h = sha256() + + # We download into a temporary file so that we do not touch the + # output file until (1) the download is complete and (2) we have + # verified that the downloaded file is different from the output + # file, if it already exists + tmpfile = output.with_suffix(output.suffix + ".tmp") + with tmpfile.open("wb") as fd: + for chunk in _ResponseWrapper.maybe_wrap(response, comp).iter_content(512): + h.update(chunk) + fd.write(chunk) + checksum = h.hexdigest() + if info.sha256 == checksum: + logging.info( + f"{output.name}: File newly downloaded is identical to previously downloaded file" + ) + # Remove the file we just downloaded, and report to caller as a + # 304 Not-Modified status + tmpfile.unlink() + return 304 + else: + logging.info(f"{output.name}: Download OK, file is new") + os.replace(tmpfile, output) + info.sha256 = checksum + info.time = datetime.now(tz=UTC) + info.etag = response.headers.get("ETag", None) + return 200 + + +class _ResponseWrapper: + """Helper class to handle compressed files. + + This class allows to use the same `iter_content` method (as found on + a requests.Response object) to get the content of a downloaded file, + regardless of how the file has been compressed (if at all). 
+ """ + + _stream: BufferedIOBase + + def __init__(self, stream: BufferedIOBase): + self._stream = stream + + def iter_content(self, size: int = 512) -> Iterator[bytes]: + while True: + chunk = self._stream.read(size) + if not chunk: + break + yield chunk + self._stream.close() + + @classmethod + def maybe_wrap( + cls, response: requests.Response, compression: Compression + ) -> Union[requests.Response, "_ResponseWrapper"]: + if compression == Compression.GZIP: + return _ResponseWrapper(GzipFile(fileobj=BytesIO(response.content))) + elif compression == Compression.BZIP2: + return _ResponseWrapper(BZ2File(BytesIO(response.content))) + else: + return response diff --git a/src/incatools/odk/helper.py b/src/incatools/odk/helper.py index e30feec..8352dcf 100644 --- a/src/incatools/odk/helper.py +++ b/src/incatools/odk/helper.py @@ -6,22 +6,26 @@ # for the detailed conditions. import json +import logging import shutil import subprocess from pathlib import Path +from typing import List, Tuple +from urllib.parse import urlparse import click from lightrdf import Parser as LightRDFParser # type: ignore from rdflib import Graph from . import __version__ +from .download import Compression, DownloadError, RemoteFileInfo, download_file from .template import DEFAULT_TEMPLATE_DIR, RESOURCES_DIR @click.group() def main() -> None: """Helper commands for ODK workflows.""" - pass + logging.basicConfig(level=logging.INFO) @main.command() @@ -132,3 +136,83 @@ def info(tools) -> None: if tools: cmd.append("--tools") subprocess.run(cmd) + + +@main.command() +@click.argument("url") +@click.option( + "-o", + "--output", + type=click.Path(path_type=Path), + help="""Write the downloaded file to the specified location. + The default is derived from the URL.""", +) +@click.option( + "-r", + "--reference", + type=click.Path(path_type=Path), + help="""Use the specified file as reference to decide whether to + download the file again. The default is to use the file + specified with the --output option.""", +) +@click.option( + "-i", + "--cache-info", + type=click.Path(path_type=Path), + help="""Read/write cache data from/to the specified file. The + default is the same location as specified with the + --reference option plus an added '.info' extension.""", +) +@click.option( + "--max-retry", + default=4, + metavar="N", + help="""Retry at most N times. Default is 4. Set to zero to disable retrying.""", +) +@click.option( + "--try-gzip/--no-try-gzip", + default=True, + help="""Given the URL "U", automatically attempt to download "U.gz", + then fallback to "U". 
diff --git a/src/incatools/odk/model.py b/src/incatools/odk/model.py
index 765ba87..f11456f 100644
--- a/src/incatools/odk/model.py
+++ b/src/incatools/odk/model.py
@@ -439,15 +439,14 @@ class ImportGroup(ProductGroup):
     mirror_retry_download: int = 4
     """How many times to try downloading the source of an import module.
 
-    This corresponds to the cURL --retry parameter,
+    This behaves similarly to cURL's --retry parameter,
     see the cURL manual.
     """
 
     mirror_max_time_download: int = 200
     """How long downloading the source of an import module is allowed to take.
 
-    This corresponds to the cURL --max-time parameter (in seconds),
-    see the cURL manual.
+    DEPRECATED: This option no longer has any effect.
     """
 
     release_imports: bool = False
""" release_imports: bool = False diff --git a/src/incatools/odk/templates/src/ontology/Makefile.jinja2 b/src/incatools/odk/templates/src/ontology/Makefile.jinja2 index eeed803..8b16aee 100644 --- a/src/incatools/odk/templates/src/ontology/Makefile.jinja2 +++ b/src/incatools/odk/templates/src/ontology/Makefile.jinja2 @@ -773,6 +773,7 @@ refresh-imports-excluding-large: .PHONY: refresh-% refresh-%: + rm -f $(TMPDIR)/download-mirror-$*.owl.info $(MAKE) --assume-new=$(SRC) --assume-new=$(IMPORTDIR)/$*_terms.txt \ IMP=true IMP_LARGE=true MIR=true PAT=false $(IMPORTDIR)/$*_import.owl @@ -805,6 +806,7 @@ no-mirror-recreate-components: .PHONY: recreate-% recreate-%: + rm -f $(TMPDIR)/download-component-$*.owl.info $(MAKE) --assume-new=$(TMPDIR)/stamp-component-$*.owl \ COMP=true IMP=false MIR=true PAT=true $(COMPONENTSDIR)/$*.owl @@ -824,23 +826,21 @@ $(TMPDIR)/stamp-component-%.owl: | $(TMPDIR) {% for component in project.components.products -%} {% if component.source is not none -%} ifeq ($(MIR),true) -.PHONY: component-download-{{ component.filename }} -component-download-{{ component.filename }}: | $(TMPDIR) - $(ROBOT) merge -I {{ component.source }} \{% if component.make_base %} - remove {% for iri in component.base_iris %}--base-iri {{ iri }} \ - {% endfor %}--axioms external --preserve-structure false --trim false \{% endif %} - annotate --annotation owl:versionInfo $(VERSION) --output $(TMPDIR)/$@.owl - -$(COMPONENTSDIR)/{{ component.filename }}: component-download-{{ component.filename }} $(TMPDIR)/stamp-component-{{ component.filename }} - @if cmp -s $(TMPDIR)/component-download-{{ component.filename }}.owl $(TMPDIR)/component-download-{{ component.filename }}.tmp.owl ; then \ - echo "Component identical." ; \ - else \ - echo "Component different, updating." && \ - cp $(TMPDIR)/component-download-{{ component.filename }}.owl $(TMPDIR)/component-download-{{ component.filename }}.tmp.owl && \ - $(ROBOT) annotate --input $(TMPDIR)/component-download-{{ component.filename }}.owl \ - --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) \ - --output $@ ; \ +.PHONY: download-component-{{ component.filename }} +download-component-{{ component.filename }}: | $(TMPDIR) + @odk-helper download --output $(TMPDIR)/$@ \ + --reference $(COMPONENTSDIR)/{{ component.filename }} \ + {{ component.source }} + +$(COMPONENTSDIR)/{{ component.filename }}: download-component-{{ component.filename }} $(TMPDIR)/stamp-component-{{ component.filename }} + @if [ -f $(TMPDIR)/download-component-{{ component.filename }} ]; then \ + $(ROBOT) merge -i $(TMPDIR)/download-component-{{ component.filename }} \{% if component.make_base %} + remove {% for iri in component.base_iris %}--base-iri {{ iri }} \ + {% endfor %}--axioms external --preserve-structure false --trim false \{% endif %} + annotate --ontology-iri $(ONTBASE)/$@ $(ANNOTATE_ONTOLOGY_VERSION) \ + --output $@ ; \ fi + .PRECIOUS: $(COMPONENTSDIR)/{{ component.filename }} endif # MIR=true @@ -864,61 +864,59 @@ $(COMPONENTSDIR)/{{ component.filename }}:{% for mapping in component.mappings % {% endfor -%} endif # COMP=true {% endif %}{# ! 
@@ -864,61 +864,59 @@ $(COMPONENTSDIR)/{{ component.filename }}:{% for mapping in component.mappings %
 {% endfor -%}
 endif # COMP=true
 {% endif %}{# !project.components is not none -#}
+
 {% if project.import_group is defined -%}
 # ----------------------------------------
 # Mirroring upstream ontologies
 # ----------------------------------------
 ifeq ($(MIR),true)
-{% for ont in project.import_group.products %}
+{% for ont in project.import_group.products -%}
 
 ## ONTOLOGY: {{ ont.id }}
-{% if ont.description -%}
+{% if ont.description -%}
 ## {{ ont.description }}
-{% endif -%}
+{% endif -%}
 
-{%- if ont.mirror_type != 'no_mirror' -%}
-.PHONY: mirror-{{ ont.id }}
+{% if ont.mirror_type != 'no_mirror' -%}
+.PHONY: download-mirror-{{ ont.id }}
 .PRECIOUS: $(MIRRORDIR)/{{ ont.id }}.owl
-{%- endif -%}
+download-mirror-{{ ont.id }}: | $(TMPDIR)
+	odk-helper download --output $(TMPDIR)/$@.owl \
+		--reference $(MIRRORDIR)/{{ ont.id }}.owl \
+		--max-retry {{ project.import_group.mirror_retry_download }} \
+		{% if ont.mirror_from %}{{ ont.mirror_from }}{% elif ont.use_base %}$(OBOBASE)/{{ ont.id }}/{{ ont.id }}-base.owl{% if ont.use_gzipped %}.gz{% endif %}{% else %}$(OBOBASE)/{{ ont.id }}.owl{% if ont.use_gzipped %}.gz{% endif %}{% endif %}
+
+{% if ont.mirror_type == 'custom' -%}
+$(MIRRORDIR)/{{ ont.id }}.owl:
+	@echo "ERROR: You have configured your default mirror type to be custom;"
+	@echo "       this behavior needs to be overwritten in {{ project.id }}.Makefile!"
+	@false
 
-{%- if ont.is_large %}
+{% else -%}
+{% if ont.is_large -%}
 ifeq ($(IMP_LARGE),true)
-{%- endif %}
-{%- if ont.mirror_from %}
-mirror-{{ ont.id }}: | $(TMPDIR)
-	$(ROBOT) {% if ont.make_base or 'base' == ont.mirror_type %}remove -I {{ ont.mirror_from }} {% if ont.base_iris is not none %}{% for iri in ont.base_iris %}--base-iri {{iri}} {% endfor %}{% else %}--base-iri $(OBOBASE)/{{ ont.id.upper() }} {% endif %} --axioms external --preserve-structure false --trim false{% else %}convert -I {{ ont.mirror_from }}{% endif %} -o $(TMPDIR)/$@.owl
-{%- elif ont.use_base %}
-{%- if ont.use_gzipped %}
-mirror-{{ ont.id }}: | $(TMPDIR)
-	curl -L $(OBOBASE)/{{ ont.id }}/{{ ont.id }}-base.owl.gz --create-dirs -o $(MIRRORDIR)/{{ ont.id }}-base.owl.gz --retry {{project.import_group.mirror_retry_download}} --max-time {{ project.import_group.mirror_max_time_download }} && \
-	$(ROBOT) {% if ont.make_base or 'base' == ont.mirror_type %}remove -i $(MIRROR_DIR)/{{ ont.id }}-base.owl.gz {% if ont.base_iris is not none %}{% for iri in ont.base_iris %}--base-iri {{iri}} {% endfor %}{ else %}--base-iri $(OBOBASE)/{{ ont.id.upper() }} {% endif %} --axioms external --preserve-structure false --trim false{% else %}convert -i $(MIRRORDIR)/{{ ont.id }}-base.owl.gz{% endif %} -o $(TMPDIR)/$@.owl
-{%- else %}
-mirror-{{ ont.id }}: | $(TMPDIR)
-	curl -L $(OBOBASE)/{{ ont.id }}/{{ ont.id }}-base.owl --create-dirs -o $(TMPDIR)/{{ ont.id }}-download.owl --retry {{ project.import_group.mirror_retry_download }} --max-time {{ project.import_group.mirror_max_time_download }} && \
-	$(ROBOT) convert -i $(TMPDIR)/{{ ont.id }}-download.owl -o $(TMPDIR)/$@.owl
-{%- endif %}
-{%- else %}
-{%- if ont.use_gzipped %}
-mirror-{{ ont.id }}: | $(TMPDIR)
-	curl -L $(OBOBASE)/{{ ont.id }}.owl.gz --create-dirs -o $(MIRRORDIR)/{{ ont.id }}.owl.gz --retry {{ project.import_group.mirror_retry_download }} --max-time {{ project.import_group.mirror_max_time_download }} && \
-	$(ROBOT) {% if ont.make_base or 'base' == ont.mirror_type %}remove -i $(MIRRORDIR)/{{ ont.id }}.owl.gz {% if ont.base_iris is not none %}{% for iri in ont.base_iris %}--base-iri {{iri}} {% endfor %}{% else %}--base-iri $(OBOBASE)/{{ ont.id.upper() }} {% endif %}--axioms external --preserve-structure false --trim false{% else %}convert -i $(MIRRORDIR)/{{ ont.id }}.owl.gz{% endif %} -o $(TMPDIR)/$@.owl
-{%- elif 'custom' == ont.mirror_type %}
-$(MIRRORDIR)/{{ ont.id }}.owl:
-	echo "ERROR: You have configured your default mirror type to be custom; this behavior needs to be overwritten in {{ project.id }}.Makefile!" && false
-{%- elif 'no_mirror' == ont.mirror_type -%}
-## You have configured your default mirror type to no_mirror.
-{%- else %}
-mirror-{{ ont.id }}: | $(TMPDIR)
-	curl -L $(OBOBASE)/{{ ont.id }}.owl --create-dirs -o $(TMPDIR)/{{ ont.id }}-download.owl --retry {{ project.import_group.mirror_retry_download }} --max-time {{ project.import_group.mirror_max_time_download }} && \
-	$(ROBOT) {% if ont.make_base or 'base' == ont.mirror_type %}remove -i $(TMPDIR)/{{ ont.id }}-download.owl {% if ont.base_iris is not none %}{% for iri in ont.base_iris %}--base-iri {{iri}} {% endfor %}{% else %}--base-iri $(OBOBASE)/{{ ont.id.upper() }} {% endif %}--axioms external --preserve-structure false --trim false{% else %}convert -i $(TMPDIR)/{{ ont.id }}-download.owl{% endif %} -o $(TMPDIR)/$@.owl
-{%- endif %}
-{%- endif %}
-{%- if ont.is_large %}
+{% endif -%}
+$(MIRRORDIR)/{{ ont.id }}.owl: download-mirror-{{ ont.id }}
+	@if [ -f $(TMPDIR)/download-mirror-{{ ont.id }}.owl ]; then \
+		$(ROBOT) {% if ont.make_base or 'base' == ont.mirror_type %}remove -i $(TMPDIR)/download-mirror-{{ ont.id }}.owl \{% if ont.base_iris is not none %}{% for iri in ont.base_iris %}
+			--base-iri {{ iri }} \{% endfor %}{% else %}
+			--base-iri $(OBOBASE)/{{ ont.id.upper() }} \{% endif %}
+			--axioms external --preserve-structure false --trim false \
+			convert {% else %}convert -i $(TMPDIR)/download-mirror-{{ ont.id }}.owl {% endif %}-o $@ ; \
+	fi
+
+{% if ont.is_large -%}
 endif
-{%- endif %}
-{% endfor -%}
+{% endif -%}
+
+{% endif -%}
+
+{% endif %}{# !ont.mirror_type != 'no_mirror' -#}
+
+{% endfor %}{# for ont in project.import_group.products -#}
 
-{% if project.import_group.use_base_merging %}
+{% if project.import_group.use_base_merging -%}
 ALL_MIRRORS = $(patsubst %, $(MIRRORDIR)/%.owl, $(IMPORTS))
 
 MERGE_MIRRORS = true
 
@@ -927,14 +925,10 @@ $(MIRRORDIR)/merged.owl: $(ALL_MIRRORS)
 	$(ROBOT) merge $(patsubst %, -i %, $^) {% if project.import_group.annotate_defined_by %}--annotate-defined-by true{% endif %} {% if project.import_group.base_merge_drop_equivalent_class_axioms %}remove --axioms equivalent --preserve-structure false {% endif %}-o $@
 .PRECIOUS: $(MIRRORDIR)/merged.owl
 endif
-{% endif %}
-
-$(MIRRORDIR)/%.owl: mirror-% | $(MIRRORDIR)
-	if [ -f $(TMPDIR)/mirror-$*.owl ]; then if cmp -s $(TMPDIR)/mirror-$*.owl $@ ; then echo "Mirror identical, ignoring."; else echo "Mirrors different, updating." &&\
-	cp $(TMPDIR)/mirror-$*.owl $@; fi; fi
+{% endif -%}
 
 endif # MIR=true
-{% endif %}
+{% endif %}{# project.import_group is defined -#}
 
 {% if project.subset_group is defined %}
 # ----------------------------------------
@@ -1599,6 +1593,7 @@ clean:{% if project.use_dosdps %}
 		case $$reldir in .*|"") ;; *) rm -rf $$reldir/* ;; esac \
 	done
 	rm -f $(CLEANFILES)
+	rm -f $(TMPDIR)/download-*.info
 
 .PHONY: help
 help:
diff --git a/tests/test_download.py b/tests/test_download.py
new file mode 100644
index 0000000..e2be5a5
--- /dev/null
+++ b/tests/test_download.py
@@ -0,0 +1,75 @@
+# odkcore - Ontology Development Kit Core
+# Copyright © 2026 ODK Developers
+#
+# This file is part of the ODK Core project and distributed under the
+# terms of a 3-clause BSD license. See the LICENSE file in that project
+# for the detailed conditions.
+
+import unittest
+from pathlib import Path
+from time import sleep
+
+from incatools.odk.download import RemoteFileInfo, download_file
+
+
+class TestDownload(unittest.TestCase):
+    """A test case for the download function."""
+
+    downloaded_file: Path
+    url: str
+
+    def setUp(self) -> None:
+        self.downloaded_file = Path("xxx-downloaded_file.txt")
+        self.url = "https://raw.githubusercontent.com/INCATools/odkcore/refs/heads/main/README.md"
+
+    def tearDown(self) -> None:
+        if self.downloaded_file.exists():
+            self.downloaded_file.unlink()
+
+    def test_download_once(self) -> None:
+        """Test that we do not re-download a file needlessly."""
+
+        ri = RemoteFileInfo()
+        code = download_file(self.url, self.downloaded_file, ri, max_retry=5)
+        if code != 200:
+            self.skipTest(f"Could not download from {self.url}, cannot test")
+            return
+
+        self.assertTrue(self.downloaded_file.exists())
+        mtime = self.downloaded_file.stat().st_mtime
+
+        sleep(1)
+        code = download_file(self.url, self.downloaded_file, ri, max_retry=5)
+
+        # Either the server told us the file had not been modified, or
+        # the download function checked that the newly downloaded file
+        # is the same as the one downloaded just before. Either way, we
+        # should get a 304, and the file should not have been touched.
+        self.assertEqual(304, code)
+        self.assertEqual(mtime, self.downloaded_file.stat().st_mtime)
+
+    def test_no_overwrite_identical_file(self) -> None:
+        """Test that a re-downloaded file is not overwritten needlessly."""
+
+        ri = RemoteFileInfo()
+        code = download_file(self.url, self.downloaded_file, ri, max_retry=5)
+        if code != 200:
+            self.skipTest(f"Could not download from {self.url}, cannot test")
+            return
+
+        mtime = self.downloaded_file.stat().st_mtime
+
+        # Remove server-side data from the RemoteFileInfo, so that the
+        # server cannot tell us whether the file has been modified or
+        # not, thereby forcing us to re-download the file.
+        ri.etag = None
+        ri.time = None
+
+        sleep(1)
+        code = download_file(self.url, self.downloaded_file, ri, max_retry=5)
+
+        # Even if we have re-downloaded the file, the function should
+        # still report that as a 304 and the previously downloaded file
+        # should not have been touched.
+        self.assertEqual(304, code)
+        self.assertEqual(mtime, self.downloaded_file.stat().st_mtime)
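Reviewer note: both tests hit the network and skip when it is unavailable. If flakiness becomes an issue, the HTTP layer could be stubbed with unittest.mock; a rough sketch, not part of this PR (all names and values are illustrative):

```python
from unittest.mock import MagicMock, patch

import incatools.odk.download as dl


def _fake_response(body: bytes, status: int = 200) -> MagicMock:
    """Build a minimal stand-in for a requests.Response object."""
    resp = MagicMock()
    resp.status_code = status
    resp.content = body
    resp.headers = {"ETag": '"fake-etag"'}
    # Uncompressed downloads are read through iter_content().
    resp.iter_content.return_value = [body]
    return resp


# Patching requests.get as seen from the download module lets a test
# drive download_file() offline, e.g. through the 200-then-304 path
# (the second call re-downloads an identical body, which the function
# reports as a 304).
with patch.object(dl.requests, "get", return_value=_fake_response(b"data")):
    ...  # call dl.download_file(...) twice and assert 200, then 304
```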