RDFLib
diff --git a/‎test/data/README.md‎
Lines changed: 23 additions & 3 deletions b/‎test/data/README.md‎
Lines changed: 23 additions & 3 deletions
diff --git a/‎test/data/fetcher.py‎
Lines changed: 309 additions & 0 deletions b/‎test/data/fetcher.py‎
Lines changed: 309 additions & 0 deletions
diff --git a/‎test/data/suites/w3c/n3/LICENSE.md‎
Lines changed: 4 additions & 0 deletions b/‎test/data/suites/w3c/n3/LICENSE.md‎
Lines changed: 4 additions & 0 deletions
@@ -1,9 +1,29 @@
-# Consistent Test Data
+# Test Data
 
-This directory contains consistent graphs that can be used inside tests, the
-graphs in this directory should not change.
+This directory contains data for use inside tests. Ideally, the data in this
+directory should be constant and should not change. In general, non-original
+data that is widely known is preferred to original data, as well-known data has
+well-known attributes and qualities that can make it easier to reason about.
 
 
 ## File origins
 
 - `rdfs.ttl`: `http://www.w3.org/2000/01/rdf-schema#`
+
+## Fetcher
+
+Files that originate from the internet should be downloaded using `fetcher.py`
+so we can easily verify the integrity of the files by re-running `fetcher.py`.
+
+```bash
+# run in repo root
+
+# fetch everything
+.venv/bin/python3 test/data/fetcher.py
+
+# only fetch single file
+.venv/bin/python3 test/data/fetcher.py test/data/rdfs.ttl
+
+# only fetch files below path:
+.venv/bin/python3 test/data/fetcher.py test/data/suites
+```
@@ -0,0 +1,309 @@
+#!/usr/bin/env python
+import argparse
+import enum
+import logging
+import os
+import random
+import re
+import shutil
+import string
+import sys
+import tarfile
+from contextlib import ExitStack, contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from tarfile import TarFile, TarInfo
+from tempfile import TemporaryDirectory, mkdtemp
+from typing import IO, Generator, List, Pattern, Union
+from urllib.request import Request, urlopen
+from zipfile import ZipFile, ZipInfo
+
+# Directory that contains this script; the local paths of the fetched
+# resources are built relative to it.
+DATA_PATH = Path(__file__).parent
+
+
+@dataclass
+class Resource:
+    """Base class for a remote resource that is fetched to a local path.
+
+    This class only holds the fields common to all resource kinds; the
+    actual download logic is supplied by subclasses through :meth:`fetch`.
+    """
+
+    # URL, or an already prepared ``urllib.request.Request``, to fetch from.
+    remote: Union[str, Request]
+    # Location on the local file system where the fetched data is stored.
+    local_path: Path
+
+    def fetch(self, tmp_path: Path) -> None:
+        """Fetch the resource; subclasses must override this method.
+
+        :param tmp_path: Directory that may be used for temporary files.
+        :raises NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError
+
+
+@dataclass
+class FileResource(Resource):
+    """A single remote file that is downloaded to ``local_path``."""
+
+    def fetch(self, tmp_path: Path) -> None:
+        """Download ``remote`` and store it at ``local_path``.
+
+        The download is first written to a file inside ``tmp_path`` and is
+        only moved over ``local_path`` once it has completed, so a failed or
+        interrupted download leaves an already existing local file untouched.
+
+        :param tmp_path: Existing directory used to stage the download.
+        """
+        request = (
+            self.remote
+            if isinstance(self.remote, Request)
+            else Request(self.remote)
+        )
+
+        staged_file = tmp_path / self.local_path.name
+        with ExitStack() as xstack:
+            remote_io: IO[bytes] = xstack.enter_context(urlopen(request))
+            staged_io = xstack.enter_context(staged_file.open("wb"))
+            shutil.copyfileobj(remote_io, staged_io)
+
+        # Only discard the previous copy once the new one is fully downloaded.
+        if self.local_path.exists():
+            logging.debug("info %s", self.local_path)
+            os.remove(self.local_path)
+        # ``shutil.move`` also works when ``tmp_path`` is on another file system.
+        shutil.move(str(staged_file), str(self.local_path))
+
+        logging.info("Downloaded %s to %s", request.full_url, self.local_path)
+
+
+class ArchiveType(enum.Enum):
+    # Archive formats that ``ArchiveResource`` can unpack. The value of each
+    # member is the file extension (without the leading dot) that is appended
+    # to the name of the temporary file the archive is downloaded to.
+    ZIP = "zip"
+    TAR_GZ = "tar.gz"
+
+
+@dataclass
+class ArchiveResource(Resource):
+    """A remote archive whose matching members are extracted to ``local_path``.
+
+    The archive is downloaded to a temporary file, every non-directory member
+    whose name matches ``pattern`` is written below ``local_path``, and all
+    other members are ignored.
+    """
+
+    # Format of the remote archive.
+    type: ArchiveType
+    # Applied with ``match`` to every member name. If the pattern has a first
+    # capture group, its value is the destination path relative to
+    # ``local_path``; otherwise the full member name is used.
+    pattern: Pattern[str]
+
+    def fetch(self, tmp_path: Path) -> None:
+        """Download the archive and extract the matching members.
+
+        Any existing ``local_path`` directory is only removed after the
+        archive has been downloaded and opened successfully, so a failed
+        download does not destroy the data that is already present.
+
+        :param tmp_path: Existing directory the archive is downloaded into.
+        :raises ValueError: If ``type`` is not a supported archive type, if a
+            member would be written outside of ``local_path``, or if a
+            matching tar member has no readable content.
+        """
+        request = (
+            self.remote
+            if isinstance(self.remote, Request)
+            else Request(self.remote)
+        )
+        tmp_file = self._download(request, tmp_path)
+
+        with self._open_archive(tmp_file) as archive_file:
+            if self.local_path.exists():
+                logging.debug("info %s", self.local_path)
+                shutil.rmtree(self.local_path)
+
+            for member_info in self._member_list(archive_file):
+                member_filename = self._member_filename(member_info)
+                if self._member_isdir(member_info):
+                    logging.debug("Ignoring directory %s", member_filename)
+                    continue
+
+                dest_filename = self._dest_filename(member_filename)
+                if dest_filename is None:
+                    logging.debug("Ignoring unmatched %s", member_filename)
+                    continue
+
+                local_file = self._local_file(dest_filename)
+                member_io: IO[bytes]
+                with self._member_io(archive_file, member_info) as member_io:
+                    local_file.parent.mkdir(parents=True, exist_ok=True)
+                    logging.debug("writing %s to %s", member_filename, local_file)
+                    local_file.write_bytes(member_io.read())
+
+        logging.info(
+            "Downloaded %s and extracted files matching %s to %s",
+            request.full_url,
+            self.pattern,
+            self.local_path,
+        )
+
+    def _download(self, request: Request, tmp_path: Path) -> Path:
+        """Download ``request`` to a randomly named file below ``tmp_path``."""
+        name = (
+            "".join(
+                random.choices(
+                    string.ascii_uppercase + string.digits + string.ascii_lowercase,
+                    k=10,
+                )
+            )
+            + f".{self.type.value}"
+        )
+        tmp_file = tmp_path / name
+        logging.info("fetching %s to temp file %s", self.remote, tmp_file)
+        with ExitStack() as xstack:
+            remote_io: IO[bytes] = xstack.enter_context(urlopen(request))
+            tmp_io = xstack.enter_context(tmp_file.open("wb"))
+            shutil.copyfileobj(remote_io, tmp_io)
+        return tmp_file
+
+    def _open_archive(self, archive_path: Path) -> Union[ZipFile, TarFile]:
+        """Open ``archive_path`` with the reader that belongs to ``type``."""
+        if self.type is ArchiveType.ZIP:
+            return ZipFile(archive_path)
+        if self.type is ArchiveType.TAR_GZ:
+            return tarfile.open(archive_path, mode="r:gz")
+        raise ValueError(f"invalid type {self.type}")
+
+    def _dest_filename(self, member_filename: str) -> Union[str, None]:
+        """Return the destination name for a member, or ``None`` if unmatched.
+
+        The first capture group of ``pattern`` is used when it exists and
+        took part in the match; otherwise the whole member name is used, so
+        a pattern without groups never yields an undefined or stale name.
+        """
+        match = self.pattern.match(member_filename)
+        if match is None:
+            return None
+        groups = match.groups()
+        if groups and groups[0] is not None:
+            return groups[0]
+        return member_filename
+
+    def _local_file(self, dest_filename: str) -> Path:
+        """Return the path for ``dest_filename``, refusing paths that escape.
+
+        Member names come from a downloaded archive, so absolute names and
+        ``..`` components must not be allowed to write outside ``local_path``.
+        """
+        local_file = self.local_path / dest_filename
+        try:
+            local_file.resolve().relative_to(self.local_path.resolve())
+        except ValueError:
+            raise ValueError(
+                f"archive member {dest_filename!r} resolves outside of "
+                f"{self.local_path}"
+            ) from None
+        return local_file
+
+    @classmethod
+    def _member_list(
+        cls, archive: Union[ZipFile, TarFile]
+    ) -> Union[List[ZipInfo], List[TarInfo]]:
+        """Return the info objects of all members of ``archive``."""
+        if isinstance(archive, ZipFile):
+            return archive.infolist()
+        return archive.getmembers()
+
+    @classmethod
+    def _member_isdir(cls, member_info: Union[ZipInfo, TarInfo]) -> bool:
+        """Return whether ``member_info`` describes a directory."""
+        if isinstance(member_info, ZipInfo):
+            return member_info.is_dir()
+        return member_info.isdir()
+
+    @classmethod
+    def _member_filename(cls, member_info: Union[ZipInfo, TarInfo]) -> str:
+        """Return the name of the member inside its archive."""
+        if isinstance(member_info, ZipInfo):
+            return member_info.filename
+        return member_info.name
+
+    @classmethod
+    @contextmanager
+    def _member_io(
+        cls, archive: Union[ZipFile, TarFile], member_info: Union[ZipInfo, TarInfo]
+    ) -> Generator[IO[bytes], None, None]:
+        """Yield a binary stream for the member; it is closed on exit.
+
+        :raises ValueError: If a tar member has no content to read, i.e.
+            ``TarFile.extractfile`` returned ``None`` for it.
+        """
+        if isinstance(archive, ZipFile):
+            assert isinstance(member_info, ZipInfo)
+            with archive.open(member_info) as member_io:
+                yield member_io
+        else:
+            assert isinstance(member_info, TarInfo)
+            opt_io = archive.extractfile(member_info)
+            if opt_io is None:
+                raise ValueError(
+                    f"tar member {member_info.name!r} has no readable content"
+                )
+            with opt_io:
+                yield opt_io
+
+
+# All resources that this script knows how to fetch. Each entry pairs a remote
+# location with the local path (below ``DATA_PATH``) that it is written to.
+# For archives, ``pattern`` selects the members to extract and its first
+# capture group is the path of the member relative to ``local_path``.
+RESOURCES: List[Resource] = [
+    ArchiveResource(
+        remote="https://github.com/w3c/N3/archive/c44d123c5958ca04117e28ca3769e2c0820f72e6.zip",
+        local_path=(DATA_PATH / "suites" / "w3c" / "n3"),
+        type=ArchiveType.ZIP,
+        # Only members below "<top-level directory>/tests/" are matched; the
+        # captured part is the path after that prefix.
+        pattern=re.compile(r"^[^\/]+[\/]tests[\/](.+)$"),
+    ),
+    ArchiveResource(
+        remote="https://www.w3.org/2013/TurtleTests/TESTS.tar.gz",
+        local_path=(DATA_PATH / "suites" / "w3c" / "turtle"),
+        type=ArchiveType.TAR_GZ,
+        # Captures the path after the first path component of the member name.
+        pattern=re.compile(r"^[^\/]+[\/](.+)$"),
+    ),
+    ArchiveResource(
+        remote="https://www.w3.org/2013/N-QuadsTests/TESTS.tar.gz",
+        local_path=(DATA_PATH / "suites" / "w3c" / "nquads"),
+        type=ArchiveType.TAR_GZ,
+        # Captures the whole member name.
+        pattern=re.compile(r"^(.+)$"),
+    ),
+    ArchiveResource(
+        remote="https://www.w3.org/2013/N-TriplesTests/TESTS.tar.gz",
+        local_path=(DATA_PATH / "suites" / "w3c" / "ntriples"),
+        type=ArchiveType.TAR_GZ,
+        pattern=re.compile(r"^(.+)$"),
+    ),
+    ArchiveResource(
+        remote="https://www.w3.org/2013/TrigTests/TESTS.tar.gz",
+        local_path=(DATA_PATH / "suites" / "w3c" / "trig"),
+        type=ArchiveType.TAR_GZ,
+        pattern=re.compile(r"^(.+)$"),
+    ),
+    # NOTE: Commented out as these files contains local modifications.
+    # ArchiveResource(
+    #     remote="https://www.w3.org/2013/RDFXMLTests/TESTS.zip",
+    #     local_path=(DATA_PATH / "suites" / "w3c" / "rdfxml"),
+    #     type=ArchiveType.ZIP,
+    #     pattern=re.compile(r"^(.+)$"),
+    # ),
+    # NOTE: Commented out as this contains local modifications.
+    # ArchiveResource(
+    #     remote="https://www.w3.org/2009/sparql/docs/tests/sparql11-test-suite-20121023.tar.gz",
+    #     local_path=(DATA_PATH / "suites" / "DAWG" / "data-sparql11"),
+    #     type=ArchiveType.TAR_GZ,
+    #     pattern=re.compile(r"^[^\/]+[\/](.+)$"),
+    # ),
+    # The RDFS vocabulary is requested with an ``Accept: text/turtle`` header.
+    FileResource(
+        remote=Request(
+            "http://www.w3.org/2000/01/rdf-schema#", headers={"Accept": "text/turtle"}
+        ),
+        local_path=(DATA_PATH / "rdfs.ttl"),
+    ),
+]
+
+
+@dataclass
+class Application:
+    """Command line application that fetches the entries of ``RESOURCES``."""
+
+    # Parser for the command line; options are registered in __post_init__.
+    parser: argparse.ArgumentParser = field(
+        default_factory=lambda: argparse.ArgumentParser(add_help=True)
+    )
+
+    def __post_init__(self) -> None:
+        """Register the command line options and the default handler."""
+        parser = self.parser
+        parser.add_argument(
+            "-v",
+            "--verbose",
+            action="count",
+            dest="verbosity",
+            help="increase verbosity level",
+        )
+        parser.add_argument(
+            "--keep-tmp",
+            action="store_true",
+            default=False,
+            help="do not delete the temporary directory used for downloads",
+        )
+        parser.add_argument(
+            "paths",
+            nargs="*",
+            type=str,
+            help="only fetch resources whose local path is at or below one of these paths",
+        )
+        parser.set_defaults(handler=self.handle)
+
+    def run(self, args: List[str]) -> None:
+        """Parse ``args``, adjust the log level and invoke the handler.
+
+        :param args: Command line arguments without the program name.
+        """
+        parse_result = self.parser.parse_args(args)
+
+        verbosity = parse_result.verbosity
+        if verbosity is not None:
+            root_logger = logging.getLogger("")
+            root_logger.propagate = True
+            # The first ``-v`` lowers the level by 10, each further ``-v``
+            # (at most nine of them) lowers it by 1.
+            new_level = (
+                root_logger.getEffectiveLevel()
+                - (min(1, verbosity)) * 10
+                - min(max(0, verbosity - 1), 9) * 1
+            )
+            root_logger.setLevel(new_level)
+
+        logging.debug(
+            "args = %s, parse_result = %s, logging.level = %s",
+            args,
+            parse_result,
+            logging.getLogger("").getEffectiveLevel(),
+        )
+
+        parse_result.handler(parse_result)
+
+    def handle(self, parse_result: argparse.Namespace) -> None:
+        """Fetch every resource selected by the parsed command line.
+
+        Without ``paths`` all resources are fetched; otherwise only those
+        whose local path is at or below one of the given paths. The temporary
+        directory is removed when fetching ends (also on error) unless
+        ``--keep-tmp`` was given.
+
+        :param parse_result: Namespace produced by ``self.parser``.
+        """
+        logging.debug("entry ...")
+
+        paths = {Path(path).absolute() for path in parse_result.paths}
+
+        logging.debug("paths = %s", paths)
+
+        with ExitStack() as xstack:
+            if parse_result.keep_tmp:
+                tmp_path = Path(mkdtemp())
+            else:
+                # Entering the context returns the directory name and makes
+                # the cleanup explicit instead of relying on the finalizer.
+                tmp_path = Path(xstack.enter_context(TemporaryDirectory()))
+
+            for resource in RESOURCES:
+                if paths and not any(
+                    self._is_below(resource.local_path, path) for path in paths
+                ):
+                    logging.info("skipping %s", resource.local_path)
+                    continue
+                resource.fetch(tmp_path)
+
+    @staticmethod
+    def _is_below(candidate: Path, parent: Path) -> bool:
+        """Return whether ``candidate`` is ``parent`` or located below it."""
+        try:
+            candidate.absolute().relative_to(parent)
+        except ValueError:
+            return False
+        return True
+
+
+def main() -> None:
+    """Set up logging to stderr and run the application on ``sys.argv``.
+
+    The log level is read from the ``PYLOGGING_LEVEL`` environment variable
+    and falls back to ``logging.INFO`` when the variable is not set.
+    """
+    log_format = (
+        "%(asctime)s.%(msecs)03d %(process)d %(thread)d %(levelno)03d:%(levelname)-8s "
+        "%(name)-12s %(module)s:%(lineno)s:%(funcName)s %(message)s"
+    )
+    log_level = os.environ.get("PYLOGGING_LEVEL", logging.INFO)
+    logging.basicConfig(
+        level=log_level,
+        stream=sys.stderr,
+        datefmt="%Y-%m-%dT%H:%M:%S",
+        format=log_format,
+    )
+
+    cli_args = sys.argv[1:]
+    application = Application()
+    application.run(cli_args)
+
+
+# Only run when executed as a script, not when imported as a module.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,4 @@
+All documents in this Repository are licensed by contributors
+under both the [W3C Test Suite License](http://www.w3.org/Consortium/Legal/2008/04-testsuite-license) and
+[W3C Software and Document License](https://www.w3.org/Consortium/Legal/copyright-software).
+