Commit 02dd2bf

Add remote file fetcher and N3 test suite
This patch adds the N3 test suite from https://github.com/w3c/N3/tree/master/tests and also adds `test/data/fetcher.py`, which fetches remote test data. Remotes are added for some data in the test data directory; more will be added later, and the data itself will be corrected. I'm mainly doing this because I want N3 test data to test the fix I'm making for these issues:

- #1807
- #1701

Related to:

- #1840
1 parent e4aae60 · commit 02dd2bf

1,831 files changed (+623,889 / -3 lines)


test/data/README.md

Lines changed: 23 additions & 3 deletions
````diff
@@ -1,9 +1,29 @@
-# Consistent Test Data
+# Test Data
 
-This directory contains consistent graphs that can be used inside tests, the
-graphs in this directory should not change.
+This directory contains data for use inside tests. Ideally the data in this
+directory should be constant and should not change, and in general non-original
+data that is widely known is preferred to original data, as well-known data has
+well-known attributes and qualities that can make it easier to reason about.
 
 
 ## File origins
 
 - `rdfs.ttl`: `http://www.w3.org/2000/01/rdf-schema#`
+
+## Fetcher
+
+Files that originate from the internet should be downloaded using `fetcher.py`
+so we can easily verify the integrity of the files by re-running `fetcher.py`.
+
+```bash
+# run in repo root
+
+# fetch everything
+.venv/bin/python3 test/data/fetcher.py
+
+# only fetch a single file
+.venv/bin/python3 test/data/fetcher.py test/data/rdfs.ttl
+
+# only fetch files below a path
+.venv/bin/python3 test/data/fetcher.py test/data/suites
+```
````
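Re-running the fetcher overwrites the local copies in place, so checking integrity amounts to fetching again and looking for unexpected changes. A minimal sketch of that workflow, assuming a git checkout of the repository (these commands are illustrative and not part of this commit):

```bash
# re-fetch one file from its recorded remote, then see whether it drifted
.venv/bin/python3 test/data/fetcher.py test/data/rdfs.ttl
git diff --stat test/data/rdfs.ttl
```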

test/data/fetcher.py

Lines changed: 309 additions & 0 deletions
```python
#!/usr/bin/env python
import argparse
import enum
import logging
import os
import random
import re
import shutil
import string
import sys
import tarfile
from contextlib import ExitStack, contextmanager
from dataclasses import dataclass, field
from pathlib import Path
from tarfile import TarFile, TarInfo
from tempfile import TemporaryDirectory, mkdtemp
from typing import IO, Generator, List, Pattern, Union
from urllib.request import Request, urlopen
from zipfile import ZipFile, ZipInfo

DATA_PATH = Path(__file__).parent


@dataclass
class Resource:
    remote: Union[str, Request]
    local_path: Path

    def fetch(self, tmp_path: Path) -> None:
        raise NotImplementedError()


@dataclass
class FileResource(Resource):
    def fetch(self, tmp_path: Path) -> None:
        if self.local_path.exists():
            logging.debug("info %s", self.local_path)
            os.remove(self.local_path)

        with ExitStack() as xstack:
            request = (
                self.remote
                if isinstance(self.remote, Request)
                else Request(self.remote)
            )
            response = urlopen(request)
            remote_io: IO[bytes] = xstack.enter_context(response)

            local_io = xstack.enter_context(self.local_path.open("wb+"))
            shutil.copyfileobj(remote_io, local_io)

        logging.info("Downloaded %s to %s", request.full_url, self.local_path)


class ArchiveType(enum.Enum):
    ZIP = "zip"
    TAR_GZ = "tar.gz"


@dataclass
class ArchiveResource(Resource):
    type: ArchiveType
    pattern: Pattern[str]

    def fetch(self, tmp_path: Path) -> None:
        if self.local_path.exists():
            logging.debug("info %s", self.local_path)
            shutil.rmtree(self.local_path)
        with ExitStack() as xstack:
            request = (
                self.remote
                if isinstance(self.remote, Request)
                else Request(self.remote)
            )
            response = urlopen(request)
            remote_io: IO[bytes] = xstack.enter_context(response)
            name = (
                "".join(
                    random.choices(
                        string.ascii_uppercase + string.digits + string.ascii_lowercase,
                        k=10,
                    )
                )
                + f".{self.type.value}"
            )
            tmp_file = tmp_path / name
            logging.info("fetching %s to temp file %s", self.remote, tmp_file)
            with tmp_file.open("wb+") as tmp_io:
                shutil.copyfileobj(remote_io, tmp_io)

            archive_file: Union[ZipFile, TarFile]
            if self.type is ArchiveType.ZIP:
                archive_file = xstack.enter_context(ZipFile(tmp_file))
            elif self.type is ArchiveType.TAR_GZ:
                archive_file = xstack.enter_context(tarfile.open(tmp_file, mode="r:gz"))
                # archive_file = xstack.enter_context(TarFile(tmp_file, mode="r|gz"))
            else:
                raise ValueError(f"invalid type {self.type}")

            for member_info in self._member_list(archive_file):
                member_filename = self._member_filename(member_info)
                if self._member_isdir(member_info):
                    logging.debug("Ignoring directory %s", member_filename)
                    continue

                match = self.pattern.match(member_filename)
                if match is None:
                    logging.debug("Ignoring unmatched %s", member_filename)
                    continue
                groups = match.groups()
                if len(groups) > 0:
                    dest_filename = groups[0]

                member_io: IO[bytes]
                with self._member_io(archive_file, member_info) as member_io:
                    local_file = self.local_path / dest_filename
                    if not local_file.parent.exists():
                        local_file.parent.mkdir(parents=True)
                    logging.debug("writing %s to %s", member_filename, local_file)
                    local_file.write_bytes(member_io.read())

        logging.info(
            "Downloaded %s and extracted files matching %s to %s",
            request.full_url,
            self.pattern,
            self.local_path,
        )

    @classmethod
    def _member_list(
        cls, archive: Union[ZipFile, TarFile]
    ) -> Union[List[ZipInfo], List[TarInfo]]:
        if isinstance(archive, ZipFile):
            return archive.infolist()
        return archive.getmembers()

    @classmethod
    def _member_isdir(cls, member_info: Union[ZipInfo, TarInfo]) -> bool:
        if isinstance(member_info, ZipInfo):
            return member_info.is_dir()
        return member_info.isdir()

    @classmethod
    def _member_filename(cls, member_info: Union[ZipInfo, TarInfo]) -> str:
        if isinstance(member_info, ZipInfo):
            return member_info.filename
        return member_info.name

    @classmethod
    @contextmanager
    def _member_io(
        cls, archive: Union[ZipFile, TarFile], member_info: Union[ZipInfo, TarInfo]
    ) -> Generator[IO[bytes], None, None]:
        if isinstance(archive, ZipFile):
            assert isinstance(member_info, ZipInfo)
            with archive.open(member_info) as member_io:
                yield member_io
        else:
            assert isinstance(member_info, TarInfo)
            opt_io = archive.extractfile(member_info)
            assert opt_io is not None
            yield opt_io


RESOURCES: List[Resource] = [
    ArchiveResource(
        remote="https://github.com/w3c/N3/archive/c44d123c5958ca04117e28ca3769e2c0820f72e6.zip",
        local_path=(DATA_PATH / "suites" / "w3c" / "n3"),
        type=ArchiveType.ZIP,
        pattern=re.compile(r"^[^\/]+[\/]tests[\/](.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/TurtleTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "turtle"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^[^\/]+[\/](.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/N-QuadsTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "nquads"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/N-TriplesTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "ntriples"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/TrigTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "trig"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    # NOTE: Commented out as these files contains local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2013/RDFXMLTests/TESTS.zip",
    #     local_path=(DATA_PATH / "suites" / "w3c" / "rdfxml"),
    #     type=ArchiveType.ZIP,
    #     pattern=re.compile(r"^(.+)$"),
    # ),
    # NOTE: Commented out as this contains local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2009/sparql/docs/tests/sparql11-test-suite-20121023.tar.gz",
    #     local_path=(DATA_PATH / "suites" / "DAWG" / "data-sparql11"),
    #     type=ArchiveType.TAR_GZ,
    #     pattern=re.compile(r"^[^\/]+[\/](.+)$"),
    # ),
    FileResource(
        remote=Request(
            "http://www.w3.org/2000/01/rdf-schema#", headers={"Accept": "text/turtle"}
        ),
        local_path=(DATA_PATH / "rdfs.ttl"),
    ),
]


@dataclass
class Application:
    parser: argparse.ArgumentParser = field(
        default_factory=lambda: argparse.ArgumentParser(add_help=True)
    )

    def __post_init__(self) -> None:
        parser = self.parser
        parser.add_argument(
            "-v",
            "--verbose",
            action="count",
            dest="verbosity",
            help="increase verbosity level",
        )
        parser.add_argument(
            "--keep-tmp",
            action="store_true",
            default=False,
        )
        parser.add_argument("paths", nargs="*", type=str)
        parser.set_defaults(handler=self.handle)

    def run(self, args: List[str]) -> None:
        parse_result = self.parser.parse_args(args)

        verbosity = parse_result.verbosity
        if verbosity is not None:
            root_logger = logging.getLogger("")
            root_logger.propagate = True
            new_level = (
                root_logger.getEffectiveLevel()
                - (min(1, verbosity)) * 10
                - min(max(0, verbosity - 1), 9) * 1
            )
            root_logger.setLevel(new_level)

        logging.debug(
            "args = %s, parse_result = %s, logging.level = %s",
            args,
            parse_result,
            logging.getLogger("").getEffectiveLevel(),
        )

        parse_result.handler(parse_result)

    def handle(self, parse_result: argparse.Namespace) -> None:
        logging.debug("entry ...")

        paths = {Path(path).absolute() for path in parse_result.paths}

        logging.debug("paths = %s", paths)

        if parse_result.keep_tmp:
            tmp_path = Path(mkdtemp())
        else:
            tmp_dir = TemporaryDirectory()
            tmp_path = Path(tmp_dir.name)

        for resource in RESOURCES:
            if paths:
                include = False
                for path in paths:
                    try:
                        resource.local_path.absolute().relative_to(path)
                        include = True
                    except ValueError:
                        # not relative to, ignoring
                        pass
                if not include:
                    logging.info("skipping %s", resource.local_path)
                    continue
            resource.fetch(tmp_path)


def main() -> None:
    logging.basicConfig(
        level=os.environ.get("PYLOGGING_LEVEL", logging.INFO),
        stream=sys.stderr,
        datefmt="%Y-%m-%dT%H:%M:%S",
        format=(
            "%(asctime)s.%(msecs)03d %(process)d %(thread)d %(levelno)03d:%(levelname)-8s "
            "%(name)-12s %(module)s:%(lineno)s:%(funcName)s %(message)s"
        ),
    )

    Application().run(sys.argv[1:])


if __name__ == "__main__":
    main()
```
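New remote data is registered by adding an entry to the `RESOURCES` list above; the fetcher then knows how to (re)download it into `test/data`. A minimal sketch of what such an entry might look like, mirroring the existing `rdfs.ttl` entry (the OWL URL and `owl.ttl` filename here are hypothetical placeholders, not part of this commit):

```python
# Hypothetical additional entry for the RESOURCES list in test/data/fetcher.py.
# The remote URL and local filename are illustrative placeholders only.
FileResource(
    # A Request object is used so an Accept header can ask the server for Turtle,
    # as the rdfs.ttl entry above does.
    remote=Request(
        "http://www.w3.org/2002/07/owl#", headers={"Accept": "text/turtle"}
    ),
    local_path=(DATA_PATH / "owl.ttl"),
),
```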

test/data/suites/w3c/n3/LICENSE.md

Lines changed: 4 additions & 0 deletions
All documents in this Repository are licensed by contributors
under both the [W3C Test Suite License](http://www.w3.org/Consortium/Legal/2008/04-testsuite-license)
and the [W3C Software and Document License](https://www.w3.org/Consortium/Legal/copyright-software).
