Skip to content

Commit 16a6062

Browse files
authored
Merge pull request #363 from python-jsonschema/improve-file-handling
Improve file handling with lazy reads
2 parents bb2be1c + 03d433b commit 16a6062

File tree

8 files changed

+167
-46
lines changed

8 files changed

+167
-46
lines changed

src/check_jsonschema/cli/main_command.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
SchemaLoaderBase,
2121
)
2222
from ..transforms import TRANSFORM_LIBRARY
23-
from .param_types import CommaDelimitedList, ValidatorClassName
23+
from .param_types import CommaDelimitedList, LazyBinaryReadFile, ValidatorClassName
2424
from .parse_result import ParseResult, SchemaLoadingMode
2525

2626
BUILTIN_SCHEMA_NAMES = [f"vendor.{k}" for k in SCHEMA_CATALOG.keys()] + [
@@ -220,7 +220,9 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
220220
help="Reduce output verbosity",
221221
count=True,
222222
)
223-
@click.argument("instancefiles", required=True, nargs=-1, type=click.File("rb"))
223+
@click.argument(
224+
"instancefiles", required=True, nargs=-1, type=LazyBinaryReadFile("rb", lazy=True)
225+
)
224226
def main(
225227
*,
226228
schemafile: str | None,

src/check_jsonschema/cli/param_types.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
from __future__ import annotations
22

33
import importlib
4+
import os
45
import re
6+
import stat
57
import typing as t
68

79
import click
810
import jsonschema
11+
from click._compat import open_stream
912

1013

1114
class CommaDelimitedList(click.ParamType):
@@ -104,3 +107,50 @@ def convert(
104107
self.fail(f"'{classname}' in '{pkg}' is not a class", param, ctx)
105108

106109
return t.cast(t.Type[jsonschema.protocols.Validator], result)
110+
111+
112+
class CustomLazyFile(click.utils.LazyFile):
    """A ``click.utils.LazyFile`` whose eager "can we read it?" probe skips FIFOs.

    This initializer fully replaces the base-class one (``super().__init__`` is
    deliberately not called) and sets the same attributes itself: ``name``,
    ``mode``, ``encoding``, ``errors``, ``atomic``, ``_f`` and ``should_close``.

    * ``"-"`` is resolved immediately through click's ``open_stream``.
    * Any other path opened in a read mode is opened and closed once up front so
      that some errors surface early -- unless ``os.stat`` reports a FIFO.
      NOTE(review): presumably the FIFO is skipped because opening one for
      reading would block until a writer attaches -- confirm.
    """

    def __init__(
        self,
        filename: str | os.PathLike[str],
        mode: str = "r",
        encoding: str | None = None,
        errors: str | None = "strict",
        atomic: bool = False,
    ):
        self.name: str = os.fspath(filename)
        self.mode = mode
        self.encoding = encoding
        self.errors = errors
        self.atomic = atomic
        self._f: t.IO[t.Any] | None
        self.should_close: bool

        # "-" is handed straight to click, which returns the stream together
        # with a flag saying whether we are responsible for closing it.
        if self.name == "-":
            stream, needs_close = open_stream(filename, mode, encoding, errors)
            self._f = stream
            self.should_close = needs_close
            return

        if "r" in mode:
            is_fifo = stat.S_ISFIFO(os.stat(filename).st_mode)
            if not is_fifo:
                # Open and immediately close the file so that at least some
                # errors are caught early rather than at first read.
                open(filename, mode).close()

        # The real open is deferred until the file is actually used.
        self._f = None
        self.should_close = True
139+
140+
141+
class LazyBinaryReadFile(click.File):
    """A ``click.File`` parameter type that yields lazily-opened binary readers."""

    def convert(
        self,
        value: str | os.PathLike[str] | t.IO[t.Any],
        param: click.Parameter | None,
        ctx: click.Context | None,
    ) -> t.IO[bytes]:
        """Convert a CLI value into a binary-readable file object.

        Objects that already look like files (they have ``read`` or ``write``)
        are returned unchanged. Anything else is treated as a path and wrapped
        in a ``CustomLazyFile`` opened with mode ``"rb"``. When a click context
        is supplied, the lazy file's ``close_intelligently`` is registered to
        run when that context closes.
        """
        already_file_like = hasattr(value, "read") or hasattr(value, "write")
        if already_file_like:
            return t.cast(t.IO[bytes], value)

        path: str | os.PathLike[str] = t.cast("str | os.PathLike[str]", value)
        lazy_file = CustomLazyFile(path, mode="rb")

        if ctx is not None:
            ctx.call_on_close(lazy_file.close_intelligently)

        return t.cast(t.IO[bytes], lazy_file)

src/check_jsonschema/instance_loader.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,16 @@
33
import io
44
import typing as t
55

6+
from check_jsonschema.cli.param_types import CustomLazyFile
7+
68
from .parsers import ParseError, ParserSet
79
from .transforms import Transform
810

911

1012
class InstanceLoader:
1113
def __init__(
1214
self,
13-
files: t.Sequence[t.BinaryIO],
15+
files: t.Sequence[t.BinaryIO | CustomLazyFile],
1416
default_filetype: str = "json",
1517
data_transform: Transform | None = None,
1618
) -> None:
@@ -35,12 +37,21 @@ def iter_files(self) -> t.Iterator[tuple[str, ParseError | t.Any]]:
3537
name = "<stdin>"
3638
else:
3739
raise ValueError(f"File {file} has no name attribute")
40+
3841
try:
39-
data: t.Any = self._parsers.parse_data_with_path(
40-
file, name, self._default_filetype
41-
)
42-
except ParseError as err:
43-
data = err
44-
else:
45-
data = self._data_transform(data)
42+
if isinstance(file, CustomLazyFile):
43+
stream: t.BinaryIO = t.cast(t.BinaryIO, file.open())
44+
else:
45+
stream = file
46+
47+
try:
48+
data: t.Any = self._parsers.parse_data_with_path(
49+
stream, name, self._default_filetype
50+
)
51+
except ParseError as err:
52+
data = err
53+
else:
54+
data = self._data_transform(data)
55+
finally:
56+
file.close()
4657
yield (name, data)

src/check_jsonschema/schema_loader/readers.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@
1515
yaml = ruamel.yaml.YAML(typ="safe")
1616

1717

18+
class _UnsetType:
19+
pass
20+
21+
22+
_UNSET = _UnsetType()
23+
24+
1825
def _run_load_callback(schema_location: str, callback: t.Callable) -> dict:
1926
try:
2027
schema = callback()
@@ -31,6 +38,7 @@ def __init__(self, filename: str) -> None:
3138
self.path = filename2path(filename)
3239
self.filename = str(self.path)
3340
self.parsers = ParserSet()
41+
self._parsed_schema: dict | _UnsetType = _UNSET
3442

3543
def get_retrieval_uri(self) -> str | None:
3644
return self.path.as_uri()
@@ -39,21 +47,26 @@ def _read_impl(self) -> t.Any:
3947
return self.parsers.parse_file(self.path, default_filetype="json")
4048

4149
def read_schema(self) -> dict:
42-
return _run_load_callback(self.filename, self._read_impl)
50+
if self._parsed_schema is _UNSET:
51+
self._parsed_schema = _run_load_callback(self.filename, self._read_impl)
52+
return t.cast(dict, self._parsed_schema)
4353

4454

4555
class StdinSchemaReader:
4656
def __init__(self) -> None:
4757
self.parsers = ParserSet()
58+
self._parsed_schema: dict | _UnsetType = _UNSET
4859

4960
def get_retrieval_uri(self) -> str | None:
5061
return None
5162

5263
def read_schema(self) -> dict:
53-
try:
54-
return json.load(sys.stdin)
55-
except ValueError as e:
56-
raise ParseError("Failed to parse JSON from stdin") from e
64+
if self._parsed_schema is _UNSET:
65+
try:
66+
self._parsed_schema = json.load(sys.stdin)
67+
except ValueError as e:
68+
raise ParseError("Failed to parse JSON from stdin") from e
69+
return t.cast(dict, self._parsed_schema)
5770

5871

5972
class HttpSchemaReader:
@@ -71,14 +84,12 @@ def __init__(
7184
disable_cache=disable_cache,
7285
validation_callback=self._parse,
7386
)
74-
self._parsed_schema: t.Any | None = None
87+
self._parsed_schema: dict | _UnsetType = _UNSET
7588

7689
def _parse(self, schema_bytes: bytes) -> t.Any:
77-
if self._parsed_schema is None:
78-
self._parsed_schema = self.parsers.parse_data_with_path(
79-
io.BytesIO(schema_bytes), self.url, default_filetype="json"
80-
)
81-
return self._parsed_schema
90+
return self.parsers.parse_data_with_path(
91+
io.BytesIO(schema_bytes), self.url, default_filetype="json"
92+
)
8293

8394
def get_retrieval_uri(self) -> str | None:
8495
return self.url
@@ -88,4 +99,6 @@ def _read_impl(self) -> t.Any:
8899
return self._parse(fp.read())
89100

90101
def read_schema(self) -> dict:
91-
return _run_load_callback(self.url, self._read_impl)
102+
if self._parsed_schema is _UNSET:
103+
self._parsed_schema = _run_load_callback(self.url, self._read_impl)
104+
return t.cast(dict, self._parsed_schema)

tests/acceptance/test_special_filetypes.py

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1+
import multiprocessing
12
import os
23
import platform
34
import sys
4-
import threading
55

66
import pytest
77
import responses
@@ -33,6 +33,16 @@ def test_schema_and_instance_in_memfds(run_line_simple):
3333
os.close(instancefd)
3434

3535

36+
# helper (in global scope) for multiprocessing "spawn" to be able to use to launch
37+
# background writers
38+
def _fifo_write(path, data):
39+
fd = os.open(path, os.O_WRONLY)
40+
try:
41+
os.write(fd, data)
42+
finally:
43+
os.close(fd)
44+
45+
3646
@pytest.mark.skipif(os.name != "posix", reason="test requires mkfifo")
3747
@pytest.mark.parametrize("check_succeeds", (True, False))
3848
def test_schema_and_instance_in_fifos(tmp_path, run_line, check_succeeds):
@@ -45,25 +55,17 @@ def test_schema_and_instance_in_fifos(tmp_path, run_line, check_succeeds):
4555
os.mkfifo(schema_path)
4656
os.mkfifo(instance_path)
4757

48-
# execute FIFO writes as blocking writes in background threads
49-
# nonblocking writes fail file existence if there's no reader, so using a FIFO
50-
# requires some level of concurrency
51-
def fifo_write(path, data):
52-
fd = os.open(path, os.O_WRONLY)
53-
try:
54-
os.write(fd, data)
55-
finally:
56-
os.close(fd)
57-
58-
schema_thread = threading.Thread(
59-
target=fifo_write, args=[schema_path, b'{"type": "integer"}']
58+
spawn_ctx = multiprocessing.get_context("spawn")
59+
60+
schema_proc = spawn_ctx.Process(
61+
target=_fifo_write, args=(schema_path, b'{"type": "integer"}')
6062
)
61-
schema_thread.start()
63+
schema_proc.start()
6264
instance_data = b"42" if check_succeeds else b'"foo"'
63-
instance_thread = threading.Thread(
64-
target=fifo_write, args=[instance_path, instance_data]
65+
instance_proc = spawn_ctx.Process(
66+
target=_fifo_write, args=(instance_path, instance_data)
6567
)
66-
instance_thread.start()
68+
instance_proc.start()
6769

6870
try:
6971
result = run_line(
@@ -74,8 +76,8 @@ def fifo_write(path, data):
7476
else:
7577
assert result.exit_code == 1
7678
finally:
77-
schema_thread.join(timeout=0.1)
78-
instance_thread.join(timeout=0.1)
79+
schema_proc.terminate()
80+
instance_proc.terminate()
7981

8082

8183
@pytest.mark.parametrize("check_passes", (True, False))

tests/unit/test_cli_parse.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import io
43
from unittest import mock
54

65
import click
@@ -86,7 +85,7 @@ def test_schemafile_and_instancefile(runner, mock_parse_result, in_tmp_dir, tmp_
8685
assert mock_parse_result.schema_path == "schema.json"
8786
assert isinstance(mock_parse_result.instancefiles, tuple)
8887
for f in mock_parse_result.instancefiles:
89-
assert isinstance(f, (io.BytesIO, io.BufferedReader))
88+
assert isinstance(f, click.utils.LazyFile)
9089
assert tuple(f.name for f in mock_parse_result.instancefiles) == ("foo.json",)
9190

9291

tests/unit/test_lazy_file_handling.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import os
2+
import platform
3+
4+
import pytest
5+
from click.testing import CliRunner
6+
7+
from check_jsonschema.cli.main_command import build_checker
8+
from check_jsonschema.cli.main_command import main as cli_main
9+
10+
11+
@pytest.fixture
def runner() -> CliRunner:
    """Provide a ``CliRunner`` constructed with ``mix_stderr=False``."""
    cli_runner = CliRunner(mix_stderr=False)
    return cli_runner
14+
15+
16+
@pytest.mark.skipif(
    platform.system() != "Linux", reason="test requires /proc/self/ mechanism"
)
def test_open_file_usage_never_exceeds_1000(runner, monkeypatch, tmp_path):
    """Check that passing 2000 instance files never holds 2000 descriptors open.

    The CLI is invoked with ``execute`` patched so that the checker is only
    built, not run. The number of entries in ``/proc/self/fd`` is then checked
    once after argument parsing and again on every step of iterating the
    instance loader.
    """
    schema_path = tmp_path / "schema.json"
    schema_path.write_text("{}")

    args = ["--schemafile", str(schema_path)]

    for i in range(2000):
        instance_path = tmp_path / f"file{i}.json"
        instance_path.write_text("{}")
        args.append(str(instance_path))

    checker = None

    # Capture the checker instead of executing it, so that the instance files
    # can be iterated manually below while counting open descriptors.
    def fake_execute(argv):
        nonlocal checker
        checker = build_checker(argv)

    monkeypatch.setattr("check_jsonschema.cli.main_command.execute", fake_execute)
    res = runner.invoke(cli_main, args)
    assert res.exit_code == 0, res.stderr

    assert checker is not None
    assert len(os.listdir("/proc/self/fd")) < 2000
    for _fname, _data in checker._instance_loader.iter_files():
        # This was previously `assert len(...), 2000`, which treats 2000 as the
        # assertion message and only checks that the list is non-empty.
        assert len(os.listdir("/proc/self/fd")) < 2000

tox.ini

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,10 @@ commands = coverage report --skip-covered
4646

4747
[testenv:mypy]
4848
description = "check type annotations with mypy"
49-
# temporarily pin back click until either click 8.1.5 releases or mypy fixes the issue
50-
# with referential integrity of type aliases
5149
deps = mypy
5250
types-jsonschema
5351
types-requests
54-
click==8.1.3
52+
click
5553
commands = mypy src/ {posargs}
5654

5755
[testenv:pyright]

0 commit comments

Comments
 (0)