Commit b8a6bbe

Merge pull request #4 from sudo-jarvis/create-lexer
Add support for multiple dialects
2 parents 5216541 + 50f3a60 commit b8a6bbe

8 files changed: +792 -130 lines changed


jsonschema_lexer/_lexer.py

Lines changed: 151 additions & 98 deletions
@@ -2,7 +2,10 @@
 Contains the main functionality of the JSONSchemaLexer.
 """
 
-from typing import ClassVar
+from importlib.resources import files
+from pathlib import Path
+from typing import Any, ClassVar
+import json
 
 from pygments.lexers.data import (  # type: ignore[reportMissingTypeStubs]
     JsonLexer,
@@ -24,109 +27,159 @@ class JSONSchemaLexer(JsonLexer):
     ]
 
     data_types: ClassVar[list[str]] = [
-        "object",
-        "integer",
-        "string",
-        "number",
-        "array",
-        "boolean",
-        "null",
-    ]
-    core_keywords: ClassVar[list[str]] = [
-        "$schema",
-        "$id",
-        "$ref",
-        "$defs",
-        "$comment",
-        "$dynamicAnchor",
-        "$dynamicRef",
-        "$anchor",
-        "$vocabulary",
-    ]
-    applicator_keywords: ClassVar[list[str]] = [
-        "oneOf",
-        "allOf",
-        "anyOf",
-        "if",
-        "then",
-        "else",
-        "not",
-        "properties",
-        "patternProperties",
-        "additionalProperties",
-        "dependentSchemas",
-        "propertyNames",
-        "prefixItems",
-        "contains",
-        "items",
-    ]
-    meta_data_keywords: ClassVar[list[str]] = [
-        "title",
-        "description",
-        "default",
-        "deprecated",
-        "examples",
-        "readOnly",
-        "writeOnly",
-    ]
-    validation_keywords: ClassVar[list[str]] = [
-        "type",
-        "enum",
-        "const",
-        "minLength",
-        "maxLength",
-        "pattern",
-        "maximum",
-        "exclusiveMinimum",
-        "multipleOf",
-        "exclusiveMaximum",
-        "minimum",
-        "dependentRequired",
-        "minProperties",
-        "maxProperties",
-        "required",
-        "minItems",
-        "maxItems",
-        "minContains",
-        "maxContains",
-        "uniqueItems",
-    ]
-    other_keywords: ClassVar[list[str]] = [
-        "format",
-        "unevaluatedItems",
-        "unevaluatedProperties",
-        "contentEncoding",
-        "contentMediaType",
-        "contentSchema",
-        "format_assertion",
+        '"object"',
+        '"integer"',
+        '"string"',
+        '"number"',
+        '"array"',
+        '"boolean"',
+        '"null"',
     ]
+    keywords: ClassVar[dict[str | None, list[str]]] = {}
+    identifier: ClassVar[dict[str | None, str]] = {}
+    default_dialect = None
+
+    def __init__(self, default_dialect: str | None = None):
+        super().__init__()  # type: ignore[reportUnknownMemberType]
+        self._populate_keywords_and_identifiers()
+        if default_dialect and default_dialect[0] != '"':
+            default_dialect = '"' + default_dialect
+
+        if default_dialect and default_dialect[-1] != '"':
+            default_dialect = default_dialect + '"'
 
-    parsed_keywords: ClassVar[list[str]] = [
-        '"%s"' % keyword
-        for keyword in (
-            core_keywords
-            + applicator_keywords
-            + meta_data_keywords
-            + validation_keywords
-            + other_keywords
+        self.default_dialect = default_dialect
+
+    def _populate_keywords_and_identifiers(self):
+        dialect_files = files("jsonschema_lexer") / "data" / "keywords"
+        if not dialect_files.is_dir():
+            dialect_files = Path(__file__).parent / "data" / "keywords"
+        for dialect_file in dialect_files.iterdir():
+            with dialect_file.open() as file:
+                json_content = json.load(file)
+                dialect_name = f'"{json_content["dialect"]}"'
+                self.keywords[dialect_name] = json_content["keywords"]
+                self.identifier[dialect_name] = (
+                    f'"{json_content["identifier"]}"'
+                )
+
+    def _find_rightmost_token_index(
+        self,
+        syntax_stack: list[tuple[int, str]],
+        token: str | None,
+    ):
+        return next(
+            (
+                i
+                for i, (_, t) in reversed(list(enumerate(syntax_stack)))
+                if t == token
+            ),
+            None,
         )
-    ]
 
-    parsed_data_types: ClassVar[list[str]] = [
-        '"%s"' % data_type for data_type in data_types
-    ]
+    def _find_key_value_from_json(
+        self,
+        tokens: list[tuple[int, Any, str]],
+        index: int,
+    ):
+        return next(
+            (t[2] for t in tokens[index:] if t[1] is Token.String.Double),
+            None,
+        )
+
+    def _get_nearest_valid_dialect(
+        self,
+        tokens: list[tuple[int, Any, str]],
+        syntax_stack: list[tuple[int, str]],
+        index: int | None = None,
+    ) -> str | None:
+        if not index:
+            index = len(syntax_stack) - 1
+
+        nearest_schema_index = self._find_rightmost_token_index(
+            syntax_stack[: index + 1],
+            '"$schema"',
+        )
+        if nearest_schema_index:
+            dialect = self._find_key_value_from_json(
+                tokens,
+                nearest_schema_index,
+            )
+            identifier = self.identifier.get(dialect, None)
+            is_dialect_valid = bool(
+                identifier or syntax_stack[nearest_schema_index][0] == 0,
+            )
+            nearest_identifier_index = self._find_rightmost_token_index(
+                syntax_stack[: index + 1],
+                identifier,
+            )
+            if (
+                nearest_identifier_index
+                and identifier
+                and syntax_stack[nearest_identifier_index][0]
+                == syntax_stack[nearest_schema_index][0]
+            ) or syntax_stack[nearest_schema_index][0] == 0:
+                return dialect
+            elif is_dialect_valid and nearest_identifier_index:
+                return self._get_nearest_valid_dialect(
+                    tokens,
+                    syntax_stack,
+                    nearest_identifier_index - 1,
+                )
+            elif is_dialect_valid and syntax_stack[-1][1] not in (
+                '"$id"',
+                '"id"',
+            ):
+                return self._get_nearest_valid_dialect(
+                    tokens,
+                    syntax_stack,
+                    nearest_schema_index - 1,
+                )
+
+        if self.default_dialect:
+            return self.default_dialect
+
+        return None
+
+    def _parse_token_tuple(
+        self,
+        token_tuple: tuple[int, Any, str],
+        keywords: list[str],
+    ):
+        start, token, value = token_tuple
+        keywords = ['"%s"' % keyword for keyword in (keywords)]
+        if token is Token.Name.Tag and value in keywords:
+            return start, Token.Keyword, value
+        elif token is Token.String.Double and value in self.data_types:
+            return start, Token.Name.Decorator, value
+        else:
+            return start, token, value
+
+    def map_tokens_by_schema(self, tokens: list[tuple[int, Any, str]]):
+        syntax_stack: list[tuple[int, str]] = []
+        cur_depth = -1
+        for start, token, value in tokens:
+            if value == "{":
+                cur_depth += 1
+
+            syntax_stack.append((cur_depth, value))
+
+            if value == "}":
+                while syntax_stack.pop()[1] != "{":
+                    continue
+                yield self._parse_token_tuple((start, token, value), [])
+            else:
+                dialect = self._get_nearest_valid_dialect(tokens, syntax_stack)
+                yield self._parse_token_tuple(
+                    (start, token, value),
+                    self.keywords.get(dialect, []),
+                )
 
     def get_tokens_unprocessed(self, text: str):  # type: ignore[reportUnknownParameterType]
         """
         Add token classes to it according to JSON Schema.
         """
-        for start, token, value in super().get_tokens_unprocessed(text):  # type: ignore[reportUnknownVariableType]
-            if token is Token.Name.Tag and value in self.parsed_keywords:
-                yield start, Token.Keyword, value
-            elif (
-                token is Token.String.Double
-                and value in self.parsed_data_types
-            ):
-                yield start, Token.Name.Decorator, value
-            else:
-                yield start, token, value
+        json_tokens: list[tuple[int, Any, str]] = list(
+            super().get_tokens_unprocessed(text),  # type: ignore[reportUnknownParameterType]
+        )
+        yield from self.map_tokens_by_schema(json_tokens)
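
For orientation, here is a minimal usage sketch of the reworked lexer (my example, not part of this commit; it assumes the draft-04 dialect file added below ships inside the installed package):

from pygments import highlight
from pygments.formatters import TerminalFormatter

from jsonschema_lexer._lexer import JSONSchemaLexer

schema = """
{
    "$schema": "http://json-schema.org/draft-04/schema#",
    "type": "object",
    "required": ["name"]
}
"""

# A bare URI is fine here: __init__ wraps the dialect in the literal
# double quotes the lexer uses internally to match Pygments string tokens.
lexer = JSONSchemaLexer(
    default_dialect="http://json-schema.org/draft-04/schema#",
)
print(highlight(schema, lexer, TerminalFormatter()))

The default_dialect argument only takes effect when no enclosing "$schema" can be resolved; otherwise map_tokens_by_schema prefers the nearest declared dialect.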
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+{
+  "dialect": "http://json-schema.org/draft-03/schema#",
+  "identifier": "id",
+  "keywords": [
+    "type",
+    "properties",
+    "patternProperties",
+    "additionalProperties",
+    "items",
+    "additionalItems",
+    "required",
+    "dependencies",
+    "minimum",
+    "maximum",
+    "exclusiveMinimum",
+    "exclusiveMaximum",
+    "minItems",
+    "maxItems",
+    "uniqueItems",
+    "pattern",
+    "minLength",
+    "maxLength",
+    "enum",
+    "default",
+    "title",
+    "description",
+    "format",
+    "divisibleBy",
+    "disallow",
+    "extends",
+    "id",
+    "$ref",
+    "$schema"
+  ]
+}
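
As a reading aid (my gloss, not part of the commit): _populate_keywords_and_identifiers above loads files like this one and stores both the dialect URI and its identifier keyword wrapped in literal double quotes, so they compare equal to the raw token values Pygments emits for JSON strings:

from jsonschema_lexer._lexer import JSONSchemaLexer

lexer = JSONSchemaLexer()
# Keys keep their surrounding quotes to match Token.String.Double values.
assert '"http://json-schema.org/draft-03/schema#"' in lexer.keywords
assert lexer.identifier['"http://json-schema.org/draft-03/schema#"'] == '"id"'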
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+{
+  "dialect": "http://json-schema.org/draft-04/schema#",
+  "identifier": "id",
+  "keywords": [
+    "id",
+    "$schema",
+    "title",
+    "description",
+    "default",
+    "multipleOf",
+    "maximum",
+    "exclusiveMaximum",
+    "minimum",
+    "exclusiveMinimum",
+    "maxLength",
+    "minLength",
+    "pattern",
+    "additionalItems",
+    "items",
+    "maxItems",
+    "minItems",
+    "uniqueItems",
+    "maxProperties",
+    "minProperties",
+    "required",
+    "additionalProperties",
+    "definitions",
+    "properties",
+    "patternProperties",
+    "dependencies",
+    "enum",
+    "type",
+    "format",
+    "allOf",
+    "anyOf",
+    "oneOf",
+    "not"
+  ]
+}
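
One subtlety this file encodes: draft-04 still names its identifier keyword "id", whereas draft-06 switches to "$id". A hypothetical nested schema (my illustration, with a made-up URN) shows why _get_nearest_valid_dialect tracks identifiers when picking the governing "$schema":

nested = """
{
    "$schema": "http://json-schema.org/draft-04/schema#",
    "definitions": {
        "inner": {
            "$id": "urn:example:inner",
            "$schema": "http://json-schema.org/draft-06/schema#",
            "propertyNames": {"pattern": "^[a-z]+$"}
        }
    }
}
"""
# "propertyNames" exists in the draft-06 keyword list but not in the
# draft-04 one, so it should highlight as a keyword only inside the
# embedded resource, whose "$id" marks a new schema boundary.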
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+{
+  "dialect": "http://json-schema.org/draft-06/schema#",
+  "identifier": "$id",
+  "keywords": [
+    "$id",
+    "$schema",
+    "$ref",
+    "title",
+    "description",
+    "default",
+    "examples",
+    "multipleOf",
+    "maximum",
+    "exclusiveMaximum",
+    "minimum",
+    "exclusiveMinimum",
+    "maxLength",
+    "minLength",
+    "pattern",
+    "additionalItems",
+    "items",
+    "maxItems",
+    "minItems",
+    "uniqueItems",
+    "contains",
+    "maxProperties",
+    "minProperties",
+    "required",
+    "additionalProperties",
+    "definitions",
+    "properties",
+    "patternProperties",
+    "dependencies",
+    "propertyNames",
+    "const",
+    "enum",
+    "type",
+    "format",
+    "allOf",
+    "anyOf",
+    "oneOf",
+    "not"
+  ]
+}
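
Finally, a quick token-level check of the whole pipeline (again my sketch, not from the diff): keys belonging to the resolved dialect should come back as Token.Keyword, and quoted type names as Token.Name.Decorator, per _parse_token_tuple:

from jsonschema_lexer._lexer import JSONSchemaLexer

lexer = JSONSchemaLexer()
schema = '{"$schema": "http://json-schema.org/draft-06/schema#", "type": "string"}'
for start, token, value in lexer.get_tokens_unprocessed(schema):
    print(start, token, value)
# Expected: '"type"' -> Token.Keyword, '"string"' -> Token.Name.Decorator.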
