@@ -2,7 +2,10 @@
 Contains the main functionality of the JSONSchemaLexer.
 """
 
-from typing import ClassVar
+from importlib.resources import files
+from pathlib import Path
+from typing import Any, ClassVar
+import json
 
 from pygments.lexers.data import (  # type: ignore[reportMissingTypeStubs]
     JsonLexer,
@@ -24,109 +27,159 @@ class JSONSchemaLexer(JsonLexer):
     ]
 
     data_types: ClassVar[list[str]] = [
-        "object",
-        "integer",
-        "string",
-        "number",
-        "array",
-        "boolean",
-        "null",
-    ]
-    core_keywords: ClassVar[list[str]] = [
-        "$schema",
-        "$id",
-        "$ref",
-        "$defs",
-        "$comment",
-        "$dynamicAnchor",
-        "$dynamicRef",
-        "$anchor",
-        "$vocabulary",
-    ]
-    applicator_keywords: ClassVar[list[str]] = [
-        "oneOf",
-        "allOf",
-        "anyOf",
-        "if",
-        "then",
-        "else",
-        "not",
-        "properties",
-        "patternProperties",
-        "additionalProperties",
-        "dependentSchemas",
-        "propertyNames",
-        "prefixItems",
-        "contains",
-        "items",
-    ]
-    meta_data_keywords: ClassVar[list[str]] = [
-        "title",
-        "description",
-        "default",
-        "deprecated",
-        "examples",
-        "readOnly",
-        "writeOnly",
-    ]
-    validation_keywords: ClassVar[list[str]] = [
-        "type",
-        "enum",
-        "const",
-        "minLength",
-        "maxLength",
-        "pattern",
-        "maximum",
-        "exclusiveMinimum",
-        "multipleOf",
-        "exclusiveMaximum",
-        "minimum",
-        "dependentRequired",
-        "minProperties",
-        "maxProperties",
-        "required",
-        "minItems",
-        "maxItems",
-        "minContains",
-        "maxContains",
-        "uniqueItems",
-    ]
-    other_keywords: ClassVar[list[str]] = [
-        "format",
-        "unevaluatedItems",
-        "unevaluatedProperties",
-        "contentEncoding",
-        "contentMediaType",
-        "contentSchema",
-        "format_assertion",
+        '"object"',
+        '"integer"',
+        '"string"',
+        '"number"',
+        '"array"',
+        '"boolean"',
+        '"null"',
     ]
+    keywords: ClassVar[dict[str | None, list[str]]] = {}
+    identifier: ClassVar[dict[str | None, str]] = {}
+    default_dialect = None
+
+    def __init__(self, default_dialect: str | None = None):
+        super().__init__()  # type: ignore[reportUnknownMemberType]
+        self._populate_keywords_and_identifiers()
+        if default_dialect and default_dialect[0] != '"':
+            default_dialect = '"' + default_dialect
+
+        if default_dialect and default_dialect[-1] != '"':
+            default_dialect = default_dialect + '"'
 
-    parsed_keywords: ClassVar[list[str]] = [
-        '"%s"' % keyword
-        for keyword in (
-            core_keywords
-            + applicator_keywords
-            + meta_data_keywords
-            + validation_keywords
-            + other_keywords
+        self.default_dialect = default_dialect
+
+    def _populate_keywords_and_identifiers(self):
+        dialect_files = files("jsonschema_lexer") / "data" / "keywords"
+        if not dialect_files.is_dir():
+            dialect_files = Path(__file__).parent / "data" / "keywords"
+        for dialect_file in dialect_files.iterdir():
+            with dialect_file.open() as file:
+                json_content = json.load(file)
+                dialect_name = f'"{json_content["dialect"]}"'
+                self.keywords[dialect_name] = json_content["keywords"]
+                self.identifier[dialect_name] = (
+                    f'"{json_content["identifier"]}"'
+                )
+
+    def _find_rightmost_token_index(
+        self,
+        syntax_stack: list[tuple[int, str]],
+        token: str | None,
+    ):
+        return next(
+            (
+                i
+                for i, (_, t) in reversed(list(enumerate(syntax_stack)))
+                if t == token
+            ),
+            None,
         )
-    ]
 
-    parsed_data_types: ClassVar[list[str]] = [
-        '"%s"' % data_type for data_type in data_types
-    ]
+    def _find_key_value_from_json(
+        self,
+        tokens: list[tuple[int, Any, str]],
+        index: int,
+    ):
+        return next(
+            (t[2] for t in tokens[index:] if t[1] is Token.String.Double),
+            None,
+        )
+
+    def _get_nearest_valid_dialect(
+        self,
+        tokens: list[tuple[int, Any, str]],
+        syntax_stack: list[tuple[int, str]],
+        index: int | None = None,
+    ) -> str | None:
+        if not index:
+            index = len(syntax_stack) - 1
+
+        nearest_schema_index = self._find_rightmost_token_index(
+            syntax_stack[: index + 1],
+            '"$schema"',
+        )
+        if nearest_schema_index:
+            dialect = self._find_key_value_from_json(
+                tokens,
+                nearest_schema_index,
+            )
+            identifier = self.identifier.get(dialect, None)
+            is_dialect_valid = bool(
+                identifier or syntax_stack[nearest_schema_index][0] == 0,
+            )
+            nearest_identifier_index = self._find_rightmost_token_index(
+                syntax_stack[: index + 1],
+                identifier,
+            )
+            if (
+                nearest_identifier_index
+                and identifier
+                and syntax_stack[nearest_identifier_index][0]
+                == syntax_stack[nearest_schema_index][0]
+            ) or syntax_stack[nearest_schema_index][0] == 0:
+                return dialect
+            elif is_dialect_valid and nearest_identifier_index:
+                return self._get_nearest_valid_dialect(
+                    tokens,
+                    syntax_stack,
+                    nearest_identifier_index - 1,
+                )
+            elif is_dialect_valid and syntax_stack[-1][1] not in (
+                '"$id"',
+                '"id"',
+            ):
+                return self._get_nearest_valid_dialect(
+                    tokens,
+                    syntax_stack,
+                    nearest_schema_index - 1,
+                )
+
+        if self.default_dialect:
+            return self.default_dialect
+
+        return None
+
+    def _parse_token_tuple(
+        self,
+        token_tuple: tuple[int, Any, str],
+        keywords: list[str],
+    ):
+        start, token, value = token_tuple
+        keywords = ['"%s"' % keyword for keyword in keywords]
+        if token is Token.Name.Tag and value in keywords:
+            return start, Token.Keyword, value
+        elif token is Token.String.Double and value in self.data_types:
+            return start, Token.Name.Decorator, value
+        else:
+            return start, token, value
+
+    def map_tokens_by_schema(self, tokens: list[tuple[int, Any, str]]):
+        syntax_stack: list[tuple[int, str]] = []
+        cur_depth = -1
+        for start, token, value in tokens:
+            if value == "{":
+                cur_depth += 1
+
+            syntax_stack.append((cur_depth, value))
+
+            if value == "}":
+                while syntax_stack.pop()[1] != "{":
+                    continue
+                yield self._parse_token_tuple((start, token, value), [])
+            else:
+                dialect = self._get_nearest_valid_dialect(tokens, syntax_stack)
+                yield self._parse_token_tuple(
+                    (start, token, value),
+                    self.keywords.get(dialect, []),
+                )
 
     def get_tokens_unprocessed(self, text: str):  # type: ignore[reportUnknownParameterType]
         """
         Add token classes to it according to JSON Schema.
         """
-        for start, token, value in super().get_tokens_unprocessed(text):  # type: ignore[reportUnknownVariableType]
-            if token is Token.Name.Tag and value in self.parsed_keywords:
-                yield start, Token.Keyword, value
-            elif (
-                token is Token.String.Double
-                and value in self.parsed_data_types
-            ):
-                yield start, Token.Name.Decorator, value
-            else:
-                yield start, token, value
+        json_tokens: list[tuple[int, Any, str]] = list(
+            super().get_tokens_unprocessed(text),  # type: ignore[reportUnknownParameterType]
+        )
+        yield from self.map_tokens_by_schema(json_tokens)
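
A minimal usage sketch (assuming JSONSchemaLexer is importable from the jsonschema_lexer package; the sample schema and the choice of formatter are illustrative). Keywords are highlighted according to the nearest applicable "$schema" in the document, with default_dialect used only as a fallback where no dialect is declared:

    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    from jsonschema_lexer import JSONSchemaLexer  # assumed import path

    schema = """{
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "type": "object",
        "properties": {"name": {"type": "string"}}
    }"""

    # A bare dialect URI is fine here: __init__ wraps it in double quotes
    # so it compares equal to the double-quoted string tokens Pygments emits.
    lexer = JSONSchemaLexer(
        default_dialect="https://json-schema.org/draft/2020-12/schema",
    )
    print(highlight(schema, lexer, TerminalFormatter()))

Each JSON file under data/keywords is expected to provide the three keys that _populate_keywords_and_identifiers reads: "dialect" (the dialect URI), "keywords" (the names to highlight as Token.Keyword), and "identifier" (the keyword, e.g. $id, that scopes embedded schemas for that dialect).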