Skip to content

Commit 4f3fc86

Browse files
committed
[RFC] Clarify and restrict unicode support
Implements graphql/graphql-spec#96
1 parent: de3b371 · commit: 4f3fc86

File tree

2 files changed: 249 additions (+249) and 146 deletions (-146).

2 files changed

+249
-146
lines changed

graphql/core/language/lexer.py

Lines changed: 120 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,8 @@ def get_token_kind_desc(kind):
103103
def char_code_at(s, pos):
104104
if 0 <= pos < len(s):
105105
return ord(s[pos])
106-
return 0
106+
107+
return None
107108

108109

109110
PUNCT_CODE_TO_KIND = {
@@ -122,6 +123,10 @@ def char_code_at(s, pos):
122123
}
123124

124125

126+
def print_char_code(code):
127+
return 'EOF' if code is None else json.dumps(unichr(code))
128+
129+
125130
def read_token(source, from_position):
126131
"""Gets the next token from the source starting at the given position.
127132
@@ -132,30 +137,52 @@ def read_token(source, from_position):
132137
body_length = len(body)
133138

134139
position = position_after_whitespace(body, from_position)
135-
code = char_code_at(body, position)
136140

137141
if position >= body_length:
138142
return Token(TokenKind.EOF, position, position)
139143

144+
code = char_code_at(body, position)
145+
146+
if code < 0x0020 and code not in (0x0009, 0x000A, 0x000D):
147+
raise LanguageError(
148+
source, position,
149+
u'Invalid character {}.'.format(print_char_code(code))
150+
)
151+
140152
kind = PUNCT_CODE_TO_KIND.get(code)
141153
if kind is not None:
142154
return Token(kind, position, position + 1)
143155

144156
if code == 46: # .
145-
if char_code_at(body, position + 1) == 46 and \
146-
char_code_at(body, position + 2) == 46:
157+
if char_code_at(body, position + 1) == char_code_at(body, position + 2) == 46:
147158
return Token(TokenKind.SPREAD, position, position + 3)
159+
148160
elif 65 <= code <= 90 or code == 95 or 97 <= code <= 122:
149161
# A-Z, _, a-z
150162
return read_name(source, position)
163+
151164
elif code == 45 or 48 <= code <= 57: # -, 0-9
152165
return read_number(source, position, code)
166+
153167
elif code == 34: # "
154168
return read_string(source, position)
155169

156170
raise LanguageError(
157171
source, position,
158-
u'Unexpected character {}'.format(json.dumps(body[position])))
172+
u'Unexpected character {}.'.format(print_char_code(code)))
173+
174+
ignored_whitespace_characters = frozenset([
175+
# BOM
176+
0xFEFF,
177+
# White Space
178+
0x0009, # tab
179+
0x0020, # space
180+
# Line Terminator
181+
0x000A, # new line
182+
0x000D, # carriage return
183+
# Comma
184+
0x002C
185+
])
159186

160187

161188
def position_after_whitespace(body, start_position):
@@ -166,20 +193,16 @@ def position_after_whitespace(body, start_position):
166193
position = start_position
167194
while position < body_length:
168195
code = char_code_at(body, position)
169-
if code in (
170-
32, # space
171-
44, # comma
172-
160, # '\xa0'
173-
0x2028, # line separator
174-
0x2029, # paragraph separator
175-
) or (code > 8 and code < 14): # whitespace
196+
if code in ignored_whitespace_characters:
176197
position += 1
198+
177199
elif code == 35: # #, skip comments
178200
position += 1
179201
while position < body_length:
180202
code = char_code_at(body, position)
181-
if not code or code in (10, 13, 0x2028, 0x2029):
203+
if not (code is not None and (code > 0x001F or code == 0x0009) and code not in (0x000A, 0x000D)):
182204
break
205+
183206
position += 1
184207
else:
185208
break
@@ -204,43 +227,34 @@ def read_number(source, start, first_code):
204227
if code == 48: # 0
205228
position += 1
206229
code = char_code_at(body, position)
207-
elif 49 <= code <= 57: # 1 - 9
208-
position += 1
209-
code = char_code_at(body, position)
210-
while 48 <= code <= 57: # 0 - 9
211-
position += 1
212-
code = char_code_at(body, position)
230+
231+
if code is not None and 48 <= code <= 57:
232+
raise LanguageError(
233+
source,
234+
position,
235+
u'Invalid number, unexpected digit after 0: {}.'.format(print_char_code(code))
236+
)
213237
else:
214-
raise LanguageError(source, position, 'Invalid number')
238+
position = read_digits(source, position, code)
239+
code = char_code_at(body, position)
215240

216241
if code == 46: # .
217242
is_float = True
218243

219244
position += 1
220245
code = char_code_at(body, position)
221-
if 48 <= code <= 57: # 0 - 9
222-
position += 1
223-
code = char_code_at(body, position)
224-
while 48 <= code <= 57: # 0 - 9
225-
position += 1
226-
code = char_code_at(body, position)
227-
else:
228-
raise LanguageError(source, position, 'Invalid number')
246+
position = read_digits(source, position, code)
247+
code = char_code_at(body, position)
229248

230-
if code == 101: # e
249+
if code in (69, 101): # E e
250+
is_float = True
251+
position += 1
252+
code = char_code_at(body, position)
253+
if code in (43, 45): # + -
231254
position += 1
232255
code = char_code_at(body, position)
233-
if code == 45: # -
234-
position += 1
235-
code = char_code_at(body, position)
236-
if 48 <= code <= 57: # 0 - 9
237-
position += 1
238-
code = char_code_at(body, position)
239-
while 48 <= code <= 57: # 0 - 9
240-
position += 1
241-
code = char_code_at(body, position)
242-
else:
243-
raise LanguageError(source, position, 'Invalid number')
256+
257+
position = read_digits(source, position, code)
244258

245259
return Token(
246260
TokenKind.FLOAT if is_float else TokenKind.INT,
@@ -250,6 +264,28 @@ def read_number(source, start, first_code):
250264
)
251265

252266

267+
def read_digits(source, start, first_code):
268+
body = source.body
269+
position = start
270+
code = first_code
271+
272+
if code is not None and 48 <= code <= 57: # 0 - 9
273+
while True:
274+
position += 1
275+
code = char_code_at(body, position)
276+
277+
if not (code is not None and 48 <= code <= 57):
278+
break
279+
280+
return position
281+
282+
raise LanguageError(
283+
source,
284+
position,
285+
u'Invalid number, expected digit but got: {}.'.format(print_char_code(code))
286+
)
287+
288+
253289
ESCAPED_CHAR_CODES = {
254290
34: '"',
255291
47: '/',
@@ -268,47 +304,73 @@ def read_string(source, start):
268304
"([^"\\\u000A\u000D\u2028\u2029]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
269305
"""
270306
body = source.body
307+
body_length = len(body)
308+
271309
position = start + 1
272310
chunk_start = position
273-
code = None
274-
value = u''
311+
code = 0
312+
value = []
313+
append = value.append
275314

276-
while position < len(body):
315+
while position < body_length:
277316
code = char_code_at(body, position)
278-
if not code or code in (34, 10, 13, 0x2028, 0x2029):
317+
if not (
318+
code is not None and
319+
code not in (
320+
# LineTerminator
321+
0x000A, 0x000D,
322+
# Quote
323+
34
324+
)
325+
):
279326
break
327+
328+
if code < 0x0020 and code != 0x0009:
329+
raise LanguageError(
330+
source,
331+
position,
332+
u'Invalid character within String: {}.'.format(print_char_code(code))
333+
)
334+
280335
position += 1
281336
if code == 92: # \
282-
value += body[chunk_start:position - 1]
337+
append(body[chunk_start:position - 1])
338+
283339
code = char_code_at(body, position)
284340
escaped = ESCAPED_CHAR_CODES.get(code)
285341
if escaped is not None:
286-
value += escaped
287-
elif code == 117:
342+
append(escaped)
343+
344+
elif code == 117: # u
288345
char_code = uni_char_code(
289346
char_code_at(body, position + 1) or 0,
290347
char_code_at(body, position + 2) or 0,
291348
char_code_at(body, position + 3) or 0,
292349
char_code_at(body, position + 4) or 0,
293350
)
351+
294352
if char_code < 0:
295353
raise LanguageError(
296354
source, position,
297-
'Bad character escape sequence')
298-
value += unichr(char_code)
355+
u'Invalid character escape sequence: \\u{}.'.format(body[position + 1: position + 5])
356+
)
357+
358+
append(unichr(char_code))
299359
position += 4
300360
else:
301361
raise LanguageError(
302362
source, position,
303-
'Bad character escape sequence')
363+
u'Invalid character escape sequence: \\{}.'.format(unichr(code))
364+
)
365+
304366
position += 1
305367
chunk_start = position
306368

307-
if code != 34:
369+
if code != 34: # Quote (")
308370
raise LanguageError(source, position, 'Unterminated string')
309371

310-
value += body[chunk_start:position]
311-
return Token(TokenKind.STRING, start, position + 1, value)
372+
append(body[chunk_start:position])
373+
return Token(TokenKind.STRING, start, position + 1, u''.join(value))
312374

313375

314376
def uni_char_code(a, b, c, d):
@@ -348,15 +410,17 @@ def read_name(source, position):
348410
body = source.body
349411
body_length = len(body)
350412
end = position + 1
351-
code = None
413+
352414
while end != body_length:
353415
code = char_code_at(body, end)
354-
if not code or not (
416+
if not (code is not None and (
355417
code == 95 or # _
356418
48 <= code <= 57 or # 0-9
357419
65 <= code <= 90 or # A-Z
358420
97 <= code <= 122 # a-z
359-
):
421+
)):
360422
break
423+
361424
end += 1
425+
362426
return Token(TokenKind.NAME, position, end, body[position:end])

0 commit comments

Comments
 (0)