Skip to content

Commit a32fb28

Browse files
committed
RFC: Support full Unicode in lexer
Replicates graphql/graphql-js@8ca3d89
1 parent 0fa2c49 commit a32fb28

File tree

6 files changed

+463
-36
lines changed

6 files changed

+463
-36
lines changed

src/graphql/language/lexer.py

Lines changed: 93 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,16 @@ def print_code_point_at(self, location: int) -> str:
6767
if location >= len(body):
6868
return TokenKind.EOF.value
6969
char = body[location]
70-
code = ord(char)
7170
# Printable ASCII
72-
if 0x20 <= code <= 0x7E:
71+
if "\x20" <= char <= "\x7E":
7372
return "'\"'" if char == '"' else f"'{char}'"
7473
# Unicode code point
75-
return f"U+{code:04X}"
74+
point = (
75+
decode_surrogate_pair(ord(char), ord(body[location + 1]))
76+
if is_supplementary_code_point(body, location)
77+
else ord(char)
78+
)
79+
return f"U+{point:04X}"
7680

7781
def create_token(
7882
self, kind: TokenKind, start: int, end: int, value: Optional[str] = None
@@ -141,7 +145,8 @@ def read_next_token(self, start: int) -> Token:
141145
if char == "'"
142146
else (
143147
f"Unexpected character: {self.print_code_point_at(position)}."
144-
if is_source_character(char)
148+
if is_unicode_scalar_value(char)
149+
or is_supplementary_code_point(body, position)
145150
else f"Invalid character: {self.print_code_point_at(position)}."
146151
)
147152
)
@@ -158,10 +163,14 @@ def read_comment(self, start: int) -> Token:
158163
position = start + 1
159164
while position < body_length:
160165
char = body[position]
161-
162-
if char in "\r\n" or not is_source_character(char):
166+
if char in "\r\n":
163167
break
164-
position += 1
168+
if is_unicode_scalar_value(char):
169+
position += 1
170+
elif is_supplementary_code_point(body, position):
171+
position += 2
172+
else:
173+
break # pragma: no cover
165174

166175
return self.create_token(
167176
TokenKind.COMMENT,
@@ -270,7 +279,11 @@ def read_string(self, start: int) -> Token:
270279
if char == "\\":
271280
append(body[chunk_start:position])
272281
escape = (
273-
self.read_escaped_unicode(position)
282+
(
283+
self.read_escaped_unicode_variable_width(position)
284+
if body[position + 2 : position + 3] == "{"
285+
else self.read_escaped_unicode_fixed_width(position)
286+
)
274287
if body[position + 1 : position + 2] == "u"
275288
else self.read_escaped_character(position)
276289
)
@@ -282,8 +295,10 @@ def read_string(self, start: int) -> Token:
282295
if char in "\r\n":
283296
break
284297

285-
if is_source_character(char):
298+
if is_unicode_scalar_value(char):
286299
position += 1
300+
elif is_supplementary_code_point(body, position):
301+
position += 2
287302
else:
288303
raise GraphQLSyntaxError(
289304
self.source,
@@ -294,11 +309,50 @@ def read_string(self, start: int) -> Token:
294309

295310
raise GraphQLSyntaxError(self.source, position, "Unterminated string.")
296311

297-
def read_escaped_unicode(self, position: int) -> EscapeSequence:
312+
def read_escaped_unicode_variable_width(self, position: int) -> EscapeSequence:
    """Read a braced (variable-width) Unicode escape starting at ``position``.

    ``position`` is the index of the backslash that opens the escape. Hex digits
    are accumulated until the closing brace is found. On success an
    ``EscapeSequence`` holding the decoded character and the number of source
    characters consumed is returned.

    Raises GraphQLSyntaxError if the escape is empty, unterminated within twelve
    characters, contains a non-hex digit, or does not encode a Unicode scalar
    value.
    """
    body = self.source.body
    # The whole escape cannot be larger than 12 chars (\u{00000000}).
    limit = min(12, len(body) - position)
    code_point = 0
    # Backslash, "u" and the opening brace are accounted for up front.
    consumed = 3
    for offset in range(3, limit):
        char = body[position + offset]
        consumed = offset + 1
        if char == "}":
            # Must be at least 5 chars (\u{0}) and encode a Unicode scalar value.
            is_scalar_value = (
                0 <= code_point <= 0xD7FF or 0xE000 <= code_point <= 0x10FFFF
            )
            if consumed >= 5 and is_scalar_value:
                return EscapeSequence(chr(code_point), consumed)
            break
        # Shift in the next hex digit; an invalid digit yields a negative
        # value, which makes the accumulated code point negative as well.
        code_point = (code_point << 4) | read_hex_digit(char)
        if code_point < 0:
            break

    sequence = body[position : position + consumed]
    raise GraphQLSyntaxError(
        self.source,
        position,
        f"Invalid Unicode escape sequence: '{sequence}'.",
    )
338+
339+
def read_escaped_unicode_fixed_width(self, position: int) -> EscapeSequence:
298340
body = self.source.body
299341
code = read_16_bit_hex_code(body, position + 2)
300-
if code >= 0:
342+
343+
if 0 <= code <= 0xD7FF or 0xE000 <= code <= 0x10FFFF:
301344
return EscapeSequence(chr(code), 6)
345+
346+
# GraphQL allows JSON-style surrogate pair escape sequences, but only when
347+
# a valid pair is formed.
348+
if 0xD800 <= code <= 0xDBFF:
349+
if body[position + 6 : position + 8] == "\\u":
350+
trailing_code = read_16_bit_hex_code(body, position + 8)
351+
if 0xDC00 <= trailing_code <= 0xDFFF:
352+
return EscapeSequence(
353+
chr(decode_surrogate_pair(code, trailing_code)), 12
354+
)
355+
302356
raise GraphQLSyntaxError(
303357
self.source,
304358
position,
@@ -351,8 +405,10 @@ def read_block_string(self, start: int) -> Token:
351405
self.line_start = position
352406
continue
353407

354-
if is_source_character(char):
408+
if is_unicode_scalar_value(char):
355409
position += 1
410+
elif is_supplementary_code_point(body, position):
411+
position += 2
356412
else:
357413
raise GraphQLSyntaxError(
358414
self.source,
@@ -477,9 +533,31 @@ def read_hex_digit(char: str) -> int:
477533
return -1
478534

479535

480-
def is_source_character(char: str) -> bool:
481-
"""Check whether this is a SourceCharacter"""
482-
return char >= " " or char in "\t\r\n"
536+
def is_unicode_scalar_value(char: str) -> bool:
    """Tell whether the given character is a Unicode scalar value.

    Unicode scalar values are all Unicode code points other than the surrogate
    code points, i.e. the inclusive ranges 0x0000 to 0xD7FF and 0xE000 to
    0x10FFFF.
    """
    # Everything below the surrogate block qualifies right away.
    if "\x00" <= char <= "\ud7ff":
        return True
    # Otherwise the character must lie above the surrogate block.
    return "\ue000" <= char <= "\U0010ffff"
544+
545+
546+
def is_supplementary_code_point(body: str, location: int) -> bool:
    """Check whether a surrogate pair starts at the given location.

    The GraphQL specification defines source text as a sequence of Unicode scalar
    values (which Unicode defines to exclude surrogate code points). A Python
    string can still carry surrogate code points, so this checks for a leading
    surrogate (U+D800 to U+DBFF) at ``location`` that is directly followed by a
    trailing surrogate (U+DC00 to U+DFFF).

    Returns False when ``location`` or ``location + 1`` lies past the end of
    ``body`` (for example a lone leading surrogate as the very last character),
    so that callers can use this as a plain predicate without bounds checks.
    """
    try:
        return (
            "\ud800" <= body[location] <= "\udbff"
            and "\udc00" <= body[location + 1] <= "\udfff"
        )
    except IndexError:
        # There is no character (or no following character) to form a pair.
        return False
557+
558+
559+
def decode_surrogate_pair(leading: int, trailing: int) -> int:
    """Combine the code units of a surrogate pair into a single code point.

    The low ten bits of ``leading`` become the high ten bits of the offset and
    the low ten bits of ``trailing`` become its low ten bits; the offset is then
    added to 0x10000.
    """
    high_bits = (leading & 0x03FF) << 10
    low_bits = trailing & 0x03FF
    return 0x10000 + (high_bits | low_bits)
483561

484562

485563
def is_name_start(char: str) -> bool:

src/graphql/language/print_string.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
__all__ = ["print_string"]
2+
3+
4+
def print_string(s: str) -> str:
    """Print a string as a GraphQL StringValue literal.

    Replaces control characters and excluded characters (" U+0022 and \\ U+005C)
    with escape sequences and wraps the result in double quotes.
    """
    return f'"{s.translate(escape_sequences)}"'


# Translation table for ``str.translate``: code point -> escape sequence.
# Control characters U+0000 to U+001F and U+007F to U+009F use the fixed-width
# \uXXXX form (uppercase hex), except for those that have a short escape.
# The double quote and the backslash are escaped with a backslash.
escape_sequences = {
    **{code: f"\\u{code:04X}" for code in range(0x00, 0x20)},
    0x08: "\\b",
    0x09: "\\t",
    0x0A: "\\n",
    0x0C: "\\f",
    0x0D: "\\r",
    0x22: '\\"',
    0x5C: "\\\\",
    **{code: f"\\u{code:04X}" for code in range(0x7F, 0xA0)},
}

src/graphql/language/printer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
from json import dumps
21
from typing import Any, Collection, Optional
32

43
from ..language.ast import Node, OperationType
5-
from .visitor import visit, Visitor
64
from .block_string import print_block_string
5+
from .print_string import print_string
6+
from .visitor import visit, Visitor
77

88
__all__ = ["print_ast"]
99

@@ -148,7 +148,7 @@ def leave_float_value(node: PrintedNode, *_args: Any) -> str:
148148
def leave_string_value(node: PrintedNode, *_args: Any) -> str:
149149
if node.block:
150150
return print_block_string(node.value)
151-
return dumps(node.value)
151+
return print_string(node.value)
152152

153153
@staticmethod
154154
def leave_boolean_value(node: PrintedNode, *_args: Any) -> str:

0 commit comments

Comments
 (0)