Skip to content

Commit e7848a0

Browse files
committed
better unicode support
Input strings now support full-range fixed-length escaped unicode chars and variable-length escaped unicode chars, along with UTF-8 encoding. Output strings also support full-range fixed-length escaped unicode chars for JSON strings. Full-range unicode means a surrogate pair will be used if necessary. fixes #50
1 parent a8af6b7 commit e7848a0

File tree

6 files changed

+918
-431
lines changed

6 files changed

+918
-431
lines changed

docs/toc.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
- [Overview](#overview)
44
- [Graphql core](#graphql-core)
55
- [Security features](#security-features)
6+
- [Unicode support](#unicode-support)
67

78
- [Tutorial](tutorial.md)
89
- [Important notes](tutorial.md#important-notes)
@@ -83,3 +84,18 @@ can bring down the service, both lexer and parser are configurable to mitigate t
8384
- `maxDefinitions`. Queries, mutations, subscriptions, and fragments total number should be reasonable. (default = 512)
8485
- `maxChoices`. Unions and directive's locations are limited by this number. (default = 64)
8586

87+
### Unicode support
88+
89+
- Input string:
90+
- The accepted encoding for input strings is UTF-8.
91+
- Escaped unicode in a quoted string takes the form of UTF-16 BE:
92+
- Fixed 4 digit hex: e.g. `\u000A`
93+
- Variable length: `\u{1F4A9}` with range (>= 0x0000 and <= 0xD7FF or >= 0xE000 and <= 0x10FFFF)
94+
- Escape sequences are only meaningful within a single-quoted string.
95+
In a multiline string, unicode chars must be encoded using UTF-8.
96+
- Surrogate pair: `"\uD83D\uDCA9"` is equal to `"\u{1F4A9}"`
97+
98+
- Output string:
99+
- Output strings are subject to the output serialization format's specification.
100+
- For example, output using JSON as the serialization format will result in a UTF-8 encoded string.
101+
- Or, if the escape flag is set, it will use fixed-length UTF-16 BE 4-digit hex escapes, similar to GraphQL escape sequences.

graphql/builtin/json_respstream.nim

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99

1010
import
1111
faststreams/[outputs, textio],
12-
../common/[respstream, ast]
12+
../common/[respstream, ast],
13+
../private/utf
1314

1415
export respstream
1516

@@ -25,6 +26,7 @@ type
2526
stream: OutputStream
2627
stack: seq[State]
2728
doubleEscape: bool
29+
escapeUnicode: bool
2830

2931
template top(x: seq[State]): State =
3032
x[^1]
@@ -75,7 +77,7 @@ proc write*(x: JsonRespStream, v: string) =
7577
append '\\'
7678
append c
7779

78-
for c in v:
80+
template writeChar(c: char) =
7981
case c
8082
of '\L': addPrefixSlash 'n'
8183
of '\b': addPrefixSlash 'b'
@@ -96,6 +98,25 @@ proc write*(x: JsonRespStream, v: string) =
9698
of '\\': addPrefixSlash '\\'
9799
else: append c
98100

101+
if x.escapeUnicode:
102+
for c in Utf8.codePoints(v):
103+
if c >= 0x80:
104+
let p = Utf16.toPair(c)
105+
if p.state == Utf16One:
106+
append "\\u"
107+
x.stream.writeHex([char(p.cp shr 8), char(p.cp and 0xFF)])
108+
elif p.state == Utf16Two:
109+
append "\\u"
110+
x.stream.writeHex([char(p.hi shr 8), char(p.hi and 0xFF)])
111+
append "\\u"
112+
x.stream.writeHex([char(p.lo shr 8), char(p.lo and 0xFF)])
113+
else:
114+
let cc = c.char
115+
writeChar(cc)
116+
else:
117+
for c in v:
118+
writeChar(c)
119+
99120
if x.doubleEscape:
100121
append "\\\""
101122
else:
@@ -198,12 +219,17 @@ proc getBytes*(x: JsonRespStream): seq[byte] =
198219
proc len*(x: JsonRespStream): int =
199220
x.stream.pos()
200221

201-
proc init*(v: JsonRespStream, doubleEscape: bool = false) =
222+
proc init*(v: JsonRespStream,
223+
doubleEscape: bool = false,
224+
escapeUnicode: bool = false) =
202225
v.stream = memoryOutput()
203226
v.stack = @[StateTop]
204227
v.doubleEscape = doubleEscape
228+
v.escapeUnicode = escapeUnicode
205229

206-
proc new*(_: type JsonRespStream, doubleEscape: bool = false): JsonRespStream =
230+
proc new*(_: type JsonRespStream,
231+
doubleEscape: bool = false,
232+
escapeUnicode: bool = false): JsonRespStream =
207233
let v = JsonRespStream()
208-
v.init(doubleEscape)
234+
v.init(doubleEscape, escapeUnicode)
209235
v

graphql/lexer.nim

Lines changed: 86 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
import
1111
std/[unicode, strutils],
1212
faststreams/inputs,
13-
./common/[names, errors, types]
13+
./common/[names, errors, types],
14+
./private/utf
1415

1516
type
1617
TokKind* = enum
@@ -48,11 +49,17 @@ type
4849
errInvalidUnicode = "Invalid unicode sequence '$1'"
4950
errInvalidChar = "Invalid char '$1'"
5051
errLoopLimit = "loop limit $1 reached for $2"
52+
errInvalidUTF8 = "Invalid UTF-8 sequence detected in string"
53+
errOrphanSurrogate = "Orphaned surrogate codepoint detected '$1'"
54+
55+
LexerFlag* = enum
56+
lfJsonCompatibility # parse json unicode escape chars but not graphql escape chars
5157

5258
LexConf* = object
5359
maxIdentChars* : int
5460
maxDigits* : int
5561
maxStringChars*: int
62+
flags* : set[LexerFlag]
5663

5764
LexConfInternal = object
5865
maxIdentChars : LoopGuard
@@ -72,6 +79,7 @@ type
7279
error* : LexerError
7380
err* : ErrorDesc
7481
conf : LexConfInternal
82+
flags* : set[LexerFlag]
7583

7684
proc defaultLexConf*(): LexConf =
7785
result.maxIdentChars = 128
@@ -88,7 +96,8 @@ proc init*(T: type Lexer, stream: InputStream, names: NameCache, conf = defaultL
8896
stream: stream,
8997
names: names,
9098
line: 1,
91-
conf: toInternalConf(conf)
99+
conf: toInternalConf(conf),
100+
flags: conf.flags
92101
)
93102

94103
template peek(s: InputStream): char =
@@ -121,7 +130,7 @@ proc lexerError(lex: var Lexer, errKind: LexerError, args: varargs[string, `$`])
121130
lex.err.message = $errKind
122131

123132
case errKind
124-
of errInvalidEscape, errInvalidUnicode, errInvalidChar:
133+
of errInvalidEscape, errInvalidUnicode, errInvalidChar, errOrphanSurrogate:
125134
lex.err.message = $errKind % [args[0]]
126135
of errLoopLimit:
127136
lex.err.message = $errKind % [args[0], args[1]]
@@ -276,19 +285,81 @@ func charTo(T: type, c: char): T {.inline.} =
276285
of {'A'..'F'}: result = T(c) - T('A') + T(10)
277286
else: doAssert(false, "should never executed")
278287

279-
proc scanHexDigits(lex: var Lexer, value: var int): int =
288+
proc scanHexDigits(lex: var Lexer, value: var int, token: var string): int =
280289
safeLoop(lex.conf.maxDigits, lex.safePeek HexDigits):
281290
inc result
282-
value = value * 16 + charTo(int, lex.stream.read)
291+
let c = lex.stream.read
292+
value = value * 16 + charTo(int, c)
293+
token.add c
294+
295+
proc invalidEscapeChar(lex: var Lexer) =
296+
if not lex.stream.readable:
297+
lex.lexerError(errInvalidEscape, tokEof)
298+
else:
299+
lex.lexerError(errInvalidEscape, lex.stream.peek)
283300

284301
proc scanUnicode(lex: var Lexer): bool =
285-
var code: int
286-
if lex.scanHexDigits(code) != 4:
287-
lex.lexerError(errInvalidUnicode, code)
288-
return false
302+
if lex.safePeek HexDigits:
303+
var codePoint: int
304+
var token: string
305+
if lex.scanHexDigits(codePoint, token) != 4:
306+
lex.lexerError(errInvalidUnicode, token)
307+
return false
289308

290-
lex.token.add unicode.toUTF8(Rune(code))
291-
return true
309+
if Utf16.highSurrogate(codePoint):
310+
if not lex.safePeek '\\':
311+
lex.lexerError(errOrphanSurrogate, token)
312+
return false
313+
advance lex.stream
314+
315+
if not lex.safePeek 'u':
316+
lex.lexerError(errOrphanSurrogate, token)
317+
return false
318+
advance lex.stream
319+
320+
var surrogate: int
321+
var hexSurrogate: string
322+
if lex.scanHexDigits(surrogate, hexSurrogate) != 4:
323+
lex.lexerError(errInvalidUnicode, hexSurrogate)
324+
return false
325+
326+
codePoint = Utf16.utf(codePoint, surrogate)
327+
token.add "\\u"
328+
token.add hexSurrogate
329+
330+
if not Utf8.append(lex.token, codePoint):
331+
lex.lexerError(errInvalidUnicode, token)
332+
return false
333+
334+
return true
335+
336+
elif lex.safePeek '{':
337+
if lfJsonCompatibility in lex.flags:
338+
lex.lexerError(errInvalidEscape, '{')
339+
return false
340+
341+
advance lex.stream # eat '{'
342+
343+
var codePoint: int
344+
var token: string
345+
if lex.scanHexDigits(codePoint, token) > 6:
346+
lex.lexerError(errInvalidUnicode, token)
347+
return false
348+
349+
if not Utf8.append(lex.token, codePoint):
350+
lex.lexerError(errInvalidUnicode, token)
351+
return false
352+
353+
if not lex.safePeek '}':
354+
lex.invalidEscapeChar
355+
return false
356+
357+
advance lex.stream # eat '}'
358+
return true
359+
360+
else:
361+
lex.invalidEscapeChar
362+
return false
292363

293364
proc scanEscapeChar(lex: var Lexer): bool =
294365
if not lex.stream.readable:
@@ -349,6 +420,8 @@ proc scanMultiLineString(lex: var Lexer) =
349420
lex.token.setLen(lex.token.len-1)
350421
lex.token.add "\"\"\"" # Escape Triple-Quote (\""")
351422
else:
423+
if Utf8.validate(lex.token) == false:
424+
lex.lexerError(errInvalidUTF8)
352425
return
353426
else:
354427
lex.token.add '"'
@@ -368,7 +441,6 @@ proc scanMultiLineString(lex: var Lexer) =
368441
of '\\':
369442
lex.token.add lex.stream.read
370443
else:
371-
# FIXME: this is not a valid UTF-16 lexer
372444
lex.token.add lex.stream.read
373445

374446
lex.lexerError(errUnterminatedBlockString)
@@ -382,14 +454,15 @@ proc scanSingleLineString(lex: var Lexer) =
382454
return
383455
of '"':
384456
advance lex.stream
457+
if Utf8.validate(lex.token) == false:
458+
lex.lexerError(errInvalidUTF8)
385459
return
386460
of '\\':
387461
advance lex.stream
388462
if not lex.scanEscapeChar():
389463
return
390464
continue
391465
else:
392-
# FIXME: this is not a valid UTF-16 lexer
393466
lex.token.add lex.stream.read
394467

395468
lex.lexerError(errUnterminatedString)

0 commit comments

Comments
 (0)