1010import
1111 std/ [unicode, strutils],
1212 faststreams/ inputs,
13- ./ common/ [names, errors, types]
13+ ./ common/ [names, errors, types],
14+ ./ private/ utf
1415
1516type
1617 TokKind * = enum
4849 errInvalidUnicode = " Invalid unicode sequence '$1'"
4950 errInvalidChar = " Invalid char '$1'"
5051 errLoopLimit = " loop limit $1 reached for $2"
52+ errInvalidUTF8 = " Invalid UTF-8 sequence detected in string"
53+ errOrphanSurrogate = " Orphaned surrogate codepoint detected '$1'"
54+
55+ LexerFlag * = enum
56+ lfJsonCompatibility # JSON mode: `\uXXXX` escapes (including surrogate pairs) are still decoded, but the GraphQL-only braced form `\u{...}` is rejected as an invalid escape
5157
5258 LexConf * = object # user-facing lexer limits and options; passed to `Lexer.init` as `conf`
5359 maxIdentChars* : int
5460 maxDigits* : int # presumably the loop limit for digit runs (the hex-digit scanner reads `lex.conf.maxDigits`) - TODO confirm against `toInternalConf`
5561 maxStringChars* : int
62+ flags* : set [LexerFlag ] # copied as-is into `Lexer.flags` by `init`
5663
5764 LexConfInternal = object
5865 maxIdentChars : LoopGuard
7279 error* : LexerError
7380 err* : ErrorDesc
7481 conf : LexConfInternal
82+ flags* : set [LexerFlag ]
7583
7684proc defaultLexConf * (): LexConf =
7785 result .maxIdentChars = 128
@@ -88,7 +96,8 @@ proc init*(T: type Lexer, stream: InputStream, names: NameCache, conf = defaultL
8896 stream: stream,
8997 names: names,
9098 line: 1 ,
91- conf: toInternalConf (conf)
99+ conf: toInternalConf (conf),
100+ flags: conf.flags
92101 )
93102
94103template peek (s: InputStream ): char =
@@ -121,7 +130,7 @@ proc lexerError(lex: var Lexer, errKind: LexerError, args: varargs[string, `$`])
121130 lex.err.message = $ errKind
122131
123132 case errKind
124- of errInvalidEscape, errInvalidUnicode, errInvalidChar:
133+ of errInvalidEscape, errInvalidUnicode, errInvalidChar, errOrphanSurrogate :
125134 lex.err.message = $ errKind % [args[0 ]]
126135 of errLoopLimit:
127136 lex.err.message = $ errKind % [args[0 ], args[1 ]]
@@ -276,19 +285,81 @@ func charTo(T: type, c: char): T {.inline.} =
276285 of {'A' .. 'F' }: result = T (c) - T ('A' ) + T (10 )
277286 else : doAssert (false , " should never executed" )
278287
proc scanHexDigits(lex: var Lexer, value: var int, token: var string): int =
  ## Consumes consecutive hex digits from the stream.
  ##
  ## * `value` accumulates the numeric value of the digits (base 16),
  ##   starting from whatever the caller passed in.
  ## * `token` receives the raw digit characters, for use in error messages.
  ##
  ## Returns the number of digits consumed. The loop is bounded by
  ## `lex.conf.maxDigits` through `safeLoop`.
  ##
  ## `value` is exact as long as it does not exceed the largest Unicode code
  ## point. Beyond that it is no longer updated: it stays above 0x10FFFF, so
  ## it can never be mistaken for a valid code point, and a long digit run
  ## coming from untrusted input cannot overflow `int`.
  const maxCodePoint = 0x10FFFF
  safeLoop(lex.conf.maxDigits, lex.safePeek HexDigits):
    inc result
    let c = lex.stream.read
    token.add c
    # Only reachable with seven or more significant digits, which the
    # unicode escape scanner rejects by digit count anyway.
    if value <= maxCodePoint:
      value = value * 16 + charTo(int, c)
294+
proc invalidEscapeChar(lex: var Lexer) =
  ## Records an `errInvalidEscape` error describing what sits at the current
  ## stream position: the next character when one is available, otherwise
  ## `tokEof`. The stream itself is not advanced.
  if lex.stream.readable:
    lex.lexerError(errInvalidEscape, lex.stream.peek)
  else:
    lex.lexerError(errInvalidEscape, tokEof)
283300
proc scanUnicode(lex: var Lexer): bool =
  ## Scans the payload of a unicode escape and appends the decoded code point
  ## to `lex.token` as UTF-8.
  ##
  ## Presumably called once the leading `\u` has been consumed (the call
  ## site is not visible from here - confirm). Two forms are recognised:
  ##
  ## * `XXXX` - exactly four hex digits. A high surrogate must be followed
  ##   immediately by a second `\uXXXX` escape holding a low surrogate; the
  ##   two are combined into a single code point.
  ## * `{X...}` - one to six hex digits in braces. Rejected with
  ##   `errInvalidEscape` when `lfJsonCompatibility` is set.
  ##
  ## Returns `true` on success. On failure an error is recorded through
  ## `lexerError` and `false` is returned.
  const
    lowSurrogateFirst = 0xDC00
    lowSurrogateLast = 0xDFFF

  if lex.safePeek HexDigits:
    var codePoint: int
    var token: string
    if lex.scanHexDigits(codePoint, token) != 4:
      lex.lexerError(errInvalidUnicode, token)
      return false

    if Utf16.highSurrogate(codePoint):
      # A high surrogate is only meaningful as the first half of a pair,
      # so a second `\uXXXX` escape has to follow directly.
      if not lex.safePeek '\\':
        lex.lexerError(errOrphanSurrogate, token)
        return false
      advance lex.stream

      if not lex.safePeek 'u':
        lex.lexerError(errOrphanSurrogate, token)
        return false
      advance lex.stream

      var surrogate: int
      var hexSurrogate: string
      if lex.scanHexDigits(surrogate, hexSurrogate) != 4:
        lex.lexerError(errInvalidUnicode, hexSurrogate)
        return false

      # The second half must really be a low surrogate; combining the high
      # surrogate with anything else would yield an unrelated code point.
      if surrogate < lowSurrogateFirst or surrogate > lowSurrogateLast:
        lex.lexerError(errOrphanSurrogate, token)
        return false

      codePoint = Utf16.utf(codePoint, surrogate)
      # Keep the full escape text around for the error message below.
      token.add "\\u"
      token.add hexSurrogate

    if not Utf8.append(lex.token, codePoint):
      lex.lexerError(errInvalidUnicode, token)
      return false

    return true

  elif lex.safePeek '{':
    if lfJsonCompatibility in lex.flags:
      lex.lexerError(errInvalidEscape, '{')
      return false

    advance lex.stream  # eat '{'

    var codePoint: int
    var token: string
    let digits = lex.scanHexDigits(codePoint, token)

    if digits == 0:
      # `\u{}` carries no code point at all; report whatever follows the
      # brace instead of silently decoding it as U+0000.
      lex.invalidEscapeChar
      return false

    if digits > 6:
      lex.lexerError(errInvalidUnicode, token)
      return false

    if not Utf8.append(lex.token, codePoint):
      lex.lexerError(errInvalidUnicode, token)
      return false

    if not lex.safePeek '}':
      lex.invalidEscapeChar
      return false

    advance lex.stream  # eat '}'
    return true

  else:
    lex.invalidEscapeChar
    return false
292363
293364proc scanEscapeChar (lex: var Lexer ): bool =
294365 if not lex.stream.readable:
@@ -349,6 +420,8 @@ proc scanMultiLineString(lex: var Lexer) =
349420 lex.token.setLen (lex.token.len- 1 )
350421 lex.token.add " \"\"\" " # Escape Triple-Quote (\""")
351422 else :
423+ if Utf8 .validate (lex.token) == false :
424+ lex.lexerError (errInvalidUTF8)
352425 return
353426 else :
354427 lex.token.add '"'
@@ -368,7 +441,6 @@ proc scanMultiLineString(lex: var Lexer) =
368441 of '\\ ' :
369442 lex.token.add lex.stream.read
370443 else :
371- # FIXME: this is not a valid UTF-16 lexer
372444 lex.token.add lex.stream.read
373445
374446 lex.lexerError (errUnterminatedBlockString)
@@ -382,14 +454,15 @@ proc scanSingleLineString(lex: var Lexer) =
382454 return
383455 of '"' :
384456 advance lex.stream
457+ if Utf8 .validate (lex.token) == false :
458+ lex.lexerError (errInvalidUTF8)
385459 return
386460 of '\\ ' :
387461 advance lex.stream
388462 if not lex.scanEscapeChar ():
389463 return
390464 continue
391465 else :
392- # FIXME: this is not a valid UTF-16 lexer
393466 lex.token.add lex.stream.read
394467
395468 lex.lexerError (errUnterminatedString)
0 commit comments