Skip to content

Commit 0ec1a99

Browse files
committed
Support parsing variable-length Unicode code points, vol.1
1 parent b97a646 commit 0ec1a99

File tree

1 file changed

+78
-71
lines changed

1 file changed

+78
-71
lines changed

juniper/src/parser/parser.rs

Lines changed: 78 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::{borrow::Cow, fmt};
1+
use std::{borrow::Cow, fmt, iter};
22

33
use compact_str::{CompactString, format_compact};
44
use derive_more::with_trait::{Display, Error};
@@ -227,31 +227,24 @@ impl<'a> StringLiteral<'a> {
227227
let mut char_iter = unquoted.chars();
228228
while let Some(ch) = char_iter.next() {
229229
match ch {
230+
// StringCharacter ::
231+
// SourceCharacter but not " or \ or LineTerminator
232+
// \uEscapedUnicode
233+
// \EscapedCharacter
230234
'\\' => match char_iter.next() {
231-
Some('"') => {
232-
unescaped.push('"');
233-
}
234-
Some('/') => {
235-
unescaped.push('/');
236-
}
237-
Some('n') => {
238-
unescaped.push('\n');
239-
}
240-
Some('r') => {
241-
unescaped.push('\r');
242-
}
243-
Some('t') => {
244-
unescaped.push('\t');
245-
}
246-
Some('\\') => {
247-
unescaped.push('\\');
248-
}
249-
Some('f') => {
250-
unescaped.push('\u{000c}');
251-
}
252-
Some('b') => {
253-
unescaped.push('\u{0008}');
254-
}
235+
// EscapedCharacter :: one of
236+
// " \ / b f n r t
237+
Some('"') => unescaped.push('"'),
238+
Some('\\') => unescaped.push('\\'),
239+
Some('/') => unescaped.push('/'),
240+
Some('b') => unescaped.push('\u{0008}'),
241+
Some('f') => unescaped.push('\u{000C}'),
242+
Some('n') => unescaped.push('\n'),
243+
Some('r') => unescaped.push('\r'),
244+
Some('t') => unescaped.push('\t'),
245+
// EscapedUnicode ::
246+
// {HexDigit[list]}
247+
// HexDigit HexDigit HexDigit HexDigit
255248
Some('u') => {
256249
unescaped.push(parse_unicode_codepoint(&mut char_iter)?);
257250
}
@@ -335,51 +328,65 @@ impl<'a> StringLiteral<'a> {
335328
}
336329
}
337330

338-
fn parse_unicode_codepoint<I>(char_iter: &mut I) -> Result<char, ParseError>
339-
where
340-
I: Iterator<Item = char>,
341-
{
342-
let escaped_code_point = char_iter
343-
.next()
344-
.ok_or_else(|| ParseError::LexerError(LexerError::UnknownEscapeSequence(r"\u".into())))
345-
.and_then(|c1| {
346-
char_iter
347-
.next()
348-
.map(|c2| format!("{c1}{c2}"))
349-
.ok_or_else(|| {
350-
ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(r"\u{c1}")))
351-
})
352-
})
353-
.and_then(|mut s| {
354-
char_iter
355-
.next()
356-
.ok_or_else(|| {
357-
ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(r"\u{s}")))
358-
})
359-
.map(|c2| {
360-
s.push(c2);
361-
s
362-
})
331+
/// Parses an [escaped unicode] character.
332+
///
333+
/// [escaped unicode]: https://spec.graphql.org/September2025#EscapedUnicode
334+
// TODO: Add tests
335+
// TODO: Check surrogate pairs?
336+
fn parse_unicode_codepoint(char_iter: &mut impl Iterator<Item = char>) -> Result<char, ParseError> {
337+
// EscapedUnicode ::
338+
// {HexDigit[list]}
339+
// HexDigit HexDigit HexDigit HexDigit
340+
341+
let Some(mut curr_ch) = char_iter.next() else {
342+
return Err(ParseError::LexerError(LexerError::UnknownEscapeSequence(
343+
r"\u".into(),
344+
)));
345+
};
346+
let mut escaped_code_point = String::with_capacity(6); // `\u{10FFFF}` is max code point
347+
348+
let is_variable_width = curr_ch == '{';
349+
if is_variable_width {
350+
loop {
351+
curr_ch = char_iter.next().ok_or_else(|| {
352+
ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(
353+
r"\u{{{escaped_code_point}"
354+
)))
355+
})?;
356+
if curr_ch == '}' {
357+
break;
358+
} else if !curr_ch.is_alphanumeric() {
359+
return Err(ParseError::LexerError(LexerError::UnknownEscapeSequence(
360+
format!(r"\u{{{escaped_code_point}"),
361+
)));
362+
}
363+
escaped_code_point.push(curr_ch);
364+
}
365+
} else {
366+
let mut char_iter = iter::once(curr_ch).chain(char_iter);
367+
for _ in 0..4 {
368+
curr_ch = char_iter.next().ok_or_else(|| {
369+
ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(
370+
r"\u{escaped_code_point}"
371+
)))
372+
})?;
373+
if !curr_ch.is_alphanumeric() {
374+
return Err(ParseError::LexerError(LexerError::UnknownEscapeSequence(
375+
format!(r"\u{escaped_code_point}"),
376+
)));
377+
}
378+
escaped_code_point.push(curr_ch);
379+
}
380+
}
381+
382+
u32::from_str_radix(&escaped_code_point, 16)
383+
.ok()
384+
.and_then(char::from_u32)
385+
.ok_or_else(|| {
386+
ParseError::LexerError(LexerError::UnknownEscapeSequence(if is_variable_width {
387+
format!(r"\u{{{escaped_code_point}}}")
388+
} else {
389+
format!(r"\u{escaped_code_point}")
390+
}))
363391
})
364-
.and_then(|mut s| {
365-
char_iter
366-
.next()
367-
.ok_or_else(|| {
368-
ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(r"\u{s}")))
369-
})
370-
.map(|c2| {
371-
s.push(c2);
372-
s
373-
})
374-
})?;
375-
let code_point = u32::from_str_radix(&escaped_code_point, 16).map_err(|_| {
376-
ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(
377-
r"\u{escaped_code_point}",
378-
)))
379-
})?;
380-
char::from_u32(code_point).ok_or_else(|| {
381-
ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(
382-
r"\u{escaped_code_point}",
383-
)))
384-
})
385392
}

0 commit comments

Comments
 (0)