Skip to content

Commit 095f407

Browse files
committed
Support surrogates in lexer
1 parent 4299e54 commit 095f407

File tree

2 files changed

+200
-17
lines changed

2 files changed

+200
-17
lines changed

juniper/src/parser/lexer.rs

Lines changed: 199 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,30 @@ impl<'a> Lexer<'a> {
319319
// {HexDigit[list]}
320320
// HexDigit HexDigit HexDigit HexDigit
321321
'u' if escaped => {
322-
self.scan_escaped_unicode(&old_pos)?;
322+
let mut code_point = self.scan_escaped_unicode(&old_pos)?;
323+
if code_point.is_high_surrogate() {
324+
let new_pos = self.position;
325+
let (Some((_, '\\')), Some((_, 'u'))) =
326+
(self.next_char(), self.next_char())
327+
else {
328+
return Err(Spanning::zero_width(
329+
&old_pos,
330+
LexerError::UnknownEscapeSequence(code_point.to_string()),
331+
));
332+
};
333+
let trailing_code_point = self.scan_escaped_unicode(&new_pos)?;
334+
if !trailing_code_point.is_low_surrogate() {
335+
return Err(Spanning::zero_width(
336+
&old_pos,
337+
LexerError::UnknownEscapeSequence(code_point.to_string()),
338+
));
339+
}
340+
code_point =
341+
UnicodeCodePoint::from_surrogate_pair(code_point, trailing_code_point);
342+
}
343+
_ = code_point
344+
.try_into_char()
345+
.map_err(|e| Spanning::zero_width(&old_pos, e))?;
323346
escaped = false;
324347
}
325348
c if escaped => {
@@ -419,7 +442,7 @@ impl<'a> Lexer<'a> {
419442
fn scan_escaped_unicode(
420443
&mut self,
421444
start_pos: &SourcePosition,
422-
) -> Result<(), Spanning<LexerError>> {
445+
) -> Result<UnicodeCodePoint, Spanning<LexerError>> {
423446
// EscapedUnicode ::
424447
// {HexDigit[list]}
425448
// HexDigit HexDigit HexDigit HexDigit
@@ -474,6 +497,13 @@ impl<'a> Lexer<'a> {
474497
)),
475498
));
476499
}
500+
// `\u{10FFFF}` is max code point
501+
if escape.len() - 2 > 6 {
502+
return Err(Spanning::zero_width(
503+
start_pos,
504+
LexerError::UnknownEscapeSequence(format!(r"\u{}", &escape[..escape.len()])),
505+
));
506+
}
477507
u32::from_str_radix(&escape[1..escape.len() - 1], 16)
478508
} else {
479509
if len != 4 {
@@ -491,16 +521,10 @@ impl<'a> Lexer<'a> {
491521
)
492522
})?;
493523

494-
// TODO: Support surrogate.
495-
496-
char::from_u32(code_point)
497-
.ok_or_else(|| {
498-
Spanning::zero_width(
499-
start_pos,
500-
LexerError::UnknownEscapeSequence(format!(r"\u{escape}")),
501-
)
502-
})
503-
.map(drop)
524+
Ok(UnicodeCodePoint {
525+
code: code_point,
526+
is_variable_width,
527+
})
504528
}
505529

506530
fn scan_number(&mut self) -> LexerResult<'a> {
@@ -725,6 +749,8 @@ impl UnicodeCodePoint {
725749
///
726750
/// [0]: https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs
727751
pub(crate) fn from_surrogate_pair(high: Self, low: Self) -> Self {
752+
debug_assert!(high.is_high_surrogate(), "`{high}` is not a high surrogate");
753+
debug_assert!(low.is_low_surrogate(), "`{low}` is not a low surrogate");
728754
Self {
729755
code: 0x10000 + ((high.code & 0x03FF) << 10) + (low.code & 0x03FF),
730756
is_variable_width: true,
@@ -967,6 +993,83 @@ mod test {
967993
),
968994
);
969995

996+
assert_eq!(
997+
tokenize_single(r#""string with unicode escape outside BMP \u{1F600}""#),
998+
Spanning::start_end(
999+
&SourcePosition::new(0, 0, 0),
1000+
&SourcePosition::new(50, 0, 50),
1001+
Token::Scalar(ScalarToken::String(Quoted(
1002+
r#""string with unicode escape outside BMP \u{1F600}""#,
1003+
))),
1004+
),
1005+
);
1006+
1007+
assert_eq!(
1008+
tokenize_single(r#""string with minimal unicode escape \u{0}""#),
1009+
Spanning::start_end(
1010+
&SourcePosition::new(0, 0, 0),
1011+
&SourcePosition::new(42, 0, 42),
1012+
Token::Scalar(ScalarToken::String(Quoted(
1013+
r#""string with minimal unicode escape \u{0}""#,
1014+
))),
1015+
),
1016+
);
1017+
1018+
assert_eq!(
1019+
tokenize_single(r#""string with maximal unicode escape \u{10FFFF}""#),
1020+
Spanning::start_end(
1021+
&SourcePosition::new(0, 0, 0),
1022+
&SourcePosition::new(47, 0, 47),
1023+
Token::Scalar(ScalarToken::String(Quoted(
1024+
r#""string with maximal unicode escape \u{10FFFF}""#,
1025+
))),
1026+
),
1027+
);
1028+
1029+
assert_eq!(
1030+
tokenize_single(r#""string with maximal minimal unicode escape \u{000000}""#),
1031+
Spanning::start_end(
1032+
&SourcePosition::new(0, 0, 0),
1033+
&SourcePosition::new(55, 0, 55),
1034+
Token::Scalar(ScalarToken::String(Quoted(
1035+
r#""string with maximal minimal unicode escape \u{000000}""#,
1036+
))),
1037+
),
1038+
);
1039+
1040+
assert_eq!(
1041+
tokenize_single(r#""string with unicode surrogate pair escape \uD83D\uDE00""#),
1042+
Spanning::start_end(
1043+
&SourcePosition::new(0, 0, 0),
1044+
&SourcePosition::new(56, 0, 56),
1045+
Token::Scalar(ScalarToken::String(Quoted(
1046+
r#""string with unicode surrogate pair escape \uD83D\uDE00""#,
1047+
))),
1048+
),
1049+
);
1050+
1051+
assert_eq!(
1052+
tokenize_single(r#""string with minimal surrogate pair escape \uD800\uDC00""#),
1053+
Spanning::start_end(
1054+
&SourcePosition::new(0, 0, 0),
1055+
&SourcePosition::new(56, 0, 56),
1056+
Token::Scalar(ScalarToken::String(Quoted(
1057+
r#""string with minimal surrogate pair escape \uD800\uDC00""#,
1058+
))),
1059+
),
1060+
);
1061+
1062+
assert_eq!(
1063+
tokenize_single(r#""string with maximal surrogate pair escape \uDBFF\uDFFF""#),
1064+
Spanning::start_end(
1065+
&SourcePosition::new(0, 0, 0),
1066+
&SourcePosition::new(56, 0, 56),
1067+
Token::Scalar(ScalarToken::String(Quoted(
1068+
r#""string with maximal surrogate pair escape \uDBFF\uDFFF""#,
1069+
))),
1070+
),
1071+
);
1072+
9701073
assert_eq!(
9711074
tokenize_single("\"contains unescaped \u{0007} control char\""),
9721075
Spanning::start_end(
@@ -1089,18 +1192,98 @@ mod test {
10891192
);
10901193

10911194
assert_eq!(
1092-
tokenize_error(r#""bad \u{DEAD} esc""#),
1195+
tokenize_error(r#""bad \u{FXXX} esc""#),
10931196
Spanning::zero_width(
10941197
&SourcePosition::new(6, 0, 6),
1095-
LexerError::UnknownEscapeSequence(r"\u{DEAD}".into()),
1198+
LexerError::UnknownEscapeSequence(r"\u{FXXX}".into()),
1199+
),
1200+
);
1201+
1202+
assert_eq!(
1203+
tokenize_error(r#""bad \u{FFFF esc""#),
1204+
Spanning::zero_width(
1205+
&SourcePosition::new(6, 0, 6),
1206+
LexerError::UnknownEscapeSequence(r"\u{FFFF".into()),
1207+
),
1208+
);
1209+
1210+
assert_eq!(
1211+
tokenize_error(r#""bad \u{FFF esc""#),
1212+
Spanning::zero_width(
1213+
&SourcePosition::new(6, 0, 6),
1214+
LexerError::UnknownEscapeSequence(r"\u{FFF".into()),
1215+
),
1216+
);
1217+
1218+
assert_eq!(
1219+
tokenize_error(r#""bad \u{FFFF""#),
1220+
Spanning::zero_width(
1221+
&SourcePosition::new(6, 0, 6),
1222+
LexerError::UnknownEscapeSequence(r"\u{FFFF".into()),
10961223
),
10971224
);
10981225

10991226
assert_eq!(
1100-
tokenize_error(r#""bad \u{DEA esc""#),
1227+
tokenize_error(r#""bad \u{} esc""#),
11011228
Spanning::zero_width(
11021229
&SourcePosition::new(6, 0, 6),
1103-
LexerError::UnknownEscapeSequence(r"\u{DEA".into()),
1230+
LexerError::UnknownEscapeSequence(r"\u{}".into()),
1231+
),
1232+
);
1233+
1234+
assert_eq!(
1235+
tokenize_error(r#""too high \u{110000} esc""#),
1236+
Spanning::zero_width(
1237+
&SourcePosition::new(11, 0, 11),
1238+
LexerError::UnknownEscapeSequence(r"\u{110000}".into()),
1239+
),
1240+
);
1241+
1242+
assert_eq!(
1243+
tokenize_error(r#""way too high \u{12345678} esc""#),
1244+
Spanning::zero_width(
1245+
&SourcePosition::new(15, 0, 15),
1246+
LexerError::UnknownEscapeSequence(r"\u{12345678}".into()),
1247+
),
1248+
);
1249+
1250+
assert_eq!(
1251+
tokenize_error(r#""too long \u{000000000} esc""#),
1252+
Spanning::zero_width(
1253+
&SourcePosition::new(11, 0, 11),
1254+
LexerError::UnknownEscapeSequence(r"\u{000000000}".into()),
1255+
),
1256+
);
1257+
1258+
assert_eq!(
1259+
tokenize_error(r#""bad surrogate \uDEAD esc""#),
1260+
Spanning::zero_width(
1261+
&SourcePosition::new(16, 0, 16),
1262+
LexerError::UnknownEscapeSequence(r"\uDEAD".into()),
1263+
),
1264+
);
1265+
1266+
assert_eq!(
1267+
tokenize_error(r#""bad surrogate \u{DEAD} esc""#),
1268+
Spanning::zero_width(
1269+
&SourcePosition::new(16, 0, 16),
1270+
LexerError::UnknownEscapeSequence(r"\u{DEAD}".into()),
1271+
),
1272+
);
1273+
1274+
assert_eq!(
1275+
tokenize_error(r#""bad high surrogate pair \uDEAD\uDEAD esc""#),
1276+
Spanning::zero_width(
1277+
&SourcePosition::new(26, 0, 26),
1278+
LexerError::UnknownEscapeSequence(r"\uDEAD".into()),
1279+
),
1280+
);
1281+
1282+
assert_eq!(
1283+
tokenize_error(r#""bad low surrogate pair \uD800\uD800 esc""#),
1284+
Spanning::zero_width(
1285+
&SourcePosition::new(25, 0, 25),
1286+
LexerError::UnknownEscapeSequence(r"\uD800".into()),
11041287
),
11051288
);
11061289

juniper/src/parser/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ mod tests;
1212

1313
pub use self::document::parse_document_source;
1414

15+
pub(crate) use self::lexer::UnicodeCodePoint;
1516
pub use self::{
1617
lexer::{Lexer, LexerError, ScalarToken, StringLiteral, Token},
1718
parser::{OptionParseResult, ParseError, ParseResult, Parser, UnlocatedParseResult},
1819
utils::{SourcePosition, Span, Spanning},
1920
};
20-
pub(crate) use self::lexer::UnicodeCodePoint;

0 commit comments

Comments
 (0)