|
1 |
| -use std::{borrow::Cow, fmt}; |
| 1 | +use std::{borrow::Cow, fmt, iter}; |
2 | 2 |
|
3 | 3 | use compact_str::{CompactString, format_compact};
|
4 | 4 | use derive_more::with_trait::{Display, Error};
|
@@ -227,31 +227,24 @@ impl<'a> StringLiteral<'a> {
|
227 | 227 | let mut char_iter = unquoted.chars();
|
228 | 228 | while let Some(ch) = char_iter.next() {
|
229 | 229 | match ch {
|
| 230 | + // StringCharacter :: |
| 231 | + // SourceCharacter but not " or \ or LineTerminator |
| 232 | + // \uEscapedUnicode |
| 233 | + // \EscapedCharacter |
230 | 234 | '\\' => match char_iter.next() {
|
231 |
| - Some('"') => { |
232 |
| - unescaped.push('"'); |
233 |
| - } |
234 |
| - Some('/') => { |
235 |
| - unescaped.push('/'); |
236 |
| - } |
237 |
| - Some('n') => { |
238 |
| - unescaped.push('\n'); |
239 |
| - } |
240 |
| - Some('r') => { |
241 |
| - unescaped.push('\r'); |
242 |
| - } |
243 |
| - Some('t') => { |
244 |
| - unescaped.push('\t'); |
245 |
| - } |
246 |
| - Some('\\') => { |
247 |
| - unescaped.push('\\'); |
248 |
| - } |
249 |
| - Some('f') => { |
250 |
| - unescaped.push('\u{000c}'); |
251 |
| - } |
252 |
| - Some('b') => { |
253 |
| - unescaped.push('\u{0008}'); |
254 |
| - } |
| 235 | + // EscapedCharacter :: one of |
| 236 | + // " \ / b f n r t |
| 237 | + Some('"') => unescaped.push('"'), |
| 238 | + Some('\\') => unescaped.push('\\'), |
| 239 | + Some('/') => unescaped.push('/'), |
| 240 | + Some('b') => unescaped.push('\u{0008}'), |
| 241 | + Some('f') => unescaped.push('\u{000C}'), |
| 242 | + Some('n') => unescaped.push('\n'), |
| 243 | + Some('r') => unescaped.push('\r'), |
| 244 | + Some('t') => unescaped.push('\t'), |
| 245 | + // EscapedUnicode :: |
| 246 | + // {HexDigit[list]} |
| 247 | + // HexDigit HexDigit HexDigit HexDigit |
255 | 248 | Some('u') => {
|
256 | 249 | unescaped.push(parse_unicode_codepoint(&mut char_iter)?);
|
257 | 250 | }
|
@@ -335,51 +328,65 @@ impl<'a> StringLiteral<'a> {
|
335 | 328 | }
|
336 | 329 | }
|
337 | 330 |
|
338 |
| -fn parse_unicode_codepoint<I>(char_iter: &mut I) -> Result<char, ParseError> |
339 |
| -where |
340 |
| - I: Iterator<Item = char>, |
341 |
| -{ |
342 |
| - let escaped_code_point = char_iter |
343 |
| - .next() |
344 |
| - .ok_or_else(|| ParseError::LexerError(LexerError::UnknownEscapeSequence(r"\u".into()))) |
345 |
| - .and_then(|c1| { |
346 |
| - char_iter |
347 |
| - .next() |
348 |
| - .map(|c2| format!("{c1}{c2}")) |
349 |
| - .ok_or_else(|| { |
350 |
| - ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(r"\u{c1}"))) |
351 |
| - }) |
352 |
| - }) |
353 |
| - .and_then(|mut s| { |
354 |
| - char_iter |
355 |
| - .next() |
356 |
| - .ok_or_else(|| { |
357 |
| - ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(r"\u{s}"))) |
358 |
| - }) |
359 |
| - .map(|c2| { |
360 |
| - s.push(c2); |
361 |
| - s |
362 |
| - }) |
| 331 | +/// Parses an [escaped unicode] character. |
| 332 | +/// |
| 333 | +/// [escaped unicode]: https://spec.graphql.org/September2025#EscapedUnicode |
| 334 | +// TODO: Add tests |
| 335 | +// TODO: Check surrogate pairs? |
| 336 | +fn parse_unicode_codepoint(char_iter: &mut impl Iterator<Item = char>) -> Result<char, ParseError> { |
| 337 | + // EscapedUnicode :: |
| 338 | + // {HexDigit[list]} |
| 339 | + // HexDigit HexDigit HexDigit HexDigit |
| 340 | + |
| 341 | + let Some(mut curr_ch) = char_iter.next() else { |
| 342 | + return Err(ParseError::LexerError(LexerError::UnknownEscapeSequence( |
| 343 | + r"\u".into(), |
| 344 | + ))); |
| 345 | + }; |
| 346 | + let mut escaped_code_point = String::with_capacity(6); // `\u{10FFFF}` is max code point |
| 347 | + |
| 348 | + let is_variable_width = curr_ch == '{'; |
| 349 | + if is_variable_width { |
| 350 | + loop { |
| 351 | + curr_ch = char_iter.next().ok_or_else(|| { |
| 352 | + ParseError::LexerError(LexerError::UnknownEscapeSequence(format!( |
| 353 | + r"\u{{{escaped_code_point}" |
| 354 | + ))) |
| 355 | + })?; |
| 356 | + if curr_ch == '}' { |
| 357 | + break; |
| 358 | + } else if !curr_ch.is_alphanumeric() { |
| 359 | + return Err(ParseError::LexerError(LexerError::UnknownEscapeSequence( |
| 360 | + format!(r"\u{{{escaped_code_point}"), |
| 361 | + ))); |
| 362 | + } |
| 363 | + escaped_code_point.push(curr_ch); |
| 364 | + } |
| 365 | + } else { |
| 366 | + let mut char_iter = iter::once(curr_ch).chain(char_iter); |
| 367 | + for _ in 0..4 { |
| 368 | + curr_ch = char_iter.next().ok_or_else(|| { |
| 369 | + ParseError::LexerError(LexerError::UnknownEscapeSequence(format!( |
| 370 | + r"\u{escaped_code_point}" |
| 371 | + ))) |
| 372 | + })?; |
| 373 | + if !curr_ch.is_alphanumeric() { |
| 374 | + return Err(ParseError::LexerError(LexerError::UnknownEscapeSequence( |
| 375 | + format!(r"\u{escaped_code_point}"), |
| 376 | + ))); |
| 377 | + } |
| 378 | + escaped_code_point.push(curr_ch); |
| 379 | + } |
| 380 | + } |
| 381 | + |
| 382 | + u32::from_str_radix(&escaped_code_point, 16) |
| 383 | + .ok() |
| 384 | + .and_then(char::from_u32) |
| 385 | + .ok_or_else(|| { |
| 386 | + ParseError::LexerError(LexerError::UnknownEscapeSequence(if is_variable_width { |
| 387 | + format!(r"\u{{{escaped_code_point}}}") |
| 388 | + } else { |
| 389 | + format!(r"\u{escaped_code_point}") |
| 390 | + })) |
363 | 391 | })
|
364 |
| - .and_then(|mut s| { |
365 |
| - char_iter |
366 |
| - .next() |
367 |
| - .ok_or_else(|| { |
368 |
| - ParseError::LexerError(LexerError::UnknownEscapeSequence(format!(r"\u{s}"))) |
369 |
| - }) |
370 |
| - .map(|c2| { |
371 |
| - s.push(c2); |
372 |
| - s |
373 |
| - }) |
374 |
| - })?; |
375 |
| - let code_point = u32::from_str_radix(&escaped_code_point, 16).map_err(|_| { |
376 |
| - ParseError::LexerError(LexerError::UnknownEscapeSequence(format!( |
377 |
| - r"\u{escaped_code_point}", |
378 |
| - ))) |
379 |
| - })?; |
380 |
| - char::from_u32(code_point).ok_or_else(|| { |
381 |
| - ParseError::LexerError(LexerError::UnknownEscapeSequence(format!( |
382 |
| - r"\u{escaped_code_point}", |
383 |
| - ))) |
384 |
| - }) |
385 | 392 | }
|
0 commit comments