Skip to content

Commit 4299e54

Browse files
committed
Support surrogates in parsing
1 parent 3937c76 commit 4299e54

File tree

5 files changed

+368
-65
lines changed

5 files changed

+368
-65
lines changed

juniper/src/parser/document.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ where
2525
S: ScalarValue,
2626
{
2727
let mut lexer = Lexer::new(s);
28-
let mut parser = Parser::new(&mut lexer).map_err(|s| s.map(ParseError::LexerError))?;
28+
let mut parser = Parser::new(&mut lexer).map_err(|s| s.map(Into::into))?;
2929
parse_document(&mut parser, schema)
3030
}
3131

juniper/src/parser/lexer.rs

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::{char, ops::Deref, str::CharIndices};
1+
use std::{char, fmt, ops::Deref, str::CharIndices};
22

33
use derive_more::with_trait::{Display, Error};
44

@@ -491,6 +491,8 @@ impl<'a> Lexer<'a> {
491491
)
492492
})?;
493493

494+
// TODO: Support surrogates.
495+
494496
char::from_u32(code_point)
495497
.ok_or_else(|| {
496498
Spanning::zero_width(
@@ -677,6 +679,70 @@ fn is_number_start(c: char) -> bool {
677679
c == '-' || c.is_ascii_digit()
678680
}
679681

682+
/// Representation of a [Unicode code point].
683+
///
684+
/// This is different from a [Unicode scalar value] (aka "character") represented by a [`char`],
685+
/// because it can also denote a [surrogate code point].
686+
///
687+
/// [surrogate code point]: https://unicode.org/glossary#surrogate_code_point
688+
/// [Unicode code point]: https://unicode.org/glossary#code_point
689+
/// [Unicode scalar value]: https://unicode.org/glossary#unicode_scalar_value
690+
#[derive(Clone, Copy, Debug)]
691+
pub(crate) struct UnicodeCodePoint {
692+
/// Code representing this [`UnicodeCodePoint`].
693+
pub(crate) code: u32,
694+
695+
/// Indicates whether this [`UnicodeCodePoint`] should be [`Display`]ed in variable-width form.
696+
pub(crate) is_variable_width: bool,
697+
}
698+
699+
impl Display for UnicodeCodePoint {
700+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
701+
if self.is_variable_width {
702+
write!(f, r"\u{{{:X}}}", self.code)
703+
} else {
704+
write!(f, r"\u{:04X}", self.code)
705+
}
706+
}
707+
}
708+
709+
impl UnicodeCodePoint {
710+
/// Indicates whether this [`UnicodeCodePoint`] is a high (leading) [surrogate].
711+
///
712+
/// [surrogate]: https://unicode.org/glossary#surrogate_code_point
713+
pub(crate) fn is_high_surrogate(self) -> bool {
714+
(0xD800..=0xDBFF).contains(&self.code)
715+
}
716+
717+
/// Indicates whether this [`UnicodeCodePoint`] is a low (trailing) [surrogate].
718+
///
719+
/// [surrogate]: https://unicode.org/glossary#surrogate_code_point
720+
pub(crate) fn is_low_surrogate(self) -> bool {
721+
(0xDC00..=0xDFFF).contains(&self.code)
722+
}
723+
724+
/// Joins the provided [surrogate pair][0] into a single [`UnicodeCodePoint`].
725+
///
726+
/// [0]: https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs
727+
pub(crate) fn from_surrogate_pair(high: Self, low: Self) -> Self {
728+
Self {
729+
code: 0x10000 + ((high.code & 0x03FF) << 10) + (low.code & 0x03FF),
730+
is_variable_width: true,
731+
}
732+
}
733+
734+
/// Tries to convert this [`UnicodeCodePoint`] into a [`char`].
735+
///
736+
/// # Errors
737+
///
738+
/// If this [`UnicodeCodePoint`] doesn't represent a [Unicode scalar value].
739+
///
740+
/// [Unicode scalar value]: https://unicode.org/glossary#unicode_scalar_value
741+
pub(crate) fn try_into_char(self) -> Result<char, LexerError> {
742+
char::from_u32(self.code).ok_or_else(|| LexerError::UnknownEscapeSequence(self.to_string()))
743+
}
744+
}
745+
680746
#[cfg(test)]
681747
mod test {
682748
use crate::parser::{

juniper/src/parser/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ pub use self::{
1717
parser::{OptionParseResult, ParseError, ParseResult, Parser, UnlocatedParseResult},
1818
utils::{SourcePosition, Span, Spanning},
1919
};
20+
pub(crate) use self::lexer::UnicodeCodePoint;

0 commit comments

Comments
 (0)