|
1 |
| -use std::{char, ops::Deref, str::CharIndices}; |
| 1 | +use std::{char, fmt, ops::Deref, str::CharIndices}; |
2 | 2 |
|
3 | 3 | use derive_more::with_trait::{Display, Error};
|
4 | 4 |
|
@@ -491,6 +491,8 @@ impl<'a> Lexer<'a> {
|
491 | 491 | )
|
492 | 492 | })?;
|
493 | 493 |
|
| 494 | + // TODO: Support surrogate. |
| 495 | + |
494 | 496 | char::from_u32(code_point)
|
495 | 497 | .ok_or_else(|| {
|
496 | 498 | Spanning::zero_width(
|
@@ -677,6 +679,70 @@ fn is_number_start(c: char) -> bool {
|
677 | 679 | c == '-' || c.is_ascii_digit()
|
678 | 680 | }
|
679 | 681 |
|
/// Representation of a [Unicode code point].
///
/// This is different from a [Unicode scalar value] (aka "character") represented by a [`char`],
/// because it can also denote a [surrogate code point].
///
/// [surrogate code point]: https://unicode.org/glossary#surrogate_code_point
/// [Unicode code point]: https://unicode.org/glossary#code_point
/// [Unicode scalar value]: https://unicode.org/glossary#unicode_scalar_value
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) struct UnicodeCodePoint {
    /// Numeric value of this [`UnicodeCodePoint`].
    pub(crate) code: u32,

    /// Indicator whether this [`UnicodeCodePoint`] should be [`Display`]ed in variable-width form
    /// (`\u{…}`) rather than in fixed-width form (`\uXXXX`).
    pub(crate) is_variable_width: bool,
}
| 698 | + |
| 699 | +impl Display for UnicodeCodePoint { |
| 700 | + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 701 | + if self.is_variable_width { |
| 702 | + write!(f, r"\u{{{:X}}}", self.code) |
| 703 | + } else { |
| 704 | + write!(f, r"\u{:04X}", self.code) |
| 705 | + } |
| 706 | + } |
| 707 | +} |
| 708 | + |
| 709 | +impl UnicodeCodePoint { |
| 710 | + /// Indicates whether this [`UnicodeCodePoint`] is a high (leading) [surrogate]. |
| 711 | + /// |
| 712 | + /// [surrogate]: https://unicode.org/glossary#surrogate_code_point |
| 713 | + pub(crate) fn is_high_surrogate(self) -> bool { |
| 714 | + (0xD800..=0xDBFF).contains(&self.code) |
| 715 | + } |
| 716 | + |
| 717 | + /// Indicates whether this [`UnicodeCodePoint`] is a low (trailing) [surrogate]. |
| 718 | + /// |
| 719 | + /// [surrogate]: https://unicode.org/glossary#surrogate_code_point |
| 720 | + pub(crate) fn is_low_surrogate(self) -> bool { |
| 721 | + (0xDC00..=0xDFFF).contains(&self.code) |
| 722 | + } |
| 723 | + |
| 724 | + /// Joins a [`UnicodeCodePoint`] from the provided [surrogate pair][0]. |
| 725 | + /// |
| 726 | + /// [0]: https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs |
| 727 | + pub(crate) fn from_surrogate_pair(high: Self, low: Self) -> Self { |
| 728 | + Self { |
| 729 | + code: 0x10000 + ((high.code & 0x03FF) << 10) + (low.code & 0x03FF), |
| 730 | + is_variable_width: true, |
| 731 | + } |
| 732 | + } |
| 733 | + |
| 734 | + /// Tries to convert this [`UnicodeCodePoint`] into a [`char`]. |
| 735 | + /// |
| 736 | + /// # Errors |
| 737 | + /// |
| 738 | + /// If this [`UnicodeCodePoint`] doesn't represent a [Unicode scalar value]. |
| 739 | + /// |
| 740 | + /// [Unicode scalar value]: https://unicode.org/glossary#unicode_scalar_value |
| 741 | + pub(crate) fn try_into_char(self) -> Result<char, LexerError> { |
| 742 | + char::from_u32(self.code).ok_or_else(|| LexerError::UnknownEscapeSequence(self.to_string())) |
| 743 | + } |
| 744 | +} |
| 745 | + |
680 | 746 | #[cfg(test)]
|
681 | 747 | mod test {
|
682 | 748 | use crate::parser::{
|
|
0 commit comments