Skip to content

Commit ae01819

Browse files
authored
Implementing Char for graphemes and all the necessary changes to make it happen. (#698)
* feat!: add newtypes for working with graphemes
  add `Grapheme`, `Graphemes` and `GraphemeIter` types
  move the implementation of input traits to the `text` module
  replace type `Input::Item` with `Grapheme`
  implement seq traits for graphemes
* refactor!: update the `Char` trait
  Remove `Char::Str` and everything related to it.
  Remove a number of supertraits for the `Char` trait.
  Replace `Char::to_char()` with `Char::to_ascii()`.
  Remove `Char::from_ascii()`.
  Add `Char::is_newline()`.
  Remove the generic `C` from `StrInput`.
* feat: add a `Char` implementation for `Grapheme`
* fix: fix errors generated in features
* fix: remove the use of unstable features
* fix: fix backwards compatibility
* fix: fix `clippy` warnings
* refactor!: make `Grapheme` and `Graphemes` unsized
  Replace their uses with references to them.
* fix: remove unused lifetime in `src/text.rs:609:10`
* feat: implement `StrInput` for `&Graphemes`
* fix: fix `clippy` errors
* feat!: manually implement `Debug` and `Display` for graphemes
1 parent ed345d9 commit ae01819

File tree

10 files changed

+636
-288
lines changed

10 files changed

+636
-288
lines changed

benches/parser.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -245,9 +245,9 @@ fn bench_then(c: &mut Criterion) {
245245

246246
#[cfg(feature = "regex")]
247247
fn bench_regex(c: &mut Criterion) {
248-
let re_foo = regex::<_, _, extra::Default>("foo");
249-
let re_foo2 = regex::<_, _, extra::Default>("[fF]oo");
250-
let re_rep = regex::<_, _, extra::Default>("(?:abc){4}");
248+
let re_foo = regex::<_, extra::Default>("foo");
249+
let re_foo2 = regex::<_, extra::Default>("[fF]oo");
250+
let re_rep = regex::<_, extra::Default>("(?:abc){4}");
251251

252252
let mut group = c.benchmark_group("regex");
253253

src/combinator.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,7 +1629,7 @@ where
16291629
///
16301630
/// ```
16311631
/// # use chumsky::prelude::*;
1632-
/// let row_4 = text::int::<_, _, extra::Err<Simple<char>>>(10)
1632+
/// let row_4 = text::int::<_, extra::Err<Simple<char>>>(10)
16331633
/// .padded()
16341634
/// .separated_by(just(','))
16351635
/// .at_most(4)
@@ -1661,7 +1661,7 @@ where
16611661
///
16621662
/// ```
16631663
/// # use chumsky::prelude::*;
1664-
/// let coordinate_3d = text::int::<_, _, extra::Err<Simple<char>>>(10)
1664+
/// let coordinate_3d = text::int::<_, extra::Err<Simple<char>>>(10)
16651665
/// .padded()
16661666
/// .separated_by(just(','))
16671667
/// .exactly(3)
@@ -1690,7 +1690,7 @@ where
16901690
///
16911691
/// ```
16921692
/// # use chumsky::prelude::*;
1693-
/// let r#enum = text::ascii::keyword::<_, _, _, extra::Err<Simple<char>>>("enum")
1693+
/// let r#enum = text::ascii::keyword::<_, _, extra::Err<Simple<char>>>("enum")
16941694
/// .padded()
16951695
/// .ignore_then(text::ascii::ident()
16961696
/// .padded()
@@ -1720,7 +1720,7 @@ where
17201720
///
17211721
/// ```
17221722
/// # use chumsky::prelude::*;
1723-
/// let numbers = text::int::<_, _, extra::Err<Simple<char>>>(10)
1723+
/// let numbers = text::int::<_, extra::Err<Simple<char>>>(10)
17241724
/// .padded()
17251725
/// .separated_by(just(','))
17261726
/// .allow_trailing()

src/container.rs

Lines changed: 65 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -762,7 +762,7 @@ impl<'p> Seq<'p, char> for str {
762762
}
763763
}
764764

765-
impl<'p> Seq<'p, char> for &'p str {
765+
impl<'p> Seq<'p, char> for String {
766766
type Item<'a>
767767
= char
768768
where
@@ -792,7 +792,7 @@ impl<'p> Seq<'p, char> for &'p str {
792792
}
793793
}
794794

795-
impl<'p> Seq<'p, char> for String {
795+
impl<'p> Seq<'p, char> for &'p str {
796796
type Item<'a>
797797
= char
798798
where
@@ -822,6 +822,66 @@ impl<'p> Seq<'p, char> for String {
822822
}
823823
}
824824

825+
impl<'p> Seq<'p, &'p Grapheme> for &'p str {
826+
type Item<'a>
827+
= &'p Grapheme
828+
where
829+
Self: 'a;
830+
831+
type Iter<'a>
832+
= GraphemesIter<'p>
833+
where
834+
Self: 'a;
835+
836+
#[inline(always)]
837+
fn seq_iter(&self) -> Self::Iter<'_> {
838+
Graphemes::new(self).iter()
839+
}
840+
841+
#[inline(always)]
842+
fn contains(&self, val: &&'p Grapheme) -> bool {
843+
Graphemes::new(self).contains(val)
844+
}
845+
846+
#[inline]
847+
fn to_maybe_ref<'b>(item: Self::Item<'b>) -> MaybeRef<'p, &'p Grapheme>
848+
where
849+
'p: 'b,
850+
{
851+
MaybeRef::Val(item)
852+
}
853+
}
854+
855+
impl<'p> Seq<'p, &'p Grapheme> for &'p Graphemes {
856+
type Item<'a>
857+
= &'p Grapheme
858+
where
859+
Self: 'a;
860+
861+
type Iter<'a>
862+
= GraphemesIter<'p>
863+
where
864+
Self: 'a;
865+
866+
#[inline(always)]
867+
fn seq_iter(&self) -> Self::Iter<'_> {
868+
self.iter()
869+
}
870+
871+
#[inline(always)]
872+
fn contains(&self, val: &&'p Grapheme) -> bool {
873+
self.iter().any(|i| i == *val)
874+
}
875+
876+
#[inline]
877+
fn to_maybe_ref<'b>(item: Self::Item<'b>) -> MaybeRef<'p, &'p Grapheme>
878+
where
879+
'p: 'b,
880+
{
881+
MaybeRef::Val(item)
882+
}
883+
}
884+
825885
/// A utility trait to abstract over *linear* container-like things.
826886
///
827887
/// This trait is likely to change in future versions of the crate, so avoid implementing it yourself.
@@ -838,8 +898,10 @@ impl<'p, T> OrderedSeq<'p, T> for core::ops::RangeInclusive<T> where Self: Seq<'
838898
impl<'p, T> OrderedSeq<'p, T> for RangeFrom<T> where Self: Seq<'p, T> {}
839899

840900
impl OrderedSeq<'_, char> for str {}
841-
impl<'p> OrderedSeq<'p, char> for &'p str {}
842901
impl OrderedSeq<'_, char> for String {}
902+
impl<'p> OrderedSeq<'p, char> for &'p str {}
903+
impl<'p> OrderedSeq<'p, &'p Grapheme> for &'p str {}
904+
impl<'p> OrderedSeq<'p, &'p Grapheme> for &'p Graphemes {}
843905

844906
#[cfg(test)]
845907
mod test {

src/input.rs

Lines changed: 12 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ pub use crate::stream::{BoxedExactSizeStream, BoxedStream, IterInput, Stream};
1212
use super::*;
1313
#[cfg(feature = "std")]
1414
use std::io::{BufReader, Read, Seek};
15-
use unicode_segmentation::{Graphemes, UnicodeSegmentation};
1615

1716
/// A trait for types that represent a stream of input tokens. Unlike [`Iterator`], this type
1817
/// supports backtracking and a few other features required by the crate.
@@ -210,8 +209,9 @@ pub trait SliceInput<'src>: ExactSizeInput<'src> {
210209
// Implemented by inputs that reference a string slice and use byte indices as their cursor. This trait is sealed right
211210
// now because `StrInput` places additional requirements on its cursor semantics.
212211
/// A trait for types that represent string-like streams of input tokens.
213-
pub trait StrInput<'src, C: Char>:
214-
Sealed + ValueInput<'src, Cursor = usize, Token = C> + SliceInput<'src, Slice = &'src C::Str>
212+
pub trait StrInput<'src>: Sealed + ValueInput<'src, Cursor = usize> + SliceInput<'src>
213+
where
214+
Self::Token: Char,
215215
{
216216
}
217217

@@ -298,7 +298,7 @@ impl<'src> ValueInput<'src> for &'src str {
298298
}
299299

300300
impl Sealed for &str {}
301-
impl<'src> StrInput<'src, char> for &'src str {}
301+
impl<'src> StrInput<'src> for &'src str {}
302302

303303
impl<'src> SliceInput<'src> for &'src str {
304304
type Slice = &'src str;
@@ -319,89 +319,6 @@ impl<'src> SliceInput<'src> for &'src str {
319319
}
320320
}
321321

322-
impl<'src> Input<'src> for Graphemes<'src> {
323-
type Cursor = usize;
324-
type Span = SimpleSpan<usize>;
325-
326-
type Token = &'src str;
327-
type MaybeToken = &'src str;
328-
329-
type Cache = &'src str;
330-
331-
#[inline]
332-
fn begin(self) -> (Self::Cursor, Self::Cache) {
333-
(0, self.as_str())
334-
}
335-
336-
#[inline]
337-
fn cursor_location(cursor: &Self::Cursor) -> usize {
338-
*cursor
339-
}
340-
341-
#[inline(always)]
342-
unsafe fn next_maybe(
343-
this: &mut Self::Cache,
344-
cursor: &mut Self::Cursor,
345-
) -> Option<Self::MaybeToken> {
346-
if *cursor < this.len() {
347-
// SAFETY: `cursor < self.len()` above guarantees cursor is in-bounds
348-
// We only ever return cursors that are at a code point boundary.
349-
// The `next()` implementation returns `None`, only in the
350-
// situation of zero length of the remaining part of the string.
351-
// And the Unicode standard guarantees that any sequence of code
352-
// points is a valid sequence of grapheme clusters, so the
353-
// behaviour of the `next()` function should not change.
354-
let c = this
355-
.get_unchecked(*cursor..)
356-
.graphemes(true)
357-
.next()
358-
.unwrap_unchecked();
359-
*cursor += c.len();
360-
Some(c)
361-
} else {
362-
None
363-
}
364-
}
365-
366-
#[inline(always)]
367-
unsafe fn span(_this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Span {
368-
(*range.start..*range.end).into()
369-
}
370-
}
371-
372-
impl<'src> ExactSizeInput<'src> for Graphemes<'src> {
373-
#[inline(always)]
374-
unsafe fn span_from(this: &mut Self::Cache, range: RangeFrom<&Self::Cursor>) -> Self::Span {
375-
(*range.start..this.len()).into()
376-
}
377-
}
378-
379-
impl<'src> ValueInput<'src> for Graphemes<'src> {
380-
#[inline(always)]
381-
unsafe fn next(this: &mut Self::Cache, cursor: &mut Self::Cursor) -> Option<Self::Token> {
382-
Self::next_maybe(this, cursor)
383-
}
384-
}
385-
386-
impl<'src> SliceInput<'src> for Graphemes<'src> {
387-
type Slice = Graphemes<'src>;
388-
389-
#[inline(always)]
390-
fn full_slice(this: &mut Self::Cache) -> Self::Slice {
391-
this.graphemes(true)
392-
}
393-
394-
#[inline(always)]
395-
unsafe fn slice(this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Slice {
396-
this[*range.start..*range.end].graphemes(true)
397-
}
398-
399-
#[inline(always)]
400-
unsafe fn slice_from(this: &mut Self::Cache, from: RangeFrom<&Self::Cursor>) -> Self::Slice {
401-
this[*from.start..].graphemes(true)
402-
}
403-
}
404-
405322
impl<'src, T> Input<'src> for &'src [T] {
406323
type Cursor = usize;
407324
type Span = SimpleSpan<usize>;
@@ -448,7 +365,7 @@ impl<'src, T> ExactSizeInput<'src> for &'src [T] {
448365
}
449366

450367
impl Sealed for &[u8] {}
451-
impl<'src> StrInput<'src, u8> for &'src [u8] {}
368+
impl<'src> StrInput<'src> for &'src [u8] {}
452369

453370
impl<'src, T> SliceInput<'src> for &'src [T] {
454371
type Slice = &'src [T];
@@ -532,7 +449,7 @@ impl<'src, T: 'src, const N: usize> ExactSizeInput<'src> for &'src [T; N] {
532449
}
533450

534451
impl<const N: usize> Sealed for &[u8; N] {}
535-
impl<'src, const N: usize> StrInput<'src, u8> for &'src [u8; N] {}
452+
impl<'src, const N: usize> StrInput<'src> for &'src [u8; N] {}
536453

537454
impl<'src, T: 'src, const N: usize> SliceInput<'src> for &'src [T; N] {
538455
type Slice = &'src [T];
@@ -881,14 +798,14 @@ where
881798
F: Fn(I::Span) -> S,
882799
{
883800
}
884-
impl<'src, C, S, I, F: 'src> StrInput<'src, C> for MappedSpan<S, I, F>
801+
impl<'src, S, I, F: 'src> StrInput<'src> for MappedSpan<S, I, F>
885802
where
886-
I: StrInput<'src, C>,
803+
I: StrInput<'src>,
804+
I::Token: Char,
887805
S: Span + Clone + 'src,
888806
S::Context: Clone + 'src,
889807
S::Offset: From<<I::Span as Span>::Offset>,
890808
F: Fn(I::Span) -> S,
891-
C: Char,
892809
{
893810
}
894811

@@ -1027,13 +944,13 @@ where
1027944
S::Offset: From<<I::Span as Span>::Offset>,
1028945
{
1029946
}
1030-
impl<'src, C, S, I> StrInput<'src, C> for WithContext<S, I>
947+
impl<'src, S, I> StrInput<'src> for WithContext<S, I>
1031948
where
1032-
I: StrInput<'src, C>,
949+
I: StrInput<'src>,
950+
I::Token: Char,
1033951
S: Span + Clone + 'src,
1034952
S::Context: Clone + 'src,
1035953
S::Offset: From<<I::Span as Span>::Offset>,
1036-
C: Char,
1037954
{
1038955
}
1039956

0 commit comments

Comments
 (0)