Skip to content

Commit cd1c46d

Browse files
committed
syntax: simplify single char alternations
In short, simplify 'a|b|..|z' to '[a-z]'.
1 parent 350f142 commit cd1c46d

File tree

4 files changed

+97
-41
lines changed

4 files changed

+97
-41
lines changed

regex-syntax/src/debug.rs

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ impl<'a> core::fmt::Debug for Bytes<'a> {
7474
/// byte slice, then the first byte is returned instead.
7575
///
7676
/// This returns `None` if and only if `bytes` is empty.
77-
fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
77+
pub(crate) fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
7878
if bytes.is_empty() {
7979
return None;
8080
}
@@ -83,26 +83,3 @@ fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
8383
Err(_) => Some(Err(bytes[0])),
8484
}
8585
}
86-
87-
/*
88-
/// Given a UTF-8 leading byte, this returns the total number of code units
89-
/// in the following encoded codepoint.
90-
///
91-
/// If the given byte is not a valid UTF-8 leading byte, then this returns
92-
/// `None`.
93-
fn len(byte: u8) -> Option<usize> {
94-
if byte <= 0x7F {
95-
return Some(1);
96-
} else if byte & 0b1100_0000 == 0b1000_0000 {
97-
return None;
98-
} else if byte <= 0b1101_1111 {
99-
Some(2)
100-
} else if byte <= 0b1110_1111 {
101-
Some(3)
102-
} else if byte <= 0b1111_0111 {
103-
Some(4)
104-
} else {
105-
None
106-
}
107-
}
108-
*/

regex-syntax/src/hir/mod.rs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,25 @@ impl Hir {
419419
} else if new.len() == 1 {
420420
return new.pop().unwrap();
421421
}
422+
// Now that it's completely flattened, look for the special case of
423+
// 'char1|char2|...|charN' and collapse that into a class. Note that we
424+
// look for `char`s first and then bytes. The issue here is that if we
425+
// find both non-ASCII codepoints and non-ASCII singleton bytes, then
426+
// it isn't actually possible to combine them into a single class. So
427+
// we try all chars and then all bytes; anything else is left as an
428+
// alternation.
429+
if let Some(singletons) = singleton_chars(&new) {
430+
let it = singletons
431+
.into_iter()
432+
.map(|ch| ClassUnicodeRange { start: ch, end: ch });
433+
return Hir::class(Class::Unicode(ClassUnicode::new(it)));
434+
}
435+
if let Some(singletons) = singleton_bytes(&new) {
436+
let it = singletons
437+
.into_iter()
438+
.map(|b| ClassBytesRange { start: b, end: b });
439+
return Hir::class(Class::Bytes(ClassBytes::new(it)));
440+
}
422441
let props = Properties::alternation(&new);
423442
Hir { kind: HirKind::Alternation(new), props }
424443
}
@@ -1886,6 +1905,47 @@ impl Iterator for LookSetIter {
18861905
}
18871906
}
18881907

1908+
/// Given a sequence of HIR values, return the `char` for each one if every
1909+
/// value is a literal that is exactly one UTF-8 encoded `char`. If any
1910+
/// value is not, return `None`. No deduplication is done.
1911+
fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
1912+
let mut singletons = vec![];
1913+
for hir in hirs.iter() {
1914+
let literal = match *hir.kind() {
1915+
HirKind::Literal(Literal(ref bytes)) => bytes,
1916+
_ => return None,
1917+
};
1918+
let ch = match crate::debug::utf8_decode(literal) {
1919+
None => return None,
1920+
Some(Err(_)) => return None,
1921+
Some(Ok(ch)) => ch,
1922+
};
1923+
if literal.len() != ch.len_utf8() {
1924+
return None;
1925+
}
1926+
singletons.push(ch);
1927+
}
1928+
Some(singletons)
1929+
}
1930+
1931+
/// Given a sequence of HIR values, return the byte for each one if every
1932+
/// value is a literal that is exactly one byte long. If any value is
1933+
/// not, return `None`. No deduplication is done.
1934+
fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
1935+
let mut singletons = vec![];
1936+
for hir in hirs.iter() {
1937+
let literal = match *hir.kind() {
1938+
HirKind::Literal(Literal(ref bytes)) => bytes,
1939+
_ => return None,
1940+
};
1941+
if literal.len() != 1 {
1942+
return None;
1943+
}
1944+
singletons.push(literal[0]);
1945+
}
1946+
Some(singletons)
1947+
}
1948+
18891949
#[cfg(test)]
18901950
mod tests {
18911951
use super::*;

regex-syntax/src/hir/print.rs

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -434,8 +434,10 @@ mod tests {
434434
roundtrip("|", "(?:|)");
435435
roundtrip("||", "(?:||)");
436436

437-
roundtrip("a|b", "(?:a|b)");
438-
roundtrip("a|b|c", "(?:a|b|c)");
437+
roundtrip("a|b", "[ab]");
438+
roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
439+
roundtrip("a|b|c", "[a-c]");
440+
roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
439441
roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
440442
}
441443

@@ -494,19 +496,19 @@ mod tests {
494496
#[test]
495497
fn regression_repetition_alternation() {
496498
let expr = Hir::concat(alloc::vec![
497-
Hir::literal("x".as_bytes()),
499+
Hir::literal("ab".as_bytes()),
498500
Hir::repetition(hir::Repetition {
499501
min: 1,
500502
max: None,
501503
greedy: true,
502504
hir: Box::new(Hir::alternation(alloc::vec![
503-
Hir::literal("a".as_bytes()),
504-
Hir::literal("b".as_bytes()),
505+
Hir::literal("cd".as_bytes()),
506+
Hir::literal("ef".as_bytes()),
505507
])),
506508
}),
507-
Hir::literal("y".as_bytes()),
509+
Hir::literal("gh".as_bytes()),
508510
]);
509-
assert_eq!(r"(?:x(?:a|b)+y)", expr.to_string());
511+
assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());
510512

511513
let expr = Hir::concat(alloc::vec![
512514
Hir::look(hir::Look::Start),
@@ -538,13 +540,13 @@ mod tests {
538540
#[test]
539541
fn regression_alternation_concat() {
540542
let expr = Hir::concat(alloc::vec![
541-
Hir::literal("a".as_bytes()),
543+
Hir::literal("ab".as_bytes()),
542544
Hir::alternation(alloc::vec![
543-
Hir::literal("b".as_bytes()),
544-
Hir::literal("c".as_bytes()),
545+
Hir::literal("mn".as_bytes()),
546+
Hir::literal("xy".as_bytes()),
545547
]),
546548
]);
547-
assert_eq!(r"(?:a(?:b|c))", expr.to_string());
549+
assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());
548550

549551
let expr = Hir::concat(alloc::vec![
550552
Hir::look(hir::Look::Start),

regex-syntax/src/hir/translate.rs

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3285,15 +3285,11 @@ mod tests {
32853285
assert!(props(r"ab").is_alternation_literal());
32863286
assert!(props(r"abc").is_alternation_literal());
32873287
assert!(props(r"(?m)abc").is_alternation_literal());
3288-
assert!(props(r"a|b").is_alternation_literal());
3289-
assert!(props(r"a|b|c").is_alternation_literal());
32903288
assert!(props(r"foo|bar").is_alternation_literal());
32913289
assert!(props(r"foo|bar|baz").is_alternation_literal());
32923290
assert!(props(r"[a]").is_alternation_literal());
3293-
assert!(props(r"[a]|b").is_alternation_literal());
3294-
assert!(props(r"a|[b]").is_alternation_literal());
3295-
assert!(props(r"(?:a)|b").is_alternation_literal());
3296-
assert!(props(r"a|(?:b)").is_alternation_literal());
3291+
assert!(props(r"(?:ab)|cd").is_alternation_literal());
3292+
assert!(props(r"ab|(?:cd)").is_alternation_literal());
32973293

32983294
// Negative examples.
32993295
assert!(!props(r"").is_alternation_literal());
@@ -3307,6 +3303,12 @@ mod tests {
33073303
assert!(!props(r"a|[ab]").is_alternation_literal());
33083304
assert!(!props(r"(a)|b").is_alternation_literal());
33093305
assert!(!props(r"a|(b)").is_alternation_literal());
3306+
assert!(!props(r"a|b").is_alternation_literal());
3307+
assert!(!props(r"a|b|c").is_alternation_literal());
3308+
assert!(!props(r"[a]|b").is_alternation_literal());
3309+
assert!(!props(r"a|[b]").is_alternation_literal());
3310+
assert!(!props(r"(?:a)|b").is_alternation_literal());
3311+
assert!(!props(r"a|(?:b)").is_alternation_literal());
33103312
}
33113313

33123314
// This tests that the smart Hir::concat constructor simplifies the given
@@ -3354,5 +3356,20 @@ mod tests {
33543356
hir_lit("baz"),
33553357
])
33563358
);
3359+
assert_eq!(
3360+
t("quux|(?:abc|(?:def|mno)|xyz)|baz"),
3361+
hir_alt(vec![
3362+
hir_lit("quux"),
3363+
hir_lit("abc"),
3364+
hir_lit("def"),
3365+
hir_lit("mno"),
3366+
hir_lit("xyz"),
3367+
hir_lit("baz"),
3368+
])
3369+
);
3370+
assert_eq!(
3371+
t("a|b|c|d|e|f|x|y|z"),
3372+
hir_uclass(&[('a', 'f'), ('x', 'z')]),
3373+
);
33573374
}
33583375
}

0 commit comments

Comments
 (0)