Skip to content

Commit be57a23

Browse files
committed
syntax: add 'Hir::dot' method to replace 'Hir::{any,dot}_{char,byte}'
In a previous commit, I replaced 'Hir::{any,dot}' with a total of four methods. Essentially, I expanded out the boolean parameter to 'Hir::{any,dot}'. I later realized that we'll probably need a "dot except for CR and LF" too. And having four methods all for the same 'dot' construct seemed a bit much. So I've turned it into one method with a new 'Dot' enum. Eventually, that enum should grow two more variants: 'AnyCharExceptCRLF' and 'AnyByteExceptCRLF'. That sort of expansion would have been pretty annoying to do (because of naming) in the prior scheme.
1 parent 224dc3a commit be57a23

File tree

3 files changed

+80
-65
lines changed

3 files changed

+80
-65
lines changed

regex-syntax/src/hir/mod.rs

Lines changed: 60 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -317,54 +317,41 @@ impl Hir {
317317
}
318318
}
319319

320-
/// Build an HIR expression for `.`.
321-
///
322-
/// A `.` expression matches any character except for a newline terminator.
323-
/// To build an expression that matches any character, including newline
324-
/// terminators, use the `any_char` method.
325-
pub fn dot_char() -> Hir {
326-
let mut cls = ClassUnicode::empty();
327-
cls.push(ClassUnicodeRange::new('\0', '\x09'));
328-
cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
329-
Hir::class(Class::Unicode(cls))
330-
}
331-
332-
/// Build an HIR expression for `(?-u:.)`.
333-
///
334-
/// A non-Unicode `.` expression matches any byte except for a newline
335-
/// terminator. To build an expression that matches any byte, including
336-
/// newline terminators, use the `any_byte` method.
337-
pub fn dot_byte() -> Hir {
338-
let mut cls = ClassBytes::empty();
339-
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
340-
cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
341-
Hir::class(Class::Bytes(cls))
342-
}
343-
344-
/// Build an HIR expression for `(?s:.)`.
345-
///
346-
/// A `(?s:.)` expression matches any character, including newline
347-
/// terminators. To build an expression that matches any character except
348-
/// for newline terminators, use the `dot_char` method.
349-
///
350-
/// Note that `(?s:)` is equivalent to `\p{any}`.
351-
pub fn any_char() -> Hir {
352-
let mut cls = ClassUnicode::empty();
353-
cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
354-
Hir::class(Class::Unicode(cls))
355-
}
356-
357-
/// Build an HIR expression for `(?s-u:.)`.
358-
///
359-
/// A `(?s-u:.)` expression matches any byte, including newline terminators.
360-
/// To build an expression that matches any byte except for newline
361-
/// terminators, use the `dot_byte` method.
362-
///
363-
/// Note that `(?s-u:.)` is equivalent to `(?-u:[\x00-\xFF])`.
364-
pub fn any_byte() -> Hir {
365-
let mut cls = ClassBytes::empty();
366-
cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
367-
Hir::class(Class::Bytes(cls))
320+
/// Returns an HIR expression for `.`.
321+
///
322+
/// * [`Dot::AnyChar`] maps to `(?su:.)`.
323+
/// * [`Dot::AnyByte`] maps to `(?s-u:.)`.
324+
/// * [`Dot::AnyCharExceptNL`] maps to `(?u-s:.)`.
325+
/// * [`Dot::AnyByteExceptNL`] maps to `(?-su:.)`.
326+
///
327+
/// Note that this is a convenience routine for constructing the correct
328+
/// character class based on the value of `Dot`. There is no explicit "dot"
329+
/// HIR value. It is just an abbreviation for a common character class.
330+
pub fn dot(dot: Dot) -> Hir {
331+
match dot {
332+
Dot::AnyChar => {
333+
let mut cls = ClassUnicode::empty();
334+
cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
335+
Hir::class(Class::Unicode(cls))
336+
}
337+
Dot::AnyByte => {
338+
let mut cls = ClassBytes::empty();
339+
cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
340+
Hir::class(Class::Bytes(cls))
341+
}
342+
Dot::AnyCharExceptNL => {
343+
let mut cls = ClassUnicode::empty();
344+
cls.push(ClassUnicodeRange::new('\0', '\x09'));
345+
cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
346+
Hir::class(Class::Unicode(cls))
347+
}
348+
Dot::AnyByteExceptNL => {
349+
let mut cls = ClassBytes::empty();
350+
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
351+
cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
352+
Hir::class(Class::Bytes(cls))
353+
}
354+
}
368355
}
369356
}
370357

@@ -1233,6 +1220,31 @@ impl Repetition {
12331220
}
12341221
}
12351222

1223+
/// A type describing the different flavors of `.`.
1224+
///
1225+
/// This type is meant to be used with [`Hir::dot`], which is a convenience
1226+
/// routine for building HIR values derived from the `.` regex.
1227+
#[non_exhaustive]
1228+
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1229+
pub enum Dot {
1230+
/// Matches the UTF-8 encoding of any Unicode scalar value.
1231+
///
1232+
/// This is equivalent to `(?su:.)` and also `\p{any}`.
1233+
AnyChar,
1234+
/// Matches any byte value.
1235+
///
1236+
/// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1237+
AnyByte,
1238+
/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1239+
///
1240+
/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1241+
AnyCharExceptNL,
1242+
/// Matches any byte value except for `\n`.
1243+
///
1244+
/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1245+
AnyByteExceptNL,
1246+
}
1247+
12361248
/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
12371249
/// space but heap space proportional to the depth of the total `Hir`.
12381250
impl Drop for Hir {

regex-syntax/src/hir/translate.rs

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -856,23 +856,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
856856
}
857857

858858
fn hir_dot(&self, span: Span) -> Result<Hir> {
859-
let unicode = self.flags().unicode();
860-
if !unicode && !self.trans().allow_invalid_utf8 {
859+
if !self.flags().unicode() && !self.trans().allow_invalid_utf8 {
861860
return Err(self.error(span, ErrorKind::InvalidUtf8));
862861
}
863-
Ok(if self.flags().dot_matches_new_line() {
864-
if unicode {
865-
Hir::any_char()
866-
} else {
867-
Hir::any_byte()
868-
}
869-
} else {
870-
if unicode {
871-
Hir::dot_char()
872-
} else {
873-
Hir::dot_byte()
874-
}
875-
})
862+
Ok(Hir::dot(self.flags().dot()))
876863
}
877864

878865
fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
@@ -1210,6 +1197,22 @@ impl Flags {
12101197
}
12111198
}
12121199

1200+
fn dot(&self) -> hir::Dot {
1201+
if self.dot_matches_new_line() {
1202+
if self.unicode() {
1203+
hir::Dot::AnyChar
1204+
} else {
1205+
hir::Dot::AnyByte
1206+
}
1207+
} else {
1208+
if self.unicode() {
1209+
hir::Dot::AnyCharExceptNL
1210+
} else {
1211+
hir::Dot::AnyByteExceptNL
1212+
}
1213+
}
1214+
}
1215+
12131216
fn case_insensitive(&self) -> bool {
12141217
self.case_insensitive.unwrap_or(false)
12151218
}

src/compile.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,9 +425,9 @@ impl Compiler {
425425

426426
fn c_dotstar(&mut self) -> Result {
427427
let hir = if self.compiled.only_utf8() {
428-
Hir::any_char()
428+
Hir::dot(hir::Dot::AnyChar)
429429
} else {
430-
Hir::any_byte()
430+
Hir::dot(hir::Dot::AnyByte)
431431
};
432432
Ok(self
433433
.c(&Hir::repetition(hir::Repetition {

0 commit comments

Comments
 (0)