diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index a812b4012..21c1a3a31 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -917,7 +917,9 @@ impl Regex { /// ``` #[inline] pub fn search(&self, input: &Input<'_>) -> Option { - if self.imp.info.is_impossible(input) { + if self.imp.info.captures_disabled() + || self.imp.info.is_impossible(input) + { return None; } let mut guard = self.pool.get(); @@ -973,7 +975,9 @@ impl Regex { /// ``` #[inline] pub fn search_half(&self, input: &Input<'_>) -> Option { - if self.imp.info.is_impossible(input) { + if self.imp.info.captures_disabled() + || self.imp.info.is_impossible(input) + { return None; } let mut guard = self.pool.get(); @@ -1128,7 +1132,9 @@ impl Regex { input: &Input<'_>, slots: &mut [Option], ) -> Option { - if self.imp.info.is_impossible(input) { + if self.imp.info.captures_disabled() + || self.imp.info.is_impossible(input) + { return None; } let mut guard = self.pool.get(); @@ -1242,7 +1248,9 @@ impl Regex { cache: &mut Cache, input: &Input<'_>, ) -> Option { - if self.imp.info.is_impossible(input) { + if self.imp.info.captures_disabled() + || self.imp.info.is_impossible(input) + { return None; } self.imp.strat.search(cache, input) @@ -1284,7 +1292,9 @@ impl Regex { cache: &mut Cache, input: &Input<'_>, ) -> Option { - if self.imp.info.is_impossible(input) { + if self.imp.info.captures_disabled() + || self.imp.info.is_impossible(input) + { return None; } self.imp.strat.search_half(cache, input) @@ -1437,7 +1447,9 @@ impl Regex { input: &Input<'_>, slots: &mut [Option], ) -> Option { - if self.imp.info.is_impossible(input) { + if self.imp.info.captures_disabled() + || self.imp.info.is_impossible(input) + { return None; } self.imp.strat.search_slots(cache, input, slots) @@ -1982,6 +1994,19 @@ impl RegexInfo { self.props_union().look_set_suffix().contains(Look::End) } + /// Returns true when the regex's NFA lacks capture states. 
+ /// + /// In this case, some regex engines (like the PikeVM) are unable to report + /// match offsets, while some (like the lazy DFA) can. To avoid having + /// engine selection determine whether a match is reported, routines that + /// return match offsets will _always_ report `None` when this is true. + /// + /// Yes, this is a weird case and a rather unfortunate one. But + /// `WhichCaptures::None` comes with an appropriate warning. + fn captures_disabled(&self) -> bool { + matches!(self.config().get_which_captures(), WhichCaptures::None) + } + /// Returns true if and only if it is known that a match is impossible /// for the given input. This is useful for short-circuiting and avoiding /// running the regex engine if it's known no match can be reported. @@ -2645,7 +2670,12 @@ impl Config { /// Setting this to `WhichCaptures::None` is usually not the right thing to /// do. When no capture states are compiled, some regex engines (such as /// the `PikeVM`) won't be able to report match offsets. This will manifest - /// as no match being found. + /// as no match being found. Indeed, in order to enforce consistent + /// behavior, the meta regex engine will always report `None` for routines + /// that return match offsets even if one of its regex engines could + /// service the request. This prevents "match or not" behavior from being + /// influenced by user input (since user input can influence the selection + /// of the regex engine). 
/// /// # Example /// @@ -2694,6 +2724,33 @@ impl Config { /// /// Ok::<(), Box<dyn std::error::Error>>(()) /// ``` + /// + /// # Example: strange `Regex::find` behavior + /// + /// As noted above, using [`WhichCaptures::None`] means that + /// `Regex::is_match` could return `true` while `Regex::find` returns + /// `None`: + /// + /// ``` + /// use regex_automata::{ + /// meta::Regex, + /// nfa::thompson::WhichCaptures, + /// Input, + /// Match, + /// Span, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().which_captures(WhichCaptures::None)) + /// .build(r"foo([0-9]+)bar")?; + /// let hay = "foo123bar"; + /// + /// assert!(re.is_match(hay)); + /// assert_eq!(re.find(hay), None); + /// assert_eq!(re.search_half(&Input::new(hay)), None); + /// + /// Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { self.which_captures = Some(which_captures); self diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 569f60acd..96a39ac4e 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -563,6 +563,28 @@ pub enum WhichCaptures { /// This is useful when capture states are either not needed (for example, /// if one is only trying to build a DFA) or if they aren't supported (for /// example, a reverse NFA). + /// + /// # Warning + /// + /// Callers must be exceedingly careful when using this + /// option. In particular, not all regex engines support + /// reporting match spans when using this option (for example, + /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) or + /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)). + /// + /// Perhaps more confusingly, using this option with such an + /// engine means that an `is_match` routine could report `true` + /// when `find` reports `None`. 
This is generally not something + /// that _should_ happen, but the low-level control provided by + /// this crate makes it possible. + /// + /// Similarly, any regex engine (like [`meta::Regex`](crate::meta::Regex)) + /// should always return `None` from `find` routines when this option is + /// used, even if _some_ of its internal engines could find the match + /// boundaries. This is because inputs from user data could influence + /// engine selection, and thus influence whether a match is found or not. + /// Indeed, `meta::Regex::find` will always return `None` when configured + /// with this option. None, }