Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 64 additions & 7 deletions regex-automata/src/meta/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -917,7 +917,9 @@ impl Regex {
/// ```
#[inline]
pub fn search(&self, input: &Input<'_>) -> Option<Match> {
if self.imp.info.is_impossible(input) {
if self.imp.info.captures_disabled()
|| self.imp.info.is_impossible(input)
{
return None;
}
let mut guard = self.pool.get();
Expand Down Expand Up @@ -973,7 +975,9 @@ impl Regex {
/// ```
#[inline]
pub fn search_half(&self, input: &Input<'_>) -> Option<HalfMatch> {
if self.imp.info.is_impossible(input) {
if self.imp.info.captures_disabled()
|| self.imp.info.is_impossible(input)
{
return None;
}
let mut guard = self.pool.get();
Expand Down Expand Up @@ -1128,7 +1132,9 @@ impl Regex {
input: &Input<'_>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
if self.imp.info.is_impossible(input) {
if self.imp.info.captures_disabled()
|| self.imp.info.is_impossible(input)
{
return None;
}
let mut guard = self.pool.get();
Expand Down Expand Up @@ -1242,7 +1248,9 @@ impl Regex {
cache: &mut Cache,
input: &Input<'_>,
) -> Option<Match> {
if self.imp.info.is_impossible(input) {
if self.imp.info.captures_disabled()
|| self.imp.info.is_impossible(input)
{
return None;
}
self.imp.strat.search(cache, input)
Expand Down Expand Up @@ -1284,7 +1292,9 @@ impl Regex {
cache: &mut Cache,
input: &Input<'_>,
) -> Option<HalfMatch> {
if self.imp.info.is_impossible(input) {
if self.imp.info.captures_disabled()
|| self.imp.info.is_impossible(input)
{
return None;
}
self.imp.strat.search_half(cache, input)
Expand Down Expand Up @@ -1437,7 +1447,9 @@ impl Regex {
input: &Input<'_>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
if self.imp.info.is_impossible(input) {
if self.imp.info.captures_disabled()
|| self.imp.info.is_impossible(input)
{
return None;
}
self.imp.strat.search_slots(cache, input, slots)
Expand Down Expand Up @@ -1982,6 +1994,19 @@ impl RegexInfo {
self.props_union().look_set_suffix().contains(Look::End)
}

/// Reports whether this regex was configured with `WhichCaptures::None`,
/// i.e., whether its NFA was compiled without any capture states.
///
/// Without capture states, some regex engines (like the PikeVM) cannot
/// report match offsets, while others (like the lazy DFA) can. So that
/// whether a match is reported never depends on which engine happens to be
/// selected, routines that return match offsets _always_ report `None`
/// when this returns true.
///
/// This is admittedly a strange corner case, but `WhichCaptures::None`
/// is documented with an appropriate warning.
fn captures_disabled(&self) -> bool {
    let which = self.config().get_which_captures();
    matches!(which, WhichCaptures::None)
}

/// Returns true if and only if it is known that a match is impossible
/// for the given input. This is useful for short-circuiting and avoiding
/// running the regex engine if it's known no match can be reported.
Expand Down Expand Up @@ -2645,7 +2670,12 @@ impl Config {
/// Setting this to `WhichCaptures::None` is usually not the right thing to
/// do. When no capture states are compiled, some regex engines (such as
/// the `PikeVM`) won't be able to report match offsets. This will manifest
/// as no match being found.
/// as no match being found. Indeed, in order to enforce consistent
/// behavior, the meta regex engine will always report `None` for routines
/// that return match offsets even if one of its regex engines could
/// service the request. This prevents "match or not" behavior from being
/// influenced by user input (since user input can influence the selection
/// of the regex engine).
///
/// # Example
///
Expand Down Expand Up @@ -2694,6 +2724,33 @@ impl Config {
///
/// Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: strange `Regex::find` behavior
///
/// As noted above, when using [`WhichCaptures::None`], this means that
/// `Regex::is_match` could return `true` while `Regex::find` returns
/// `None`:
///
/// ```
/// use regex_automata::{
/// meta::Regex,
/// nfa::thompson::WhichCaptures,
/// Input,
/// Match,
/// Span,
/// };
///
/// let re = Regex::builder()
/// .configure(Regex::config().which_captures(WhichCaptures::None))
/// .build(r"foo([0-9]+)bar")?;
/// let hay = "foo123bar";
///
/// assert!(re.is_match(hay));
/// assert_eq!(re.find(hay), None);
/// assert_eq!(re.search_half(&Input::new(hay)), None);
///
/// Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
self.which_captures = Some(which_captures);
self
Expand Down
22 changes: 22 additions & 0 deletions regex-automata/src/nfa/thompson/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,28 @@ pub enum WhichCaptures {
/// This is useful when capture states are either not needed (for example,
/// if one is only trying to build a DFA) or if they aren't supported (for
/// example, a reverse NFA).
///
/// # Warning
///
/// Callers must be exceedingly careful when using this
/// option. In particular, not all regex engines support
/// reporting match spans when using this option (for example,
/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) or
/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)).
///
/// Perhaps more confusingly, using this option with such an
/// engine means that an `is_match` routine could report `true`
/// when `find` reports `None`. This is generally not something
/// that _should_ happen, but the low level control provided by
/// this crate makes it possible.
///
/// Similarly, any regex engine (like [`meta::Regex`](crate::meta::Regex))
/// should always return `None` from `find` routines when this option is
/// used, even if _some_ of its internal engines could find the match
/// boundaries. This is because inputs from user data could influence
/// engine selection, and thus influence whether a match is found or not.
/// Indeed, `meta::Regex::find` will always return `None` when configured
/// with this option.
None,
}

Expand Down