Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 51 additions & 2 deletions src/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,10 +177,12 @@ impl RegexBuilder {
Box::new(move || MatchData::new(config.clone(), &code));
Pool::new(create)
};
let utf = code.is_utf()?;
Ok(Regex {
config: Arc::new(self.config.clone()),
pattern: pattern.to_string(),
code,
utf,
capture_names: Arc::new(capture_names),
capture_names_idx: Arc::new(idx),
match_data,
Expand Down Expand Up @@ -361,6 +363,8 @@ pub struct Regex {
pattern: String,
/// The underlying compiled PCRE2 object.
code: Arc<Code>,
/// True if the regex uses UTF mode.
utf: bool,
/// The capture group names for this regex.
capture_names: Arc<Vec<Option<String>>>,
/// A map from capture group name to capture group index.
Expand All @@ -382,6 +386,7 @@ impl Clone for Regex {
config: Arc::clone(&self.config),
pattern: self.pattern.clone(),
code: Arc::clone(&self.code),
utf: self.utf,
capture_names: Arc::clone(&self.capture_names),
capture_names_idx: Arc::clone(&self.capture_names_idx),
match_data,
Expand Down Expand Up @@ -759,6 +764,20 @@ impl Regex {
fn new_match_data(&self) -> MatchData {
    // Each MatchData gets its own copy of the match configuration, but
    // borrows the compiled code shared by this regex.
    let match_config = self.config.match_config.clone();
    MatchData::new(match_config, &self.code)
}

/// Computes the smallest offset strictly greater than `start` at which
/// a subsequent search of `subject` may begin.
///
/// When the regex is in UTF mode, a search may only begin on a UTF-8
/// character boundary, so any continuation bytes (`0b10xx_xxxx`)
/// directly following `start + 1` are stepped over. When the regex is
/// not in UTF mode, every byte offset is acceptable and the result is
/// simply `start + 1`.
///
/// The returned offset may exceed `subject.len()`; offsets past the end
/// of `subject` are never inspected.
fn position_after(&self, subject: &[u8], start: usize) -> usize {
    let next = start + 1;
    if !self.utf {
        return next;
    }
    // Count the run of UTF-8 continuation bytes beginning at `next`. A
    // `next` beyond the end of `subject` yields no tail and so no skip.
    let continuation_run = subject.get(next..).map_or(0, |tail| {
        tail.iter().take_while(|&&byte| byte & 0xC0 == 0x80).count()
    });
    next + continuation_run
}
}

/// CaptureLocations is a low level representation of the raw offsets of each
Expand Down Expand Up @@ -1022,7 +1041,7 @@ impl<'r, 's> Iterator for Matches<'r, 's> {
// This is an empty match. To ensure we make progress, start
// the next search at the smallest possible starting position
// of the next match following this one.
self.last_end = m.end() + 1;
self.last_end = self.re.position_after(self.subject, m.end());
// Don't accept empty matches immediately following a match.
// Just move on to the next match.
if Some(m.end()) == self.last_match {
Expand Down Expand Up @@ -1069,7 +1088,7 @@ impl<'r, 's> Iterator for CaptureMatches<'r, 's> {
// This is an empty match. To ensure we make progress, start
// the next search at the smallest possible starting position
// of the next match following this one.
self.last_end = m.end() + 1;
self.last_end = self.re.position_after(self.subject, m.end());
// Don't accept empty matches immediately following a match.
// Just move on to the next match.
if Some(m.end()) == self.last_match {
Expand Down Expand Up @@ -1290,6 +1309,36 @@ mod tests {
);
}

#[test]
fn find_iter_empty_utf() {
    // The subject is three characters encoded as 3 + 2 + 1 bytes, so the
    // character boundaries fall at byte offsets 0, 3, 5 and 6.
    let subject = "∀ÁA".as_bytes();

    // UTF mode: empty matches are reported only at character boundaries.
    let utf_re = Regex::new(r"(*UTF)x*").unwrap();
    let utf_expected = vec![(0, 0), (3, 3), (5, 5), (6, 6)];
    assert_eq!(find_iter_tuples(&utf_re, subject), utf_expected);

    // Non-UTF mode: an empty match is reported at every byte offset.
    let byte_re = Regex::new(r"x*").unwrap();
    let byte_expected =
        vec![(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6)];
    assert_eq!(find_iter_tuples(&byte_re, subject), byte_expected);
}

#[test]
fn captures_iter_empty_utf() {
    // The subject is three characters encoded as 3 + 2 + 1 bytes, so the
    // character boundaries fall at byte offsets 0, 3, 5 and 6.
    let subject = "∀ÁA".as_bytes();

    // UTF mode: empty matches are reported only at character boundaries.
    let utf_re = Regex::new(r"(*UTF)x*").unwrap();
    let utf_expected = vec![(0, 0), (3, 3), (5, 5), (6, 6)];
    assert_eq!(cap_iter_tuples(&utf_re, subject), utf_expected);

    // Non-UTF mode: an empty match is reported at every byte offset.
    let byte_re = Regex::new(r"x*").unwrap();
    let byte_expected =
        vec![(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6)];
    assert_eq!(cap_iter_tuples(&byte_re, subject), byte_expected);
}

#[test]
fn max_jit_stack_size_does_something() {
if !is_jit_available() {
Expand Down
21 changes: 21 additions & 0 deletions src/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,27 @@ impl Code {
Ok(1 + count as usize)
}
}

/// Returns true if this regex uses UTF mode (matches whole UTF-8
/// characters), or false if it uses non-UTF mode (matches
/// individual bytes). This depends on the options specified when
/// compiling the regex, and may also be affected by flags such as
/// `(*UTF)` within the pattern itself.
pub(crate) fn is_utf(&self) -> Result<bool, Error> {
let mut options: u32 = 0;
let rc = unsafe {
pcre2_pattern_info_8(
self.as_ptr(),
PCRE2_INFO_ALLOPTIONS,
&mut options as *mut u32 as *mut c_void,
)
};
if rc != 0 {
Err(Error::info(rc))
} else {
Ok(options & PCRE2_UTF != 0)
}
}
}

/// A low level representation of PCRE2's compilation context.
Expand Down