Skip to content

Commit f56601b

Browse files
henrikhorluck authored and ridiculousfish committed
Add support for capture groups and substring replacement
This adds a new regex function `capture`, which captures matching substrings, using a new type `Captures`. It also adds new functions `replace` and `replace_all`, allowing substring replacement.
1 parent f933dc9 commit f56601b

File tree

4 files changed

+427
-4
lines changed

4 files changed

+427
-4
lines changed

src/bytes.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use crate::ffi::CodeUnitWidth8;
2+
pub use crate::regex_impl::Captures as CapturesImpl;
23
pub use crate::regex_impl::Match as MatchImpl;
34

45
#[doc(inline)]
@@ -21,6 +22,19 @@ pub type RegexBuilder = RegexBuilderImpl<CodeUnitWidth8>;
2122
/// of the subject string.
2223
pub type Match<'s> = MatchImpl<'s, CodeUnitWidth8>;
2324

25+
/// `Captures` represents a group of captured byte strings for a single match.
26+
///
27+
/// The 0th capture always corresponds to the entire match. Each subsequent
28+
/// index corresponds to the next capture group in the regex. If a capture
29+
/// group is named, then the matched byte string is *also* available via the
30+
/// `name` method. (Note that the 0th capture is always unnamed and so must be
31+
/// accessed with the `get` method.)
32+
///
33+
/// Positions returned from a capture group are always byte indices.
34+
///
35+
/// `'s` is the lifetime of the matched subject string.
36+
pub type Captures<'s> = CapturesImpl<'s, CodeUnitWidth8>;
37+
2438
#[cfg(test)]
2539
mod tests {
2640
use super::{CodeUnitWidth8, Regex, RegexBuilder};

src/ffi.rs

Lines changed: 117 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ pub trait CodeUnitWidth: std::fmt::Debug + 'static {
8888
type pcre2_match_context;
8989
type pcre2_match_data;
9090
type pcre2_jit_stack;
91-
type PCRE2_CHAR;
91+
type PCRE2_CHAR: Default + Copy + TryInto<Self::SubjectChar>;
9292
type PCRE2_SPTR;
9393
type name_table_entry: NameTableEntry;
9494
type SubjectChar: Copy;
@@ -169,6 +169,20 @@ pub trait CodeUnitWidth: std::fmt::Debug + 'static {
169169
unsafe fn pcre2_get_ovector_count(
170170
arg1: *mut Self::pcre2_match_data,
171171
) -> u32;
172+
173+
unsafe fn pcre2_substitute(
174+
code: *const Self::pcre2_code,
175+
subject: Self::PCRE2_SPTR,
176+
length: usize,
177+
startoffset: usize,
178+
options: u32,
179+
match_data: *mut Self::pcre2_match_data,
180+
mcontext: *mut Self::pcre2_match_context,
181+
replacement: Self::PCRE2_SPTR,
182+
rlength: usize,
183+
outputbuffer: *mut Self::PCRE2_CHAR,
184+
outputlengthptr: *mut usize,
185+
) -> ::libc::c_int;
172186
}
173187

174188
#[derive(Debug)]
@@ -313,6 +327,33 @@ impl CodeUnitWidth for CodeUnitWidth8 {
313327
) -> u32 {
314328
pcre2_get_ovector_count_8(arg1)
315329
}
330+
unsafe fn pcre2_substitute(
331+
code: *const Self::pcre2_code,
332+
subject: Self::PCRE2_SPTR,
333+
length: usize,
334+
startoffset: usize,
335+
options: u32,
336+
match_data: *mut Self::pcre2_match_data,
337+
mcontext: *mut Self::pcre2_match_context,
338+
replacement: Self::PCRE2_SPTR,
339+
rlength: usize,
340+
outputbuffer: *mut Self::PCRE2_CHAR,
341+
outputlengthptr: *mut usize,
342+
) -> ::libc::c_int {
343+
pcre2_substitute_8(
344+
code,
345+
subject,
346+
length,
347+
startoffset,
348+
options,
349+
match_data,
350+
mcontext,
351+
replacement,
352+
rlength,
353+
outputbuffer,
354+
outputlengthptr,
355+
)
356+
}
316357
}
317358

318359
#[derive(Debug)]
@@ -461,6 +502,34 @@ impl CodeUnitWidth for CodeUnitWidth32 {
461502
) -> u32 {
462503
pcre2_get_ovector_count_32(arg1)
463504
}
505+
506+
unsafe fn pcre2_substitute(
507+
code: *const Self::pcre2_code,
508+
subject: Self::PCRE2_SPTR,
509+
length: usize,
510+
startoffset: usize,
511+
options: u32,
512+
match_data: *mut Self::pcre2_match_data,
513+
mcontext: *mut Self::pcre2_match_context,
514+
replacement: Self::PCRE2_SPTR,
515+
rlength: usize,
516+
outputbuffer: *mut Self::PCRE2_CHAR,
517+
outputlengthptr: *mut usize,
518+
) -> ::libc::c_int {
519+
pcre2_substitute_32(
520+
code,
521+
subject,
522+
length,
523+
startoffset,
524+
options,
525+
match_data,
526+
mcontext,
527+
replacement,
528+
rlength,
529+
outputbuffer,
530+
outputlengthptr,
531+
)
532+
}
464533
}
465534

466535
/// Returns true if and only if PCRE2 believes that JIT is available.
@@ -692,6 +761,53 @@ impl<W: CodeUnitWidth> Code<W> {
692761
Ok(1 + count as usize)
693762
}
694763
}
764+
765+
pub unsafe fn substitute(
766+
&self,
767+
mut subject: &[W::SubjectChar],
768+
mut replacement: &[W::SubjectChar],
769+
start: usize,
770+
options: u32,
771+
output: &mut [W::PCRE2_CHAR],
772+
output_len: &mut usize,
773+
) -> Result<usize, Error> {
774+
// When the subject is empty, we use an empty slice
775+
// with a known valid pointer. Otherwise, slices derived
776+
// from, e.g., an empty `Vec<u8>` may not have a valid
777+
// pointer, since creating an empty `Vec` is guaranteed
778+
// to not allocate.
779+
if subject.is_empty() {
780+
subject = &[];
781+
}
782+
if replacement.is_empty() {
783+
replacement = &[];
784+
}
785+
let (subj_ptr, subj_len) = W::subject_to_sptr_len(subject);
786+
let (repl_ptr, repl_len) = W::subject_to_sptr_len(replacement);
787+
788+
// SAFETY: `options` is forwarded to PCRE2 unchecked; upholding the safety contract is the responsibility of the caller of this `unsafe fn`.
789+
let rc = unsafe {
790+
W::pcre2_substitute(
791+
self.code,
792+
subj_ptr,
793+
subj_len,
794+
start,
795+
options,
796+
ptr::null_mut(),
797+
// should probably not be null for performance reasons?
798+
ptr::null_mut(),
799+
repl_ptr,
800+
repl_len,
801+
output.as_mut_ptr() as *mut W::PCRE2_CHAR,
802+
output_len as *mut usize,
803+
)
804+
};
805+
if rc >= 0 {
806+
return Ok(rc as usize);
807+
}
808+
// this might warrant a new error type
809+
Err(Error::info(rc))
810+
}
695811
}
696812

697813
/// A low level representation of PCRE2's compilation context.

src/regex_impl.rs

Lines changed: 127 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use std::{
2+
borrow::Cow,
23
collections::HashMap,
34
fmt,
45
ops::Index,
@@ -8,8 +9,10 @@ use std::{
89

910
use log::debug;
1011
use pcre2_sys::{
11-
PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MATCH_INVALID_UTF,
12-
PCRE2_MULTILINE, PCRE2_NEVER_UTF, PCRE2_NEWLINE_ANYCRLF, PCRE2_UCP,
12+
PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_ERROR_NOMEMORY, PCRE2_EXTENDED,
13+
PCRE2_MATCH_INVALID_UTF, PCRE2_MULTILINE, PCRE2_NEVER_UTF,
14+
PCRE2_NEWLINE_ANYCRLF, PCRE2_SUBSTITUTE_EXTENDED, PCRE2_SUBSTITUTE_GLOBAL,
15+
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, PCRE2_SUBSTITUTE_UNSET_EMPTY, PCRE2_UCP,
1316
PCRE2_UNSET, PCRE2_UTF,
1417
};
1518

@@ -623,6 +626,127 @@ impl<W: CodeUnitWidth> Regex<W> {
623626
pub(crate) fn get_capture_names_idxs(&self) -> &HashMap<String, usize> {
624627
&self.capture_names_idx
625628
}
629+
630+
/// Replace the first match in the subject string with the replacement.
631+
/// If `extended` is true, enable PCRE2's extended replacement syntax.
632+
pub fn replace<'s>(
633+
&self,
634+
subject: &'s [W::SubjectChar],
635+
replacement: &[W::SubjectChar],
636+
extended: bool,
637+
) -> Result<Cow<'s, [W::SubjectChar]>, Error>
638+
where
639+
[<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
640+
{
641+
self.replace_impl(subject, replacement, false, extended)
642+
}
643+
644+
/// Replace all non-overlapping matches in the subject string with the replacement.
645+
/// If `extended` is true, enable PCRE2's extended replacement syntax.
646+
pub fn replace_all<'s>(
647+
&self,
648+
subject: &'s [W::SubjectChar],
649+
replacement: &[W::SubjectChar],
650+
extended: bool,
651+
) -> Result<Cow<'s, [W::SubjectChar]>, Error>
652+
where
653+
[<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
654+
{
655+
self.replace_impl(subject, replacement, true, extended)
656+
}
657+
658+
#[inline]
659+
fn replace_impl<'s>(
660+
&self,
661+
subject: &'s [W::SubjectChar],
662+
replacement: &[W::SubjectChar],
663+
replace_all: bool,
664+
extended: bool,
665+
) -> Result<Cow<'s, [W::SubjectChar]>, Error>
666+
where
667+
[<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
668+
{
669+
let mut options: u32 = 0;
670+
options |= PCRE2_SUBSTITUTE_OVERFLOW_LENGTH;
671+
// TODO: this should probably be configurable from user-side
672+
options |= PCRE2_SUBSTITUTE_UNSET_EMPTY;
673+
if extended {
674+
options |= PCRE2_SUBSTITUTE_EXTENDED;
675+
}
676+
if replace_all {
677+
options |= PCRE2_SUBSTITUTE_GLOBAL;
678+
}
679+
680+
// We prefer to allocate on the stack but fall back to the heap.
681+
// Note that PCRE2 has the following behavior with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH:
682+
// - We supply the initial output buffer size in `capacity`. This should have sufficient
683+
// capacity for the terminating NUL character.
684+
// - If the capacity is NOT sufficient, PCRE2 returns the new required capacity, also
685+
// including the terminating NUL character.
686+
// - If the capacity IS sufficient, PCRE2 returns the number of characters written, NOT
687+
// including the terminating NUL character.
688+
// Example: our initial capacity is 256. If the returned string needs to be of length 512,
689+
// then PCRE2 will report NOMEMORY and set capacity to 513. After reallocating we pass in
690+
// a capacity of 513; it succeeds and sets capacity to 512, which is the length of the result.
691+
let mut stack_storage: [W::PCRE2_CHAR; 256] =
692+
[W::PCRE2_CHAR::default(); 256];
693+
let mut heap_storage = Vec::new();
694+
let mut output = stack_storage.as_mut();
695+
let mut capacity = output.len();
696+
697+
let mut rc = unsafe {
698+
self.code.substitute(
699+
subject,
700+
replacement,
701+
0,
702+
options,
703+
output,
704+
&mut capacity,
705+
)
706+
};
707+
708+
if let Err(e) = &rc {
709+
if e.code() == PCRE2_ERROR_NOMEMORY {
710+
if heap_storage.try_reserve_exact(capacity).is_err() {
711+
return Err(rc.unwrap_err());
712+
}
713+
heap_storage.resize(capacity, W::PCRE2_CHAR::default());
714+
output = &mut heap_storage;
715+
capacity = output.len();
716+
rc = unsafe {
717+
self.code.substitute(
718+
subject,
719+
replacement,
720+
0,
721+
options,
722+
output,
723+
&mut capacity,
724+
)
725+
};
726+
}
727+
}
728+
729+
let s = match rc? {
730+
0 => Cow::Borrowed(subject),
731+
_ => {
732+
// capacity has been updated with the length of the result (excluding nul terminator).
733+
let output = &output[..capacity];
734+
735+
// All inputs contained valid chars, so we expect all outputs to as well.
736+
let to_char = |c: W::PCRE2_CHAR| -> W::SubjectChar {
737+
c.try_into().unwrap_or_else(|_| {
738+
panic!("all output expected to be valid chars")
739+
})
740+
};
741+
742+
// this is really just a type cast
743+
let x: Vec<W::SubjectChar> =
744+
output.iter().copied().map(to_char).collect();
745+
Cow::Owned(x)
746+
}
747+
};
748+
Ok(s)
749+
}
626750
}
627751

628752
/// Advanced or "lower level" search methods.
@@ -870,7 +994,7 @@ impl<W: CodeUnitWidth> CaptureLocations<W> {
870994
}
871995
}
872996

873-
/// Captures represents a group of captured byte strings for a single match.
997+
/// `Captures` represents a group of captured strings for a single match.
874998
///
875999
/// The 0th capture always corresponds to the entire match. Each subsequent
8761000
/// index corresponds to the next capture group in the regex. If a capture

0 commit comments

Comments
 (0)