1
1
use std:: {
2
+ borrow:: Cow ,
2
3
collections:: HashMap ,
3
4
fmt,
4
5
ops:: Index ,
@@ -8,8 +9,10 @@ use std::{
8
9
9
10
use log:: debug;
10
11
use pcre2_sys:: {
11
- PCRE2_CASELESS , PCRE2_DOTALL , PCRE2_EXTENDED , PCRE2_MATCH_INVALID_UTF ,
12
- PCRE2_MULTILINE , PCRE2_NEVER_UTF , PCRE2_NEWLINE_ANYCRLF , PCRE2_UCP ,
12
+ PCRE2_CASELESS , PCRE2_DOTALL , PCRE2_ERROR_NOMEMORY , PCRE2_EXTENDED ,
13
+ PCRE2_MATCH_INVALID_UTF , PCRE2_MULTILINE , PCRE2_NEVER_UTF ,
14
+ PCRE2_NEWLINE_ANYCRLF , PCRE2_SUBSTITUTE_EXTENDED , PCRE2_SUBSTITUTE_GLOBAL ,
15
+ PCRE2_SUBSTITUTE_OVERFLOW_LENGTH , PCRE2_SUBSTITUTE_UNSET_EMPTY , PCRE2_UCP ,
13
16
PCRE2_UNSET , PCRE2_UTF ,
14
17
} ;
15
18
@@ -623,6 +626,127 @@ impl<W: CodeUnitWidth> Regex<W> {
623
626
pub ( crate ) fn get_capture_names_idxs ( & self ) -> & HashMap < String , usize > {
624
627
& self . capture_names_idx
625
628
}
629
+
630
+ /// Replace the first match in the subject string with the replacement
631
+ /// If `extended` is true, enable PCRE2's extended replacement syntax.
632
+ pub fn replace < ' s > (
633
+ & self ,
634
+ subject : & ' s [ W :: SubjectChar ] ,
635
+ replacement : & [ W :: SubjectChar ] ,
636
+ extended : bool ,
637
+ ) -> Result < Cow < ' s , [ W :: SubjectChar ] > , Error >
638
+ where
639
+ [ <W as CodeUnitWidth >:: PCRE2_CHAR ] : ToOwned ,
640
+ {
641
+ self . replace_impl ( subject, replacement, false , extended)
642
+ }
643
+
644
+ /// Replace all non-overlapping matches in the subject string with the replacement
645
+ /// If `extended` is true, enable PCRE2's extended replacement syntax.
646
+ pub fn replace_all < ' s > (
647
+ & self ,
648
+ subject : & ' s [ W :: SubjectChar ] ,
649
+ replacement : & [ W :: SubjectChar ] ,
650
+ extended : bool ,
651
+ ) -> Result < Cow < ' s , [ W :: SubjectChar ] > , Error >
652
+ where
653
+ [ <W as CodeUnitWidth >:: PCRE2_CHAR ] : ToOwned ,
654
+ {
655
+ self . replace_impl ( subject, replacement, true , extended)
656
+ }
657
+
658
+ #[ inline]
659
+ fn replace_impl < ' s > (
660
+ & self ,
661
+ subject : & ' s [ W :: SubjectChar ] ,
662
+ replacement : & [ W :: SubjectChar ] ,
663
+ replace_all : bool ,
664
+ extended : bool ,
665
+ ) -> Result < Cow < ' s , [ W :: SubjectChar ] > , Error >
666
+ where
667
+ [ <W as CodeUnitWidth >:: PCRE2_CHAR ] : ToOwned ,
668
+ {
669
+ let mut options: u32 = 0 ;
670
+ options |= PCRE2_SUBSTITUTE_OVERFLOW_LENGTH ;
671
+ // TODO: this should probably be configurable from user-side
672
+ options |= PCRE2_SUBSTITUTE_UNSET_EMPTY ;
673
+ if extended {
674
+ options |= PCRE2_SUBSTITUTE_EXTENDED ;
675
+ }
676
+ if replace_all {
677
+ options |= PCRE2_SUBSTITUTE_GLOBAL ;
678
+ }
679
+
680
+ // We prefer to allocate on the stack but fall back to the heap.
681
+ // Note that PCRE2 has the following behavior with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH:
682
+ // - We supply the initial output buffer size in `capacity`. This should have sufficient
683
+ // capacity for the terminating NUL character.
684
+ // - If the capacity is NOT sufficient, PCRE2 returns the new required capacity, also
685
+ // including the terminating NUL character.
686
+ // - If the capacity IS sufficient, PCRE2 returns the number of characters written, NOT
687
+ // including the terminating NUL character.
688
+ // Example: our initial capacity is 256. If the returned string needs to be of length 512,
689
+ // then PCRE2 will report NOMEMORY and set capacity to 513. After reallocating we pass in
690
+ // a capacity of 513; it succeeds and sets capacity to 512, which is the length of the result.
691
+ let mut stack_storage: [ W :: PCRE2_CHAR ; 256 ] =
692
+ [ W :: PCRE2_CHAR :: default ( ) ; 256 ] ;
693
+ let mut heap_storage = Vec :: new ( ) ;
694
+ let mut output = stack_storage. as_mut ( ) ;
695
+ let mut capacity = output. len ( ) ;
696
+
697
+ let mut rc = unsafe {
698
+ self . code . substitute (
699
+ subject,
700
+ replacement,
701
+ 0 ,
702
+ options,
703
+ output,
704
+ & mut capacity,
705
+ )
706
+ } ;
707
+
708
+ if let Err ( e) = & rc {
709
+ if e. code ( ) == PCRE2_ERROR_NOMEMORY {
710
+ if heap_storage. try_reserve_exact ( capacity) . is_err ( ) {
711
+ return Err ( rc. unwrap_err ( ) ) ;
712
+ }
713
+ heap_storage. resize ( capacity, W :: PCRE2_CHAR :: default ( ) ) ;
714
+ output = & mut heap_storage;
715
+ capacity = output. len ( ) ;
716
+ rc = unsafe {
717
+ self . code . substitute (
718
+ subject,
719
+ replacement,
720
+ 0 ,
721
+ options,
722
+ output,
723
+ & mut capacity,
724
+ )
725
+ } ;
726
+ }
727
+ }
728
+
729
+ let s = match rc? {
730
+ 0 => Cow :: Borrowed ( subject) ,
731
+ _ => {
732
+ // capacity has been updated with the length of the result (excluding nul terminator).
733
+ let output = & output[ ..capacity] ;
734
+
735
+ // All inputs contained valid chars, so we expect all outputs to as well.
736
+ let to_char = |c : W :: PCRE2_CHAR | -> W :: SubjectChar {
737
+ c. try_into ( ) . unwrap_or_else ( |_| {
738
+ panic ! ( "all output expected to be valid chars" )
739
+ } )
740
+ } ;
741
+
742
+ // this is really just a type cast
743
+ let x: Vec < W :: SubjectChar > =
744
+ output. iter ( ) . copied ( ) . map ( to_char) . collect ( ) ;
745
+ Cow :: Owned ( x)
746
+ }
747
+ } ;
748
+ Ok ( s)
749
+ }
626
750
}
627
751
628
752
/// Advanced or "lower level" search methods.
@@ -870,7 +994,7 @@ impl<W: CodeUnitWidth> CaptureLocations<W> {
870
994
}
871
995
}
872
996
873
- /// Captures represents a group of captured byte strings for a single match.
997
+ /// ` Captures` represents a group of captured strings for a single match.
874
998
///
875
999
/// The 0th capture always corresponds to the entire match. Each subsequent
876
1000
/// index corresponds to the next capture group in the regex. If a capture
0 commit comments