Skip to content

Commit 813a426

Browse files
Correct substitute behavior on reallocation and add tests
This fixes the following issue in replace_impl: the call to try_reserve passed in a difference of capacities, but try_reserve expects a difference between the desired capacity and the length. Because the initial capacity was nonzero but the length was zero, this caused us to reserve less capacity than we ought to have, leading to an OOB write. Fix this by reworking replace_impl to have less unsafe code. Now we zero initialize the buffer, but we also prefer a stack buffer so we may save an allocation - probably a wash overall. Add a test for this case.
1 parent 7d78cb0 commit 813a426

File tree

3 files changed

+165
-25
lines changed

3 files changed

+165
-25
lines changed

src/ffi.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ pub trait CodeUnitWidth: std::fmt::Debug {
9090
type pcre2_match_context;
9191
type pcre2_match_data;
9292
type pcre2_jit_stack;
93-
type PCRE2_CHAR: TryInto<Self::SubjectChar>;
93+
type PCRE2_CHAR: Default + Copy + TryInto<Self::SubjectChar>;
9494
type PCRE2_SPTR;
9595
type name_table_entry: NameTableEntry;
9696
type SubjectChar: Copy;

src/regex_impl.rs

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -546,40 +546,47 @@ impl<W: CodeUnitWidth> Regex<W> {
546546
options |= PCRE2_SUBSTITUTE_GLOBAL;
547547
}
548548

549-
// TODO: we can use MaybeUninit to avoid allocation
550-
let mut capacity = 256;
551-
let mut output: Vec<W::PCRE2_CHAR> = Vec::with_capacity(capacity);
552-
capacity = output.capacity();
549+
// We prefer to allocate on the stack but fall back to the heap.
550+
// Note that PCRE2 has the following behavior with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH:
551+
// - We supply the initial output buffer size in `capacity`. This should have sufficient
552+
// capacity for the terminating NUL character.
553+
// - If the capacity is NOT sufficient, PCRE2 returns the new required capacity, also
554+
// including the terminating NUL character.
555+
// - If the capacity IS sufficient, PCRE2 returns the number of characters written, NOT
556+
// including the terminating NUL character.
557+
// Example: our initial capacity is 256. If the returned string needs to be of length 512,
558+
// then PCRE2 will report NOMEMORY and set capacity to 513. After reallocating we pass in
559+
// a capacity of 513; it succeeds and sets capacity to 512, which is the length of the result.
560+
let mut stack_storage: [W::PCRE2_CHAR; 256] = [W::PCRE2_CHAR::default(); 256];
561+
let mut heap_storage = Vec::new();
562+
let mut output = stack_storage.as_mut();
563+
let mut capacity = output.len();
553564

554565
let mut rc = unsafe {
555566
self.code
556-
.substitute(subject, replacement, 0, options, &mut output, &mut capacity)
567+
.substitute(subject, replacement, 0, options, output, &mut capacity)
557568
};
558569

559570
if let Err(e) = &rc {
560571
if e.code() == PCRE2_ERROR_NOMEMORY {
561-
if output.try_reserve(capacity - output.capacity()).is_err() {
572+
if heap_storage.try_reserve_exact(capacity).is_err() {
562573
return Err(rc.unwrap_err());
563574
}
564-
capacity = output.capacity();
575+
heap_storage.resize(capacity, W::PCRE2_CHAR::default());
576+
output = &mut heap_storage;
577+
capacity = output.len();
565578
rc = unsafe {
566-
self.code.substitute(
567-
subject,
568-
replacement,
569-
0,
570-
options,
571-
&mut output,
572-
&mut capacity,
573-
)
579+
self.code
580+
.substitute(subject, replacement, 0, options, output, &mut capacity)
574581
};
575582
}
576583
}
577584

578585
let s = match rc? {
579586
0 => Cow::Borrowed(subject),
580587
_ => {
581-
// +1 to account for null terminator
582-
unsafe { output.set_len(capacity + 1) };
588+
// capacity has been updated with the length of the result (excluding nul terminator).
589+
let output = &output[..capacity];
583590

584591
// All inputs contained valid chars, so we expect all outputs to as well.
585592
let to_char = |c: W::PCRE2_CHAR| -> W::SubjectChar {
@@ -588,13 +595,7 @@ impl<W: CodeUnitWidth> Regex<W> {
588595
};
589596

590597
// this is really just a type cast
591-
let x: Vec<W::SubjectChar> = output
592-
.into_iter()
593-
.map(to_char)
594-
// we don't want to return the null terminator
595-
.take(capacity)
596-
.collect::<Vec<W::SubjectChar>>();
597-
598+
let x: Vec<W::SubjectChar> = output.iter().copied().map(to_char).collect();
598599
Cow::Owned(x)
599600
}
600601
};

src/utf32.rs

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,145 @@ mod tests {
130130
assert_eq!(replaced, &*b("bc"));
131131
}
132132

133+
#[test]
134+
fn replace_no_match() {
135+
let re = RegexBuilder::new().build(b("d")).unwrap();
136+
let s = b("abc");
137+
let r = b("");
138+
let replaced = re.replace(&s, &r, true).unwrap();
139+
assert!(
140+
matches!(replaced, Cow::Borrowed(_)),
141+
"when there is no match, the original string should be returned"
142+
);
143+
let replaced = replaced.into_owned();
144+
assert_eq!(replaced, &*b("abc"));
145+
}
146+
147+
#[test]
148+
fn replace_with_replacement() {
149+
let re = RegexBuilder::new().build(b("b")).unwrap();
150+
let s = b("abc");
151+
let r = b("d");
152+
let replaced = re.replace(&s, &r, true).unwrap();
153+
assert!(
154+
matches!(replaced, Cow::Owned(_)),
155+
"a replacement should give a new string"
156+
);
157+
let replaced = replaced.into_owned();
158+
assert_eq!(replaced, &*b("adc"));
159+
}
160+
161+
#[test]
162+
fn replace_first_occurrence() {
163+
let re = RegexBuilder::new().build(b("a")).unwrap();
164+
let s = b("aaa");
165+
let r = b("b");
166+
let replaced = re.replace(&s, &r, false).unwrap();
167+
assert!(
168+
matches!(replaced, Cow::Owned(_)),
169+
"a replacement should give a new string"
170+
);
171+
let replaced = replaced.into_owned();
172+
assert_eq!(replaced, &*b("baa"));
173+
}
174+
175+
#[test]
176+
fn replace_multiple_occurrences() {
177+
let re = RegexBuilder::new().build(b("a")).unwrap();
178+
let s = b("aaa");
179+
let r = b("b");
180+
let replaced = re.replace_all(&s, &r, false).unwrap();
181+
assert!(
182+
matches!(replaced, Cow::Owned(_)),
183+
"a replacement should give a new string"
184+
);
185+
let replaced = replaced.into_owned();
186+
assert_eq!(replaced, &*b("bbb"));
187+
}
188+
189+
#[test]
190+
fn replace_empty_string() {
191+
let re = RegexBuilder::new().build(b("")).unwrap();
192+
let s = b("abc");
193+
let r = b("d");
194+
let replaced = re.replace(&s, &r, true).unwrap();
195+
assert!(
196+
matches!(replaced, Cow::Owned(_)),
197+
"a replacement should give a new string"
198+
);
199+
let replaced = replaced.into_owned();
200+
assert_eq!(replaced, &*b("dabc"));
201+
}
202+
203+
#[test]
204+
fn replace_empty_with_empty() {
205+
let re = RegexBuilder::new().build(b("")).unwrap();
206+
let s = b("");
207+
let r = b("");
208+
let replaced = re.replace(&s, &r, true).unwrap().into_owned();
209+
assert_eq!(replaced, &*b(""));
210+
}
211+
212+
#[test]
213+
fn replace_long_string() {
214+
let long_string = vec!['a'; 1024]; // Create a 1MB string filled with 'a'
215+
let re = RegexBuilder::new().build(b("a")).unwrap();
216+
let r = b("b");
217+
let replaced = re.replace(&long_string, &r, false).unwrap();
218+
assert!(
219+
matches!(replaced, Cow::Owned(_)),
220+
"a replacement should give a new string"
221+
);
222+
let replaced = replaced.into_owned();
223+
let mut expected = long_string.clone();
224+
expected[0] = 'b';
225+
assert_eq!(replaced, expected);
226+
}
227+
228+
#[test]
229+
fn replace_long_string_all() {
230+
let long_string = vec!['a'; 1024];
231+
let re = RegexBuilder::new().build(b("a")).unwrap();
232+
let r = b("b");
233+
let replaced = re.replace_all(&long_string, &r, false).unwrap();
234+
assert!(
235+
matches!(replaced, Cow::Owned(_)),
236+
"a replacement should give a new string"
237+
);
238+
let replaced = replaced.into_owned();
239+
let all_b = vec!['b'; 1024];
240+
assert_eq!(replaced, all_b);
241+
}
242+
243+
#[test]
244+
fn replace_long_string_all_elongating() {
245+
let long_string = vec!['a'; 1024];
246+
let re = RegexBuilder::new().build(b("a")).unwrap();
247+
let r = b("bx");
248+
let replaced = re.replace_all(&long_string, &r, false).unwrap();
249+
assert!(
250+
matches!(replaced, Cow::Owned(_)),
251+
"a replacement should give a new string"
252+
);
253+
let replaced = replaced.into_owned();
254+
let mut all_bx = Vec::new();
255+
for _ in long_string {
256+
all_bx.push('b');
257+
all_bx.push('x');
258+
}
259+
assert_eq!(replaced, all_bx);
260+
}
261+
262+
#[test]
263+
fn replace_long_string_all_disappearing() {
264+
let long_string = vec!['a'; 1024];
265+
let re = RegexBuilder::new().build(b("a")).unwrap();
266+
let r = b("");
267+
let replaced = re.replace_all(&long_string, &r, false).unwrap();
268+
let replaced = replaced.into_owned();
269+
assert_eq!(replaced, &[]);
270+
}
271+
133272
#[test]
134273
fn ucp() {
135274
let re = RegexBuilder::new().ucp(false).build(b(r"\w")).unwrap();

0 commit comments

Comments
 (0)