From 644968163a686e7132222ba1e629800c53930abe Mon Sep 17 00:00:00 2001 From: Priyanka Taneja Date: Mon, 28 Nov 2022 12:02:30 -0800 Subject: [PATCH 1/8] adding split function --- src/bytes.rs | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/bytes.rs b/src/bytes.rs index a374452..452cee3 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,6 +1,7 @@ use std::cell::RefCell; use std::collections::HashMap; use std::fmt; +use std::iter::FusedIterator; use std::ops::Index; use std::sync::Arc; @@ -55,6 +56,47 @@ impl<'s> Match<'s> { (self.start, self.end) } } +///splitting +/// test this +/// +#[derive(Clone)] +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for Split<'r, 't> { + + type Item = Result< &'t [u8], Error>; + fn next(&mut self) -> Option> { + let text = self.finder.subject; + match self.finder.next() { + None => { + if self.last > text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len() + 1; // Next call will return None + Some(Ok(s)) + } + } + Some(m) => { + match m { + Ok(mtch) => { + let matched = &text[self.last..mtch.start()]; + self.last = mtch.end(); + Some(Ok(matched)) + }, + Err(err) => Some(Err(err)), + } + + } + } + } +} + +impl<'r, 't> FusedIterator for Split<'r, 't> {} + #[derive(Clone, Debug)] struct Config { @@ -480,6 +522,12 @@ impl Regex { last_match: None, } } + ///splits regex or smth + pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0 } + } + + /// Returns the capture groups corresponding to the leftmost-first /// match in `subject`. Capture group `0` always corresponds to the entire @@ -1009,6 +1057,8 @@ impl<'s, 'i> Index<&'i str> for Captures<'s> { /// /// `'r` is the lifetime of the compiled regular expression and `'s` is the /// lifetime of the subject string. 
+ +#[derive(Clone)] pub struct Matches<'r, 's> { re: &'r Regex, match_data: &'r RefCell, @@ -1052,6 +1102,7 @@ impl<'r, 's> Iterator for Matches<'r, 's> { } } +impl<'r, 't> FusedIterator for Matches<'r, 't> {} /// An iterator that yields all non-overlapping capture groups matching a /// particular regular expression. /// From c50dcccddf30d524a756c8bdacb756d49d1800ad Mon Sep 17 00:00:00 2001 From: Priyanka Taneja Date: Mon, 28 Nov 2022 15:23:50 -0800 Subject: [PATCH 2/8] bug fix --- src/bytes.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/bytes.rs b/src/bytes.rs index 452cee3..b7c1efd 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -22,7 +22,8 @@ use crate::ffi::{Code, CompileContext, MatchConfig, MatchData}; /// of the subject string. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct Match<'s> { - subject: &'s [u8], + ///subject + pub subject: &'s [u8], start: usize, end: usize, } From 301409339c6fbb93a95cc00fa86237a7ffc27694 Mon Sep 17 00:00:00 2001 From: Priyanka Taneja Date: Tue, 29 Nov 2022 12:32:46 -0800 Subject: [PATCH 3/8] replace functions --- src/bytes.rs | 250 ++++++++++++++++++++++++++++++++++++++++++++++ src/expand.rs | 198 ++++++++++++++++++++++++++++++++++++ src/find_bytes.rs | 18 ++++ src/lib.rs | 2 + 4 files changed, 468 insertions(+) create mode 100644 src/expand.rs create mode 100644 src/find_bytes.rs diff --git a/src/bytes.rs b/src/bytes.rs index b7c1efd..26a97e1 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,9 +1,12 @@ +use std::borrow::Cow; use std::cell::RefCell; use std::collections::HashMap; use std::fmt; use std::iter::FusedIterator; use std::ops::Index; use std::sync::Arc; +use crate::expand::expand_bytes; +use crate::find_bytes::find_byte; use log::debug; use pcre2_sys::{ @@ -528,6 +531,71 @@ impl Regex { Split { finder: self.find_iter(text), last: 0 } } + /// Replaces at most `limit` non-overlapping matches in `text` with the + /// replacement provided. 
If `limit` is 0, then all non-overlapping matches + /// are replaced. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement text. + pub fn replacen<'t, R: Replacer>( + &self, + text: &'t [u8], + limit: usize, + mut rep: R, + ) -> Cow<'t, [u8]> { + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, m) in it { + match m { + Ok(m) => { + new.extend_from_slice(&text[last_match..m.start()]); + new.extend_from_slice(&rep); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + }, + Err(err) => break, + } + + } + new.extend_from_slice(&text[last_match..]); + return Cow::Owned(new); + } + + // The slower path, which we use if the replacement needs access to + // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, cap) in it { + // unwrap on 0 is OK because captures only reports matches + match cap { + Ok(cap) => { + let m = cap.get(0).unwrap(); + new.extend_from_slice(&text[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + }, + Err(err) => break, + } + } + new.extend_from_slice(&text[last_match..]); + Cow::Owned(new) + } + + /// Returns the capture groups corresponding to the leftmost-first @@ -962,6 +1030,30 @@ impl<'s> Captures<'s> { pub fn len(&self) -> usize { self.locs.len() } + // Expands all instances of `$name` in `replacement` to the corresponding + /// capture group `name`, and writes them to the `dst` buffer given. 
+ /// + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of valid UTF-8 bytes is permitted. If the + /// sequence does not refer to a capture group name in the corresponding + /// regex, then it is replaced with an empty string. + /// + /// To write a literal `$` use `$$`. + pub fn expand(&self, replacement: &[u8], dst: &mut Vec) { + expand_bytes(self, replacement, dst) + } } impl<'s> fmt::Debug for Captures<'s> { @@ -1435,3 +1527,161 @@ mod tests { assert!(re.is_match(hay.as_bytes()).unwrap()); } } + +/// Replacer describes types that can be used to replace matches in a byte +/// string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&[u8]` along with other +/// variants of bytes types and `FnMut(&Captures) -> Vec` (or any +/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.extend(&caps[0])`. 
+ fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec); + + /// Return a fixed unchanging replacement byte string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option> { + None + } + + /// Return a `Replacer` that borrows and wraps this `Replacer`. + /// + /// This is useful when you want to take a generic `Replacer` (which might + /// not be cloneable) and use it without consuming it, so it can be used + /// more than once. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::{Regex, Replacer}; + /// + /// fn replace_all_twice( + /// re: Regex, + /// src: &[u8], + /// mut rep: R, + /// ) -> Vec { + /// let dst = re.replace_all(src, rep.by_ref()); + /// let dst = re.replace_all(&dst, rep.by_ref()); + /// dst.into_owned() + /// } + /// ``` + fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { + ReplacerRef(self) + } +} + +/// By-reference adaptor for a `Replacer` +/// +/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). 
+#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec) { + self.0.replace_append(caps, dst) + } + fn no_expansion<'r>(&'r mut self) -> Option> { + self.0.no_expansion() + } +} + +impl<'a> Replacer for &'a [u8] { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Vec { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option> { + no_expansion(self) + } +} + +impl Replacer for Vec { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec) { + caps.expand(self, dst); + } + + fn no_expansion(&mut self) -> Option> { + no_expansion(self) + } +} + +impl<'a> Replacer for Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec) { + caps.expand(self.as_ref(), dst); + } + + fn no_expansion(&mut self) -> Option> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec) { + caps.expand(self.as_ref(), dst); + } + + fn no_expansion(&mut self) -> Option> { + no_expansion(self) + } +} + +fn no_expansion>(t: &T) -> Option> { + let s = t.as_ref(); + match find_byte(b'$', s) { + Some(_) => None, + None => Some(Cow::Borrowed(s)), + } +} + +impl Replacer for F +where + F: FnMut(&Captures<'_>) -> T, + T: AsRef<[u8]>, +{ + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec) { + dst.extend_from_slice((*self)(caps).as_ref()); + } +} + +/// `NoExpand` indicates literal byte string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal byte string +/// replacement without expanding `$name` to their corresponding capture +/// groups. 
This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'t` is the lifetime of the literal text. +#[derive(Clone, Debug)] +pub struct NoExpand<'t>(pub &'t [u8]); + +impl<'t> Replacer for NoExpand<'t> { + fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec) { + dst.extend_from_slice(self.0); + } + + fn no_expansion(&mut self) -> Option> { + Some(Cow::Borrowed(self.0)) + } +} \ No newline at end of file diff --git a/src/expand.rs b/src/expand.rs new file mode 100644 index 0000000..69f9ce9 --- /dev/null +++ b/src/expand.rs @@ -0,0 +1,198 @@ +use std::str; + +use crate::find_bytes::find_byte; + +use crate::bytes; + + + +pub fn expand_bytes( + caps: &bytes::Captures<'_>, + mut replacement: &[u8], + dst: &mut Vec, +) { + while !replacement.is_empty() { + match find_byte(b'$', replacement) { + None => break, + Some(i) => { + dst.extend(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => { + dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"")); + } + Ref::Named(name) => { + dst.extend( + caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""), + ); + } + } + } + dst.extend(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text following the +/// capture reference. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +fn find_cap_ref(replacement: &[u8]) -> Option> { + let mut i = 0; + let rep: &[u8] = replacement; + if rep.len() <= 1 || rep[0] != b'$' { + return None; + } + i += 1; + if rep[i] == b'{' { + return find_cap_ref_braced(rep, i + 1); + } + let mut cap_end = i; + while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check via an unchecked conversion or by parsing the number straight from + // &[u8]. + let cap = + str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name"); + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option> { + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. 
But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. + let cap = match str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name. +fn is_valid_cap_letter(b: u8) -> bool { + match b { + b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::{find_cap_ref, CaptureRef}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + // See https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + find!(find_cap_ref14, "$1-$2", c!(1, 2)); + find!(find_cap_ref15, "$1_$2", c!("1_", 3)); + find!(find_cap_ref16, "$x-$y", c!("x", 2)); + find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); +} \ No newline at end of file diff --git a/src/find_bytes.rs 
b/src/find_bytes.rs new file mode 100644 index 0000000..58c5d20 --- /dev/null +++ b/src/find_bytes.rs @@ -0,0 +1,18 @@ +/// Searches for the given needle in the given haystack. +/// +/// If the perf-literal feature is enabled, then this uses the super optimized +/// memchr crate. Otherwise, it uses the naive byte-at-a-time implementation. +pub fn find_byte(needle: u8, haystack: &[u8]) -> Option { + #[cfg(not(feature = "perf-literal"))] + fn imp(needle: u8, haystack: &[u8]) -> Option { + haystack.iter().position(|&b| b == needle) + } + + #[cfg(feature = "perf-literal")] + fn imp(needle: u8, haystack: &[u8]) -> Option { + use memchr::memchr; + memchr(needle, haystack) + } + + imp(needle, haystack) +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 040658a..8f3c24a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,3 +20,5 @@ PCRE2 regular expressions for matching on arbitrary bytes. pub mod bytes; mod error; mod ffi; +mod expand; +mod find_bytes; \ No newline at end of file From 462ba3de32f57b78717edc2ed6e159086314cb32 Mon Sep 17 00:00:00 2001 From: Priyanka Taneja Date: Tue, 29 Nov 2022 12:35:51 -0800 Subject: [PATCH 4/8] replace all --- src/bytes.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/bytes.rs b/src/bytes.rs index 26a97e1..ab02365 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -567,7 +567,7 @@ impl Regex { new.extend_from_slice(&text[last_match..]); return Cow::Owned(new); } - + // The slower path, which we use if the replacement needs access to // capture groups. let mut it = self.captures_iter(text).enumerate().peekable(); @@ -595,6 +595,19 @@ impl Regex { Cow::Owned(new) } + /// Replaces all non-overlapping matches in `text` with the replacement + /// provided. This is the same as calling `replacen` with `limit` set to + /// `0`. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement text. 
+ pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { + self.replacen(text, 0, rep) + } From c1c315bf7b0fea5d190e0ce6ef5c298a6ee4d9cd Mon Sep 17 00:00:00 2001 From: Priyanka Taneja Date: Wed, 30 Nov 2022 16:48:22 -0800 Subject: [PATCH 5/8] subject gettier --- src/bytes.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/bytes.rs b/src/bytes.rs index ab02365..16669e4 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -26,7 +26,7 @@ use crate::ffi::{Code, CompileContext, MatchConfig, MatchData}; #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct Match<'s> { ///subject - pub subject: &'s [u8], + subject: &'s [u8], start: usize, end: usize, } @@ -49,6 +49,11 @@ impl<'s> Match<'s> { pub fn as_bytes(&self) -> &'s [u8] { &self.subject[self.start..self.end] } + /// Returns the entire subject string in which this match was found. + #[inline] + pub fn subject(&self) -> &'s [u8] { + &self.subject + } /// Creates a new match from the given subject string and byte offsets. fn new(subject: &'s [u8], start: usize, end: usize) -> Match<'s> { From 74ee08094335fbb274857bbdd961e846807a7525 Mon Sep 17 00:00:00 2001 From: Priyanka Taneja Date: Wed, 30 Nov 2022 17:01:48 -0800 Subject: [PATCH 6/8] clean --- src/bytes.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bytes.rs b/src/bytes.rs index 16669e4..60aab33 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -25,7 +25,6 @@ use crate::ffi::{Code, CompileContext, MatchConfig, MatchData}; /// of the subject string. 
#[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct Match<'s> { - ///subject subject: &'s [u8], start: usize, end: usize, From 1c65f7cb03f395a8222f0a74c80b87bca4b99a83 Mon Sep 17 00:00:00 2001 From: Priyanka Taneja Date: Wed, 30 Nov 2022 17:05:15 -0800 Subject: [PATCH 7/8] comment --- src/bytes.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index 60aab33..30ac9ff 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -64,9 +64,7 @@ impl<'s> Match<'s> { (self.start, self.end) } } -///splitting -/// test this -/// +/// An iterator over the sub-slices of the subject that lie between successive regex matches. #[derive(Clone)] pub struct Split<'r, 't> { finder: Matches<'r, 't>, @@ -74,7 +72,6 @@ pub struct Split<'r, 't> { } impl<'r, 't> Iterator for Split<'r, 't> { - type Item = Result< &'t [u8], Error>; fn next(&mut self) -> Option> { let text = self.finder.subject; From 577c1704e9ed692bf57e6bc6a5a4a6677621f8ab Mon Sep 17 00:00:00 2001 From: Priyanka Taneja Date: Wed, 30 Nov 2022 17:09:25 -0800 Subject: [PATCH 8/8] comments --- src/bytes.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/bytes.rs b/src/bytes.rs index 30ac9ff..60b1d0b 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -527,7 +527,26 @@ impl Regex { last_match: None, } } - ///splits regex or smth + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Namely, each element of the iterator corresponds to + /// text that *isn't* matched by the regular expression. + /// + /// This method will *not* copy the text given. 
+ /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ```rust + /// # use pcre2::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").map(|field| field.unwrap()).collect(); + /// assert_eq!(fields, vec![ + /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..], + /// ]); + /// # } + /// ``` pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { Split { finder: self.find_iter(text), last: 0 } }