|
| 1 | +use core::{slice, str::from_utf8_unchecked_mut}; |
| 2 | + |
| 3 | +use alloc::string::String; |
| 4 | + |
| 5 | +/// Extension methods on [`String`]. |
| 6 | +/// |
| 7 | +/// This trait is sealed and cannot be implemented for types outside of |
| 8 | +/// `retain_more`. |
| 9 | +pub trait RetainMoreString: sealed::AllocMoreSealedString { |
| 10 | + /// A version of [`String::retain`] which allows the predicate mutable |
| 11 | + /// access to the valid parts of the full string. |
| 12 | + /// |
| 13 | + /// The arguments of the predicate are: |
| 14 | + /// - 0: `&mut str`; The already retained parts of `self`, for which |
| 15 | + /// predicate returned `true` |
| 16 | + /// - 1: [`char`]; The current character being considered |
| 17 | + /// - 2: `&mut str`; The parts of `self` yet to be considered. |
| 18 | + /// |
| 19 | + /// # Usage |
| 20 | + /// |
| 21 | + /// ``` |
| 22 | + /// # use retain_more::RetainMoreString as _; |
| 23 | + /// let mut my_string = "Super secret code: -100054321;-78912. EOF\ |
| 24 | + /// Here is some content which shouldn't be seen" |
| 25 | + /// .to_string(); |
| 26 | + /// /// Remove all numbers from the string, including a single leading `'-'` and |
| 27 | + /// /// additionally remove all characters after the first occurence of `"EOF"` |
| 28 | + /// fn cleanup(before: &mut str, it: char, after: &mut str) -> bool { |
| 29 | + /// if before.ends_with("EOF") { |
| 30 | + /// false |
| 31 | + /// } else { |
| 32 | + /// match (it, after.chars().next()) { |
| 33 | + /// ('-', Some(c)) => !c.is_ascii_digit(), |
| 34 | + /// (c, _) => !c.is_ascii_digit(), |
| 35 | + /// } |
| 36 | + /// } |
| 37 | + /// } |
| 38 | + /// my_string.retain_all(cleanup); |
| 39 | + /// assert_eq!(&my_string, "Super secret code: ;. EOF"); |
| 40 | + /// ``` |
| 41 | + fn retain_all<F: FnMut(&mut str, char, &mut str) -> bool>(&mut self, f: F); |
| 42 | + |
| 43 | + /// A helper for the common case where only access to the parts of the |
| 44 | + /// [`String`] which haven't been considered are required |
| 45 | + fn retain_after<F: FnMut(char, &mut str) -> bool>(&mut self, mut f: F) { |
| 46 | + self.retain_all(move |_, current, after| f(current, after)) |
| 47 | + } |
| 48 | + |
| 49 | + /// A reimplmentation of [`String::retain`] using |
| 50 | + /// [`retain_all`](`RetainMoreString::retain_all`) |
| 51 | + /// |
| 52 | + /// This is used to demonstrate that |
| 53 | + /// [`retain_all`](`RetainMoreString::retain_all`) is a strictly more |
| 54 | + /// powerful abstraction than [`String::retain`] from [`alloc`]. |
| 55 | + /// |
| 56 | + /// # Examples |
| 57 | + /// (Taken from alloc docs) |
| 58 | + /// ``` |
| 59 | + /// let mut s = String::from("f_o_ob_ar"); |
| 60 | + /// |
| 61 | + /// s.retain(|c| c != '_'); |
| 62 | + /// |
| 63 | + /// assert_eq!(s, "foobar"); |
| 64 | + /// ``` |
| 65 | + /// |
| 66 | + /// The exact order may be useful for tracking external state, like an |
| 67 | + /// index. |
| 68 | + /// |
| 69 | + /// ``` |
| 70 | + /// let mut s = String::from("abcde"); |
| 71 | + /// let keep = [false, true, true, false, true]; |
| 72 | + /// let mut i = 0; |
| 73 | + /// s.retain(|_| (keep[i], i += 1).0); |
| 74 | + /// assert_eq!(s, "bce"); |
| 75 | + /// ``` |
| 76 | + fn retain_default<F: FnMut(char) -> bool>(&mut self, mut f: F) { |
| 77 | + self.retain_all(move |_, current, _| f(current)) |
| 78 | + } |
| 79 | +} |
| 80 | + |
| 81 | +impl RetainMoreString for String { |
| 82 | + fn retain_all<F: FnMut(&mut str, char, &mut str) -> bool>(&mut self, mut f: F) { |
| 83 | + let len = self.len(); |
| 84 | + // This is required for panic safety, see https://github.com/rust-lang/rust/issues/78498 |
| 85 | + // SAFETY: 0..0 is empty and hence that region is valid UTF-8 |
| 86 | + // SAFETY: 0 <= self.len(), since self.len() is a usize |
| 87 | + unsafe { |
| 88 | + self.as_mut_vec().set_len(0); |
| 89 | + } |
| 90 | + let mut del_bytes = 0; |
| 91 | + // The index of the start of the region which has not yet been considered. |
| 92 | + // This is always at a UTF-8 character boundary. |
| 93 | + let mut idx = 0; |
| 94 | + |
| 95 | + while idx < len { |
| 96 | + let ptr = self.as_mut_ptr(); |
| 97 | + // The implementation in `alloc` uses `self.get_unchecked(idx..len)` for |
| 98 | + // the equivalent section. <https://github.com/rust-lang/rust/blob/a6bd5246da78/library/alloc/src/string.rs#L1243> |
| 99 | + // This would be unsafe here because the reciever of that method |
| 100 | + // (`DerefMut::deref_mut(&mut self)`) is the empty `str`, since `len` is set to |
| 101 | + // 0 above. However, `get_unchecked` requires that the index is |
| 102 | + // within the bounds of the reciever, not just the allocation of the |
| 103 | + // reciever. This is not a safety issue within `alloc`, because the |
| 104 | + // implementation of `get_unchecked` within `core` expands to the |
| 105 | + // equivalent code as below. However, we cannot make that assumption |
| 106 | + // here, so have to go the long way around. |
| 107 | + let ch = unsafe { |
| 108 | + // SAFETY: `len` came from `self.len()`. Therefore `idx < len` implies `idx` is |
| 109 | + // within the heap allocation owned by self. Therefore the |
| 110 | + // result is within the same allocation as `ptr`. |
| 111 | + let start = ptr.add(idx); |
| 112 | + // SAFETY: The region is not aliased because the method has a mutable reference |
| 113 | + // to self. Additionally, there is no other acess across the |
| 114 | + // loop, and this is the start of the loop body, and no other references exist |
| 115 | + // before this line. We drop the region before any further |
| 116 | + // access later in the loop body. |
| 117 | + let region = slice::from_raw_parts_mut(start, len - idx); |
| 118 | + |
| 119 | + // `region` is `idx..len` within the original string. |
| 120 | + // idx is on a character boundary, and the rest of this method has not modified |
| 121 | + // this region of bytes (except through the `&mut str` as the third closure |
| 122 | + // parameter, any access through which is required to maintain the UTF-8 |
| 123 | + // invariant of that region) |
| 124 | + let ch = from_utf8_unchecked_mut(region).chars().next().unwrap(); |
| 125 | + ch |
| 126 | + // region is dropped here, so its access to the region of |
| 127 | + }; |
| 128 | + let ch_len = ch.len_utf8(); |
| 129 | + let (before, after) = unsafe { |
| 130 | + ( |
| 131 | + // SAFETY: UTF-8 is maintained in the before section by only copying |
| 132 | + // a full character at a time. |
| 133 | + from_utf8_unchecked_mut(slice::from_raw_parts_mut(ptr, idx - del_bytes)), |
| 134 | + // SAFETY: idx + ch_len <= len because self, hence `idx + ch_len` is within the allocation of self. |
| 135 | + // was valid UTF-8 by invariant, hence after is valid. |
| 136 | + // This does not alias with `before`, because `-del_bytes < ch_len` |
| 137 | + from_utf8_unchecked_mut(slice::from_raw_parts_mut( |
| 138 | + ptr.add(idx + ch_len), |
| 139 | + len - idx - ch_len, |
| 140 | + )), |
| 141 | + ) |
| 142 | + }; |
| 143 | + if !f(before, ch, after) { |
| 144 | + del_bytes += ch_len; |
| 145 | + } else if del_bytes > 0 { |
| 146 | + // Copy `ch` del_bytes bytes back. |
| 147 | + // Use the version in the allocation of self, which is already UTF-8 encoded. |
| 148 | + |
| 149 | + // Safety: We copy a region which is a single UTF-8 character. |
| 150 | + // We can't use copy_nonoverlapping here in case del_bytes > ch_len |
| 151 | + unsafe { |
| 152 | + core::ptr::copy(ptr.add(idx), ptr.add(idx - del_bytes), ch_len); |
| 153 | + } |
| 154 | + } |
| 155 | + |
| 156 | + // 'Point' idx to the next char |
| 157 | + idx += ch_len; |
| 158 | + } |
| 159 | + // len-del_bytes <= len <= capacity |
| 160 | + unsafe { |
| 161 | + self.as_mut_vec().set_len(len - del_bytes); |
| 162 | + } |
| 163 | + } |
| 164 | +} |
| 165 | + |
/// Private half of the sealed-trait pattern protecting [`RetainMoreString`].
///
/// The marker trait below is public, but this module is not, so code outside
/// the crate cannot name the supertrait and therefore cannot add its own
/// implementations. See [C-SEALED] in the Rust API guidelines for the
/// rationale.
///
/// [C-SEALED]: https://rust-lang.github.io/api-guidelines/future-proofing.html#sealed-traits-protect-against-downstream-implementations-c-sealed
mod sealed {
    /// Marker supertrait; it has no items and exists only to seal the
    /// public extension trait.
    pub trait AllocMoreSealedString {}

    impl AllocMoreSealedString for ::alloc::string::String {}
}
| 176 | + |
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::string::ToString;

    /// Predicate used by the tests: rejects ASCII digits, and rejects a `'-'`
    /// when the next unconsidered character is an ASCII digit.
    fn redact(current: char, rest: &mut str) -> bool {
        if current == '-' {
            if let Some(next) = rest.chars().next() {
                return !next.is_ascii_digit();
            }
        }
        !current.is_ascii_digit()
    }

    /// Runs `retain_after` with `f` on a copy of `input` and checks that the
    /// result equals `output`.
    fn after_helper<F: FnMut(char, &mut str) -> bool>(input: &str, output: &str, f: F) {
        let mut actual = input.to_string();
        actual.retain_after(f);

        assert_eq!(actual.as_str(), output);
    }

    #[test]
    fn retain_after() {
        // (input, expected result after applying `redact`)
        let cases = [
            ("this has no numbers", "this has no numbers"),
            ("54321", ""),
            ("-12345", ""),
            ("--12345", "-"),
            ("-12-3-45--", "--"),
        ];
        for &(input, expected) in cases.iter() {
            after_helper(input, expected, redact);
        }
    }
}
0 commit comments