Skip to content

Commit 392af2f

Browse files
committed
More methods
- add longest_common_substring - add next_char_boundary - add previous_char_boundary
1 parent b058c55 commit 392af2f

File tree

4 files changed

+199
-29
lines changed

4 files changed

+199
-29
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
authors = ["Davide Di Carlo <daddinuz@gmail.com>"]
33
description = "Extension traits for `String` and `&str` types."
44
name = "string_more"
5-
version = "0.2.1"
5+
version = "0.3.0"
66
edition = "2021"
77
license = "MIT"
88
keywords = ["String", "str", "extension", "in-place", "edit-distance"]

README.md

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,21 @@ These traits introduce additional methods to efficiently manipulate strings, foc
1616

1717
Add `string_more` to your `Cargo.toml`:
1818

19+
```bash
20+
cargo add string_more
21+
```
22+
23+
or edit your Cargo.toml manually by adding:
24+
1925
```toml
2026
[dependencies]
21-
string_more = "0.1"
27+
string_more = "0.3"
2228
```
2329

2430
Then, in your Rust code:
2531

2632
```rust
27-
use string_more::{StringExt, StrExt}; // Import both traits
33+
use string_more::{StringExt, StrExt}; // Import traits
2834

2935
fn main() {
3036
let mut my_string = String::from(" Hello, Rust! ");
@@ -34,9 +40,16 @@ fn main() {
3440
let s = "Hello, Rust!";
3541
let new_s = s.center(' ', 2); // Immutable operation on &str
3642
println!("{}", new_s); // " Hello, Rust! "
43+
44+
println!("{}", "Hello, Rust!".levenshtein_distance("Hello, World!")); // 5
3745
}
3846
```
3947

48+
## Why string_more?
49+
50+
Rust’s standard library provides robust string handling, but when additional flexibility is needed, `string_more` steps in with efficient, allocation-friendly operations.
51+
With both in-place and immutable operations, `string_more` is ideal for optimizing string manipulation in your Rust projects.
52+
4053
## Methods Overview
4154

4255
### `StringExt` (In-place operations for `String`)
@@ -195,10 +208,32 @@ let s = "Hello";
195208
s.char_frequencies::<BTreeMap<_, _>>(); // H:1 e:1 l:2 o:1
196209
```
197210

198-
## Why string_more?
211+
- **`longest_common_substring`**: Returns the longest common substring.
199212

200-
Rust’s standard library provides robust string handling, but when additional flexibility is needed, `string_more` steps in with efficient, allocation-friendly operations.
201-
With both in-place and immutable operations, `string_more` is ideal for optimizing string manipulation in your Rust projects.
213+
```rust
214+
let s = "sparrow";
215+
s.longest_common_substring("crow"); // "row"
216+
```
217+
218+
- **`next_char_boundary`**: Returns the byte index of the next char boundary in string starting from index.
219+
220+
```rust
221+
let s = "🦀";
222+
s.next_char_boundary(2); // 4
223+
```
224+
225+
- **`previous_char_boundary`**: Returns the byte index of the previous char boundary in string starting from index.
226+
227+
```rust
228+
let s = "🦀";
229+
s.previous_char_boundary(2); // 0
230+
```
231+
232+
## Safety and Coverage
233+
234+
This crate contains a small portion of unsafe code.
235+
All tests run under [miri](https://github.com/rust-lang/miri) and the tests cover about 90% of the code.
236+
You can generate the coverage report using [tarpaulin](https://github.com/xd009642/tarpaulin).
202237

203238
## Contributions
204239

src/lib.rs

Lines changed: 157 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,19 @@ pub trait StrExt: sailed::Sailed {
118118
/// The user can specify the output map in which the
119119
/// frequencies will be stored.
120120
fn char_frequencies<M: sailed::HzMap>(&self) -> M;
121+
122+
/// Returns the longest common substring between `self` and `other`.
123+
fn longest_common_substring(&self, other: &str) -> &str;
124+
125+
/// Get the byte index of the next char in the string starting from index.
126+
/// If index happens to be on a valid char boundary then index itself is returned.
127+
/// Note that both 0 and string's length are considered valid char boundaries.
128+
fn next_char_boundary(&self, index: usize) -> usize;
129+
130+
/// Get the byte index of the previous char in the string starting from index.
131+
/// If index happens to be on a valid char boundary then index itself is returned.
132+
/// Note that both 0 and string's length are considered valid char boundaries.
133+
fn previous_char_boundary(&self, index: usize) -> usize;
121134
}
122135

123136
/// The `StringExt` trait extends `String` with advanced in-place manipulation methods,
@@ -314,7 +327,7 @@ where
314327
.count();
315328

316329
// ensure end happens on a valid char boundary
317-
while end > 0 && !source.is_char_boundary(source.len() - end) {
330+
while !source.is_char_boundary(source.len() - end) {
318331
end -= 1;
319332
}
320333

@@ -328,7 +341,7 @@ where
328341
.count();
329342

330343
// ensure start happens on a valid char boundary
331-
while start > 0 && !source.is_char_boundary(start) {
344+
while !source.is_char_boundary(start) {
332345
start -= 1;
333346
}
334347

@@ -410,6 +423,59 @@ where
410423
self.chars().for_each(|c| map.incr(c));
411424
map
412425
}
426+
427+
fn longest_common_substring(&self, other: &str) -> &str {
428+
let (sa, sb) = (self.as_bytes(), other.as_bytes());
429+
let mut longest_common_substring = "";
430+
431+
for ia in 0..sa.len() {
432+
if sa.len() - ia < longest_common_substring.len() {
433+
break;
434+
}
435+
436+
for ib in 0..sb.len() {
437+
if sb.len() - ib < longest_common_substring.len() {
438+
break;
439+
}
440+
441+
let len = sa[ia..]
442+
.iter()
443+
.zip(sb[ib..].iter())
444+
.take_while(|(ca, cb)| ca == cb)
445+
.count();
446+
447+
if len > longest_common_substring.len() {
448+
let start = self.next_char_boundary(ia);
449+
let end = self.previous_char_boundary(ia + len);
450+
if end - start > longest_common_substring.len() {
451+
longest_common_substring = &self[start..end];
452+
}
453+
}
454+
}
455+
}
456+
457+
longest_common_substring
458+
}
459+
460+
fn next_char_boundary(&self, mut index: usize) -> usize {
461+
if index > self.len() {
462+
return self.len();
463+
}
464+
465+
while !self.is_char_boundary(index) {
466+
index += 1;
467+
}
468+
469+
index
470+
}
471+
472+
fn previous_char_boundary(&self, mut index: usize) -> usize {
473+
while !self.is_char_boundary(index) {
474+
index -= 1;
475+
}
476+
477+
index
478+
}
413479
}
414480

415481
impl StringExt for String {
@@ -543,10 +609,7 @@ impl StringExt for String {
543609
self.shift_in_place(i, tabsize.saturating_sub(1), ' ');
544610
i += tabsize;
545611
} else {
546-
i += 1;
547-
while i < self.len() && !self.is_char_boundary(i) {
548-
i += 1;
549-
}
612+
i = self.next_char_boundary(i + 1);
550613
}
551614
}
552615
}
@@ -627,10 +690,56 @@ impl EncodeUtf8 for String {
627690

628691
#[cfg(test)]
629692
mod tests {
630-
use std::collections::BTreeMap;
693+
use std::collections::{BTreeMap, HashMap};
631694

632695
use super::{EncodeUtf8, StrExt, StringExt};
633696

697+
// Encoding a `String` through `EncodeUtf8` must yield a value equal to the
// text the string was built from.
#[test]
fn encode_utf8() {
    const SEED: [&str; 4] = ["", "·", "x", "Hello world!"];

    SEED.iter().for_each(|&init| {
        let sut = String::from(init);
        assert_eq!(EncodeUtf8::encode_utf8(&sut, &mut ()), init);
    });
}
706+
707+
// Table-driven check of `next_char_boundary`.
// Each row is (input string, starting byte index, expected boundary).
#[test]
fn next_char_boundary() {
    const SEED: [(&str, usize, usize); 8] = [
        // empty string, including an index past the end
        ("", 0, 0),
        ("", 1, 0),
        // single-byte char
        ("a", 0, 0),
        ("a", 1, 1),
        // two-byte char
        ("·", 0, 0),
        ("·", 1, 2),
        ("·", 2, 2),
        // four-byte char
        ("🦀", 2, 4),
    ];

    SEED.into_iter().for_each(|(sut, index, expected)| {
        assert_eq!(sut.next_char_boundary(index), expected);
    });
}
724+
725+
// Table-driven check of `previous_char_boundary`.
// Each row is (input string, starting byte index, expected boundary).
#[test]
fn previous_char_boundary() {
    const SEED: [(&str, usize, usize); 8] = [
        // empty string, including an index past the end
        ("", 0, 0),
        ("", 1, 0),
        // single-byte char
        ("a", 0, 0),
        ("a", 1, 1),
        // two-byte char (this row previously used an ASCII "." by mistake,
        // duplicating the single-byte case instead of covering "·" at index 0)
        ("·", 0, 0),
        ("·", 1, 0),
        ("·", 2, 2),
        // four-byte char
        ("🦀", 2, 0),
    ];

    for (sut, index, expected) in SEED {
        assert_eq!(sut.previous_char_boundary(index), expected);
    }
}
742+
634743
#[test]
635744
fn fill_start() {
636745
const SEED: [(&str, &str, usize, &str); 18] = [
@@ -1164,24 +1273,15 @@ mod tests {
11641273
}
11651274
}
11661275

1167-
#[test]
1168-
fn string_encode_utf8() {
1169-
const SEED: [&str; 4] = ["", "·", "x", "Hello world!"];
1170-
1171-
for init in SEED {
1172-
let sut = init.to_string();
1173-
assert_eq!(EncodeUtf8::encode_utf8(&sut, &mut ()), init);
1174-
}
1175-
}
1176-
11771276
#[test]
11781277
fn levenshtein_distance() {
1179-
const SEED: [(&str, &str, usize); 17] = [
1278+
const SEED: [(&str, &str, usize); 18] = [
11801279
("", "", 0),
11811280
("", "a", 1),
11821281
("a", "", 1),
1282+
("abc", "def", 3),
11831283
("ring", "bring", 1),
1184-
("ring", "string", 2),
1284+
("string", "ring", 2),
11851285
("update", "udpate", 2),
11861286
("kitten", "sitting", 3),
11871287
("saturday", "sunday", 3),
@@ -1203,12 +1303,13 @@ mod tests {
12031303

12041304
#[test]
12051305
fn hamming_distance() {
1206-
const SEED: [(&str, &str, Option<usize>); 15] = [
1306+
const SEED: [(&str, &str, Option<usize>); 16] = [
12071307
("", "", Some(0)),
12081308
("", "a", None),
12091309
("a", "", None),
1310+
("abc", "def", Some(3)),
12101311
("ring", "bring", None),
1211-
("ring", "string", None),
1312+
("string", "ring", None),
12121313
("update", "udpate", Some(2)),
12131314
("kitten", "sitting", None),
12141315
("saturday", "sunday", None),
@@ -1233,8 +1334,9 @@ mod tests {
12331334

12341335
#[test]
12351336
fn char_frequencies() {
1236-
const SEED: [(&str, &[(char, usize)]); 2] = [
1337+
const SEED: [(&str, &[(char, usize)]); 3] = [
12371338
("", &[]),
1339+
("·x·", &[('x', 1), ('·', 2)]),
12381340
("hello", &[('h', 1), ('e', 1), ('l', 2), ('o', 1)]),
12391341
];
12401342

@@ -1243,6 +1345,39 @@ mod tests {
12431345
sut.char_frequencies::<BTreeMap<_, _>>(),
12441346
expected.iter().map(|(c, f)| (*c, *f)).collect()
12451347
);
1348+
1349+
assert_eq!(
1350+
sut.char_frequencies::<HashMap<_, _>>(),
1351+
expected.iter().map(|(c, f)| (*c, *f)).collect()
1352+
);
1353+
}
1354+
}
1355+
1356+
// Table-driven check of `longest_common_substring`.
// Each row is (self, other, expected longest common substring).
//
// The test is named after the method it exercises, like the other tests in
// this module; it was previously called `longest_common_subsequence`, which
// is a different concept (a subsequence need not be contiguous).
#[test]
fn longest_common_substring() {
    const SEED: [(&str, &str, &str); 18] = [
        // empty inputs and strings with nothing in common
        ("", "", ""),
        ("bar", "", ""),
        ("", "bar", ""),
        ("foo", "bar", ""),
        // identical strings and full containment in either direction
        ("hello", "hello", "hello"),
        ("lorem ipsum dolor", "ipsum", "ipsum"),
        ("ipsum", "lorem ipsum dolor", "ipsum"),
        // multi-byte chars sharing only part of their UTF-8 encoding must
        // never be split: the result stops at the surrounding char boundary
        ("spĀm", "spām", "sp"),
        ("bananĀ", "bananā", "banan"),
        ("xĀ", "xā", "x"),
        ("xĀ", "xȀ", "x"),
        ("Āx", "āx", "x"),
        ("Āx", "Ȁx", "x"),
        // multi-byte char fully contained in the common part
        ("Hello·World!", "·World", "·World"),
        ("Hello·World!", "Hello·", "Hello·"),
        // match located at the start, the end and the middle of `self`
        ("0123456789", "012345", "012345"),
        ("0123456789", "456789", "456789"),
        ("0123456789", "345678", "345678"),
    ];

    for (sut, other, expected) in SEED {
        assert_eq!(sut.longest_common_substring(other), expected);
    }
}
12481383
}

0 commit comments

Comments
 (0)