|
1 | | -const MODULUS: u16 = 101; |
2 | | -const BASE: u16 = 256; |
3 | | - |
4 | | -pub fn rabin_karp(target: &str, pattern: &str) -> Vec<usize> { |
5 | | - // Quick exit |
6 | | - if target.is_empty() || pattern.is_empty() || pattern.len() > target.len() { |
| 1 | +//! This module implements the Rabin-Karp string searching algorithm. |
| 2 | +//! It uses a rolling hash technique to find all occurrences of a pattern |
| 3 | +//! within a target string efficiently. |
| 4 | +
|
/// Modulus applied to every hash value. It is deliberately small, so hash
/// collisions are common and every hash hit is confirmed by a byte comparison.
const MOD: usize = 101;
/// Radix of the polynomial hash; one "digit" per byte of input.
const RADIX: usize = 256;

/// Finds all starting indices where the `pattern` appears in the `text`.
///
/// The search works on the UTF-8 bytes of both strings, so the returned
/// indices are byte offsets into `text`. Overlapping occurrences are all
/// reported, in increasing order.
///
/// # Arguments
/// * `text` - The string where the search is performed.
/// * `pattern` - The substring pattern to search for.
///
/// # Returns
/// A vector of starting byte indices where the pattern is found. The vector
/// is empty when either input is empty or when `pattern` is longer than `text`.
pub fn rabin_karp(text: &str, pattern: &str) -> Vec<usize> {
    if text.is_empty() || pattern.is_empty() || pattern.len() > text.len() {
        return vec![];
    }

    // Work on raw bytes: slicing a `&str` at an arbitrary byte offset panics
    // when the offset falls inside a multi-byte character.
    let text_bytes = text.as_bytes();
    let pat_bytes = pattern.as_bytes();
    let window = pat_bytes.len();

    let pat_hash = compute_hash(pattern);

    // RADIX^(window - 1) % MOD: the weight of the byte that leaves the window.
    let radix_pow = (1..window).fold(1, |acc, _| (acc * RADIX) % MOD);

    let mut rolling_hash = hash_bytes(&text_bytes[..window]);
    let mut result = vec![];
    for start in 0..=text_bytes.len() - window {
        if start > 0 {
            // Slide the window one byte to the right.
            rolling_hash = update_hash(text, start - 1, start + window - 1, rolling_hash, radix_pow);
        }
        // Equal hashes only suggest a match; the byte comparison confirms it.
        if rolling_hash == pat_hash && pat_bytes == &text_bytes[start..start + window] {
            result.push(start);
        }
    }
    result
}

/// Calculates the hash of a string using the Rabin-Karp formula.
///
/// # Arguments
/// * `s` - The string to calculate the hash for.
///
/// # Returns
/// The hash value of the string's bytes modulo `MOD`.
fn compute_hash(s: &str) -> usize {
    hash_bytes(s.as_bytes())
}

/// Calculates the polynomial hash of a byte slice.
///
/// The hash is `RADIX^(n-1) * b_0 + RADIX^(n-2) * b_1 + ... + b_(n-1)`,
/// reduced modulo `MOD` after every step.
///
/// # Arguments
/// * `bytes` - The bytes to hash.
///
/// # Returns
/// The hash value modulo `MOD`; `0` for an empty slice.
fn hash_bytes(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .fold(0, |acc, &byte| (acc * RADIX + usize::from(byte)) % MOD)
}

/// Updates the rolling hash when shifting the search window.
///
/// # Arguments
/// * `s` - The full text where the search is performed.
/// * `old_idx` - The byte index of the byte that is leaving the window.
/// * `new_idx` - The byte index of the byte that is entering the window.
/// * `old_hash` - The hash of the previous window.
/// * `radix_pow` - The precomputed value of RADIX^(n-1) % MOD.
///
/// # Returns
/// The updated hash for the new window.
///
/// # Panics
/// Panics if `old_idx` or `new_idx` is out of bounds for `s`.
fn update_hash(
    s: &str,
    old_idx: usize,
    new_idx: usize,
    old_hash: usize,
    radix_pow: usize,
) -> usize {
    let bytes = s.as_bytes();
    let outgoing = usize::from(bytes[old_idx]);
    let incoming = usize::from(bytes[new_idx]);
    // Adding MOD before subtracting keeps the intermediate value non-negative.
    let without_outgoing = (old_hash + MOD - (outgoing * radix_pow % MOD)) % MOD;
    (without_outgoing * RADIX + incoming) % MOD
}
60 | 84 |
|
#[cfg(test)]
mod tests {
    use super::*;

    // Generates one `#[test]` function per `name: (text, pattern, expected)` entry.
    macro_rules! test_cases {
        ($($name:ident: $inputs:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let (text, pattern, expected) = $inputs;
                    assert_eq!(rabin_karp(text, pattern), expected);
                }
            )*
        };
    }

    test_cases! {
        single_match_at_start: ("hello world", "hello", vec![0]),
        single_match_at_end: ("hello world", "world", vec![6]),
        single_match_in_middle: ("abc def ghi", "def", vec![4]),
        multiple_matches: ("ababcabc", "abc", vec![2, 5]),
        overlapping_matches: ("aaaaa", "aaa", vec![0, 1, 2]),
        no_match: ("abcdefg", "xyz", vec![]),
        pattern_is_entire_string: ("abc", "abc", vec![0]),
        target_is_multiple_patterns: ("abcabcabc", "abc", vec![0, 3, 6]),
        empty_text: ("", "abc", vec![]),
        empty_pattern: ("abc", "", vec![]),
        empty_text_and_pattern: ("", "", vec![]),
        pattern_larger_than_text: ("abc", "abcd", vec![]),
        large_text_small_pattern: (&("a".repeat(1000) + "b"), "b", vec![1000]),
        single_char_match: ("a", "a", vec![0]),
        single_char_no_match: ("a", "b", vec![]),
        large_pattern_no_match: ("abc", "defghi", vec![]),
        repeating_chars: ("aaaaaa", "aa", vec![0, 1, 2, 3, 4]),
        special_characters: ("abc$def@ghi", "$def@", vec![3]),
        numeric_and_alphabetic_mix: ("abc123abc456", "123abc", vec![3]),
        case_sensitivity: ("AbcAbc", "abc", vec![]),
    }

    // Direct checks on the hash helpers, so a regression in the hashing itself
    // is caught even when the final byte comparison would mask it.
    #[test]
    fn hi_hash() {
        assert_eq!(compute_hash("hi"), 65);
    }

    #[test]
    fn abr_hash() {
        assert_eq!(compute_hash("abr"), 4);
    }

    #[test]
    fn bra_hash() {
        assert_eq!(compute_hash("bra"), 30);
    }

    // Rolling the window "abr" -> "bra" over "abra" must agree with hashing
    // "bra" from scratch. 88 is 256^2 % 101, the weight of the outgoing byte.
    #[test]
    fn rolling_update_matches_fresh_hash() {
        let rolled = update_hash("abra", 0, 3, compute_hash("abr"), 88);
        assert_eq!(rolled, compute_hash("bra"));
    }
}
0 commit comments