|
6 | 6 | /**
|
7 | 7 | * Implementation of KMP.
|
8 | 8 | * <p>
|
9 |
| - * Illustration of getPrefixIndices: with pattern ABCABCNOABCABCA |
10 |
| - * Here we make a distinction between position and index. The position is basically 1-indexed. |
11 |
| - * Note the return indices are still 0-indexed of the pattern string. |
| 9 | + * Illustration of getPrefixTable: with pattern ABCABCNOABCABCA |
| 10 | + * We consider 1-indexed positions. Position 0 will be useful later as a trick to signal that there are no prefix matches.
12 | 11 | * Position: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
13 |
| - * Pattern: A B C A B C N O A B C A B C A ... |
14 |
| - * Return: -1 0 0 0 1 2 3 0 0 1 2 3 4 5 6 4 ... |
15 |
| - * Read: ^ an indexing trick; consider 1-indexed characters for clarity and simplicity in the main algor |
16 |
| - * Read: ^ 'A' is the first character of the pattern string, |
17 |
| - * there is no prefix ending before its index, 0, that can be matched with. |
18 |
| - * Read: ^ ^ 'B' and 'C' cannot be matched with any prefix which are just 'A' and 'AB' respectively. |
19 |
| - * Read: ^ Can be matched with an earlier 'A'. So we store 1. |
20 |
| - * Prefix is the substring from idx 0 to 1 (exclusive). Note consider prefix from 0-indexed. |
21 |
| - * Realise 1 can also be interpreted as the index of the next character to match against! |
22 |
| - * Read: ^ ^ Similarly, continue matching |
23 |
| - * Read: ^ ^ No matches, so 0 |
24 |
| - * Read: ^ ^ ^ ^ ^ ^ Match with prefix until position 6! |
25 |
| - * Read: ^ where the magic happens, we can't match 'N' |
26 |
| - * at position 7 with 'A' at position 15, but |
27 |
| - * we know ABC of position 1-3 (or index 0-2) |
28 |
| - * exists and can 'restart' from there. |
29 |
| - * <p> |
30 |
| - * <p> |
| 12 | + * Pattern: A B C A B C N O A B C A B C A ... |
| 13 | + * Return: -1 0 0 0 1 2 3 0 0 1 2 3 4 5 6 4 ... CAN BE READ AS NUM OF CHARS MATCHED |
| 14 | + * Read: ^ -1 can be interpreted as an invalid number of chars matched, but it is exploited for simplicity in the main algorithm.
| 15 | + * Read: ^ 'A' is the first character of the pattern, there is no prefix ending before it to match against.
| 16 | + * Read: ^ ^ 'B' and 'C' cannot be matched with any prefix which are just 'A' and 'AB' respectively. |
| 17 | + * Read: ^ can be matched with an earlier prefix, 'A'. So we store 1, the number of chars matched. |
| 18 | + * Realise 1 can also be interpreted as the index of the next character to match against! |
| 19 | + * Read: ^ ^ Similarly, continue matching |
| 20 | + * Read: ^ ^ No matches, so 0 |
| 21 | + * Read: ^ ^ ^ ^ ^ ^ Match with prefix, "ABCABC", until 6th char |
| 22 | + * of pattern string. |
| 23 | + * Read: ^ where the magic happens, we can't match 'N' |
| 24 | + * at position 7 with 'A' at position 15, but |
| 25 | + * we know "ABC" exists as an earlier sub-pattern
| 26 | + * from the 1st to the 3rd char, so we can resume
| 27 | + * matching from the 4th char onwards.
31 | 28 | * <p>
|
32 | 29 | * Illustration of main logic:
|
33 | 30 | * Pattern: ABABAB
|
34 | 31 | * String : ABABCABABABAB
|
35 | 32 | * <p>
|
36 |
| - * A B A B C A B A B A B A B |
37 |
| - * Read: ^ to ^ Continue matching where possible, leading to Pattern[0:4] matched. |
38 |
| - * unable to match Pattern[4]. But notice that last two characters of String[0:4] |
39 |
| - * form a sub-pattern with Pattern[0:2] Maybe Pattern[2] == 'C' and we can 're-use' Pattern[0:2] |
40 |
| - * Read: ^ try ^ by checking if Pattern[2] == 'C' |
| 33 | + * A B A B C A B A B A B A B |
| 34 | + * Read: ^ to ^ Continue matching where possible, leading to the first 4 characters being matched.
| 35 | + * We are unable to match Pattern[4]. But notice that the last two matched characters
| 36 | + * form a sub-pattern with the first 2. Maybe Pattern[2] == 'C' and we can 're-use' "AB".
| 37 | + * Read: ^ ^ check if Pattern[2] == 'C' |
41 | 38 | * Read: Turns out no. No previously identified sub-pattern with 'C'. Restart matching Pattern.
|
42 |
| - * Read: ^ to ^ Found complete match! But rather than restart, notice that last 4 characters |
43 |
| - * Read: form a prefix sub-pattern of Pattern, which is Pattern[0:4] = "ABAB", so, |
44 |
| - * Read: ^ ^ Start matching from Pattern[4] and finally Pattern[5] |
| 39 | + * Read: ^ ^ Found a complete match! But rather than restart, notice that the last 4 characters
| 40 | + * Read: of "ABABAB" form a prefix sub-pattern of Pattern, which is "ABAB", so,
| 41 | + * Read: ^ reuse ^ ^ then match the 5th and 6th chars of the pattern, which happen to be "AB".
45 | 42 | */
|
46 | 43 | public class KMP {
|
47 | 44 | /**
|
48 |
| - * Find and indicate all suffix that match with a prefix. |
| 45 | + * For each 1-indexed position i of the pattern, records the length of the longest proper prefix of the pattern that is also a suffix of the first i characters.
| 46 | + * Does this by tracking the number of characters (of the prefix and suffix) matched. Entry 0 holds the sentinel -1.
49 | 47 | *
|
50 | 48 | * @param pattern the pattern to preprocess; must be non-empty, since index 1 of the table is written unconditionally
|
51 |
| - * @return an array of indices where the suffix ending at each position of they array can be matched with |
52 |
| - * corresponding a prefix of the pattern ending before the specified index |
| 49 | + * @return an array of length pattern.length() + 1; entry 0 is -1 and entry i (i >= 1) is the number of prefix chars matched at position i
53 | 50 | */
|
54 |
| - private static int[] getPrefixIndices(String pattern) { |
| 51 | + private static int[] getPrefixTable(String pattern) {
| 52 | + // 1-indexed implementation: table entry i describes the first i chars of the pattern
55 | 53 | int len = pattern.length();
|
56 |
| - int[] prefixIndices = new int[len + 1]; |
57 |
| - prefixIndices[0] = -1; |
58 |
| - prefixIndices[1] = 0; // 1st character has no prefix to match with |
| 54 | + int[] numCharsMatched = new int[len + 1];
| 55 | + numCharsMatched[0] = -1;
| 56 | + numCharsMatched[1] = 0; // 1st character has no prefix to match with; NOTE(review): out of bounds when pattern is empty (array length is 1)
59 | 57 |
|
60 | 58 | int currPrefixMatched = 0; // num of chars of prefix pattern currently matched
|
61 |
| - int pos = 2; // Starting from the 2nd character, recall 1-indexed |
| 59 | + int pos = 2; // Starting from the 2nd character (1-indexed position)
62 | 60 | while (pos <= len) {
|
63 | 61 | if (pattern.charAt(pos - 1) == pattern.charAt(currPrefixMatched)) {
|
64 | 62 | currPrefixMatched += 1;
|
65 | 63 | // note, the count stored below can also be read as the 0-based index of the next pattern char to match
|
66 |
| - prefixIndices[pos] = currPrefixMatched; // an indexing trick, store at the pos, num of chars matched |
| 64 | + numCharsMatched[pos] = currPrefixMatched;
67 | 65 | pos += 1;
|
68 | 66 | } else if (currPrefixMatched > 0) {
|
69 | 67 | // mismatch after a partial match: fall back to a shorter previously matched prefix and retry the same pos
|
70 |
| - currPrefixMatched = prefixIndices[currPrefixMatched]; |
| 68 | + currPrefixMatched = numCharsMatched[currPrefixMatched];
71 | 69 | } else {
|
72 | 70 | // nothing matched at all: record 0 for this position and move on
|
73 |
| - prefixIndices[pos] = 0; |
| 71 | + numCharsMatched[pos] = 0;
74 | 72 | pos += 1;
|
75 | 73 | }
|
76 | 74 | }
|
77 |
| - return prefixIndices; |
| 75 | + return numCharsMatched;
78 | 76 | }
|
79 | 77 |
|
80 | 78 | /**
|
81 |
| - * Main logic of KMP. Iterate the sequence, looking for patterns. If a difference is found, resume matching from |
82 |
| - * a previously identified sub-pattern, if possible. Length of pattern should be at least one. |
83 |
| - * |
| 79 | + * Main logic of KMP. Iterate the sequence, looking for patterns. If a mismatch is found, resume matching from |
| 80 | + * a previously identified sub-pattern, if possible. Here we assume length of pattern is at least one. |
84 | 81 | * @param sequence to search against
|
85 | 82 | * @param pattern to search for
|
86 | 83 | * @return start indices of all occurrences of pattern found
|
87 | 84 | */
|
88 | 85 | public static List<Integer> findOccurrences(String sequence, String pattern) {
|
89 |
| - assert pattern.length() >= 1 : "Pattern length cannot be 0!"; |
90 |
| - |
91 | 86 | int sLen = sequence.length();
|
92 | 87 | int pLen = pattern.length();
|
93 |
| - int[] prefixIndices = getPrefixIndices(pattern); |
| 88 | + int[] prefixTable = getPrefixTable(pattern); |
94 | 89 | List<Integer> indicesFound = new ArrayList<>();
|
95 | 90 |
|
96 |
| - int s = 0; |
97 |
| - int p = 0; |
| 91 | + int sTrav = 0; |
| 92 | + int pTrav = 0; |
98 | 93 |
|
99 |
| - while (s < sLen) { |
100 |
| - if (pattern.charAt(p) == sequence.charAt(s)) { |
101 |
| - p += 1; |
102 |
| - s += 1; |
103 |
| - if (p == pLen) { |
104 |
| - // occurrence found |
105 |
| - indicesFound.add(s - pLen); // start index of this occurrence |
106 |
| - p = prefixIndices[p]; // reset |
| 94 | + while (sTrav < sLen) { |
| 95 | + if (pattern.charAt(pTrav) == sequence.charAt(sTrav)) { |
| 96 | + pTrav += 1; |
| 97 | + sTrav += 1; |
| 98 | + if (pTrav == pLen) { // matched a complete pattern string |
| 99 | + indicesFound.add(sTrav - pLen); // start index of this occurrence |
| 100 | + // recall the number of chars matched in p can be read as the index of the next char in p to match |
| 101 | + pTrav = prefixTable[pTrav]; // start matching from a repeated sub-pattern, if possible |
107 | 102 | }
|
108 | 103 | } else {
|
109 |
| - p = prefixIndices[p]; |
110 |
| - if (p < 0) { // move on |
111 |
| - p += 1; |
112 |
| - s += 1; |
| 104 | + pTrav = prefixTable[pTrav]; |
| 105 | + if (pTrav < 0) { // move on; using -1 trick |
| 106 | + pTrav += 1; |
| 107 | + sTrav += 1; |
113 | 108 | }
|
| 109 | + // ALTERNATIVELY |
| 110 | + // if pTrav == 0 i.e. nothing matched, move on |
| 111 | + // sTrav += 1 |
| 112 | + // continue |
| 113 | + // |
| 114 | + // pTrav = prefixTable[pTrav] |
114 | 115 | }
|
115 | 116 | }
|
116 | 117 | return indicesFound;
|
|
0 commit comments