Skip to content

Commit 02b43da

Browse files
committed
string
fixed boyer moore add unittest for kmp.buildpattern
1 parent 0a5c624 commit 02b43da

File tree

5 files changed

+191
-49
lines changed

5 files changed

+191
-49
lines changed

src/string/boyerMoore.js

Lines changed: 74 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,14 @@
22
*
33
* @typedef {Object} PatternTable
44
* @property {Object[]} badCharacter badCharacter
5-
* @property {number[]} goodSuffix goodSuffix
6-
* @property {number} length
5+
* @property {number[]} L goodSuffix L array
6+
* L[i] is the largest position less than n - 1 such that string P[i..n-1] matches a suffix of P[0..L[i]]
7+
* If none exists, L[i] is defined to be -1.
8+
* @property {number[]} H goodSuffix H array
9+
* H[i] is the length of the largest suffix of P[i..n-1] that is also a prefix of P, if one exists.
10+
* If none exists, H[i] is defined to be -1.
11+
* @property {number[]} goodSuffixOffset good-suffix shift offsets calculated from L and H
12+
* @property {number} length length of pattern
713
*/
814

915
/**
@@ -18,28 +24,66 @@ function buildPattern(pattern) {
1824
for (let i = 1; i < pattern.length; i++) {
1925
badCharacter[i] = {}
2026
Object.assign(badCharacter[i], badCharacter[i - 1]);
21-
badCharacter[i][pattern[i - 1]] = i - 1;
27+
badCharacter[i][pattern[i - 1]] = (i - 1);
2228
}
23-
let goodSuffix = new Array().fill(0);
24-
// console.table(badCharacter);
25-
// console.table(goodSuffix);
29+
30+
// Both of these tables are constructible in O(n) time and use O(n) space.
31+
let L = new Array(length).fill(-1);
32+
let lastIndex = pattern.length - 1;
33+
let j = lastIndex; // good suffix index
34+
let i = lastIndex - 1;
35+
while (i >= 0 && j > 0) {
36+
let l = i + (lastIndex - j); // last index of current frame.
37+
if (l < j) {
38+
if (pattern[i] === pattern[j]) {
39+
L[j] = (L[j] === -1 ? l : L[j]);
40+
j--;
41+
}
42+
i--;
43+
} else {
44+
i = j - 1;
45+
j = lastIndex;
46+
}
47+
}
48+
// H should only be used if L[i] is -1 or a match has been found.
49+
let H = new Array(length).fill(-1);
50+
j = lastIndex; // good suffix index
51+
for (; j >= 0; j--) {
52+
let goodSuffixMaxSuffixLength = j + 1;
53+
let maxSuffixLength = length - goodSuffixMaxSuffixLength;
54+
if (pattern.substr(0, maxSuffixLength) === pattern.substr(goodSuffixMaxSuffixLength, maxSuffixLength)) {
55+
H[j] = maxSuffixLength - 1;
56+
} else {
57+
H[j] = H[j + 1] === undefined ? -1 : H[j + 1];
58+
}
59+
}
60+
// The alignment shift for index i in P is given by n - 1 - L[i] or n - 1 - H[i].
61+
let goodSuffixOffset = L.map((value, index) => {
62+
if (value === -1) {
63+
value = H[index];
64+
}
65+
return length - 1 - value;
66+
});
67+
2668
return {
27-
length,
2869
badCharacter,
29-
goodSuffix,
70+
goodSuffixOffset,
71+
L,
72+
H,
3073
};
3174
}
3275

3376
/**
3477
* calc offset from pattern table
3578
* @param {PatternTable} patternTable pattern table
36-
* @param {number} dismatchIndex dismatch index
37-
* @param {string} dismatchChar dismatch character
79+
* @param {number} mismatchIndex mismatchIndex in pattern
80+
* @param {string} badChar mismatch character in text
3881
*/
39-
function getOffset(patternTable, dismatchIndex, badChar) {
40-
let posBadChar = patternTable.badCharacter[dismatchIndex][badChar] === undefined ? -1 : patternTable.badCharacter[dismatchIndex][badChar];
41-
return dismatchIndex - posBadChar;
42-
// return Math.max(dismatchIndex - posBadChar, patternTable.goodSuffix[dismatchIndex]);
82+
function getOffset(patternTable, mismatchIndex, badChar) {
83+
let badCharIndex = patternTable.badCharacter[mismatchIndex][badChar] === undefined ? -1 : patternTable.badCharacter[mismatchIndex][badChar];
84+
let badCharOffset = mismatchIndex - badCharIndex;
85+
let goodSuffixOffset = patternTable.goodSuffixOffset[mismatchIndex + 1] || badCharOffset;
86+
return Math.max(badCharOffset, goodSuffixOffset);
4387
}
4488

4589
/**
@@ -53,22 +97,27 @@ function boyerMoore(text, pattern) {
5397
return 0;
5498
}
5599
const patternTable = buildPattern(pattern);
56-
let patternLastIndex = pattern.length - 1;
57-
let pIndex = patternLastIndex;
58-
let tIndex = pIndex;
59-
while (tIndex < text.length) {
60-
if (text[tIndex] === pattern[pIndex]) {
100+
let pLastIndex = pattern.length - 1; // pattern last index
101+
let fLastIndex = pLastIndex; // frame last index
102+
let comparedCount = 0; // compared count beween pattern and frame
103+
while (fLastIndex < text.length) {
104+
let pIndex = pLastIndex - comparedCount; // calc current compare index in pattern;
105+
let fIndex = fLastIndex - comparedCount; // calc current compare index in frame;
106+
if (text[fIndex] === pattern[pIndex]) {
61107
if (pIndex === 0) {
62-
return tIndex;
108+
return fIndex;
63109
}
64-
tIndex--;
65-
pIndex--;
110+
comparedCount++;
66111
} else {
67-
tIndex = tIndex + (patternLastIndex - pIndex) + getOffset(patternTable, pIndex, text[tIndex]);
68-
pIndex = patternLastIndex;
112+
// mismatch: reset the compared count and move to the next frame
113+
fLastIndex = fLastIndex + getOffset(patternTable, pIndex, text[fIndex]);
114+
comparedCount = 0;
69115
}
70116
}
71117
return -1;
72118
}
73119

74-
module.exports = boyerMoore;
120+
module.exports = {
121+
boyerMoore,
122+
buildPattern
123+
};

src/string/knuthMorrisPratt.js

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@
55
*/
66
function buildPattern(pattern) {
77
const patternTable = new Array(pattern.length).fill(0); // Initialize
8-
let prefixIndex = 0;
8+
let prefixCount = 0;
99
for (let i = 1; i < pattern.length; i++) {
10-
if (pattern[i] === pattern[prefixIndex]) {
11-
patternTable[i] = ++prefixIndex;
12-
} else if (prefixIndex) {
13-
prefixIndex = patternTable[prefixIndex - 1];
14-
i--; // Still Compare Current pattern[i] with prefix
10+
let compareIndex = 0 + prefixCount;
11+
if (pattern[i] === pattern[compareIndex]) {
12+
patternTable[i] = (++prefixCount);
13+
} else if (prefixCount) {
14+
// AABAAAB prefixCount = 2,i = 5
15+
// ABABCABABAB prefixCount = 4,i = 9
16+
prefixCount = patternTable[0 + prefixCount - 1];
17+
i--;
1518
}
1619
}
1720
return patternTable;
@@ -39,7 +42,7 @@ function knuthMorrisPratt(text, pattern) {
3942
pIndex++;
4043
} else {
4144
if (pIndex) {
42-
// Have Common Prefix
45+
// Skip the prefix characters that have already been compared
4346
pIndex = patternTable[pIndex - 1];
4447
} else {
4548
pIndex = 0;
@@ -50,4 +53,7 @@ function knuthMorrisPratt(text, pattern) {
5053
return -1;
5154
}
5255

53-
module.exports = knuthMorrisPratt;
56+
module.exports = {
57+
knuthMorrisPratt,
58+
buildPattern
59+
};

test/string/string.boyerMoore.test.js

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,79 @@
11
const expect = require('chai').expect;
2-
const boyerMoore = require('../../src/string/boyerMoore');
2+
const {
3+
boyerMoore,
4+
buildPattern
5+
} = require('../../src/string/boyerMoore');
36

47
describe('boyerMoore', () => {
8+
59
it('should find substring in a string', () => {
610
expect(boyerMoore('HERE IS A SIMPLE EXAMPLE', 'EXAMPLE')).to.equal(17);
711
});
812

13+
it('should build pattern for good suffix #1', () => {
14+
let pattern = buildPattern('EXAMPLE');
15+
expect(pattern.L, 'pattern.L').to.deep.equal([-1, -1, -1, -1, -1, -1, 0]);
16+
expect(pattern.H, 'pattern.H').to.deep.equal([0, 0, 0, 0, 0, 0, -1]);
17+
expect(pattern.goodSuffixOffset, 'pattern.goodSuffixOffset').to.deep.equal([6, 6, 6, 6, 6, 6, 6]);
18+
});
19+
20+
it('should build pattern for good suffix #2', () => {
21+
let pattern = buildPattern('ABAB');
22+
expect(pattern.L, 'pattern.L').to.deep.equal([-1, -1, 1, 1]);
23+
expect(pattern.H, 'pattern.H').to.deep.equal([1, 1, -1, -1]);
24+
});
25+
26+
it('should build pattern for good suffix #3', () => {
27+
let pattern = buildPattern('AAAA');
28+
expect(pattern.L, 'pattern.L').to.deep.equal([-1, -1, 1, 2]);
29+
expect(pattern.H, 'pattern.H').to.deep.equal([2, 1, 0, -1]);
30+
});
31+
32+
it('should build pattern for good suffix #4', () => {
33+
let pattern = buildPattern('ABCD');
34+
expect(pattern.L, 'pattern.L').to.deep.equal([-1, -1, -1, -1]);
35+
expect(pattern.H, 'pattern.H').to.deep.equal([-1, -1, -1, -1]);
36+
});
37+
38+
it('should build pattern for good suffix #4', () => {
39+
let pattern = buildPattern('AAAAAA');
40+
expect(pattern.L, 'pattern.L').to.deep.equal([-1, -1, -1, 2, 3, 4]);
41+
expect(pattern.H, 'pattern.H').to.deep.equal([4, 3, 2, 1, 0, -1]);
42+
});
43+
44+
it('should build pattern for bad character #1', () => {
45+
let badCharacter = [{}, {
46+
"E": 0
47+
}, {
48+
"E": 0,
49+
"X": 1
50+
}, {
51+
"E": 0,
52+
"X": 1,
53+
"A": 2
54+
}, {
55+
"E": 0,
56+
"X": 1,
57+
"A": 2,
58+
"M": 3
59+
}, {
60+
"E": 0,
61+
"X": 1,
62+
"A": 2,
63+
"M": 3,
64+
"P": 4
65+
}, {
66+
"E": 0,
67+
"X": 1,
68+
"A": 2,
69+
"M": 3,
70+
"P": 4,
71+
"L": 5
72+
}];
73+
let pattern = buildPattern('EXAMPLE');
74+
expect(pattern.badCharacter, 'pattern.badCharacter').to.deep.equal(badCharacter);
75+
});
76+
977
it('should find substring in a string', () => {
1078
expect(boyerMoore('', '')).to.equal(0);
1179
expect(boyerMoore('a', '')).to.equal(0);

test/string/string.knuthMorrisPratt.test.js

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,18 @@
11
const expect = require('chai').expect;
2-
const knuthMorrisPratt = require('../../src/string/knuthMorrisPratt');
2+
const {
3+
knuthMorrisPratt,
4+
buildPattern
5+
} = require('../../src/string/knuthMorrisPratt');
36

47
describe('knuthMorrisPratt', () => {
8+
9+
it('should build pattern', () => {
10+
expect(buildPattern('')).to.deep.equal([]);
11+
expect(buildPattern('abcdabcy'), 'buildPattern(abcdabcy)').to.deep.equal([0, 0, 0, 0, 1, 2, 3, 0]);
12+
expect(buildPattern('AABAAA'), 'buildPattern(AABAAA)').to.deep.equal([0, 1, 0, 1, 2, 2]);
13+
expect(buildPattern('EXAMPLE'), 'buildPattern(EXAMPLE)').to.deep.equal([0, 0, 0, 0, 0, 0, 1]);
14+
});
15+
516
it('should find substring in a string', () => {
617
expect(knuthMorrisPratt('', '')).to.equal(0);
718
expect(knuthMorrisPratt('a', '')).to.equal(0);
@@ -17,16 +28,16 @@ describe('knuthMorrisPratt', () => {
1728
});
1829

1930
it('should work with bigger texts', () => {
20-
const text = 'Lorem Ipsum is simply dummy text of the printing and '
21-
+ 'typesetting industry. Lorem Ipsum has been the industry\'s standard '
22-
+ 'dummy text ever since the 1500s, when an unknown printer took a '
23-
+ 'galley of type and scrambled it to make a type specimen book. It '
24-
+ 'has survived not only five centuries, but also the leap into '
25-
+ 'electronic typesetting, remaining essentially unchanged. It was '
26-
+ 'popularised in the 1960s with the release of Letraset sheets '
27-
+ 'containing Lorem Ipsum passages, and more recently with desktop'
28-
+ 'publishing software like Aldus PageMaker including versions of Lorem '
29-
+ 'Ipsum.';
31+
const text = 'Lorem Ipsum is simply dummy text of the printing and ' +
32+
'typesetting industry. Lorem Ipsum has been the industry\'s standard ' +
33+
'dummy text ever since the 1500s, when an unknown printer took a ' +
34+
'galley of type and scrambled it to make a type specimen book. It ' +
35+
'has survived not only five centuries, but also the leap into ' +
36+
'electronic typesetting, remaining essentially unchanged. It was ' +
37+
'popularised in the 1960s with the release of Letraset sheets ' +
38+
'containing Lorem Ipsum passages, and more recently with desktop' +
39+
'publishing software like Aldus PageMaker including versions of Lorem ' +
40+
'Ipsum.';
3041

3142
expect(knuthMorrisPratt(text, 'Lorem')).to.equal(0);
3243
expect(knuthMorrisPratt(text, 'versions')).to.equal(549);

字符串算法/README.md

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,21 @@ Boyer-Moore算法的关键在于,当P的最后一个字符被比较完成后,我
7171

7272
* 坏字符规则: 当文本 `T` 中的某个字符跟模式 `P` 的某个字符不匹配时,称文本 `T` 中的这个失配字符为坏字符
7373

74-
模式后移位数 = 坏字符在模式中失配的位置 - 坏字符在模式中最后一次出现的位置
74+
`模式后移位数 = 坏字符在模式中失配的位置 - 坏字符在模式中最后一次出现的位置`
7575

76-
* 好后缀规则: 当文本 `T` 中的某个字符跟模式 `P` 的某个字符不匹配时,称文本 `T` 中的已经匹配的字符串为好后缀
76+
* 好后缀规则: 当文本 `T` 中的某个字符跟模式 `P` 的某个字符不匹配时,称文本 `T` 中的已经匹配的字符串为好后缀.
7777

78-
模式后移位数 = 好后缀在模式中的当前位置 - 好后缀在模式中最后一次出现的位置
78+
```模式后移位数 = 好后缀在模式中的当前位置(P.lastIndex) - 好后缀在模式中最后一次出现的位置```
7979

80-
Boyer-Moore算法每次后移这两个规则之中的较大值.
80+
可以按照下面的方式理解:
81+
82+
假设有模式`P`和文本`T`,`T`中的字符串`t`匹配到了`P`的一个后缀,但在比较位置`i`时发生不匹配.设匹配到的好后缀在`T`中为`t`,在`P`中为`t'`(`t = t'`).
83+
84+
1. 在`P`中`i`位置的左侧查找最靠近`i`位置的字符串`t'`使得`t'=t`,若存在,则移动相应的位数,将找到的`t'`与`T`中的`t`对齐.(可以预处理为数组 `L`)
85+
2. 如果`t'`不存在,继续查找`t`的某一个后缀是否为`P`的前缀,若存在,则移动相应的位数,将`P`的前缀与`t`的后缀位置对齐.(可预处理为数组 `H`)
86+
3. 若以上两种情况都不存在,则将`P`向后移动`n`个字符.
87+
88+
Boyer-Moore算法每次后移的位数取坏字符规则和好后缀规则之中的较大值.
8189

8290
[Boyer-Moore Source Code](../src/string/boyerMoore.js)
8391

0 commit comments

Comments
 (0)