Skip to content

Commit a79f641

Browse files
authored
LUCENE-9720: Hunspell: more ways to vary misspelled word variations for suggestions (#2286)
1 parent d88264b commit a79f641

File tree

14 files changed

+317
-2
lines changed

14 files changed

+317
-2
lines changed

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@ public class Dictionary {
152152
private char[] ignore;
153153

154154
String tryChars = "";
155+
String[] neighborKeyGroups = new String[0];
156+
boolean enableSplitSuggestions = true;
155157
List<RepEntry> repTable = new ArrayList<>();
156158

157159
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
@@ -385,6 +387,10 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
385387
String[] parts = splitBySpace(reader, reader.readLine(), 3);
386388
repTable.add(new RepEntry(parts[1], parts[2]));
387389
}
390+
} else if ("KEY".equals(firstWord)) {
391+
neighborKeyGroups = singleArgument(reader, line).split("\\|");
392+
} else if ("NOSPLITSUGS".equals(firstWord)) {
393+
enableSplitSuggestions = false;
388394
} else if ("FORBIDDENWORD".equals(firstWord)) {
389395
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
390396
} else if ("COMPOUNDMIN".equals(firstWord)) {

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java

Lines changed: 184 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818

1919
import java.util.Arrays;
2020
import java.util.LinkedHashSet;
21+
import java.util.Locale;
2122

2223
class ModifyingSuggester {
24+
private static final int MAX_CHAR_DISTANCE = 4;
2325
private final LinkedHashSet<String> result = new LinkedHashSet<>();
2426
private final char[] tryChars;
2527
private final SpellChecker speller;
@@ -30,9 +32,52 @@ class ModifyingSuggester {
3032
}
3133

3234
LinkedHashSet<String> suggest(String word) {
35+
tryVariationsOf(word);
36+
37+
WordCase wc = WordCase.caseOf(word);
38+
39+
if (wc == WordCase.MIXED) {
40+
int dot = word.indexOf('.');
41+
if (dot > 0
42+
&& dot < word.length() - 1
43+
&& WordCase.caseOf(word.substring(dot + 1)) == WordCase.TITLE) {
44+
result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
45+
}
46+
47+
tryVariationsOf(toLowerCase(word));
48+
}
49+
50+
return result;
51+
}
52+
53+
private String toLowerCase(String word) {
54+
char[] chars = new char[word.length()];
55+
for (int i = 0; i < word.length(); i++) {
56+
chars[i] = speller.dictionary.caseFold(word.charAt(i));
57+
}
58+
return new String(chars);
59+
}
60+
61+
private void tryVariationsOf(String word) {
62+
trySuggestion(word.toUpperCase(Locale.ROOT));
63+
if (checkDictionaryForSplitSuggestions(word)) {
64+
return;
65+
}
66+
3367
tryRep(word);
68+
69+
trySwappingChars(word);
70+
tryLongSwap(word);
71+
tryNeighborKeys(word);
72+
tryRemovingChar(word);
3473
tryAddingChar(word);
35-
return result;
74+
tryMovingChar(word);
75+
tryReplacingChar(word);
76+
tryTwoDuplicateChars(word);
77+
78+
if (speller.dictionary.enableSplitSuggestions) {
79+
trySplitting(word);
80+
}
3681
}
3782

3883
private void tryRep(String word) {
@@ -50,6 +95,75 @@ private void tryRep(String word) {
5095
}
5196
}
5297

98+
private void trySwappingChars(String word) {
99+
int length = word.length();
100+
for (int i = 0; i < length - 1; i++) {
101+
char c1 = word.charAt(i);
102+
char c2 = word.charAt(i + 1);
103+
trySuggestion(word.substring(0, i) + c2 + c1 + word.substring(i + 2));
104+
}
105+
106+
if (length == 4 || length == 5) {
107+
tryDoubleSwapForShortWords(word, length);
108+
}
109+
}
110+
111+
// ahev -> have, owudl -> would
112+
private void tryDoubleSwapForShortWords(String word, int length) {
113+
char[] candidate = word.toCharArray();
114+
candidate[0] = word.charAt(1);
115+
candidate[1] = word.charAt(0);
116+
candidate[length - 1] = word.charAt(length - 2);
117+
candidate[length - 2] = word.charAt(length - 1);
118+
trySuggestion(new String(candidate));
119+
120+
if (candidate.length == 5) {
121+
candidate[0] = word.charAt(0);
122+
candidate[1] = word.charAt(2);
123+
candidate[2] = word.charAt(1);
124+
trySuggestion(new String(candidate));
125+
}
126+
}
127+
128+
private void tryNeighborKeys(String word) {
129+
for (int i = 0; i < word.length(); i++) {
130+
char c = word.charAt(i);
131+
char up = Character.toUpperCase(c);
132+
if (up != c) {
133+
trySuggestion(word.substring(0, i) + up + word.substring(i + 1));
134+
}
135+
136+
// check neighbor characters in keyboard string
137+
for (String group : speller.dictionary.neighborKeyGroups) {
138+
if (group.indexOf(c) >= 0) {
139+
for (int j = 0; j < group.length(); j++) {
140+
if (group.charAt(j) != c) {
141+
trySuggestion(word.substring(0, i) + group.charAt(j) + word.substring(i + 1));
142+
}
143+
}
144+
}
145+
}
146+
}
147+
}
148+
149+
private void tryLongSwap(String word) {
150+
for (int i = 0; i < word.length(); i++) {
151+
for (int j = i + 2; j < word.length() && j <= i + MAX_CHAR_DISTANCE; j++) {
152+
char c1 = word.charAt(i);
153+
char c2 = word.charAt(j);
154+
String prefix = word.substring(0, i);
155+
String suffix = word.substring(j + 1);
156+
trySuggestion(prefix + c2 + word.substring(i + 1, j) + c1 + suffix);
157+
}
158+
}
159+
}
160+
161+
private void tryRemovingChar(String word) {
162+
for (int i = 0; i < word.length(); i++) {
163+
trySuggestion(word.substring(0, i) + word.substring(i + 1));
164+
}
165+
}
166+
53167
private void tryAddingChar(String word) {
54168
for (int i = 0; i <= word.length(); i++) {
55169
String prefix = word.substring(0, i);
@@ -60,6 +174,75 @@ private void tryAddingChar(String word) {
60174
}
61175
}
62176

177+
private void tryMovingChar(String word) {
178+
for (int i = 0; i < word.length(); i++) {
179+
for (int j = i + 2; j < word.length() && j <= i + MAX_CHAR_DISTANCE; j++) {
180+
String prefix = word.substring(0, i);
181+
trySuggestion(prefix + word.substring(i + 1, j) + word.charAt(i) + word.substring(j));
182+
trySuggestion(prefix + word.charAt(j) + word.substring(i, j) + word.substring(j + 1));
183+
}
184+
}
185+
}
186+
187+
private void tryReplacingChar(String word) {
188+
for (int i = 0; i < word.length(); i++) {
189+
String prefix = word.substring(0, i);
190+
String suffix = word.substring(i + 1);
191+
for (char toInsert : tryChars) {
192+
if (toInsert != word.charAt(i)) {
193+
trySuggestion(prefix + toInsert + suffix);
194+
}
195+
}
196+
}
197+
}
198+
199+
// perhaps we doubled two characters
200+
// (for example vacation -> vacacation)
201+
private void tryTwoDuplicateChars(String word) {
202+
int dupLen = 0;
203+
for (int i = 2; i < word.length(); i++) {
204+
if (word.charAt(i) == word.charAt(i - 2)) {
205+
dupLen++;
206+
if (dupLen == 3 || dupLen == 2 && i >= 4) {
207+
trySuggestion(word.substring(0, i - 1) + word.substring(i + 1));
208+
dupLen = 0;
209+
}
210+
} else {
211+
dupLen = 0;
212+
}
213+
}
214+
}
215+
216+
private boolean checkDictionaryForSplitSuggestions(String word) {
217+
boolean found = false;
218+
for (int i = 1; i < word.length() - 1; i++) {
219+
String w1 = word.substring(0, i);
220+
String w2 = word.substring(i);
221+
found |= trySuggestion(w1 + " " + w2);
222+
if (shouldSplitByDash()) {
223+
found |= trySuggestion(w1 + "-" + w2);
224+
}
225+
}
226+
return found;
227+
}
228+
229+
private void trySplitting(String word) {
230+
for (int i = 1; i < word.length() - 1; i++) {
231+
String w1 = word.substring(0, i);
232+
String w2 = word.substring(i);
233+
if (speller.checkWord(w1) && speller.checkWord(w2)) {
234+
result.add(w1 + " " + w2);
235+
if (shouldSplitByDash()) {
236+
result.add(w1 + "-" + w2);
237+
}
238+
}
239+
}
240+
}
241+
242+
private boolean shouldSplitByDash() {
243+
return speller.dictionary.tryChars.contains("-") || speller.dictionary.tryChars.contains("a");
244+
}
245+
63246
private boolean trySuggestion(String candidate) {
64247
if (speller.checkWord(candidate)) {
65248
result.add(candidate);

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,10 @@ private List<String> modifyChunksBetweenDashes(String word) {
414414
String chunk = word.substring(chunkStart, chunkEnd);
415415
if (!spell(chunk)) {
416416
for (String chunkSug : suggest(chunk)) {
417-
result.add(word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd));
417+
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
418+
if (!dictionary.isForbiddenWord(replaced.toCharArray(), replaced.length(), scratch)) {
419+
result.add(replaced);
420+
}
418421
}
419422
}
420423
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ static WordCase caseOf(char[] word, int length) {
3737
return get(startsWithLower, seenUpper, seenLower);
3838
}
3939

40+
static WordCase caseOf(CharSequence word) {
41+
return caseOf(word, word.length());
42+
}
43+
4044
static WordCase caseOf(CharSequence word, int length) {
4145
boolean startsWithLower = Character.isLowerCase(word.charAt(0));
4246

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
IJs, ijs

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,14 @@ public void testGermanCompounding() throws Exception {
156156
doTest("germancompounding");
157157
}
158158

159+
public void testModifyingSuggestions() throws Exception {
160+
doTest("sug");
161+
}
162+
163+
public void testModifyingSuggestions2() throws Exception {
164+
doTest("sug2");
165+
}
166+
159167
protected void doTest(String name) throws Exception {
160168
checkSpellCheckerExpectations(
161169
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name), true);
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# new suggestion methods of Hunspell 1.5:
2+
# capitalization: nasa -> NASA
3+
# long swap: permenant -> permanent
4+
# long move: Ghandi -> Gandhi
5+
# double two characters: vacacation -> vacation
6+
# space with REP: "alot" -> "a lot" ("a lot" needs to be in the dic file.)
7+
#
8+
# Note: see test "ph" for the newer and
9+
# simpler method to handle common misspellings,
10+
# for example, alot->a lot, inspite->in spite,
11+
# (that is giving the best suggestion, and limiting
12+
# ngram/phonetic suggestion)
13+
14+
# switch off ngram suggestion for testing
15+
MAXNGRAMSUGS 0
16+
REP 2
17+
REP alot a_lot
18+
REP inspite in_spite
19+
KEY qwertzuiop|asdfghjkl|yxcvbnm|aq
20+
WORDCHARS .-
21+
FORBIDDENWORD ?
22+
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
13
2+
NASA
3+
Gandhi
4+
grateful
5+
permanent
6+
vacation
7+
a
8+
lot
9+
have
10+
which
11+
McDonald
12+
permanent-vacation/?
13+
in
14+
spite
15+
inspire
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
NASA
2+
Gandhi
3+
grateful
4+
permanent
5+
vacation
6+
a lot, lot
7+
in spite, inspire
8+
permanent. Vacation
9+
have
10+
which
11+
Gandhi
12+
McDonald
13+
permanent
14+
15+
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
nasa
2+
Ghandi
3+
greatful
4+
permenant
5+
vacacation
6+
alot
7+
inspite
8+
permanent.Vacation
9+
ahev
10+
hwihc
11+
GAndhi
12+
Mcdonald
13+
permqnent
14+
permanent-vacation
15+
permqnent-vacation

0 commit comments

Comments
 (0)