Skip to content

Commit 04167b2

Browse files
authored
LUCENE-9726: Hunspell: speed up spellchecking by stopping at a single… (#2295)
1 parent e2cf6ee commit 04167b2

File tree

3 files changed

+170
-131
lines changed

3 files changed

+170
-131
lines changed

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
*/
1717
package org.apache.lucene.analysis.hunspell;
1818

19-
import java.util.List;
2019
import org.apache.lucene.util.BytesRef;
2120
import org.apache.lucene.util.CharsRef;
2221
import org.apache.lucene.util.IntsRef;
@@ -58,29 +57,28 @@ public String toString() {
5857
}
5958

6059
boolean prohibitsCompounding(
61-
CharsRef word, int breakPos, List<CharsRef> stemsBefore, List<CharsRef> stemsAfter) {
60+
CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) {
6261
if (isNonAffixedPattern(endChars)) {
63-
if (stemsBefore.stream()
64-
.noneMatch(stem -> charsMatch(word, breakPos - stem.length, stem.chars))) {
62+
if (!charsMatch(word, breakPos - stemBefore.length, stemBefore.chars)) {
6563
return false;
6664
}
6765
} else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
6866
return false;
6967
}
7068

7169
if (isNonAffixedPattern(beginChars)) {
72-
if (stemsAfter.stream().noneMatch(stem -> charsMatch(word, breakPos, stem.chars))) {
70+
if (!charsMatch(word, breakPos, stemAfter.chars)) {
7371
return false;
7472
}
7573
} else if (!charsMatch(word, breakPos, beginChars)) {
7674
return false;
7775
}
7876

79-
if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) {
77+
if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) {
8078
return false;
8179
}
8280
//noinspection RedundantIfStatement
83-
if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) {
81+
if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) {
8482
return false;
8583
}
8684

@@ -91,14 +89,9 @@ private static boolean isNonAffixedPattern(char[] pattern) {
9189
return pattern.length == 1 && pattern[0] == '0';
9290
}
9391

94-
private boolean hasStemWithFlags(List<CharsRef> stems, char[] flags) {
95-
for (CharsRef stem : stems) {
96-
IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
97-
if (forms != null && hasAllFlags(flags, forms)) {
98-
return true;
99-
}
100-
}
101-
return false;
92+
private boolean stemHasFlags(CharsRef stem, char[] flags) {
93+
IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
94+
return forms != null && hasAllFlags(flags, forms);
10295
}
10396

10497
private boolean hasAllFlags(char[] flags, IntsRef forms) {

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java

Lines changed: 39 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
140140
return false;
141141
}
142142

143-
if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) {
143+
if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) {
144144
return true;
145145
}
146146

@@ -156,25 +156,40 @@ && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
156156
return false;
157157
}
158158

159+
private CharsRef findStem(
160+
char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
161+
CharsRef[] result = {null};
162+
stemmer.doStem(
163+
wordChars,
164+
offset,
165+
length,
166+
originalCase,
167+
context,
168+
(stem, forms, formID) -> {
169+
result[0] = stem;
170+
return false;
171+
});
172+
return result[0];
173+
}
174+
159175
private boolean checkCompounds(
160-
CharsRef word, WordCase originalCase, int depth, Predicate<List<CharsRef>> checkPatterns) {
176+
CharsRef word, WordCase originalCase, int depth, Predicate<CharsRef> checkPatterns) {
161177
if (depth > dictionary.compoundMax - 2) return false;
162178

163179
int limit = word.length - dictionary.compoundMin + 1;
164180
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
165181
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
166182
int breakOffset = word.offset + breakPos;
167183
if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
168-
List<CharsRef> stems =
169-
stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context);
170-
if (stems.isEmpty()
184+
CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
185+
if (stem == null
171186
&& dictionary.simplifiedTriple
172187
&& word.chars[breakOffset - 1] == word.chars[breakOffset]) {
173-
stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context);
188+
stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
174189
}
175-
if (!stems.isEmpty() && checkPatterns.test(stems)) {
176-
Predicate<List<CharsRef>> nextCheck = checkNextPatterns(word, breakPos, stems);
177-
if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) {
190+
if (stem != null && checkPatterns.test(stem)) {
191+
Predicate<CharsRef> nextCheck = checkNextPatterns(word, breakPos, stem);
192+
if (checkCompoundsAfter(word, breakPos, originalCase, depth, stem, nextCheck)) {
178193
return true;
179194
}
180195
}
@@ -195,12 +210,11 @@ private boolean checkCompoundPatternReplacements(
195210
if (expanded != null) {
196211
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
197212
int breakPos = pos + pattern.endLength();
198-
List<CharsRef> stems =
199-
stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
200-
if (!stems.isEmpty()) {
201-
Predicate<List<CharsRef>> nextCheck =
202-
next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next);
203-
if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) {
213+
CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
214+
if (stem != null) {
215+
Predicate<CharsRef> nextCheck =
216+
next -> pattern.prohibitsCompounding(expanded, breakPos, stem, next);
217+
if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stem, nextCheck)) {
204218
return true;
205219
}
206220
}
@@ -209,28 +223,27 @@ private boolean checkCompoundPatternReplacements(
209223
return false;
210224
}
211225

212-
private Predicate<List<CharsRef>> checkNextPatterns(
213-
CharsRef word, int breakPos, List<CharsRef> stems) {
214-
return nextStems ->
226+
private Predicate<CharsRef> checkNextPatterns(CharsRef word, int breakPos, CharsRef stems) {
227+
return nextStem ->
215228
dictionary.checkCompoundPatterns.stream()
216-
.noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems));
229+
.noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStem));
217230
}
218231

219232
private boolean checkCompoundsAfter(
220233
CharsRef word,
221234
int breakPos,
222235
WordCase originalCase,
223236
int depth,
224-
List<CharsRef> prevStems,
225-
Predicate<List<CharsRef>> checkPatterns) {
237+
CharsRef prevStem,
238+
Predicate<CharsRef> checkPatterns) {
226239
int remainingLength = word.length - breakPos;
227240
int breakOffset = word.offset + breakPos;
228-
List<CharsRef> tailStems =
229-
stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
230-
if (!tailStems.isEmpty()
231-
&& !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems))
241+
CharsRef tailStem =
242+
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
243+
if (tailStem != null
244+
&& !(dictionary.checkCompoundDup && equalsIgnoreCase(prevStem, tailStem))
232245
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
233-
&& checkPatterns.test(tailStems)) {
246+
&& checkPatterns.test(tailStem)) {
234247
return true;
235248
}
236249

0 commit comments

Comments
 (0)