LUCENE-9726: Hunspell: speed up spellchecking by stopping at a single… (#2295)

donnerpeter · web-flow · commit 04167b27f5f0 · 2021-02-04T09:13:11.000+01:00
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java
@@ -16,7 +16,6 @@
  */
 package org.apache.lucene.analysis.hunspell;
 
-import java.util.List;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -58,29 +57,28 @@ public String toString() {
   }
 
   boolean prohibitsCompounding(
-      CharsRef word, int breakPos, List<CharsRef> stemsBefore, List<CharsRef> stemsAfter) {
+      CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) {
     if (isNonAffixedPattern(endChars)) {
-      if (stemsBefore.stream()
-          .noneMatch(stem -> charsMatch(word, breakPos - stem.length, stem.chars))) {
+      if (!charsMatch(word, breakPos - stemBefore.length, stemBefore.chars)) {
         return false;
       }
     } else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
       return false;
     }
 
     if (isNonAffixedPattern(beginChars)) {
-      if (stemsAfter.stream().noneMatch(stem -> charsMatch(word, breakPos, stem.chars))) {
+      if (!charsMatch(word, breakPos, stemAfter.chars)) {
         return false;
       }
     } else if (!charsMatch(word, breakPos, beginChars)) {
       return false;
     }
 
-    if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) {
+    if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) {
       return false;
     }
     //noinspection RedundantIfStatement
-    if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) {
+    if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) {
       return false;
     }
 
@@ -91,14 +89,9 @@ private static boolean isNonAffixedPattern(char[] pattern) {
     return pattern.length == 1 && pattern[0] == '0';
   }
 
-  private boolean hasStemWithFlags(List<CharsRef> stems, char[] flags) {
-    for (CharsRef stem : stems) {
-      IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
-      if (forms != null && hasAllFlags(flags, forms)) {
-        return true;
-      }
-    }
-    return false;
+  private boolean stemHasFlags(CharsRef stem, char[] flags) {
+    IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
+    return forms != null && hasAllFlags(flags, forms); // a null lookup result yields false
   }
 
   private boolean hasAllFlags(char[] flags, IntsRef forms) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -140,7 +140,7 @@ private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
       return false;
     }
 
-    if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) {
+    if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) {
       return true;
     }
 
@@ -156,25 +156,40 @@ && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
     return false;
   }
 
+  private CharsRef findStem(
+      char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
+    CharsRef[] result = {null}; // one-slot holder that the callback below can assign into
+    stemmer.doStem(
+        wordChars,
+        offset,
+        length,
+        originalCase,
+        context,
+        (stem, forms, formID) -> {
+          result[0] = stem; // keep the stem handed to the callback
+          return false; // presumably tells doStem to stop searching; confirm in Stemmer
+        });
+    return result[0]; // still null if the callback never ran
+  }
+
   private boolean checkCompounds(
-      CharsRef word, WordCase originalCase, int depth, Predicate<List<CharsRef>> checkPatterns) {
+      CharsRef word, WordCase originalCase, int depth, Predicate<CharsRef> checkPatterns) {
     if (depth > dictionary.compoundMax - 2) return false;
 
     int limit = word.length - dictionary.compoundMin + 1;
     for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
       WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
       int breakOffset = word.offset + breakPos;
       if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
-        List<CharsRef> stems =
-            stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context);
-        if (stems.isEmpty()
+        CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
+        if (stem == null
             && dictionary.simplifiedTriple
             && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
-          stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context);
+          stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
         }
-        if (!stems.isEmpty() && checkPatterns.test(stems)) {
-          Predicate<List<CharsRef>> nextCheck = checkNextPatterns(word, breakPos, stems);
-          if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) {
+        if (stem != null && checkPatterns.test(stem)) {
+          Predicate<CharsRef> nextCheck = checkNextPatterns(word, breakPos, stem);
+          if (checkCompoundsAfter(word, breakPos, originalCase, depth, stem, nextCheck)) {
             return true;
           }
         }
@@ -195,12 +210,11 @@ private boolean checkCompoundPatternReplacements(
       if (expanded != null) {
         WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
         int breakPos = pos + pattern.endLength();
-        List<CharsRef> stems =
-            stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
-        if (!stems.isEmpty()) {
-          Predicate<List<CharsRef>> nextCheck =
-              next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next);
-          if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) {
+        CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
+        if (stem != null) {
+          Predicate<CharsRef> nextCheck =
+              next -> pattern.prohibitsCompounding(expanded, breakPos, stem, next);
+          if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stem, nextCheck)) {
             return true;
           }
         }
@@ -209,28 +223,27 @@ private boolean checkCompoundPatternReplacements(
     return false;
   }
 
-  private Predicate<List<CharsRef>> checkNextPatterns(
-      CharsRef word, int breakPos, List<CharsRef> stems) {
-    return nextStems ->
+  private Predicate<CharsRef> checkNextPatterns(CharsRef word, int breakPos, CharsRef prevStem) {
+    return nextStem -> // accepted only when no compound pattern prohibits the pair
         dictionary.checkCompoundPatterns.stream()
-            .noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems));
+            .noneMatch(p -> p.prohibitsCompounding(word, breakPos, prevStem, nextStem));
   }
 
   private boolean checkCompoundsAfter(
       CharsRef word,
       int breakPos,
       WordCase originalCase,
       int depth,
-      List<CharsRef> prevStems,
-      Predicate<List<CharsRef>> checkPatterns) {
+      CharsRef prevStem,
+      Predicate<CharsRef> checkPatterns) {
     int remainingLength = word.length - breakPos;
     int breakOffset = word.offset + breakPos;
-    List<CharsRef> tailStems =
-        stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
-    if (!tailStems.isEmpty()
-        && !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems))
+    CharsRef tailStem =
+        findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
+    if (tailStem != null
+        && !(dictionary.checkCompoundDup && equalsIgnoreCase(prevStem, tailStem))
         && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
-        && checkPatterns.test(tailStems)) {
+        && checkPatterns.test(tailStem)) {
       return true;
     }
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java