LUCENE-9692: Hunspell: extract Stemmer.stripAffix from similar code in prefix/suffix processing (#2237)

donnerpeter · web-flow · commit e4ec3e391989 · 2021-01-25T09:11:11.000+01:00
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -26,7 +26,6 @@
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.Outputs;
 
 /**
  * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.
@@ -305,11 +304,10 @@ private List<CharsRef> stem(
 
     if (doPrefix && dictionary.prefixes != null) {
       FST<IntsRef> fst = dictionary.prefixes;
-      Outputs<IntsRef> outputs = fst.outputs;
       FST.BytesReader bytesReader = prefixReaders[recursionDepth];
       FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
       fst.getFirstArc(arc);
-      IntsRef NO_OUTPUT = outputs.getNoOutput();
+      IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
       IntsRef output = NO_OUTPUT;
       int limit = dictionary.fullStrip ? length + 1 : length;
       for (int i = 0; i < limit; i++) {
@@ -333,23 +331,12 @@ private List<CharsRef> stem(
           }
 
           if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
-            int deAffixedLength = length - i;
-
-            int stripOrd = dictionary.affixData(prefix, Dictionary.AFFIX_STRIP_ORD);
-            int stripStart = dictionary.stripOffsets[stripOrd];
-            int stripEnd = dictionary.stripOffsets[stripOrd + 1];
-            int stripLength = stripEnd - stripStart;
-
-            if (!checkCondition(
-                prefix, dictionary.stripData, stripStart, stripLength, word, i, deAffixedLength)) {
+            char[] strippedWord = stripAffix(word, length, i, prefix, true);
+            if (strippedWord == null) {
               continue;
             }
 
-            char[] strippedWord = new char[stripLength + deAffixedLength];
-            System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
-            System.arraycopy(word, i, strippedWord, stripLength, deAffixedLength);
-
-            List<CharsRef> stemList =
+            stems.addAll(
                 applyAffix(
                     strippedWord,
                     strippedWord.length,
@@ -358,21 +345,18 @@ private List<CharsRef> stem(
                     recursionDepth,
                     true,
                     circumfix,
-                    caseVariant);
-
-            stems.addAll(stemList);
+                    caseVariant));
           }
         }
       }
     }
 
     if (doSuffix && dictionary.suffixes != null) {
       FST<IntsRef> fst = dictionary.suffixes;
-      Outputs<IntsRef> outputs = fst.outputs;
       FST.BytesReader bytesReader = suffixReaders[recursionDepth];
       FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
       fst.getFirstArc(arc);
-      IntsRef NO_OUTPUT = outputs.getNoOutput();
+      IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
       IntsRef output = NO_OUTPUT;
       int limit = dictionary.fullStrip ? 0 : 1;
       for (int i = length; i >= limit; i--) {
@@ -396,25 +380,12 @@ private List<CharsRef> stem(
           }
 
           if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
-            int appendLength = length - i;
-            int deAffixedLength = length - appendLength;
-
-            int stripOrd = dictionary.affixData(suffix, Dictionary.AFFIX_STRIP_ORD);
-            int stripStart = dictionary.stripOffsets[stripOrd];
-            int stripEnd = dictionary.stripOffsets[stripOrd + 1];
-            int stripLength = stripEnd - stripStart;
-
-            if (!checkCondition(
-                suffix, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
+            char[] strippedWord = stripAffix(word, length, length - i, suffix, false);
+            if (strippedWord == null) {
               continue;
             }
 
-            char[] strippedWord = new char[stripLength + deAffixedLength];
-            System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
-            System.arraycopy(
-                dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
-
-            List<CharsRef> stemList =
+            stems.addAll(
                 applyAffix(
                     strippedWord,
                     strippedWord.length,
@@ -423,9 +394,7 @@ private List<CharsRef> stem(
                     recursionDepth,
                     false,
                     circumfix,
-                    caseVariant);
-
-            stems.addAll(stemList);
+                    caseVariant));
           }
         }
       }
@@ -434,6 +403,30 @@ private List<CharsRef> stem(
     return stems;
   }
 
+  private char[] stripAffix(char[] word, int length, int affixLen, int affix, boolean isPrefix) {
+    int deAffixedLen = length - affixLen; // chars of word left after dropping the affix
+    // The affix's strip chars are stripData[stripStart, stripEnd), located via stripOffsets.
+    int stripOrd = dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD);
+    int stripStart = dictionary.stripOffsets[stripOrd];
+    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
+    int stripLen = stripEnd - stripStart;
+    // checkCondition gets the two pieces in the same order they are laid out in the result below.
+    char[] stripData = dictionary.stripData;
+    boolean condition =
+        isPrefix
+            ? checkCondition(affix, stripData, stripStart, stripLen, word, affixLen, deAffixedLen)
+            : checkCondition(affix, word, 0, deAffixedLen, stripData, stripStart, stripLen);
+    if (!condition) {
+      return null; // condition failed; callers skip this affix when they see null
+    }
+    // Prefix: strip chars, then word[affixLen..]. Suffix: word[0, deAffixedLen), then strip chars.
+    char[] strippedWord = new char[stripLen + deAffixedLen];
+    System.arraycopy(
+        word, isPrefix ? affixLen : 0, strippedWord, isPrefix ? stripLen : 0, deAffixedLen);
+    System.arraycopy(stripData, stripStart, strippedWord, isPrefix ? 0 : deAffixedLen, stripLen);
+    return strippedWord; // the word with the affix removed and its strip chars put back
+  }
+
   private boolean isAffixCompatible(
       int affix, int prevFlag, int recursionDepth, boolean previousWasPrefix) {
     int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
@@ -495,9 +488,9 @@ private boolean checkCondition(
    * @param strippedWord Word the affix has been removed and the strip added
    * @param length valid length of stripped word
    * @param affix HunspellAffix representing the affix rule itself
-   * @param prefixId when we already stripped a prefix, we cant simply recurse and check the suffix,
-   *     unless both are compatible so we must check dictionary form against both to add it as a
-   *     stem!
+   * @param prefixId when we already stripped a prefix, we can't simply recurse and check the
+   *     suffix, unless both are compatible so we must check dictionary form against both to add it
+   *     as a stem!
    * @param recursionDepth current recursion depth
    * @param prefix true if we are removing a prefix (false if it's a suffix)
    * @return List of stems for the word, or an empty list if none are found