LUCENE-9698: Hunspell: reuse char[] when possible when stripping affix (#2243)

donnerpeter · web-flow · commit 695e789891ae · 2021-01-26T13:03:44.000+01:00
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -74,7 +74,7 @@ private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
       if (checkWord(caseVariant, wordChars.length, true)) {
         return true;
       }
-      char[] aposCase = stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
+      char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
       if (aposCase != null && checkWord(aposCase, aposCase.length, true)) {
         return true;
       }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -143,7 +143,7 @@ char[] caseFoldLower(char[] word, int length) {
 
   // Special prefix handling for Catalan, French, Italian:
   // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
-  char[] capitalizeAfterApostrophe(char[] word, int length) {
+  static char[] capitalizeAfterApostrophe(char[] word, int length) {
     for (int i = 1; i < length - 1; i++) {
       if (word[i] == '\'') {
         char next = word[i + 1];
@@ -175,11 +175,12 @@ List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
         if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
           continue;
         }
-        stems.add(newStem(word, length, forms, i));
+        stems.add(newStem(word, 0, length, forms, i));
       }
     }
     try {
-      stems.addAll(stem(word, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant));
+      stems.addAll(
+          stem(word, 0, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant));
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
@@ -214,7 +215,7 @@ public List<CharsRef> uniqueStems(char[] word, int length) {
     return deduped;
   }
 
-  private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
+  private CharsRef newStem(char[] buffer, int offset, int length, IntsRef forms, int formID) {
     final String exception;
     if (dictionary.hasStemExceptions) {
       int exceptionID = forms.ints[forms.offset + formID + 1];
@@ -232,7 +233,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
       if (exception != null) {
         scratchSegment.append(exception);
       } else {
-        scratchSegment.append(buffer, 0, length);
+        scratchSegment.append(buffer, offset, length);
       }
       try {
         Dictionary.applyMappings(dictionary.oconv, scratchSegment);
@@ -246,7 +247,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
       if (exception != null) {
         return new CharsRef(exception);
       } else {
-        return new CharsRef(buffer, 0, length);
+        return new CharsRef(buffer, offset, length);
       }
     }
   }
@@ -284,6 +285,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
    */
   private List<CharsRef> stem(
       char[] word,
+      int offset,
       int length,
       int previous,
       char prevFlag,
@@ -308,7 +310,7 @@ private List<CharsRef> stem(
       int limit = dictionary.fullStrip ? length + 1 : length;
       for (int i = 0; i < limit; i++) {
         if (i > 0) {
-          int ch = word[i - 1];
+          char ch = word[offset + i - 1];
           if (fst.findTargetArc(ch, arc, arc, prefixReader) == null) {
             break;
           } else if (arc.output() != NO_OUTPUT) {
@@ -327,15 +329,17 @@ private List<CharsRef> stem(
           }
 
           if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
-            char[] strippedWord = stripAffix(word, length, i, prefix, true);
+            char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
             if (strippedWord == null) {
               continue;
             }
 
+            boolean pureAffix = strippedWord == word;
             stems.addAll(
                 applyAffix(
                     strippedWord,
-                    strippedWord.length,
+                    pureAffix ? offset + i : 0,
+                    pureAffix ? length - i : strippedWord.length,
                     prefix,
                     -1,
                     recursionDepth,
@@ -356,7 +360,7 @@ private List<CharsRef> stem(
       int limit = dictionary.fullStrip ? 0 : 1;
       for (int i = length; i >= limit; i--) {
         if (i < length) {
-          int ch = word[i];
+          char ch = word[offset + i];
           if (fst.findTargetArc(ch, arc, arc, suffixReader) == null) {
             break;
           } else if (arc.output() != NO_OUTPUT) {
@@ -375,15 +379,17 @@ private List<CharsRef> stem(
           }
 
           if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
-            char[] strippedWord = stripAffix(word, length, length - i, suffix, false);
+            char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
             if (strippedWord == null) {
               continue;
             }
 
+            boolean pureAffix = strippedWord == word;
             stems.addAll(
                 applyAffix(
                     strippedWord,
-                    strippedWord.length,
+                    pureAffix ? offset : 0,
+                    pureAffix ? i : strippedWord.length,
                     suffix,
                     prefixId,
                     recursionDepth,
@@ -398,7 +404,13 @@ private List<CharsRef> stem(
     return stems;
   }
 
-  private char[] stripAffix(char[] word, int length, int affixLen, int affix, boolean isPrefix) {
+  /**
+   * @return null if the affix condition isn't met; a reference to the same char[] if the affix has
+   *     no strip data and can thus be simply removed; or a new char[] containing the word after
+   *     affix removal, with the strip data restored
+   */
+  private char[] stripAffix(
+      char[] word, int offset, int length, int affixLen, int affix, boolean isPrefix) {
     int deAffixedLen = length - affixLen;
 
     int stripOrd = dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD);
@@ -409,15 +421,22 @@ private char[] stripAffix(char[] word, int length, int affixLen, int affix, bool
     char[] stripData = dictionary.stripData;
     boolean condition =
         isPrefix
-            ? checkCondition(affix, stripData, stripStart, stripLen, word, affixLen, deAffixedLen)
-            : checkCondition(affix, word, 0, deAffixedLen, stripData, stripStart, stripLen);
+            ? checkCondition(
+                affix, stripData, stripStart, stripLen, word, offset + affixLen, deAffixedLen)
+            : checkCondition(affix, word, offset, deAffixedLen, stripData, stripStart, stripLen);
     if (!condition) {
       return null;
     }
 
+    if (stripLen == 0) return word;
+
     char[] strippedWord = new char[stripLen + deAffixedLen];
     System.arraycopy(
-        word, isPrefix ? affixLen : 0, strippedWord, isPrefix ? stripLen : 0, deAffixedLen);
+        word,
+        offset + (isPrefix ? affixLen : 0),
+        strippedWord,
+        isPrefix ? stripLen : 0,
+        deAffixedLen);
     System.arraycopy(stripData, stripStart, strippedWord, isPrefix ? 0 : deAffixedLen, stripLen);
     return strippedWord;
   }
@@ -484,6 +503,7 @@ private boolean checkCondition(
    */
   private List<CharsRef> applyAffix(
       char[] strippedWord,
+      int offset,
       int length,
       int affix,
       int prefixId,
@@ -496,7 +516,7 @@ private List<CharsRef> applyAffix(
 
     List<CharsRef> stems = new ArrayList<>();
 
-    IntsRef forms = dictionary.lookupWord(strippedWord, 0, length);
+    IntsRef forms = dictionary.lookupWord(strippedWord, offset, length);
     if (forms != null) {
       for (int i = 0; i < forms.length; i += formStep) {
         char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
@@ -530,7 +550,7 @@ private List<CharsRef> applyAffix(
           if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
             continue;
           }
-          stems.add(newStem(strippedWord, length, forms, i));
+          stems.add(newStem(strippedWord, offset, length, forms, i));
         }
       }
     }
@@ -572,6 +592,7 @@ private List<CharsRef> applyAffix(
       stems.addAll(
           stem(
               strippedWord,
+              offset,
               length,
               affix,
               flag,

Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,7 @@ private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {`
`74`	`74`	`if (checkWord(caseVariant, wordChars.length, true)) {`
`75`	`75`	`return true;`
`76`	`76`	`}`
`77`		`- char[] aposCase = stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);`
	`77`	`+ char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);`
`78`	`78`	`if (aposCase != null && checkWord(aposCase, aposCase.length, true)) {`
`79`	`79`	`return true;`
`80`	`80`	`}`