LUCENE-9799: Hunspell: don't check second-level affixes when the first level isn't a continuation (#2413)

donnerpeter · web-flow · commit f783848e710f · 2021-02-22T05:35:36.000-05:00
* LUCENE-9799: Hunspell: don't check second-level affixes when the first level isn't a continuation

* check more words in TestPerformance
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -37,10 +37,10 @@
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
@@ -78,9 +78,6 @@ public class Dictionary {
   private static final int DEFAULT_FLAGS = 65510;
   static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
 
-  // TODO: really for suffixes we should reverse the automaton and run them backwards
-  private static final String PREFIX_CONDITION_REGEX = "%s.*";
-  private static final String SUFFIX_CONDITION_REGEX = ".*%s";
   static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
   CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
 
@@ -147,8 +144,12 @@ public class Dictionary {
   boolean ignoreCase;
   boolean checkSharpS;
   boolean complexPrefixes;
-  // if no affixes have continuation classes, no need to do 2-level affix stripping
-  boolean twoStageAffix;
+
+  /**
+   * All flags used in affix continuation classes. If an outer affix's flag isn't here, there's no
+   * need to do 2-level affix stripping with it.
+   */
+  private char[] secondStageAffixFlags;
 
   char circumfix;
   char keepcase, forceUCase;
@@ -332,6 +333,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
       throws IOException, ParseException {
     TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
     TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
+    Set<Character> stage2Flags = new HashSet<>();
     Map<String, Integer> seenPatterns = new HashMap<>();
 
     // zero condition -> 0 ord
@@ -359,9 +361,9 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
       } else if ("AM".equals(firstWord)) {
         parseMorphAlias(line);
       } else if ("PFX".equals(firstWord)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX, seenPatterns, seenStrips, flags);
+        parseAffix(prefixes, stage2Flags, line, reader, false, seenPatterns, seenStrips, flags);
       } else if ("SFX".equals(firstWord)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX, seenPatterns, seenStrips, flags);
+        parseAffix(suffixes, stage2Flags, line, reader, true, seenPatterns, seenStrips, flags);
       } else if (line.equals("COMPLEXPREFIXES")) {
         complexPrefixes =
             true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@@ -476,6 +478,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
 
     this.prefixes = affixFST(prefixes);
     this.suffixes = affixFST(suffixes);
+    secondStageAffixFlags = toSortedCharArray(stage2Flags);
 
     int totalChars = 0;
     for (String strip : seenStrips.keySet()) {
@@ -675,16 +678,15 @@ static String escapeDash(String re) {
    * @param affixes Map where the result of the parsing will be put
    * @param header Header line of the affix rule
    * @param reader BufferedReader to read the content of the rule from
-   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate
-   *     the condition regex pattern
    * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
   private void parseAffix(
       TreeMap<String, List<Integer>> affixes,
+      Set<Character> secondStageFlags,
       String header,
       LineNumberReader reader,
-      String conditionPattern,
+      boolean isSuffix,
       Map<String, Integer> seenPatterns,
       Map<String, Integer> seenStrips,
       FlagEnumerator flags)
@@ -694,7 +696,6 @@ private void parseAffix(
     String[] args = header.split("\\s+");
 
     boolean crossProduct = args[2].equals("Y");
-    boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX);
 
     int numLines;
     try {
@@ -725,7 +726,9 @@ private void parseAffix(
         }
 
         appendFlags = flagParsingStrategy.parseFlags(flagPart);
-        twoStageAffix = true;
+        for (char appendFlag : appendFlags) {
+          secondStageFlags.add(appendFlag);
+        }
       }
       // zero affix -> empty string
       if ("0".equals(affixArg)) {
@@ -750,7 +753,8 @@ private void parseAffix(
         // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
         // but this is complicated...
       } else {
-        regex = String.format(Locale.ROOT, conditionPattern, condition);
+        // TODO: really for suffixes we should reverse the automaton and run them backwards
+        regex = isSuffix ? ".*" + condition : condition + ".*";
       }
 
       // deduplicate patterns
@@ -1610,6 +1614,20 @@ CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
     return reuse;
   }
 
+  private static char[] toSortedCharArray(Set<Character> set) {
+    char[] chars = new char[set.size()];
+    int i = 0;
+    for (Character c : set) {
+      chars[i++] = c;
+    }
+    Arrays.sort(chars);
+    return chars;
+  }
+
+  boolean isSecondStageAffix(char flag) {
+    return Arrays.binarySearch(secondStageAffixFlags, flag) >= 0;
+  }
+
   /** folds single character (according to LANG if present) */
   char caseFold(char c) {
     if (alternateCasing) {
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -648,11 +648,11 @@ private boolean applyAffix(
       if (recursionDepth == 0) {
         if (prefix) {
           prefixId = affix;
-          doPrefix = dictionary.complexPrefixes && dictionary.twoStageAffix;
+          doPrefix = dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag);
           // we took away the first prefix.
           // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix
           // COMPLEXPREFIXES = false: combine with a suffix
-        } else if (!dictionary.complexPrefixes && dictionary.twoStageAffix) {
+        } else if (!dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag)) {
           doPrefix = false;
           // we took away a suffix.
           // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
@@ -665,7 +665,7 @@ private boolean applyAffix(
         if (prefix && dictionary.complexPrefixes) {
           prefixId = affix;
           // we took away the second prefix: go look for another suffix
-        } else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
+        } else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageAffix(flag)) {
           return true;
         }
         // we took away a prefix, then a suffix: go look for another suffix
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -66,7 +66,7 @@ public void en_suggest() throws Exception {
 
   @Test
   public void de() throws Exception {
-    checkAnalysisPerformance("de", 200_000);
+    checkAnalysisPerformance("de", 300_000);
   }
 
   @Test
@@ -76,7 +76,7 @@ public void de_suggest() throws Exception {
 
   @Test
   public void fr() throws Exception {
-    checkAnalysisPerformance("fr", 40_000);
+    checkAnalysisPerformance("fr", 80_000);
   }
 
   @Test

Original file line number	Diff line number	Diff line change
`@@ -66,7 +66,7 @@ public void en_suggest() throws Exception {`
`66`	`66`
`67`	`67`	`@Test`
`68`	`68`	`public void de() throws Exception {`
`69`		`- checkAnalysisPerformance("de", 200_000);`
	`69`	`+ checkAnalysisPerformance("de", 300_000);`
`70`	`70`	`}`
`71`	`71`
`72`	`72`	`@Test`
`@@ -76,7 +76,7 @@ public void de_suggest() throws Exception {`
`76`	`76`
`77`	`77`	`@Test`
`78`	`78`	`public void fr() throws Exception {`
`79`		`- checkAnalysisPerformance("fr", 40_000);`
	`79`	`+ checkAnalysisPerformance("fr", 80_000);`
`80`	`80`	`}`
`81`	`81`
`82`	`82`	`@Test`