apache
diff --git a/‎lucene/CHANGES.txt‎
Lines changed: 2 additions & 2 deletions b/‎lucene/CHANGES.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java‎
Lines changed: 105 additions & 0 deletions b/‎lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java‎
Lines changed: 70 additions & 11 deletions b/‎lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java‎
Lines changed: 70 additions & 11 deletions
diff --git a/‎lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java‎
Lines changed: 82 additions & 5 deletions b/‎lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java‎
Lines changed: 82 additions & 5 deletions
@@ -86,8 +86,8 @@ API Changes
 
 Improvements
 
-* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and
-  BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
+* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
+  BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)
 
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
   (Dawid Weiss)
 
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.List;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
+/**
+ * A single COMPOUNDRULE pattern. The rule is kept as a flat array of flag characters in which a
+ * flag may be immediately followed by a {@code ?} or {@code *} quantifier character.
+ */
+class CompoundRule {
+  private final char[] data;
+  private final Dictionary dictionary;
+
+  /**
+   * Parses the textual form of a rule. Text outside parentheses and the text inside each {@code
+   * (...)} group are handed to the dictionary's flag parsing strategy; a {@code ?} or {@code *}
+   * that directly follows a closing parenthesis is copied through as is.
+   *
+   * @param rule the rule text
+   * @param dictionary the dictionary whose flag parsing strategy and flag lookup are used
+   * @throws IllegalArgumentException if an opening parenthesis has no matching closing one
+   */
+  CompoundRule(String rule, Dictionary dictionary) {
+    this.dictionary = dictionary;
+    StringBuilder flags = new StringBuilder();
+    int end = rule.length();
+    int cursor = 0;
+    while (cursor < end) {
+      int open = rule.indexOf('(', cursor);
+      // everything up to the next group (or up to the end of the rule) is parsed in one go
+      String plain = open < 0 ? rule.substring(cursor) : rule.substring(cursor, open);
+      flags.append(dictionary.flagParsingStrategy.parseFlags(plain));
+      if (open < 0) {
+        break;
+      }
+
+      int close = rule.indexOf(')', open + 1);
+      if (close < 0) {
+        throw new IllegalArgumentException("Unmatched parentheses: " + rule);
+      }
+      flags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(open + 1, close)));
+
+      cursor = close + 1;
+      if (cursor < end) {
+        char next = rule.charAt(cursor);
+        if (next == '?' || next == '*') {
+          flags.append(next);
+          cursor++;
+        }
+      }
+    }
+    data = flags.toString().toCharArray();
+  }
+
+  /**
+   * Lenient check: succeeds as soon as the words run out while the pattern has matched so far, so
+   * the given words may be just the beginning of a compound.
+   */
+  boolean mayMatch(List<IntsRef> words, BytesRef scratch) {
+    return match(words, 0, 0, scratch, false);
+  }
+
+  /** Strict check: the words and the pattern must be consumed together. */
+  boolean fullyMatches(List<IntsRef> words, BytesRef scratch) {
+    return match(words, 0, 0, scratch, true);
+  }
+
+  /**
+   * Recursive backtracking matcher.
+   *
+   * @param words entries to test with {@code Dictionary.hasFlag}, one per compound part
+   * @param patternIndex position in the parsed pattern to continue from
+   * @param wordIndex position in {@code words} to continue from
+   * @param scratch buffer passed through to the flag lookup
+   * @param fully when false, running out of words before the pattern ends counts as a match
+   */
+  private boolean match(
+      List<IntsRef> words, int patternIndex, int wordIndex, BytesRef scratch, boolean fully) {
+    int wordCount = words.size();
+    if (patternIndex >= data.length) {
+      return wordIndex >= wordCount;
+    }
+    if (!fully && wordIndex >= wordCount) {
+      return true;
+    }
+
+    char flag = data[patternIndex];
+    boolean hasNext = patternIndex < data.length - 1;
+    int afterQuantifier = patternIndex + 2;
+
+    if (hasNext && data[patternIndex + 1] == '*') {
+      // consume as many words as possible, then give them back one at a time
+      int longest = wordIndex;
+      while (longest < wordCount && dictionary.hasFlag(words.get(longest), flag, scratch)) {
+        longest++;
+      }
+      for (int next = longest; next >= wordIndex; next--) {
+        if (match(words, afterQuantifier, next, scratch, fully)) {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    boolean currentWordMatches =
+        wordIndex < wordCount && dictionary.hasFlag(words.get(wordIndex), flag, scratch);
+
+    if (hasNext && data[patternIndex + 1] == '?') {
+      // first try with the optional word present, then without it
+      return (currentWordMatches && match(words, afterQuantifier, wordIndex + 1, scratch, fully))
+          || match(words, afterQuantifier, wordIndex, scratch, fully);
+    }
+
+    if (!currentWordMatches) {
+      return false;
+    }
+    return match(words, patternIndex + 1, wordIndex + 1, scratch, fully);
+  }
+
+  /** Returns the parsed pattern (flag and quantifier characters) as a string. */
+  @Override
+  public String toString() {
+    return String.valueOf(data);
+  }
+}
@@ -92,6 +92,8 @@ public class Dictionary {
   private static final String LANG_KEY = "LANG";
   private static final String BREAK_KEY = "BREAK";
   private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
+  private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
+  private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
   private static final String KEEPCASE_KEY = "KEEPCASE";
   private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
   private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
@@ -136,7 +138,7 @@ public class Dictionary {
   static final int AFFIX_APPEND = 3;
 
   // Default flag parsing strategy
-  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
+  FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
 
   // AF entries
   private String[] aliases;
@@ -163,6 +165,8 @@ public class Dictionary {
   int needaffix = -1; // needaffix flag, or -1 if one is not defined
   int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
   int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
+  int compoundMin = 3;
+  List<CompoundRule> compoundRules; // nullable
 
   // ignored characters (dictionary, affix, inputs)
   private char[] ignore;
@@ -419,6 +423,18 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
           throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
         }
         forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
+      } else if (line.startsWith(COMPOUNDMIN_KEY)) {
+        String[] parts = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
+        }
+        compoundMin = Math.max(1, Integer.parseInt(parts[1]));
+      } else if (line.startsWith(COMPOUNDRULE_KEY)) {
+        String[] parts = line.split("\\s+");
+        if (parts.length != 2) {
+          throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
+        }
+        this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
       }
     }
 
@@ -442,6 +458,21 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
     stripOffsets[currentIndex] = currentOffset;
   }
 
+  /**
+   * Reads the {@code num} COMPOUNDRULE lines that follow the COMPOUNDRULE header and parses each
+   * of them into a {@link CompoundRule}.
+   *
+   * @param reader affix file reader; the next line read is expected to be the first rule
+   * @param num number of rule lines announced by the header
+   * @return the parsed rules, in file order
+   * @throws IOException if reading from {@code reader} fails
+   * @throws ParseException if the file ends before all rules were read, or a line is not of the
+   *     form {@code COMPOUNDRULE <rule>}
+   */
+  private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
+      throws IOException, ParseException {
+    List<CompoundRule> compoundRules = new ArrayList<>();
+    for (int i = 0; i < num; i++) {
+      String line = reader.readLine();
+      if (line == null) {
+        // readLine() returns null at end of stream: the header announced more rules than the
+        // file contains. Report it as a parse error instead of failing with an NPE below.
+        throw new ParseException(
+            "COMPOUNDRULE rule expected, but the end of file was reached", reader.getLineNumber());
+      }
+      String[] parts = line.split("\\s+");
+      if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
+        throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
+      }
+      compoundRules.add(new CompoundRule(parts[1], this));
+    }
+    return compoundRules;
+  }
+
   private Breaks parseBreaks(LineNumberReader reader, String line)
       throws IOException, ParseException {
     Set<String> starting = new LinkedHashSet<>();
@@ -910,7 +941,7 @@ private void addHiddenCapitalizedWord(
       reuse.append(caseFold(word.charAt(i)));
     }
     reuse.append(FLAG_SEPARATOR);
-    reuse.append(HIDDEN_FLAG);
+    flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse);
     reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
     writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
   }
@@ -1188,16 +1219,19 @@ private String parseStemException(String morphData) {
     return null;
   }
 
-  boolean isForbiddenWord(char[] word, BytesRef scratch) {
+  boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
     if (forbiddenword != -1) {
-      IntsRef forms = lookupWord(word, 0, word.length);
-      if (forms != null) {
-        int formStep = formStep();
-        for (int i = 0; i < forms.length; i += formStep) {
-          if (hasFlag(forms.ints[forms.offset + i], (char) forbiddenword, scratch)) {
-            return true;
-          }
-        }
+      IntsRef forms = lookupWord(word, 0, length);
+      return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
+    }
+    return false;
+  }
+
+  boolean hasFlag(IntsRef forms, char flag, BytesRef scratch) {
+    int formStep = formStep();
+    for (int i = 0; i < forms.length; i += formStep) {
+      if (hasFlag(forms.ints[forms.offset + i], flag, scratch)) {
+        return true;
       }
     }
     return false;
@@ -1227,6 +1261,8 @@ char parseFlag(String rawFlag) {
      * @return Parsed flags
      */
     abstract char[] parseFlags(String rawFlags);
+
+    /**
+     * Appends the textual form of the given flag to the builder.
+     *
+     * @param flag the flag to write
+     * @param to the builder that receives the flag's text
+     */
+    abstract void appendFlag(char flag, StringBuilder to);
   }
 
   /**
@@ -1238,6 +1274,11 @@ private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
     public char[] parseFlags(String rawFlags) {
       return rawFlags.toCharArray();
     }
+
+    /** Appends {@code flag} to the builder as a single character. */
+    @Override
+    void appendFlag(char flag, StringBuilder to) {
+      to.append(flag);
+    }
   }
 
   /**
@@ -1266,6 +1307,14 @@ public char[] parseFlags(String rawFlags) {
       }
       return flags;
     }
+
+    /**
+     * Appends the decimal value of {@code flag}, preceded by a comma whenever the builder already
+     * holds some text.
+     */
+    @Override
+    void appendFlag(char flag, StringBuilder to) {
+      boolean needsSeparator = to.length() != 0;
+      if (needsSeparator) {
+        to.append(",");
+      }
+      int numericFlag = flag;
+      to.append(numericFlag);
+    }
   }
 
   /**
@@ -1300,6 +1349,16 @@ public char[] parseFlags(String rawFlags) {
       builder.getChars(0, builder.length(), flags, 0);
       return flags;
     }
+
+    /** Appends {@code flag} as two characters: its upper 8 bits first, then its lower 8 bits. */
+    @Override
+    void appendFlag(char flag, StringBuilder to) {
+      char high = (char) (flag >> 8);
+      char low = (char) (flag & 0xff);
+      to.append(high).append(low);
+    }
+  }
+
+  /** Returns true if compound rules have been set, i.e. {@code compoundRules} is not null. */
+  boolean hasCompounding() {
+    return compoundRules != null;
   }
 
   boolean hasFlag(int entryId, char flag, BytesRef scratch) {
 
@@ -16,7 +16,10 @@
  */
 package org.apache.lucene.analysis.hunspell;
 
+import java.util.ArrayList;
+import java.util.List;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
 
 /**
  * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
@@ -37,26 +40,100 @@ public SpellChecker(Dictionary dictionary) {
   public boolean spell(String word) {
     if (word.isEmpty()) return true;
 
-    char[] wordChars = word.toCharArray();
-    if (dictionary.isForbiddenWord(wordChars, scratch)) {
-      return false;
+    if (dictionary.needsInputCleaning) {
+      word = dictionary.cleanInput(word, new StringBuilder()).toString();
     }
 
     if (isNumber(word)) {
       return true;
     }
 
-    if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
+    char[] wordChars = word.toCharArray();
+    if (checkWord(wordChars, wordChars.length, false)) {
       return true;
     }
 
-    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
+    WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
+    if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
+      return true;
+    }
+
+    if (dictionary.breaks.isNotEmpty()
+        && !hasTooManyBreakOccurrences(word)
+        && !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
       return tryBreaks(word);
     }
 
     return false;
   }
 
+  /**
+   * Checks re-cased variants of the word. For an all-upper-case word the title-case fold is checked
+   * first; after that the lower-case fold of the current variant is checked.
+   *
+   * @param wordChars the word as given
+   * @param wordCase the detected case of {@code wordChars}
+   * @return true if any of the checked variants is accepted by {@code checkWord}
+   */
+  private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
+    int length = wordChars.length;
+    char[] variant = wordChars;
+    if (wordCase == WordCase.UPPER) {
+      variant = stemmer.caseFoldTitle(wordChars, length);
+      if (checkWord(variant, length, true)) {
+        return true;
+      }
+    }
+    char[] lowered = stemmer.caseFoldLower(variant, length);
+    return checkWord(lowered, length, true);
+  }
+
+  /**
+   * Checks one concrete spelling of a word: forbidden words are rejected outright; otherwise the
+   * word is accepted if stemming yields any result or, when the dictionary has compound rules, if
+   * it can be split into a valid compound.
+   *
+   * @param wordChars buffer holding the word
+   * @param length number of chars of {@code wordChars} that make up the word
+   * @param caseVariant passed through to the stemmer
+   */
+  private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
+    if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
+      return false;
+    }
+
+    boolean stemmed = !stemmer.doStem(wordChars, length, caseVariant).isEmpty();
+    if (stemmed) {
+      return true;
+    }
+
+    return dictionary.hasCompounding() && checkCompounds(wordChars, 0, length, new ArrayList<>());
+  }
+
+  /**
+   * Tries to split {@code wordChars[offset, offset + length)} into dictionary words that together
+   * satisfy a compound rule. Each candidate first part is looked up, added to {@code words}, and
+   * kept only if at least one rule may still match; the remainder is then tried as the last part
+   * and, failing that, split further recursively.
+   *
+   * @param words the parts found so far; a rejected part is removed again before returning
+   * @return true if a split accepted by some compound rule was found
+   */
+  private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
+    // gives up once 100 parts have been collected
+    if (words.size() >= 100) {
+      return false;
+    }
+
+    int minPartLength = dictionary.compoundMin;
+    int limit = length - minPartLength + 1;
+    for (int breakPos = minPartLength; breakPos < limit; breakPos++) {
+      IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
+      if (forms == null) {
+        continue;
+      }
+
+      words.add(forms);
+
+      boolean somethingMayMatch = false;
+      if (dictionary.compoundRules != null) {
+        for (CompoundRule rule : dictionary.compoundRules) {
+          if (rule.mayMatch(words, scratch)) {
+            somethingMayMatch = true;
+            break;
+          }
+        }
+      }
+
+      if (somethingMayMatch) {
+        int restStart = offset + breakPos;
+        int restLength = length - breakPos;
+        if (checkLastCompoundPart(wordChars, restStart, restLength, words)
+            || checkCompounds(wordChars, restStart, restLength, words)) {
+          return true;
+        }
+      }
+
+      words.remove(words.size() - 1);
+    }
+
+    return false;
+  }
+
+  /**
+   * Checks whether {@code wordChars[start, start + length)} is a dictionary word that, appended to
+   * {@code words}, makes some compound rule match completely. {@code words} is left as it was
+   * passed in.
+   */
+  private boolean checkLastCompoundPart(
+      char[] wordChars, int start, int length, List<IntsRef> words) {
+    IntsRef forms = dictionary.lookupWord(wordChars, start, length);
+    if (forms == null) {
+      return false;
+    }
+
+    words.add(forms);
+    boolean matched = false;
+    if (dictionary.compoundRules != null) {
+      for (CompoundRule rule : dictionary.compoundRules) {
+        if (rule.fullyMatches(words, scratch)) {
+          matched = true;
+          break;
+        }
+      }
+    }
+    // undo the temporary addition so the caller's list is unchanged
+    words.remove(words.size() - 1);
+    return matched;
+  }
+
   private static boolean isNumber(String s) {
     int i = 0;
     while (i < s.length()) {