Commit d0ae2bd

LUCENE-9717: Hunspell: support CHECKCOMPOUNDPATTERN (#2280)
1 parent 6509a30 commit d0ae2bd
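
CHECKCOMPOUNDPATTERN is the Hunspell affix directive that forbids joining two words into a compound when the characters on either side of the joining point match a declared pattern. A minimal .aff sketch, using the patterns from the test data added in this commit:

    COMPOUNDFLAG A
    CHECKCOMPOUNDPATTERN 2
    CHECKCOMPOUNDPATTERN nny ny
    CHECKCOMPOUNDPATTERN ssz sz

With these rules, a compound whose first part ends in "nny" and whose second part begins with "ny" (or "ssz" followed by "sz") is rejected, even though both parts carry the compounding flag A.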

16 files changed: +286 / -20 lines
Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;

class CheckCompoundPattern {
  private final char[] endChars;
  private final char[] beginChars;
  private final char[] replacement;
  private final char[] endFlags;
  private final char[] beginFlags;
  private final Dictionary dictionary;
  private final BytesRef scratch = new BytesRef();

  CheckCompoundPattern(
      String unparsed, Dictionary.FlagParsingStrategy strategy, Dictionary dictionary) {
    this.dictionary = dictionary;
    String[] parts = unparsed.split("\\s+");
    if (parts.length < 3) {
      throw new IllegalArgumentException("Invalid pattern: " + unparsed);
    }

    int flagSep = parts[1].indexOf("/");
    endChars = (flagSep < 0 ? parts[1] : parts[1].substring(0, flagSep)).toCharArray();
    endFlags = flagSep < 0 ? new char[0] : strategy.parseFlags(parts[1].substring(flagSep + 1));

    flagSep = parts[2].indexOf("/");
    beginChars = (flagSep < 0 ? parts[2] : parts[2].substring(0, flagSep)).toCharArray();
    beginFlags = flagSep < 0 ? new char[0] : strategy.parseFlags(parts[2].substring(flagSep + 1));

    replacement = parts.length == 3 ? null : parts[3].toCharArray();
  }

  @Override
  public String toString() {
    return new String(endChars)
        + " "
        + new String(beginChars)
        + (replacement == null ? "" : " -> " + new String(replacement));
  }

  boolean prohibitsCompounding(
      CharsRef word, int breakPos, List<CharsRef> stemsBefore, List<CharsRef> stemsAfter) {
    if (isNonAffixedPattern(endChars)) {
      if (stemsBefore.stream()
          .noneMatch(stem -> charsMatch(word, breakPos - stem.length, stem.chars))) {
        return false;
      }
    } else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
      return false;
    }

    if (isNonAffixedPattern(beginChars)) {
      if (stemsAfter.stream().noneMatch(stem -> charsMatch(word, breakPos, stem.chars))) {
        return false;
      }
    } else if (!charsMatch(word, breakPos, beginChars)) {
      return false;
    }

    if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) {
      return false;
    }
    //noinspection RedundantIfStatement
    if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) {
      return false;
    }

    return true;
  }

  private static boolean isNonAffixedPattern(char[] pattern) {
    return pattern.length == 1 && pattern[0] == '0';
  }

  private boolean hasStemWithFlags(List<CharsRef> stems, char[] flags) {
    for (CharsRef stem : stems) {
      IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
      if (forms != null && hasAllFlags(flags, forms)) {
        return true;
      }
    }
    return false;
  }

  private boolean hasAllFlags(char[] flags, IntsRef forms) {
    for (char flag : flags) {
      if (!dictionary.hasFlag(forms, flag, scratch)) {
        return false;
      }
    }
    return true;
  }

  CharsRef expandReplacement(CharsRef word, int breakPos) {
    if (replacement != null && charsMatch(word, breakPos, replacement)) {
      return new CharsRef(
          word.subSequence(0, breakPos)
              + new String(endChars)
              + new String(beginChars)
              + word.subSequence(breakPos + replacement.length, word.length));
    }
    return null;
  }

  int endLength() {
    return endChars.length;
  }

  private static boolean charsMatch(CharsRef word, int offset, char[] pattern) {
    int len = pattern.length;
    if (word.length - offset < len || offset < 0 || offset > word.length) {
      return false;
    }

    for (int i = 0; i < len; i++) {
      if (word.chars[word.offset + offset + i] != pattern[i]) {
        return false;
      }
    }
    return true;
  }
}
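
A rough sketch of how one pattern line maps onto the fields of this class (the /X and /Y flag parts are hypothetical, added only to show where flags would go):

    CHECKCOMPOUNDPATTERN nny/X ny/Y
      endChars    = "nny", endFlags   = [X]   // must appear immediately before the break
      beginChars  = "ny",  beginFlags = [Y]   // must appear right at the break
      replacement = null                      // only set when a fourth field is given

A lone "0" in either position means "match the non-affixed stem itself" (isNonAffixedPattern), and prohibitsCompounding vetoes a break only when both sides match and, where flags are given, some stem on that side carries all of them.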

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Lines changed: 7 additions & 0 deletions
@@ -146,6 +146,7 @@ public class Dictionary {
   boolean checkCompoundTriple, simplifiedTriple;
   int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
   List<CompoundRule> compoundRules; // nullable
+  List<CheckCompoundPattern> checkCompoundPatterns = new ArrayList<>();

   // ignored characters (dictionary, affix, inputs)
   private char[] ignore;
@@ -412,6 +413,12 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
         checkCompoundTriple = true;
       } else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
         simplifiedTriple = true;
+      } else if ("CHECKCOMPOUNDPATTERN".equals(firstWord)) {
+        int count = Integer.parseInt(singleArgument(reader, line));
+        for (int i = 0; i < count; i++) {
+          checkCompoundPatterns.add(
+              new CheckCompoundPattern(reader.readLine(), flagParsingStrategy, this));
+        }
       }
     }
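
Parsing follows the usual Hunspell convention of a count line followed by that many data lines, so the first CHECKCOMPOUNDPATTERN line carries the count and each of the next lines is handed to the CheckCompoundPattern constructor. As an illustration, after reading the test .aff shown further below, one would expect something like:

    // conceptually, after readAffixFile:
    // checkCompoundPatterns.get(0).toString() -> "nny ny"
    // checkCompoundPatterns.get(1).toString() -> "ssz sz"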

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java

Lines changed: 68 additions & 20 deletions
@@ -26,6 +26,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
+import java.util.function.Predicate;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -149,47 +150,94 @@ && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
     }

     if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
-      return checkCompounds(wordChars, 0, length, originalCase, 0);
+      return checkCompounds(new CharsRef(wordChars, 0, length), originalCase, 0, __ -> true);
     }

     return false;
   }

   private boolean checkCompounds(
-      char[] chars, int offset, int length, WordCase originalCase, int depth) {
+      CharsRef word, WordCase originalCase, int depth, Predicate<List<CharsRef>> checkPatterns) {
     if (depth > dictionary.compoundMax - 2) return false;

-    int limit = length - dictionary.compoundMin + 1;
+    int limit = word.length - dictionary.compoundMin + 1;
     for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
       WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
-      int breakOffset = offset + breakPos;
-      if (mayBreakIntoCompounds(chars, offset, length, breakOffset)) {
-        List<CharsRef> stems = stemmer.doStem(chars, offset, breakPos, originalCase, context);
+      int breakOffset = word.offset + breakPos;
+      if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
+        List<CharsRef> stems =
+            stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context);
         if (stems.isEmpty()
             && dictionary.simplifiedTriple
-            && chars[breakOffset - 1] == chars[breakOffset]) {
-          stems = stemmer.doStem(chars, offset, breakPos + 1, originalCase, context);
+            && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
+          stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context);
         }
-        if (stems.isEmpty()) continue;
-
-        int remainingLength = length - breakPos;
-        List<CharsRef> lastStems =
-            stemmer.doStem(chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
-        if (!lastStems.isEmpty()
-            && !(dictionary.checkCompoundDup && intersectIgnoreCase(stems, lastStems))
-            && !hasForceUCaseProblem(chars, breakOffset, remainingLength, originalCase)) {
-          return true;
+        if (!stems.isEmpty() && checkPatterns.test(stems)) {
+          Predicate<List<CharsRef>> nextCheck = checkNextPatterns(word, breakPos, stems);
+          if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) {
+            return true;
+          }
         }
+      }

-        if (checkCompounds(chars, breakOffset, remainingLength, originalCase, depth + 1)) {
-          return true;
-        }
+      if (checkCompoundPatternReplacements(word, breakPos, originalCase, depth)) {
+        return true;
       }
     }

     return false;
   }

+  private boolean checkCompoundPatternReplacements(
+      CharsRef word, int pos, WordCase originalCase, int depth) {
+    for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) {
+      CharsRef expanded = pattern.expandReplacement(word, pos);
+      if (expanded != null) {
+        WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
+        int breakPos = pos + pattern.endLength();
+        List<CharsRef> stems =
+            stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
+        if (!stems.isEmpty()) {
+          Predicate<List<CharsRef>> nextCheck =
+              next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next);
+          if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  private Predicate<List<CharsRef>> checkNextPatterns(
+      CharsRef word, int breakPos, List<CharsRef> stems) {
+    return nextStems ->
+        dictionary.checkCompoundPatterns.stream()
+            .noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems));
+  }
+
+  private boolean checkCompoundsAfter(
+      CharsRef word,
+      int breakPos,
+      WordCase originalCase,
+      int depth,
+      List<CharsRef> prevStems,
+      Predicate<List<CharsRef>> checkPatterns) {
+    int remainingLength = word.length - breakPos;
+    int breakOffset = word.offset + breakPos;
+    List<CharsRef> tailStems =
+        stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
+    if (!tailStems.isEmpty()
+        && !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems))
+        && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
+        && checkPatterns.test(tailStems)) {
+      return true;
+    }
+
+    CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength);
+    return checkCompounds(tail, originalCase, depth + 1, checkPatterns);
+  }
+
   private boolean hasForceUCaseProblem(
       char[] chars, int offset, int length, WordCase originalCase) {
     if (dictionary.forceUCase == FLAG_UNSET) return false;
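
In rough terms, checkCompounds now stems the part before each candidate break, wraps all CHECKCOMPOUNDPATTERN rules into a predicate over the stems of the remaining tail (checkNextPatterns), and delegates to checkCompoundsAfter, which stems the tail and only accepts the pair when no pattern prohibits it; checkCompoundPatternReplacements additionally retries a break with a pattern's replacement text expanded back into its end+begin characters. For example, with the pattern "ssz sz" from the test data below, "hosszszámítás" (hossz + számítás) is rejected because "ssz" ends the first part and "sz" begins the second, while "hossznyelés" (hossz + nyelés) is still accepted.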

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java

Lines changed: 12 additions & 0 deletions
@@ -64,6 +64,18 @@ public void testI53643_numbersWithSeparators() throws Exception {
     doTest("i53643");
   }

+  public void testCheckCompoundPattern() throws Exception {
+    doTest("checkcompoundpattern");
+  }
+
+  public void testCheckCompoundPattern2() throws Exception {
+    doTest("checkcompoundpattern2");
+  }
+
+  public void testCheckCompoundPattern3() throws Exception {
+    doTest("checkcompoundpattern3");
+  }
+
   public void testDotless_i() throws Exception {
     doTest("dotless_i");
   }
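
As elsewhere in this test class, doTest(name) appears to load the name.aff/name.dic dictionary and check that every word in name.good is accepted and every word in name.wrong is rejected; the checkcompoundpattern test resources are shown below (the checkcompoundpattern2 .good/.wrong and checkcompoundpattern3 data are presumably among the changed files not included in this excerpt).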
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# forbid compounds with spec. pattern at word bounds
COMPOUNDFLAG A
CHECKCOMPOUNDPATTERN 2
CHECKCOMPOUNDPATTERN nny ny
CHECKCOMPOUNDPATTERN ssz sz
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
4
könny/A
nyelés/A
hossz/A
számítás/A
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
könnyszámítás
hossznyelés
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
könnynyelés
hosszszámítás
hosszkönnynyelés
könnynyeléshossz
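
These lists follow directly from the two patterns: "könnyszámítás" and "hossznyelés" are accepted because neither joining point matches a pattern ("nny" is followed by "sz", and "ssz" by "ny"), while "könnynyelés" and "hosszszámítás" are rejected because "nny ny" and "ssz sz" match at the respective word bounds, and the longer compounds in the wrong list each contain one of those forbidden joins.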
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# forbid compounds with spec. pattern at word bound and allow modificated form
2+
# (for German and Indian languages)
3+
COMPOUNDFLAG A
4+
CHECKCOMPOUNDPATTERN 2
5+
CHECKCOMPOUNDPATTERN o b z
6+
CHECKCOMPOUNDPATTERN oo ba u
7+
COMPOUNDMIN 1
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
2
foo/A
bar/A
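
The three-field form (CHECKCOMPOUNDPATTERN end begin replacement) is what expandReplacement in CheckCompoundPattern handles: when the replacement text occurs at a candidate break, the checker also tries the word with that text expanded back to end + begin before stemming both parts. With "oo ba u" above, a compound written with "u" at the boundary may therefore be checked as if it were spelled with "oo" + "ba"; the exact accepted and rejected words for this test are in the checkcompoundpattern2 .good/.wrong resources, which are not shown in this excerpt.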
