Commit d0ae2bd

LUCENE-9717: Hunspell: support CHECKCOMPOUNDPATTERN (#2280)
1 parent 6509a30 commit d0ae2bd
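
CHECKCOMPOUNDPATTERN is the Hunspell affix directive that forbids joining two words into a compound when the characters on either side of the joining point match a declared pattern. A minimal .aff sketch, using the patterns from the test data added in this commit:

    COMPOUNDFLAG A
    CHECKCOMPOUNDPATTERN 2
    CHECKCOMPOUNDPATTERN nny ny
    CHECKCOMPOUNDPATTERN ssz sz

With these rules, a compound whose first part ends in "nny" and whose second part begins with "ny" (or "ssz" followed by "sz") is rejected, even though both parts carry the compounding flag A.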

16 files changed: +286 / -20 lines
Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;

class CheckCompoundPattern {
  private final char[] endChars;
  private final char[] beginChars;
  private final char[] replacement;
  private final char[] endFlags;
  private final char[] beginFlags;
  private final Dictionary dictionary;
  private final BytesRef scratch = new BytesRef();

  CheckCompoundPattern(
      String unparsed, Dictionary.FlagParsingStrategy strategy, Dictionary dictionary) {
    this.dictionary = dictionary;
    String[] parts = unparsed.split("\\s+");
    if (parts.length < 3) {
      throw new IllegalArgumentException("Invalid pattern: " + unparsed);
    }

    int flagSep = parts[1].indexOf("/");
    endChars = (flagSep < 0 ? parts[1] : parts[1].substring(0, flagSep)).toCharArray();
    endFlags = flagSep < 0 ? new char[0] : strategy.parseFlags(parts[1].substring(flagSep + 1));

    flagSep = parts[2].indexOf("/");
    beginChars = (flagSep < 0 ? parts[2] : parts[2].substring(0, flagSep)).toCharArray();
    beginFlags = flagSep < 0 ? new char[0] : strategy.parseFlags(parts[2].substring(flagSep + 1));

    replacement = parts.length == 3 ? null : parts[3].toCharArray();
  }

  @Override
  public String toString() {
    return new String(endChars)
        + " "
        + new String(beginChars)
        + (replacement == null ? "" : " -> " + new String(replacement));
  }

  boolean prohibitsCompounding(
      CharsRef word, int breakPos, List<CharsRef> stemsBefore, List<CharsRef> stemsAfter) {
    if (isNonAffixedPattern(endChars)) {
      if (stemsBefore.stream()
          .noneMatch(stem -> charsMatch(word, breakPos - stem.length, stem.chars))) {
        return false;
      }
    } else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
      return false;
    }

    if (isNonAffixedPattern(beginChars)) {
      if (stemsAfter.stream().noneMatch(stem -> charsMatch(word, breakPos, stem.chars))) {
        return false;
      }
    } else if (!charsMatch(word, breakPos, beginChars)) {
      return false;
    }

    if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) {
      return false;
    }
    //noinspection RedundantIfStatement
    if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) {
      return false;
    }

    return true;
  }

  private static boolean isNonAffixedPattern(char[] pattern) {
    return pattern.length == 1 && pattern[0] == '0';
  }

  private boolean hasStemWithFlags(List<CharsRef> stems, char[] flags) {
    for (CharsRef stem : stems) {
      IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
      if (forms != null && hasAllFlags(flags, forms)) {
        return true;
      }
    }
    return false;
  }

  private boolean hasAllFlags(char[] flags, IntsRef forms) {
    for (char flag : flags) {
      if (!dictionary.hasFlag(forms, flag, scratch)) {
        return false;
      }
    }
    return true;
  }

  CharsRef expandReplacement(CharsRef word, int breakPos) {
    if (replacement != null && charsMatch(word, breakPos, replacement)) {
      return new CharsRef(
          word.subSequence(0, breakPos)
              + new String(endChars)
              + new String(beginChars)
              + word.subSequence(breakPos + replacement.length, word.length));
    }
    return null;
  }

  int endLength() {
    return endChars.length;
  }

  private static boolean charsMatch(CharsRef word, int offset, char[] pattern) {
    int len = pattern.length;
    if (word.length - offset < len || offset < 0 || offset > word.length) {
      return false;
    }

    for (int i = 0; i < len; i++) {
      if (word.chars[word.offset + offset + i] != pattern[i]) {
        return false;
      }
    }
    return true;
  }
}
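
A rough sketch of how one pattern line maps onto the fields of this class (the /X and /Y flag parts are hypothetical, added only to show where flags would go):

    CHECKCOMPOUNDPATTERN nny/X ny/Y
      endChars    = "nny", endFlags   = [X]   // must appear immediately before the break
      beginChars  = "ny",  beginFlags = [Y]   // must appear right at the break
      replacement = null                      // only set when a fourth field is given

A lone "0" in either position means "match the non-affixed stem itself" (isNonAffixedPattern), and prohibitsCompounding vetoes a break only when both sides match and, where flags are given, some stem on that side carries all of them.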

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Lines changed: 7 additions & 0 deletions
@@ -146,6 +146,7 @@ public class Dictionary {
   boolean checkCompoundTriple, simplifiedTriple;
   int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
   List<CompoundRule> compoundRules; // nullable
+  List<CheckCompoundPattern> checkCompoundPatterns = new ArrayList<>();

   // ignored characters (dictionary, affix, inputs)
   private char[] ignore;
@@ -412,6 +413,12 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
         checkCompoundTriple = true;
       } else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
         simplifiedTriple = true;
+      } else if ("CHECKCOMPOUNDPATTERN".equals(firstWord)) {
+        int count = Integer.parseInt(singleArgument(reader, line));
+        for (int i = 0; i < count; i++) {
+          checkCompoundPatterns.add(
+              new CheckCompoundPattern(reader.readLine(), flagParsingStrategy, this));
+        }
       }
     }
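
Parsing follows the usual Hunspell convention of a count line followed by that many data lines, so the first CHECKCOMPOUNDPATTERN line carries the count and each of the next lines is handed to the CheckCompoundPattern constructor. As an illustration, after reading the test .aff shown further below, one would expect something like:

    // conceptually, after readAffixFile:
    // checkCompoundPatterns.get(0).toString() -> "nny ny"
    // checkCompoundPatterns.get(1).toString() -> "ssz sz"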

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java

Lines changed: 68 additions & 20 deletions
@@ -26,6 +26,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
+import java.util.function.Predicate;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -149,47 +150,94 @@ && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
     }

     if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
-      return checkCompounds(wordChars, 0, length, originalCase, 0);
+      return checkCompounds(new CharsRef(wordChars, 0, length), originalCase, 0, __ -> true);
     }

     return false;
   }

   private boolean checkCompounds(
-      char[] chars, int offset, int length, WordCase originalCase, int depth) {
+      CharsRef word, WordCase originalCase, int depth, Predicate<List<CharsRef>> checkPatterns) {
     if (depth > dictionary.compoundMax - 2) return false;

-    int limit = length - dictionary.compoundMin + 1;
+    int limit = word.length - dictionary.compoundMin + 1;
     for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
       WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
-      int breakOffset = offset + breakPos;
-      if (mayBreakIntoCompounds(chars, offset, length, breakOffset)) {
-        List<CharsRef> stems = stemmer.doStem(chars, offset, breakPos, originalCase, context);
+      int breakOffset = word.offset + breakPos;
+      if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
+        List<CharsRef> stems =
+            stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context);
         if (stems.isEmpty()
             && dictionary.simplifiedTriple
-            && chars[breakOffset - 1] == chars[breakOffset]) {
-          stems = stemmer.doStem(chars, offset, breakPos + 1, originalCase, context);
+            && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
+          stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context);
         }
-        if (stems.isEmpty()) continue;
-
-        int remainingLength = length - breakPos;
-        List<CharsRef> lastStems =
-            stemmer.doStem(chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
-        if (!lastStems.isEmpty()
-            && !(dictionary.checkCompoundDup && intersectIgnoreCase(stems, lastStems))
-            && !hasForceUCaseProblem(chars, breakOffset, remainingLength, originalCase)) {
-          return true;
+        if (!stems.isEmpty() && checkPatterns.test(stems)) {
+          Predicate<List<CharsRef>> nextCheck = checkNextPatterns(word, breakPos, stems);
+          if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) {
+            return true;
+          }
         }
+      }

-        if (checkCompounds(chars, breakOffset, remainingLength, originalCase, depth + 1)) {
-          return true;
-        }
+      if (checkCompoundPatternReplacements(word, breakPos, originalCase, depth)) {
+        return true;
       }
     }

     return false;
   }

+  private boolean checkCompoundPatternReplacements(
+      CharsRef word, int pos, WordCase originalCase, int depth) {
+    for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) {
+      CharsRef expanded = pattern.expandReplacement(word, pos);
+      if (expanded != null) {
+        WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
+        int breakPos = pos + pattern.endLength();
+        List<CharsRef> stems =
+            stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
+        if (!stems.isEmpty()) {
+          Predicate<List<CharsRef>> nextCheck =
+              next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next);
+          if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  private Predicate<List<CharsRef>> checkNextPatterns(
+      CharsRef word, int breakPos, List<CharsRef> stems) {
+    return nextStems ->
+        dictionary.checkCompoundPatterns.stream()
+            .noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems));
+  }
+
+  private boolean checkCompoundsAfter(
+      CharsRef word,
+      int breakPos,
+      WordCase originalCase,
+      int depth,
+      List<CharsRef> prevStems,
+      Predicate<List<CharsRef>> checkPatterns) {
+    int remainingLength = word.length - breakPos;
+    int breakOffset = word.offset + breakPos;
+    List<CharsRef> tailStems =
+        stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
+    if (!tailStems.isEmpty()
+        && !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems))
+        && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
+        && checkPatterns.test(tailStems)) {
+      return true;
+    }
+
+    CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength);
+    return checkCompounds(tail, originalCase, depth + 1, checkPatterns);
+  }
+
   private boolean hasForceUCaseProblem(
       char[] chars, int offset, int length, WordCase originalCase) {
     if (dictionary.forceUCase == FLAG_UNSET) return false;
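
In rough terms, checkCompounds now stems the part before each candidate break, wraps all CHECKCOMPOUNDPATTERN rules into a predicate over the stems of the remaining tail (checkNextPatterns), and delegates to checkCompoundsAfter, which stems the tail and only accepts the pair when no pattern prohibits it; checkCompoundPatternReplacements additionally retries a break with a pattern's replacement text expanded back into its end+begin characters. For example, with the pattern "ssz sz" from the test data below, "hosszszámítás" (hossz + számítás) is rejected because "ssz" ends the first part and "sz" begins the second, while "hossznyelés" (hossz + nyelés) is still accepted.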

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java

Lines changed: 12 additions & 0 deletions
@@ -64,6 +64,18 @@ public void testI53643_numbersWithSeparators() throws Exception {
     doTest("i53643");
   }

+  public void testCheckCompoundPattern() throws Exception {
+    doTest("checkcompoundpattern");
+  }
+
+  public void testCheckCompoundPattern2() throws Exception {
+    doTest("checkcompoundpattern2");
+  }
+
+  public void testCheckCompoundPattern3() throws Exception {
+    doTest("checkcompoundpattern3");
+  }
+
   public void testDotless_i() throws Exception {
     doTest("dotless_i");
   }
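
As elsewhere in this test class, doTest(name) appears to load the name.aff/name.dic dictionary and check that every word in name.good is accepted and every word in name.wrong is rejected; the checkcompoundpattern test resources are shown below (the checkcompoundpattern2 .good/.wrong and checkcompoundpattern3 data are presumably among the changed files not included in this excerpt).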
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# forbid compounds with spec. pattern at word bounds
COMPOUNDFLAG A
CHECKCOMPOUNDPATTERN 2
CHECKCOMPOUNDPATTERN nny ny
CHECKCOMPOUNDPATTERN ssz sz
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
4
könny/A
nyelés/A
hossz/A
számítás/A
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
könnyszámítás
hossznyelés
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
könnynyelés
hosszszámítás
hosszkönnynyelés
könnynyeléshossz
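
These lists follow directly from the two patterns: "könnyszámítás" and "hossznyelés" are accepted because neither joining point matches a pattern ("nny" is followed by "sz", and "ssz" by "ny"), while "könnynyelés" and "hosszszámítás" are rejected because "nny ny" and "ssz sz" match at the respective word bounds, and the longer compounds in the wrong list each contain one of those forbidden joins.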
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# forbid compounds with spec. pattern at word bound and allow modificated form
2+
# (for German and Indian languages)
3+
COMPOUNDFLAG A
4+
CHECKCOMPOUNDPATTERN 2
5+
CHECKCOMPOUNDPATTERN o b z
6+
CHECKCOMPOUNDPATTERN oo ba u
7+
COMPOUNDMIN 1
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
2
foo/A
bar/A
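
The three-field form (CHECKCOMPOUNDPATTERN end begin replacement) is what expandReplacement in CheckCompoundPattern handles: when the replacement text occurs at a candidate break, the checker also tries the word with that text expanded back to end + begin before stemming both parts. With "oo ba u" above, a compound written with "u" at the boundary may therefore be checked as if it were spelled with "oo" + "ba"; the exact accepted and rejected words for this test are in the checkcompoundpattern2 .good/.wrong resources, which are not shown in this excerpt.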
