Skip to content

Commit d796813

Browse files
authored
LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)
1 parent cf5db8d commit d796813

40 files changed

+730
-29
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ API Changes
8686

8787
Improvements
8888

89-
* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and
90-
BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
89+
* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
90+
BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)
9191

9292
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
9393
(Dawid Weiss)
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.analysis.hunspell;
18+
19+
import java.util.List;
20+
import org.apache.lucene.util.BytesRef;
21+
import org.apache.lucene.util.IntsRef;
22+
23+
class CompoundRule {
24+
private final char[] data;
25+
private final Dictionary dictionary;
26+
27+
CompoundRule(String rule, Dictionary dictionary) {
28+
this.dictionary = dictionary;
29+
StringBuilder parsedFlags = new StringBuilder();
30+
int pos = 0;
31+
while (pos < rule.length()) {
32+
int lParen = rule.indexOf("(", pos);
33+
if (lParen < 0) {
34+
parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos)));
35+
break;
36+
}
37+
38+
parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos, lParen)));
39+
int rParen = rule.indexOf(')', lParen + 1);
40+
if (rParen < 0) {
41+
throw new IllegalArgumentException("Unmatched parentheses: " + rule);
42+
}
43+
44+
parsedFlags.append(
45+
dictionary.flagParsingStrategy.parseFlags(rule.substring(lParen + 1, rParen)));
46+
pos = rParen + 1;
47+
if (pos < rule.length() && (rule.charAt(pos) == '?' || rule.charAt(pos) == '*')) {
48+
parsedFlags.append(rule.charAt(pos++));
49+
}
50+
}
51+
data = parsedFlags.toString().toCharArray();
52+
}
53+
54+
boolean mayMatch(List<IntsRef> words, BytesRef scratch) {
55+
return match(words, 0, 0, scratch, false);
56+
}
57+
58+
boolean fullyMatches(List<IntsRef> words, BytesRef scratch) {
59+
return match(words, 0, 0, scratch, true);
60+
}
61+
62+
private boolean match(
63+
List<IntsRef> words, int patternIndex, int wordIndex, BytesRef scratch, boolean fully) {
64+
if (patternIndex >= data.length) {
65+
return wordIndex >= words.size();
66+
}
67+
if (wordIndex >= words.size() && !fully) {
68+
return true;
69+
}
70+
71+
char flag = data[patternIndex];
72+
if (patternIndex < data.length - 1 && data[patternIndex + 1] == '*') {
73+
int startWI = wordIndex;
74+
while (wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch)) {
75+
wordIndex++;
76+
}
77+
78+
while (wordIndex >= startWI) {
79+
if (match(words, patternIndex + 2, wordIndex, scratch, fully)) {
80+
return true;
81+
}
82+
83+
wordIndex--;
84+
}
85+
return false;
86+
}
87+
88+
boolean currentWordMatches =
89+
wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch);
90+
91+
if (patternIndex < data.length - 1 && data[patternIndex + 1] == '?') {
92+
if (currentWordMatches && match(words, patternIndex + 2, wordIndex + 1, scratch, fully)) {
93+
return true;
94+
}
95+
return match(words, patternIndex + 2, wordIndex, scratch, fully);
96+
}
97+
98+
return currentWordMatches && match(words, patternIndex + 1, wordIndex + 1, scratch, fully);
99+
}
100+
101+
@Override
102+
public String toString() {
103+
return new String(data);
104+
}
105+
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Lines changed: 70 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ public class Dictionary {
9292
private static final String LANG_KEY = "LANG";
9393
private static final String BREAK_KEY = "BREAK";
9494
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
95+
private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
96+
private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
9597
private static final String KEEPCASE_KEY = "KEEPCASE";
9698
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
9799
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
@@ -136,7 +138,7 @@ public class Dictionary {
136138
static final int AFFIX_APPEND = 3;
137139

138140
// Default flag parsing strategy
139-
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
141+
FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
140142

141143
// AF entries
142144
private String[] aliases;
@@ -163,6 +165,8 @@ public class Dictionary {
163165
int needaffix = -1; // needaffix flag, or -1 if one is not defined
164166
int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
165167
int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
168+
int compoundMin = 3;
169+
List<CompoundRule> compoundRules; // nullable
166170

167171
// ignored characters (dictionary, affix, inputs)
168172
private char[] ignore;
@@ -419,6 +423,18 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
419423
throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
420424
}
421425
forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
426+
} else if (line.startsWith(COMPOUNDMIN_KEY)) {
427+
String[] parts = line.split("\\s+");
428+
if (parts.length != 2) {
429+
throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
430+
}
431+
compoundMin = Math.max(1, Integer.parseInt(parts[1]));
432+
} else if (line.startsWith(COMPOUNDRULE_KEY)) {
433+
String[] parts = line.split("\\s+");
434+
if (parts.length != 2) {
435+
throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
436+
}
437+
this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
422438
}
423439
}
424440

@@ -442,6 +458,21 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
442458
stripOffsets[currentIndex] = currentOffset;
443459
}
444460

461+
private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
462+
throws IOException, ParseException {
463+
String line;
464+
List<CompoundRule> compoundRules = new ArrayList<>();
465+
for (int i = 0; i < num; i++) {
466+
line = reader.readLine();
467+
String[] parts = line.split("\\s+");
468+
if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
469+
throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
470+
}
471+
compoundRules.add(new CompoundRule(parts[1], this));
472+
}
473+
return compoundRules;
474+
}
475+
445476
private Breaks parseBreaks(LineNumberReader reader, String line)
446477
throws IOException, ParseException {
447478
Set<String> starting = new LinkedHashSet<>();
@@ -910,7 +941,7 @@ private void addHiddenCapitalizedWord(
910941
reuse.append(caseFold(word.charAt(i)));
911942
}
912943
reuse.append(FLAG_SEPARATOR);
913-
reuse.append(HIDDEN_FLAG);
944+
flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse);
914945
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
915946
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
916947
}
@@ -1188,16 +1219,19 @@ private String parseStemException(String morphData) {
11881219
return null;
11891220
}
11901221

1191-
boolean isForbiddenWord(char[] word, BytesRef scratch) {
1222+
boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
11921223
if (forbiddenword != -1) {
1193-
IntsRef forms = lookupWord(word, 0, word.length);
1194-
if (forms != null) {
1195-
int formStep = formStep();
1196-
for (int i = 0; i < forms.length; i += formStep) {
1197-
if (hasFlag(forms.ints[forms.offset + i], (char) forbiddenword, scratch)) {
1198-
return true;
1199-
}
1200-
}
1224+
IntsRef forms = lookupWord(word, 0, length);
1225+
return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
1226+
}
1227+
return false;
1228+
}
1229+
1230+
boolean hasFlag(IntsRef forms, char flag, BytesRef scratch) {
1231+
int formStep = formStep();
1232+
for (int i = 0; i < forms.length; i += formStep) {
1233+
if (hasFlag(forms.ints[forms.offset + i], flag, scratch)) {
1234+
return true;
12011235
}
12021236
}
12031237
return false;
@@ -1227,6 +1261,8 @@ char parseFlag(String rawFlag) {
12271261
* @return Parsed flags
12281262
*/
12291263
abstract char[] parseFlags(String rawFlags);
1264+
1265+
abstract void appendFlag(char flag, StringBuilder to);
12301266
}
12311267

12321268
/**
@@ -1238,6 +1274,11 @@ private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
12381274
public char[] parseFlags(String rawFlags) {
12391275
return rawFlags.toCharArray();
12401276
}
1277+
1278+
@Override
1279+
void appendFlag(char flag, StringBuilder to) {
1280+
to.append(flag);
1281+
}
12411282
}
12421283

12431284
/**
@@ -1266,6 +1307,14 @@ public char[] parseFlags(String rawFlags) {
12661307
}
12671308
return flags;
12681309
}
1310+
1311+
@Override
1312+
void appendFlag(char flag, StringBuilder to) {
1313+
if (to.length() > 0) {
1314+
to.append(",");
1315+
}
1316+
to.append((int) flag);
1317+
}
12691318
}
12701319

12711320
/**
@@ -1300,6 +1349,16 @@ public char[] parseFlags(String rawFlags) {
13001349
builder.getChars(0, builder.length(), flags, 0);
13011350
return flags;
13021351
}
1352+
1353+
@Override
1354+
void appendFlag(char flag, StringBuilder to) {
1355+
to.append((char) (flag >> 8));
1356+
to.append((char) (flag & 0xff));
1357+
}
1358+
}
1359+
1360+
boolean hasCompounding() {
1361+
return compoundRules != null;
13031362
}
13041363

13051364
boolean hasFlag(int entryId, char flag, BytesRef scratch) {

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java

Lines changed: 82 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
*/
1717
package org.apache.lucene.analysis.hunspell;
1818

19+
import java.util.ArrayList;
20+
import java.util.List;
1921
import org.apache.lucene.util.BytesRef;
22+
import org.apache.lucene.util.IntsRef;
2023

2124
/**
2225
* A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
@@ -37,26 +40,100 @@ public SpellChecker(Dictionary dictionary) {
3740
public boolean spell(String word) {
3841
if (word.isEmpty()) return true;
3942

40-
char[] wordChars = word.toCharArray();
41-
if (dictionary.isForbiddenWord(wordChars, scratch)) {
42-
return false;
43+
if (dictionary.needsInputCleaning) {
44+
word = dictionary.cleanInput(word, new StringBuilder()).toString();
4345
}
4446

4547
if (isNumber(word)) {
4648
return true;
4749
}
4850

49-
if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
51+
char[] wordChars = word.toCharArray();
52+
if (checkWord(wordChars, wordChars.length, false)) {
5053
return true;
5154
}
5255

53-
if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
56+
WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
57+
if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
58+
return true;
59+
}
60+
61+
if (dictionary.breaks.isNotEmpty()
62+
&& !hasTooManyBreakOccurrences(word)
63+
&& !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
5464
return tryBreaks(word);
5565
}
5666

5767
return false;
5868
}
5969

70+
private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
71+
char[] caseVariant = wordChars;
72+
if (wordCase == WordCase.UPPER) {
73+
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
74+
if (checkWord(caseVariant, wordChars.length, true)) {
75+
return true;
76+
}
77+
}
78+
return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
79+
}
80+
81+
private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
82+
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
83+
return false;
84+
}
85+
86+
if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) {
87+
return true;
88+
}
89+
90+
if (dictionary.hasCompounding()) {
91+
return checkCompounds(wordChars, 0, length, new ArrayList<>());
92+
}
93+
94+
return false;
95+
}
96+
97+
private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
98+
if (words.size() >= 100) return false;
99+
100+
int limit = length - dictionary.compoundMin + 1;
101+
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
102+
IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
103+
if (forms != null) {
104+
words.add(forms);
105+
106+
if (dictionary.compoundRules != null
107+
&& dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words, scratch))) {
108+
if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
109+
return true;
110+
}
111+
112+
if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) {
113+
return true;
114+
}
115+
}
116+
117+
words.remove(words.size() - 1);
118+
}
119+
}
120+
121+
return false;
122+
}
123+
124+
private boolean checkLastCompoundPart(
125+
char[] wordChars, int start, int length, List<IntsRef> words) {
126+
IntsRef forms = dictionary.lookupWord(wordChars, start, length);
127+
if (forms == null) return false;
128+
129+
words.add(forms);
130+
boolean result =
131+
dictionary.compoundRules != null
132+
&& dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
133+
words.remove(words.size() - 1);
134+
return result;
135+
}
136+
60137
private static boolean isNumber(String s) {
61138
int i = 0;
62139
while (i < s.length()) {

0 commit comments

Comments
 (0)