Skip to content

Commit f783848

Browse files
authored
LUCENE-9799: Hunspell: don't check second-level affixes when the first level isn't a continuation (#2413)
* LUCENE-9799: Hunspell: don't check second-level affixes when the first level isn't a continuation
* check more words in TestPerformance
1 parent e420e6c commit f783848

File tree

3 files changed: 37 additions (+37) and 19 deletions (-19)

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Lines changed: 32 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -37,10 +37,10 @@
3737
import java.util.Collections;
3838
import java.util.Comparator;
3939
import java.util.HashMap;
40+
import java.util.HashSet;
4041
import java.util.LinkedHashMap;
4142
import java.util.LinkedHashSet;
4243
import java.util.List;
43-
import java.util.Locale;
4444
import java.util.Map;
4545
import java.util.Set;
4646
import java.util.TreeMap;
@@ -78,9 +78,6 @@ public class Dictionary {
7878
private static final int DEFAULT_FLAGS = 65510;
7979
static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
8080

81-
// TODO: really for suffixes we should reverse the automaton and run them backwards
82-
private static final String PREFIX_CONDITION_REGEX = "%s.*";
83-
private static final String SUFFIX_CONDITION_REGEX = ".*%s";
8481
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
8582
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
8683

@@ -147,8 +144,12 @@ public class Dictionary {
147144
boolean ignoreCase;
148145
boolean checkSharpS;
149146
boolean complexPrefixes;
150-
// if no affixes have continuation classes, no need to do 2-level affix stripping
151-
boolean twoStageAffix;
147+
148+
/**
149+
* All flags used in affix continuation classes. If an outer affix's flag isn't here, there's no
150+
* need to do 2-level affix stripping with it.
151+
*/
152+
private char[] secondStageAffixFlags;
152153

153154
char circumfix;
154155
char keepcase, forceUCase;
@@ -332,6 +333,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
332333
throws IOException, ParseException {
333334
TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
334335
TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
336+
Set<Character> stage2Flags = new HashSet<>();
335337
Map<String, Integer> seenPatterns = new HashMap<>();
336338

337339
// zero condition -> 0 ord
@@ -359,9 +361,9 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
359361
} else if ("AM".equals(firstWord)) {
360362
parseMorphAlias(line);
361363
} else if ("PFX".equals(firstWord)) {
362-
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX, seenPatterns, seenStrips, flags);
364+
parseAffix(prefixes, stage2Flags, line, reader, false, seenPatterns, seenStrips, flags);
363365
} else if ("SFX".equals(firstWord)) {
364-
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX, seenPatterns, seenStrips, flags);
366+
parseAffix(suffixes, stage2Flags, line, reader, true, seenPatterns, seenStrips, flags);
365367
} else if (line.equals("COMPLEXPREFIXES")) {
366368
complexPrefixes =
367369
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@@ -476,6 +478,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
476478

477479
this.prefixes = affixFST(prefixes);
478480
this.suffixes = affixFST(suffixes);
481+
secondStageAffixFlags = toSortedCharArray(stage2Flags);
479482

480483
int totalChars = 0;
481484
for (String strip : seenStrips.keySet()) {
@@ -675,16 +678,15 @@ static String escapeDash(String re) {
675678
* @param affixes Map where the result of the parsing will be put
676679
* @param header Header line of the affix rule
677680
* @param reader BufferedReader to read the content of the rule from
678-
* @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate
679-
* the condition regex pattern
680681
* @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
681682
* @throws IOException Can be thrown while reading the rule
682683
*/
683684
private void parseAffix(
684685
TreeMap<String, List<Integer>> affixes,
686+
Set<Character> secondStageFlags,
685687
String header,
686688
LineNumberReader reader,
687-
String conditionPattern,
689+
boolean isSuffix,
688690
Map<String, Integer> seenPatterns,
689691
Map<String, Integer> seenStrips,
690692
FlagEnumerator flags)
@@ -694,7 +696,6 @@ private void parseAffix(
694696
String[] args = header.split("\\s+");
695697

696698
boolean crossProduct = args[2].equals("Y");
697-
boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX);
698699

699700
int numLines;
700701
try {
@@ -725,7 +726,9 @@ private void parseAffix(
725726
}
726727

727728
appendFlags = flagParsingStrategy.parseFlags(flagPart);
728-
twoStageAffix = true;
729+
for (char appendFlag : appendFlags) {
730+
secondStageFlags.add(appendFlag);
731+
}
729732
}
730733
// zero affix -> empty string
731734
if ("0".equals(affixArg)) {
@@ -750,7 +753,8 @@ private void parseAffix(
750753
// if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
751754
// but this is complicated...
752755
} else {
753-
regex = String.format(Locale.ROOT, conditionPattern, condition);
756+
// TODO: really for suffixes we should reverse the automaton and run them backwards
757+
regex = isSuffix ? ".*" + condition : condition + ".*";
754758
}
755759

756760
// deduplicate patterns
@@ -1610,6 +1614,20 @@ CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
16101614
return reuse;
16111615
}
16121616

1617+
private static char[] toSortedCharArray(Set<Character> set) {
1618+
char[] chars = new char[set.size()];
1619+
int i = 0;
1620+
for (Character c : set) {
1621+
chars[i++] = c;
1622+
}
1623+
Arrays.sort(chars);
1624+
return chars;
1625+
}
1626+
1627+
boolean isSecondStageAffix(char flag) {
1628+
return Arrays.binarySearch(secondStageAffixFlags, flag) >= 0;
1629+
}
1630+
16131631
/** folds single character (according to LANG if present) */
16141632
char caseFold(char c) {
16151633
if (alternateCasing) {

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -648,11 +648,11 @@ private boolean applyAffix(
648648
if (recursionDepth == 0) {
649649
if (prefix) {
650650
prefixId = affix;
651-
doPrefix = dictionary.complexPrefixes && dictionary.twoStageAffix;
651+
doPrefix = dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag);
652652
// we took away the first prefix.
653653
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
654654
// COMPLEXPREFIXES = false: combine with a suffix
655-
} else if (!dictionary.complexPrefixes && dictionary.twoStageAffix) {
655+
} else if (!dictionary.complexPrefixes && dictionary.isSecondStageAffix(flag)) {
656656
doPrefix = false;
657657
// we took away a suffix.
658658
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
@@ -665,7 +665,7 @@ private boolean applyAffix(
665665
if (prefix && dictionary.complexPrefixes) {
666666
prefixId = affix;
667667
// we took away the second prefix: go look for another suffix
668-
} else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
668+
} else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageAffix(flag)) {
669669
return true;
670670
}
671671
// we took away a prefix, then a suffix: go look for another suffix

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ public void en_suggest() throws Exception {
6666

6767
@Test
6868
public void de() throws Exception {
69-
checkAnalysisPerformance("de", 200_000);
69+
checkAnalysisPerformance("de", 300_000);
7070
}
7171

7272
@Test
@@ -76,7 +76,7 @@ public void de_suggest() throws Exception {
7676

7777
@Test
7878
public void fr() throws Exception {
79-
checkAnalysisPerformance("fr", 40_000);
79+
checkAnalysisPerformance("fr", 80_000);
8080
}
8181

8282
@Test

0 commit comments

Comments
 (0)