Skip to content

Commit 0a1a3f4

Browse files
authored
LUCENE-9688: Hunspell: consider prefix's continuation flags when applying suffix (#2229)
1 parent d796813 commit 0a1a3f4

File tree

4 files changed

+37
-23
lines changed

4 files changed

+37
-23
lines changed

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -251,8 +251,8 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
251251
* @param previous previous affix that was removed (so we don't remove the same one twice)
252252
* @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any
253253
* affixes in this recursive step
254-
* @param prefixFlag flag of the most inner removed prefix, so that when removing a suffix, it's
255-
* also checked against the word
254+
* @param prefixId ID of the most inner removed prefix, so that when removing a suffix, it's also
255+
* checked against the word
256256
* @param recursionDepth current recursion depth
257257
* @param doPrefix true if we should remove prefixes
258258
* @param doSuffix true if we should remove suffixes
@@ -270,7 +270,7 @@ private List<CharsRef> stem(
270270
int length,
271271
int previous,
272272
int prevFlag,
273-
int prefixFlag,
273+
int prefixId,
274274
int recursionDepth,
275275
boolean doPrefix,
276276
boolean doSuffix,
@@ -398,7 +398,7 @@ private List<CharsRef> stem(
398398
strippedWord,
399399
strippedWord.length,
400400
suffix,
401-
prefixFlag,
401+
prefixId,
402402
recursionDepth,
403403
false,
404404
circumfix,
@@ -474,9 +474,9 @@ private boolean checkCondition(
474474
* @param strippedWord Word the affix has been removed and the strip added
475475
* @param length valid length of stripped word
476476
* @param affix HunspellAffix representing the affix rule itself
477-
* @param prefixFlag when we already stripped a prefix, we cant simply recurse and check the
478-
* suffix, unless both are compatible so we must check dictionary form against both to add it
479-
* as a stem!
477+
* @param prefixId when we already stripped a prefix, we can't simply recurse and check the suffix,
478+
* unless both are compatible so we must check dictionary form against both to add it as a
479+
* stem!
480480
* @param recursionDepth current recursion depth
481481
* @param prefix true if we are removing a prefix (false if it's a suffix)
482482
* @return List of stems for the word, or an empty list if none are found
@@ -485,40 +485,37 @@ private List<CharsRef> applyAffix(
485485
char[] strippedWord,
486486
int length,
487487
int affix,
488-
int prefixFlag,
488+
int prefixId,
489489
int recursionDepth,
490490
boolean prefix,
491491
boolean circumfix,
492492
boolean caseVariant)
493493
throws IOException {
494494
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
495-
char append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
496495

497496
List<CharsRef> stems = new ArrayList<>();
498497

499498
IntsRef forms = dictionary.lookupWord(strippedWord, 0, length);
500499
if (forms != null) {
501500
for (int i = 0; i < forms.length; i += formStep) {
502501
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
503-
if (Dictionary.hasFlag(wordFlags, flag)) {
502+
if (Dictionary.hasFlag(wordFlags, flag) || isFlagAppendedByAffix(prefixId, flag)) {
504503
// confusing: in this one exception, we already chained the first prefix against the
505504
// second,
506505
// so it doesn't need to be checked against the word
507506
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
508-
if (!chainedPrefix
509-
&& prefixFlag >= 0
510-
&& !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) {
511-
// see if we can chain prefix thru the suffix continuation class (only if it has any!)
512-
if (!dictionary.hasFlag(append, (char) prefixFlag, scratch)) {
507+
if (!chainedPrefix && prefixId >= 0) {
508+
char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG);
509+
if (!Dictionary.hasFlag(wordFlags, prefixFlag)
510+
&& !isFlagAppendedByAffix(affix, prefixFlag)) {
513511
continue;
514512
}
515513
}
516514

517515
// if circumfix was previously set by a prefix, we must check this suffix,
518516
// to ensure it has it, and vice versa
519517
if (dictionary.circumfix != -1) {
520-
boolean suffixCircumfix =
521-
dictionary.hasFlag(append, (char) dictionary.circumfix, scratch);
518+
boolean suffixCircumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
522519
if (circumfix != suffixCircumfix) {
523520
continue;
524521
}
@@ -541,14 +538,14 @@ private List<CharsRef> applyAffix(
541538
// if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we
542539
// have that flag
543540
if (dictionary.circumfix != -1 && !circumfix && prefix) {
544-
circumfix = dictionary.hasFlag(append, (char) dictionary.circumfix, scratch);
541+
circumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
545542
}
546543

547544
if (isCrossProduct(affix) && recursionDepth <= 1) {
548545
boolean doPrefix;
549546
if (recursionDepth == 0) {
550547
if (prefix) {
551-
prefixFlag = flag;
548+
prefixId = affix;
552549
doPrefix = dictionary.complexPrefixes && dictionary.twoStageAffix;
553550
// we took away the first prefix.
554551
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
@@ -564,7 +561,7 @@ private List<CharsRef> applyAffix(
564561
} else {
565562
doPrefix = false;
566563
if (prefix && dictionary.complexPrefixes) {
567-
prefixFlag = flag;
564+
prefixId = affix;
568565
// we took away the second prefix: go look for another suffix
569566
} else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
570567
return stems;
@@ -578,7 +575,7 @@ private List<CharsRef> applyAffix(
578575
length,
579576
affix,
580577
flag,
581-
prefixFlag,
578+
prefixId,
582579
recursionDepth + 1,
583580
doPrefix,
584581
true,
@@ -590,6 +587,12 @@ private List<CharsRef> applyAffix(
590587
return stems;
591588
}
592589

590+
private boolean isFlagAppendedByAffix(int affixId, char flag) {
591+
if (affixId < 0) return false;
592+
int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
593+
return dictionary.hasFlag(appendId, flag, scratch);
594+
}
595+
593596
private boolean isCrossProduct(int affix) {
594597
return (dictionary.affixData(affix, Dictionary.AFFIX_CONDITION) & 1) == 1;
595598
}

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,8 @@ public void testDependencies() {
3838
assertStemsTo("hydration", "hydrate");
3939
assertStemsTo("dehydrate", "hydrate");
4040
assertStemsTo("dehydration", "hydrate");
41+
42+
assertStemsTo("calorie", "calorie", "calorie");
43+
assertStemsTo("calories", "calorie");
4144
}
4245
}

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.aff

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,10 @@ PFX h 0 de .
1717

1818
SFX A Y 1
1919
SFX A te tion/S .
20+
21+
SFX s Y 1
22+
SFX s 0 s .
23+
24+
PFX p Y 1
25+
PFX p 0 0/s .
26+

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.dic

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
2
1+
4
22
drink/RQ [verb]
33
drink/S [noun]
4-
hydrate/hA
4+
hydrate/hA
5+
calorie/p

0 commit comments

Comments
 (0)