Skip to content

Commit e4ec3e3

Browse files
authored
LUCENE-9692: Hunspell: extract Stemmer.stripAffix from similar code in prefix/suffix processing (#2237)
1 parent f64e7cb commit e4ec3e3

File tree

1 file changed

+37
-44
lines changed
  • lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell

1 file changed

+37
-44
lines changed

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java

Lines changed: 37 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,6 @@
2626
import org.apache.lucene.util.IntsRef;
2727
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
2828
import org.apache.lucene.util.fst.FST;
29-
import org.apache.lucene.util.fst.Outputs;
3029

3130
/**
3231
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.
@@ -305,11 +304,10 @@ private List<CharsRef> stem(
305304

306305
if (doPrefix && dictionary.prefixes != null) {
307306
FST<IntsRef> fst = dictionary.prefixes;
308-
Outputs<IntsRef> outputs = fst.outputs;
309307
FST.BytesReader bytesReader = prefixReaders[recursionDepth];
310308
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
311309
fst.getFirstArc(arc);
312-
IntsRef NO_OUTPUT = outputs.getNoOutput();
310+
IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
313311
IntsRef output = NO_OUTPUT;
314312
int limit = dictionary.fullStrip ? length + 1 : length;
315313
for (int i = 0; i < limit; i++) {
@@ -333,23 +331,12 @@ private List<CharsRef> stem(
333331
}
334332

335333
if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
336-
int deAffixedLength = length - i;
337-
338-
int stripOrd = dictionary.affixData(prefix, Dictionary.AFFIX_STRIP_ORD);
339-
int stripStart = dictionary.stripOffsets[stripOrd];
340-
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
341-
int stripLength = stripEnd - stripStart;
342-
343-
if (!checkCondition(
344-
prefix, dictionary.stripData, stripStart, stripLength, word, i, deAffixedLength)) {
334+
char[] strippedWord = stripAffix(word, length, i, prefix, true);
335+
if (strippedWord == null) {
345336
continue;
346337
}
347338

348-
char[] strippedWord = new char[stripLength + deAffixedLength];
349-
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
350-
System.arraycopy(word, i, strippedWord, stripLength, deAffixedLength);
351-
352-
List<CharsRef> stemList =
339+
stems.addAll(
353340
applyAffix(
354341
strippedWord,
355342
strippedWord.length,
@@ -358,21 +345,18 @@ private List<CharsRef> stem(
358345
recursionDepth,
359346
true,
360347
circumfix,
361-
caseVariant);
362-
363-
stems.addAll(stemList);
348+
caseVariant));
364349
}
365350
}
366351
}
367352
}
368353

369354
if (doSuffix && dictionary.suffixes != null) {
370355
FST<IntsRef> fst = dictionary.suffixes;
371-
Outputs<IntsRef> outputs = fst.outputs;
372356
FST.BytesReader bytesReader = suffixReaders[recursionDepth];
373357
FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
374358
fst.getFirstArc(arc);
375-
IntsRef NO_OUTPUT = outputs.getNoOutput();
359+
IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
376360
IntsRef output = NO_OUTPUT;
377361
int limit = dictionary.fullStrip ? 0 : 1;
378362
for (int i = length; i >= limit; i--) {
@@ -396,25 +380,12 @@ private List<CharsRef> stem(
396380
}
397381

398382
if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
399-
int appendLength = length - i;
400-
int deAffixedLength = length - appendLength;
401-
402-
int stripOrd = dictionary.affixData(suffix, Dictionary.AFFIX_STRIP_ORD);
403-
int stripStart = dictionary.stripOffsets[stripOrd];
404-
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
405-
int stripLength = stripEnd - stripStart;
406-
407-
if (!checkCondition(
408-
suffix, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
383+
char[] strippedWord = stripAffix(word, length, length - i, suffix, false);
384+
if (strippedWord == null) {
409385
continue;
410386
}
411387

412-
char[] strippedWord = new char[stripLength + deAffixedLength];
413-
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
414-
System.arraycopy(
415-
dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
416-
417-
List<CharsRef> stemList =
388+
stems.addAll(
418389
applyAffix(
419390
strippedWord,
420391
strippedWord.length,
@@ -423,9 +394,7 @@ private List<CharsRef> stem(
423394
recursionDepth,
424395
false,
425396
circumfix,
426-
caseVariant);
427-
428-
stems.addAll(stemList);
397+
caseVariant));
429398
}
430399
}
431400
}
@@ -434,6 +403,30 @@ private List<CharsRef> stem(
434403
return stems;
435404
}
436405

406+
/**
 * Removes an affix of {@code affixLen} chars from {@code word} and puts the strip chars of the
 * given affix rule in its place, provided the rule's condition holds.
 *
 * <p>The strip chars are the slice of {@code dictionary.stripData} delimited by the {@code
 * dictionary.stripOffsets} entries at the rule's {@code AFFIX_STRIP_ORD} and the following index.
 * For a prefix, the result is the strip chars followed by the word chars starting at {@code
 * affixLen}; for a suffix, it is the first {@code length - affixLen} word chars followed by the
 * strip chars. {@code checkCondition} is given the same two pieces in that same order.
 *
 * @param word buffer holding the word; it is only read here
 * @param length number of word chars to consider; {@code length - affixLen} chars are kept
 * @param affixLen number of chars taken up by the affix (leading chars when {@code isPrefix},
 *     trailing chars otherwise)
 * @param affix affix id passed to {@code dictionary.affixData} and {@code checkCondition}
 * @param isPrefix true to strip from the start of the word, false to strip from the end
 * @return a newly allocated array of exactly {@code stripLen + (length - affixLen)} chars, or
 *     {@code null} when {@code checkCondition} returns false
 */
private char[] stripAffix(char[] word, int length, int affixLen, int affix, boolean isPrefix) {
407+
int deAffixedLen = length - affixLen;
408+
409+
int stripOrd = dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD);
410+
int stripStart = dictionary.stripOffsets[stripOrd];
411+
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
412+
int stripLen = stripEnd - stripStart;
413+
414+
char[] stripData = dictionary.stripData;
415+
boolean condition =
416+
isPrefix
417+
? checkCondition(affix, stripData, stripStart, stripLen, word, affixLen, deAffixedLen)
418+
: checkCondition(affix, word, 0, deAffixedLen, stripData, stripStart, stripLen);
419+
if (!condition) {
420+
return null;
421+
}
422+
423+
char[] strippedWord = new char[stripLen + deAffixedLen];
424+
System.arraycopy(
425+
word, isPrefix ? affixLen : 0, strippedWord, isPrefix ? stripLen : 0, deAffixedLen);
426+
System.arraycopy(stripData, stripStart, strippedWord, isPrefix ? 0 : deAffixedLen, stripLen);
427+
return strippedWord;
428+
}
429+
437430
private boolean isAffixCompatible(
438431
int affix, int prevFlag, int recursionDepth, boolean previousWasPrefix) {
439432
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
@@ -495,9 +488,9 @@ private boolean checkCondition(
495488
* @param strippedWord Word the affix has been removed and the strip added
496489
* @param length valid length of stripped word
497490
* @param affix HunspellAffix representing the affix rule itself
498-
* @param prefixId when we already stripped a prefix, we cant simply recurse and check the suffix,
499-
* unless both are compatible so we must check dictionary form against both to add it as a
500-
* stem!
491+
* @param prefixId when we already stripped a prefix, we can't simply recurse and check the
492+
* suffix, unless both are compatible so we must check dictionary form against both to add it
493+
* as a stem!
501494
* @param recursionDepth current recursion depth
502495
* @param prefix true if we are removing a prefix (false if it's a suffix)
503496
* @return List of stems for the word, or an empty list if none are found

0 commit comments

Comments
 (0)