Skip to content

Commit 695e789

Browse files
authored
LUCENE-9698: Hunspell: reuse char[] when possible when stripping affix (#2243)
1 parent 80e4def commit 695e789

File tree

2 files changed

+40
-19
lines changed

2 files changed

+40
-19
lines changed

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
7474
if (checkWord(caseVariant, wordChars.length, true)) {
7575
return true;
7676
}
77-
char[] aposCase = stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
77+
char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
7878
if (aposCase != null && checkWord(aposCase, aposCase.length, true)) {
7979
return true;
8080
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ char[] caseFoldLower(char[] word, int length) {
143143

144144
// Special prefix handling for Catalan, French, Italian:
145145
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
146-
char[] capitalizeAfterApostrophe(char[] word, int length) {
146+
static char[] capitalizeAfterApostrophe(char[] word, int length) {
147147
for (int i = 1; i < length - 1; i++) {
148148
if (word[i] == '\'') {
149149
char next = word[i + 1];
@@ -175,11 +175,12 @@ List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
175175
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
176176
continue;
177177
}
178-
stems.add(newStem(word, length, forms, i));
178+
stems.add(newStem(word, 0, length, forms, i));
179179
}
180180
}
181181
try {
182-
stems.addAll(stem(word, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant));
182+
stems.addAll(
183+
stem(word, 0, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant));
183184
} catch (IOException bogus) {
184185
throw new RuntimeException(bogus);
185186
}
@@ -214,7 +215,7 @@ public List<CharsRef> uniqueStems(char[] word, int length) {
214215
return deduped;
215216
}
216217

217-
private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
218+
private CharsRef newStem(char[] buffer, int offset, int length, IntsRef forms, int formID) {
218219
final String exception;
219220
if (dictionary.hasStemExceptions) {
220221
int exceptionID = forms.ints[forms.offset + formID + 1];
@@ -232,7 +233,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
232233
if (exception != null) {
233234
scratchSegment.append(exception);
234235
} else {
235-
scratchSegment.append(buffer, 0, length);
236+
scratchSegment.append(buffer, offset, length);
236237
}
237238
try {
238239
Dictionary.applyMappings(dictionary.oconv, scratchSegment);
@@ -246,7 +247,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
246247
if (exception != null) {
247248
return new CharsRef(exception);
248249
} else {
249-
return new CharsRef(buffer, 0, length);
250+
return new CharsRef(buffer, offset, length);
250251
}
251252
}
252253
}
@@ -284,6 +285,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
284285
*/
285286
private List<CharsRef> stem(
286287
char[] word,
288+
int offset,
287289
int length,
288290
int previous,
289291
char prevFlag,
@@ -308,7 +310,7 @@ private List<CharsRef> stem(
308310
int limit = dictionary.fullStrip ? length + 1 : length;
309311
for (int i = 0; i < limit; i++) {
310312
if (i > 0) {
311-
int ch = word[i - 1];
313+
char ch = word[offset + i - 1];
312314
if (fst.findTargetArc(ch, arc, arc, prefixReader) == null) {
313315
break;
314316
} else if (arc.output() != NO_OUTPUT) {
@@ -327,15 +329,17 @@ private List<CharsRef> stem(
327329
}
328330

329331
if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
330-
char[] strippedWord = stripAffix(word, length, i, prefix, true);
332+
char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
331333
if (strippedWord == null) {
332334
continue;
333335
}
334336

337+
boolean pureAffix = strippedWord == word;
335338
stems.addAll(
336339
applyAffix(
337340
strippedWord,
338-
strippedWord.length,
341+
pureAffix ? offset + i : 0,
342+
pureAffix ? length - i : strippedWord.length,
339343
prefix,
340344
-1,
341345
recursionDepth,
@@ -356,7 +360,7 @@ private List<CharsRef> stem(
356360
int limit = dictionary.fullStrip ? 0 : 1;
357361
for (int i = length; i >= limit; i--) {
358362
if (i < length) {
359-
int ch = word[i];
363+
char ch = word[offset + i];
360364
if (fst.findTargetArc(ch, arc, arc, suffixReader) == null) {
361365
break;
362366
} else if (arc.output() != NO_OUTPUT) {
@@ -375,15 +379,17 @@ private List<CharsRef> stem(
375379
}
376380

377381
if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
378-
char[] strippedWord = stripAffix(word, length, length - i, suffix, false);
382+
char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
379383
if (strippedWord == null) {
380384
continue;
381385
}
382386

387+
boolean pureAffix = strippedWord == word;
383388
stems.addAll(
384389
applyAffix(
385390
strippedWord,
386-
strippedWord.length,
391+
pureAffix ? offset : 0,
392+
pureAffix ? i : strippedWord.length,
387393
suffix,
388394
prefixId,
389395
recursionDepth,
@@ -398,7 +404,13 @@ private List<CharsRef> stem(
398404
return stems;
399405
}
400406

401-
private char[] stripAffix(char[] word, int length, int affixLen, int affix, boolean isPrefix) {
407+
/**
408+
* @return null if the affix condition isn't met; a reference to the same char[] if the affix has no
409+
strip data and can thus be simply removed; or a new char[] containing the word after affix
410+
* removal
411+
*/
412+
private char[] stripAffix(
413+
char[] word, int offset, int length, int affixLen, int affix, boolean isPrefix) {
402414
int deAffixedLen = length - affixLen;
403415

404416
int stripOrd = dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD);
@@ -409,15 +421,22 @@ private char[] stripAffix(char[] word, int length, int affixLen, int affix, bool
409421
char[] stripData = dictionary.stripData;
410422
boolean condition =
411423
isPrefix
412-
? checkCondition(affix, stripData, stripStart, stripLen, word, affixLen, deAffixedLen)
413-
: checkCondition(affix, word, 0, deAffixedLen, stripData, stripStart, stripLen);
424+
? checkCondition(
425+
affix, stripData, stripStart, stripLen, word, offset + affixLen, deAffixedLen)
426+
: checkCondition(affix, word, offset, deAffixedLen, stripData, stripStart, stripLen);
414427
if (!condition) {
415428
return null;
416429
}
417430

431+
if (stripLen == 0) return word;
432+
418433
char[] strippedWord = new char[stripLen + deAffixedLen];
419434
System.arraycopy(
420-
word, isPrefix ? affixLen : 0, strippedWord, isPrefix ? stripLen : 0, deAffixedLen);
435+
word,
436+
offset + (isPrefix ? affixLen : 0),
437+
strippedWord,
438+
isPrefix ? stripLen : 0,
439+
deAffixedLen);
421440
System.arraycopy(stripData, stripStart, strippedWord, isPrefix ? 0 : deAffixedLen, stripLen);
422441
return strippedWord;
423442
}
@@ -484,6 +503,7 @@ private boolean checkCondition(
484503
*/
485504
private List<CharsRef> applyAffix(
486505
char[] strippedWord,
506+
int offset,
487507
int length,
488508
int affix,
489509
int prefixId,
@@ -496,7 +516,7 @@ private List<CharsRef> applyAffix(
496516

497517
List<CharsRef> stems = new ArrayList<>();
498518

499-
IntsRef forms = dictionary.lookupWord(strippedWord, 0, length);
519+
IntsRef forms = dictionary.lookupWord(strippedWord, offset, length);
500520
if (forms != null) {
501521
for (int i = 0; i < forms.length; i += formStep) {
502522
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
@@ -530,7 +550,7 @@ private List<CharsRef> applyAffix(
530550
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
531551
continue;
532552
}
533-
stems.add(newStem(strippedWord, length, forms, i));
553+
stems.add(newStem(strippedWord, offset, length, forms, i));
534554
}
535555
}
536556
}
@@ -572,6 +592,7 @@ private List<CharsRef> applyAffix(
572592
stems.addAll(
573593
stem(
574594
strippedWord,
595+
offset,
575596
length,
576597
affix,
577598
flag,

0 commit comments

Comments
 (0)