26
26
import org .apache .lucene .util .IntsRef ;
27
27
import org .apache .lucene .util .automaton .CharacterRunAutomaton ;
28
28
import org .apache .lucene .util .fst .FST ;
29
- import org .apache .lucene .util .fst .Outputs ;
30
29
31
30
/**
32
31
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.
@@ -305,11 +304,10 @@ private List<CharsRef> stem(
305
304
306
305
if (doPrefix && dictionary .prefixes != null ) {
307
306
FST <IntsRef > fst = dictionary .prefixes ;
308
- Outputs <IntsRef > outputs = fst .outputs ;
309
307
FST .BytesReader bytesReader = prefixReaders [recursionDepth ];
310
308
FST .Arc <IntsRef > arc = prefixArcs [recursionDepth ];
311
309
fst .getFirstArc (arc );
312
- IntsRef NO_OUTPUT = outputs .getNoOutput ();
310
+ IntsRef NO_OUTPUT = fst . outputs .getNoOutput ();
313
311
IntsRef output = NO_OUTPUT ;
314
312
int limit = dictionary .fullStrip ? length + 1 : length ;
315
313
for (int i = 0 ; i < limit ; i ++) {
@@ -333,23 +331,12 @@ private List<CharsRef> stem(
333
331
}
334
332
335
333
if (isAffixCompatible (prefix , prevFlag , recursionDepth , false )) {
336
- int deAffixedLength = length - i ;
337
-
338
- int stripOrd = dictionary .affixData (prefix , Dictionary .AFFIX_STRIP_ORD );
339
- int stripStart = dictionary .stripOffsets [stripOrd ];
340
- int stripEnd = dictionary .stripOffsets [stripOrd + 1 ];
341
- int stripLength = stripEnd - stripStart ;
342
-
343
- if (!checkCondition (
344
- prefix , dictionary .stripData , stripStart , stripLength , word , i , deAffixedLength )) {
334
+ char [] strippedWord = stripAffix (word , length , i , prefix , true );
335
+ if (strippedWord == null ) {
345
336
continue ;
346
337
}
347
338
348
- char [] strippedWord = new char [stripLength + deAffixedLength ];
349
- System .arraycopy (dictionary .stripData , stripStart , strippedWord , 0 , stripLength );
350
- System .arraycopy (word , i , strippedWord , stripLength , deAffixedLength );
351
-
352
- List <CharsRef > stemList =
339
+ stems .addAll (
353
340
applyAffix (
354
341
strippedWord ,
355
342
strippedWord .length ,
@@ -358,21 +345,18 @@ private List<CharsRef> stem(
358
345
recursionDepth ,
359
346
true ,
360
347
circumfix ,
361
- caseVariant );
362
-
363
- stems .addAll (stemList );
348
+ caseVariant ));
364
349
}
365
350
}
366
351
}
367
352
}
368
353
369
354
if (doSuffix && dictionary .suffixes != null ) {
370
355
FST <IntsRef > fst = dictionary .suffixes ;
371
- Outputs <IntsRef > outputs = fst .outputs ;
372
356
FST .BytesReader bytesReader = suffixReaders [recursionDepth ];
373
357
FST .Arc <IntsRef > arc = suffixArcs [recursionDepth ];
374
358
fst .getFirstArc (arc );
375
- IntsRef NO_OUTPUT = outputs .getNoOutput ();
359
+ IntsRef NO_OUTPUT = fst . outputs .getNoOutput ();
376
360
IntsRef output = NO_OUTPUT ;
377
361
int limit = dictionary .fullStrip ? 0 : 1 ;
378
362
for (int i = length ; i >= limit ; i --) {
@@ -396,25 +380,12 @@ private List<CharsRef> stem(
396
380
}
397
381
398
382
if (isAffixCompatible (suffix , prevFlag , recursionDepth , previousWasPrefix )) {
399
- int appendLength = length - i ;
400
- int deAffixedLength = length - appendLength ;
401
-
402
- int stripOrd = dictionary .affixData (suffix , Dictionary .AFFIX_STRIP_ORD );
403
- int stripStart = dictionary .stripOffsets [stripOrd ];
404
- int stripEnd = dictionary .stripOffsets [stripOrd + 1 ];
405
- int stripLength = stripEnd - stripStart ;
406
-
407
- if (!checkCondition (
408
- suffix , word , 0 , deAffixedLength , dictionary .stripData , stripStart , stripLength )) {
383
+ char [] strippedWord = stripAffix (word , length , length - i , suffix , false );
384
+ if (strippedWord == null ) {
409
385
continue ;
410
386
}
411
387
412
- char [] strippedWord = new char [stripLength + deAffixedLength ];
413
- System .arraycopy (word , 0 , strippedWord , 0 , deAffixedLength );
414
- System .arraycopy (
415
- dictionary .stripData , stripStart , strippedWord , deAffixedLength , stripLength );
416
-
417
- List <CharsRef > stemList =
388
+ stems .addAll (
418
389
applyAffix (
419
390
strippedWord ,
420
391
strippedWord .length ,
@@ -423,9 +394,7 @@ private List<CharsRef> stem(
423
394
recursionDepth ,
424
395
false ,
425
396
circumfix ,
426
- caseVariant );
427
-
428
- stems .addAll (stemList );
397
+ caseVariant ));
429
398
}
430
399
}
431
400
}
@@ -434,6 +403,30 @@ private List<CharsRef> stem(
434
403
return stems ;
435
404
}
436
405
406
+ private char [] stripAffix (char [] word , int length , int affixLen , int affix , boolean isPrefix ) {
407
+ int deAffixedLen = length - affixLen ;
408
+
409
+ int stripOrd = dictionary .affixData (affix , Dictionary .AFFIX_STRIP_ORD );
410
+ int stripStart = dictionary .stripOffsets [stripOrd ];
411
+ int stripEnd = dictionary .stripOffsets [stripOrd + 1 ];
412
+ int stripLen = stripEnd - stripStart ;
413
+
414
+ char [] stripData = dictionary .stripData ;
415
+ boolean condition =
416
+ isPrefix
417
+ ? checkCondition (affix , stripData , stripStart , stripLen , word , affixLen , deAffixedLen )
418
+ : checkCondition (affix , word , 0 , deAffixedLen , stripData , stripStart , stripLen );
419
+ if (!condition ) {
420
+ return null ;
421
+ }
422
+
423
+ char [] strippedWord = new char [stripLen + deAffixedLen ];
424
+ System .arraycopy (
425
+ word , isPrefix ? affixLen : 0 , strippedWord , isPrefix ? stripLen : 0 , deAffixedLen );
426
+ System .arraycopy (stripData , stripStart , strippedWord , isPrefix ? 0 : deAffixedLen , stripLen );
427
+ return strippedWord ;
428
+ }
429
+
437
430
private boolean isAffixCompatible (
438
431
int affix , int prevFlag , int recursionDepth , boolean previousWasPrefix ) {
439
432
int append = dictionary .affixData (affix , Dictionary .AFFIX_APPEND );
@@ -495,9 +488,9 @@ private boolean checkCondition(
495
488
* @param strippedWord Word from which the affix has been removed and the strip added
496
489
* @param length valid length of stripped word
497
490
* @param affix HunspellAffix representing the affix rule itself
498
- * @param prefixId when we already stripped a prefix, we cant simply recurse and check the suffix,
499
- * unless both are compatible so we must check dictionary form against both to add it as a
500
- * stem!
491
+ * @param prefixId when we already stripped a prefix, we can't simply recurse and check the
492
+ * suffix, unless both are compatible so we must check dictionary form against both to add it
493
+ * as a stem!
501
494
* @param recursionDepth current recursion depth
502
495
* @param prefix true if we are removing a prefix (false if it's a suffix)
503
496
* @return List of stems for the word, or an empty list if none are found
0 commit comments