@@ -143,7 +143,7 @@ char[] caseFoldLower(char[] word, int length) {
143
143
144
144
// Special prefix handling for Catalan, French, Italian:
145
145
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
146
- char [] capitalizeAfterApostrophe (char [] word , int length ) {
146
+ static char [] capitalizeAfterApostrophe (char [] word , int length ) {
147
147
for (int i = 1 ; i < length - 1 ; i ++) {
148
148
if (word [i ] == '\'' ) {
149
149
char next = word [i + 1 ];
@@ -175,11 +175,12 @@ List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
175
175
if (Dictionary .hasFlag (wordFlags , dictionary .onlyincompound )) {
176
176
continue ;
177
177
}
178
- stems .add (newStem (word , length , forms , i ));
178
+ stems .add (newStem (word , 0 , length , forms , i ));
179
179
}
180
180
}
181
181
try {
182
- stems .addAll (stem (word , length , -1 , (char ) 0 , -1 , 0 , true , true , false , false , caseVariant ));
182
+ stems .addAll (
183
+ stem (word , 0 , length , -1 , (char ) 0 , -1 , 0 , true , true , false , false , caseVariant ));
183
184
} catch (IOException bogus ) {
184
185
throw new RuntimeException (bogus );
185
186
}
@@ -214,7 +215,7 @@ public List<CharsRef> uniqueStems(char[] word, int length) {
214
215
return deduped ;
215
216
}
216
217
217
- private CharsRef newStem (char [] buffer , int length , IntsRef forms , int formID ) {
218
+ private CharsRef newStem (char [] buffer , int offset , int length , IntsRef forms , int formID ) {
218
219
final String exception ;
219
220
if (dictionary .hasStemExceptions ) {
220
221
int exceptionID = forms .ints [forms .offset + formID + 1 ];
@@ -232,7 +233,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
232
233
if (exception != null ) {
233
234
scratchSegment .append (exception );
234
235
} else {
235
- scratchSegment .append (buffer , 0 , length );
236
+ scratchSegment .append (buffer , offset , length );
236
237
}
237
238
try {
238
239
Dictionary .applyMappings (dictionary .oconv , scratchSegment );
@@ -246,7 +247,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
246
247
if (exception != null ) {
247
248
return new CharsRef (exception );
248
249
} else {
249
- return new CharsRef (buffer , 0 , length );
250
+ return new CharsRef (buffer , offset , length );
250
251
}
251
252
}
252
253
}
@@ -284,6 +285,7 @@ private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
284
285
*/
285
286
private List <CharsRef > stem (
286
287
char [] word ,
288
+ int offset ,
287
289
int length ,
288
290
int previous ,
289
291
char prevFlag ,
@@ -308,7 +310,7 @@ private List<CharsRef> stem(
308
310
int limit = dictionary .fullStrip ? length + 1 : length ;
309
311
for (int i = 0 ; i < limit ; i ++) {
310
312
if (i > 0 ) {
311
- int ch = word [i - 1 ];
313
+ char ch = word [offset + i - 1 ];
312
314
if (fst .findTargetArc (ch , arc , arc , prefixReader ) == null ) {
313
315
break ;
314
316
} else if (arc .output () != NO_OUTPUT ) {
@@ -327,15 +329,17 @@ private List<CharsRef> stem(
327
329
}
328
330
329
331
if (isAffixCompatible (prefix , prevFlag , recursionDepth , false )) {
330
- char [] strippedWord = stripAffix (word , length , i , prefix , true );
332
+ char [] strippedWord = stripAffix (word , offset , length , i , prefix , true );
331
333
if (strippedWord == null ) {
332
334
continue ;
333
335
}
334
336
337
+ boolean pureAffix = strippedWord == word ;
335
338
stems .addAll (
336
339
applyAffix (
337
340
strippedWord ,
338
- strippedWord .length ,
341
+ pureAffix ? offset + i : 0 ,
342
+ pureAffix ? length - i : strippedWord .length ,
339
343
prefix ,
340
344
-1 ,
341
345
recursionDepth ,
@@ -356,7 +360,7 @@ private List<CharsRef> stem(
356
360
int limit = dictionary .fullStrip ? 0 : 1 ;
357
361
for (int i = length ; i >= limit ; i --) {
358
362
if (i < length ) {
359
- int ch = word [i ];
363
+ char ch = word [offset + i ];
360
364
if (fst .findTargetArc (ch , arc , arc , suffixReader ) == null ) {
361
365
break ;
362
366
} else if (arc .output () != NO_OUTPUT ) {
@@ -375,15 +379,17 @@ private List<CharsRef> stem(
375
379
}
376
380
377
381
if (isAffixCompatible (suffix , prevFlag , recursionDepth , previousWasPrefix )) {
378
- char [] strippedWord = stripAffix (word , length , length - i , suffix , false );
382
+ char [] strippedWord = stripAffix (word , offset , length , length - i , suffix , false );
379
383
if (strippedWord == null ) {
380
384
continue ;
381
385
}
382
386
387
+ boolean pureAffix = strippedWord == word ;
383
388
stems .addAll (
384
389
applyAffix (
385
390
strippedWord ,
386
- strippedWord .length ,
391
+ pureAffix ? offset : 0 ,
392
+ pureAffix ? i : strippedWord .length ,
387
393
suffix ,
388
394
prefixId ,
389
395
recursionDepth ,
@@ -398,7 +404,13 @@ private List<CharsRef> stem(
398
404
return stems ;
399
405
}
400
406
401
- private char [] stripAffix (char [] word , int length , int affixLen , int affix , boolean isPrefix ) {
407
+ /**
408
+ * @return null if affix conditions aren't met; a reference to the same char[] if the affix has no
409
+ * strip data and can thus be simply removed, or a new char[] containing the word after
410
+ * affix removal
411
+ */
412
+ private char [] stripAffix (
413
+ char [] word , int offset , int length , int affixLen , int affix , boolean isPrefix ) {
402
414
int deAffixedLen = length - affixLen ;
403
415
404
416
int stripOrd = dictionary .affixData (affix , Dictionary .AFFIX_STRIP_ORD );
@@ -409,15 +421,22 @@ private char[] stripAffix(char[] word, int length, int affixLen, int affix, bool
409
421
char [] stripData = dictionary .stripData ;
410
422
boolean condition =
411
423
isPrefix
412
- ? checkCondition (affix , stripData , stripStart , stripLen , word , affixLen , deAffixedLen )
413
- : checkCondition (affix , word , 0 , deAffixedLen , stripData , stripStart , stripLen );
424
+ ? checkCondition (
425
+ affix , stripData , stripStart , stripLen , word , offset + affixLen , deAffixedLen )
426
+ : checkCondition (affix , word , offset , deAffixedLen , stripData , stripStart , stripLen );
414
427
if (!condition ) {
415
428
return null ;
416
429
}
417
430
431
+ if (stripLen == 0 ) return word ;
432
+
418
433
char [] strippedWord = new char [stripLen + deAffixedLen ];
419
434
System .arraycopy (
420
- word , isPrefix ? affixLen : 0 , strippedWord , isPrefix ? stripLen : 0 , deAffixedLen );
435
+ word ,
436
+ offset + (isPrefix ? affixLen : 0 ),
437
+ strippedWord ,
438
+ isPrefix ? stripLen : 0 ,
439
+ deAffixedLen );
421
440
System .arraycopy (stripData , stripStart , strippedWord , isPrefix ? 0 : deAffixedLen , stripLen );
422
441
return strippedWord ;
423
442
}
@@ -484,6 +503,7 @@ private boolean checkCondition(
484
503
*/
485
504
private List <CharsRef > applyAffix (
486
505
char [] strippedWord ,
506
+ int offset ,
487
507
int length ,
488
508
int affix ,
489
509
int prefixId ,
@@ -496,7 +516,7 @@ private List<CharsRef> applyAffix(
496
516
497
517
List <CharsRef > stems = new ArrayList <>();
498
518
499
- IntsRef forms = dictionary .lookupWord (strippedWord , 0 , length );
519
+ IntsRef forms = dictionary .lookupWord (strippedWord , offset , length );
500
520
if (forms != null ) {
501
521
for (int i = 0 ; i < forms .length ; i += formStep ) {
502
522
char [] wordFlags = dictionary .decodeFlags (forms .ints [forms .offset + i ], scratch );
@@ -530,7 +550,7 @@ private List<CharsRef> applyAffix(
530
550
if (Dictionary .hasFlag (wordFlags , dictionary .onlyincompound )) {
531
551
continue ;
532
552
}
533
- stems .add (newStem (strippedWord , length , forms , i ));
553
+ stems .add (newStem (strippedWord , offset , length , forms , i ));
534
554
}
535
555
}
536
556
}
@@ -572,6 +592,7 @@ private List<CharsRef> applyAffix(
572
592
stems .addAll (
573
593
stem (
574
594
strippedWord ,
595
+ offset ,
575
596
length ,
576
597
affix ,
577
598
flag ,
0 commit comments