37
37
import java .util .Collections ;
38
38
import java .util .Comparator ;
39
39
import java .util .HashMap ;
40
+ import java .util .HashSet ;
40
41
import java .util .LinkedHashMap ;
41
42
import java .util .LinkedHashSet ;
42
43
import java .util .List ;
43
- import java .util .Locale ;
44
44
import java .util .Map ;
45
45
import java .util .Set ;
46
46
import java .util .TreeMap ;
@@ -78,9 +78,6 @@ public class Dictionary {
78
78
private static final int DEFAULT_FLAGS = 65510 ;
79
79
static final char HIDDEN_FLAG = (char ) 65511 ; // called 'ONLYUPCASEFLAG' in Hunspell
80
80
81
- // TODO: really for suffixes we should reverse the automaton and run them backwards
82
- private static final String PREFIX_CONDITION_REGEX = "%s.*" ;
83
- private static final String SUFFIX_CONDITION_REGEX = ".*%s" ;
84
81
static final Charset DEFAULT_CHARSET = StandardCharsets .ISO_8859_1 ;
85
82
CharsetDecoder decoder = replacingDecoder (DEFAULT_CHARSET );
86
83
@@ -147,8 +144,12 @@ public class Dictionary {
147
144
boolean ignoreCase ;
148
145
boolean checkSharpS ;
149
146
boolean complexPrefixes ;
150
- // if no affixes have continuation classes, no need to do 2-level affix stripping
151
- boolean twoStageAffix ;
147
+
148
+ /**
149
+ * All flags used in affix continuation classes. If an outer affix's flag isn't here, there's no
150
+ * need to do 2-level affix stripping with it.
151
+ */
152
+ private char [] secondStageAffixFlags ;
152
153
153
154
char circumfix ;
154
155
char keepcase , forceUCase ;
@@ -332,6 +333,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
332
333
throws IOException , ParseException {
333
334
TreeMap <String , List <Integer >> prefixes = new TreeMap <>();
334
335
TreeMap <String , List <Integer >> suffixes = new TreeMap <>();
336
+ Set <Character > stage2Flags = new HashSet <>();
335
337
Map <String , Integer > seenPatterns = new HashMap <>();
336
338
337
339
// zero condition -> 0 ord
@@ -359,9 +361,9 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
359
361
} else if ("AM" .equals (firstWord )) {
360
362
parseMorphAlias (line );
361
363
} else if ("PFX" .equals (firstWord )) {
362
- parseAffix (prefixes , line , reader , PREFIX_CONDITION_REGEX , seenPatterns , seenStrips , flags );
364
+ parseAffix (prefixes , stage2Flags , line , reader , false , seenPatterns , seenStrips , flags );
363
365
} else if ("SFX" .equals (firstWord )) {
364
- parseAffix (suffixes , line , reader , SUFFIX_CONDITION_REGEX , seenPatterns , seenStrips , flags );
366
+ parseAffix (suffixes , stage2Flags , line , reader , true , seenPatterns , seenStrips , flags );
365
367
} else if (line .equals ("COMPLEXPREFIXES" )) {
366
368
complexPrefixes =
367
369
true ; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@@ -476,6 +478,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
476
478
477
479
this .prefixes = affixFST (prefixes );
478
480
this .suffixes = affixFST (suffixes );
481
+ secondStageAffixFlags = toSortedCharArray (stage2Flags );
479
482
480
483
int totalChars = 0 ;
481
484
for (String strip : seenStrips .keySet ()) {
@@ -675,16 +678,15 @@ static String escapeDash(String re) {
675
678
* @param affixes Map where the result of the parsing will be put
676
679
* @param secondStageFlags Set that collects every flag parsed from the rules' continuation classes
* @param header Header line of the affix rule
677
680
* @param reader BufferedReader to read the content of the rule from
678
- * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate
679
- * the condition regex pattern
680
681
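* @param isSuffix true when parsing a suffix (SFX) rule, false for a prefix (PFX) rule; selects
*     whether the condition regex is built as ".*" + condition or condition + ".*"
* @param seenPatterns map from condition -> index of patterns, for deduplication.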
681
682
* @throws IOException Can be thrown while reading the rule
682
683
*/
683
684
private void parseAffix (
684
685
TreeMap <String , List <Integer >> affixes ,
686
+ Set <Character > secondStageFlags ,
685
687
String header ,
686
688
LineNumberReader reader ,
687
- String conditionPattern ,
689
+ boolean isSuffix ,
688
690
Map <String , Integer > seenPatterns ,
689
691
Map <String , Integer > seenStrips ,
690
692
FlagEnumerator flags )
@@ -694,7 +696,6 @@ private void parseAffix(
694
696
String [] args = header .split ("\\ s+" );
695
697
696
698
boolean crossProduct = args [2 ].equals ("Y" );
697
- boolean isSuffix = conditionPattern .equals (SUFFIX_CONDITION_REGEX );
698
699
699
700
int numLines ;
700
701
try {
@@ -725,7 +726,9 @@ private void parseAffix(
725
726
}
726
727
727
728
appendFlags = flagParsingStrategy .parseFlags (flagPart );
728
- twoStageAffix = true ;
729
+ for (char appendFlag : appendFlags ) {
730
+ secondStageFlags .add (appendFlag );
731
+ }
729
732
}
730
733
// zero affix -> empty string
731
734
if ("0" .equals (affixArg )) {
@@ -750,7 +753,8 @@ private void parseAffix(
750
753
// if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
751
754
// but this is complicated...
752
755
} else {
753
- regex = String .format (Locale .ROOT , conditionPattern , condition );
756
+ // TODO: really for suffixes we should reverse the automaton and run them backwards
757
+ regex = isSuffix ? ".*" + condition : condition + ".*" ;
754
758
}
755
759
756
760
// deduplicate patterns
@@ -1610,6 +1614,20 @@ CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
1610
1614
return reuse ;
1611
1615
}
1612
1616
1617
+ private static char [] toSortedCharArray (Set <Character > set ) {
1618
+ char [] chars = new char [set .size ()];
1619
+ int i = 0 ;
1620
+ for (Character c : set ) {
1621
+ chars [i ++] = c ;
1622
+ }
1623
+ Arrays .sort (chars );
1624
+ return chars ;
1625
+ }
1626
+
1627
+ boolean isSecondStageAffix (char flag ) {
1628
+ return Arrays .binarySearch (secondStageAffixFlags , flag ) >= 0 ;
1629
+ }
1630
+
1613
1631
/** folds single character (according to LANG if present) */
1614
1632
char caseFold (char c ) {
1615
1633
if (alternateCasing ) {
0 commit comments