3737import java .util .Collections ;
3838import java .util .Comparator ;
3939import java .util .HashMap ;
40+ import java .util .HashSet ;
4041import java .util .LinkedHashMap ;
4142import java .util .LinkedHashSet ;
4243import java .util .List ;
43- import java .util .Locale ;
4444import java .util .Map ;
4545import java .util .Set ;
4646import java .util .TreeMap ;
@@ -78,9 +78,6 @@ public class Dictionary {
7878 private static final int DEFAULT_FLAGS = 65510 ;
7979 static final char HIDDEN_FLAG = (char ) 65511 ; // called 'ONLYUPCASEFLAG' in Hunspell
8080
81- // TODO: really for suffixes we should reverse the automaton and run them backwards
82- private static final String PREFIX_CONDITION_REGEX = "%s.*" ;
83- private static final String SUFFIX_CONDITION_REGEX = ".*%s" ;
8481 static final Charset DEFAULT_CHARSET = StandardCharsets .ISO_8859_1 ;
8582 CharsetDecoder decoder = replacingDecoder (DEFAULT_CHARSET );
8683
@@ -147,8 +144,12 @@ public class Dictionary {
147144 boolean ignoreCase ;
148145 boolean checkSharpS ;
149146 boolean complexPrefixes ;
150- // if no affixes have continuation classes, no need to do 2-level affix stripping
151- boolean twoStageAffix ;
147+
148+ /**
149+ * All flags used in affix continuation classes. If an outer affix's flag isn't here, there's no
150+ * need to do 2-level affix stripping with it.
151+ */
152+ private char [] secondStageAffixFlags ;
152153
153154 char circumfix ;
154155 char keepcase , forceUCase ;
@@ -332,6 +333,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
332333 throws IOException , ParseException {
333334 TreeMap <String , List <Integer >> prefixes = new TreeMap <>();
334335 TreeMap <String , List <Integer >> suffixes = new TreeMap <>();
336+ Set <Character > stage2Flags = new HashSet <>();
335337 Map <String , Integer > seenPatterns = new HashMap <>();
336338
337339 // zero condition -> 0 ord
@@ -359,9 +361,9 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
359361 } else if ("AM" .equals (firstWord )) {
360362 parseMorphAlias (line );
361363 } else if ("PFX" .equals (firstWord )) {
362- parseAffix (prefixes , line , reader , PREFIX_CONDITION_REGEX , seenPatterns , seenStrips , flags );
364+ parseAffix (prefixes , stage2Flags , line , reader , false , seenPatterns , seenStrips , flags );
363365 } else if ("SFX" .equals (firstWord )) {
364- parseAffix (suffixes , line , reader , SUFFIX_CONDITION_REGEX , seenPatterns , seenStrips , flags );
366+ parseAffix (suffixes , stage2Flags , line , reader , true , seenPatterns , seenStrips , flags );
365367 } else if (line .equals ("COMPLEXPREFIXES" )) {
366368 complexPrefixes =
367369 true ; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@@ -476,6 +478,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
476478
477479 this .prefixes = affixFST (prefixes );
478480 this .suffixes = affixFST (suffixes );
481+ secondStageAffixFlags = toSortedCharArray (stage2Flags );
479482
480483 int totalChars = 0 ;
481484 for (String strip : seenStrips .keySet ()) {
@@ -675,16 +678,15 @@ static String escapeDash(String re) {
675678 * @param affixes Map where the result of the parsing will be put
676679 * @param header Header line of the affix rule
677680 * @param reader BufferedReader to read the content of the rule from
678- * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate
679- * the condition regex pattern
680681 * @param seenPatterns map from condition -> index of patterns, for deduplication.
681682 * @throws IOException Can be thrown while reading the rule
682683 */
683684 private void parseAffix (
684685 TreeMap <String , List <Integer >> affixes ,
686+ Set <Character > secondStageFlags ,
685687 String header ,
686688 LineNumberReader reader ,
687- String conditionPattern ,
689+ boolean isSuffix ,
688690 Map <String , Integer > seenPatterns ,
689691 Map <String , Integer > seenStrips ,
690692 FlagEnumerator flags )
@@ -694,7 +696,6 @@ private void parseAffix(
694696 String [] args = header .split ("\\ s+" );
695697
696698 boolean crossProduct = args [2 ].equals ("Y" );
697- boolean isSuffix = conditionPattern .equals (SUFFIX_CONDITION_REGEX );
698699
699700 int numLines ;
700701 try {
@@ -725,7 +726,9 @@ private void parseAffix(
725726 }
726727
727728 appendFlags = flagParsingStrategy .parseFlags (flagPart );
728- twoStageAffix = true ;
729+ for (char appendFlag : appendFlags ) {
730+ secondStageFlags .add (appendFlag );
731+ }
729732 }
730733 // zero affix -> empty string
731734 if ("0" .equals (affixArg )) {
@@ -750,7 +753,8 @@ private void parseAffix(
750753 // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
751754 // but this is complicated...
752755 } else {
753- regex = String .format (Locale .ROOT , conditionPattern , condition );
756+ // TODO: really for suffixes we should reverse the automaton and run them backwards
757+ regex = isSuffix ? ".*" + condition : condition + ".*" ;
754758 }
755759
756760 // deduplicate patterns
@@ -1610,6 +1614,20 @@ CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
16101614 return reuse ;
16111615 }
16121616
/**
 * Copies the characters of the given set into a newly allocated primitive array and sorts it in
 * ascending order, so that the result can be probed with {@code Arrays.binarySearch}.
 *
 * @param set the characters to copy; each element is unboxed, so a {@code null} element would
 *     cause a {@link NullPointerException}
 * @return a new sorted array holding exactly {@code set.size()} characters
 */
1617+ private static char [] toSortedCharArray (Set <Character > set ) {
1618+ char [] chars = new char [set .size ()];
1619+ int i = 0 ;
// Set iteration order does not matter here: the array is sorted right after it is filled.
1620+ for (Character c : set ) {
1621+ chars [i ++] = c ;
1622+ }
1623+ Arrays .sort (chars );
1624+ return chars ;
1625+ }
1626+
/**
 * Tells whether the given flag is one of the flags recorded in {@code secondStageAffixFlags},
 * i.e. the flags collected from affix continuation classes while reading the affix file.
 *
 * @param flag the affix flag to look up
 * @return {@code true} if the flag is present in {@code secondStageAffixFlags}
 */
1627+ boolean isSecondStageAffix (char flag ) {
// Binary search relies on secondStageAffixFlags being sorted; readAffixFile builds it with
// toSortedCharArray. A non-negative result means the flag was found.
1628+ return Arrays .binarySearch (secondStageAffixFlags , flag ) >= 0 ;
1629+ }
1630+
16131631 /** folds single character (according to LANG if present) */
16141632 char caseFold (char c ) {
16151633 if (alternateCasing ) {
0 commit comments