@@ -78,31 +78,6 @@ public class Dictionary {
78
78
79
79
private static final char HIDDEN_FLAG = (char ) 65511 ; // called 'ONLYUPCASEFLAG' in Hunspell
80
80
81
- private static final String ALIAS_KEY = "AF" ;
82
- private static final String MORPH_ALIAS_KEY = "AM" ;
83
- private static final String PREFIX_KEY = "PFX" ;
84
- private static final String SUFFIX_KEY = "SFX" ;
85
- private static final String FLAG_KEY = "FLAG" ;
86
- private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES" ;
87
- private static final String CIRCUMFIX_KEY = "CIRCUMFIX" ;
88
- private static final String IGNORE_KEY = "IGNORE" ;
89
- private static final String ICONV_KEY = "ICONV" ;
90
- private static final String OCONV_KEY = "OCONV" ;
91
- private static final String FULLSTRIP_KEY = "FULLSTRIP" ;
92
- private static final String LANG_KEY = "LANG" ;
93
- private static final String BREAK_KEY = "BREAK" ;
94
- private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD" ;
95
- private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN" ;
96
- private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE" ;
97
- private static final String KEEPCASE_KEY = "KEEPCASE" ;
98
- private static final String NEEDAFFIX_KEY = "NEEDAFFIX" ;
99
- private static final String PSEUDOROOT_KEY = "PSEUDOROOT" ;
100
- private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND" ;
101
-
102
- private static final String NUM_FLAG_TYPE = "num" ;
103
- private static final String UTF8_FLAG_TYPE = "UTF-8" ;
104
- private static final String LONG_FLAG_TYPE = "long" ;
105
-
106
81
// TODO: really for suffixes we should reverse the automaton and run them backwards
107
82
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*" ;
108
83
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s" ;
@@ -346,95 +321,62 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
346
321
if (reader .getLineNumber () == 1 && line .startsWith ("\uFEFF " )) {
347
322
line = line .substring (1 );
348
323
}
349
- if (line .startsWith (ALIAS_KEY )) {
324
+ line = line .trim ();
325
+ if (line .isEmpty ()) continue ;
326
+
327
+ String firstWord = line .split ("\\ s" )[0 ];
328
+ if ("AF" .equals (firstWord )) {
350
329
parseAlias (line );
351
- } else if (line . startsWith ( MORPH_ALIAS_KEY )) {
330
+ } else if ("AM" . equals ( firstWord )) {
352
331
parseMorphAlias (line );
353
- } else if (line . startsWith ( PREFIX_KEY )) {
332
+ } else if ("PFX" . equals ( firstWord )) {
354
333
parseAffix (
355
334
prefixes , line , reader , PREFIX_CONDITION_REGEX_PATTERN , seenPatterns , seenStrips );
356
- } else if (line . startsWith ( SUFFIX_KEY )) {
335
+ } else if ("SFX" . equals ( firstWord )) {
357
336
parseAffix (
358
337
suffixes , line , reader , SUFFIX_CONDITION_REGEX_PATTERN , seenPatterns , seenStrips );
359
- } else if (line . startsWith ( FLAG_KEY )) {
338
+ } else if ("FLAG" . equals ( firstWord )) {
360
339
// Assume that the FLAG line comes before any prefix or suffixes
361
340
// Store the strategy so it can be used when parsing the dic file
362
341
flagParsingStrategy = getFlagParsingStrategy (line );
363
- } else if (line .equals (COMPLEXPREFIXES_KEY )) {
342
+ } else if (line .equals ("COMPLEXPREFIXES" )) {
364
343
complexPrefixes =
365
344
true ; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
366
- } else if (line .startsWith (CIRCUMFIX_KEY )) {
367
- String [] parts = line .split ("\\ s+" );
368
- if (parts .length != 2 ) {
369
- throw new ParseException ("Illegal CIRCUMFIX declaration" , reader .getLineNumber ());
370
- }
371
- circumfix = flagParsingStrategy .parseFlag (parts [1 ]);
372
- } else if (line .startsWith (KEEPCASE_KEY )) {
373
- String [] parts = line .split ("\\ s+" );
374
- if (parts .length != 2 ) {
375
- throw new ParseException ("Illegal KEEPCASE declaration" , reader .getLineNumber ());
376
- }
377
- keepcase = flagParsingStrategy .parseFlag (parts [1 ]);
378
- } else if (line .startsWith (NEEDAFFIX_KEY ) || line .startsWith (PSEUDOROOT_KEY )) {
379
- String [] parts = line .split ("\\ s+" );
380
- if (parts .length != 2 ) {
381
- throw new ParseException ("Illegal NEEDAFFIX declaration" , reader .getLineNumber ());
382
- }
383
- needaffix = flagParsingStrategy .parseFlag (parts [1 ]);
384
- } else if (line .startsWith (ONLYINCOMPOUND_KEY )) {
385
- String [] parts = line .split ("\\ s+" );
386
- if (parts .length != 2 ) {
387
- throw new ParseException ("Illegal ONLYINCOMPOUND declaration" , reader .getLineNumber ());
388
- }
389
- onlyincompound = flagParsingStrategy .parseFlag (parts [1 ]);
390
- } else if (line .startsWith (IGNORE_KEY )) {
391
- String [] parts = line .split ("\\ s+" );
392
- if (parts .length != 2 ) {
393
- throw new ParseException ("Illegal IGNORE declaration" , reader .getLineNumber ());
394
- }
395
- ignore = parts [1 ].toCharArray ();
345
+ } else if ("CIRCUMFIX" .equals (firstWord )) {
346
+ circumfix = flagParsingStrategy .parseFlag (singleArgument (reader , line ));
347
+ } else if ("KEEPCASE" .equals (firstWord )) {
348
+ keepcase = flagParsingStrategy .parseFlag (singleArgument (reader , line ));
349
+ } else if ("NEEDAFFIX" .equals (firstWord ) || "PSEUDOROOT" .equals (firstWord )) {
350
+ needaffix = flagParsingStrategy .parseFlag (singleArgument (reader , line ));
351
+ } else if ("ONLYINCOMPOUND" .equals (firstWord )) {
352
+ onlyincompound = flagParsingStrategy .parseFlag (singleArgument (reader , line ));
353
+ } else if ("IGNORE" .equals (firstWord )) {
354
+ ignore = singleArgument (reader , line ).toCharArray ();
396
355
Arrays .sort (ignore );
397
356
needsInputCleaning = true ;
398
- } else if (line .startsWith (ICONV_KEY ) || line .startsWith (OCONV_KEY )) {
399
- String [] parts = line .split ("\\ s+" );
400
- String type = parts [0 ];
401
- if (parts .length != 2 ) {
402
- throw new ParseException ("Illegal " + type + " declaration" , reader .getLineNumber ());
403
- }
404
- int num = Integer .parseInt (parts [1 ]);
357
+ } else if ("ICONV" .equals (firstWord ) || "OCONV" .equals (firstWord )) {
358
+ int num = Integer .parseInt (singleArgument (reader , line ));
405
359
FST <CharsRef > res = parseConversions (reader , num );
406
- if (type . equals ( "ICONV " )) {
360
+ if (line . startsWith ( "I " )) {
407
361
iconv = res ;
408
362
needsInputCleaning |= iconv != null ;
409
363
} else {
410
364
oconv = res ;
411
365
needsOutputCleaning |= oconv != null ;
412
366
}
413
- } else if (line . startsWith ( FULLSTRIP_KEY )) {
367
+ } else if ("FULLSTRIP" . equals ( firstWord )) {
414
368
fullStrip = true ;
415
- } else if (line . startsWith ( LANG_KEY )) {
416
- language = line . substring ( LANG_KEY . length ()). trim ( );
369
+ } else if ("LANG" . equals ( firstWord )) {
370
+ language = singleArgument ( reader , line );
417
371
alternateCasing = "tr_TR" .equals (language ) || "az_AZ" .equals (language );
418
- } else if (line . startsWith ( BREAK_KEY )) {
372
+ } else if ("BREAK" . equals ( firstWord )) {
419
373
breaks = parseBreaks (reader , line );
420
- } else if (line .startsWith (FORBIDDENWORD_KEY )) {
421
- String [] parts = line .split ("\\ s+" );
422
- if (parts .length != 2 ) {
423
- throw new ParseException ("Illegal FORBIDDENWORD declaration" , reader .getLineNumber ());
424
- }
425
- forbiddenword = flagParsingStrategy .parseFlag (parts [1 ]);
426
- } else if (line .startsWith (COMPOUNDMIN_KEY )) {
427
- String [] parts = line .split ("\\ s+" );
428
- if (parts .length != 2 ) {
429
- throw new ParseException ("Illegal COMPOUNDMIN declaration" , reader .getLineNumber ());
430
- }
431
- compoundMin = Math .max (1 , Integer .parseInt (parts [1 ]));
432
- } else if (line .startsWith (COMPOUNDRULE_KEY )) {
433
- String [] parts = line .split ("\\ s+" );
434
- if (parts .length != 2 ) {
435
- throw new ParseException ("Illegal COMPOUNDRULE header" , reader .getLineNumber ());
436
- }
437
- this .compoundRules = parseCompoundRules (reader , Integer .parseInt (parts [1 ]));
374
+ } else if ("FORBIDDENWORD" .equals (firstWord )) {
375
+ forbiddenword = flagParsingStrategy .parseFlag (singleArgument (reader , line ));
376
+ } else if ("COMPOUNDMIN" .equals (firstWord )) {
377
+ compoundMin = Math .max (1 , Integer .parseInt (singleArgument (reader , line )));
378
+ } else if ("COMPOUNDRULE" .equals (firstWord )) {
379
+ compoundRules = parseCompoundRules (reader , Integer .parseInt (singleArgument (reader , line )));
438
380
}
439
381
}
440
382
@@ -458,17 +400,25 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
458
400
stripOffsets [currentIndex ] = currentOffset ;
459
401
}
460
402
403
+ private String singleArgument (LineNumberReader reader , String line ) throws ParseException {
404
+ return splitBySpace (reader , line , 2 )[1 ];
405
+ }
406
+
407
+ private String [] splitBySpace (LineNumberReader reader , String line , int expectedParts )
408
+ throws ParseException {
409
+ String [] parts = line .split ("\\ s+" );
410
+ if (parts .length < expectedParts
411
+ || parts .length > expectedParts && !parts [expectedParts ].startsWith ("#" )) {
412
+ throw new ParseException ("Invalid syntax" , reader .getLineNumber ());
413
+ }
414
+ return parts ;
415
+ }
416
+
461
417
private List <CompoundRule > parseCompoundRules (LineNumberReader reader , int num )
462
418
throws IOException , ParseException {
463
- String line ;
464
419
List <CompoundRule > compoundRules = new ArrayList <>();
465
420
for (int i = 0 ; i < num ; i ++) {
466
- line = reader .readLine ();
467
- String [] parts = line .split ("\\ s+" );
468
- if (!line .startsWith (COMPOUNDRULE_KEY ) || parts .length != 2 ) {
469
- throw new ParseException ("COMPOUNDRULE rule expected" , reader .getLineNumber ());
470
- }
471
- compoundRules .add (new CompoundRule (parts [1 ], this ));
421
+ compoundRules .add (new CompoundRule (singleArgument (reader , reader .readLine ()), this ));
472
422
}
473
423
return compoundRules ;
474
424
}
@@ -478,14 +428,9 @@ private Breaks parseBreaks(LineNumberReader reader, String line)
478
428
Set <String > starting = new LinkedHashSet <>();
479
429
Set <String > ending = new LinkedHashSet <>();
480
430
Set <String > middle = new LinkedHashSet <>();
481
- int num = Integer .parseInt (line . substring ( BREAK_KEY . length ()). trim ( ));
431
+ int num = Integer .parseInt (singleArgument ( reader , line ));
482
432
for (int i = 0 ; i < num ; i ++) {
483
- line = reader .readLine ();
484
- String [] parts = line .split ("\\ s+" );
485
- if (!line .startsWith (BREAK_KEY ) || parts .length != 2 ) {
486
- throw new ParseException ("BREAK chars expected" , reader .getLineNumber ());
487
- }
488
- String breakStr = parts [1 ];
433
+ String breakStr = singleArgument (reader , reader .readLine ());
489
434
if (breakStr .startsWith ("^" )) {
490
435
starting .add (breakStr .substring (1 ));
491
436
} else if (breakStr .endsWith ("$" )) {
@@ -689,11 +634,7 @@ private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
689
634
Map <String , String > mappings = new TreeMap <>();
690
635
691
636
for (int i = 0 ; i < num ; i ++) {
692
- String line = reader .readLine ();
693
- String [] parts = line .split ("\\ s+" );
694
- if (parts .length != 3 ) {
695
- throw new ParseException ("invalid syntax: " + line , reader .getLineNumber ());
696
- }
637
+ String [] parts = splitBySpace (reader , reader .readLine (), 3 );
697
638
if (mappings .put (parts [1 ], parts [2 ]) != null ) {
698
639
throw new IllegalStateException ("duplicate mapping specified for: " + parts [1 ]);
699
640
}
@@ -789,11 +730,11 @@ static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
789
730
}
790
731
String flagType = parts [1 ];
791
732
792
- if (NUM_FLAG_TYPE .equals (flagType )) {
733
+ if ("num" .equals (flagType )) {
793
734
return new NumFlagParsingStrategy ();
794
- } else if (UTF8_FLAG_TYPE .equals (flagType )) {
735
+ } else if ("UTF-8" .equals (flagType )) {
795
736
return new SimpleFlagParsingStrategy ();
796
- } else if (LONG_FLAG_TYPE .equals (flagType )) {
737
+ } else if ("long" .equals (flagType )) {
797
738
return new DoubleASCIIFlagParsingStrategy ();
798
739
}
799
740
0 commit comments