@@ -92,6 +92,8 @@ public class Dictionary {
92
92
private static final String LANG_KEY = "LANG" ;
93
93
private static final String BREAK_KEY = "BREAK" ;
94
94
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD" ;
95
+ private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN" ;
96
+ private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE" ;
95
97
private static final String KEEPCASE_KEY = "KEEPCASE" ;
96
98
private static final String NEEDAFFIX_KEY = "NEEDAFFIX" ;
97
99
private static final String PSEUDOROOT_KEY = "PSEUDOROOT" ;
@@ -136,7 +138,7 @@ public class Dictionary {
136
138
static final int AFFIX_APPEND = 3 ;
137
139
138
140
// Default flag parsing strategy
139
- private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy ();
141
+ FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy ();
140
142
141
143
// AF entries
142
144
private String [] aliases ;
@@ -163,6 +165,8 @@ public class Dictionary {
163
165
int needaffix = -1 ; // needaffix flag, or -1 if one is not defined
164
166
int forbiddenword = -1 ; // forbiddenword flag, or -1 if one is not defined
165
167
int onlyincompound = -1 ; // onlyincompound flag, or -1 if one is not defined
168
+ int compoundMin = 3 ;
169
+ List <CompoundRule > compoundRules ; // nullable
166
170
167
171
// ignored characters (dictionary, affix, inputs)
168
172
private char [] ignore ;
@@ -419,6 +423,18 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
419
423
throw new ParseException ("Illegal FORBIDDENWORD declaration" , reader .getLineNumber ());
420
424
}
421
425
forbiddenword = flagParsingStrategy .parseFlag (parts [1 ]);
426
+ } else if (line .startsWith (COMPOUNDMIN_KEY )) {
427
+ String [] parts = line .split ("\\ s+" );
428
+ if (parts .length != 2 ) {
429
+ throw new ParseException ("Illegal COMPOUNDMIN declaration" , reader .getLineNumber ());
430
+ }
431
+ compoundMin = Math .max (1 , Integer .parseInt (parts [1 ]));
432
+ } else if (line .startsWith (COMPOUNDRULE_KEY )) {
433
+ String [] parts = line .split ("\\ s+" );
434
+ if (parts .length != 2 ) {
435
+ throw new ParseException ("Illegal COMPOUNDRULE header" , reader .getLineNumber ());
436
+ }
437
+ this .compoundRules = parseCompoundRules (reader , Integer .parseInt (parts [1 ]));
422
438
}
423
439
}
424
440
@@ -442,6 +458,21 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
442
458
stripOffsets [currentIndex ] = currentOffset ;
443
459
}
444
460
461
+ private List <CompoundRule > parseCompoundRules (LineNumberReader reader , int num )
462
+ throws IOException , ParseException {
463
+ String line ;
464
+ List <CompoundRule > compoundRules = new ArrayList <>();
465
+ for (int i = 0 ; i < num ; i ++) {
466
+ line = reader .readLine ();
467
+ String [] parts = line .split ("\\ s+" );
468
+ if (!line .startsWith (COMPOUNDRULE_KEY ) || parts .length != 2 ) {
469
+ throw new ParseException ("COMPOUNDRULE rule expected" , reader .getLineNumber ());
470
+ }
471
+ compoundRules .add (new CompoundRule (parts [1 ], this ));
472
+ }
473
+ return compoundRules ;
474
+ }
475
+
445
476
private Breaks parseBreaks (LineNumberReader reader , String line )
446
477
throws IOException , ParseException {
447
478
Set <String > starting = new LinkedHashSet <>();
@@ -910,7 +941,7 @@ private void addHiddenCapitalizedWord(
910
941
reuse .append (caseFold (word .charAt (i )));
911
942
}
912
943
reuse .append (FLAG_SEPARATOR );
913
- reuse . append (HIDDEN_FLAG );
944
+ flagParsingStrategy . appendFlag (HIDDEN_FLAG , reuse );
914
945
reuse .append (afterSep , afterSep .charAt (0 ) == FLAG_SEPARATOR ? 1 : 0 , afterSep .length ());
915
946
writer .write (reuse .toString ().getBytes (StandardCharsets .UTF_8 ));
916
947
}
@@ -1188,16 +1219,19 @@ private String parseStemException(String morphData) {
1188
1219
return null ;
1189
1220
}
1190
1221
1191
- boolean isForbiddenWord (char [] word , BytesRef scratch ) {
1222
+ boolean isForbiddenWord (char [] word , int length , BytesRef scratch ) {
1192
1223
if (forbiddenword != -1 ) {
1193
- IntsRef forms = lookupWord (word , 0 , word .length );
1194
- if (forms != null ) {
1195
- int formStep = formStep ();
1196
- for (int i = 0 ; i < forms .length ; i += formStep ) {
1197
- if (hasFlag (forms .ints [forms .offset + i ], (char ) forbiddenword , scratch )) {
1198
- return true ;
1199
- }
1200
- }
1224
+ IntsRef forms = lookupWord (word , 0 , length );
1225
+ return forms != null && hasFlag (forms , (char ) forbiddenword , scratch );
1226
+ }
1227
+ return false ;
1228
+ }
1229
+
1230
+ boolean hasFlag (IntsRef forms , char flag , BytesRef scratch ) {
1231
+ int formStep = formStep ();
1232
+ for (int i = 0 ; i < forms .length ; i += formStep ) {
1233
+ if (hasFlag (forms .ints [forms .offset + i ], flag , scratch )) {
1234
+ return true ;
1201
1235
}
1202
1236
}
1203
1237
return false ;
@@ -1227,6 +1261,8 @@ char parseFlag(String rawFlag) {
1227
1261
* @return Parsed flags
1228
1262
*/
1229
1263
abstract char [] parseFlags (String rawFlags );
1264
+
1265
+ abstract void appendFlag (char flag , StringBuilder to );
1230
1266
}
1231
1267
1232
1268
/**
@@ -1238,6 +1274,11 @@ private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
1238
1274
public char [] parseFlags (String rawFlags ) {
1239
1275
return rawFlags .toCharArray ();
1240
1276
}
1277
+
1278
+ @ Override
1279
+ void appendFlag (char flag , StringBuilder to ) {
1280
+ to .append (flag );
1281
+ }
1241
1282
}
1242
1283
1243
1284
/**
@@ -1266,6 +1307,14 @@ public char[] parseFlags(String rawFlags) {
1266
1307
}
1267
1308
return flags ;
1268
1309
}
1310
+
1311
+ @ Override
1312
+ void appendFlag (char flag , StringBuilder to ) {
1313
+ if (to .length () > 0 ) {
1314
+ to .append ("," );
1315
+ }
1316
+ to .append ((int ) flag );
1317
+ }
1269
1318
}
1270
1319
1271
1320
/**
@@ -1300,6 +1349,16 @@ public char[] parseFlags(String rawFlags) {
1300
1349
builder .getChars (0 , builder .length (), flags , 0 );
1301
1350
return flags ;
1302
1351
}
1352
+
1353
+ @ Override
1354
+ void appendFlag (char flag , StringBuilder to ) {
1355
+ to .append ((char ) (flag >> 8 ));
1356
+ to .append ((char ) (flag & 0xff ));
1357
+ }
1358
+ }
1359
+
1360
+ boolean hasCompounding () {
1361
+ return compoundRules != null ;
1303
1362
}
1304
1363
1305
1364
boolean hasFlag (int entryId , char flag , BytesRef scratch ) {
0 commit comments