Skip to content

Commit f64e7cb

Browse files
authored
LUCENE-9691: Hunspell: support trailing comments on aff option lines (#2236)
plus cleanup & deduplicate parsing
1 parent c7e1079 commit f64e7cb

File tree

2 files changed

+58
-116
lines changed
  • lucene/analysis/common/src

2 files changed

+58
-116
lines changed

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Lines changed: 54 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -78,31 +78,6 @@ public class Dictionary {
7878

7979
private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
8080

81-
private static final String ALIAS_KEY = "AF";
82-
private static final String MORPH_ALIAS_KEY = "AM";
83-
private static final String PREFIX_KEY = "PFX";
84-
private static final String SUFFIX_KEY = "SFX";
85-
private static final String FLAG_KEY = "FLAG";
86-
private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
87-
private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
88-
private static final String IGNORE_KEY = "IGNORE";
89-
private static final String ICONV_KEY = "ICONV";
90-
private static final String OCONV_KEY = "OCONV";
91-
private static final String FULLSTRIP_KEY = "FULLSTRIP";
92-
private static final String LANG_KEY = "LANG";
93-
private static final String BREAK_KEY = "BREAK";
94-
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
95-
private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
96-
private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
97-
private static final String KEEPCASE_KEY = "KEEPCASE";
98-
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
99-
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
100-
private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";
101-
102-
private static final String NUM_FLAG_TYPE = "num";
103-
private static final String UTF8_FLAG_TYPE = "UTF-8";
104-
private static final String LONG_FLAG_TYPE = "long";
105-
10681
// TODO: really for suffixes we should reverse the automaton and run them backwards
10782
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
10883
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
@@ -346,95 +321,62 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
346321
if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
347322
line = line.substring(1);
348323
}
349-
if (line.startsWith(ALIAS_KEY)) {
324+
line = line.trim();
325+
if (line.isEmpty()) continue;
326+
327+
String firstWord = line.split("\\s")[0];
328+
if ("AF".equals(firstWord)) {
350329
parseAlias(line);
351-
} else if (line.startsWith(MORPH_ALIAS_KEY)) {
330+
} else if ("AM".equals(firstWord)) {
352331
parseMorphAlias(line);
353-
} else if (line.startsWith(PREFIX_KEY)) {
332+
} else if ("PFX".equals(firstWord)) {
354333
parseAffix(
355334
prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
356-
} else if (line.startsWith(SUFFIX_KEY)) {
335+
} else if ("SFX".equals(firstWord)) {
357336
parseAffix(
358337
suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
359-
} else if (line.startsWith(FLAG_KEY)) {
338+
} else if ("FLAG".equals(firstWord)) {
360339
// Assume that the FLAG line comes before any prefix or suffixes
361340
// Store the strategy so it can be used when parsing the dic file
362341
flagParsingStrategy = getFlagParsingStrategy(line);
363-
} else if (line.equals(COMPLEXPREFIXES_KEY)) {
342+
} else if (line.equals("COMPLEXPREFIXES")) {
364343
complexPrefixes =
365344
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
366-
} else if (line.startsWith(CIRCUMFIX_KEY)) {
367-
String[] parts = line.split("\\s+");
368-
if (parts.length != 2) {
369-
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
370-
}
371-
circumfix = flagParsingStrategy.parseFlag(parts[1]);
372-
} else if (line.startsWith(KEEPCASE_KEY)) {
373-
String[] parts = line.split("\\s+");
374-
if (parts.length != 2) {
375-
throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
376-
}
377-
keepcase = flagParsingStrategy.parseFlag(parts[1]);
378-
} else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
379-
String[] parts = line.split("\\s+");
380-
if (parts.length != 2) {
381-
throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
382-
}
383-
needaffix = flagParsingStrategy.parseFlag(parts[1]);
384-
} else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
385-
String[] parts = line.split("\\s+");
386-
if (parts.length != 2) {
387-
throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
388-
}
389-
onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
390-
} else if (line.startsWith(IGNORE_KEY)) {
391-
String[] parts = line.split("\\s+");
392-
if (parts.length != 2) {
393-
throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
394-
}
395-
ignore = parts[1].toCharArray();
345+
} else if ("CIRCUMFIX".equals(firstWord)) {
346+
circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
347+
} else if ("KEEPCASE".equals(firstWord)) {
348+
keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
349+
} else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
350+
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
351+
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
352+
onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
353+
} else if ("IGNORE".equals(firstWord)) {
354+
ignore = singleArgument(reader, line).toCharArray();
396355
Arrays.sort(ignore);
397356
needsInputCleaning = true;
398-
} else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
399-
String[] parts = line.split("\\s+");
400-
String type = parts[0];
401-
if (parts.length != 2) {
402-
throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
403-
}
404-
int num = Integer.parseInt(parts[1]);
357+
} else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
358+
int num = Integer.parseInt(singleArgument(reader, line));
405359
FST<CharsRef> res = parseConversions(reader, num);
406-
if (type.equals("ICONV")) {
360+
if (line.startsWith("I")) {
407361
iconv = res;
408362
needsInputCleaning |= iconv != null;
409363
} else {
410364
oconv = res;
411365
needsOutputCleaning |= oconv != null;
412366
}
413-
} else if (line.startsWith(FULLSTRIP_KEY)) {
367+
} else if ("FULLSTRIP".equals(firstWord)) {
414368
fullStrip = true;
415-
} else if (line.startsWith(LANG_KEY)) {
416-
language = line.substring(LANG_KEY.length()).trim();
369+
} else if ("LANG".equals(firstWord)) {
370+
language = singleArgument(reader, line);
417371
alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
418-
} else if (line.startsWith(BREAK_KEY)) {
372+
} else if ("BREAK".equals(firstWord)) {
419373
breaks = parseBreaks(reader, line);
420-
} else if (line.startsWith(FORBIDDENWORD_KEY)) {
421-
String[] parts = line.split("\\s+");
422-
if (parts.length != 2) {
423-
throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
424-
}
425-
forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
426-
} else if (line.startsWith(COMPOUNDMIN_KEY)) {
427-
String[] parts = line.split("\\s+");
428-
if (parts.length != 2) {
429-
throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
430-
}
431-
compoundMin = Math.max(1, Integer.parseInt(parts[1]));
432-
} else if (line.startsWith(COMPOUNDRULE_KEY)) {
433-
String[] parts = line.split("\\s+");
434-
if (parts.length != 2) {
435-
throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
436-
}
437-
this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
374+
} else if ("FORBIDDENWORD".equals(firstWord)) {
375+
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
376+
} else if ("COMPOUNDMIN".equals(firstWord)) {
377+
compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
378+
} else if ("COMPOUNDRULE".equals(firstWord)) {
379+
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
438380
}
439381
}
440382

@@ -458,17 +400,25 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
458400
stripOffsets[currentIndex] = currentOffset;
459401
}
460402

403+
private String singleArgument(LineNumberReader reader, String line) throws ParseException {
404+
return splitBySpace(reader, line, 2)[1];
405+
}
406+
407+
private String[] splitBySpace(LineNumberReader reader, String line, int expectedParts)
408+
throws ParseException {
409+
String[] parts = line.split("\\s+");
410+
if (parts.length < expectedParts
411+
|| parts.length > expectedParts && !parts[expectedParts].startsWith("#")) {
412+
throw new ParseException("Invalid syntax", reader.getLineNumber());
413+
}
414+
return parts;
415+
}
416+
461417
private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
462418
throws IOException, ParseException {
463-
String line;
464419
List<CompoundRule> compoundRules = new ArrayList<>();
465420
for (int i = 0; i < num; i++) {
466-
line = reader.readLine();
467-
String[] parts = line.split("\\s+");
468-
if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
469-
throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
470-
}
471-
compoundRules.add(new CompoundRule(parts[1], this));
421+
compoundRules.add(new CompoundRule(singleArgument(reader, reader.readLine()), this));
472422
}
473423
return compoundRules;
474424
}
@@ -478,14 +428,9 @@ private Breaks parseBreaks(LineNumberReader reader, String line)
478428
Set<String> starting = new LinkedHashSet<>();
479429
Set<String> ending = new LinkedHashSet<>();
480430
Set<String> middle = new LinkedHashSet<>();
481-
int num = Integer.parseInt(line.substring(BREAK_KEY.length()).trim());
431+
int num = Integer.parseInt(singleArgument(reader, line));
482432
for (int i = 0; i < num; i++) {
483-
line = reader.readLine();
484-
String[] parts = line.split("\\s+");
485-
if (!line.startsWith(BREAK_KEY) || parts.length != 2) {
486-
throw new ParseException("BREAK chars expected", reader.getLineNumber());
487-
}
488-
String breakStr = parts[1];
433+
String breakStr = singleArgument(reader, reader.readLine());
489434
if (breakStr.startsWith("^")) {
490435
starting.add(breakStr.substring(1));
491436
} else if (breakStr.endsWith("$")) {
@@ -689,11 +634,7 @@ private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
689634
Map<String, String> mappings = new TreeMap<>();
690635

691636
for (int i = 0; i < num; i++) {
692-
String line = reader.readLine();
693-
String[] parts = line.split("\\s+");
694-
if (parts.length != 3) {
695-
throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
696-
}
637+
String[] parts = splitBySpace(reader, reader.readLine(), 3);
697638
if (mappings.put(parts[1], parts[2]) != null) {
698639
throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
699640
}
@@ -789,11 +730,11 @@ static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
789730
}
790731
String flagType = parts[1];
791732

792-
if (NUM_FLAG_TYPE.equals(flagType)) {
733+
if ("num".equals(flagType)) {
793734
return new NumFlagParsingStrategy();
794-
} else if (UTF8_FLAG_TYPE.equals(flagType)) {
735+
} else if ("UTF-8".equals(flagType)) {
795736
return new SimpleFlagParsingStrategy();
796-
} else if (LONG_FLAG_TYPE.equals(flagType)) {
737+
} else if ("long".equals(flagType)) {
797738
return new DoubleASCIIFlagParsingStrategy();
798739
}
799740

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@ ICONV B b
66
ICONV C c
77
ICONV I i
88

9-
OCONV 4
10-
OCONV a A
11-
OCONV b B
9+
# Testing also whitespace and comments.
10+
OCONV 4 # space, space
11+
OCONV a A # tab, space, space
12+
OCONV b B # tab, tab, space
1213
OCONV c C
1314
OCONV i I
1415

0 commit comments

Comments
 (0)