Skip to content

Commit a82634d

Browse files
authored
LUCENE-9693: Hunspell: check that all flags are > 0 and fit char range (#2238)
1 parent 0d88c14 commit a82634d

File tree

2 files changed

+33
-39
lines changed

2 files changed

+33
-39
lines changed

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Lines changed: 18 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,8 @@ public class Dictionary {
7676

7777
static final char[] NOFLAGS = new char[0];
7878

79+
static final int FLAG_UNSET = 0;
80+
private static final int DEFAULT_FLAGS = 65510;
7981
private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
8082

8183
// TODO: really for suffixes we should reverse the automaton and run them backwards
@@ -135,11 +137,11 @@ public class Dictionary {
135137
// if no affixes have continuation classes, no need to do 2-level affix stripping
136138
boolean twoStageAffix;
137139

138-
int circumfix = -1; // circumfix flag, or -1 if one is not defined
139-
int keepcase = -1; // keepcase flag, or -1 if one is not defined
140-
int needaffix = -1; // needaffix flag, or -1 if one is not defined
141-
int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
142-
int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
140+
char circumfix;
141+
char keepcase;
142+
char needaffix;
143+
char forbiddenword;
144+
char onlyincompound;
143145
int compoundMin = 3;
144146
List<CompoundRule> compoundRules; // nullable
145147

@@ -1161,9 +1163,9 @@ private String parseStemException(String morphData) {
11611163
}
11621164

11631165
boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
1164-
if (forbiddenword != -1) {
1166+
if (forbiddenword != FLAG_UNSET) {
11651167
IntsRef forms = lookupWord(word, 0, length);
1166-
return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
1168+
return forms != null && hasFlag(forms, forbiddenword, scratch);
11671169
}
11681170
return false;
11691171
}
@@ -1240,7 +1242,12 @@ public char[] parseFlags(String rawFlags) {
12401242
if (replacement.isEmpty()) {
12411243
continue;
12421244
}
1243-
flags[upto++] = (char) Integer.parseInt(replacement);
1245+
int flag = Integer.parseInt(replacement);
1246+
if (flag == FLAG_UNSET || flag >= Character.MAX_VALUE) { // read default flags as well
1247+
throw new IllegalArgumentException(
1248+
"Num flags should be between 0 and " + DEFAULT_FLAGS + ", found " + flag);
1249+
}
1250+
flags[upto++] = (char) flag;
12441251
}
12451252

12461253
if (upto < flags.length) {
@@ -1251,10 +1258,8 @@ public char[] parseFlags(String rawFlags) {
12511258

12521259
@Override
12531260
void appendFlag(char flag, StringBuilder to) {
1254-
if (to.length() > 0) {
1255-
to.append(",");
1256-
}
12571261
to.append((int) flag);
1262+
to.append(",");
12581263
}
12591264
}
12601265

@@ -1303,11 +1308,11 @@ boolean hasCompounding() {
13031308
}
13041309

13051310
boolean hasFlag(int entryId, char flag, BytesRef scratch) {
1306-
return hasFlag(decodeFlags(entryId, scratch), flag);
1311+
return flag != FLAG_UNSET && hasFlag(decodeFlags(entryId, scratch), flag);
13071312
}
13081313

13091314
static boolean hasFlag(char[] flags, char flag) {
1310-
return Arrays.binarySearch(flags, flag) >= 0;
1315+
return flag != FLAG_UNSET && Arrays.binarySearch(flags, flag) >= 0;
13111316
}
13121317

13131318
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -168,20 +168,18 @@ List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
168168
continue;
169169
}
170170
// we can't add this form, it's a pseudostem requiring an affix
171-
if (dictionary.needaffix != -1
172-
&& Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
171+
if (Dictionary.hasFlag(wordFlags, dictionary.needaffix)) {
173172
continue;
174173
}
175174
// we can't add this form, it only belongs inside a compound word
176-
if (dictionary.onlyincompound != -1
177-
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
175+
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
178176
continue;
179177
}
180178
stems.add(newStem(word, length, forms, i));
181179
}
182180
}
183181
try {
184-
stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
182+
stems.addAll(stem(word, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant));
185183
} catch (IOException bogus) {
186184
throw new RuntimeException(bogus);
187185
}
@@ -190,7 +188,7 @@ List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
190188

191189
private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
192190
return caseVariant
193-
? dictionary.keepcase == -1 || !Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)
191+
? !Dictionary.hasFlag(wordFlags, dictionary.keepcase)
194192
: !Dictionary.hasHiddenFlag(wordFlags);
195193
}
196194

@@ -289,7 +287,7 @@ private List<CharsRef> stem(
289287
char[] word,
290288
int length,
291289
int previous,
292-
int prevFlag,
290+
char prevFlag,
293291
int prefixId,
294292
int recursionDepth,
295293
boolean doPrefix,
@@ -428,27 +426,19 @@ private char[] stripAffix(char[] word, int length, int affixLen, int affix, bool
428426
}
429427

430428
private boolean isAffixCompatible(
431-
int affix, int prevFlag, int recursionDepth, boolean previousWasPrefix) {
429+
int affix, char prevFlag, int recursionDepth, boolean previousWasPrefix) {
432430
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
433431

434432
if (recursionDepth == 0) {
435-
if (dictionary.onlyincompound == -1) {
436-
return true;
437-
}
438-
439433
// check if affix is allowed in a non-compound word
440-
return !dictionary.hasFlag(append, (char) dictionary.onlyincompound, scratch);
434+
return !dictionary.hasFlag(append, dictionary.onlyincompound, scratch);
441435
}
442436

443437
if (isCrossProduct(affix)) {
444438
// cross check incoming continuation class (flag of previous affix) against list.
445439
char[] appendFlags = dictionary.decodeFlags(append, scratch);
446-
assert prevFlag >= 0;
447-
boolean allowed =
448-
dictionary.onlyincompound == -1
449-
|| !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
450-
if (allowed) {
451-
return previousWasPrefix || Dictionary.hasFlag(appendFlags, (char) prevFlag);
440+
if (!Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) {
441+
return previousWasPrefix || Dictionary.hasFlag(appendFlags, prevFlag);
452442
}
453443
}
454444

@@ -528,8 +518,8 @@ private List<CharsRef> applyAffix(
528518

529519
// if circumfix was previously set by a prefix, we must check this suffix,
530520
// to ensure it has it, and vice versa
531-
if (dictionary.circumfix != -1) {
532-
boolean suffixCircumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
521+
if (dictionary.circumfix != Dictionary.FLAG_UNSET) {
522+
boolean suffixCircumfix = isFlagAppendedByAffix(affix, dictionary.circumfix);
533523
if (circumfix != suffixCircumfix) {
534524
continue;
535525
}
@@ -540,8 +530,7 @@ private List<CharsRef> applyAffix(
540530
continue;
541531
}
542532
// we aren't decompounding (yet)
543-
if (dictionary.onlyincompound != -1
544-
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
533+
if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
545534
continue;
546535
}
547536
stems.add(newStem(strippedWord, length, forms, i));
@@ -551,8 +540,8 @@ private List<CharsRef> applyAffix(
551540

552541
// if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we
553542
// have that flag
554-
if (dictionary.circumfix != -1 && !circumfix && prefix) {
555-
circumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
543+
if (dictionary.circumfix != Dictionary.FLAG_UNSET && !circumfix && prefix) {
544+
circumfix = isFlagAppendedByAffix(affix, dictionary.circumfix);
556545
}
557546

558547
if (isCrossProduct(affix) && recursionDepth <= 1) {
@@ -602,7 +591,7 @@ private List<CharsRef> applyAffix(
602591
}
603592

604593
private boolean isFlagAppendedByAffix(int affixId, char flag) {
605-
if (affixId < 0) return false;
594+
if (affixId < 0 || flag == Dictionary.FLAG_UNSET) return false;
606595
int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
607596
return dictionary.hasFlag(appendId, flag, scratch);
608597
}

0 commit comments

Comments
 (0)