Skip to content

Commit 8f75933

Browse files
authored
LUCENE-9716: Hunspell: support flag usage before its format is even specified (#2277)
1 parent 47e3d06 commit 8f75933

File tree

2 files changed

+83
-74
lines changed

2 files changed

+83
-74
lines changed

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Lines changed: 53 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
package org.apache.lucene.analysis.hunspell;
1818

1919
import java.io.BufferedInputStream;
20-
import java.io.BufferedOutputStream;
2120
import java.io.BufferedReader;
2221
import java.io.IOException;
2322
import java.io.InputStream;
@@ -45,8 +44,6 @@
4544
import java.util.Map;
4645
import java.util.Set;
4746
import java.util.TreeMap;
48-
import java.util.regex.Matcher;
49-
import java.util.regex.Pattern;
5047
import org.apache.lucene.codecs.CodecUtil;
5148
import org.apache.lucene.store.Directory;
5249
import org.apache.lucene.store.IOContext;
@@ -84,6 +81,7 @@ public class Dictionary {
8481
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
8582
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
8683
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
84+
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
8785

8886
FST<IntsRef> prefixes;
8987
FST<IntsRef> suffixes;
@@ -212,25 +210,21 @@ public Dictionary(
212210

213211
Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
214212
Path aff = Files.createTempFile(tempPath, "affix", "aff");
215-
OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
216-
InputStream aff1 = null;
213+
214+
BufferedInputStream aff1 = null;
217215
InputStream aff2 = null;
218216
boolean success = false;
219217
try {
220-
// copy contents of affix stream to temp file
221-
final byte[] buffer = new byte[1024 * 8];
222-
int len;
223-
while ((len = affix.read(buffer)) > 0) {
224-
out.write(buffer, 0, len);
218+
// Copy contents of the affix stream to a temp file.
219+
try (OutputStream os = Files.newOutputStream(aff)) {
220+
affix.transferTo(os);
225221
}
226-
out.close();
227222

228-
// pass 1: get encoding
223+
// pass 1: get encoding & flag
229224
aff1 = new BufferedInputStream(Files.newInputStream(aff));
230-
String encoding = getDictionaryEncoding(aff1);
225+
readConfig(aff1);
231226

232227
// pass 2: parse affixes
233-
CharsetDecoder decoder = getJavaEncoding(encoding);
234228
aff2 = new BufferedInputStream(Files.newInputStream(aff));
235229
readAffixFile(aff2, decoder);
236230

@@ -242,7 +236,7 @@ public Dictionary(
242236
morphAliases = null; // no longer needed
243237
success = true;
244238
} finally {
245-
IOUtils.closeWhileHandlingException(out, aff1, aff2);
239+
IOUtils.closeWhileHandlingException(aff1, aff2);
246240
if (success) {
247241
Files.delete(aff);
248242
} else {
@@ -344,10 +338,6 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
344338
} else if ("SFX".equals(firstWord)) {
345339
parseAffix(
346340
suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
347-
} else if ("FLAG".equals(firstWord)) {
348-
// Assume that the FLAG line comes before any prefix or suffixes
349-
// Store the strategy so it can be used when parsing the dic file
350-
flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
351341
} else if (line.equals("COMPLEXPREFIXES")) {
352342
complexPrefixes =
353343
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@@ -696,46 +686,51 @@ private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
696686
return fstCompiler.compile();
697687
}
698688

699-
/** pattern accepts optional BOM + SET + any whitespace */
700-
static final Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+");
689+
private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
690+
691+
/** Parses the encoding and flag format specified in the provided InputStream */
692+
private void readConfig(BufferedInputStream stream) throws IOException, ParseException {
693+
// I assume we don't support other BOMs (utf16, etc.)? We trivially could,
694+
// by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
695+
// any such exotic examples.
696+
Charset streamCharset;
697+
if (maybeConsume(stream, BOM_UTF8)) {
698+
streamCharset = StandardCharsets.UTF_8;
699+
} else {
700+
streamCharset = DEFAULT_CHARSET;
701+
}
702+
703+
// TODO: can these flags change throughout the file? If not then we can abort sooner. And
704+
// then we wouldn't even need to create a temp file for the affix stream - a large enough
705+
// leading buffer (BufferedInputStream) would be sufficient?
706+
LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset));
707+
String line;
708+
while ((line = reader.readLine()) != null) {
709+
String firstWord = line.split("\\s")[0];
710+
if ("SET".equals(firstWord)) {
711+
decoder = getDecoder(singleArgument(reader, line));
712+
} else if ("FLAG".equals(firstWord)) {
713+
flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
714+
}
715+
}
716+
}
701717

702718
/**
703-
* Parses the encoding specified in the affix file readable through the provided InputStream
719+
* Consume the provided byte sequence in full, if present. Otherwise leave the input stream
720+
* intact.
704721
*
705-
* @param affix InputStream for reading the affix file
706-
* @return Encoding specified in the affix file
707-
* @throws IOException Can be thrown while reading from the InputStream
722+
* @return {@code true} if the sequence matched and has been consumed.
708723
*/
709-
static String getDictionaryEncoding(InputStream affix) throws IOException {
710-
final StringBuilder encoding = new StringBuilder();
711-
for (; ; ) {
712-
encoding.setLength(0);
713-
int ch;
714-
while ((ch = affix.read()) >= 0) {
715-
if (ch == '\n') {
716-
break;
717-
}
718-
if (ch != '\r') {
719-
encoding.append((char) ch);
720-
}
724+
private static boolean maybeConsume(BufferedInputStream stream, byte[] bytes) throws IOException {
725+
stream.mark(bytes.length);
726+
for (int i = 0; i < bytes.length; i++) {
727+
int nextByte = stream.read();
728+
if (nextByte != (bytes[i] & 0xff)) { // covers EOF (-1) as well.
729+
stream.reset();
730+
return false;
721731
}
722-
if (encoding.length() == 0
723-
|| encoding.charAt(0) == '#'
724-
||
725-
// this test only at the end as ineffective but would allow lines only containing spaces:
726-
encoding.toString().trim().length() == 0) {
727-
if (ch < 0) {
728-
return DEFAULT_CHARSET.name();
729-
}
730-
continue;
731-
}
732-
Matcher matcher = ENCODING_PATTERN.matcher(encoding);
733-
if (matcher.find()) {
734-
int last = matcher.end();
735-
return encoding.substring(last).trim();
736-
}
737-
return DEFAULT_CHARSET.name();
738732
}
733+
return true;
739734
}
740735

741736
static final Map<String, String> CHARSET_ALIASES =
@@ -748,15 +743,18 @@ static String getDictionaryEncoding(InputStream affix) throws IOException {
748743
* @param encoding Encoding to retrieve the CharsetDecoder for
749744
* @return CharSetDecoder for the given encoding
750745
*/
751-
private CharsetDecoder getJavaEncoding(String encoding) {
746+
private CharsetDecoder getDecoder(String encoding) {
752747
if ("ISO8859-14".equals(encoding)) {
753748
return new ISO8859_14Decoder();
754749
}
755750
String canon = CHARSET_ALIASES.get(encoding);
756751
if (canon != null) {
757752
encoding = canon;
758753
}
759-
Charset charset = Charset.forName(encoding);
754+
return replacingDecoder(Charset.forName(encoding));
755+
}
756+
757+
private static CharsetDecoder replacingDecoder(Charset charset) {
760758
return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
761759
}
762760

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import java.nio.charset.StandardCharsets;
2525
import java.text.ParseException;
2626
import java.util.Random;
27+
import org.apache.lucene.store.ByteBuffersDirectory;
2728
import org.apache.lucene.store.Directory;
2829
import org.apache.lucene.util.BytesRef;
2930
import org.apache.lucene.util.CharsRef;
@@ -142,6 +143,20 @@ public void testInvalidData() throws Exception {
142143
tempDir.close();
143144
}
144145

146+
public void testUsingFlagsBeforeFlagDirective() throws IOException, ParseException {
147+
byte[] aff = "KEEPCASE 42\nFLAG num".getBytes(StandardCharsets.UTF_8);
148+
byte[] dic = "1\nfoo/42".getBytes(StandardCharsets.UTF_8);
149+
150+
Dictionary dictionary =
151+
new Dictionary(
152+
new ByteBuffersDirectory(),
153+
"",
154+
new ByteArrayInputStream(aff),
155+
new ByteArrayInputStream(dic));
156+
157+
assertEquals(42, dictionary.keepcase);
158+
}
159+
145160
// malformed flags causes ParseException
146161
public void testInvalidFlags() throws Exception {
147162
InputStream affixStream = getClass().getResourceAsStream("broken-flags.aff");
@@ -245,25 +260,21 @@ public void testReplacements() throws Exception {
245260
}
246261

247262
public void testSetWithCrazyWhitespaceAndBOMs() throws Exception {
248-
assertEquals(
249-
"UTF-8",
250-
Dictionary.getDictionaryEncoding(
251-
new ByteArrayInputStream("SET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
252-
assertEquals(
253-
"UTF-8",
254-
Dictionary.getDictionaryEncoding(
255-
new ByteArrayInputStream("SET\t UTF-8\n".getBytes(StandardCharsets.UTF_8))));
256-
assertEquals(
257-
"UTF-8",
258-
Dictionary.getDictionaryEncoding(
259-
new ByteArrayInputStream("\uFEFFSET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
260-
assertEquals(
261-
"UTF-8",
262-
Dictionary.getDictionaryEncoding(
263-
new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
264-
assertEquals(
265-
Dictionary.DEFAULT_CHARSET.name(),
266-
Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
263+
assertEquals("UTF-8", getDictionaryEncoding("SET\tUTF-8\n"));
264+
assertEquals("UTF-8", getDictionaryEncoding("SET\t UTF-8\n"));
265+
assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\n"));
266+
assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\r\n"));
267+
assertEquals(Dictionary.DEFAULT_CHARSET.name(), getDictionaryEncoding(""));
268+
}
269+
270+
private static String getDictionaryEncoding(String affFile) throws IOException, ParseException {
271+
Dictionary dictionary =
272+
new Dictionary(
273+
new ByteBuffersDirectory(),
274+
"",
275+
new ByteArrayInputStream(affFile.getBytes(StandardCharsets.UTF_8)),
276+
new ByteArrayInputStream("1\nmock".getBytes(StandardCharsets.UTF_8)));
277+
return dictionary.decoder.charset().name();
267278
}
268279

269280
public void testFlagWithCrazyWhitespace() {

0 commit comments

Comments
 (0)