LUCENE-9716: Hunspell: support flag usage before its format is even specified (#2277)

donnerpeter · web-flow · commit 8f75933f3dae · 2021-02-02T21:25:56.000+01:00
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -17,7 +17,6 @@
 package org.apache.lucene.analysis.hunspell;
 
 import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
@@ -45,8 +44,6 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -84,6 +81,7 @@ public class Dictionary {
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
   static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
+  CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
 
   FST<IntsRef> prefixes;
   FST<IntsRef> suffixes;
@@ -212,25 +210,21 @@ public Dictionary(
 
     Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
     Path aff = Files.createTempFile(tempPath, "affix", "aff");
-    OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
-    InputStream aff1 = null;
+
+    BufferedInputStream aff1 = null;
     InputStream aff2 = null;
     boolean success = false;
     try {
-      // copy contents of affix stream to temp file
-      final byte[] buffer = new byte[1024 * 8];
-      int len;
-      while ((len = affix.read(buffer)) > 0) {
-        out.write(buffer, 0, len);
+      // Copy contents of the affix stream to a temp file.
+      try (OutputStream os = Files.newOutputStream(aff)) {
+        affix.transferTo(os);
       }
-      out.close();
 
-      // pass 1: get encoding
+      // pass 1: get encoding & flag
       aff1 = new BufferedInputStream(Files.newInputStream(aff));
-      String encoding = getDictionaryEncoding(aff1);
+      readConfig(aff1);
 
       // pass 2: parse affixes
-      CharsetDecoder decoder = getJavaEncoding(encoding);
       aff2 = new BufferedInputStream(Files.newInputStream(aff));
       readAffixFile(aff2, decoder);
 
@@ -242,7 +236,7 @@ public Dictionary(
       morphAliases = null; // no longer needed
       success = true;
     } finally {
-      IOUtils.closeWhileHandlingException(out, aff1, aff2);
+      IOUtils.closeWhileHandlingException(aff1, aff2);
       if (success) {
         Files.delete(aff);
       } else {
@@ -344,10 +338,6 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
       } else if ("SFX".equals(firstWord)) {
         parseAffix(
             suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
-      } else if ("FLAG".equals(firstWord)) {
-        // Assume that the FLAG line comes before any prefix or suffixes
-        // Store the strategy so it can be used when parsing the dic file
-        flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
       } else if (line.equals("COMPLEXPREFIXES")) {
         complexPrefixes =
             true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@@ -696,46 +686,51 @@ private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
     return fstCompiler.compile();
   }
 
-  /** pattern accepts optional BOM + SET + any whitespace */
-  static final Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+");
+  private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
+
+  /** Scans the whole affix stream for SET (encoding) and FLAG (flag format) directives. */
+  private void readConfig(BufferedInputStream stream) throws IOException, ParseException {
+    // Only the UTF-8 BOM is recognized here. Other BOMs (UTF-16, etc.) could be supported
+    // by further maybeConsume() calls, but the hunspell repository does not seem to contain
+    // any such exotic examples.
+    Charset streamCharset;
+    if (maybeConsume(stream, BOM_UTF8)) {
+      streamCharset = StandardCharsets.UTF_8;
+    } else {
+      streamCharset = DEFAULT_CHARSET;
+    }
+
+    // TODO: can these flags change throughout the file? If not then we can abort sooner. And
+    // then we wouldn't even need to create a temp file for the affix stream - a large enough
+    // leading buffer (BufferedInputStream) would be sufficient?
+    LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset));
+    String line;
+    while ((line = reader.readLine()) != null) {
+      String firstWord = line.split("\\s", 2)[0]; // limit keeps [0] valid on blank-only lines
+      if ("SET".equals(firstWord)) {
+        decoder = getDecoder(singleArgument(reader, line));
+      } else if ("FLAG".equals(firstWord)) {
+        flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
+      }
+    }
+  }
 
   /**
-   * Parses the encoding specified in the affix file readable through the provided InputStream
+   * Consumes the given byte sequence if the stream's next bytes match it in full. Otherwise
+   * rewinds the stream via mark/reset so that the input is left intact.
    *
-   * @param affix InputStream for reading the affix file
-   * @return Encoding specified in the affix file
-   * @throws IOException Can be thrown while reading from the InputStream
+   * @return {@code true} if the sequence matched and has been consumed.
    */
-  static String getDictionaryEncoding(InputStream affix) throws IOException {
-    final StringBuilder encoding = new StringBuilder();
-    for (; ; ) {
-      encoding.setLength(0);
-      int ch;
-      while ((ch = affix.read()) >= 0) {
-        if (ch == '\n') {
-          break;
-        }
-        if (ch != '\r') {
-          encoding.append((char) ch);
-        }
+  private static boolean maybeConsume(BufferedInputStream stream, byte[] bytes) throws IOException {
+    stream.mark(bytes.length);
+    for (int i = 0; i < bytes.length; i++) {
+      int nextByte = stream.read();
+      if (nextByte != (bytes[i] & 0xff)) { // covers EOF (-1) as well.
+        stream.reset();
+        return false;
       }
-      if (encoding.length() == 0
-          || encoding.charAt(0) == '#'
-          ||
-          // this test only at the end as ineffective but would allow lines only containing spaces:
-          encoding.toString().trim().length() == 0) {
-        if (ch < 0) {
-          return DEFAULT_CHARSET.name();
-        }
-        continue;
-      }
-      Matcher matcher = ENCODING_PATTERN.matcher(encoding);
-      if (matcher.find()) {
-        int last = matcher.end();
-        return encoding.substring(last).trim();
-      }
-      return DEFAULT_CHARSET.name();
     }
+    return true;
   }
 
   static final Map<String, String> CHARSET_ALIASES =
@@ -748,15 +743,18 @@ static String getDictionaryEncoding(InputStream affix) throws IOException {
    * @param encoding Encoding to retrieve the CharsetDecoder for
    * @return CharSetDecoder for the given encoding
    */
-  private CharsetDecoder getJavaEncoding(String encoding) {
+  private CharsetDecoder getDecoder(String encoding) {
     if ("ISO8859-14".equals(encoding)) {
       return new ISO8859_14Decoder();
     }
     String canon = CHARSET_ALIASES.get(encoding);
     if (canon != null) {
       encoding = canon;
     }
-    Charset charset = Charset.forName(encoding);
+    return replacingDecoder(Charset.forName(encoding));
+  }
+
+  private static CharsetDecoder replacingDecoder(Charset charset) {
     return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
   }
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -24,6 +24,7 @@
 import java.nio.charset.StandardCharsets;
 import java.text.ParseException;
 import java.util.Random;
+import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
@@ -142,6 +143,20 @@ public void testInvalidData() throws Exception {
     tempDir.close();
   }
 
+  // the KEEPCASE flag is used before the FLAG directive declares the numeric flag format
+  public void testUsingFlagsBeforeFlagDirective() throws IOException, ParseException {
+    String affixText = "KEEPCASE 42\nFLAG num";
+    String wordsText = "1\nfoo/42";
+
+    InputStream affix = new ByteArrayInputStream(affixText.getBytes(StandardCharsets.UTF_8));
+    InputStream words = new ByteArrayInputStream(wordsText.getBytes(StandardCharsets.UTF_8));
+    Directory tempDir = new ByteBuffersDirectory();
+
+    Dictionary parsed = new Dictionary(tempDir, "", affix, words);
+
+    assertEquals(42, parsed.keepcase);
+  }
+
   // malformed flags causes ParseException
   public void testInvalidFlags() throws Exception {
     InputStream affixStream = getClass().getResourceAsStream("broken-flags.aff");
@@ -245,25 +260,21 @@ public void testReplacements() throws Exception {
   }
 
   public void testSetWithCrazyWhitespaceAndBOMs() throws Exception {
-    assertEquals(
-        "UTF-8",
-        Dictionary.getDictionaryEncoding(
-            new ByteArrayInputStream("SET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
-    assertEquals(
-        "UTF-8",
-        Dictionary.getDictionaryEncoding(
-            new ByteArrayInputStream("SET\t UTF-8\n".getBytes(StandardCharsets.UTF_8))));
-    assertEquals(
-        "UTF-8",
-        Dictionary.getDictionaryEncoding(
-            new ByteArrayInputStream("\uFEFFSET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
-    assertEquals(
-        "UTF-8",
-        Dictionary.getDictionaryEncoding(
-            new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
-    assertEquals(
-        Dictionary.DEFAULT_CHARSET.name(),
-        Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
+    assertEquals("UTF-8", getDictionaryEncoding("SET\tUTF-8\n"));
+    assertEquals("UTF-8", getDictionaryEncoding("SET\t UTF-8\n"));
+    assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\n"));
+    assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\r\n"));
+    assertEquals(Dictionary.DEFAULT_CHARSET.name(), getDictionaryEncoding(""));
+  }
+
+  // parses the given affix text and returns the name of the charset the Dictionary settled on
+  private static String getDictionaryEncoding(String affFile) throws IOException, ParseException {
+    InputStream affix = new ByteArrayInputStream(affFile.getBytes(StandardCharsets.UTF_8));
+    InputStream words = new ByteArrayInputStream("1\nmock".getBytes(StandardCharsets.UTF_8));
+    Directory tempDir = new ByteBuffersDirectory();
+
+    Dictionary parsed = new Dictionary(tempDir, "", affix, words);
+    return parsed.decoder.charset().name();
+  }
 
   public void testFlagWithCrazyWhitespace() {