17
17
package org .apache .lucene .analysis .hunspell ;
18
18
19
19
import java .io .BufferedInputStream ;
20
- import java .io .BufferedOutputStream ;
21
20
import java .io .BufferedReader ;
22
21
import java .io .IOException ;
23
22
import java .io .InputStream ;
45
44
import java .util .Map ;
46
45
import java .util .Set ;
47
46
import java .util .TreeMap ;
48
- import java .util .regex .Matcher ;
49
- import java .util .regex .Pattern ;
50
47
import org .apache .lucene .codecs .CodecUtil ;
51
48
import org .apache .lucene .store .Directory ;
52
49
import org .apache .lucene .store .IOContext ;
@@ -84,6 +81,7 @@ public class Dictionary {
84
81
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*" ;
85
82
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s" ;
86
83
static final Charset DEFAULT_CHARSET = StandardCharsets .ISO_8859_1 ;
84
+ CharsetDecoder decoder = replacingDecoder (DEFAULT_CHARSET );
87
85
88
86
FST <IntsRef > prefixes ;
89
87
FST <IntsRef > suffixes ;
@@ -212,25 +210,21 @@ public Dictionary(
212
210
213
211
Path tempPath = getDefaultTempDir (); // TODO: make this configurable?
214
212
Path aff = Files .createTempFile (tempPath , "affix" , "aff" );
215
- OutputStream out = new BufferedOutputStream ( Files . newOutputStream ( aff ));
216
- InputStream aff1 = null ;
213
+
214
+ BufferedInputStream aff1 = null ;
217
215
InputStream aff2 = null ;
218
216
boolean success = false ;
219
217
try {
220
- // copy contents of affix stream to temp file
221
- final byte [] buffer = new byte [1024 * 8 ];
222
- int len ;
223
- while ((len = affix .read (buffer )) > 0 ) {
224
- out .write (buffer , 0 , len );
218
+ // Copy contents of the affix stream to a temp file.
219
+ try (OutputStream os = Files .newOutputStream (aff )) {
220
+ affix .transferTo (os );
225
221
}
226
- out .close ();
227
222
228
- // pass 1: get encoding
223
+ // pass 1: get encoding & flag
229
224
aff1 = new BufferedInputStream (Files .newInputStream (aff ));
230
- String encoding = getDictionaryEncoding (aff1 );
225
+ readConfig (aff1 );
231
226
232
227
// pass 2: parse affixes
233
- CharsetDecoder decoder = getJavaEncoding (encoding );
234
228
aff2 = new BufferedInputStream (Files .newInputStream (aff ));
235
229
readAffixFile (aff2 , decoder );
236
230
@@ -242,7 +236,7 @@ public Dictionary(
242
236
morphAliases = null ; // no longer needed
243
237
success = true ;
244
238
} finally {
245
- IOUtils .closeWhileHandlingException (out , aff1 , aff2 );
239
+ IOUtils .closeWhileHandlingException (aff1 , aff2 );
246
240
if (success ) {
247
241
Files .delete (aff );
248
242
} else {
@@ -344,10 +338,6 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder)
344
338
} else if ("SFX" .equals (firstWord )) {
345
339
parseAffix (
346
340
suffixes , line , reader , SUFFIX_CONDITION_REGEX_PATTERN , seenPatterns , seenStrips );
347
- } else if ("FLAG" .equals (firstWord )) {
348
- // Assume that the FLAG line comes before any prefix or suffixes
349
- // Store the strategy so it can be used when parsing the dic file
350
- flagParsingStrategy = getFlagParsingStrategy (line , decoder .charset ());
351
341
} else if (line .equals ("COMPLEXPREFIXES" )) {
352
342
complexPrefixes =
353
343
true ; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@@ -696,46 +686,51 @@ private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
696
686
return fstCompiler .compile ();
697
687
}
698
688
699
- /** pattern accepts optional BOM + SET + any whitespace */
700
- static final Pattern ENCODING_PATTERN = Pattern .compile ("^(\u00EF \u00BB \u00BF )?SET\\ s+" );
689
+ private static final byte [] BOM_UTF8 = {(byte ) 0xef , (byte ) 0xbb , (byte ) 0xbf };
690
+
691
+ /** Parses the encoding and flag format specified in the provided InputStream */
692
+ private void readConfig (BufferedInputStream stream ) throws IOException , ParseException {
693
+ // I assume we don't support other BOMs (utf16, etc.)? We trivially could,
694
+ // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
695
+ // any such exotic examples.
696
+ Charset streamCharset ;
697
+ if (maybeConsume (stream , BOM_UTF8 )) {
698
+ streamCharset = StandardCharsets .UTF_8 ;
699
+ } else {
700
+ streamCharset = DEFAULT_CHARSET ;
701
+ }
702
+
703
+ // TODO: can these flags change throughout the file? If not then we can abort sooner. And
704
+ // then we wouldn't even need to create a temp file for the affix stream - a large enough
705
+ // leading buffer (BufferedInputStream) would be sufficient?
706
+ LineNumberReader reader = new LineNumberReader (new InputStreamReader (stream , streamCharset ));
707
+ String line ;
708
+ while ((line = reader .readLine ()) != null ) {
709
+ String firstWord = line .split ("\\ s" )[0 ];
710
+ if ("SET" .equals (firstWord )) {
711
+ decoder = getDecoder (singleArgument (reader , line ));
712
+ } else if ("FLAG" .equals (firstWord )) {
713
+ flagParsingStrategy = getFlagParsingStrategy (line , decoder .charset ());
714
+ }
715
+ }
716
+ }
701
717
702
718
/**
703
- * Parses the encoding specified in the affix file readable through the provided InputStream
719
+ * Consume the provided byte sequence in full, if present. Otherwise leave the input stream
720
+ * intact.
704
721
*
705
- * @param affix InputStream for reading the affix file
706
- * @return Encoding specified in the affix file
707
- * @throws IOException Can be thrown while reading from the InputStream
722
+ * @return {@code true} if the sequence matched and has been consumed.
708
723
*/
709
- static String getDictionaryEncoding (InputStream affix ) throws IOException {
710
- final StringBuilder encoding = new StringBuilder ();
711
- for (; ; ) {
712
- encoding .setLength (0 );
713
- int ch ;
714
- while ((ch = affix .read ()) >= 0 ) {
715
- if (ch == '\n' ) {
716
- break ;
717
- }
718
- if (ch != '\r' ) {
719
- encoding .append ((char ) ch );
720
- }
724
+ private static boolean maybeConsume (BufferedInputStream stream , byte [] bytes ) throws IOException {
725
+ stream .mark (bytes .length );
726
+ for (int i = 0 ; i < bytes .length ; i ++) {
727
+ int nextByte = stream .read ();
728
+ if (nextByte != (bytes [i ] & 0xff )) { // covers EOF (-1) as well.
729
+ stream .reset ();
730
+ return false ;
721
731
}
722
- if (encoding .length () == 0
723
- || encoding .charAt (0 ) == '#'
724
- ||
725
- // this test only at the end as ineffective but would allow lines only containing spaces:
726
- encoding .toString ().trim ().length () == 0 ) {
727
- if (ch < 0 ) {
728
- return DEFAULT_CHARSET .name ();
729
- }
730
- continue ;
731
- }
732
- Matcher matcher = ENCODING_PATTERN .matcher (encoding );
733
- if (matcher .find ()) {
734
- int last = matcher .end ();
735
- return encoding .substring (last ).trim ();
736
- }
737
- return DEFAULT_CHARSET .name ();
738
732
}
733
+ return true ;
739
734
}
740
735
741
736
static final Map <String , String > CHARSET_ALIASES =
@@ -748,15 +743,18 @@ static String getDictionaryEncoding(InputStream affix) throws IOException {
748
743
* @param encoding Encoding to retrieve the CharsetDecoder for
749
744
* @return CharSetDecoder for the given encoding
750
745
*/
751
- private CharsetDecoder getJavaEncoding (String encoding ) {
746
+ private CharsetDecoder getDecoder (String encoding ) {
752
747
if ("ISO8859-14" .equals (encoding )) {
753
748
return new ISO8859_14Decoder ();
754
749
}
755
750
String canon = CHARSET_ALIASES .get (encoding );
756
751
if (canon != null ) {
757
752
encoding = canon ;
758
753
}
759
- Charset charset = Charset .forName (encoding );
754
+ return replacingDecoder (Charset .forName (encoding ));
755
+ }
756
+
757
+ private static CharsetDecoder replacingDecoder (Charset charset ) {
760
758
return charset .newDecoder ().onMalformedInput (CodingErrorAction .REPLACE );
761
759
}
762
760
0 commit comments