19
19
20
20
/*
21
21
* Copyright (c) 2007, 2021, Oracle and/or its affiliates. All rights reserved.
22
- * Portions Copyright (c) 2017, 2018 , Chris Fraire <[email protected] >.
22
+ * Portions Copyright (c) 2017, 2021 , Chris Fraire <[email protected] >.
23
23
*/
24
24
package org .opengrok .indexer .analysis .plain ;
25
25
26
26
import java .io .IOException ;
27
27
import java .io .InputStream ;
28
+ import java .nio .ByteBuffer ;
29
+ import java .nio .CharBuffer ;
30
+ import java .nio .charset .CharsetDecoder ;
31
+ import java .nio .charset .CoderResult ;
32
+ import java .nio .charset .CodingErrorAction ;
28
33
import java .nio .charset .StandardCharsets ;
34
+ import java .util .Arrays ;
29
35
30
36
import org .opengrok .indexer .analysis .AbstractAnalyzer ;
31
37
import org .opengrok .indexer .analysis .AnalyzerFactory ;
32
38
import org .opengrok .indexer .analysis .FileAnalyzerFactory ;
33
39
import org .opengrok .indexer .util .IOUtils ;
34
40
41
+ /**
42
+ * Represents a subclass of {@link FileAnalyzerFactory} for plain-text
43
+ * files in ASCII, UTF-8, or UTF-16.
44
+ */
35
45
public final class PlainAnalyzerFactory extends FileAnalyzerFactory {
36
46
37
47
private static final String NAME = "Plain Text" ;
38
48
39
- private static final Matcher MATCHER = new Matcher () {
49
+ private static final int MIN_CHARS_WHILE_REMAINING = 20 ;
50
+
51
+ // Up to 4 octets per UTF-8 character
52
+ private static final int TRY_UTF8_BYTES = MIN_CHARS_WHILE_REMAINING * 4 ;
53
+
54
+ /**
55
+ * The reentrant {@link Matcher} implementation for plain-text files.
56
+ */
57
+ public static final Matcher MATCHER = new Matcher () {
40
58
@ Override
41
59
public String description () {
42
- return "UTF-8, UTF-16BE, or UTF-16LE Byte Order Mark is" +
43
- " present; or first eight bytes are all ASCII graphic" +
44
- " characters or ASCII whitespace" ;
60
+ return "UTF-8, UTF-16BE, or UTF-16LE Byte Order Mark is present; or initial " +
61
+ "bytes are all UTF-8-encoded graphic characters or whitespace" ;
45
62
}
46
63
47
64
@ Override
48
- public AnalyzerFactory isMagic (byte [] content , InputStream in )
49
- throws IOException {
50
- if (isPlainText (content )) {
65
+ public AnalyzerFactory isMagic (byte [] content , InputStream in ) throws IOException {
66
+ int lengthBOM = IOUtils .skipForBOM (content );
67
+ if (lengthBOM > 0 ) {
68
+ return DEFAULT_INSTANCE ;
69
+ }
70
+ if (readSomePlainCharactersUTF8noBOMwithoutError (in )) {
51
71
return DEFAULT_INSTANCE ;
52
- } else {
53
- return null ;
54
72
}
73
+ return null ;
55
74
}
56
75
57
76
@ Override
58
77
public AnalyzerFactory forFactory () {
59
78
return DEFAULT_INSTANCE ;
60
79
}
80
+ };
61
81
62
- /**
63
- * Check whether the byte array contains plain text. First, look
64
- * for a UTF BOM; otherwise, inspect as if US-ASCII.
65
- */
66
- private boolean isPlainText (byte [] content ) throws IOException {
67
- int lengthBOM = IOUtils .skipForBOM (content );
68
- if (lengthBOM > 0 ) {
69
- return true ;
70
- }
71
- String ascii = new String (content , StandardCharsets .US_ASCII );
72
- return isPlainText (ascii );
73
- }
74
-
75
- /**
76
- * Check whether the string only contains plain ASCII characters.
77
- */
78
- private boolean isPlainText (String str ) {
79
- for (int i = 0 ; i < str .length (); i ++) {
80
- char b = str .charAt (i );
81
- if ((b >= 32 && b < 127 ) || // ASCII printable characters
82
- (b == 9 ) || // horizontal tab
83
- (b == 10 ) || // line feed
84
- (b == 12 ) || // form feed
85
- (b == 13 )) { // carriage return
86
- // is plain text so far, go to next byte
87
- continue ;
88
- } else {
89
- // 8-bit values or unprintable control characters,
90
- // probably not plain text
91
- return false ;
92
- }
93
- }
94
- return true ;
95
- }
96
- };
97
-
98
- public static final PlainAnalyzerFactory DEFAULT_INSTANCE =
99
- new PlainAnalyzerFactory ();
82
+ /**
83
+ * The singleton factory instance that associates
84
+ * {@link PlainAnalyzer} with files whose initial bytes are the UTF-8,
85
+ * UTF-16BE, or UTF-16LE Byte Order Mark; or whose initial bytes are all
86
+ * UTF-8-encoded graphic characters or whitespace.
87
+ */
88
+ public static final PlainAnalyzerFactory DEFAULT_INSTANCE = new PlainAnalyzerFactory ();
100
89
101
90
private PlainAnalyzerFactory () {
102
91
super (null , null , null , null , MATCHER , "text/plain" , AbstractAnalyzer .Genre .PLAIN , NAME );
@@ -106,4 +95,63 @@ private PlainAnalyzerFactory() {
106
95
protected AbstractAnalyzer newAnalyzer () {
107
96
return new PlainAnalyzer (this );
108
97
}
98
+
99
+ private static boolean readSomePlainCharactersUTF8noBOMwithoutError (InputStream in )
100
+ throws IOException {
101
+
102
+ boolean isEOF = false ;
103
+ byte [] bytes = new byte [TRY_UTF8_BYTES ];
104
+ in .mark (TRY_UTF8_BYTES );
105
+ int len = in .read (bytes );
106
+ in .reset ();
107
+ if (len < 1 ) {
108
+ return false ;
109
+ }
110
+ if (len != TRY_UTF8_BYTES ) {
111
+ bytes = Arrays .copyOf (bytes , len );
112
+ isEOF = true ;
113
+ }
114
+
115
+ /*
116
+ * Decode one character at a time until either a decoding error occurs
117
+ * (failure) or the minimum number of required, valid characters is
118
+ * reached (success).
119
+ *
120
+ * "Decode bytes to chars one at a time"
121
+ * answered by https://stackoverflow.com/users/1831293/evgeniy-dorofeev
122
+ * https://stackoverflow.com/questions/17227331/decode-bytes-to-chars-one-at-a-time
123
+ * asked by https://stackoverflow.com/users/244360/kong
124
+ *
125
+ * Used under CC 4 with modifications noted as follows as required by
126
+ * license:
127
+ * * 2021-08-15 -- [email protected] , revised to check for errors.
128
+ */
129
+ CharsetDecoder cd = StandardCharsets .UTF_8 .newDecoder ().
130
+ onMalformedInput (CodingErrorAction .REPORT ).
131
+ onUnmappableCharacter (CodingErrorAction .REPORT );
132
+ ByteBuffer bin = ByteBuffer .wrap (bytes );
133
+ CharBuffer out = CharBuffer .allocate (MIN_CHARS_WHILE_REMAINING );
134
+ int numCharacters = 0 ;
135
+ CoderResult decodeResult = cd .decode (bin , out , isEOF );
136
+ if (decodeResult .isError ()) {
137
+ return false ;
138
+ }
139
+
140
+ int numChars = out .position ();
141
+ out .position (0 );
142
+ for (int i = 0 ; i < numChars ; ++i ) {
143
+ char c = out .charAt (i );
144
+ if (Character .isISOControl (c ) && !Character .isWhitespace (c )) {
145
+ return false ;
146
+ }
147
+ if (++numCharacters >= MIN_CHARS_WHILE_REMAINING ) {
148
+ return true ;
149
+ }
150
+ }
151
+ /*
152
+ * At this point, as no error has occurred, then if any character was
153
+ * read, consider the input as plain text.
154
+ */
155
+ return (numCharacters > 0 );
156
+ }
109
157
}
0 commit comments