Skip to content

Commit 750b311

Browse files
idodeclare authored and vladak committed
Plain-text matcher should try UTF-8 decoding
Also, correct the attribution of JavaClassAnalyzerFactory as a precise magic-matcher.
1 parent 3411697 commit 750b311

File tree

12 files changed

+296
-80
lines changed

12 files changed

+296
-80
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020
/*
2121
* Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
22-
* Portions Copyright (c) 2017, 2020, Chris Fraire <[email protected]>.
22+
* Portions Copyright (c) 2017, 2021, Chris Fraire <[email protected]>.
2323
*/
2424
package org.opengrok.indexer.analysis;
2525

@@ -272,7 +272,6 @@ public class AnalyzerGuru {
272272
new ErlangAnalyzerFactory(),
273273
new ShAnalyzerFactory(),
274274
new PowershellAnalyzerFactory(),
275-
PlainAnalyzerFactory.DEFAULT_INSTANCE,
276275
new UuencodeAnalyzerFactory(),
277276
new GZIPAnalyzerFactory(),
278277
new JavaAnalyzerFactory(),
@@ -303,7 +302,9 @@ public class AnalyzerGuru {
303302
new AsmAnalyzerFactory(),
304303
new HCLAnalyzerFactory(),
305304
new TerraformAnalyzerFactory(),
306-
new RAnalyzerFactory()
305+
new RAnalyzerFactory(),
306+
// Keep PlainAnalyzer last, with its lone, quite fuzzy matcher.
307+
PlainAnalyzerFactory.DEFAULT_INSTANCE
307308
};
308309

309310
for (AnalyzerFactory analyzer : analyzers) {
@@ -979,7 +980,7 @@ private static AnalyzerFactory findForStream(InputStream in,
979980

980981
// First, do precise-magic Matcher matching
981982
for (FileAnalyzerFactory.Matcher matcher : matchers) {
982-
if (matcher.getIsPreciseMagic()) {
983+
if (matcher.isPreciseMagic()) {
983984
fac = matcher.isMagic(content, in);
984985
if (fac != null) {
985986
if (LOGGER.isLoggable(Level.FINEST)) {
@@ -1001,7 +1002,7 @@ private static AnalyzerFactory findForStream(InputStream in,
10011002

10021003
// Last, do imprecise-magic Matcher matching
10031004
for (FileAnalyzerFactory.Matcher matcher : matchers) {
1004-
if (!matcher.getIsPreciseMagic()) {
1005+
if (!matcher.isPreciseMagic()) {
10051006
fac = matcher.isMagic(content, in);
10061007
if (fac != null) {
10071008
if (LOGGER.isLoggable(Level.FINEST)) {

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuruHelp.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020
/*
2121
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
22-
* Copyright (c) 2017, 2019, Chris Fraire <[email protected]>.
22+
* Copyright (c) 2017, 2021, Chris Fraire <[email protected]>.
2323
*/
2424
package org.opengrok.indexer.analysis;
2525

@@ -83,12 +83,12 @@ public static String getUsage() {
8383

8484
b.append(System.lineSeparator() + "AnalyzerGuru magic matchers:" + System.lineSeparator());
8585
AnalyzerGuru.getAnalyzerFactoryMatchers().forEach((m) -> {
86-
if (m.getIsPreciseMagic()) {
86+
if (m.isPreciseMagic()) {
8787
b.append(reportable(m));
8888
}
8989
});
9090
AnalyzerGuru.getAnalyzerFactoryMatchers().forEach((m) -> {
91-
if (!m.getIsPreciseMagic()) {
91+
if (!m.isPreciseMagic()) {
9292
b.append(reportable(m));
9393
}
9494
});

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzerFactory.java

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020
/*
2121
* Copyright (c) 2007, 2021, Oracle and/or its affiliates. All rights reserved.
22-
* Portions Copyright (c) 2017, 2018, Chris Fraire <[email protected]>.
22+
* Portions Copyright (c) 2017, 2021, Chris Fraire <[email protected]>.
2323
*/
2424
package org.opengrok.indexer.analysis;
2525

@@ -134,7 +134,7 @@ public interface Matcher {
134134
* Get a value indicating if the magic is byte-precise.
135135
* @return true if precise
136136
*/
137-
default boolean getIsPreciseMagic() {
137+
default boolean isPreciseMagic() {
138138
return false;
139139
}
140140

@@ -147,8 +147,7 @@ default boolean getIsPreciseMagic() {
147147
* @return a defined, reportable String
148148
*/
149149
default String description() {
150-
return getIsPreciseMagic() ? "precise matcher" :
151-
"heuristic matcher";
150+
return isPreciseMagic() ? "precise matcher" : "heuristic matcher";
152151
}
153152

154153
/**

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/ZipMatcherBase.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020
/*
2121
* Copyright (c) 2007, 2018, Oracle and/or its affiliates. All rights reserved.
22-
* Portions Copyright (c) 2017, 2018, Chris Fraire <[email protected]>.
22+
* Portions Copyright (c) 2017, 2021, Chris Fraire <[email protected]>.
2323
*/
2424
package org.opengrok.indexer.analysis.archive;
2525

@@ -42,7 +42,7 @@ public abstract class ZipMatcherBase implements FileAnalyzerFactory.Matcher {
4242
private static final int XFHSIZ = 4;
4343

4444
@Override
45-
public boolean getIsPreciseMagic() {
45+
public boolean isPreciseMagic() {
4646
return true;
4747
}
4848

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/document/DocumentMatcher.java

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,14 +18,15 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2017, 2020, Chris Fraire <[email protected]>.
21+
* Copyright (c) 2017, 2021, Chris Fraire <[email protected]>.
2222
*/
2323
package org.opengrok.indexer.analysis.document;
2424

2525
import java.io.BufferedReader;
2626
import java.io.IOException;
2727
import java.io.InputStream;
2828
import java.io.InputStreamReader;
29+
import java.nio.charset.StandardCharsets;
2930
import java.util.Arrays;
3031
import org.opengrok.indexer.analysis.AnalyzerFactory;
3132
import org.opengrok.indexer.analysis.FileAnalyzerFactory.Matcher;
@@ -108,7 +109,7 @@ public AnalyzerFactory isMagic(byte[] contents, InputStream in)
108109
int bomLength = 0;
109110
String encoding = IOUtils.findBOMEncoding(contents);
110111
if (encoding == null) {
111-
encoding = "UTF-8";
112+
encoding = StandardCharsets.UTF_8.name();
112113
} else {
113114
bomLength = IOUtils.skipForBOM(contents);
114115
if (in.skip(bomLength) != bomLength) {

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/executables/JavaClassAnalyzerFactory.java

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020
/*
2121
* Copyright (c) 2007, 2018, Oracle and/or its affiliates. All rights reserved.
22-
* Portions Copyright (c) 2018, 2020, Chris Fraire <[email protected]>.
22+
* Portions Copyright (c) 2018, 2021, Chris Fraire <[email protected]>.
2323
*/
2424
package org.opengrok.indexer.analysis.executables;
2525

@@ -52,6 +52,11 @@ public class JavaClassAnalyzerFactory extends FileAnalyzerFactory {
5252
private static final int JAVA_SE_9_MAJOR_VER = 0x35;
5353

5454
private static final Matcher MATCHER = new Matcher() {
55+
@Override
56+
public boolean isPreciseMagic() {
57+
return true;
58+
}
59+
5560
@Override
5661
public String description() {
5762
return "0xCAFEBABE magic with major_version from JDK 1.1 to Java" +

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/plain/PlainAnalyzerFactory.java

Lines changed: 96 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -19,84 +19,73 @@
1919

2020
/*
2121
* Copyright (c) 2007, 2021, Oracle and/or its affiliates. All rights reserved.
22-
* Portions Copyright (c) 2017, 2018, Chris Fraire <[email protected]>.
22+
* Portions Copyright (c) 2017, 2021, Chris Fraire <[email protected]>.
2323
*/
2424
package org.opengrok.indexer.analysis.plain;
2525

2626
import java.io.IOException;
2727
import java.io.InputStream;
28+
import java.nio.ByteBuffer;
29+
import java.nio.CharBuffer;
30+
import java.nio.charset.CharsetDecoder;
31+
import java.nio.charset.CoderResult;
32+
import java.nio.charset.CodingErrorAction;
2833
import java.nio.charset.StandardCharsets;
34+
import java.util.Arrays;
2935

3036
import org.opengrok.indexer.analysis.AbstractAnalyzer;
3137
import org.opengrok.indexer.analysis.AnalyzerFactory;
3238
import org.opengrok.indexer.analysis.FileAnalyzerFactory;
3339
import org.opengrok.indexer.util.IOUtils;
3440

41+
/**
42+
* Represents a subclass of {@link FileAnalyzerFactory} for plain-text
43+
* files in ASCII, UTF-8, or UTF-16.
44+
*/
3545
public final class PlainAnalyzerFactory extends FileAnalyzerFactory {
3646

3747
private static final String NAME = "Plain Text";
3848

39-
private static final Matcher MATCHER = new Matcher() {
49+
private static final int MIN_CHARS_WHILE_REMAINING = 20;
50+
51+
// Up to 4 octets per UTF-8 character
52+
private static final int TRY_UTF8_BYTES = MIN_CHARS_WHILE_REMAINING * 4;
53+
54+
/**
55+
* The reentrant {@link Matcher} implementation for plain-text files.
56+
*/
57+
public static final Matcher MATCHER = new Matcher() {
4058
@Override
4159
public String description() {
42-
return "UTF-8, UTF-16BE, or UTF-16LE Byte Order Mark is" +
43-
" present; or first eight bytes are all ASCII graphic" +
44-
" characters or ASCII whitespace";
60+
return "UTF-8, UTF-16BE, or UTF-16LE Byte Order Mark is present; or initial " +
61+
"bytes are all UTF-8-encoded graphic characters or whitespace";
4562
}
4663

4764
@Override
48-
public AnalyzerFactory isMagic(byte[] content, InputStream in)
49-
throws IOException {
50-
if (isPlainText(content)) {
65+
public AnalyzerFactory isMagic(byte[] content, InputStream in) throws IOException {
66+
int lengthBOM = IOUtils.skipForBOM(content);
67+
if (lengthBOM > 0) {
68+
return DEFAULT_INSTANCE;
69+
}
70+
if (readSomePlainCharactersUTF8noBOMwithoutError(in)) {
5171
return DEFAULT_INSTANCE;
52-
} else {
53-
return null;
5472
}
73+
return null;
5574
}
5675

5776
@Override
5877
public AnalyzerFactory forFactory() {
5978
return DEFAULT_INSTANCE;
6079
}
80+
};
6181

62-
/**
63-
* Check whether the byte array contains plain text. First, look
64-
* for a UTF BOM; otherwise, inspect as if US-ASCII.
65-
*/
66-
private boolean isPlainText(byte[] content) throws IOException {
67-
int lengthBOM = IOUtils.skipForBOM(content);
68-
if (lengthBOM > 0) {
69-
return true;
70-
}
71-
String ascii = new String(content, StandardCharsets.US_ASCII);
72-
return isPlainText(ascii);
73-
}
74-
75-
/**
76-
* Check whether the string only contains plain ASCII characters.
77-
*/
78-
private boolean isPlainText(String str) {
79-
for (int i = 0; i < str.length(); i++) {
80-
char b = str.charAt(i);
81-
if ((b >= 32 && b < 127) || // ASCII printable characters
82-
(b == 9) || // horizontal tab
83-
(b == 10) || // line feed
84-
(b == 12) || // form feed
85-
(b == 13)) { // carriage return
86-
// is plain text so far, go to next byte
87-
continue;
88-
} else {
89-
// 8-bit values or unprintable control characters,
90-
// probably not plain text
91-
return false;
92-
}
93-
}
94-
return true;
95-
}
96-
};
97-
98-
public static final PlainAnalyzerFactory DEFAULT_INSTANCE =
99-
new PlainAnalyzerFactory();
82+
/**
83+
* Gets the singleton, factory instance that associates
84+
* {@link PlainAnalyzer} with files whose initial bytes are the UTF-8,
85+
* UTF-16BE, or UTF-16LE Byte Order Mark; or whose initial bytes are all
86+
* UTF-8-encoded graphic characters or whitespace.
87+
*/
88+
public static final PlainAnalyzerFactory DEFAULT_INSTANCE = new PlainAnalyzerFactory();
10089

10190
private PlainAnalyzerFactory() {
10291
super(null, null, null, null, MATCHER, "text/plain", AbstractAnalyzer.Genre.PLAIN, NAME);
@@ -106,4 +95,63 @@ private PlainAnalyzerFactory() {
10695
protected AbstractAnalyzer newAnalyzer() {
10796
return new PlainAnalyzer(this);
10897
}
98+
99+
private static boolean readSomePlainCharactersUTF8noBOMwithoutError(InputStream in)
100+
throws IOException {
101+
102+
boolean isEOF = false;
103+
byte[] bytes = new byte[TRY_UTF8_BYTES];
104+
in.mark(TRY_UTF8_BYTES);
105+
int len = in.read(bytes);
106+
in.reset();
107+
if (len < 1) {
108+
return false;
109+
}
110+
if (len != TRY_UTF8_BYTES) {
111+
bytes = Arrays.copyOf(bytes, len);
112+
isEOF = true;
113+
}
114+
115+
/*
116+
* Decode one character at a time until either a decoding error occurs
117+
* (failure) or the minimum number of required, valid characters is
118+
* reached (success).
119+
*
120+
* "Decode bytes to chars one at a time"
121+
* answered by https://stackoverflow.com/users/1831293/evgeniy-dorofeev
122+
* https://stackoverflow.com/questions/17227331/decode-bytes-to-chars-one-at-a-time
123+
* asked by https://stackoverflow.com/users/244360/kong
124+
*
125+
* Used under CC 4 with modifications noted as follows as required by
126+
* license:
127+
* * 2021-08-15 -- [email protected], revised to check for errors.
128+
*/
129+
CharsetDecoder cd = StandardCharsets.UTF_8.newDecoder().
130+
onMalformedInput(CodingErrorAction.REPORT).
131+
onUnmappableCharacter(CodingErrorAction.REPORT);
132+
ByteBuffer bin = ByteBuffer.wrap(bytes);
133+
CharBuffer out = CharBuffer.allocate(MIN_CHARS_WHILE_REMAINING);
134+
int numCharacters = 0;
135+
CoderResult decodeResult = cd.decode(bin, out, isEOF);
136+
if (decodeResult.isError()) {
137+
return false;
138+
}
139+
140+
int numChars = out.position();
141+
out.position(0);
142+
for (int i = 0; i < numChars; ++i) {
143+
char c = out.charAt(i);
144+
if (Character.isISOControl(c) && !Character.isWhitespace(c)) {
145+
return false;
146+
}
147+
if (++numCharacters >= MIN_CHARS_WHILE_REMAINING) {
148+
return true;
149+
}
150+
}
151+
/*
152+
* At this point, as no error has occurred, then if any character was
153+
* read, consider the input as plain text.
154+
*/
155+
return (numCharacters > 0);
156+
}
109157
}

0 commit comments

Comments
 (0)