Skip to content

Commit 4fdbf16

Browse files
authored
Merge pull request #1635 from tetsurom/master
Improve UTF-8/16 BOM handling.
2 parents 44621f5 + 93e370b commit 4fdbf16

File tree

6 files changed

+74
-39
lines changed

6 files changed

+74
-39
lines changed

src/org/opensolaris/opengrok/analysis/TextAnalyzer.java

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import java.io.InputStreamReader;
2929
import java.io.Reader;
3030
import java.nio.charset.Charset;
31+
import org.opensolaris.opengrok.util.IOUtils;
3132

3233
public abstract class TextAnalyzer extends FileAnalyzer {
3334

@@ -36,33 +37,6 @@ public TextAnalyzer(FileAnalyzerFactory factory) {
3637
}
3738

3839
protected Reader getReader(InputStream stream) throws IOException {
39-
InputStream in = stream.markSupported() ?
40-
stream : new BufferedInputStream(stream);
41-
42-
String charset = null;
43-
44-
in.mark(3);
45-
46-
byte[] head = new byte[3];
47-
int br = in.read(head, 0, 3);
48-
49-
if (br >= 2
50-
&& (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF)
51-
|| (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE)) {
52-
charset = "UTF-16";
53-
in.reset();
54-
} else if (br >= 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB
55-
&& head[2] == (byte) 0xBF) {
56-
// InputStreamReader does not properly discard BOM on UTF8 streams,
57-
// so don't reset the stream.
58-
charset = "UTF-8";
59-
}
60-
61-
if (charset == null) {
62-
in.reset();
63-
charset = Charset.defaultCharset().name();
64-
}
65-
66-
return new InputStreamReader(in, charset);
40+
return IOUtils.createBOMStrippedReader(stream);
6741
}
6842
}

src/org/opensolaris/opengrok/search/Results.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
import org.opensolaris.opengrok.configuration.RuntimeEnvironment;
5656
import org.opensolaris.opengrok.history.HistoryException;
5757
import org.opensolaris.opengrok.logger.LoggerFactory;
58+
import org.opensolaris.opengrok.util.IOUtils;
5859
import org.opensolaris.opengrok.web.Prefix;
5960
import org.opensolaris.opengrok.web.SearchHelper;
6061
import org.opensolaris.opengrok.web.Util;
@@ -118,10 +119,11 @@ private static Reader getXrefReader(
118119
File basedir, String path, boolean compressed)
119120
throws IOException {
120121
if (compressed) {
121-
return new BufferedReader(new InputStreamReader(new GZIPInputStream(
122-
new FileInputStream(new File(basedir, path + ".gz")))));
122+
return new BufferedReader(IOUtils.createBOMStrippedReader(
123+
new GZIPInputStream(new FileInputStream(new File(basedir, path + ".gz")))));
123124
} else {
124-
return new BufferedReader(new FileReader(new File(basedir, path)));
125+
return new BufferedReader(IOUtils.createBOMStrippedReader(
126+
new FileInputStream(new File(basedir, path))));
125127
}
126128
}
127129

@@ -233,8 +235,8 @@ public static void prettyPrint(Writer out, SearchHelper sh, int start,
233235
String htags = getTags(sh.sourceRoot, rpath, false);
234236
out.write(sh.summarizer.getSummary(htags).toString());
235237
} else {
236-
FileReader r = genre == Genre.PLAIN
237-
? new FileReader(new File(sh.sourceRoot, rpath))
238+
Reader r = genre == Genre.PLAIN
239+
? IOUtils.createBOMStrippedReader(new FileInputStream(new File(sh.sourceRoot, rpath)))
238240
: null;
239241
sh.sourceContext.getContext(r, out, xrefPrefix, morePrefix,
240242
rpath, tags, true, sh.builder.isDefSearch(), null, scopes);

src/org/opensolaris/opengrok/util/IOUtils.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,16 @@
2323
*/
2424
package org.opensolaris.opengrok.util;
2525

26+
import java.io.BufferedInputStream;
27+
import java.io.BufferedReader;
2628
import java.io.Closeable;
2729
import java.io.File;
2830
import java.io.FilenameFilter;
2931
import java.io.IOException;
32+
import java.io.InputStream;
33+
import java.io.InputStreamReader;
34+
import java.io.Reader;
35+
import java.nio.charset.Charset;
3036
import java.nio.file.FileVisitResult;
3137
import java.nio.file.Files;
3238
import java.nio.file.Path;
@@ -165,4 +171,52 @@ public boolean accept(File dir, String name) {
165171
}
166172
return Arrays.asList(files);
167173
}
174+
175+
/**
176+
* Create BOM stripped reader from the stream.
177+
* Charset of the reader is set to UTF-8, UTF-16 or system's default.
178+
* @param stream input stream
179+
* @return reader for the stream without BOM
180+
*/
181+
public static Reader createBOMStrippedReader(InputStream stream) throws IOException {
182+
return createBOMStrippedReader(stream, Charset.defaultCharset().name());
183+
}
184+
185+
/**
186+
* Create BOM stripped reader from the stream.
187+
* Charset of the reader is set to UTF-8, UTF-16 or default.
188+
* @param stream input stream
189+
* @param defaultCharset default charset
190+
* @return reader for the stream without BOM
191+
*/
192+
public static Reader createBOMStrippedReader(InputStream stream, String defaultCharset) throws IOException {
193+
InputStream in = stream.markSupported() ?
194+
stream : new BufferedInputStream(stream);
195+
196+
String charset = null;
197+
198+
in.mark(3);
199+
200+
byte[] head = new byte[3];
201+
int br = in.read(head, 0, 3);
202+
203+
if (br >= 2
204+
&& (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF)
205+
|| (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE)) {
206+
charset = "UTF-16";
207+
in.reset();
208+
} else if (br >= 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB
209+
&& head[2] == (byte) 0xBF) {
210+
// InputStreamReader does not properly discard BOM on UTF8 streams,
211+
// so don't reset the stream.
212+
charset = "UTF-8";
213+
}
214+
215+
if (charset == null) {
216+
in.reset();
217+
charset = defaultCharset;
218+
}
219+
220+
return new InputStreamReader(in, charset);
221+
}
168222
}

src/org/opensolaris/opengrok/web/PageConfig.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ public DiffData getDiffData() {
267267
Project p = getProject();
268268
for (int i = 0; i < 2; i++) {
269269
try (BufferedReader br = new BufferedReader(
270-
ExpandTabsReader.wrap(new InputStreamReader(in[i]), p))) {
270+
ExpandTabsReader.wrap(IOUtils.createBOMStrippedReader(in[i]), p))) {
271271
String line;
272272
while ((line = br.readLine()) != null) {
273273
lines.add(line);

web/list.jsp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ org.opensolaris.opengrok.analysis.FileAnalyzer.Genre,
4545
org.opensolaris.opengrok.analysis.FileAnalyzerFactory,
4646
org.opensolaris.opengrok.history.Annotation,
4747
org.opensolaris.opengrok.index.IndexDatabase,
48+
org.opensolaris.opengrok.util.IOUtils,
4849
org.opensolaris.opengrok.web.DirectoryListing"
4950
%><%
5051
{
@@ -180,7 +181,7 @@ Binary file [Click <a href="<%= rawPath %>?r=<%= Util.URIEncode(rev) %>">here</a
180181
Annotation annotation = cfg.getAnnotation();
181182
//not needed yet
182183
//annotation.writeTooltipMap(out);
183-
r = new InputStreamReader(in);
184+
r = IOUtils.createBOMStrippedReader(in);
184185
AnalyzerGuru.writeXref(a, r, out, defs,
185186
annotation, Project.getProject(resourceFile));
186187
} else if (g == Genre.IMAGE) {
@@ -267,7 +268,7 @@ Binary file [Click <a href="<%= rawPath %>?r=<%= Util.URIEncode(rev) %>">here</a
267268
// find the definitions in the index.
268269
Definitions defs = IndexDatabase.getDefinitions(resourceFile);
269270
Annotation annotation = cfg.getAnnotation();
270-
r = new InputStreamReader(bin);
271+
r = IOUtils.createBOMStrippedReader(bin);
271272
AnalyzerGuru.writeXref(a, r, out, defs, annotation,
272273
Project.getProject(resourceFile));
273274
%></pre>

web/more.jsp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,16 @@ Copyright (c) 2010, 2017, Oracle and/or its affiliates. All rights reserved.
2323
Portions Copyright 2011 Jens Elkner.
2424
2525
--%><%@page errorPage="error.jsp" import="
26-
java.io.FileReader,
26+
java.io.FileInputStream,
27+
java.io.Reader,
2728
java.util.logging.Level,
2829
java.util.logging.Logger,
2930

3031
org.apache.lucene.search.Query,
3132
org.opensolaris.opengrok.search.QueryBuilder,
3233
org.opensolaris.opengrok.search.context.Context,
33-
org.opensolaris.opengrok.logger.LoggerFactory"
34+
org.opensolaris.opengrok.logger.LoggerFactory,
35+
org.opensolaris.opengrok.util.IOUtils"
3436
%>
3537
<%
3638
{
@@ -54,7 +56,9 @@ file="mast.jsp"
5456
%><p><span class="pagetitle">Lines Matching <b><%= tquery %></b></span></p>
5557
<div id="more" style="line-height:1.5em;">
5658
<pre><%
57-
sourceContext.getContext(new FileReader(cfg.getResourceFile()), out,
59+
Reader r = IOUtils.createBOMStrippedReader(
60+
new FileInputStream(cfg.getResourceFile()));
61+
sourceContext.getContext(r, out,
5862
request.getContextPath() + Prefix.XREF_P, null, cfg.getPath(),
5963
null, false, false, null, null);
6064
%></pre>

0 commit comments

Comments
 (0)