Skip to content

Commit a39bcfe

Browse files
committed
Change the analyzers so that they don't keep the entire file in memory.
Instead of passing a stream to the analyze() methods and having them cache the entire contents in a byte or char array, open a new stream each time the source file needs to be read. Also, write the xref files from the analyze() methods. In many analyzers, this avoids the need to build up a data structure that holds the full xref output in memory. This fixes #8.
1 parent b5a3e34 commit a39bcfe

22 files changed

+443
-442
lines changed

src/org/opensolaris/opengrok/analysis/AnalyzerGuru.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -232,14 +232,16 @@ public static FileAnalyzer getAnalyzer(InputStream in, String file) throws IOExc
232232
/**
233233
* Create a Lucene document and fill in the required fields
234234
* @param file The file to index
235-
* @param in The data to generate the index for
236235
* @param path Where the file is located (from source root)
236+
* @param fa The analyzer to use on the file
237+
* @param xrefOut Where to write the xref (possibly {@code null})
237238
* @return The Lucene document to add to the index database
238239
* @throws java.io.IOException If an exception occurs while collecting the
239240
* datas
240241
*/
241-
public Document getDocument(File file, InputStream in, String path,
242-
FileAnalyzer fa) throws IOException {
242+
public Document getDocument(File file, String path,
243+
FileAnalyzer fa, Writer xrefOut)
244+
throws IOException {
243245
Document doc = new Document();
244246
String date = DateTools.timeToString(file.lastModified(),
245247
DateTools.Resolution.MILLISECOND);
@@ -272,7 +274,7 @@ public Document getDocument(File file, InputStream in, String path,
272274
doc.add(new Field("t", g.typeName(), string_ft_stored_nanalyzed_norms
273275
));
274276
}
275-
fa.analyze(doc, in);
277+
fa.analyze(doc, StreamSource.fromFile(file), xrefOut);
276278
}
277279

278280
return doc;

src/org/opensolaris/opengrok/analysis/FileAnalyzer.java

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -23,24 +23,16 @@
2323
*/
2424
package org.opensolaris.opengrok.analysis;
2525

26-
import java.io.BufferedWriter;
27-
import java.io.File;
28-
import java.io.FileOutputStream;
2926
import java.io.IOException;
30-
import java.io.InputStream;
31-
import java.io.OutputStream;
32-
import java.io.OutputStreamWriter;
3327
import java.io.Reader;
3428
import java.io.Writer;
3529
import java.util.logging.Level;
36-
import java.util.zip.GZIPOutputStream;
3730
import org.apache.lucene.analysis.Analyzer;
3831
import org.apache.lucene.document.Document;
3932
import org.opensolaris.opengrok.OpenGrokLogger;
4033
import org.opensolaris.opengrok.analysis.plain.PlainFullTokenizer;
4134
import org.opensolaris.opengrok.analysis.plain.PlainSymbolTokenizer;
4235
import org.opensolaris.opengrok.configuration.Project;
43-
import org.opensolaris.opengrok.configuration.RuntimeEnvironment;
4436

4537
/**
4638
* Base class for all different File Analyzers
@@ -138,7 +130,15 @@ public FileAnalyzer(FileAnalyzerFactory factory) {
138130

139131
}
140132

141-
public void analyze(Document doc, InputStream in) throws IOException {
133+
/**
134+
* Analyze the contents of a source file. This includes populating the
135+
* Lucene document with fields to add to the index, and writing the
136+
* cross-referenced data to the specified destination.
137+
* @param doc the Lucene document
138+
* @param src the input data source
139+
* @param xrefOut where to write the xref (may be {@code null})
140+
*/
141+
public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException {
142142
// not used
143143
}
144144

@@ -161,26 +161,4 @@ public TokenStreamComponents createComponents(String fieldName, Reader reader) {
161161
return null;
162162
}
163163
}
164-
165-
/**
166-
* Write a cross referenced HTML file.
167-
* @param out to writer HTML cross-reference
168-
* @throws java.io.IOException if an error occurs
169-
*/
170-
public void writeXref(Writer out) throws IOException {
171-
out.write("Error General File X-Ref writer!");
172-
}
173-
174-
public void writeXref(File xrefDir, String path) throws IOException {
175-
RuntimeEnvironment env = RuntimeEnvironment.getInstance();
176-
177-
final boolean compressed = env.isCompressXref();
178-
final File file = new File(xrefDir, path + (compressed ? ".gz" : ""));
179-
try (OutputStream out = compressed ?
180-
new GZIPOutputStream(new FileOutputStream(file)) :
181-
new FileOutputStream(file);
182-
Writer w = new BufferedWriter(new OutputStreamWriter(out))) {
183-
writeXref(w);
184-
}
185-
}
186164
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
22+
*/
23+
package org.opensolaris.opengrok.analysis;
24+
25+
import java.io.BufferedInputStream;
26+
import java.io.File;
27+
import java.io.FileInputStream;
28+
import java.io.IOException;
29+
import java.io.InputStream;
30+
31+
/**
32+
* This class lets you create {@code InputStream}s that read data from a
33+
* specific source. It could be used if you need to pass a stream as an
34+
* argument to a method where the stream may need to be read multiple times.
35+
* Instead of passing the stream directly, you pass a {@code StreamSource}
36+
* instance that generates the stream. The receiver may call
37+
* {@link #getStream()} multiple times, getting a fresh stream each time,
38+
* so that there may be multiple, concurrent readers that don't interfere
39+
* with each other.
40+
*/
41+
public abstract class StreamSource {
42+
/**
43+
* Get a stream that reads data from the input source. Every call should
44+
* return a new instance so that multiple readers can read from the source
45+
* without interfering with each other.
46+
*
47+
* @return an {@code InputStream}
48+
* @throws IOException if an error occurs when opening the stream
49+
*/
50+
public abstract InputStream getStream() throws IOException;
51+
52+
/**
53+
* Helper method that creates a {@code StreamSource} instance that
54+
* reads data from a file.
55+
*
56+
* @param file the data file
57+
* @return a stream source that reads from {@code file}
58+
*/
59+
public static StreamSource fromFile(final File file) {
60+
return new StreamSource() {
61+
@Override
62+
public InputStream getStream() throws IOException {
63+
return new BufferedInputStream(new FileInputStream(file));
64+
}
65+
};
66+
}
67+
}

src/org/opensolaris/opengrok/analysis/TextAnalyzer.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,25 +18,27 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2005, 2012, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
2222
*/
2323
package org.opensolaris.opengrok.analysis;
2424

25+
import java.io.BufferedInputStream;
2526
import java.io.IOException;
2627
import java.io.InputStream;
2728
import java.io.InputStreamReader;
2829
import java.io.Reader;
2930
import java.nio.charset.Charset;
30-
import org.apache.lucene.document.Document;
3131

3232
public abstract class TextAnalyzer extends FileAnalyzer {
3333

3434
public TextAnalyzer(FileAnalyzerFactory factory) {
3535
super(factory);
3636
}
3737

38-
@Override
39-
public final void analyze(Document doc, InputStream in) throws IOException {
38+
protected Reader getReader(InputStream stream) throws IOException {
39+
InputStream in = stream.markSupported() ?
40+
stream : new BufferedInputStream(stream);
41+
4042
String charset = null;
4143

4244
in.mark(3);
@@ -61,8 +63,6 @@ public final void analyze(Document doc, InputStream in) throws IOException {
6163
charset = Charset.defaultCharset().name();
6264
}
6365

64-
analyze(doc, new InputStreamReader(in, charset));
66+
return new InputStreamReader(in, charset);
6567
}
66-
67-
protected abstract void analyze(Document doc, Reader reader) throws IOException;
6868
}

src/org/opensolaris/opengrok/analysis/archive/BZip2Analyzer.java

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2005, 2012, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
2222
*/
2323
package org.opensolaris.opengrok.analysis.archive;
2424

@@ -33,6 +33,7 @@
3333
import org.opensolaris.opengrok.analysis.AnalyzerGuru;
3434
import org.opensolaris.opengrok.analysis.FileAnalyzer;
3535
import org.opensolaris.opengrok.analysis.FileAnalyzerFactory;
36+
import org.opensolaris.opengrok.analysis.StreamSource;
3637

3738
/**
3839
* Analyzes a BZip2 file Created on September 22, 2005
@@ -57,20 +58,16 @@ protected BZip2Analyzer(FileAnalyzerFactory factory) {
5758
private FileAnalyzer fa;
5859

5960
@Override
60-
public void analyze(Document doc, InputStream in) throws IOException {
61-
if (in.read() != 'B') {
62-
throw new IOException("Not BZIP2 format");
63-
}
64-
if (in.read() != 'Z') {
65-
throw new IOException("Not BZIP2 format");
66-
}
67-
BufferedInputStream gzis = new BufferedInputStream(new CBZip2InputStream(in));
61+
public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException {
62+
StreamSource bzSrc = wrap(src);
6863
String path = doc.get("path");
6964
if (path != null
7065
&& (path.endsWith(".bz2") || path.endsWith(".BZ2") || path.endsWith(".bz"))) {
7166
String newname = path.substring(0, path.lastIndexOf('.'));
7267
//System.err.println("BZIPPED OF = " + newname);
73-
fa = AnalyzerGuru.getAnalyzer(gzis, newname);
68+
try (InputStream in = bzSrc.getStream()) {
69+
fa = AnalyzerGuru.getAnalyzer(in, newname);
70+
}
7471
if (fa instanceof BZip2Analyzer) {
7572
fa = null;
7673
} else {
@@ -79,7 +76,7 @@ public void analyze(Document doc, InputStream in) throws IOException {
7976
} else {
8077
this.g = Genre.DATA;
8178
}
82-
fa.analyze(doc, gzis);
79+
fa.analyze(doc, bzSrc, xrefOut);
8380
if (doc.get("t") != null) {
8481
doc.removeField("t");
8582
if (g == Genre.XREFABLE) {
@@ -90,23 +87,30 @@ public void analyze(Document doc, InputStream in) throws IOException {
9087
}
9188
}
9289

90+
/**
91+
* Wrap the raw stream source in one that returns the uncompressed stream.
92+
*/
93+
private static StreamSource wrap(final StreamSource src) {
94+
return new StreamSource() {
95+
@Override
96+
public InputStream getStream() throws IOException {
97+
InputStream raw = src.getStream();
98+
// A BZip2 file starts with "BZ", but CBZip2InputStream
99+
// expects the magic bytes to be stripped off first.
100+
if (raw.read() == 'B' && raw.read() == 'Z') {
101+
return new BufferedInputStream(new CBZip2InputStream(raw));
102+
} else {
103+
throw new IOException("Not BZIP2 format");
104+
}
105+
}
106+
};
107+
}
108+
93109
@Override
94110
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
95111
if (fa != null) {
96112
return fa.createComponents(fieldName, reader);
97113
}
98114
return super.createComponents(fieldName, reader);
99115
}
100-
101-
/**
102-
* Write a cross referenced HTML file.
103-
*
104-
* @param out Writer to store HTML cross-reference
105-
*/
106-
@Override
107-
public void writeXref(Writer out) throws IOException {
108-
if ((fa != null) && (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE)) {
109-
fa.writeXref(out);
110-
}
111-
}
112116
}

src/org/opensolaris/opengrok/analysis/archive/GZIPAnalyzer.java

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2005, 2012, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
2222
*/
2323
package org.opensolaris.opengrok.analysis.archive;
2424

@@ -36,6 +36,7 @@
3636
import org.opensolaris.opengrok.analysis.FileAnalyzer;
3737
import org.opensolaris.opengrok.analysis.FileAnalyzer.Genre;
3838
import org.opensolaris.opengrok.analysis.FileAnalyzerFactory;
39+
import org.opensolaris.opengrok.analysis.StreamSource;
3940

4041
/**
4142
* Analyzes GZip files Created on September 22, 2005
@@ -60,14 +61,16 @@ protected GZIPAnalyzer(FileAnalyzerFactory factory) {
6061
private FileAnalyzer fa;
6162

6263
@Override
63-
public void analyze(Document doc, InputStream in) throws IOException {
64-
BufferedInputStream gzis = new BufferedInputStream(new GZIPInputStream(in));
64+
public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException {
65+
StreamSource gzSrc = wrap(src);
6566
String path = doc.get("path");
6667
if (path != null
6768
&& (path.endsWith(".gz") || path.endsWith(".GZ") || path.endsWith(".Gz"))) {
6869
String newname = path.substring(0, path.length() - 3);
6970
//System.err.println("GZIPPED OF = " + newname);
70-
fa = AnalyzerGuru.getAnalyzer(gzis, newname);
71+
try (InputStream gzis = gzSrc.getStream()) {
72+
fa = AnalyzerGuru.getAnalyzer(gzis, newname);
73+
}
7174
if (fa == null) {
7275
this.g = Genre.DATA;
7376
OpenGrokLogger.getLogger().log(Level.WARNING, "Did not analyze {0}, detected as data.", newname);
@@ -79,7 +82,7 @@ public void analyze(Document doc, InputStream in) throws IOException {
7982
} else {
8083
this.g = Genre.DATA;
8184
}
82-
fa.analyze(doc, gzis);
85+
fa.analyze(doc, gzSrc, xrefOut);
8386
if (doc.get("t") != null) {
8487
doc.removeField("t");
8588
if (g == Genre.XREFABLE) {
@@ -91,23 +94,24 @@ public void analyze(Document doc, InputStream in) throws IOException {
9194
}
9295
}
9396

97+
/**
98+
* Wrap the raw stream source in one that returns the uncompressed stream.
99+
*/
100+
private static StreamSource wrap(final StreamSource src) {
101+
return new StreamSource() {
102+
@Override
103+
public InputStream getStream() throws IOException {
104+
return new BufferedInputStream(
105+
new GZIPInputStream(src.getStream()));
106+
}
107+
};
108+
}
109+
94110
@Override
95111
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
96112
if (fa != null) {
97113
return fa.createComponents(fieldName, reader);
98114
}
99115
return super.createComponents(fieldName, reader);
100116
}
101-
102-
/**
103-
* Write a cross referenced HTML file.
104-
*
105-
* @param out Writer to store HTML cross-reference
106-
*/
107-
@Override
108-
public void writeXref(Writer out) throws IOException {
109-
if ((fa != null) && (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE)) {
110-
fa.writeXref(out);
111-
}
112-
}
113117
}

0 commit comments

Comments
 (0)