Skip to content

Commit f9ff866

Browse files
committed
Fix #2560 : recognize Huge Text in gzip or bzip2
1 parent 752a7ce commit f9ff866

File tree

3 files changed

+128
-59
lines changed

3 files changed

+128
-59
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java

Lines changed: 14 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,16 @@
2727
import java.io.IOException;
2828
import java.io.InputStream;
2929
import java.io.Writer;
30+
import java.util.logging.Level;
31+
import java.util.logging.Logger;
32+
3033
import org.apache.lucene.document.Document;
31-
import org.apache.lucene.document.Field;
3234
import org.apache.tools.bzip2.CBZip2InputStream;
3335
import org.opengrok.indexer.analysis.AbstractAnalyzer;
3436
import org.opengrok.indexer.analysis.AnalyzerFactory;
3537
import org.opengrok.indexer.analysis.AnalyzerGuru;
36-
import org.opengrok.indexer.analysis.FileAnalyzer;
3738
import org.opengrok.indexer.analysis.StreamSource;
39+
import org.opengrok.indexer.logger.LoggerFactory;
3840
import org.opengrok.indexer.search.QueryBuilder;
3941

4042
/**
@@ -43,17 +45,9 @@
4345
* Created on September 22, 2005
4446
* @author Chandan
4547
*/
46-
public class BZip2Analyzer extends FileAnalyzer {
47-
48-
private Genre g;
48+
public class BZip2Analyzer extends CompressedAnalyzer {
4949

50-
@Override
51-
public Genre getGenre() {
52-
if (g != null) {
53-
return g;
54-
}
55-
return super.getGenre();
56-
}
50+
private static final Logger LOGGER = LoggerFactory.getLogger(BZip2Analyzer.class);
5751

5852
protected BZip2Analyzer(AnalyzerFactory factory) {
5953
super(factory);
@@ -71,11 +65,11 @@ public String getCtagsLang() {
7165
* Gets a version number to be used to tag processed documents so that
7266
* re-analysis can be re-done later if a stored version number is different
7367
* from the current implementation.
74-
* @return 20180111_00
68+
* @return 20200417_00
7569
*/
7670
@Override
7771
protected int getSpecializedVersionNo() {
78-
return 20180111_00; // Edit comment above too!
72+
return 20200417_00; // Edit comment above too!
7973
}
8074

8175
@Override
@@ -92,20 +86,12 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut)
9286
try (InputStream in = bzSrc.getStream()) {
9387
fa = AnalyzerGuru.getAnalyzer(in, newname);
9488
}
95-
if (!(fa instanceof BZip2Analyzer)) {
96-
if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) {
97-
this.g = Genre.XREFABLE;
98-
} else {
99-
this.g = Genre.DATA;
100-
}
101-
fa.analyze(doc, bzSrc, xrefOut);
102-
if (doc.get(QueryBuilder.T) != null) {
103-
doc.removeField(QueryBuilder.T);
104-
if (g == Genre.XREFABLE) {
105-
doc.add(new Field(QueryBuilder.T, g.typeName(),
106-
AnalyzerGuru.string_ft_stored_nanalyzed_norms));
107-
}
108-
}
89+
if (fa == null) {
90+
this.g = Genre.DATA;
91+
LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname);
92+
//TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ...
93+
} else if (!(fa instanceof BZip2Analyzer)) {
94+
analyzeUncompressed(doc, xrefOut, fa, bzSrc);
10995
}
11096
}
11197
}
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved.
22+
* Portions Copyright (c) 2017-2020, Chris Fraire <[email protected]>.
23+
*/
24+
25+
package org.opengrok.indexer.analysis.archive;
26+
27+
import org.apache.lucene.document.Document;
28+
import org.apache.lucene.document.Field;
29+
import org.opengrok.indexer.analysis.AbstractAnalyzer;
30+
import org.opengrok.indexer.analysis.AnalyzerFactory;
31+
import org.opengrok.indexer.analysis.AnalyzerGuru;
32+
import org.opengrok.indexer.analysis.FileAnalyzer;
33+
import org.opengrok.indexer.analysis.StreamSource;
34+
import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory;
35+
import org.opengrok.indexer.configuration.RuntimeEnvironment;
36+
import org.opengrok.indexer.search.QueryBuilder;
37+
38+
import java.io.IOException;
39+
import java.io.InputStream;
40+
import java.io.Writer;
41+
42+
/**
43+
* Represents a base for compressed formats (e.g. gzip or bzip2) but not for
44+
* archive formats that have compression (e.g. Zip or Jar).
45+
* @author Chandan
46+
*/
47+
public abstract class CompressedAnalyzer extends FileAnalyzer {
48+
49+
protected Genre g;
50+
51+
@Override
52+
public Genre getGenre() {
53+
if (g != null) {
54+
return g;
55+
}
56+
return super.getGenre();
57+
}
58+
59+
protected CompressedAnalyzer(AnalyzerFactory factory) {
60+
super(factory);
61+
}
62+
63+
protected void analyzeUncompressed(
64+
Document doc, Writer xrefOut, AbstractAnalyzer fa, StreamSource compressedSrc)
65+
throws IOException, InterruptedException {
66+
67+
if (fa.getGenre() == Genre.PLAIN) {
68+
if (meetsHugeTextThreshold(compressedSrc)) {
69+
fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer();
70+
g = Genre.DATA;
71+
} else {
72+
g = Genre.XREFABLE;
73+
}
74+
} else if (fa.getGenre() == Genre.XREFABLE) {
75+
g = Genre.XREFABLE;
76+
} else {
77+
g = Genre.DATA;
78+
}
79+
80+
fa.analyze(doc, compressedSrc, xrefOut);
81+
if (doc.get(QueryBuilder.T) != null) {
82+
doc.removeField(QueryBuilder.T);
83+
}
84+
doc.add(new Field(QueryBuilder.T, g.typeName(),
85+
AnalyzerGuru.string_ft_stored_nanalyzed_norms));
86+
}
87+
88+
private boolean meetsHugeTextThreshold(StreamSource compressedSrc) throws IOException {
89+
RuntimeEnvironment env = RuntimeEnvironment.getInstance();
90+
int hugeTextThresholdBytes = env.getHugeTextThresholdBytes();
91+
if (Integer.MAX_VALUE == hugeTextThresholdBytes) {
92+
// Don't bother decompressing to count if the limit is MAX_VALUE.
93+
return false;
94+
}
95+
96+
byte[] buf = new byte[8 * 1024];
97+
int bytesRead = 0;
98+
int n;
99+
try (InputStream in = compressedSrc.getStream()) {
100+
while ((n = in.read(buf, 0, buf.length)) != -1) {
101+
bytesRead += n;
102+
if (bytesRead >= hugeTextThresholdBytes) {
103+
return true;
104+
}
105+
}
106+
}
107+
return false;
108+
}
109+
}

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java

Lines changed: 5 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,9 @@
3232
import java.util.logging.Logger;
3333
import java.util.zip.GZIPInputStream;
3434
import org.apache.lucene.document.Document;
35-
import org.apache.lucene.document.Field;
3635
import org.opengrok.indexer.analysis.AbstractAnalyzer;
3736
import org.opengrok.indexer.analysis.AnalyzerFactory;
3837
import org.opengrok.indexer.analysis.AnalyzerGuru;
39-
import org.opengrok.indexer.analysis.FileAnalyzer;
4038
import org.opengrok.indexer.analysis.StreamSource;
4139
import org.opengrok.indexer.logger.LoggerFactory;
4240
import org.opengrok.indexer.search.QueryBuilder;
@@ -47,20 +45,10 @@
4745
* Created on September 22, 2005
4846
* @author Chandan
4947
*/
50-
public class GZIPAnalyzer extends FileAnalyzer {
48+
public class GZIPAnalyzer extends CompressedAnalyzer {
5149

5250
private static final Logger LOGGER = LoggerFactory.getLogger(GZIPAnalyzer.class);
5351

54-
private Genre g;
55-
56-
@Override
57-
public Genre getGenre() {
58-
if (g != null) {
59-
return g;
60-
}
61-
return super.getGenre();
62-
}
63-
6452
protected GZIPAnalyzer(AnalyzerFactory factory) {
6553
super(factory);
6654
}
@@ -77,11 +65,11 @@ public String getCtagsLang() {
7765
* Gets a version number to be used to tag processed documents so that
7866
* re-analysis can be re-done later if a stored version number is different
7967
* from the current implementation.
80-
* @return 20180111_00
68+
* @return 20200417_00
8169
*/
8270
@Override
8371
protected int getSpecializedVersionNo() {
84-
return 20180111_00; // Edit comment above too!
72+
return 20200417_00; // Edit comment above too!
8573
}
8674

8775
@Override
@@ -93,30 +81,16 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut)
9381
String path = doc.get(QueryBuilder.PATH);
9482
if (path != null && path.toLowerCase(Locale.ROOT).endsWith(".gz")) {
9583
String newname = path.substring(0, path.length() - 3);
96-
//System.err.println("GZIPPED OF = " + newname);
9784
try (InputStream gzis = gzSrc.getStream()) {
9885
fa = AnalyzerGuru.getAnalyzer(gzis, newname);
9986
}
10087
if (fa == null) {
10188
this.g = Genre.DATA;
102-
LOGGER.log(Level.WARNING, "Did not analyze {0}, detected as data.", newname);
89+
LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname);
10390
//TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ...
10491
} else { // cant recurse!
10592
//simple file gziped case captured here
106-
if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) {
107-
this.g = Genre.XREFABLE;
108-
} else {
109-
this.g = Genre.DATA;
110-
}
111-
fa.analyze(doc, gzSrc, xrefOut);
112-
if (doc.get(QueryBuilder.T) != null) {
113-
doc.removeField(QueryBuilder.T);
114-
if (g == Genre.XREFABLE) {
115-
doc.add(new Field(QueryBuilder.T, g.typeName(),
116-
AnalyzerGuru.string_ft_stored_nanalyzed_norms));
117-
}
118-
}
119-
93+
analyzeUncompressed(doc, xrefOut, fa, gzSrc);
12094
}
12195
}
12296
}

0 commit comments

Comments
 (0)