Skip to content

Commit 752a7ce

Browse files
committed
Fix #534 Fix #1646 Fix #3097 : constrain huge text files
1 parent 1b26df7 commit 752a7ce

File tree

17 files changed

+791
-48
lines changed

17 files changed

+791
-48
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
import org.opengrok.indexer.analysis.c.CxxAnalyzerFactory;
6868
import org.opengrok.indexer.analysis.clojure.ClojureAnalyzerFactory;
6969
import org.opengrok.indexer.analysis.csharp.CSharpAnalyzerFactory;
70+
import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory;
7071
import org.opengrok.indexer.analysis.data.IgnorantAnalyzerFactory;
7172
import org.opengrok.indexer.analysis.data.ImageAnalyzerFactory;
7273
import org.opengrok.indexer.analysis.document.MandocAnalyzerFactory;
@@ -244,6 +245,8 @@ public class AnalyzerGuru {
244245
private static final LangTreeMap langMap = new LangTreeMap();
245246
private static final LangTreeMap defaultLangMap = new LangTreeMap();
246247

248+
private static String hugeTextFileTypeName;
249+
247250
/*
248251
* If you write your own analyzer please register it here. The order is
249252
* important for any factory that uses a FileAnalyzerFactory.Matcher
@@ -303,7 +306,8 @@ public class AnalyzerGuru {
303306
new AsmAnalyzerFactory(),
304307
new HCLAnalyzerFactory(),
305308
new TerraformAnalyzerFactory(),
306-
new RAnalyzerFactory()
309+
new RAnalyzerFactory(),
310+
HugeTextAnalyzerFactory.DEFAULT_INSTANCE
307311
};
308312

309313
for (AnalyzerFactory analyzer : analyzers) {
@@ -393,6 +397,21 @@ public static List<AnalyzerFactory> getAnalyzerFactories() {
393397
return Collections.unmodifiableList(factories);
394398
}
395399

400+
/**
401+
* Gets the normalized name of the
402+
* {@link org.opengrok.indexer.analysis.data.HugeTextAnalyzer} class.
403+
* @return a defined instance
404+
*/
405+
public static String getHugeTextFileTypeName() {
406+
if (hugeTextFileTypeName == null) {
407+
String newValue = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer().
408+
getFileTypeName();
409+
hugeTextFileTypeName = newValue;
410+
return newValue;
411+
}
412+
return hugeTextFileTypeName;
413+
}
414+
396415
/**
397416
* Register a {@code FileAnalyzerFactory} instance.
398417
*/
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2020, Chris Fraire <[email protected]>.
22+
*/
23+
24+
package org.opengrok.indexer.analysis.data;
25+
26+
import org.apache.lucene.document.Document;
27+
import org.opengrok.indexer.analysis.AnalyzerFactory;
28+
import org.opengrok.indexer.analysis.FileAnalyzer;
29+
import org.opengrok.indexer.analysis.OGKTextField;
30+
import org.opengrok.indexer.analysis.StreamSource;
31+
import org.opengrok.indexer.configuration.RuntimeEnvironment;
32+
import org.opengrok.indexer.search.QueryBuilder;
33+
import org.opengrok.indexer.util.LimitedReader;
34+
import org.opengrok.indexer.util.IOUtils;
35+
36+
import java.io.IOException;
37+
import java.io.InputStream;
38+
import java.io.Reader;
39+
import java.io.Writer;
40+
import java.nio.charset.StandardCharsets;
41+
42+
/**
43+
* Represents an analyzer for huge text data files that are not eligible for
44+
* xref.
45+
*/
46+
public class HugeTextAnalyzer extends FileAnalyzer {
47+
48+
/**
49+
* Creates a new instance.
50+
* @param factory defined instance for the analyzer
51+
*/
52+
protected HugeTextAnalyzer(AnalyzerFactory factory) {
53+
super(factory);
54+
}
55+
56+
/**
57+
* @return {@code null} as there is no aligned language
58+
*/
59+
@Override
60+
public String getCtagsLang() {
61+
return null;
62+
}
63+
64+
/**
65+
* Gets a version number to be used to tag processed documents so that
66+
* re-analysis can be re-done later if a stored version number is different
67+
* from the current implementation.
68+
* @return 20200415_00
69+
*/
70+
@Override
71+
protected int getSpecializedVersionNo() {
72+
return 20200415_00; // Edit comment above too!
73+
}
74+
75+
@Override
76+
public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException {
77+
/*
78+
* Though we don't intend to xref, Lucene demands consistency or else it
79+
* would throw IllegalArgumentException: cannot change field "full" from
80+
* index options=DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS to
81+
* inconsistent index options=DOCS_AND_FREQS_AND_POSITIONS
82+
*/
83+
doc.add(new OGKTextField(QueryBuilder.FULL, getReader(src.getStream())));
84+
}
85+
86+
protected Reader getReader(InputStream stream) throws IOException {
87+
// sourceRoot is read with UTF-8 as a default.
88+
return new LimitedReader(IOUtils.createBOMStrippedReader(stream,
89+
StandardCharsets.UTF_8.name()),
90+
RuntimeEnvironment.getInstance().getHugeTextLimitCharacters());
91+
}
92+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* CDDL HEADER START
3+
*
4+
* The contents of this file are subject to the terms of the
5+
* Common Development and Distribution License (the "License").
6+
* You may not use this file except in compliance with the License.
7+
*
8+
* See LICENSE.txt included in this distribution for the specific
9+
* language governing permissions and limitations under the License.
10+
*
11+
* When distributing Covered Code, include this CDDL HEADER in each
12+
* file and include the License file at LICENSE.txt.
13+
* If applicable, add the following below this CDDL HEADER, with the
14+
* fields enclosed by brackets "[]" replaced with your own identifying
15+
* information: Portions Copyright [yyyy] [name of copyright owner]
16+
*
17+
* CDDL HEADER END
18+
*/
19+
20+
/*
21+
* Copyright (c) 2020, Chris Fraire <[email protected]>.
22+
*/
23+
24+
package org.opengrok.indexer.analysis.data;
25+
26+
import org.opengrok.indexer.analysis.AbstractAnalyzer;
27+
import org.opengrok.indexer.analysis.FileAnalyzerFactory;
28+
29+
/**
30+
* Represents a factory for creating {@link HugeTextAnalyzer} instances.
31+
*/
32+
public class HugeTextAnalyzerFactory extends FileAnalyzerFactory {
33+
34+
private static final String NAME = "Huge Text";
35+
36+
/**
37+
* Gets a factory instance with no associated file extensions nor magic nor
38+
* any other mapping attribute.
39+
*/
40+
public static final HugeTextAnalyzerFactory DEFAULT_INSTANCE = new HugeTextAnalyzerFactory();
41+
42+
private HugeTextAnalyzerFactory() {
43+
super(null, null, null, null, null, null, AbstractAnalyzer.Genre.DATA, NAME);
44+
}
45+
46+
/**
47+
* Creates a new {@link HugeTextAnalyzer} instance.
48+
* @return a defined instance
49+
*/
50+
@Override
51+
protected AbstractAnalyzer newAnalyzer() {
52+
return new HugeTextAnalyzer(this);
53+
}
54+
}

opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ public final class Configuration {
7676

7777
private static final Logger LOGGER = LoggerFactory.getLogger(Configuration.class);
7878
public static final String PLUGIN_DIRECTORY_DEFAULT = "plugins";
79+
public static final int HUGE_TEXT_THRESHOLD_BYTES_DEFAULT = 1_000_000;
80+
public static final int HUGE_TEXT_LIMIT_CHARACTERS_DEFAULT = 5_000_000;
7981

8082
/**
8183
* A check if a pattern contains at least one pair of parentheses meaning
@@ -301,6 +303,9 @@ public final class Configuration {
301303

302304
private Set<String> disabledRepositories;
303305

306+
private int hugeTextThresholdBytes;
307+
private int hugeTextLimitCharacters;
308+
304309
/*
305310
* types of handling history for remote SCM repositories:
306311
* ON - index history and display it in webapp
@@ -526,6 +531,8 @@ public Configuration() {
526531
setHistoryCacheTime(30);
527532
setHistoryEnabled(true);
528533
setHitsPerPage(25);
534+
setHugeTextLimitCharacters(HUGE_TEXT_LIMIT_CHARACTERS_DEFAULT);
535+
setHugeTextThresholdBytes(HUGE_TEXT_THRESHOLD_BYTES_DEFAULT);
529536
setIgnoredNames(new IgnoredNames());
530537
setIncludedNames(new Filter());
531538
setIndexVersionedFilesOnly(false);
@@ -1323,6 +1330,37 @@ public void setDisabledRepositories(Set<String> disabledRepositories) {
13231330
this.disabledRepositories = disabledRepositories;
13241331
}
13251332

1333+
/**
1334+
* Gets the number of bytes at which a plain-text file will be analyzed
1335+
* as a huge text data file and be ineligible for xref. Default is 1_000_000.
1336+
*/
1337+
public int getHugeTextThresholdBytes() {
1338+
return hugeTextThresholdBytes;
1339+
}
1340+
1341+
/**
1342+
* Sets the number of bytes at which a plain-text file will be analyzed
1343+
* as a huge text data file and be ineligible for xref.
1344+
*/
1345+
public void setHugeTextThresholdBytes(int value) {
1346+
hugeTextThresholdBytes = Math.max(value, 0);
1347+
}
1348+
1349+
/**
1350+
* Gets the number of characters to analyze from a huge text data file.
1351+
* Default is 5_000_000.
1352+
*/
1353+
public int getHugeTextLimitCharacters() {
1354+
return hugeTextLimitCharacters;
1355+
}
1356+
1357+
/**
1358+
* Sets the number of characters to analyze from a huge text data file.
1359+
*/
1360+
public void setHugeTextLimitCharacters(int value) {
1361+
hugeTextLimitCharacters = Math.max(value, 0);
1362+
}
1363+
13261364
/**
13271365
* Write the current configuration to a file.
13281366
*

opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/RuntimeEnvironment.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,6 +1342,38 @@ public void setDisabledRepositories(Set<String> disabledRepositories) {
13421342
syncWriteConfiguration(disabledRepositories, Configuration::setDisabledRepositories);
13431343
}
13441344

1345+
/**
1346+
* Gets the configured number of bytes at which a plain-text file will be
1347+
* analyzed as a huge text data file and be ineligible for xref.
1348+
*/
1349+
public int getHugeTextThresholdBytes() {
1350+
return syncReadConfiguration(Configuration::getHugeTextThresholdBytes);
1351+
}
1352+
1353+
/**
1354+
* Sets the configured number of bytes at which a plain-text file will be
1355+
* analyzed as a huge text data file and be ineligible for xref.
1356+
*/
1357+
public void setHugeTextThresholdBytes(int hugeTextThresholdBytes) {
1358+
syncWriteConfiguration(hugeTextThresholdBytes, Configuration::setHugeTextThresholdBytes);
1359+
}
1360+
1361+
/**
1362+
* Gets the configured number of characters to analyze from a huge text
1363+
* data file.
1364+
*/
1365+
public int getHugeTextLimitCharacters() {
1366+
return syncReadConfiguration(Configuration::getHugeTextLimitCharacters);
1367+
}
1368+
1369+
/**
1370+
* Sets the configured number of characters to analyze from a huge text
1371+
* data file.
1372+
*/
1373+
public void setHugeTextLimitCharacters(int hugeTextLimitCharacters) {
1374+
syncWriteConfiguration(hugeTextLimitCharacters, Configuration::setHugeTextLimitCharacters);
1375+
}
1376+
13451377
/**
13461378
* Read a configuration file and set it as the current configuration.
13471379
*

opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@
8888
import org.opengrok.indexer.analysis.AnalyzerGuru;
8989
import org.opengrok.indexer.analysis.Ctags;
9090
import org.opengrok.indexer.analysis.Definitions;
91+
import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory;
9192
import org.opengrok.indexer.configuration.PathAccepter;
9293
import org.opengrok.indexer.configuration.Project;
9394
import org.opengrok.indexer.configuration.RuntimeEnvironment;
@@ -709,6 +710,11 @@ private void addFile(File file, String path, Ctags ctags)
709710
RuntimeEnvironment env = RuntimeEnvironment.getInstance();
710711
AbstractAnalyzer fa = getAnalyzerFor(file, path);
711712

713+
if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) &&
714+
file.length() >= env.getHugeTextThresholdBytes()) {
715+
fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer();
716+
}
717+
712718
for (IndexChangedListener listener : listeners) {
713719
listener.fileAdd(path, fa.getClass().getSimpleName());
714720
}
@@ -1708,14 +1714,14 @@ private void finishWriting() throws IOException {
17081714
}
17091715

17101716
/**
1711-
* Verify TABSIZE, and evaluate AnalyzerGuru version together with ZVER --
1712-
* or return a value to indicate mismatch.
1717+
* Verify TABSIZE, validate AnalyzerGuru version together with Analyzer
1718+
* version, and recheck huge text file constraint -- or return a value to
1719+
* indicate mismatch.
17131720
* @param file the source file object
17141721
* @param path the source file path
17151722
* @return {@code false} if a mismatch is detected
17161723
*/
1717-
private boolean checkSettings(File file,
1718-
String path) throws IOException {
1724+
private boolean checkSettings(File file, String path) throws IOException {
17191725

17201726
RuntimeEnvironment env = RuntimeEnvironment.getInstance();
17211727
boolean outIsXrefWriter = false;
@@ -1759,8 +1765,7 @@ private boolean checkSettings(File file,
17591765
break;
17601766
}
17611767

1762-
AnalyzerFactory fac =
1763-
AnalyzerGuru.findByFileTypeName(fileTypeName);
1768+
AnalyzerFactory fac = AnalyzerGuru.findByFileTypeName(fileTypeName);
17641769
if (fac != null) {
17651770
fa = fac.getAnalyzer();
17661771
}
@@ -1795,7 +1800,27 @@ private boolean checkSettings(File file,
17951800
return false;
17961801
}
17971802

1803+
// If it is a Huge Text file, re-check constraints.
1804+
if (AnalyzerGuru.getHugeTextFileTypeName().equals(fileTypeName) &&
1805+
file.length() < env.getHugeTextThresholdBytes()) {
1806+
if (LOGGER.isLoggable(Level.FINE)) {
1807+
LOGGER.log(Level.FINE, "{0} no longer qualifies: {1}",
1808+
new Object[]{fileTypeName, path});
1809+
}
1810+
return false;
1811+
}
1812+
17981813
if (fa != null) {
1814+
// If the Genre is PLAIN, re-check Huge Text file constraints.
1815+
if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) &&
1816+
file.length() >= env.getHugeTextThresholdBytes()) {
1817+
if (LOGGER.isLoggable(Level.FINE)) {
1818+
LOGGER.log(Level.FINE, "{0} is now a huge text file: {1}",
1819+
new Object[]{fileTypeName, path});
1820+
}
1821+
return false;
1822+
}
1823+
17991824
outIsXrefWriter = isXrefWriter(fa);
18001825
}
18011826

0 commit comments

Comments
 (0)