oracle
diff --git a/‎opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java‎
Lines changed: 20 additions & 1 deletion b/‎opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java‎
Lines changed: 20 additions & 1 deletion
diff --git a/‎opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzer.java‎
Lines changed: 92 additions & 0 deletions b/‎opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzer.java‎
Lines changed: 92 additions & 0 deletions
diff --git a/‎opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzerFactory.java‎
Lines changed: 54 additions & 0 deletions b/‎opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzerFactory.java‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java‎
Lines changed: 38 additions & 0 deletions b/‎opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/RuntimeEnvironment.java‎
Lines changed: 32 additions & 0 deletions b/‎opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/RuntimeEnvironment.java‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java‎
Lines changed: 31 additions & 6 deletions b/‎opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java‎
Lines changed: 31 additions & 6 deletions
@@ -67,6 +67,7 @@
 import org.opengrok.indexer.analysis.c.CxxAnalyzerFactory;
 import org.opengrok.indexer.analysis.clojure.ClojureAnalyzerFactory;
 import org.opengrok.indexer.analysis.csharp.CSharpAnalyzerFactory;
+import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory;
 import org.opengrok.indexer.analysis.data.IgnorantAnalyzerFactory;
 import org.opengrok.indexer.analysis.data.ImageAnalyzerFactory;
 import org.opengrok.indexer.analysis.document.MandocAnalyzerFactory;
@@ -244,6 +245,8 @@ public class AnalyzerGuru {
     private static final LangTreeMap langMap = new LangTreeMap();
     private static final LangTreeMap defaultLangMap = new LangTreeMap();
 
+    private static String hugeTextFileTypeName;
+
     /*
      * If you write your own analyzer please register it here. The order is
      * important for any factory that uses a FileAnalyzerFactory.Matcher
@@ -303,7 +306,8 @@ public class AnalyzerGuru {
                 new AsmAnalyzerFactory(),
                 new HCLAnalyzerFactory(),
                 new TerraformAnalyzerFactory(),
-                new RAnalyzerFactory()
+                new RAnalyzerFactory(),
+                HugeTextAnalyzerFactory.DEFAULT_INSTANCE
             };
 
             for (AnalyzerFactory analyzer : analyzers) {
@@ -393,6 +397,21 @@ public static List<AnalyzerFactory> getAnalyzerFactories() {
         return Collections.unmodifiableList(factories);
     }
 
+    /**
+     * Gets the file type name of the default
+     * {@link org.opengrok.indexer.analysis.data.HugeTextAnalyzer}, cached after
+     * first use. The field is read once so the value tested is the one returned.
+     * @return the cached file type name
+     */
+    public static String getHugeTextFileTypeName() {
+        String name = hugeTextFileTypeName; // read once: a second unsynchronized read could see null
+        if (name == null) {
+            name = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer().getFileTypeName();
+            hugeTextFileTypeName = name;
+        }
+        return name;
+    }
+
     /**
      * Register a {@code FileAnalyzerFactory} instance.
      */
 
@@ -0,0 +1,92 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020, Chris Fraire <[email protected]>.
+ */
+
+package org.opengrok.indexer.analysis.data;
+
+import org.apache.lucene.document.Document;
+import org.opengrok.indexer.analysis.AnalyzerFactory;
+import org.opengrok.indexer.analysis.FileAnalyzer;
+import org.opengrok.indexer.analysis.OGKTextField;
+import org.opengrok.indexer.analysis.StreamSource;
+import org.opengrok.indexer.configuration.RuntimeEnvironment;
+import org.opengrok.indexer.search.QueryBuilder;
+import org.opengrok.indexer.util.LimitedReader;
+import org.opengrok.indexer.util.IOUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Represents an analyzer for huge plain-text files: the content is indexed for
+ * full-text search through a character-limited reader and no xref is written.
+ */
+public class HugeTextAnalyzer extends FileAnalyzer {
+
+    /**
+     * Constructs an analyzer tied to the specified factory.
+     * @param factory defined instance that creates this analyzer
+     */
+    protected HugeTextAnalyzer(AnalyzerFactory factory) {
+        super(factory);
+    }
+
+    /**
+     * @return {@code null} because no ctags language is aligned
+     */
+    @Override
+    public String getCtagsLang() {
+        return null;
+    }
+
+    /**
+     * Gets the version number used to tag processed documents, so that a
+     * document stored with a different number can be re-analyzed later.
+     * @return 20200415_00
+     */
+    @Override
+    protected int getSpecializedVersionNo() {
+        return 20200415_00; // Keep the @return tag above in step!
+    }
+
+    @Override
+    public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException {
+        /*
+         * Though no xref is written, Lucene demands consistency or else it would
+         * throw IllegalArgumentException: cannot change field "full" from index
+         * options=DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS to inconsistent index
+         * options=DOCS_AND_FREQS_AND_POSITIONS
+         */
+        Reader content = getReader(src.getStream());
+        doc.add(new OGKTextField(QueryBuilder.FULL, content));
+    }
+
+    protected Reader getReader(InputStream stream) throws IOException {
+        // sourceRoot is read with UTF-8 as a default.
+        Reader decoded = IOUtils.createBOMStrippedReader(stream, StandardCharsets.UTF_8.name());
+        int limit = RuntimeEnvironment.getInstance().getHugeTextLimitCharacters();
+        return new LimitedReader(decoded, limit);
+    }
+}
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020, Chris Fraire <[email protected]>.
+ */
+
+package org.opengrok.indexer.analysis.data;
+
+import org.opengrok.indexer.analysis.AbstractAnalyzer;
+import org.opengrok.indexer.analysis.FileAnalyzerFactory;
+
+/**
+ * A factory that produces {@link HugeTextAnalyzer} instances.
+ */
+public class HugeTextAnalyzerFactory extends FileAnalyzerFactory {
+
+    private static final String NAME = "Huge Text";
+
+    /**
+     * The shared factory, with no file extensions, magic, or other mapping attribute.
+     */
+    public static final HugeTextAnalyzerFactory DEFAULT_INSTANCE = new HugeTextAnalyzerFactory();
+
+    private HugeTextAnalyzerFactory() {
+        super(null, null, null, null, null, null,
+                AbstractAnalyzer.Genre.DATA, NAME);
+    }
+
+    /**
+     * Instantiates a {@link HugeTextAnalyzer} tied to this factory.
+     * @return a defined instance
+     */
+    @Override
+    protected AbstractAnalyzer newAnalyzer() {
+        return new HugeTextAnalyzer(this);
+    }
+}
@@ -76,6 +76,8 @@ public final class Configuration {
 
     private static final Logger LOGGER = LoggerFactory.getLogger(Configuration.class);
     public static final String PLUGIN_DIRECTORY_DEFAULT = "plugins";
+    public static final int HUGE_TEXT_THRESHOLD_BYTES_DEFAULT = 1_000_000;
+    public static final int HUGE_TEXT_LIMIT_CHARACTERS_DEFAULT = 5_000_000;
 
     /**
      * A check if a pattern contains at least one pair of parentheses meaning
@@ -301,6 +303,9 @@ public final class Configuration {
 
     private Set<String> disabledRepositories;
 
+    private int hugeTextThresholdBytes;
+    private int hugeTextLimitCharacters;
+
     /*
      * types of handling history for remote SCM repositories:
      *  ON - index history and display it in webapp
@@ -526,6 +531,8 @@ public Configuration() {
         setHistoryCacheTime(30);
         setHistoryEnabled(true);
         setHitsPerPage(25);
+        setHugeTextLimitCharacters(HUGE_TEXT_LIMIT_CHARACTERS_DEFAULT);
+        setHugeTextThresholdBytes(HUGE_TEXT_THRESHOLD_BYTES_DEFAULT);
         setIgnoredNames(new IgnoredNames());
         setIncludedNames(new Filter());
         setIndexVersionedFilesOnly(false);
@@ -1323,6 +1330,37 @@ public void setDisabledRepositories(Set<String> disabledRepositories) {
         this.disabledRepositories = disabledRepositories;
     }
 
+    /**
+     * Gets the size in bytes at or above which a plain-text file is analyzed as
+     * a huge text data file, ineligible for xref. Default is 1_000_000.
+     */
+    public int getHugeTextThresholdBytes() {
+        return this.hugeTextThresholdBytes;
+    }
+
+    /**
+     * Sets the size in bytes at or above which a plain-text file is analyzed as
+     * a huge text data file, ineligible for xref. Negative values are stored as 0.
+     */
+    public void setHugeTextThresholdBytes(int value) {
+        hugeTextThresholdBytes = value < 0 ? 0 : value;
+    }
+
+    /**
+     * Gets the count of characters that will be analyzed from a huge text data
+     * file. Default is 5_000_000.
+     */
+    public int getHugeTextLimitCharacters() {
+        return this.hugeTextLimitCharacters;
+    }
+
+    /**
+     * Sets the count of characters to analyze from a huge text data file (min. 0).
+     */
+    public void setHugeTextLimitCharacters(int value) {
+        hugeTextLimitCharacters = value < 0 ? 0 : value;
+    }
+
     /**
      * Write the current configuration to a file.
      *
 
@@ -1342,6 +1342,38 @@ public void setDisabledRepositories(Set<String> disabledRepositories) {
         syncWriteConfiguration(disabledRepositories, Configuration::setDisabledRepositories);
     }
 
+    /**
+     * @return the configured size in bytes at or above which a plain-text file
+     * is analyzed as a huge text data file and is ineligible for xref
+     */
+    public int getHugeTextThresholdBytes() {
+        return syncReadConfiguration(Configuration::getHugeTextThresholdBytes);
+    }
+
+    /**
+     * @param hugeTextThresholdBytes the size in bytes at or above which a
+     * plain-text file is analyzed as a huge text data file, ineligible for xref
+     */
+    public void setHugeTextThresholdBytes(int hugeTextThresholdBytes) {
+        syncWriteConfiguration(hugeTextThresholdBytes, Configuration::setHugeTextThresholdBytes);
+    }
+
+    /**
+     * @return the configured number of characters to analyze from a huge text
+     * data file
+     */
+    public int getHugeTextLimitCharacters() {
+        return syncReadConfiguration(Configuration::getHugeTextLimitCharacters);
+    }
+
+    /**
+     * @param hugeTextLimitCharacters the number of characters to analyze from a
+     * huge text data file
+     */
+    public void setHugeTextLimitCharacters(int hugeTextLimitCharacters) {
+        syncWriteConfiguration(hugeTextLimitCharacters, Configuration::setHugeTextLimitCharacters);
+    }
+
     /**
      * Read an configuration file and set it as the current configuration.
      *
 
@@ -88,6 +88,7 @@
 import org.opengrok.indexer.analysis.AnalyzerGuru;
 import org.opengrok.indexer.analysis.Ctags;
 import org.opengrok.indexer.analysis.Definitions;
+import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory;
 import org.opengrok.indexer.configuration.PathAccepter;
 import org.opengrok.indexer.configuration.Project;
 import org.opengrok.indexer.configuration.RuntimeEnvironment;
@@ -709,6 +710,11 @@ private void addFile(File file, String path, Ctags ctags)
         RuntimeEnvironment env = RuntimeEnvironment.getInstance();
         AbstractAnalyzer fa = getAnalyzerFor(file, path);
 
+        if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) &&
+                file.length() >= env.getHugeTextThresholdBytes()) {
+            fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer();
+        }
+
         for (IndexChangedListener listener : listeners) {
             listener.fileAdd(path, fa.getClass().getSimpleName());
         }
@@ -1708,14 +1714,14 @@ private void finishWriting() throws IOException {
     }
 
     /**
-     * Verify TABSIZE, and evaluate AnalyzerGuru version together with ZVER --
-     * or return a value to indicate mismatch.
+     * Verify TABSIZE, validate AnalyzerGuru version together with Analyzer
+     * version, and recheck huge text file constraint -- or return a value to
+     * indicate mismatch.
      * @param file the source file object
      * @param path the source file path
      * @return {@code false} if a mismatch is detected
      */
-    private boolean checkSettings(File file,
-                                  String path) throws IOException {
+    private boolean checkSettings(File file, String path) throws IOException {
 
         RuntimeEnvironment env = RuntimeEnvironment.getInstance();
         boolean outIsXrefWriter = false;
@@ -1759,8 +1765,7 @@ private boolean checkSettings(File file,
                     break;
                 }
 
-                AnalyzerFactory fac =
-                        AnalyzerGuru.findByFileTypeName(fileTypeName);
+                AnalyzerFactory fac = AnalyzerGuru.findByFileTypeName(fileTypeName);
                 if (fac != null) {
                     fa = fac.getAnalyzer();
                 }
@@ -1795,7 +1800,27 @@ private boolean checkSettings(File file,
                 return false;
             }
 
+            // If it is a Huge Text file, re-check constraints.
+            if (AnalyzerGuru.getHugeTextFileTypeName().equals(fileTypeName) &&
+                    file.length() < env.getHugeTextThresholdBytes()) {
+                if (LOGGER.isLoggable(Level.FINE)) {
+                    LOGGER.log(Level.FINE, "{0} no longer qualifies: {1}",
+                            new Object[]{fileTypeName, path});
+                }
+                return false;
+            }
+
             if (fa != null) {
+                // If the Genre is PLAIN, re-check Huge Text file constraints.
+                if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) &&
+                        file.length() >= env.getHugeTextThresholdBytes()) {
+                    if (LOGGER.isLoggable(Level.FINE)) {
+                        LOGGER.log(Level.FINE, "{0} is now a huge text file: {1}",
+                                new Object[]{fileTypeName, path});
+                    }
+                    return false;
+                }
+
                 outIsXrefWriter = isXrefWriter(fa);
             }