Fix #2560 : recognize Huge Text in gzip or bzip2

idodeclare · idodeclare · commit f9ff866eb8bb · 2020-10-09T10:56:23.000-05:00
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java
@@ -27,14 +27,16 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Writer;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
 import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
 import org.apache.tools.bzip2.CBZip2InputStream;
 import org.opengrok.indexer.analysis.AbstractAnalyzer;
 import org.opengrok.indexer.analysis.AnalyzerFactory;
 import org.opengrok.indexer.analysis.AnalyzerGuru;
-import org.opengrok.indexer.analysis.FileAnalyzer;
 import org.opengrok.indexer.analysis.StreamSource;
+import org.opengrok.indexer.logger.LoggerFactory;
 import org.opengrok.indexer.search.QueryBuilder;
 
 /**
@@ -43,17 +45,9 @@
  * Created on September 22, 2005
  * @author Chandan
  */
-public class BZip2Analyzer extends FileAnalyzer {
-
-    private Genre g;
+public class BZip2Analyzer extends CompressedAnalyzer {
 
-    @Override
-    public Genre getGenre() {
-        if (g != null) {
-            return g;
-        }
-        return super.getGenre();
-    }
+    private static final Logger LOGGER = LoggerFactory.getLogger(BZip2Analyzer.class);
 
     protected BZip2Analyzer(AnalyzerFactory factory) {
         super(factory);
@@ -71,11 +65,11 @@ public String getCtagsLang() {
      * Gets a version number to be used to tag processed documents so that
      * re-analysis can be re-done later if a stored version number is different
      * from the current implementation.
-     * @return 20180111_00
+     * @return 20200417_00
      */
     @Override
     protected int getSpecializedVersionNo() {
-        return 20180111_00; // Edit comment above too!
+        return 20200417_00; // Edit comment above too!
     }
 
     @Override
@@ -92,20 +86,12 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut)
             try (InputStream in = bzSrc.getStream()) {
                 fa = AnalyzerGuru.getAnalyzer(in, newname);
             }
-            if (!(fa instanceof BZip2Analyzer)) {
-                if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) {
-                    this.g = Genre.XREFABLE;
-                } else {
-                    this.g = Genre.DATA;
-                }
-                fa.analyze(doc, bzSrc, xrefOut);
-                if (doc.get(QueryBuilder.T) != null) {
-                    doc.removeField(QueryBuilder.T);
-                    if (g == Genre.XREFABLE) {
-                        doc.add(new Field(QueryBuilder.T, g.typeName(),
-                                AnalyzerGuru.string_ft_stored_nanalyzed_norms));
-                    }
-                }
+            if (fa == null) {
+                this.g = Genre.DATA;
+                LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname);
+                //TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ...
+            } else if (!(fa instanceof BZip2Analyzer)) {
+                analyzeUncompressed(doc, xrefOut, fa, bzSrc);
             }
         }
     }
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2017-2020, Chris Fraire <cfraire@me.com>.
+ */
+
+package org.opengrok.indexer.analysis.archive;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.opengrok.indexer.analysis.AbstractAnalyzer;
+import org.opengrok.indexer.analysis.AnalyzerFactory;
+import org.opengrok.indexer.analysis.AnalyzerGuru;
+import org.opengrok.indexer.analysis.FileAnalyzer;
+import org.opengrok.indexer.analysis.StreamSource;
+import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory;
+import org.opengrok.indexer.configuration.RuntimeEnvironment;
+import org.opengrok.indexer.search.QueryBuilder;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Writer;
+
+/**
+ * Represents a base for compressed formats (e.g. gzip or bzip2) but not for
+ * archive formats that have compression (e.g. Zip or Jar).
+ * @author Chandan
+ */
+public abstract class CompressedAnalyzer extends FileAnalyzer {
+
+    /**
+     * Genre assigned by {@link #analyzeUncompressed} or by a subclass. While
+     * it is {@code null}, {@link #getGenre()} defers to the superclass.
+     */
+    protected Genre g;
+
+    /**
+     * Gets the genre assigned to {@link #g} if there is one, or else the
+     * genre reported by the superclass.
+     * @return {@link #g} if non-null; otherwise the superclass value
+     */
+    @Override
+    public Genre getGenre() {
+        if (g != null) {
+            return g;
+        }
+        return super.getGenre();
+    }
+
+    protected CompressedAnalyzer(AnalyzerFactory factory) {
+        super(factory);
+    }
+
+    /**
+     * Analyzes the content of {@code compressedSrc} using {@code fa}, or
+     * using the analyzer from {@link HugeTextAnalyzerFactory#DEFAULT_INSTANCE}
+     * when {@code fa} is of plain genre and the content meets the configured
+     * Huge Text threshold, and then records the resulting genre in {@link #g}
+     * and in the {@link QueryBuilder#T} field of {@code doc}.
+     * @param doc the document to populate
+     * @param xrefOut the writer passed through to the selected analyzer
+     * @param fa the analyzer that was matched for the uncompressed content
+     * @param compressedSrc the source passed through to the selected analyzer
+     * @throws IOException if thrown while reading the source or analyzing
+     * @throws InterruptedException if thrown by the selected analyzer
+     */
+    protected void analyzeUncompressed(
+            Document doc, Writer xrefOut, AbstractAnalyzer fa, StreamSource compressedSrc)
+            throws IOException, InterruptedException {
+
+        if (fa.getGenre() == Genre.PLAIN) {
+            if (meetsHugeTextThreshold(compressedSrc)) {
+                fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer();
+                g = Genre.DATA;
+            } else {
+                g = Genre.XREFABLE;
+            }
+        } else if (fa.getGenre() == Genre.XREFABLE) {
+            g = Genre.XREFABLE;
+        } else {
+            g = Genre.DATA;
+        }
+
+        fa.analyze(doc, compressedSrc, xrefOut);
+        // Make the T field carry this analyzer's genre, dropping a prior value if present.
+        if (doc.get(QueryBuilder.T) != null) {
+            doc.removeField(QueryBuilder.T);
+        }
+        doc.add(new Field(QueryBuilder.T, g.typeName(),
+                AnalyzerGuru.string_ft_stored_nanalyzed_norms));
+    }
+
+    /**
+     * Determines whether the number of bytes readable from the stream of
+     * {@code compressedSrc} reaches the configured Huge Text threshold,
+     * stopping as soon as the threshold is met.
+     * @param compressedSrc the source whose stream is counted
+     * @return {@code true} if at least the threshold number of bytes was read
+     * @throws IOException if opening or reading the stream fails
+     */
+    private boolean meetsHugeTextThreshold(StreamSource compressedSrc) throws IOException {
+        RuntimeEnvironment env = RuntimeEnvironment.getInstance();
+        int hugeTextThresholdBytes = env.getHugeTextThresholdBytes();
+        if (Integer.MAX_VALUE == hugeTextThresholdBytes) {
+            // Don't bother decompressing to count if the limit is MAX_VALUE.
+            return false;
+        }
+
+        byte[] buf = new byte[8 * 1024];
+        // Count with a long: a threshold within one buffer-length of
+        // Integer.MAX_VALUE would otherwise overflow an int before the check.
+        long bytesRead = 0;
+        int n;
+        try (InputStream in = compressedSrc.getStream()) {
+            while ((n = in.read(buf, 0, buf.length)) != -1) {
+                bytesRead += n;
+                if (bytesRead >= hugeTextThresholdBytes) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+}
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java
@@ -32,11 +32,9 @@
 import java.util.logging.Logger;
 import java.util.zip.GZIPInputStream;
 import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
 import org.opengrok.indexer.analysis.AbstractAnalyzer;
 import org.opengrok.indexer.analysis.AnalyzerFactory;
 import org.opengrok.indexer.analysis.AnalyzerGuru;
-import org.opengrok.indexer.analysis.FileAnalyzer;
 import org.opengrok.indexer.analysis.StreamSource;
 import org.opengrok.indexer.logger.LoggerFactory;
 import org.opengrok.indexer.search.QueryBuilder;
@@ -47,20 +45,10 @@
  * Created on September 22, 2005
  * @author Chandan
  */
-public class GZIPAnalyzer extends FileAnalyzer {
+public class GZIPAnalyzer extends CompressedAnalyzer {
 
     private static final Logger LOGGER = LoggerFactory.getLogger(GZIPAnalyzer.class);
 
-    private Genre g;
-
-    @Override
-    public Genre getGenre() {
-        if (g != null) {
-            return g;
-        }
-        return super.getGenre();
-    }
-
     protected GZIPAnalyzer(AnalyzerFactory factory) {
         super(factory);
     }
@@ -77,11 +65,11 @@ public String getCtagsLang() {
      * Gets a version number to be used to tag processed documents so that
      * re-analysis can be re-done later if a stored version number is different
      * from the current implementation.
-     * @return 20180111_00
+     * @return 20200417_00
      */
     @Override
     protected int getSpecializedVersionNo() {
-        return 20180111_00; // Edit comment above too!
+        return 20200417_00; // Edit comment above too!
     }
 
     @Override
@@ -93,30 +81,16 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut)
         String path = doc.get(QueryBuilder.PATH);
         if (path != null && path.toLowerCase(Locale.ROOT).endsWith(".gz")) {
             String newname = path.substring(0, path.length() - 3);
-            //System.err.println("GZIPPED OF = " + newname);
             try (InputStream gzis = gzSrc.getStream()) {
                 fa = AnalyzerGuru.getAnalyzer(gzis, newname);
             }
             if (fa == null) {
                 this.g = Genre.DATA;
-                LOGGER.log(Level.WARNING, "Did not analyze {0}, detected as data.", newname);
+                LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname);
                 //TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ...
             } else { // cant recurse!
                 //simple file gziped case captured here
-                if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) {
-                    this.g = Genre.XREFABLE;
-                } else {
-                    this.g = Genre.DATA;
-                }
-                fa.analyze(doc, gzSrc, xrefOut);
-                if (doc.get(QueryBuilder.T) != null) {
-                    doc.removeField(QueryBuilder.T);
-                    if (g == Genre.XREFABLE) {
-                        doc.add(new Field(QueryBuilder.T, g.typeName(),
-                                AnalyzerGuru.string_ft_stored_nanalyzed_norms));
-                    }
-                }
-
+                analyzeUncompressed(doc, xrefOut, fa, gzSrc);
             }
         }
     }