Skip to content

Commit bc1aea9

Browse files
committed
Make list.jsp Huge-Text-aware
Also, move some logic properly to AnalyzerGuru that had crept into IndexDatabase.
1 parent 15458da commit bc1aea9

File tree

10 files changed

+255
-185
lines changed

10 files changed

+255
-185
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java

Lines changed: 133 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@
2323
*/
2424
package org.opengrok.indexer.analysis;
2525

26+
import java.io.BufferedInputStream;
2627
import java.io.BufferedReader;
2728
import java.io.File;
29+
import java.io.FileInputStream;
2830
import java.io.FileWriter;
2931
import java.io.IOException;
3032
import java.io.InputStream;
@@ -47,6 +49,7 @@
4749
import java.util.TreeSet;
4850
import java.util.logging.Level;
4951
import java.util.logging.Logger;
52+
5053
import org.apache.lucene.document.DateTools;
5154
import org.apache.lucene.document.Document;
5255
import org.apache.lucene.document.Field;
@@ -130,6 +133,13 @@
130133
*/
131134
public class AnalyzerGuru {
132135

136+
/**
137+
* A value used as a placeholder for a filename when content is anonymous
138+
* (e.g. from temporary source or from a stream for which an identifier is
139+
* not available).
140+
*/
141+
public static final String ANONYMOUS_NAME = "<anonymous>";
142+
133143
/**
134144
* The maximum number of characters (multi-byte if a BOM is identified) to
135145
* read from the input stream to be used for magic string matching.
@@ -551,29 +561,92 @@ public static AbstractAnalyzer getAnalyzer(String fileTypeName) {
551561
}
552562

553563
/**
554-
* Get an analyzer suited to analyze a file. This function will reuse
555-
* analyzers since they are costly.
564+
* Gets an analyzer factory suited to analyze a file, but without a check
565+
* for Huge Text since the file size is not available.
556566
*
557567
* @param in Input stream containing data to be analyzed
558-
* @param file Name of the file to be analyzed
559-
* @return An analyzer suited for that file content
568+
* @param fileName Name of the file to be analyzed
569+
* @return An analyzer factory suited for that file content
560570
* @throws java.io.IOException If an error occurs while accessing the data
561571
* in the input stream.
562572
*/
563-
public static AbstractAnalyzer getAnalyzer(InputStream in, String file) throws IOException {
564-
AnalyzerFactory factory = find(in, file);
573+
public static AnalyzerFactory getAnalyzerFactory(InputStream in, String fileName)
574+
throws IOException {
575+
AnalyzerFactory factory = find(in, fileName);
565576
if (factory == null) {
566-
AbstractAnalyzer defaultAnalyzer = getAnalyzer();
577+
factory = DEFAULT_ANALYZER_FACTORY;
567578
if (LOGGER.isLoggable(Level.FINEST)) {
579+
AbstractAnalyzer defaultAnalyzer = factory.getAnalyzer();
568580
LOGGER.log(Level.FINEST, "{0}: fallback {1}",
569-
new Object[]{file,
570-
defaultAnalyzer.getClass().getSimpleName() });
581+
new Object[]{fileName, defaultAnalyzer.getClass().getSimpleName()});
571582
}
572-
return defaultAnalyzer;
573583
}
584+
return factory;
585+
}
586+
587+
/**
588+
* Gets an analyzer suited to analyze a file, but without a check for Huge
589+
* Text since the file size is not available.
590+
*
591+
* @param in Input stream containing data to be analyzed
592+
* @param fileName Name of the file to be analyzed
593+
* @return An analyzer suited for the file content
594+
* @throws java.io.IOException If an error occurs while accessing the data
595+
* in the input stream.
596+
*/
597+
public static AbstractAnalyzer getAnalyzer(InputStream in, String fileName)
598+
throws IOException {
599+
AnalyzerFactory factory = getAnalyzerFactory(in, fileName);
574600
return factory.getAnalyzer();
575601
}
576602

603+
/**
604+
* Gets an analyzer factory suited to analyze a file, with a check for Huge
605+
* Text.
606+
*
607+
* @param file a defined instance to be analyzed
608+
* @param path Name (possibly normalized) of the file to be analyzed
609+
* @param logHugeText a value indicating whether to log if the file is
610+
* identified as Huge Text
611+
* @return An analyzer factory suited for the file content
612+
* @throws java.io.IOException If an error occurs while reading the file
613+
*/
614+
public static AnalyzerFactory getAnalyzerFactory(File file, String path, boolean logHugeText)
615+
throws IOException {
616+
617+
AnalyzerFactory fac;
618+
try (InputStream in = new BufferedInputStream(
619+
new FileInputStream(file))) {
620+
fac = AnalyzerGuru.getAnalyzerFactory(in, path);
621+
}
622+
623+
if (AbstractAnalyzer.Genre.PLAIN.equals(fac.getGenre()) &&
624+
file.length() >= RuntimeEnvironment.getInstance().getHugeTextThresholdBytes()) {
625+
fac = HugeTextAnalyzerFactory.DEFAULT_INSTANCE;
626+
if (logHugeText && LOGGER.isLoggable(Level.WARNING)) {
627+
String origFileTypeName = fac.getAnalyzer().getFileTypeName();
628+
LOGGER.log(Level.WARNING, "{0} is huge text: {1}",
629+
new Object[]{origFileTypeName, path});
630+
}
631+
}
632+
return fac;
633+
}
634+
635+
/**
636+
* Get an analyzer suited to analyze a file, with a check for Huge Text.
637+
*
638+
* @param file a defined instance to be analyzed
639+
* @param path Name (possibly normalized) of the file to be analyzed
640+
* @param logHugeText a value indicating whether to log if the file is
641+
* identified as Huge Text
642+
* @return An analyzer suited for the file content
643+
* @throws java.io.IOException If an error occurs while reading the file
644+
*/
645+
public static AbstractAnalyzer getAnalyzer(File file, String path, boolean logHugeText)
646+
throws IOException {
647+
return getAnalyzerFactory(file, path, logHugeText).getAnalyzer();
648+
}
649+
577650
/**
578651
* Free resources associated with all registered analyzers.
579652
*/
@@ -718,24 +791,36 @@ public static void writeDumpedXref(String contextPath,
718791
}
719792

720793
/**
721-
* Get the genre of a file.
794+
* Get the genre of a file, with a check for Huge Text.
722795
*
723796
* @param file The file to inspect
797+
* @param fileName name of the file to inspect
724798
* @return The genre suitable to decide how to display the file
725799
*/
726-
public static AbstractAnalyzer.Genre getGenre(String file) {
727-
return getGenre(find(file));
800+
public static AbstractAnalyzer.Genre getGenre(File file, String fileName) {
801+
try {
802+
return getGenre(getAnalyzerFactory(file, fileName, true));
803+
} catch (IOException e) {
804+
LOGGER.log(Level.WARNING, "Error reading {0}", fileName);
805+
return null;
806+
}
728807
}
729808

730809
/**
731-
* Get the genre of a bulk of data.
810+
* Get the genre of a bulk of data, but without a check for Huge Text since
811+
* the file size is not available.
732812
*
733813
* @param in A stream containing the data
814+
* @param fileName name of the file to inspect
734815
* @return The genre suitable to decide how to display the file
735-
* @throws java.io.IOException If an error occurs while getting the content
736816
*/
737-
public static AbstractAnalyzer.Genre getGenre(InputStream in) throws IOException {
738-
return getGenre(find(in));
817+
public static AbstractAnalyzer.Genre getGenre(InputStream in, String fileName) {
818+
try {
819+
return getGenre(getAnalyzerFactory(in, fileName));
820+
} catch (IOException e) {
821+
LOGGER.log(Level.WARNING, "Error reading {0}", fileName);
822+
return null;
823+
}
739824
}
740825

741826
/**
@@ -881,31 +966,36 @@ private static AnalyzerFactory findFactory(Class<?> factoryClass)
881966
*
882967
*
883968
* @param in The input stream containing the data
884-
* @param file The file name to get the analyzer for
969+
* @param fileName The file name to get the analyzer for
885970
* @return the analyzer factory to use
886971
* @throws java.io.IOException If a problem occurs while reading the data
887972
*/
888-
public static AnalyzerFactory find(InputStream in, String file)
889-
throws IOException {
890-
AnalyzerFactory factory = find(file);
973+
static AnalyzerFactory find(InputStream in, String fileName) throws IOException {
974+
AnalyzerFactory factory = find(fileName);
891975
// TODO above is not that great, since if 2 analyzers share one extension
892976
// then only the first one registered will own it
893977
// it would be cool if above could return more analyzers and below would
894978
// then decide between them ...
895979
if (factory != null) {
896980
return factory;
897981
}
898-
return findForStream(in, file);
982+
return findForStream(in, fileName);
899983
}
900984

901985
/**
902-
* Finds a suitable analyser class for file name.
986+
* Finds a suitable analyser class for {@code fileName}, which should only
987+
* be used in rare situations, such as for a JAR member or when content is
988+
* not available to support a full determination.
989+
* <p>To clarify, a full determination as done by
990+
* {@link #getAnalyzerFactory(File, String, boolean)} also reads a bit of
991+
* content as well as inspects file length to determine the ultimate
992+
* analyser.
903993
*
904-
* @param file The file name to get the analyzer for
994+
* @param fileName The file name to get the analyzer for
905995
* @return the analyzer factory to use
906996
*/
907-
public static AnalyzerFactory find(String file) {
908-
String path = file;
997+
public static AnalyzerFactory find(String fileName) {
998+
String path = fileName;
909999
int i;
9101000

9111001
// Get basename of the file first.
@@ -924,8 +1014,7 @@ public static AnalyzerFactory find(String file) {
9241014
if (factory != null) {
9251015
if (LOGGER.isLoggable(Level.FINEST)) {
9261016
LOGGER.log(Level.FINEST, "{0}: chosen by prefix: {1}",
927-
new Object[]{file,
928-
factory.getClass().getSimpleName() });
1017+
new Object[]{fileName, factory.getClass().getSimpleName()});
9291018
}
9301019
return factory;
9311020
}
@@ -938,8 +1027,7 @@ public static AnalyzerFactory find(String file) {
9381027
if (factory != null) {
9391028
if (LOGGER.isLoggable(Level.FINEST)) {
9401029
LOGGER.log(Level.FINEST, "{0}: chosen by suffix: {1}",
941-
new Object[]{file,
942-
factory.getClass().getSimpleName() });
1030+
new Object[]{fileName, factory.getClass().getSimpleName()});
9431031
}
9441032
return factory;
9451033
}
@@ -957,22 +1045,22 @@ public static AnalyzerFactory find(String file) {
9571045
* @throws java.io.IOException if an error occurs while reading data from
9581046
* the stream
9591047
*/
960-
public static AnalyzerFactory find(InputStream in) throws IOException {
961-
return findForStream(in, "<anonymous>");
1048+
static AnalyzerFactory find(InputStream in) throws IOException {
1049+
return findForStream(in, ANONYMOUS_NAME);
9621050
}
9631051

9641052
/**
9651053
* Finds a suitable analyzer class for the data in this stream
9661054
* corresponding to a file of the specified name.
9671055
*
9681056
* @param in The stream containing the data to analyze
969-
* @param file The file name to get the analyzer for
1057+
* @param fileName The file name to get the analyzer for
9701058
* @return the analyzer factory to use
9711059
* @throws java.io.IOException if an error occurs while reading data from
9721060
* the stream
9731061
*/
974-
private static AnalyzerFactory findForStream(InputStream in,
975-
String file) throws IOException {
1062+
private static AnalyzerFactory findForStream(InputStream in, String fileName)
1063+
throws IOException {
9761064

9771065
in.mark(MAGIC_BYTES_NUM);
9781066
byte[] content = new byte[MAGIC_BYTES_NUM];
@@ -998,8 +1086,8 @@ private static AnalyzerFactory findForStream(InputStream in,
9981086
if (fac != null) {
9991087
if (LOGGER.isLoggable(Level.FINEST)) {
10001088
LOGGER.log(Level.FINEST,
1001-
"{0}: chosen by precise magic: {1}", new Object[]{
1002-
file, fac.getClass().getSimpleName() });
1089+
"{0}: chosen by precise magic: {1}",
1090+
new Object[]{fileName, fac.getClass().getSimpleName()});
10031091
}
10041092
return fac;
10051093
}
@@ -1008,7 +1096,7 @@ private static AnalyzerFactory findForStream(InputStream in,
10081096

10091097
// Next, look for magic strings
10101098
String opening = readOpening(in, content);
1011-
fac = findMagicString(opening, file);
1099+
fac = findMagicString(opening, fileName);
10121100
if (fac != null) {
10131101
return fac;
10141102
}
@@ -1020,9 +1108,8 @@ private static AnalyzerFactory findForStream(InputStream in,
10201108
if (fac != null) {
10211109
if (LOGGER.isLoggable(Level.FINEST)) {
10221110
LOGGER.log(Level.FINEST,
1023-
"{0}: chosen by imprecise magic: {1}",
1024-
new Object[]{file,
1025-
fac.getClass().getSimpleName() });
1111+
"{0}: chosen by imprecise magic: {1}",
1112+
new Object[]{fileName, fac.getClass().getSimpleName()});
10261113
}
10271114
return fac;
10281115
}
@@ -1032,16 +1119,15 @@ private static AnalyzerFactory findForStream(InputStream in,
10321119
return null;
10331120
}
10341121

1035-
private static AnalyzerFactory findMagicString(String opening, String file) {
1122+
private static AnalyzerFactory findMagicString(String opening, String fileName) {
10361123

10371124
// first, try to look up two words in magics
10381125
String fragment = getWords(opening, 2);
10391126
AnalyzerFactory fac = magics.get(fragment);
10401127
if (fac != null) {
10411128
if (LOGGER.isLoggable(Level.FINEST)) {
10421129
LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}",
1043-
new Object[]{file, fac.getClass().getSimpleName(),
1044-
fragment});
1130+
new Object[]{fileName, fac.getClass().getSimpleName(), fragment});
10451131
}
10461132
return fac;
10471133
}
@@ -1052,8 +1138,7 @@ private static AnalyzerFactory findMagicString(String opening, String file) {
10521138
if (fac != null) {
10531139
if (LOGGER.isLoggable(Level.FINEST)) {
10541140
LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}",
1055-
new Object[]{file, fac.getClass().getSimpleName(),
1056-
fragment});
1141+
new Object[]{fileName, fac.getClass().getSimpleName(), fragment});
10571142
}
10581143
return fac;
10591144
}
@@ -1066,8 +1151,8 @@ private static AnalyzerFactory findMagicString(String opening, String file) {
10661151
fac = entry.getValue();
10671152
if (LOGGER.isLoggable(Level.FINEST)) {
10681153
LOGGER.log(Level.FINEST,
1069-
"{0}: chosen by magic(substr) {2}: {1}", new Object[]{
1070-
file, fac.getClass().getSimpleName(), magic});
1154+
"{0}: chosen by magic(substr) {2}: {1}",
1155+
new Object[]{fileName, fac.getClass().getSimpleName(), magic});
10711156
}
10721157
return fac;
10731158
}

0 commit comments

Comments
 (0)