23
23
*/
24
24
package org .opengrok .indexer .analysis ;
25
25
26
+ import java .io .BufferedInputStream ;
26
27
import java .io .BufferedReader ;
27
28
import java .io .File ;
29
+ import java .io .FileInputStream ;
28
30
import java .io .FileWriter ;
29
31
import java .io .IOException ;
30
32
import java .io .InputStream ;
47
49
import java .util .TreeSet ;
48
50
import java .util .logging .Level ;
49
51
import java .util .logging .Logger ;
52
+
50
53
import org .apache .lucene .document .DateTools ;
51
54
import org .apache .lucene .document .Document ;
52
55
import org .apache .lucene .document .Field ;
130
133
*/
131
134
public class AnalyzerGuru {
132
135
136
+ /**
137
+ * A value used as a placeholder for a filename when content is anonymous
138
+ * (e.g. from temporary source or from a stream for which an identifier is
139
+ * not available).
140
+ */
141
+ public static final String ANONYMOUS_NAME = "<anonymous>" ;
142
+
133
143
/**
134
144
* The maximum number of characters (multi-byte if a BOM is identified) to
135
145
* read from the input stream to be used for magic string matching.
@@ -551,29 +561,92 @@ public static AbstractAnalyzer getAnalyzer(String fileTypeName) {
551
561
}
552
562
553
563
/**
554
- * Get an analyzer suited to analyze a file. This function will reuse
555
- * analyzers since they are costly .
564
+ * Gets an analyzer factory suited to analyze a file, but without a check
565
+ * for Huge Text since the file size is not available .
556
566
*
557
567
* @param in Input stream containing data to be analyzed
558
- * @param file Name of the file to be analyzed
559
- * @return An analyzer suited for that file content
568
+ * @param fileName Name of the file to be analyzed
569
+ * @return An analyzer factory suited for that file content
560
570
* @throws java.io.IOException If an error occurs while accessing the data
561
571
* in the input stream.
562
572
*/
563
- public static AbstractAnalyzer getAnalyzer (InputStream in , String file ) throws IOException {
564
- AnalyzerFactory factory = find (in , file );
573
+ public static AnalyzerFactory getAnalyzerFactory (InputStream in , String fileName )
574
+ throws IOException {
575
+ AnalyzerFactory factory = find (in , fileName );
565
576
if (factory == null ) {
566
- AbstractAnalyzer defaultAnalyzer = getAnalyzer () ;
577
+ factory = DEFAULT_ANALYZER_FACTORY ;
567
578
if (LOGGER .isLoggable (Level .FINEST )) {
579
+ AbstractAnalyzer defaultAnalyzer = factory .getAnalyzer ();
568
580
LOGGER .log (Level .FINEST , "{0}: fallback {1}" ,
569
- new Object []{file ,
570
- defaultAnalyzer .getClass ().getSimpleName () });
581
+ new Object []{fileName , defaultAnalyzer .getClass ().getSimpleName ()});
571
582
}
572
- return defaultAnalyzer ;
573
583
}
584
+ return factory ;
585
+ }
586
+
587
+ /**
588
+ * Gets an analyzer suited to analyze a file, but without a check for Huge
589
+ * Text since the file size is not available.
590
+ *
591
+ * @param in Input stream containing data to be analyzed
592
+ * @param fileName Name of the file to be analyzed
593
+ * @return An analyzer factory suited for the file content
594
+ * @throws java.io.IOException If an error occurs while accessing the data
595
+ * in the input stream.
596
+ */
597
+ public static AbstractAnalyzer getAnalyzer (InputStream in , String fileName )
598
+ throws IOException {
599
+ AnalyzerFactory factory = getAnalyzerFactory (in , fileName );
574
600
return factory .getAnalyzer ();
575
601
}
576
602
603
+ /**
604
+ * Gets an analyzer factory suited to analyze a file, with a check for Huge
605
+ * Text.
606
+ *
607
+ * @param file a defined instance to be analyzed
608
+ * @param path Name (possibly normalized) of the file to be analyzed
609
+ * @param logHugeText a value indicating whether to log if the file is
610
+ * identified as Huge Text
611
+ * @return An analyzer factory suited for the file content
612
+ * @throws java.io.IOException If an error occurs while reading the file
613
+ */
614
+ public static AnalyzerFactory getAnalyzerFactory (File file , String path , boolean logHugeText )
615
+ throws IOException {
616
+
617
+ AnalyzerFactory fac ;
618
+ try (InputStream in = new BufferedInputStream (
619
+ new FileInputStream (file ))) {
620
+ fac = AnalyzerGuru .getAnalyzerFactory (in , path );
621
+ }
622
+
623
+ if (AbstractAnalyzer .Genre .PLAIN .equals (fac .getGenre ()) &&
624
+ file .length () >= RuntimeEnvironment .getInstance ().getHugeTextThresholdBytes ()) {
625
+ fac = HugeTextAnalyzerFactory .DEFAULT_INSTANCE ;
626
+ if (logHugeText && LOGGER .isLoggable (Level .WARNING )) {
627
+ String origFileTypeName = fac .getAnalyzer ().getFileTypeName ();
628
+ LOGGER .log (Level .WARNING , "{0} is huge text: {1}" ,
629
+ new Object []{origFileTypeName , path });
630
+ }
631
+ }
632
+ return fac ;
633
+ }
634
+
635
+ /**
636
+ * Get an analyzer suited to analyze a file, with a check for Huge Text.
637
+ *
638
+ * @param file a defined instance to be analyzed
639
+ * @param path Name (possibly normalized) of the file to be analyzed
640
+ * @param logHugeText a value indicating whether to log if the file is
641
+ * identified as Huge Text
642
+ * @return An analyzer suited for the file content
643
+ * @throws java.io.IOException If an error occurs while reading the file
644
+ */
645
+ public static AbstractAnalyzer getAnalyzer (File file , String path , boolean logHugeText )
646
+ throws IOException {
647
+ return getAnalyzerFactory (file , path , logHugeText ).getAnalyzer ();
648
+ }
649
+
577
650
/**
578
651
* Free resources associated with all registered analyzers.
579
652
*/
@@ -718,24 +791,36 @@ public static void writeDumpedXref(String contextPath,
718
791
}
719
792
720
793
/**
721
- * Get the genre of a file.
794
+ * Get the genre of a file, with a check for Huge Text .
722
795
*
723
796
* @param file The file to inspect
797
+ * @param fileName name of the file to inspect
724
798
* @return The genre suitable to decide how to display the file
725
799
*/
726
- public static AbstractAnalyzer .Genre getGenre (String file ) {
727
- return getGenre (find (file ));
800
+ public static AbstractAnalyzer .Genre getGenre (File file , String fileName ) {
801
+ try {
802
+ return getGenre (getAnalyzerFactory (file , fileName , true ));
803
+ } catch (IOException e ) {
804
+ LOGGER .log (Level .WARNING , "Error reading {0}" , fileName );
805
+ return null ;
806
+ }
728
807
}
729
808
730
809
/**
731
- * Get the genre of a bulk of data.
810
+ * Get the genre of a bulk of data, but without a check for Huge Text since
811
+ * the file size is not available.
732
812
*
733
813
* @param in A stream containing the data
814
+ * @param fileName name of the file to inspect
734
815
* @return The genre suitable to decide how to display the file
735
- * @throws java.io.IOException If an error occurs while getting the content
736
816
*/
737
- public static AbstractAnalyzer .Genre getGenre (InputStream in ) throws IOException {
738
- return getGenre (find (in ));
817
+ public static AbstractAnalyzer .Genre getGenre (InputStream in , String fileName ) {
818
+ try {
819
+ return getGenre (getAnalyzerFactory (in , fileName ));
820
+ } catch (IOException e ) {
821
+ LOGGER .log (Level .WARNING , "Error reading {0}" , fileName );
822
+ return null ;
823
+ }
739
824
}
740
825
741
826
/**
@@ -881,31 +966,36 @@ private static AnalyzerFactory findFactory(Class<?> factoryClass)
881
966
*
882
967
*
883
968
* @param in The input stream containing the data
884
- * @param file The file name to get the analyzer for
969
+ * @param fileName The file name to get the analyzer for
885
970
* @return the analyzer factory to use
886
971
* @throws java.io.IOException If a problem occurs while reading the data
887
972
*/
888
- public static AnalyzerFactory find (InputStream in , String file )
889
- throws IOException {
890
- AnalyzerFactory factory = find (file );
973
+ static AnalyzerFactory find (InputStream in , String fileName ) throws IOException {
974
+ AnalyzerFactory factory = find (fileName );
891
975
// TODO above is not that great, since if 2 analyzers share one extension
892
976
// then only the first one registered will own it
893
977
// it would be cool if above could return more analyzers and below would
894
978
// then decide between them ...
895
979
if (factory != null ) {
896
980
return factory ;
897
981
}
898
- return findForStream (in , file );
982
+ return findForStream (in , fileName );
899
983
}
900
984
901
985
/**
902
- * Finds a suitable analyser class for file name.
986
+ * Finds a suitable analyser class for {@code fileName}, which should only
987
+ * be used in rare situations, such as for a JAR member or when content is
988
+ * not available to support a full determination.
989
+ * <p>To clarify, a full determination as done by
990
+ * {@link #getAnalyzerFactory(File, String, boolean)} also reads a bit of
991
+ * content as well as inspects file length to determine the ultimate
992
+ * analyser.
903
993
*
904
- * @param file The file name to get the analyzer for
994
+ * @param fileName The file name to get the analyzer for
905
995
* @return the analyzer factory to use
906
996
*/
907
- public static AnalyzerFactory find (String file ) {
908
- String path = file ;
997
+ public static AnalyzerFactory find (String fileName ) {
998
+ String path = fileName ;
909
999
int i ;
910
1000
911
1001
// Get basename of the file first.
@@ -924,8 +1014,7 @@ public static AnalyzerFactory find(String file) {
924
1014
if (factory != null ) {
925
1015
if (LOGGER .isLoggable (Level .FINEST )) {
926
1016
LOGGER .log (Level .FINEST , "{0}: chosen by prefix: {1}" ,
927
- new Object []{file ,
928
- factory .getClass ().getSimpleName () });
1017
+ new Object []{fileName , factory .getClass ().getSimpleName ()});
929
1018
}
930
1019
return factory ;
931
1020
}
@@ -938,8 +1027,7 @@ public static AnalyzerFactory find(String file) {
938
1027
if (factory != null ) {
939
1028
if (LOGGER .isLoggable (Level .FINEST )) {
940
1029
LOGGER .log (Level .FINEST , "{0}: chosen by suffix: {1}" ,
941
- new Object []{file ,
942
- factory .getClass ().getSimpleName () });
1030
+ new Object []{fileName , factory .getClass ().getSimpleName ()});
943
1031
}
944
1032
return factory ;
945
1033
}
@@ -957,22 +1045,22 @@ public static AnalyzerFactory find(String file) {
957
1045
* @throws java.io.IOException if an error occurs while reading data from
958
1046
* the stream
959
1047
*/
960
- public static AnalyzerFactory find (InputStream in ) throws IOException {
961
- return findForStream (in , "<anonymous>" );
1048
+ static AnalyzerFactory find (InputStream in ) throws IOException {
1049
+ return findForStream (in , ANONYMOUS_NAME );
962
1050
}
963
1051
964
1052
/**
965
1053
* Finds a suitable analyzer class for the data in this stream
966
1054
* corresponding to a file of the specified name.
967
1055
*
968
1056
* @param in The stream containing the data to analyze
969
- * @param file The file name to get the analyzer for
1057
+ * @param fileName The file name to get the analyzer for
970
1058
* @return the analyzer factory to use
971
1059
* @throws java.io.IOException if an error occurs while reading data from
972
1060
* the stream
973
1061
*/
974
- private static AnalyzerFactory findForStream (InputStream in ,
975
- String file ) throws IOException {
1062
+ private static AnalyzerFactory findForStream (InputStream in , String fileName )
1063
+ throws IOException {
976
1064
977
1065
in .mark (MAGIC_BYTES_NUM );
978
1066
byte [] content = new byte [MAGIC_BYTES_NUM ];
@@ -998,8 +1086,8 @@ private static AnalyzerFactory findForStream(InputStream in,
998
1086
if (fac != null ) {
999
1087
if (LOGGER .isLoggable (Level .FINEST )) {
1000
1088
LOGGER .log (Level .FINEST ,
1001
- "{0}: chosen by precise magic: {1}" , new Object []{
1002
- file , fac .getClass ().getSimpleName () });
1089
+ "{0}: chosen by precise magic: {1}" ,
1090
+ new Object []{ fileName , fac .getClass ().getSimpleName ()});
1003
1091
}
1004
1092
return fac ;
1005
1093
}
@@ -1008,7 +1096,7 @@ private static AnalyzerFactory findForStream(InputStream in,
1008
1096
1009
1097
// Next, look for magic strings
1010
1098
String opening = readOpening (in , content );
1011
- fac = findMagicString (opening , file );
1099
+ fac = findMagicString (opening , fileName );
1012
1100
if (fac != null ) {
1013
1101
return fac ;
1014
1102
}
@@ -1020,9 +1108,8 @@ private static AnalyzerFactory findForStream(InputStream in,
1020
1108
if (fac != null ) {
1021
1109
if (LOGGER .isLoggable (Level .FINEST )) {
1022
1110
LOGGER .log (Level .FINEST ,
1023
- "{0}: chosen by imprecise magic: {1}" ,
1024
- new Object []{file ,
1025
- fac .getClass ().getSimpleName () });
1111
+ "{0}: chosen by imprecise magic: {1}" ,
1112
+ new Object []{fileName , fac .getClass ().getSimpleName ()});
1026
1113
}
1027
1114
return fac ;
1028
1115
}
@@ -1032,16 +1119,15 @@ private static AnalyzerFactory findForStream(InputStream in,
1032
1119
return null ;
1033
1120
}
1034
1121
1035
- private static AnalyzerFactory findMagicString (String opening , String file ) {
1122
+ private static AnalyzerFactory findMagicString (String opening , String fileName ) {
1036
1123
1037
1124
// first, try to look up two words in magics
1038
1125
String fragment = getWords (opening , 2 );
1039
1126
AnalyzerFactory fac = magics .get (fragment );
1040
1127
if (fac != null ) {
1041
1128
if (LOGGER .isLoggable (Level .FINEST )) {
1042
1129
LOGGER .log (Level .FINEST , "{0}: chosen by magic {2}: {1}" ,
1043
- new Object []{file , fac .getClass ().getSimpleName (),
1044
- fragment });
1130
+ new Object []{fileName , fac .getClass ().getSimpleName (), fragment });
1045
1131
}
1046
1132
return fac ;
1047
1133
}
@@ -1052,8 +1138,7 @@ private static AnalyzerFactory findMagicString(String opening, String file) {
1052
1138
if (fac != null ) {
1053
1139
if (LOGGER .isLoggable (Level .FINEST )) {
1054
1140
LOGGER .log (Level .FINEST , "{0}: chosen by magic {2}: {1}" ,
1055
- new Object []{file , fac .getClass ().getSimpleName (),
1056
- fragment });
1141
+ new Object []{fileName , fac .getClass ().getSimpleName (), fragment });
1057
1142
}
1058
1143
return fac ;
1059
1144
}
@@ -1066,8 +1151,8 @@ private static AnalyzerFactory findMagicString(String opening, String file) {
1066
1151
fac = entry .getValue ();
1067
1152
if (LOGGER .isLoggable (Level .FINEST )) {
1068
1153
LOGGER .log (Level .FINEST ,
1069
- "{0}: chosen by magic(substr) {2}: {1}" , new Object []{
1070
- file , fac .getClass ().getSimpleName (), magic });
1154
+ "{0}: chosen by magic(substr) {2}: {1}" ,
1155
+ new Object []{ fileName , fac .getClass ().getSimpleName (), magic });
1071
1156
}
1072
1157
return fac ;
1073
1158
}
0 commit comments