Use a TYPE-specific analyzer if re-ANALYSIS is done

idodeclare · idodeclare · commit f812a1033ed3 · 2018-04-07T13:03:32.000-05:00
diff --git a/src/org/opensolaris/opengrok/analysis/AnalyzerGuru.java b/src/org/opensolaris/opengrok/analysis/AnalyzerGuru.java
@@ -213,6 +213,13 @@ public class AnalyzerGuru {
 
     private static final Map<String, String> fileTypeDescriptions = new TreeMap<>();
 
+    /**
+     * Maps from a {@link FileAnalyzer#getFileTypeName()} value to the
+     * {@link FileAnalyzerFactory} registered for it.
+     */
+    private static final Map<String, FileAnalyzerFactory> FILETYPE_FACTORIES =
+            new HashMap<>();
+
     /*
      * If you write your own analyzer please register it here. The order is
      * important for any factory that uses a FileAnalyzerFactory.Matcher
@@ -338,6 +345,9 @@ private static void registerAnalyzer(FileAnalyzerFactory factory) {
         }
         matchers.addAll(factory.getMatchers());
         factories.add(factory);
+
+        FileAnalyzer fa = factory.getAnalyzer();
+        FILETYPE_FACTORIES.put(fa.getFileTypeName(), factory);
     }
 
     /**
@@ -385,6 +395,17 @@ public static FileAnalyzer getAnalyzer() {
         return DEFAULT_ANALYZER_FACTORY.getAnalyzer();
     }
 
+    /**
+     * Gets an analyzer from the factory registered under {@code fileTypeName},
+     * i.e. under a known {@link FileAnalyzer#getFileTypeName()} value.
+     * @param fileTypeName the file type name to look up
+     * @return the factory's analyzer or {@code null} if the name is unknown
+     */
+    public static FileAnalyzer getAnalyzer(String fileTypeName) {
+        FileAnalyzerFactory factory = FILETYPE_FACTORIES.get(fileTypeName);
+        return factory == null ? null : factory.getAnalyzer();
+    }
+
     /**
      * Get an analyzer suited to analyze a file. This function will reuse
      * analyzers since they are costly.
diff --git a/src/org/opensolaris/opengrok/search/context/OGKUnifiedHighlighter.java b/src/org/opensolaris/opengrok/search/context/OGKUnifiedHighlighter.java
@@ -41,6 +41,7 @@
 import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.opensolaris.opengrok.analysis.AnalyzerGuru;
 import org.opensolaris.opengrok.analysis.ExpandTabsReader;
 import org.opensolaris.opengrok.analysis.StreamSource;
 import org.opensolaris.opengrok.configuration.RuntimeEnvironment;
@@ -62,6 +63,8 @@ public class OGKUnifiedHighlighter extends UnifiedHighlighter {
 
     private int tabSize;
 
+    private String fileTypeName;
+
     /**
      * Initializes an instance with
      * {@link UnifiedHighlighter#UnifiedHighlighter(org.apache.lucene.search.IndexSearcher, org.apache.lucene.analysis.Analyzer)}
@@ -82,6 +85,22 @@ public OGKUnifiedHighlighter(RuntimeEnvironment env,
         this.env = env;
     }
 
+    /**
+     * Gets a file type name-specific analyzer during the execution of
+     * {@link #highlightFieldsUnion(java.lang.String[], org.apache.lucene.search.Query, int, int)},
+     * or else {@code indexAnalyzer} if no type is set or the type is unknown.
+     * @return the type-specific analyzer if one is known or else {@code indexAnalyzer}
+     */
+    @Override
+    public Analyzer getIndexAnalyzer() {
+        String ftname = fileTypeName; // read once: same value for check and lookup
+        if (ftname == null) {
+            return indexAnalyzer;
+        }
+        Analyzer fa = AnalyzerGuru.getAnalyzer(ftname);
+        return fa == null ? indexAnalyzer : fa;
+    }
+
     public int getTabSize() {
         return tabSize;
     }
@@ -90,6 +109,34 @@ public void setTabSize(int value) {
         this.tabSize = value;
     }
 
+    /**
+     * Transiently arranges that {@link #getIndexAnalyzer()} returns a file type
+     * name-specific analyzer for the duration of the delegated call to
+     * {@link #highlightFieldsUnionWork(java.lang.String[], org.apache.lucene.search.Query, int, int)}.
+     * @param fields a defined instance
+     * @param query a defined instance
+     * @param docId a valid document ID
+     * @param lineLimit the maximum number of lines to return
+     * @return a defined instance or else {@code null} if there are no results
+     * @throws IOException if accessing the Lucene document fails
+     */
+    public String highlightFieldsUnion(String[] fields, Query query,
+            int docId, int lineLimit) throws IOException {
+        /*
+         * Setting fileTypeName has to happen before getFieldHighlighter() is
+         * called by highlightFieldsAsObjects() so that the result of
+         * getIndexAnalyzer() (if it is called due to requiring ANALYSIS) can be
+         * influenced by fileTypeName.
+         */
+        Document doc = searcher.doc(docId);
+        fileTypeName = doc == null ? null : doc.get(QueryBuilder.TYPE);
+        try {
+            return highlightFieldsUnionWork(fields, query, docId, lineLimit);
+        } finally {
+            fileTypeName = null; // always reset, even if highlighting throws
+        }
+    }
+
     /**
      * Calls
      * {@link #highlightFieldsAsObjects(java.lang.String[], org.apache.lucene.search.Query, int[], int[])},
@@ -99,10 +146,10 @@ public void setTabSize(int value) {
      * @param query a defined instance
      * @param docId a valid document ID
      * @param lineLimit the maximum number of lines to return
-     * @return {@code null} if there are no results or a defined instance
-     * @throws IOException
+     * @return a defined instance or else {@code null} if there are no results
+     * @throws IOException if accessing the Lucene document fails
      */
-    public String highlightFieldsUnion(String[] fields, Query query,
+    protected String highlightFieldsUnionWork(String[] fields, Query query,
             int docId, int lineLimit) throws IOException {
         int[] maxPassagesCopy = new int[fields.length];
         /**
@@ -154,9 +201,6 @@ public String highlightFieldsUnion(String[] fields, Query query,
      * {@code cacheCharsThreshold} is exceeded. Specifically if that number is
      * 0, then only one document is fetched no matter what. Values in the array
      * of {@link CharSequence} will be {@code null} if no value was found."
-     * @param fields
-     * @param docIter
-     * @param cacheCharsThreshold
      * @return a defined instance
      * @throws IOException if an I/O error occurs
      */
@@ -214,13 +258,11 @@ protected OffsetSource getOptimizedOffsetSource(String field,
              * postings should be sufficient in the comment for
              * shouldHandleMultiTermQuery(String): "MTQ highlighting can be
              * expensive, particularly when using offsets in postings."
-             *     DEFS are stored with term vectors to avoid this problem.
-             *     FULL should be approximately fine with re-analysis using an
-             * on-the-fly PlainAnalyzer.
-             *     REFS should be approximately fine with re-analysis using an
-             * on-the-fly PlainAnalyzer. It might not accord with the true
-             * language symbol tokenizer, but it should not be wildly
-             * divergent.
+             *     DEFS are stored with term vectors to avoid this problem,
+             * since re-analysis would not at all accord with ctags Definitions.
+             *     For FULL and REFS, highlightFieldsUnion() arranges that
+             * getIndexAnalyzer() can return a TYPE-specific analyzer for use by
+             * getOffsetStrategy() -- if re-ANALYSIS is required.
              */
             switch (field) {
                 case QueryBuilder.FULL: