actually check duplicate live documents (#4219)

vladak · web-flow · commit f10696b0af4c · 2023-03-14T10:54:19.000+01:00
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheck.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheck.java
@@ -27,11 +27,11 @@
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
@@ -41,19 +41,16 @@
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.MultiBits;
-import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.SegmentInfos;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.LockFactory;
 import org.apache.lucene.store.NativeFSLockFactory;
 import org.apache.lucene.store.NoLockFactory;
 import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;
 import org.jetbrains.annotations.NotNull;
+import org.jetbrains.annotations.Nullable;
 import org.opengrok.indexer.configuration.Configuration;
 import org.opengrok.indexer.logger.LoggerFactory;
 import org.opengrok.indexer.search.QueryBuilder;
@@ -109,6 +106,12 @@ public static class IndexDocumentException extends Exception {
 
         private final Map<String, Integer> fileMap;
 
+        public IndexDocumentException(String s) {
+            super(s);
+
+            this.fileMap = null;
+        }
+
         public IndexDocumentException(String s, Map<String, Integer> fileMap) {
             super(s);
 
@@ -117,7 +120,7 @@ public IndexDocumentException(String s, Map<String, Integer> fileMap) {
 
         @Override
         public String toString() {
-            return getMessage() + ": " + fileMap;
+            return getMessage() + ": " + (fileMap == null ? "" : fileMap);
         }
     }
 
@@ -214,7 +217,7 @@ public static void checkDir(Path indexPath, IndexCheckMode mode, String projectN
         }
 
         if (mode.ordinal() >= IndexCheckMode.DOCUMENTS.ordinal()) {
-            checkDuplicateDocuments(indexPath, projectName);
+            checkDuplicateDocuments(indexPath);
         }
     }
 
@@ -257,67 +260,71 @@ public static Set<String> getDeletedUids(Path indexPath) throws IOException {
 
     /**
      * @param indexPath path to index
-     * @param projectName project name, can be empty
-     * @return list of live document paths
+     * @return list of live document paths (some of them can be duplicate if the index is corrupted);
+     * all documents are considered live if the index has no deletions.
      * @throws IOException on I/O error
      */
-    public static List<String> getLiveDocumentPaths(Path indexPath, String projectName) throws IOException {
+    @Nullable
+    public static List<String> getLiveDocumentPaths(Path indexPath) throws IOException {
         try (IndexReader indexReader = getIndexReader(indexPath)) {
-            Terms terms = MultiTerms.getTerms(indexReader, QueryBuilder.U);
-            TermsEnum uidIter = terms.iterator();
-            String dir = "/" + projectName;
-            String startUid = Util.path2uid(dir, "");
-            uidIter.seekCeil(new BytesRef(startUid));
-            final BytesRef emptyBR = new BytesRef("");
-            // paths of live (i.e. not deleted) documents. Must be a list so that duplicate documents can be checked.
             List<String> livePaths = new ArrayList<>();
-            Set<String> deletedUids = getDeletedUids(indexPath);
 
-            while (uidIter != null && uidIter.term() != null && uidIter.term().compareTo(emptyBR) != 0) {
-                String termValue = uidIter.term().utf8ToString();
-                String termPath = Util.uid2url(termValue);
+            // MultiBits.getLiveDocs() returns null when the index has no deletions,
+            // in which case every document is live and none may be skipped.
+            Bits liveDocs = MultiBits.getLiveDocs(indexReader);
+            int maxDoc = indexReader.maxDoc();
 
-                if (deletedUids.contains(termValue)) {
-                    BytesRef next = uidIter.next();
-                    if (next == null) {
-                        uidIter = null;
-                    }
+            for (int i = 0; i < maxDoc; i++) {
+                // Check liveness first so that stored fields of deleted documents
+                // are never loaded.
+                if (liveDocs != null && !liveDocs.get(i)) {
                     continue;
                 }
 
-                livePaths.add(termPath);
-
-                BytesRef next = uidIter.next();
-                if (next == null) {
-                    uidIter = null;
+                Document doc = indexReader.storedFields().document(i);
+                // This should avoid the special LOC documents.
+                IndexableField field = doc.getField(QueryBuilder.U);
+                if (field != null) {
+                    livePaths.add(Util.uid2url(field.stringValue()));
                 }
             }
 
             return livePaths;
         }
     }
 
-    private static void checkDuplicateDocuments(Path indexPath, String projectName)
+    private static void checkDuplicateDocuments(Path indexPath)
             throws IOException, IndexDocumentException {
 
         LOGGER.log(Level.FINE, "Checking duplicate documents in ''{0}''", indexPath);
         Statistics stat = new Statistics();
-        List<String> livePaths = getLiveDocumentPaths(indexPath, projectName);
+        List<String> livePaths = getLiveDocumentPaths(indexPath);
+        if (livePaths == null) {
+            throw new IndexDocumentException(String.format("cannot determine live paths for '%s'", indexPath));
+        }
         HashSet<String> pathSet = new HashSet<>(livePaths);
-        HashMap<String, Integer> fileMap = new HashMap<>();
+        Map<String, Integer> fileMap = new ConcurrentHashMap<>();
         if (pathSet.size() != livePaths.size()) {
             LOGGER.log(Level.FINE,
                     "index in ''{0}'' has document path set ({1}) vs document list ({2}) discrepancy",
                     new Object[]{indexPath, pathSet.size(), livePaths.size()});
             for (String path : livePaths) {
                 if (pathSet.contains(path)) {
-                    LOGGER.log(Level.FINER, "duplicate path: ''{0}''", path);
                     fileMap.putIfAbsent(path, 0);
                     fileMap.put(path, fileMap.get(path) + 1);
                 }
             }
+        }
 
+        // Traverse the file map and leave only duplicate entries.
+        for (String path: fileMap.keySet()) {
+            if (fileMap.get(path) > 1) {
+                LOGGER.log(Level.FINER, "duplicate path: ''{0}''", path);
+            } else {
+                fileMap.remove(path);
+            }
         }
+
         stat.report(LOGGER, Level.FINE, String.format("duplicate check in '%s' done", indexPath));
         if (!fileMap.isEmpty()) {
             throw new IndexDocumentException(String.format("index in '%s' contains duplicate live documents",
diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerVsDeletedDocumentsTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerVsDeletedDocumentsTest.java
@@ -389,7 +389,7 @@ void testIndexTraversalWithDeletedDocuments(boolean projectsEnabled, boolean use
      * similar to what is done in {@link IndexDatabase#update()}.
      */
     private void checkLiveDocs(String projectName) throws IOException {
-        List<String> livePaths = getLiveDocumentPaths(getIndexPath(projectName), projectName);
+        List<String> livePaths = getLiveDocumentPaths(getIndexPath(projectName));
 
         assertTrue(livePaths.size() > 0);
         assertEquals(new HashSet<>(livePaths).size(), livePaths.size());