Skip to content

Commit f10696b

Browse files
authored
actually check duplicate live documents (#4219)
1 parent 87e7c81 commit f10696b

File tree

2 files changed

+43
-36
lines changed

2 files changed

+43
-36
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheck.java

Lines changed: 42 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@
2727
import java.nio.file.Path;
2828
import java.util.ArrayList;
2929
import java.util.Collection;
30-
import java.util.HashMap;
3130
import java.util.HashSet;
3231
import java.util.List;
3332
import java.util.Map;
3433
import java.util.Set;
34+
import java.util.concurrent.ConcurrentHashMap;
3535
import java.util.logging.Level;
3636
import java.util.logging.Logger;
3737

@@ -41,19 +41,16 @@
4141
import org.apache.lucene.index.IndexReader;
4242
import org.apache.lucene.index.IndexableField;
4343
import org.apache.lucene.index.MultiBits;
44-
import org.apache.lucene.index.MultiTerms;
4544
import org.apache.lucene.index.SegmentInfos;
46-
import org.apache.lucene.index.Terms;
47-
import org.apache.lucene.index.TermsEnum;
4845
import org.apache.lucene.store.Directory;
4946
import org.apache.lucene.store.FSDirectory;
5047
import org.apache.lucene.store.LockFactory;
5148
import org.apache.lucene.store.NativeFSLockFactory;
5249
import org.apache.lucene.store.NoLockFactory;
5350
import org.apache.lucene.util.Bits;
54-
import org.apache.lucene.util.BytesRef;
5551
import org.apache.lucene.util.Version;
5652
import org.jetbrains.annotations.NotNull;
53+
import org.jetbrains.annotations.Nullable;
5754
import org.opengrok.indexer.configuration.Configuration;
5855
import org.opengrok.indexer.logger.LoggerFactory;
5956
import org.opengrok.indexer.search.QueryBuilder;
@@ -109,6 +106,12 @@ public static class IndexDocumentException extends Exception {
109106

110107
private final Map<String, Integer> fileMap;
111108

109+
public IndexDocumentException(String s) {
110+
super(s);
111+
112+
this.fileMap = null;
113+
}
114+
112115
public IndexDocumentException(String s, Map<String, Integer> fileMap) {
113116
super(s);
114117

@@ -117,7 +120,7 @@ public IndexDocumentException(String s, Map<String, Integer> fileMap) {
117120

118121
@Override
119122
public String toString() {
120-
return getMessage() + ": " + fileMap;
123+
return getMessage() + ": " + (fileMap == null ? "" : fileMap);
121124
}
122125
}
123126

@@ -214,7 +217,7 @@ public static void checkDir(Path indexPath, IndexCheckMode mode, String projectN
214217
}
215218

216219
if (mode.ordinal() >= IndexCheckMode.DOCUMENTS.ordinal()) {
217-
checkDuplicateDocuments(indexPath, projectName);
220+
checkDuplicateDocuments(indexPath);
218221
}
219222
}
220223

@@ -257,67 +260,71 @@ public static Set<String> getDeletedUids(Path indexPath) throws IOException {
257260

258261
/**
259262
* @param indexPath path to index
260-
* @param projectName project name, can be empty
261-
* @return list of live document paths
263+
* @return list of live document paths (some of them can be duplicates if the index is corrupted)
264+
* or {@code null} if the index has no deletions, because no live documents bitmap is available then.
262265
* @throws IOException on I/O error
263266
*/
264-
public static List<String> getLiveDocumentPaths(Path indexPath, String projectName) throws IOException {
267+
@Nullable
268+
public static List<String> getLiveDocumentPaths(Path indexPath) throws IOException {
265269
try (IndexReader indexReader = getIndexReader(indexPath)) {
266-
Terms terms = MultiTerms.getTerms(indexReader, QueryBuilder.U);
267-
TermsEnum uidIter = terms.iterator();
268-
String dir = "/" + projectName;
269-
String startUid = Util.path2uid(dir, "");
270-
uidIter.seekCeil(new BytesRef(startUid));
271-
final BytesRef emptyBR = new BytesRef("");
272-
// paths of live (i.e. not deleted) documents. Must be a list so that duplicate documents can be checked.
273270
List<String> livePaths = new ArrayList<>();
274-
Set<String> deletedUids = getDeletedUids(indexPath);
275271

276-
while (uidIter != null && uidIter.term() != null && uidIter.term().compareTo(emptyBR) != 0) {
277-
String termValue = uidIter.term().utf8ToString();
278-
String termPath = Util.uid2url(termValue);
272+
Bits liveDocs = MultiBits.getLiveDocs(indexReader);
273+
if (liveDocs == null) { // the index has no deletions
274+
return null;
275+
}
279276

280-
if (deletedUids.contains(termValue)) {
281-
BytesRef next = uidIter.next();
282-
if (next == null) {
283-
uidIter = null;
284-
}
277+
for (int i = 0; i < indexReader.maxDoc(); i++) {
278+
Document doc = indexReader.storedFields().document(i);
279+
280+
if (!liveDocs.get(i)) {
285281
continue;
286282
}
287283

288-
livePaths.add(termPath);
289-
290-
BytesRef next = uidIter.next();
291-
if (next == null) {
292-
uidIter = null;
284+
// Only documents carrying the U field are recorded; this should avoid the special LOC documents.
285+
IndexableField field = doc.getField(QueryBuilder.U);
286+
if (field != null) {
287+
String uid = field.stringValue();
288+
livePaths.add(Util.uid2url(uid));
293289
}
294290
}
295291

296292
return livePaths;
297293
}
298294
}
299295

300-
private static void checkDuplicateDocuments(Path indexPath, String projectName)
296+
private static void checkDuplicateDocuments(Path indexPath)
301297
throws IOException, IndexDocumentException {
302298

303299
LOGGER.log(Level.FINE, "Checking duplicate documents in ''{0}''", indexPath);
304300
Statistics stat = new Statistics();
305-
List<String> livePaths = getLiveDocumentPaths(indexPath, projectName);
301+
List<String> livePaths = getLiveDocumentPaths(indexPath);
302+
if (livePaths == null) {
303+
throw new IndexDocumentException(String.format("cannot determine live paths for '%s'", indexPath));
304+
}
306305
HashSet<String> pathSet = new HashSet<>(livePaths);
307-
HashMap<String, Integer> fileMap = new HashMap<>();
306+
Map<String, Integer> fileMap = new ConcurrentHashMap<>();
308307
if (pathSet.size() != livePaths.size()) {
309308
LOGGER.log(Level.FINE,
310309
"index in ''{0}'' has document path set ({1}) vs document list ({2}) discrepancy",
311310
new Object[]{indexPath, pathSet.size(), livePaths.size()});
312311
for (String path : livePaths) {
313312
if (pathSet.contains(path)) {
314-
LOGGER.log(Level.FINER, "duplicate path: ''{0}''", path);
315313
fileMap.putIfAbsent(path, 0);
316314
fileMap.put(path, fileMap.get(path) + 1);
317315
}
318316
}
317+
}
319318

319+
// Traverse the file map and leave only duplicate entries.
320+
for (String path: fileMap.keySet()) {
321+
if (fileMap.get(path) > 1) {
322+
LOGGER.log(Level.FINER, "duplicate path: ''{0}''", path);
323+
} else {
324+
fileMap.remove(path);
325+
}
320326
}
327+
321328
stat.report(LOGGER, Level.FINE, String.format("duplicate check in '%s' done", indexPath));
322329
if (!fileMap.isEmpty()) {
323330
throw new IndexDocumentException(String.format("index in '%s' contains duplicate live documents",

opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerVsDeletedDocumentsTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ void testIndexTraversalWithDeletedDocuments(boolean projectsEnabled, boolean use
389389
* similar to what is done in {@link IndexDatabase#update()}.
390390
*/
391391
private void checkLiveDocs(String projectName) throws IOException {
392-
List<String> livePaths = getLiveDocumentPaths(getIndexPath(projectName), projectName);
392+
List<String> livePaths = getLiveDocumentPaths(getIndexPath(projectName));
393393

394394
assertTrue(livePaths.size() > 0);
395395
assertEquals(new HashSet<>(livePaths).size(), livePaths.size());

0 commit comments

Comments
 (0)