Skip to content

Commit 4524012

Browse files
authored
skip deleted documents during index traversal (#4182)
fixes #4030
1 parent edf23c7 commit 4524012

File tree

9 files changed

+697
-64
lines changed

9 files changed

+697
-64
lines changed

opengrok-indexer/pom.xml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ information: Portions Copyright [yyyy] [name of copyright owner]
1818
1919
CDDL HEADER END
2020
21-
Copyright (c) 2010, 2022, Oracle and/or its affiliates. All rights reserved.
21+
Copyright (c) 2010, 2023, Oracle and/or its affiliates. All rights reserved.
2222
Portions Copyright (c) 2017-2020, Chris Fraire <[email protected]>.
2323
Portions Copyright (c) 2020-2020, Lubos Kosco <[email protected]>.
2424
@@ -57,6 +57,11 @@ Portions Copyright (c) 2020-2020, Lubos Kosco <[email protected]>.
5757
<artifactId>commons-compress</artifactId>
5858
<version>1.21</version>
5959
</dependency>
60+
<dependency>
61+
<groupId>org.apache.commons</groupId>
62+
<artifactId>commons-io</artifactId>
63+
<version>${commons-io.version}</version>
64+
</dependency>
6065
<dependency>
6166
<groupId>org.apache.lucene</groupId>
6267
<artifactId>lucene-core</artifactId>

opengrok-indexer/src/main/java/org/opengrok/indexer/history/FileCollector.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
*/
2323
package org.opengrok.indexer.history;
2424

25+
import org.jetbrains.annotations.VisibleForTesting;
26+
2527
import java.util.Collection;
2628
import java.util.SortedSet;
2729
import java.util.TreeSet;
@@ -63,4 +65,9 @@ public SortedSet<String> getFiles() {
6365
void addFiles(Collection<String> files) {
6466
this.files.addAll(files);
6567
}
68+
69+
@VisibleForTesting
70+
public void reset() {
71+
files.clear();
72+
}
6673
}

opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java

Lines changed: 148 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
import org.apache.lucene.index.IndexWriterConfig;
7070
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
7171
import org.apache.lucene.index.IndexableField;
72+
import org.apache.lucene.index.MultiBits;
7273
import org.apache.lucene.index.MultiTerms;
7374
import org.apache.lucene.index.PostingsEnum;
7475
import org.apache.lucene.index.StoredFields;
@@ -85,6 +86,7 @@
8586
import org.apache.lucene.store.NativeFSLockFactory;
8687
import org.apache.lucene.store.NoLockFactory;
8788
import org.apache.lucene.store.SimpleFSLockFactory;
89+
import org.apache.lucene.util.Bits;
8890
import org.apache.lucene.util.BytesRef;
8991
import org.jetbrains.annotations.NotNull;
9092
import org.jetbrains.annotations.Nullable;
@@ -136,6 +138,8 @@ public class IndexDatabase {
136138

137139
private static final Set<String> REVERT_COUNTS_FIELDS;
138140

141+
private static final Set<String> LIVE_CHECK_FIELDS;
142+
139143
private static final Object INSTANCE_LOCK = new Object();
140144

141145
/**
@@ -169,6 +173,7 @@ public class IndexDatabase {
169173
private List<String> directories;
170174
private LockFactory lockFactory;
171175
private final BytesRef emptyBR = new BytesRef("");
176+
private final Set<String> deletedUids = new HashSet<>();
172177

173178
// Directory where we store indexes
174179
public static final String INDEX_DIR = "index";
@@ -177,6 +182,8 @@ public class IndexDatabase {
177182

178183
private final IndexDownArgsFactory indexDownArgsFactory;
179184

185+
private final IndexWriterConfigFactory indexWriterConfigFactory;
186+
180187
/**
181188
* Create a new instance of the Index Database. Use this constructor if you
182189
* don't use any projects
@@ -198,6 +205,24 @@ public IndexDatabase(Project project, IndexDownArgsFactory factory) throws IOExc
198205
indexDownArgsFactory = factory;
199206
this.project = project;
200207
lockFactory = NoLockFactory.INSTANCE;
208+
indexWriterConfigFactory = new IndexWriterConfigFactory();
209+
initialize();
210+
}
211+
212+
/**
213+
* Create a new instance of an Index Database for a given project.
214+
*
215+
* @param project the project to create the database for
216+
* @param indexDownArgsFactory {@link IndexDownArgsFactory} instance
217+
* @param indexWriterConfigFactory {@link IndexWriterConfigFactory} instance
218+
* @throws java.io.IOException if an error occurs while creating directories
219+
*/
220+
public IndexDatabase(Project project, IndexDownArgsFactory indexDownArgsFactory,
221+
IndexWriterConfigFactory indexWriterConfigFactory) throws IOException {
222+
this.indexDownArgsFactory = indexDownArgsFactory;
223+
this.project = project;
224+
lockFactory = NoLockFactory.INSTANCE;
225+
this.indexWriterConfigFactory = indexWriterConfigFactory;
201226
initialize();
202227
}
203228

@@ -215,6 +240,10 @@ public IndexDatabase(Project project, IndexDownArgsFactory factory) throws IOExc
215240
REVERT_COUNTS_FIELDS.add(QueryBuilder.PATH);
216241
REVERT_COUNTS_FIELDS.add(QueryBuilder.NUML);
217242
REVERT_COUNTS_FIELDS.add(QueryBuilder.LOC);
243+
244+
LIVE_CHECK_FIELDS = new HashSet<>();
245+
LIVE_CHECK_FIELDS.add(QueryBuilder.U);
246+
LIVE_CHECK_FIELDS.add(QueryBuilder.PATH);
218247
}
219248

220249
/**
@@ -582,11 +611,7 @@ public void update() throws IOException {
582611

583612
IOException finishingException = null;
584613
try {
585-
Analyzer analyzer = AnalyzerGuru.getAnalyzer();
586-
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
587-
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
588-
iwc.setRAMBufferSizeMB(env.getRamBufferSize());
589-
writer = new IndexWriter(indexDirectory, iwc);
614+
writer = new IndexWriter(indexDirectory, indexWriterConfigFactory.get());
590615
writer.commit(); // to make sure index exists on the disk
591616
completer = new PendingFileCompleter();
592617

@@ -610,6 +635,7 @@ public void update() throws IOException {
610635

611636
String startUid = Util.path2uid(dir, "");
612637
reader = DirectoryReader.open(indexDirectory); // open existing index
638+
setupDeletedUids();
613639
countsAggregator = new NumLinesLOCAggregator();
614640
settings = readAnalysisSettings();
615641
if (settings == null) {
@@ -735,10 +761,63 @@ public void update() throws IOException {
735761
}
736762
}
737763

764+
/**
765+
* The traversal of the uid terms done in {@link #processFile(IndexDownArgs, File, String)}
766+
* and {@link #processFileIncremental(IndexDownArgs, File, String)} needs to skip over deleted documents
767+
* that are often found in multi-segment indexes. This method stores the uids of these documents
768+
* and is expected to be called before the traversal for the top level directory is started.
769+
* @throws IOException if the index cannot be read for some reason
770+
*/
771+
private void setupDeletedUids() throws IOException {
772+
// This method might be called repeatedly from within the same IndexDatabase instance
773+
// for various directories so the map needs to be reset so that it does not contain unrelated uids.
774+
deletedUids.clear();
775+
776+
Bits liveDocs = MultiBits.getLiveDocs(reader); // Will return null if there are no deletions.
777+
if (liveDocs == null) {
778+
LOGGER.log(Level.FINEST, "no deletions found in {0}", reader);
779+
return;
780+
}
781+
782+
Statistics stat = new Statistics();
783+
LOGGER.log(Level.FINEST, "traversing the documents in {0} to collect uids of deleted documents",
784+
indexDirectory);
785+
for (int i = 0; i < reader.maxDoc(); i++) {
786+
if (!liveDocs.get(i)) {
787+
Document doc = reader.document(i, LIVE_CHECK_FIELDS); // use limited-field version
788+
IndexableField field = doc.getField(QueryBuilder.U);
789+
if (field != null) {
790+
if (LOGGER.isLoggable(Level.FINEST)) {
791+
String uidString = field.stringValue();
792+
LOGGER.log(Level.FINEST, "adding ''{0}'' at {1} to deleted uid set",
793+
new Object[]{Util.uid2url(uidString), Util.uid2date(uidString)});
794+
}
795+
deletedUids.add(field.stringValue());
796+
}
797+
}
798+
}
799+
stat.report(LOGGER, Level.FINEST, String.format("found %s deleted documents in %s",
800+
deletedUids.size(), indexDirectory));
801+
}
802+
803+
private void logIgnoredUid(String uid) {
804+
LOGGER.log(Level.FINEST, "ignoring deleted document for {0} at {1}",
805+
new Object[]{Util.uid2url(uid), Util.uid2date(uid)});
806+
}
807+
738808
private void processTrailingTerms(String startUid, boolean usedHistory, IndexDownArgs args) throws IOException {
739809
while (uidIter != null && uidIter.term() != null
740810
&& uidIter.term().utf8ToString().startsWith(startUid)) {
741811

812+
if (deletedUids.contains(uidIter.term().utf8ToString())) {
813+
logIgnoredUid(uidIter.term().utf8ToString());
814+
BytesRef next = uidIter.next();
815+
if (next == null) {
816+
uidIter = null;
817+
}
818+
continue;
819+
}
820+
742821
if (usedHistory) {
743822
// Allow for forced reindex. For history based reindex the trailing terms
744823
// correspond to the files that have not changed. Such files might need to be re-indexed
@@ -1526,68 +1605,55 @@ void indexDown(File dir, String parent, IndexDownArgs args) throws IOException {
15261605
* @param path path of the file argument relative to source root (with leading slash)
15271606
* @throws IOException on error
15281607
*/
1529-
private void processFileIncremental(IndexDownArgs args, File file, String path) throws IOException {
1530-
if (uidIter != null) {
1531-
path = Util.fixPathIfWindows(path);
1532-
// Traverse terms until reaching one that matches the path of given file.
1533-
while (uidIter != null && uidIter.term() != null
1534-
&& uidIter.term().compareTo(emptyBR) != 0
1535-
&& Util.uid2url(uidIter.term().utf8ToString()).compareTo(path) < 0) {
1536-
1537-
// A file that was not changed.
1538-
/*
1539-
* Possibly short-circuit to force reindexing of prior-version indexes.
1540-
*/
1541-
String termPath = Util.uid2url(uidIter.term().utf8ToString());
1542-
File termFile = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), termPath);
1543-
boolean matchOK = (isWithDirectoryCounts || isCountingDeltas) &&
1544-
checkSettings(termFile, termPath);
1545-
if (!matchOK) {
1546-
removeFile(false);
1608+
@VisibleForTesting
1609+
void processFileIncremental(IndexDownArgs args, File file, String path) throws IOException {
1610+
final boolean fileExists = file.exists();
15471611

1548-
args.curCount++;
1549-
args.works.add(new IndexFileWork(termFile, termPath));
1550-
}
1612+
path = Util.fixPathIfWindows(path);
1613+
// Traverse terms until reaching document beyond path of given file.
1614+
while (uidIter != null && uidIter.term() != null
1615+
&& uidIter.term().compareTo(emptyBR) != 0
1616+
&& Util.uid2url(uidIter.term().utf8ToString()).compareTo(path) <= 0) {
15511617

1618+
if (deletedUids.contains(uidIter.term().utf8ToString())) {
1619+
logIgnoredUid(uidIter.term().utf8ToString());
15521620
BytesRef next = uidIter.next();
15531621
if (next == null) {
15541622
uidIter = null;
15551623
}
1624+
continue;
15561625
}
15571626

1558-
if (uidIter != null && uidIter.term() != null
1559-
&& Util.uid2url(uidIter.term().utf8ToString()).equals(path)) {
1560-
/*
1561-
* At this point we know that the file has corresponding term in the index
1562-
* and has changed in some way. Either it was deleted or it was changed.
1563-
*/
1564-
if (!file.exists()) {
1565-
removeFile(true);
1566-
} else {
1627+
/*
1628+
* Possibly short-circuit to force reindexing of prior-version indexes.
1629+
*/
1630+
String termPath = Util.uid2url(uidIter.term().utf8ToString());
1631+
if (!termPath.equals(path)) {
1632+
// A file that was not changed.
1633+
File termFile = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), termPath);
1634+
boolean matchOK = (isWithDirectoryCounts || isCountingDeltas) &&
1635+
checkSettings(termFile, termPath);
1636+
if (!matchOK) {
15671637
removeFile(false);
15681638

15691639
args.curCount++;
1570-
args.works.add(new IndexFileWork(file, path));
1571-
}
1572-
1573-
BytesRef next = uidIter.next();
1574-
if (next == null) {
1575-
uidIter = null;
1640+
args.works.add(new IndexFileWork(termFile, termPath));
15761641
}
15771642
} else {
1578-
// Potentially new file. A file might be added and then deleted,
1579-
// so it is necessary to check its existence.
1580-
if (file.exists()) {
1581-
args.curCount++;
1582-
args.works.add(new IndexFileWork(file, path));
1583-
}
1643+
removeFile(!fileExists);
15841644
}
1585-
} else {
1586-
if (file.exists()) {
1587-
args.curCount++;
1588-
args.works.add(new IndexFileWork(file, path));
1645+
1646+
BytesRef next = uidIter.next();
1647+
if (next == null) {
1648+
uidIter = null;
15891649
}
15901650
}
1651+
1652+
// The function would not be called if the file was not changed in some way.
1653+
if (fileExists) {
1654+
args.curCount++;
1655+
args.works.add(new IndexFileWork(file, path));
1656+
}
15911657
}
15921658

15931659
/**
@@ -1597,7 +1663,8 @@ private void processFileIncremental(IndexDownArgs args, File file, String path)
15971663
* @param path path corresponding to the file parameter, relative to source root (with leading slash)
15981664
* @throws IOException on error
15991665
*/
1600-
private void processFile(IndexDownArgs args, File file, String path) throws IOException {
1666+
@VisibleForTesting
1667+
void processFile(IndexDownArgs args, File file, String path) throws IOException {
16011668
if (uidIter != null) {
16021669
path = Util.fixPathIfWindows(path);
16031670
String uid = Util.path2uid(path,
@@ -1611,6 +1678,15 @@ private void processFile(IndexDownArgs args, File file, String path) throws IOEx
16111678
&& uidIter.term().compareTo(emptyBR) != 0
16121679
&& uidIter.term().compareTo(buid) < 0) {
16131680

1681+
if (deletedUids.contains(uidIter.term().utf8ToString())) {
1682+
logIgnoredUid(uidIter.term().utf8ToString());
1683+
BytesRef next = uidIter.next();
1684+
if (next == null) {
1685+
uidIter = null;
1686+
}
1687+
continue;
1688+
}
1689+
16141690
// If the term's path matches path of currently processed file,
16151691
// it is clear that the file has been modified and thus
16161692
// removeFile() will be followed by call to addFile() in indexParallel().
@@ -1628,6 +1704,14 @@ private void processFile(IndexDownArgs args, File file, String path) throws IOEx
16281704

16291705
// If the file was not modified, probably skip to the next one.
16301706
if (uidIter != null && uidIter.term() != null && uidIter.term().bytesEquals(buid)) {
1707+
if (deletedUids.contains(uidIter.term().utf8ToString())) {
1708+
logIgnoredUid(uidIter.term().utf8ToString());
1709+
BytesRef next = uidIter.next();
1710+
if (next == null) {
1711+
uidIter = null;
1712+
}
1713+
return;
1714+
}
16311715

16321716
/*
16331717
* Possibly short-circuit to force reindexing of prior-version indexes.
@@ -2035,13 +2119,13 @@ private void finishWriting() throws IOException {
20352119
try {
20362120
writeAnalysisSettings();
20372121

2038-
LOGGER.log(Level.FINE, "preparing to commit changes to Lucene index"); // TODO add info about which database
2122+
LOGGER.log(Level.FINE, "preparing to commit changes to {0}", this);
20392123
writer.prepareCommit();
20402124
hasPendingCommit = true;
20412125

2126+
Statistics completerStat = new Statistics();
20422127
int n = completer.complete();
2043-
// TODO: add elapsed
2044-
LOGGER.log(Level.FINE, "completed {0} object(s)", n);
2128+
completerStat.report(LOGGER, Level.FINE, String.format("completed %d object(s)", n));
20452129

20462130
// Just before commit(), reset the `hasPendingCommit' flag,
20472131
// since after commit() is called, there is no need for
@@ -2202,4 +2286,13 @@ private boolean xrefExistsFor(String path) {
22022286
private static class AcceptSymlinkRet {
22032287
String localRelPath;
22042288
}
2289+
2290+
@Override
2291+
public String toString() {
2292+
if (this.project != null) {
2293+
return "index database for project '" + this.project.getName() + "'";
2294+
}
2295+
2296+
return "global index database";
2297+
}
22052298
}

0 commit comments

Comments
 (0)