 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.MultiBits;
 import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.StoredFields;
 import org.apache.lucene.store.NativeFSLockFactory;
 import org.apache.lucene.store.NoLockFactory;
 import org.apache.lucene.store.SimpleFSLockFactory;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
@@ -136,6 +138,8 @@ public class IndexDatabase {
 
     private static final Set<String> REVERT_COUNTS_FIELDS;
 
+    private static final Set<String> LIVE_CHECK_FIELDS;
+
     private static final Object INSTANCE_LOCK = new Object();
 
     /**
@@ -169,6 +173,7 @@ public class IndexDatabase {
     private List<String> directories;
     private LockFactory lockFactory;
     private final BytesRef emptyBR = new BytesRef("");
+    private final Set<String> deletedUids = new HashSet<>();
 
     // Directory where we store indexes
     public static final String INDEX_DIR = "index";
@@ -177,6 +182,8 @@ public class IndexDatabase {
 
     private final IndexDownArgsFactory indexDownArgsFactory;
 
+    private final IndexWriterConfigFactory indexWriterConfigFactory;
+
     /**
      * Create a new instance of the Index Database. Use this constructor if you
      * don't use any projects
@@ -198,6 +205,24 @@ public IndexDatabase(Project project, IndexDownArgsFactory factory) throws IOExc
         indexDownArgsFactory = factory;
         this.project = project;
         lockFactory = NoLockFactory.INSTANCE;
+        indexWriterConfigFactory = new IndexWriterConfigFactory();
+        initialize();
+    }
+
+    /**
+     * Create a new instance of an Index Database for a given project.
+     *
+     * @param project the project to create the database for
+     * @param indexDownArgsFactory {@link IndexDownArgsFactory} instance
+     * @param indexWriterConfigFactory {@link IndexWriterConfigFactory} instance
+     * @throws java.io.IOException if an error occurs while creating directories
+     */
+    public IndexDatabase(Project project, IndexDownArgsFactory indexDownArgsFactory,
+                         IndexWriterConfigFactory indexWriterConfigFactory) throws IOException {
+        this.indexDownArgsFactory = indexDownArgsFactory;
+        this.project = project;
+        lockFactory = NoLockFactory.INSTANCE;
+        this.indexWriterConfigFactory = indexWriterConfigFactory;
         initialize();
     }
 
@@ -215,6 +240,10 @@ public IndexDatabase(Project project, IndexDownArgsFactory factory) throws IOExc
         REVERT_COUNTS_FIELDS.add(QueryBuilder.PATH);
         REVERT_COUNTS_FIELDS.add(QueryBuilder.NUML);
         REVERT_COUNTS_FIELDS.add(QueryBuilder.LOC);
+
+        LIVE_CHECK_FIELDS = new HashSet<>();
+        LIVE_CHECK_FIELDS.add(QueryBuilder.U);
+        LIVE_CHECK_FIELDS.add(QueryBuilder.PATH);
     }
 
     /**
@@ -582,11 +611,7 @@ public void update() throws IOException {
 
         IOException finishingException = null;
         try {
-            Analyzer analyzer = AnalyzerGuru.getAnalyzer();
-            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
-            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
-            iwc.setRAMBufferSizeMB(env.getRamBufferSize());
-            writer = new IndexWriter(indexDirectory, iwc);
+            writer = new IndexWriter(indexDirectory, indexWriterConfigFactory.get());
             writer.commit(); // to make sure index exists on the disk
             completer = new PendingFileCompleter();
 
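
The five removed lines above are presumably what the new IndexWriterConfigFactory.get() call now encapsulates; the factory class itself is not part of this diff. A rough sketch of such a factory, reconstructed only from the removed code (the RuntimeEnvironment lookup stands in for the local env variable that update() had in scope), might look like the following. Centralizing the writer configuration this way also lets the new test-oriented constructor inject a different configuration without touching update().

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.configuration.RuntimeEnvironment;

// Hypothetical reconstruction of IndexWriterConfigFactory; the actual class added by this
// change may differ in package, naming, and the options it sets.
public class IndexWriterConfigFactory {
    public IndexWriterConfig get() {
        Analyzer analyzer = AnalyzerGuru.getAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // The removed code read this value as env.getRamBufferSize(); assumed to come from the runtime environment.
        iwc.setRAMBufferSizeMB(RuntimeEnvironment.getInstance().getRamBufferSize());
        return iwc;
    }
}
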
@@ -610,6 +635,7 @@ public void update() throws IOException {
 
             String startUid = Util.path2uid(dir, "");
             reader = DirectoryReader.open(indexDirectory); // open existing index
+            setupDeletedUids();
             countsAggregator = new NumLinesLOCAggregator();
             settings = readAnalysisSettings();
             if (settings == null) {
@@ -735,10 +761,63 @@ public void update() throws IOException {
         }
     }
 
+    /**
+     * The traversal of the uid terms done in {@link #processFile(IndexDownArgs, File, String)}
+     * and {@link #processFileIncremental(IndexDownArgs, File, String)} needs to skip over deleted documents
+     * that are often found in multi-segment indexes. This method stores the uids of these documents
+     * and is expected to be called before the traversal for the top level directory is started.
+     * @throws IOException if the index cannot be read for some reason
+     */
+    private void setupDeletedUids() throws IOException {
+        // This method might be called repeatedly from within the same IndexDatabase instance
+        // for various directories, so the set needs to be reset so that it does not contain unrelated uids.
+        deletedUids.clear();
+
+        Bits liveDocs = MultiBits.getLiveDocs(reader); // Will return null if there are no deletions.
+        if (liveDocs == null) {
+            LOGGER.log(Level.FINEST, "no deletions found in {0}", reader);
+            return;
+        }
+
+        Statistics stat = new Statistics();
+        LOGGER.log(Level.FINEST, "traversing the documents in {0} to collect uids of deleted documents",
+                indexDirectory);
+        for (int i = 0; i < reader.maxDoc(); i++) {
+            if (!liveDocs.get(i)) {
+                Document doc = reader.document(i, LIVE_CHECK_FIELDS); // use limited-field version
+                IndexableField field = doc.getField(QueryBuilder.U);
+                if (field != null) {
+                    if (LOGGER.isLoggable(Level.FINEST)) {
+                        String uidString = field.stringValue();
+                        LOGGER.log(Level.FINEST, "adding ''{0}'' at {1} to deleted uid set",
+                                new Object[]{Util.uid2url(uidString), Util.uid2date(uidString)});
+                    }
+                    deletedUids.add(field.stringValue());
+                }
+            }
+        }
+        stat.report(LOGGER, Level.FINEST, String.format("found %s deleted documents in %s",
+                deletedUids.size(), indexDirectory));
+    }
+
+    private void logIgnoredUid(String uid) {
+        LOGGER.log(Level.FINEST, "ignoring deleted document for {0} at {1}",
+                new Object[]{Util.uid2url(uid), Util.uid2date(uid)});
+    }
+
     private void processTrailingTerms(String startUid, boolean usedHistory, IndexDownArgs args) throws IOException {
         while (uidIter != null && uidIter.term() != null
                 && uidIter.term().utf8ToString().startsWith(startUid)) {
 
+            if (deletedUids.contains(uidIter.term().utf8ToString())) {
+                logIgnoredUid(uidIter.term().utf8ToString());
+                BytesRef next = uidIter.next();
+                if (next == null) {
+                    uidIter = null;
+                }
+                continue;
+            }
+
             if (usedHistory) {
                 // Allow for forced reindex. For history based reindex the trailing terms
                 // correspond to the files that have not changed. Such files might need to be re-indexed
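
For context on why setupDeletedUids() is needed at all: updating or deleting a document in Lucene only marks the old copy as deleted inside its segment, and the stale document stays enumerable in a multi-segment index until a merge physically drops it. The following standalone sketch (plain Lucene, not OpenGrok code; the field name "u" and the in-memory directory are just for the demo) shows MultiBits.getLiveDocs() exposing such a leftover document, which is the same check the new method relies on.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.Bits;

public class DeletedDocsDemo {
    public static void main(String[] args) throws Exception {
        ByteBuffersDirectory dir = new ByteBuffersDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        // First segment: two documents, keyed by a uid-like term.
        for (String uid : new String[] {"uid-1", "uid-2"}) {
            Document doc = new Document();
            doc.add(new StringField("u", uid, Field.Store.YES));
            writer.addDocument(doc);
        }
        writer.commit();

        // "Re-index" uid-1: the old copy is only marked deleted in the first segment.
        Document updated = new Document();
        updated.add(new StringField("u", "uid-1", Field.Store.YES));
        writer.updateDocument(new Term("u", "uid-1"), updated);
        writer.commit();

        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            System.out.println("deleted documents: " + reader.numDeletedDocs()); // expect 1 with the default merge policy
            Bits liveDocs = MultiBits.getLiveDocs(reader); // null only when there are no deletions
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (liveDocs != null && !liveDocs.get(i)) {
                    System.out.println("doc " + i + " is deleted but still present in the index");
                }
            }
        }
        writer.close();
    }
}
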
@@ -1526,68 +1605,55 @@ void indexDown(File dir, String parent, IndexDownArgs args) throws IOException {
      * @param path path of the file argument relative to source root (with leading slash)
      * @throws IOException on error
      */
-    private void processFileIncremental(IndexDownArgs args, File file, String path) throws IOException {
-        if (uidIter != null) {
-            path = Util.fixPathIfWindows(path);
-            // Traverse terms until reaching one that matches the path of given file.
-            while (uidIter != null && uidIter.term() != null
-                    && uidIter.term().compareTo(emptyBR) != 0
-                    && Util.uid2url(uidIter.term().utf8ToString()).compareTo(path) < 0) {
-
-                // A file that was not changed.
-                /*
-                 * Possibly short-circuit to force reindexing of prior-version indexes.
-                 */
-                String termPath = Util.uid2url(uidIter.term().utf8ToString());
-                File termFile = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), termPath);
-                boolean matchOK = (isWithDirectoryCounts || isCountingDeltas) &&
-                        checkSettings(termFile, termPath);
-                if (!matchOK) {
-                    removeFile(false);
+    @VisibleForTesting
+    void processFileIncremental(IndexDownArgs args, File file, String path) throws IOException {
+        final boolean fileExists = file.exists();
 
-                    args.curCount++;
-                    args.works.add(new IndexFileWork(termFile, termPath));
-                }
+        path = Util.fixPathIfWindows(path);
+        // Traverse terms until reaching a document beyond the path of the given file.
+        while (uidIter != null && uidIter.term() != null
+                && uidIter.term().compareTo(emptyBR) != 0
+                && Util.uid2url(uidIter.term().utf8ToString()).compareTo(path) <= 0) {
 
+            if (deletedUids.contains(uidIter.term().utf8ToString())) {
+                logIgnoredUid(uidIter.term().utf8ToString());
                 BytesRef next = uidIter.next();
                 if (next == null) {
                     uidIter = null;
                 }
+                continue;
             }
 
-            if (uidIter != null && uidIter.term() != null
-                    && Util.uid2url(uidIter.term().utf8ToString()).equals(path)) {
-                /*
-                 * At this point we know that the file has corresponding term in the index
-                 * and has changed in some way. Either it was deleted or it was changed.
-                 */
-                if (!file.exists()) {
-                    removeFile(true);
-                } else {
+            /*
+             * Possibly short-circuit to force reindexing of prior-version indexes.
+             */
+            String termPath = Util.uid2url(uidIter.term().utf8ToString());
+            if (!termPath.equals(path)) {
+                // A file that was not changed.
+                File termFile = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), termPath);
+                boolean matchOK = (isWithDirectoryCounts || isCountingDeltas) &&
+                        checkSettings(termFile, termPath);
+                if (!matchOK) {
                     removeFile(false);
 
                     args.curCount++;
-                    args.works.add(new IndexFileWork(file, path));
-                }
-
-                BytesRef next = uidIter.next();
-                if (next == null) {
-                    uidIter = null;
+                    args.works.add(new IndexFileWork(termFile, termPath));
                 }
             } else {
-                // Potentially new file. A file might be added and then deleted,
-                // so it is necessary to check its existence.
-                if (file.exists()) {
-                    args.curCount++;
-                    args.works.add(new IndexFileWork(file, path));
-                }
+                removeFile(!fileExists);
             }
-        } else {
-            if (file.exists()) {
-                args.curCount++;
-                args.works.add(new IndexFileWork(file, path));
+
+            BytesRef next = uidIter.next();
+            if (next == null) {
+                uidIter = null;
             }
         }
+
+        // This method would not be called if the file had not changed in some way.
+        if (fileExists) {
+            args.curCount++;
+            args.works.add(new IndexFileWork(file, path));
+        }
     }
 
     /**
@@ -1597,7 +1663,8 @@ private void processFileIncremental(IndexDownArgs args, File file, String path)
      * @param path path corresponding to the file parameter, relative to source root (with leading slash)
      * @throws IOException on error
      */
-    private void processFile(IndexDownArgs args, File file, String path) throws IOException {
+    @VisibleForTesting
+    void processFile(IndexDownArgs args, File file, String path) throws IOException {
         if (uidIter != null) {
             path = Util.fixPathIfWindows(path);
             String uid = Util.path2uid(path,
@@ -1611,6 +1678,15 @@ private void processFile(IndexDownArgs args, File file, String path) throws IOEx
                     && uidIter.term().compareTo(emptyBR) != 0
                     && uidIter.term().compareTo(buid) < 0) {
 
+                if (deletedUids.contains(uidIter.term().utf8ToString())) {
+                    logIgnoredUid(uidIter.term().utf8ToString());
+                    BytesRef next = uidIter.next();
+                    if (next == null) {
+                        uidIter = null;
+                    }
+                    continue;
+                }
+
                 // If the term's path matches path of currently processed file,
                 // it is clear that the file has been modified and thus
                 // removeFile() will be followed by call to addFile() in indexParallel().
@@ -1628,6 +1704,14 @@ private void processFile(IndexDownArgs args, File file, String path) throws IOEx
 
             // If the file was not modified, probably skip to the next one.
             if (uidIter != null && uidIter.term() != null && uidIter.term().bytesEquals(buid)) {
+                if (deletedUids.contains(uidIter.term().utf8ToString())) {
+                    logIgnoredUid(uidIter.term().utf8ToString());
+                    BytesRef next = uidIter.next();
+                    if (next == null) {
+                        uidIter = null;
+                    }
+                    return;
+                }
 
                 /*
                  * Possibly short-circuit to force reindexing of prior-version indexes.
@@ -2035,13 +2119,13 @@ private void finishWriting() throws IOException {
         try {
             writeAnalysisSettings();
 
-            LOGGER.log(Level.FINE, "preparing to commit changes to Lucene index"); // TODO add info about which database
+            LOGGER.log(Level.FINE, "preparing to commit changes to {0}", this);
             writer.prepareCommit();
             hasPendingCommit = true;
 
+            Statistics completerStat = new Statistics();
             int n = completer.complete();
-            // TODO: add elapsed
-            LOGGER.log(Level.FINE, "completed {0} object(s)", n);
+            completerStat.report(LOGGER, Level.FINE, String.format("completed %d object(s)", n));
 
             // Just before commit(), reset the `hasPendingCommit' flag,
             // since after commit() is called, there is no need for
@@ -2202,4 +2286,13 @@ private boolean xrefExistsFor(String path) {
     private static class AcceptSymlinkRet {
         String localRelPath;
     }
+
+    @Override
+    public String toString() {
+        if (this.project != null) {
+            return "index database for project '" + this.project.getName() + "'";
+        }
+
+        return "global index database";
+    }
 }