27
27
import java .nio .file .Path ;
28
28
import java .util .ArrayList ;
29
29
import java .util .Collection ;
30
- import java .util .HashMap ;
31
30
import java .util .HashSet ;
32
31
import java .util .List ;
33
32
import java .util .Map ;
34
33
import java .util .Set ;
34
+ import java .util .concurrent .ConcurrentHashMap ;
35
35
import java .util .logging .Level ;
36
36
import java .util .logging .Logger ;
37
37
41
41
import org .apache .lucene .index .IndexReader ;
42
42
import org .apache .lucene .index .IndexableField ;
43
43
import org .apache .lucene .index .MultiBits ;
44
- import org .apache .lucene .index .MultiTerms ;
45
44
import org .apache .lucene .index .SegmentInfos ;
46
- import org .apache .lucene .index .Terms ;
47
- import org .apache .lucene .index .TermsEnum ;
48
45
import org .apache .lucene .store .Directory ;
49
46
import org .apache .lucene .store .FSDirectory ;
50
47
import org .apache .lucene .store .LockFactory ;
51
48
import org .apache .lucene .store .NativeFSLockFactory ;
52
49
import org .apache .lucene .store .NoLockFactory ;
53
50
import org .apache .lucene .util .Bits ;
54
- import org .apache .lucene .util .BytesRef ;
55
51
import org .apache .lucene .util .Version ;
56
52
import org .jetbrains .annotations .NotNull ;
53
+ import org .jetbrains .annotations .Nullable ;
57
54
import org .opengrok .indexer .configuration .Configuration ;
58
55
import org .opengrok .indexer .logger .LoggerFactory ;
59
56
import org .opengrok .indexer .search .QueryBuilder ;
@@ -109,6 +106,12 @@ public static class IndexDocumentException extends Exception {
109
106
110
107
private final Map <String , Integer > fileMap ;
111
108
109
/**
 * Create the exception with a message only, i.e. without a map of offending files.
 * The {@code fileMap} field is left {@code null} in this case.
 * @param s exception message
 */
public IndexDocumentException(String s) {
    super(s);
    // No per-file details are available for this kind of failure.
    fileMap = null;
}
114
+
112
115
public IndexDocumentException (String s , Map <String , Integer > fileMap ) {
113
116
super (s );
114
117
@@ -117,7 +120,7 @@ public IndexDocumentException(String s, Map<String, Integer> fileMap) {
117
120
118
121
@ Override
119
122
public String toString () {
120
- return getMessage () + ": " + fileMap ;
123
+ return getMessage () + ": " + ( fileMap == null ? "" : fileMap ) ;
121
124
}
122
125
}
123
126
@@ -214,7 +217,7 @@ public static void checkDir(Path indexPath, IndexCheckMode mode, String projectN
214
217
}
215
218
216
219
if (mode .ordinal () >= IndexCheckMode .DOCUMENTS .ordinal ()) {
217
- checkDuplicateDocuments (indexPath , projectName );
220
+ checkDuplicateDocuments (indexPath );
218
221
}
219
222
}
220
223
@@ -257,67 +260,71 @@ public static Set<String> getDeletedUids(Path indexPath) throws IOException {
257
260
258
261
/**
259
262
* @param indexPath path to index
260
- * @param projectName project name, can be empty
261
- * @return list of live document paths
263
+ * @return list of live document paths (some of them can be duplicate if the index is corrupted)
264
+ * or {@code null} if live documents cannot be retrieved.
262
265
* @throws IOException on I/O error
263
266
*/
264
- public static List <String > getLiveDocumentPaths (Path indexPath , String projectName ) throws IOException {
267
+ @ Nullable
268
+ public static List <String > getLiveDocumentPaths (Path indexPath ) throws IOException {
265
269
try (IndexReader indexReader = getIndexReader (indexPath )) {
266
- Terms terms = MultiTerms .getTerms (indexReader , QueryBuilder .U );
267
- TermsEnum uidIter = terms .iterator ();
268
- String dir = "/" + projectName ;
269
- String startUid = Util .path2uid (dir , "" );
270
- uidIter .seekCeil (new BytesRef (startUid ));
271
- final BytesRef emptyBR = new BytesRef ("" );
272
- // paths of live (i.e. not deleted) documents. Must be a list so that duplicate documents can be checked.
273
270
List <String > livePaths = new ArrayList <>();
274
- Set <String > deletedUids = getDeletedUids (indexPath );
275
271
276
- while (uidIter != null && uidIter .term () != null && uidIter .term ().compareTo (emptyBR ) != 0 ) {
277
- String termValue = uidIter .term ().utf8ToString ();
278
- String termPath = Util .uid2url (termValue );
272
+ Bits liveDocs = MultiBits .getLiveDocs (indexReader );
273
+ if (liveDocs == null ) { // the index has no deletions
274
+ return null ;
275
+ }
279
276
280
- if (deletedUids .contains (termValue )) {
281
- BytesRef next = uidIter .next ();
282
- if (next == null ) {
283
- uidIter = null ;
284
- }
277
+ for (int i = 0 ; i < indexReader .maxDoc (); i ++) {
278
+ Document doc = indexReader .storedFields ().document (i );
279
+
280
+ if (!liveDocs .get (i )) {
285
281
continue ;
286
282
}
287
283
288
- livePaths . add ( termPath );
289
-
290
- BytesRef next = uidIter . next ();
291
- if ( next == null ) {
292
- uidIter = null ;
284
+ // This should avoid the special LOC documents.
285
+ IndexableField field = doc . getField ( QueryBuilder . U );
286
+ if ( field != null ) {
287
+ String uid = field . stringValue ();
288
+ livePaths . add ( Util . uid2url ( uid )) ;
293
289
}
294
290
}
295
291
296
292
return livePaths ;
297
293
}
298
294
}
299
295
300
- private static void checkDuplicateDocuments (Path indexPath , String projectName )
296
+ private static void checkDuplicateDocuments (Path indexPath )
301
297
throws IOException , IndexDocumentException {
302
298
303
299
LOGGER .log (Level .FINE , "Checking duplicate documents in ''{0}''" , indexPath );
304
300
Statistics stat = new Statistics ();
305
- List <String > livePaths = getLiveDocumentPaths (indexPath , projectName );
301
+ List <String > livePaths = getLiveDocumentPaths (indexPath );
302
+ if (livePaths == null ) {
303
+ throw new IndexDocumentException (String .format ("cannot determine live paths for '%s'" , indexPath ));
304
+ }
306
305
HashSet <String > pathSet = new HashSet <>(livePaths );
307
- HashMap <String , Integer > fileMap = new HashMap <>();
306
+ Map <String , Integer > fileMap = new ConcurrentHashMap <>();
308
307
if (pathSet .size () != livePaths .size ()) {
309
308
LOGGER .log (Level .FINE ,
310
309
"index in ''{0}'' has document path set ({1}) vs document list ({2}) discrepancy" ,
311
310
new Object []{indexPath , pathSet .size (), livePaths .size ()});
312
311
for (String path : livePaths ) {
313
312
if (pathSet .contains (path )) {
314
- LOGGER .log (Level .FINER , "duplicate path: ''{0}''" , path );
315
313
fileMap .putIfAbsent (path , 0 );
316
314
fileMap .put (path , fileMap .get (path ) + 1 );
317
315
}
318
316
}
317
+ }
319
318
319
+ // Traverse the file map and leave only duplicate entries.
320
+ for (String path : fileMap .keySet ()) {
321
+ if (fileMap .get (path ) > 1 ) {
322
+ LOGGER .log (Level .FINER , "duplicate path: ''{0}''" , path );
323
+ } else {
324
+ fileMap .remove (path );
325
+ }
320
326
}
327
+
321
328
stat .report (LOGGER , Level .FINE , String .format ("duplicate check in '%s' done" , indexPath ));
322
329
if (!fileMap .isEmpty ()) {
323
330
throw new IndexDocumentException (String .format ("index in '%s' contains duplicate live documents" ,
0 commit comments