  */
 
 /*
- * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright (c) 2018, Chris Fraire <[email protected]>.
  */
 package org.opengrok.indexer.index;
 
-import java.io.File;
 import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexNotFoundException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.MultiBits;
+import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.SegmentInfos;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.LockFactory;
 import org.apache.lucene.store.NativeFSLockFactory;
+import org.apache.lucene.store.NoLockFactory;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;
 import org.jetbrains.annotations.NotNull;
 import org.opengrok.indexer.configuration.Configuration;
 import org.opengrok.indexer.logger.LoggerFactory;
+import org.opengrok.indexer.search.QueryBuilder;
+import org.opengrok.indexer.util.Statistics;
+import org.opengrok.indexer.web.Util;
 
 /**
  * Index checker.
@@ -48,6 +68,15 @@
 public class IndexCheck {
     private static final Logger LOGGER = LoggerFactory.getLogger(IndexCheck.class);
 
+    /**
+     * Index check modes. Ordered from least to most extensive.
+     */
+    public enum IndexCheckMode {
+        NO_CHECK,
+        VERSION,
+        DOCUMENTS
+    }
+
     /**
      * Exception thrown when index version does not match Lucene version.
      */
@@ -72,84 +101,227 @@ public String toString() {
         }
     }
 
+    /**
+     * Exception thrown when index contains duplicate live documents.
+     */
+    public static class IndexDocumentException extends Exception {
+        private static final long serialVersionUID = 5693446916108385595L;
+
+        private final Map<String, Integer> fileMap;
+
+        public IndexDocumentException(String s, Map<String, Integer> fileMap) {
+            super(s);
+
+            this.fileMap = fileMap;
+        }
+
+        @Override
+        public String toString() {
+            return getMessage() + ": " + fileMap;
+        }
+    }
+
     private IndexCheck() {
         // utility class
     }
 
     /**
-     * Check if version of index(es) matches major Lucene version.
+     * Check index(es).
      * @param configuration configuration based on which to perform the check
-     * @param subFilesList collection of paths. If non-empty, only projects matching these paths will be checked.
+     * @param mode index check mode
+     * @param projectNames collection of project names. If non-empty, only projects matching these paths will be checked.
+     *                     Otherwise, either the sole index or all project indexes will be checked, depending
+     *                     on whether projects are enabled in the configuration.
      * @return true on success, false on failure
      */
-    public static boolean check(@NotNull Configuration configuration, Collection<String> subFilesList) {
-        File indexRoot = new File(configuration.getDataRoot(), IndexDatabase.INDEX_DIR);
-        LOGGER.log(Level.FINE, "Checking for Lucene index version mismatch in {0}", indexRoot);
+    public static boolean check(@NotNull Configuration configuration, IndexCheckMode mode,
+                                Collection<String> projectNames) {
+
+        if (mode.equals(IndexCheckMode.NO_CHECK)) {
+            LOGGER.log(Level.WARNING, "no index check mode selected");
+            return true;
+        }
+
+        Path indexRoot = Path.of(configuration.getDataRoot(), IndexDatabase.INDEX_DIR);
         int ret = 0;
 
-        if (!subFilesList.isEmpty()) {
+        if (!projectNames.isEmpty()) {
             // Assumes projects are enabled.
-            for (String projectName : subFilesList) {
-                LOGGER.log(Level.FINER,
-                        "Checking Lucene index version in project {0}",
-                        projectName);
-                ret |= checkDirNoExceptions(new File(indexRoot, projectName));
+            for (String projectName : projectNames) {
+                ret |= checkDirNoExceptions(Path.of(indexRoot.toString(), projectName), mode, projectName);
             }
         } else {
             if (configuration.isProjectsEnabled()) {
                 for (String projectName : configuration.getProjects().keySet()) {
-                    LOGGER.log(Level.FINER,
-                            "Checking Lucene index version in project {0}",
-                            projectName);
-                    ret |= checkDirNoExceptions(new File(indexRoot, projectName));
+                    ret |= checkDirNoExceptions(Path.of(indexRoot.toString(), projectName), mode, projectName);
                 }
             } else {
-                LOGGER.log(Level.FINER, "Checking Lucene index version in {0}",
-                        indexRoot);
-                ret |= checkDirNoExceptions(indexRoot);
+                ret |= checkDirNoExceptions(indexRoot, mode, "");
             }
         }
 
         return ret == 0;
     }
 
-    private static int checkDirNoExceptions(File dir) {
+    /**
+     * @param indexPath directory with index
+     * @return 0 on success, 1 on failure
+     */
+    private static int checkDirNoExceptions(Path indexPath, IndexCheckMode mode, String projectName) {
         try {
-            checkDir(dir);
+            LOGGER.log(Level.INFO, "Checking index in ''{0}''", indexPath);
+            checkDir(indexPath, mode, projectName);
         } catch (Exception e) {
-            LOGGER.log(Level.WARNING, "Index check for directory " + dir + " failed", e);
+            LOGGER.log(Level.WARNING, String.format("Index check for directory '%s' failed", indexPath), e);
            return 1;
         }
 
+        LOGGER.log(Level.INFO, "Index check for directory ''{0}'' passed", indexPath);
         return 0;
     }
 
     /**
      * Check index in given directory. It assumes that that all commits (if any)
      * in the Lucene segment file were done with the same version.
      *
-     * @param dir directory with index
+     * @param indexPath directory with index to check
+     * @param mode index check mode
+     * @param projectName name of the project, can be empty
      * @throws IOException if the directory cannot be opened
      * @throws IndexVersionException if the version of the index does not match Lucene index version
      */
-    public static void checkDir(File dir) throws IndexVersionException, IOException {
+    public static void checkDir(Path indexPath, IndexCheckMode mode, String projectName)
+            throws IndexVersionException, IndexDocumentException, IOException {
+
         LockFactory lockFactory = NativeFSLockFactory.INSTANCE;
         int segVersion;
 
-        try (Directory indexDirectory = FSDirectory.open(dir.toPath(), lockFactory)) {
+        try (Directory indexDirectory = FSDirectory.open(indexPath, lockFactory)) {
             try {
                 SegmentInfos segInfos = SegmentInfos.readLatestCommit(indexDirectory);
                 segVersion = segInfos.getIndexCreatedVersionMajor();
             } catch (IndexNotFoundException e) {
-                LOGGER.log(Level.FINE, "no index found in ''{0}''", indexDirectory);
+                LOGGER.log(Level.WARNING, "no index found in ''{0}''", indexDirectory);
                 return;
             }
         }
 
+        LOGGER.log(Level.FINE, "Checking index version in ''{0}''", indexPath);
         if (segVersion != Version.LATEST.major) {
             throw new IndexVersionException(
-                String.format("Directory %s has index version discrepancy", dir),
+                String.format("Directory '%s' has index version discrepancy", indexPath),
                 Version.LATEST.major, segVersion);
         }
+
+        if (mode.ordinal() >= IndexCheckMode.DOCUMENTS.ordinal()) {
+            checkDuplicateDocuments(indexPath, projectName);
+        }
+    }
+
+    public static IndexReader getIndexReader(Path indexPath) throws IOException {
+        try (FSDirectory indexDirectory = FSDirectory.open(indexPath, NoLockFactory.INSTANCE)) {
+            return DirectoryReader.open(indexDirectory);
+        }
+    }
+
+    /**
+     * @param indexPath path to the index
+     * @return set of deleted uids in the index related to the project name
+     * @throws IOException if the index cannot be read
+     */
+    public static Set<String> getDeletedUids(Path indexPath) throws IOException {
+        Set<String> deletedUids = new HashSet<>();
+
+        try (IndexReader indexReader = getIndexReader(indexPath)) {
+            Bits liveDocs = MultiBits.getLiveDocs(indexReader);
+            if (liveDocs == null) { // the index has no deletions
+                return deletedUids;
+            }
+
+            for (int i = 0; i < indexReader.maxDoc(); i++) {
+                Document doc = indexReader.storedFields().document(i);
+                // This should avoid the special LOC documents.
+                IndexableField field = doc.getField(QueryBuilder.U);
+                if (field != null) {
+                    String uid = field.stringValue();
+
+                    if (!liveDocs.get(i)) {
+                        deletedUids.add(uid);
+                    }
+                }
+            }
+        }
+
+        return deletedUids;
+    }
+
+    /**
+     * @param indexPath path to index
+     * @param projectName project name, can be empty
+     * @return list of live document paths
+     * @throws IOException on I/O error
+     */
+    public static List<String> getLiveDocumentPaths(Path indexPath, String projectName) throws IOException {
+        try (IndexReader indexReader = getIndexReader(indexPath)) {
+            Terms terms = MultiTerms.getTerms(indexReader, QueryBuilder.U);
+            TermsEnum uidIter = terms.iterator();
+            String dir = "/" + projectName;
+            String startUid = Util.path2uid(dir, "");
+            uidIter.seekCeil(new BytesRef(startUid));
+            final BytesRef emptyBR = new BytesRef("");
+            // paths of live (i.e. not deleted) documents. Must be a list so that duplicate documents can be checked.
+            List<String> livePaths = new ArrayList<>();
+            Set<String> deletedUids = getDeletedUids(indexPath);
+
+            while (uidIter != null && uidIter.term() != null && uidIter.term().compareTo(emptyBR) != 0) {
+                String termValue = uidIter.term().utf8ToString();
+                String termPath = Util.uid2url(termValue);
+
+                if (deletedUids.contains(termValue)) {
+                    BytesRef next = uidIter.next();
+                    if (next == null) {
+                        uidIter = null;
+                    }
+                    continue;
+                }
+
+                livePaths.add(termPath);
+
+                BytesRef next = uidIter.next();
+                if (next == null) {
+                    uidIter = null;
+                }
+            }
+
+            return livePaths;
+        }
+    }
+
+    private static void checkDuplicateDocuments(Path indexPath, String projectName)
+            throws IOException, IndexDocumentException {
+
+        LOGGER.log(Level.FINE, "Checking duplicate documents in ''{0}''", indexPath);
+        Statistics stat = new Statistics();
+        List<String> livePaths = getLiveDocumentPaths(indexPath, projectName);
+        HashSet<String> pathSet = new HashSet<>(livePaths);
+        HashMap<String, Integer> fileMap = new HashMap<>();
+        if (pathSet.size() != livePaths.size()) {
+            LOGGER.log(Level.FINE,
+                    "index in ''{0}'' has document path set ({1}) vs document list ({2}) discrepancy",
+                    new Object[]{indexPath, pathSet.size(), livePaths.size()});
+            for (String path : livePaths) {
+                if (pathSet.contains(path)) {
+                    LOGGER.log(Level.FINER, "duplicate path: ''{0}''", path);
+                    fileMap.putIfAbsent(path, 0);
+                    fileMap.put(path, fileMap.get(path) + 1);
+                }
+            }
+
+        }
+        stat.report(LOGGER, Level.FINE, String.format("duplicate check in '%s' done", indexPath));
+        if (!fileMap.isEmpty()) {
+            throw new IndexDocumentException(String.format("index in '%s' contains duplicate live documents",
+                    indexPath), fileMap);
+        }
     }
 }
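
For reference, here is a minimal sketch of how the reworked entry point could be driven from caller code. Only the `IndexCheck.check(Configuration, IndexCheckMode, Collection<String>)` signature comes from this diff; the driver class name, the data root path, and the choice of `DOCUMENTS` mode are illustrative assumptions.

```java
import java.util.Collections;

import org.opengrok.indexer.configuration.Configuration;
import org.opengrok.indexer.index.IndexCheck;
import org.opengrok.indexer.index.IndexCheck.IndexCheckMode;

public class IndexCheckDriver {
    public static void main(String[] args) throws Exception {
        // Hypothetical configuration: point at an existing OpenGrok data root.
        Configuration configuration = new Configuration();
        configuration.setDataRoot("/var/opengrok/data");

        // An empty collection means: check the single index, or every project
        // index if projects are enabled in the configuration.
        boolean ok = IndexCheck.check(configuration, IndexCheckMode.DOCUMENTS, Collections.emptyList());
        System.exit(ok ? 0 : 1);
    }
}
```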
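A second sketch shows `checkDir()` used directly on one per-project index, distinguishing the two failure types this change surfaces. The helper class, its method name, and the `<dataRoot>/index/<project>` layout (implied by `check()` above) are assumptions, not part of the diff.

```java
import java.io.IOException;
import java.nio.file.Path;

import org.opengrok.indexer.index.IndexCheck;

public class SingleIndexCheck {
    // Checks one per-project index directory and reports which kind of problem was found.
    static boolean checkProjectIndex(Path indexRoot, String projectName) throws IOException {
        Path indexPath = indexRoot.resolve(projectName);
        try {
            IndexCheck.checkDir(indexPath, IndexCheck.IndexCheckMode.DOCUMENTS, projectName);
            return true;
        } catch (IndexCheck.IndexVersionException e) {
            System.err.println("index was created by a different major Lucene version: " + e);
        } catch (IndexCheck.IndexDocumentException e) {
            System.err.println("index contains duplicate live documents: " + e);
        }
        return false;
    }
}
```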