Skip to content

Commit 7577736

Browse files
authored
add documents argument to --indexCheck (#4191)
1 parent 0324677 commit 7577736

File tree

6 files changed

+259
-141
lines changed

6 files changed

+259
-141
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexCheck.java

Lines changed: 200 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,27 +18,47 @@
1818
*/
1919

2020
/*
21-
* Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
21+
* Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved.
2222
* Portions Copyright (c) 2018, Chris Fraire <[email protected]>.
2323
*/
2424
package org.opengrok.indexer.index;
2525

26-
import java.io.File;
2726
import java.io.IOException;
27+
import java.nio.file.Path;
28+
import java.util.ArrayList;
2829
import java.util.Collection;
30+
import java.util.HashMap;
31+
import java.util.HashSet;
32+
import java.util.List;
33+
import java.util.Map;
34+
import java.util.Set;
2935
import java.util.logging.Level;
3036
import java.util.logging.Logger;
3137

38+
import org.apache.lucene.document.Document;
39+
import org.apache.lucene.index.DirectoryReader;
3240
import org.apache.lucene.index.IndexNotFoundException;
41+
import org.apache.lucene.index.IndexReader;
42+
import org.apache.lucene.index.IndexableField;
43+
import org.apache.lucene.index.MultiBits;
44+
import org.apache.lucene.index.MultiTerms;
3345
import org.apache.lucene.index.SegmentInfos;
46+
import org.apache.lucene.index.Terms;
47+
import org.apache.lucene.index.TermsEnum;
3448
import org.apache.lucene.store.Directory;
3549
import org.apache.lucene.store.FSDirectory;
3650
import org.apache.lucene.store.LockFactory;
3751
import org.apache.lucene.store.NativeFSLockFactory;
52+
import org.apache.lucene.store.NoLockFactory;
53+
import org.apache.lucene.util.Bits;
54+
import org.apache.lucene.util.BytesRef;
3855
import org.apache.lucene.util.Version;
3956
import org.jetbrains.annotations.NotNull;
4057
import org.opengrok.indexer.configuration.Configuration;
4158
import org.opengrok.indexer.logger.LoggerFactory;
59+
import org.opengrok.indexer.search.QueryBuilder;
60+
import org.opengrok.indexer.util.Statistics;
61+
import org.opengrok.indexer.web.Util;
4262

4363
/**
4464
* Index checker.
@@ -48,6 +68,15 @@
4868
public class IndexCheck {
4969
private static final Logger LOGGER = LoggerFactory.getLogger(IndexCheck.class);
5070

71+
/**
 * Modes of index check, declared from the least to the most extensive one.
 * The declaration order is significant: the mode ordinals are compared
 * when deciding whether the more extensive checks should be run.
 */
public enum IndexCheckMode {
    /** No check is performed. */
    NO_CHECK,
    /** The index version is compared to the Lucene version. */
    VERSION,
    /** On top of the version check, the index is checked for duplicate live documents. */
    DOCUMENTS
}
79+
5180
/**
5281
* Exception thrown when index version does not match Lucene version.
5382
*/
@@ -72,84 +101,227 @@ public String toString() {
72101
}
73102
}
74103

104+
/**
105+
* Exception thrown when index contains duplicate live documents.
106+
*/
107+
public static class IndexDocumentException extends Exception {
108+
private static final long serialVersionUID = 5693446916108385595L;
109+
110+
private final Map<String, Integer> fileMap;
111+
112+
public IndexDocumentException(String s, Map<String, Integer> fileMap) {
113+
super(s);
114+
115+
this.fileMap = fileMap;
116+
}
117+
118+
@Override
119+
public String toString() {
120+
return getMessage() + ": " + fileMap;
121+
}
122+
}
123+
75124
private IndexCheck() {
76125
// utility class
77126
}
78127

79128
/**
80-
* Check if version of index(es) matches major Lucene version.
129+
* Check index(es).
81130
* @param configuration configuration based on which to perform the check
82-
* @param subFilesList collection of paths. If non-empty, only projects matching these paths will be checked.
131+
* @param mode index check mode
132+
* @param projectNames collection of project names. If non-empty, only projects matching these names will be checked.
133+
* Otherwise, either the sole index or all project indexes will be checked, depending
134+
* on whether projects are enabled in the configuration.
83135
* @return true on success, false on failure
84136
*/
85-
public static boolean check(@NotNull Configuration configuration, Collection<String> subFilesList) {
86-
File indexRoot = new File(configuration.getDataRoot(), IndexDatabase.INDEX_DIR);
87-
LOGGER.log(Level.FINE, "Checking for Lucene index version mismatch in {0}", indexRoot);
137+
public static boolean check(@NotNull Configuration configuration, IndexCheckMode mode,
138+
Collection<String> projectNames) {
139+
140+
if (mode.equals(IndexCheckMode.NO_CHECK)) {
141+
LOGGER.log(Level.WARNING, "no index check mode selected");
142+
return true;
143+
}
144+
145+
Path indexRoot = Path.of(configuration.getDataRoot(), IndexDatabase.INDEX_DIR);
88146
int ret = 0;
89147

90-
if (!subFilesList.isEmpty()) {
148+
if (!projectNames.isEmpty()) {
91149
// Assumes projects are enabled.
92-
for (String projectName : subFilesList) {
93-
LOGGER.log(Level.FINER,
94-
"Checking Lucene index version in project {0}",
95-
projectName);
96-
ret |= checkDirNoExceptions(new File(indexRoot, projectName));
150+
for (String projectName : projectNames) {
151+
ret |= checkDirNoExceptions(Path.of(indexRoot.toString(), projectName), mode, projectName);
97152
}
98153
} else {
99154
if (configuration.isProjectsEnabled()) {
100155
for (String projectName : configuration.getProjects().keySet()) {
101-
LOGGER.log(Level.FINER,
102-
"Checking Lucene index version in project {0}",
103-
projectName);
104-
ret |= checkDirNoExceptions(new File(indexRoot, projectName));
156+
ret |= checkDirNoExceptions(Path.of(indexRoot.toString(), projectName), mode, projectName);
105157
}
106158
} else {
107-
LOGGER.log(Level.FINER, "Checking Lucene index version in {0}",
108-
indexRoot);
109-
ret |= checkDirNoExceptions(indexRoot);
159+
ret |= checkDirNoExceptions(indexRoot, mode, "");
110160
}
111161
}
112162

113163
return ret == 0;
114164
}
115165

116-
private static int checkDirNoExceptions(File dir) {
166+
/**
167+
* @param indexPath directory with index
168+
* @return 0 on success, 1 on failure
169+
*/
170+
private static int checkDirNoExceptions(Path indexPath, IndexCheckMode mode, String projectName) {
117171
try {
118-
checkDir(dir);
172+
LOGGER.log(Level.INFO, "Checking index in ''{0}''", indexPath);
173+
checkDir(indexPath, mode, projectName);
119174
} catch (Exception e) {
120-
LOGGER.log(Level.WARNING, "Index check for directory " + dir + " failed", e);
175+
LOGGER.log(Level.WARNING, String.format("Index check for directory '%s' failed", indexPath), e);
121176
return 1;
122177
}
123178

179+
LOGGER.log(Level.INFO, "Index check for directory ''{0}'' passed", indexPath);
124180
return 0;
125181
}
126182

127183
/**
128184
* Check index in given directory. It assumes that all commits (if any)
129185
* in the Lucene segment file were done with the same version.
130186
*
131-
* @param dir directory with index
187+
* @param indexPath directory with index to check
188+
* @param mode index check mode
189+
* @param projectName name of the project, can be empty
132190
* @throws IOException if the directory cannot be opened
133191
* @throws IndexVersionException if the version of the index does not match Lucene index version
134192
*/
135-
public static void checkDir(File dir) throws IndexVersionException, IOException {
193+
public static void checkDir(Path indexPath, IndexCheckMode mode, String projectName)
194+
throws IndexVersionException, IndexDocumentException, IOException {
195+
136196
LockFactory lockFactory = NativeFSLockFactory.INSTANCE;
137197
int segVersion;
138198

139-
try (Directory indexDirectory = FSDirectory.open(dir.toPath(), lockFactory)) {
199+
try (Directory indexDirectory = FSDirectory.open(indexPath, lockFactory)) {
140200
try {
141201
SegmentInfos segInfos = SegmentInfos.readLatestCommit(indexDirectory);
142202
segVersion = segInfos.getIndexCreatedVersionMajor();
143203
} catch (IndexNotFoundException e) {
144-
LOGGER.log(Level.FINE, "no index found in ''{0}''", indexDirectory);
204+
LOGGER.log(Level.WARNING, "no index found in ''{0}''", indexDirectory);
145205
return;
146206
}
147207
}
148208

209+
LOGGER.log(Level.FINE, "Checking index version in ''{0}''", indexPath);
149210
if (segVersion != Version.LATEST.major) {
150211
throw new IndexVersionException(
151-
String.format("Directory %s has index version discrepancy", dir),
212+
String.format("Directory '%s' has index version discrepancy", indexPath),
152213
Version.LATEST.major, segVersion);
153214
}
215+
216+
if (mode.ordinal() >= IndexCheckMode.DOCUMENTS.ordinal()) {
217+
checkDuplicateDocuments(indexPath, projectName);
218+
}
219+
}
220+
221+
public static IndexReader getIndexReader(Path indexPath) throws IOException {
222+
try (FSDirectory indexDirectory = FSDirectory.open(indexPath, NoLockFactory.INSTANCE)) {
223+
return DirectoryReader.open(indexDirectory);
224+
}
225+
}
226+
227+
/**
228+
* @param indexPath path to the index
229+
* @return set of deleted uids in the index related to the project name
230+
* @throws IOException if the index cannot be read
231+
*/
232+
public static Set<String> getDeletedUids(Path indexPath) throws IOException {
233+
Set<String> deletedUids = new HashSet<>();
234+
235+
try (IndexReader indexReader = getIndexReader(indexPath)) {
236+
Bits liveDocs = MultiBits.getLiveDocs(indexReader);
237+
if (liveDocs == null) { // the index has no deletions
238+
return deletedUids;
239+
}
240+
241+
for (int i = 0; i < indexReader.maxDoc(); i++) {
242+
Document doc = indexReader.storedFields().document(i);
243+
// This should avoid the special LOC documents.
244+
IndexableField field = doc.getField(QueryBuilder.U);
245+
if (field != null) {
246+
String uid = field.stringValue();
247+
248+
if (!liveDocs.get(i)) {
249+
deletedUids.add(uid);
250+
}
251+
}
252+
}
253+
}
254+
255+
return deletedUids;
256+
}
257+
258+
/**
259+
* @param indexPath path to index
260+
* @param projectName project name, can be empty
261+
* @return list of live document paths
262+
* @throws IOException on I/O error
263+
*/
264+
public static List<String> getLiveDocumentPaths(Path indexPath, String projectName) throws IOException {
265+
try (IndexReader indexReader = getIndexReader(indexPath)) {
266+
Terms terms = MultiTerms.getTerms(indexReader, QueryBuilder.U);
267+
TermsEnum uidIter = terms.iterator();
268+
String dir = "/" + projectName;
269+
String startUid = Util.path2uid(dir, "");
270+
uidIter.seekCeil(new BytesRef(startUid));
271+
final BytesRef emptyBR = new BytesRef("");
272+
// paths of live (i.e. not deleted) documents. Must be a list so that duplicate documents can be checked.
273+
List<String> livePaths = new ArrayList<>();
274+
Set<String> deletedUids = getDeletedUids(indexPath);
275+
276+
while (uidIter != null && uidIter.term() != null && uidIter.term().compareTo(emptyBR) != 0) {
277+
String termValue = uidIter.term().utf8ToString();
278+
String termPath = Util.uid2url(termValue);
279+
280+
if (deletedUids.contains(termValue)) {
281+
BytesRef next = uidIter.next();
282+
if (next == null) {
283+
uidIter = null;
284+
}
285+
continue;
286+
}
287+
288+
livePaths.add(termPath);
289+
290+
BytesRef next = uidIter.next();
291+
if (next == null) {
292+
uidIter = null;
293+
}
294+
}
295+
296+
return livePaths;
297+
}
298+
}
299+
300+
private static void checkDuplicateDocuments(Path indexPath, String projectName)
301+
throws IOException, IndexDocumentException {
302+
303+
LOGGER.log(Level.FINE, "Checking duplicate documents in ''{0}''", indexPath);
304+
Statistics stat = new Statistics();
305+
List<String> livePaths = getLiveDocumentPaths(indexPath, projectName);
306+
HashSet<String> pathSet = new HashSet<>(livePaths);
307+
HashMap<String, Integer> fileMap = new HashMap<>();
308+
if (pathSet.size() != livePaths.size()) {
309+
LOGGER.log(Level.FINE,
310+
"index in ''{0}'' has document path set ({1}) vs document list ({2}) discrepancy",
311+
new Object[]{indexPath, pathSet.size(), livePaths.size()});
312+
for (String path : livePaths) {
313+
if (pathSet.contains(path)) {
314+
LOGGER.log(Level.FINER, "duplicate path: ''{0}''", path);
315+
fileMap.putIfAbsent(path, 0);
316+
fileMap.put(path, fileMap.get(path) + 1);
317+
}
318+
}
319+
320+
}
321+
stat.report(LOGGER, Level.FINE, String.format("duplicate check in '%s' done", indexPath));
322+
if (!fileMap.isEmpty()) {
323+
throw new IndexDocumentException(String.format("index in '%s' contains duplicate live documents",
324+
indexPath), fileMap);
325+
}
154326
}
155327
}

opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ private void setupDeletedUids() throws IOException {
802802
}
803803

804804
private void logIgnoredUid(String uid) {
805-
LOGGER.log(Level.FINEST, "ignoring deleted document for {0} at {1}",
805+
LOGGER.log(Level.FINEST, "ignoring deleted document for ''{0}'' at {1}",
806806
new Object[]{Util.uid2url(uid), Util.uid2date(uid)});
807807
}
808808

0 commit comments

Comments
 (0)