Skip to content

Commit 0249797

Browse files
committed
KNN benchy: include numDocs in indexPath so benchy reindexes when requested numDocs changes; remove obsolete workaround for bug that returns NO_MORE_DOCS as a Lucene docid; fix typo in generated nightly benchy HTML; add TODOs
1 parent 08a04e6 commit 0249797

File tree

3 files changed

+13
-11
lines changed

3 files changed

+13
-11
lines changed

src/main/knn/KnnGraphTester.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ private void run(String... args) throws Exception {
466466
throw new IllegalArgumentException("-prefilter requires filterSelectivity between 0 and 1");
467467
}
468468
if (indexPath == null) {
469-
indexPath = Paths.get(formatIndexPath(docVectorsPath)); // derive index path
469+
indexPath = Paths.get(formatIndexPath(docVectorsPath, numDocs)); // derive index path
470470
log("Index Path = %s\n", indexPath);
471471
}
472472
if (parentJoin && reindex == false && isParentJoinIndex(indexPath) == false) {
@@ -665,7 +665,8 @@ private static Query generateRandomQuery(Random random, Path indexPath, int size
665665
}
666666
}
667667

668-
private String formatIndexPath(Path docsPath) {
668+
private String formatIndexPath(Path docsPath, int numDocs) {
669+
// TODO: shouldn't this use the same hashing that we use when saving exact results to cache file?
669670
List<String> suffix = new ArrayList<>();
670671
if (indexType == IndexType.FLAT) {
671672
suffix.add("flat");
@@ -685,6 +686,8 @@ private String formatIndexPath(Path docsPath) {
685686
if (parentJoin) {
686687
suffix.add("parentJoin");
687688
}
689+
// make sure we reindex if numDocs has changed:
690+
suffix.add(Integer.toString(numDocs));
688691
return INDEX_DIR + "/" + docsPath.getFileName() + "-" + String.join("-", suffix) + ".index";
689692
}
690693

src/main/knn/KnnTesterUtils.java

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,14 @@ public class KnnTesterUtils {
3232
public static int[] getResultIds(TopDocs topDocs, StoredFields storedFields) throws IOException {
3333
int[] resultIds = new int[topDocs.scoreDocs.length];
3434
int i = 0;
35+
// TODO: switch to doc values for this id field? more efficient than stored fields
36+
// TODO: or, at least load the stored documents in index (Lucene docid) order to
37+
// amortize cost of decompressing each stored doc block (hmm, though, this cost/time
38+
// is not included in the reported benchy results... this is called after all KNN
39+
// queries have run)
3540
for (ScoreDoc doc : topDocs.scoreDocs) {
36-
if (doc.doc != NO_MORE_DOCS) {
37-
// there is a bug somewhere that can result in doc=NO_MORE_DOCS! I think it happens
38-
// in some degenerate case (like input query has NaN in it?) that causes no results to
39-
// be returned from HNSW search?
40-
resultIds[i++] = Integer.parseInt(storedFields.document(doc.doc).get(KnnGraphTester.ID_FIELD));
41-
} else {
42-
System.out.println("NO_MORE_DOCS!");
43-
}
41+
assert doc.doc != NO_MORE_DOCS: "illegal docid " + doc.doc + " returned from KNN search?";
42+
resultIds[i++] = Integer.parseInt(storedFields.document(doc.doc).get(KnnGraphTester.ID_FIELD));
4443
}
4544
return resultIds;
4645
}

src/python/runNightlyKnn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ def write_graph():
292292
</style>
293293
294294
<div id="summary" style="height:17%%; width:95%%">
295-
This benchmark indexes and searches Cohere 768 dimensin vectors from https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings.
295+
This benchmark indexes 8.0M and searches Cohere 768 dimension vectors from https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings.
296296
</div>
297297
""")
298298

0 commit comments

Comments
 (0)