diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphIndexBuilder.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphIndexBuilder.java index dfad371d0..690d1049f 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphIndexBuilder.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphIndexBuilder.java @@ -325,7 +325,7 @@ public GraphIndexBuilder(BuildScoreProvider scoreProvider, this.simdExecutor = simdExecutor; this.parallelExecutor = parallelExecutor; - this.graph = new OnHeapGraphIndex(maxDegrees, dimension, neighborOverflow, new VamanaDiversityProvider(scoreProvider, alpha)); + this.graph = new OnHeapGraphIndex(maxDegrees, dimension, neighborOverflow, new VamanaDiversityProvider(scoreProvider, alpha), addHierarchy); this.searchers = ExplicitThreadLocal.withInitial(() -> { var gs = new GraphSearcher(graph); @@ -349,14 +349,12 @@ public GraphIndexBuilder(BuildScoreProvider scoreProvider, * @param beamWidth the width of the beam used during the graph building process. * @param neighborOverflow the factor determining how many additional neighbors are allowed beyond the configured limit. * @param alpha the weight factor for balancing score computations. - * @param addHierarchy whether to add hierarchical structures while building the graph. * @param refineFinalGraph whether to perform a refinement step on the final graph structure. * @param simdExecutor the ForkJoinPool executor used for SIMD tasks during graph building. * @param parallelExecutor the ForkJoinPool executor used for general parallelization during graph building. - * - * @throws IOException if an I/O error occurs during the graph loading or conversion process. */ - private GraphIndexBuilder(BuildScoreProvider buildScoreProvider, int dimension, MutableGraphIndex mutableGraphIndex, int beamWidth, float neighborOverflow, float alpha, boolean addHierarchy, boolean refineFinalGraph, ForkJoinPool simdExecutor, ForkJoinPool parallelExecutor) { + @Experimental + public GraphIndexBuilder(BuildScoreProvider buildScoreProvider, int dimension, MutableGraphIndex mutableGraphIndex, int beamWidth, float neighborOverflow, float alpha, boolean refineFinalGraph, ForkJoinPool simdExecutor, ForkJoinPool parallelExecutor) { if (beamWidth <= 0) { throw new IllegalArgumentException("beamWidth must be positive"); } @@ -371,7 +369,7 @@ private GraphIndexBuilder(BuildScoreProvider buildScoreProvider, int dimension, this.neighborOverflow = neighborOverflow; this.dimension = dimension; this.alpha = alpha; - this.addHierarchy = addHierarchy; + this.addHierarchy = mutableGraphIndex.isHierarchical(); this.refineFinalGraph = refineFinalGraph; this.beamWidth = beamWidth; this.simdExecutor = simdExecutor; @@ -981,8 +979,6 @@ private void loadV3(RandomAccessReader in, int size) throws IOException { * @param beamWidth the width of the beam used during the graph building process. * @param overflowRatio the ratio of extra neighbors to allow temporarily when inserting a node. * @param alpha the weight factor for balancing score computations. - * @param addHierarchy whether to add hierarchical structures while building the graph. - * * @return the in-memory representation of the graph index. * @throws IOException if an I/O error occurs during the graph loading or conversion process. */ @@ -993,10 +989,9 @@ public static ImmutableGraphIndex buildAndMergeNewNodes(RandomAccessReader in, int startingNodeOffset, int beamWidth, float overflowRatio, - float alpha, - boolean addHierarchy) throws IOException { + float alpha) throws IOException { - return buildAndMergeNewNodes(in, newVectors, buildScoreProvider, startingNodeOffset, beamWidth, overflowRatio, alpha, addHierarchy, PhysicalCoreExecutor.pool(), ForkJoinPool.commonPool()); + return buildAndMergeNewNodes(in, newVectors, buildScoreProvider, startingNodeOffset, beamWidth, overflowRatio, alpha, PhysicalCoreExecutor.pool(), ForkJoinPool.commonPool()); } /** @@ -1010,7 +1005,6 @@ public static ImmutableGraphIndex buildAndMergeNewNodes(RandomAccessReader in, * @param beamWidth the width of the beam used during the graph building process. * @param overflowRatio the ratio of extra neighbors to allow temporarily when inserting a node. * @param alpha the weight factor for balancing score computations. - * @param addHierarchy whether to add hierarchical structures while building the graph. * @param simdExecutor the ForkJoinPool executor used for SIMD tasks during graph building. * @param parallelExecutor the ForkJoinPool executor used for general parallelization during graph building. * @@ -1025,7 +1019,6 @@ public static ImmutableGraphIndex buildAndMergeNewNodes(RandomAccessReader in, int beamWidth, float overflowRatio, float alpha, - boolean addHierarchy, ForkJoinPool simdExecutor, ForkJoinPool parallelExecutor) throws IOException { @@ -1040,7 +1033,6 @@ public static ImmutableGraphIndex buildAndMergeNewNodes(RandomAccessReader in, beamWidth, overflowRatio, alpha, - addHierarchy, true, simdExecutor, parallelExecutor diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/ImmutableGraphIndex.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/ImmutableGraphIndex.java index 0fc6d27f8..54c11af05 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/ImmutableGraphIndex.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/ImmutableGraphIndex.java @@ -104,6 +104,15 @@ default boolean containsNode(int nodeId) { @Override void close() throws IOException; + + /** + * Returns true if this graph is hierarchical, false otherwise. + * Note that a graph can be hierarchical even if it has a single layer, i.e., getMaxLevel() == 0. + * For example, while building a new hierarchical graph, we may temporarily only have nodes at level 0 + * because of the random assignment of nodes to levels. + */ + boolean isHierarchical(); + /** * @return The maximum (coarser) level that contains a vector in the graph. */ diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/OnHeapGraphIndex.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/OnHeapGraphIndex.java index 711304f79..1f9cb3b32 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/OnHeapGraphIndex.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/OnHeapGraphIndex.java @@ -80,7 +80,9 @@ public class OnHeapGraphIndex implements MutableGraphIndex { private volatile boolean allMutationsCompleted = false; - OnHeapGraphIndex(List maxDegrees, int dimension, double overflowRatio, DiversityProvider diversityProvider) { + private final boolean isHierarchical; + + OnHeapGraphIndex(List maxDegrees, int dimension, double overflowRatio, DiversityProvider diversityProvider, boolean isHierarchical) { this.overflowRatio = overflowRatio; this.maxDegrees = new IntArrayList(); this.dimension = dimension; @@ -94,6 +96,7 @@ public class OnHeapGraphIndex implements MutableGraphIndex { getDegree(0), (int) (getDegree(0) * overflowRatio)) ); + this.isHierarchical = isHierarchical; } /** @@ -128,6 +131,11 @@ public NodesIterator getNeighborsIterator(int level, int node) { } } + @Override + public boolean isHierarchical() { + return isHierarchical; + } + @Override public int getMaxLevelForNode(int node) { int maxLayer = -1; @@ -568,7 +576,8 @@ public static OnHeapGraphIndex load(RandomAccessReader in, int dimension, double int entryNode = in.readInt(); - var graph = new OnHeapGraphIndex(layerDegrees, dimension, overflowRatio, diversityProvider); + boolean isHierarchical = layerCount > 1; + var graph = new OnHeapGraphIndex(layerDegrees, dimension, overflowRatio, diversityProvider, isHierarchical); Map nodeLevelMap = new HashMap<>(); diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/OnDiskGraphIndex.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/OnDiskGraphIndex.java index 8f18ffcf4..e4699f0ea 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/OnDiskGraphIndex.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/disk/OnDiskGraphIndex.java @@ -227,6 +227,11 @@ public Set getFeatureSet() { return features.keySet(); } + @Override + public boolean isHierarchical() { + return layerInfo.size() > 1; + } + @Override public int getDimension() { return dimension; diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/TestUtil.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/TestUtil.java index 05a3f7195..d9f399936 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/TestUtil.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/TestUtil.java @@ -249,6 +249,11 @@ public FullyConnectedGraphIndex(int entryNode, List layerSizes) { this.layerSizes = layerSizes; } + @Override + public boolean isHierarchical() { + return layerSizes.size() > 1; + } + @Override public int size(int level) { return layerSizes.get(level); @@ -388,6 +393,11 @@ public RandomlyConnectedGraphIndex(int size, int M, Random random) { this(List.of(new CommonHeader.LayerInfo(size, M)), random); } + @Override + public boolean isHierarchical() { + return layerInfo.size() > 1; + } + @Override public int getMaxLevel() { return layerInfo.size() - 1; diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/graph/OnHeapGraphIndexTest.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/graph/OnHeapGraphIndexTest.java index 13706a482..261e2b1a9 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/graph/OnHeapGraphIndexTest.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/graph/OnHeapGraphIndexTest.java @@ -264,7 +264,7 @@ public void testIncrementalInsertionFromOnDiskIndex_withIdentityOrdinalMapping() // We will create a trivial 1:1 mapping between the new graph and the ravv final int[] graphToRavvOrdMap = IntStream.range(0, allVectorsRavv.size()).toArray(); final RemappedRandomAccessVectorValues remappedAllVectorsRavv = new RemappedRandomAccessVectorValues(allVectorsRavv, graphToRavvOrdMap); - ImmutableGraphIndex reconstructedAllNodeOnHeapGraphIndex = GraphIndexBuilder.buildAndMergeNewNodes(readerSupplier.get(), remappedAllVectorsRavv, allBuildScoreProvider, NUM_BASE_VECTORS, BEAM_WIDTH, NEIGHBOR_OVERFLOW, ALPHA, ADD_HIERARCHY); + ImmutableGraphIndex reconstructedAllNodeOnHeapGraphIndex = GraphIndexBuilder.buildAndMergeNewNodes(readerSupplier.get(), remappedAllVectorsRavv, allBuildScoreProvider, NUM_BASE_VECTORS, BEAM_WIDTH, NEIGHBOR_OVERFLOW, ALPHA); // Verify that the recall is similar across multiple queries // Note: Incremental insertion can have slightly different recall than bulk indexing due to the order of insertions @@ -313,8 +313,7 @@ public void testIncrementalInsertionFromOnDiskIndex_withNonIdentityOrdinalMappin final int[] allGraphToRavvOrdMap = IntStream.range(0, allVectorsRavv.size()).map(i -> allVectorsRavv.size() - 1 - i).toArray(); final RemappedRandomAccessVectorValues remappedAllVectorsRavv = new RemappedRandomAccessVectorValues(allVectorsRavv, allGraphToRavvOrdMap); var allBsp = BuildScoreProvider.randomAccessScoreProvider(remappedAllVectorsRavv, SIMILARITY_FUNCTION); - - ImmutableGraphIndex reconstructedAllNodeOnHeapGraphIndex = GraphIndexBuilder.buildAndMergeNewNodes(readerSupplier.get(), remappedAllVectorsRavv, allBsp, NUM_BASE_VECTORS, BEAM_WIDTH, NEIGHBOR_OVERFLOW, ALPHA, ADD_HIERARCHY); + ImmutableGraphIndex reconstructedAllNodeOnHeapGraphIndex = GraphIndexBuilder.buildAndMergeNewNodes(readerSupplier.get(), remappedAllVectorsRavv, allBsp, NUM_BASE_VECTORS, BEAM_WIDTH, NEIGHBOR_OVERFLOW, ALPHA); // Verify that the recall is similar across multiple queries // Note: Non-identity mapping can have slightly lower recall due to the complexity of merging with remapped ordinals