
Commit 39b7d42

Commit message: fix conflict in rat exclude
2 parents: c510c8f + 5fa1064

File tree

8 files changed: +431 -176 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -36,3 +36,4 @@ hdf5/

 # JMH generated files
 dependency-reduced-pom.xml
+results.csv

benchmarks-jmh/README.md

Lines changed: 81 additions & 10 deletions
@@ -6,23 +6,25 @@ are mostly targeting scalability and latency aspects.

 1. You can build and then run
 ```shell
-VERSION="4.0.0-beta.6"
+# Get version from pom.xml
+VERSION=$(mvn help:evaluate -Dexpression=revision -q -DforceStdout)
 mvn clean install -DskipTests=true
 java --enable-native-access=ALL-UNNAMED \
 --add-modules=jdk.incubator.vector \
 -XX:+HeapDumpOnOutOfMemoryError \
 -Xmx14G -Djvector.experimental.enable_native_vectorization=true \
--jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}-SNAPSHOT.jar
+-jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}.jar
 ```

 You can add optional JMH arguments from the command line. For example, to run the benchmarks with 4 forks, 5 warmup iterations, 5 measurement iterations, 2 threads, and 10 seconds warmup time per iteration, use the following command:
 ```shell
-VERSION="4.0.0-beta.6"
+# Get version from pom.xml
+VERSION=$(mvn help:evaluate -Dexpression=revision -q -DforceStdout)
 java --enable-native-access=ALL-UNNAMED \
 --add-modules=jdk.incubator.vector \
 -XX:+HeapDumpOnOutOfMemoryError \
 -Xmx14G -Djvector.experimental.enable_native_vectorization=true \
--jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}-SNAPSHOT.jar \
+-jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}.jar \
 -f 4 -wi 5 -i 5 -t 2 -w 10s
 ```
@@ -41,33 +43,102 @@ Common JMH command line options you can use in the configuration or command line

 For example, in the command lines below we run only `IndexConstructionWithRandomSetBenchmark`
 ```shell
-VERSION="4.0.0-beta.6"
+# Get version from pom.xml
+VERSION=$(mvn help:evaluate -Dexpression=revision -q -DforceStdout)
 BENCHMARK_NAME="IndexConstructionWithRandomSetBenchmark"
 mvn clean install -DskipTests=true
 java --enable-native-access=ALL-UNNAMED \
 --add-modules=jdk.incubator.vector \
 -XX:+HeapDumpOnOutOfMemoryError \
 -Xmx20G -Djvector.experimental.enable_native_vectorization=true \
--jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}-SNAPSHOT.jar $BENCHMARK_NAME
+-jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}.jar $BENCHMARK_NAME
 ```

 Same example for the PQ training benchmark
 ```shell
-VERSION="4.0.0-beta.6"
+# Get version from pom.xml
+VERSION=$(mvn help:evaluate -Dexpression=revision -q -DforceStdout)
 BENCHMARK_NAME="PQTrainingWithRandomVectorsBenchmark"
 mvn clean install -DskipTests=true
 java --enable-native-access=ALL-UNNAMED \
 --add-modules=jdk.incubator.vector \
 -XX:+HeapDumpOnOutOfMemoryError \
 -Xmx20G -Djvector.experimental.enable_native_vectorization=true \
--jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}-SNAPSHOT.jar $BENCHMARK_NAME
+-jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}.jar $BENCHMARK_NAME
 ```

 If you want to rerun a specific benchmark without testing the entire grid of scenarios defined in the benchmark,
 you can just do the following to set M and beamWidth:
 ```shell
-VERSION="4.0.0-beta.6"
-java -jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}-SNAPSHOT.jar IndexConstructionWithStaticSetBenchmark -p M=32 -p beamWidth=100
+# Get version from pom.xml
+VERSION=$(mvn help:evaluate -Dexpression=revision -q -DforceStdout)
+java -jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}.jar IndexConstructionWithStaticSetBenchmark -p M=32 -p beamWidth=100
+```
+### Running benchmarks with auxiliary counters
+
+For benchmarks that include auxiliary counters (like `RecallWithRandomVectorsBenchmark`), run with CSV output to capture all metrics:
+
+```shell
+# Get version from pom.xml
+VERSION=$(mvn help:evaluate -Dexpression=revision -q -DforceStdout)
+BENCHMARK_NAME="RecallWithRandomVectorsBenchmark"
+mvn clean install -DskipTests=true
+java --enable-native-access=ALL-UNNAMED \
+--add-modules=jdk.incubator.vector \
+-XX:+HeapDumpOnOutOfMemoryError \
+-Xmx20G -Djvector.experimental.enable_native_vectorization=true \
+-jar benchmarks-jmh/target/benchmarks-jmh-${VERSION}.jar $BENCHMARK_NAME -rf csv -rff results.csv
+```
+
+## Formatting benchmark results
+
+For benchmarks that output auxiliary counters (like recall metrics, visited counts, etc.), you can use the provided Python formatter to create a clean tabular view of the results.
+
+### Setting up the Python environment
+
+First, create a virtual environment and install the required dependencies:
+
+```shell
+# Create a virtual environment
+python3 -m venv .venv
+
+# Activate the virtual environment
+# On macOS/Linux:
+source .venv/bin/activate
+# On Windows:
+# .venv\Scripts\activate
+
+# Install the pandas dependency
+pip install pandas
+```
+
+### Using the results formatter
+
+After running a benchmark with CSV output (using `-rf csv -rff results.csv`), you can format the results:
+
+```shell
+# Make sure your virtual environment is activated
+source .venv/bin/activate
+
+# Run the formatter script (assumes results.csv is in the current directory)
+python benchmarks-jmh/scripts/jmh_results_formatter.py
+```
+
+The formatter will output a clean table showing:
+- **k**: Number of nearest neighbors requested
+- **PQ_Subspaces**: Number of Product Quantization subspaces
+- **Time_ms**: Execution time in milliseconds
+- **Recall**: Average recall score
+- **ReRanked_Count**: Average number of vectors re-ranked
+- **Visited_Count**: Average number of nodes visited during search
+- **Expanded_Count_BaseLayer**: Average number of nodes expanded in the base layer
+
+Example output:
+```
+ k  PQ_Subspaces  Time_ms  Recall  ReRanked_Count  Visited_Count  Expanded_Count_BaseLayer
+50             0   19.283   1.000             0.0         3290.8                     253.7
+50            16    4.137   0.700           250.0         2849.6                     252.1
+50            32    4.531   0.500           250.0         2881.9                     254.2
 ```
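The `help:evaluate` invocation added throughout the README resolves the Maven `revision` property from `pom.xml`. As a sanity check of what that resolves to, here is a stdlib-only Python sketch that reads a `<revision>` property directly; the pom fragment and its structure are illustrative assumptions, not taken from the repository:

```python
import xml.etree.ElementTree as ET

# Maven poms live in this XML namespace; ElementTree requires it in paths.
POM_NS = "{http://maven.apache.org/POM/4.0.0}"

def read_revision(pom_text: str) -> str:
    """Return the <revision> property from a Maven pom.xml document."""
    root = ET.fromstring(pom_text)
    node = root.find(f"{POM_NS}properties/{POM_NS}revision")
    if node is None or node.text is None:
        raise ValueError("no <revision> property found in pom.xml")
    return node.text.strip()

# Illustrative fragment only; the real pom may structure this differently.
sample_pom = """<project xmlns="http://maven.apache.org/POM/4.0.0">
  <properties>
    <revision>4.0.0-beta.6</revision>
  </properties>
</project>"""

print(read_revision(sample_pom))  # prints 4.0.0-beta.6
```

Unlike `mvn help:evaluate`, this avoids JVM start-up cost, at the price of not evaluating properties that Maven itself interpolates.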

benchmarks-jmh/scripts/jmh_results_formatter.py

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+import pandas as pd
+
+# Read the CSV file
+df = pd.read_csv('results.csv')
+
+# Filter for main benchmark results (execution time)
+main_results = df[df['Benchmark'].str.endswith('testOnHeapRandomVectorsWithRecall') &
+                  ~df['Benchmark'].str.contains(':')]
+
+# Filter for all auxiliary counters
+recall_results = df[df['Benchmark'].str.contains(':avgRecall')]
+reranked_results = df[df['Benchmark'].str.contains(':avgReRankedCount')]
+visited_results = df[df['Benchmark'].str.contains(':avgVisitedCount')]
+expanded_results = df[df['Benchmark'].str.contains(':avgExpandedCountBaseLayer')]
+
+# Merge all results on the numberOfPQSubspaces parameter
+summary = main_results.copy()
+
+# Merge recall results
+summary = summary.merge(
+    recall_results[['Param: numberOfPQSubspaces', 'Score']],
+    on='Param: numberOfPQSubspaces',
+    suffixes=('', '_avgRecall'),
+    how='left'
+)
+
+# Merge reranked count results
+summary = summary.merge(
+    reranked_results[['Param: numberOfPQSubspaces', 'Score']],
+    on='Param: numberOfPQSubspaces',
+    suffixes=('', '_avgReRankedCount'),
+    how='left'
+)
+
+# Merge visited count results
+summary = summary.merge(
+    visited_results[['Param: numberOfPQSubspaces', 'Score']],
+    on='Param: numberOfPQSubspaces',
+    suffixes=('', '_avgVisitedCount'),
+    how='left'
+)
+
+# Merge expanded count results
+summary = summary.merge(
+    expanded_results[['Param: numberOfPQSubspaces', 'Score']],
+    on='Param: numberOfPQSubspaces',
+    suffixes=('', '_avgExpandedCountBaseLayer'),
+    how='left'
+)
+
+# Create a clean summary table with all auxiliary counters
+summary_clean = summary[[
+    'Param: k',
+    'Param: numberOfPQSubspaces',
+    'Score',
+    'Score_avgRecall',
+    'Score_avgReRankedCount',
+    'Score_avgVisitedCount',
+    'Score_avgExpandedCountBaseLayer'
+]]
+
+# Rename columns for better readability
+summary_clean.columns = [
+    'k',
+    'PQ_Subspaces',
+    'Time_ms',
+    'Recall',
+    'ReRanked_Count',
+    'Visited_Count',
+    'Expanded_Count_BaseLayer'
+]
+
+# Format numeric columns for better display
+summary_clean['Time_ms'] = summary_clean['Time_ms'].round(3)
+summary_clean['Recall'] = summary_clean['Recall'].round(3)
+summary_clean['ReRanked_Count'] = summary_clean['ReRanked_Count'].round(1)
+summary_clean['Visited_Count'] = summary_clean['Visited_Count'].round(1)
+summary_clean['Expanded_Count_BaseLayer'] = summary_clean['Expanded_Count_BaseLayer'].round(1)
+
+print(summary_clean.to_string(index=False))
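The formatter's filter-merge-rename pipeline can be exercised on a tiny synthetic frame without running JMH at all. The CSV below mimics the layout the script expects (a primary timing row plus `:avgRecall` counter rows, keyed on `Param: numberOfPQSubspaces`); all values are made up for illustration:

```python
import io
import pandas as pd

# Synthetic stand-in for results.csv: primary rows have no ':' in the
# Benchmark name, auxiliary-counter rows carry a ':<counterName>' suffix.
csv = io.StringIO(
    '"Benchmark","Score","Param: k","Param: numberOfPQSubspaces"\n'
    '"b.X.testOnHeapRandomVectorsWithRecall",19.283,50,0\n'
    '"b.X.testOnHeapRandomVectorsWithRecall:avgRecall",1.0,50,0\n'
    '"b.X.testOnHeapRandomVectorsWithRecall",4.137,50,16\n'
    '"b.X.testOnHeapRandomVectorsWithRecall:avgRecall",0.7,50,16\n'
)
df = pd.read_csv(csv)

# Same selection logic the formatter uses.
main = df[df['Benchmark'].str.endswith('testOnHeapRandomVectorsWithRecall')
          & ~df['Benchmark'].str.contains(':')]
recall = df[df['Benchmark'].str.contains(':avgRecall')]

# Merge the counter onto the timing rows; '' keeps the timing Score name,
# '_avgRecall' disambiguates the counter's Score column.
summary = main.merge(
    recall[['Param: numberOfPQSubspaces', 'Score']],
    on='Param: numberOfPQSubspaces',
    suffixes=('', '_avgRecall'),
    how='left',
)[['Param: k', 'Param: numberOfPQSubspaces', 'Score', 'Score_avgRecall']]
summary.columns = ['k', 'PQ_Subspaces', 'Time_ms', 'Recall']

print(summary.to_string(index=False))
```

This is only a sketch of the merge mechanics; the real script additionally joins the re-ranked, visited, and expanded counters the same way.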

benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/IndexConstructionWithRandomSetBenchmark.java

Lines changed: 5 additions & 49 deletions
@@ -56,9 +56,8 @@ public class IndexConstructionWithRandomSetBenchmark {
     private int originalDimension;
     @Param({/*"10000",*/ "100000"/*, "1000000"*/})
     int numBaseVectors;
-
-    @Param({"Exact", "PQ"})
-    String buildScoreProviderType;
+    @Param({"0", "16"})
+    private int numberOfPQSubspaces;

     @Setup(Level.Invocation)
     public void setup() throws IOException {
@@ -71,21 +70,18 @@ public void setup() throws IOException {
         // wrap the raw vectors in a RandomAccessVectorValues
         ravv = new ListRandomAccessVectorValues(baseVectors, originalDimension);

-        if (buildScoreProviderType.equals("PQ")) {
+        if (numberOfPQSubspaces > 0) {
             log.info("Using PQ build score provider with original dimension: {}, M: {}, beam width: {}", originalDimension, M, beamWidth);
-            int numberOfSubspaces = getDefaultNumberOfSubspacesPerVector(originalDimension);
             final ProductQuantization pq = ProductQuantization.compute(ravv,
-                    numberOfSubspaces,
+                    numberOfPQSubspaces,
                     256,
                     true);
             final PQVectors pqVectors = (PQVectors) pq.encodeAll(ravv);
             buildScoreProvider = BuildScoreProvider.pqBuildScoreProvider(VectorSimilarityFunction.EUCLIDEAN, pqVectors);
-        } else if (buildScoreProviderType.equals("Exact")) {
+        } else {
             log.info("Using Exact build score provider with original dimension: {}, M: {}, beam width: {}", originalDimension, M, beamWidth);
             // score provider using the raw, in-memory vectors
             buildScoreProvider = BuildScoreProvider.randomAccessScoreProvider(ravv, VectorSimilarityFunction.EUCLIDEAN);
-        } else {
-            throw new IllegalArgumentException("Unknown build score provider type: " + buildScoreProviderType);
         }

     }
@@ -111,44 +107,4 @@ private VectorFloat<?> createRandomVector(int dimension) {
         }
         return vector;
     }
-
-    /**
-     * This method returns the default number of subspaces per vector for a given original dimension.
-     * Should be used as a default value for the number of subspaces per vector in case no value is provided.
-     *
-     * @param originalDimension original vector dimension
-     * @return default number of subspaces per vector
-     */
-    public static int getDefaultNumberOfSubspacesPerVector(int originalDimension) {
-        // the idea here is that higher dimensions compress well, but not so well that we should use fewer bits
-        // than a lower-dimension vector, which is what you could get with cutoff points to switch between (e.g.)
-        // D*0.5 and D*0.25. Thus, the following ensures that bytes per vector is strictly increasing with D.
-        int compressedBytes;
-        if (originalDimension <= 32) {
-            // We are compressing from 4-byte floats to single-byte codebook indexes,
-            // so this represents compression of 4x
-            // * GloVe-25 needs 25 BPV to achieve good recall
-            compressedBytes = originalDimension;
-        } else if (originalDimension <= 64) {
-            // * GloVe-50 performs fine at 25
-            compressedBytes = 32;
-        } else if (originalDimension <= 200) {
-            // * GloVe-100 and -200 perform well at 50 and 100 BPV, respectively
-            compressedBytes = (int) (originalDimension * 0.5);
-        } else if (originalDimension <= 400) {
-            // * NYTimes-256 actually performs fine at 64 BPV but we'll be conservative
-            // since we don't want BPV to decrease
-            compressedBytes = 100;
-        } else if (originalDimension <= 768) {
-            // allow BPV to increase linearly up to 192
-            compressedBytes = (int) (originalDimension * 0.25);
-        } else if (originalDimension <= 1536) {
-            // * ada002 vectors have good recall even at 192 BPV = compression of 32x
-            compressedBytes = 192;
-        } else {
-            // We have not tested recall with larger vectors than this, let's let it increase linearly
-            compressedBytes = (int) (originalDimension * 0.125);
-        }
-        return compressedBytes;
-    }
 }
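The change above collapses the old string-valued `buildScoreProviderType` switch into a single integer: `0` selects the exact in-memory score provider and any positive value selects PQ with that many subspaces, which also removes the need for the deleted defaulting heuristic. A hypothetical Python sketch of the new dispatch (the provider labels are illustrative stand-ins, not jVector API):

```python
def select_score_provider(number_of_pq_subspaces: int) -> str:
    """Mirror the benchmark's setup() branch: 0 means exact scoring,
    a positive value means PQ with that many subspaces (256 clusters,
    as in the Java code). Labels are stand-ins for the real providers."""
    if number_of_pq_subspaces < 0:
        raise ValueError("numberOfPQSubspaces must be non-negative")
    if number_of_pq_subspaces > 0:
        return f"pq({number_of_pq_subspaces} subspaces, 256 clusters)"
    return "exact"

# The @Param grid {"0", "16"} exercises both branches:
print(select_score_provider(0))   # exact
print(select_score_provider(16))  # pq(16 subspaces, 256 clusters)
```

Folding the subspace count into the parameter grid also lets JMH sweep PQ granularity directly, instead of relying on the removed dimension-based default.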

benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/PQDistanceCalculationBenchmark.java

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ public class PQDistanceCalculationBenchmark {
     private ProductQuantization pq;
     private BuildScoreProvider buildScoreProvider;

-    @Param({"768"})
+    @Param({"1536"})
     private int dimension;

     @Param({"10000"})
