Add float-bit panama implementation

thecoop · thecoop · commit 987dd5eb6d72 · 2025-03-12T16:44:10.000Z
diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/vector/DistanceFunctionBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/vector/DistanceFunctionBenchmark.java
@@ -148,11 +148,11 @@ public void findBenchmarkImpl() {
 
         float[] floatDocVector = new float[dims];
         byte[] byteDocVector = new byte[dims];
-        byte[] bitDocVector = new byte[dims/8];
+        byte[] bitDocVector = new byte[dims / 8];
 
         float[] floatQueryVector = new float[dims];
         byte[] byteQueryVector = new byte[dims];
-        byte[] bitQueryVector = new byte[dims/8];
+        byte[] bitQueryVector = new byte[dims / 8];
 
         r.nextBytes(byteDocVector);
         r.nextBytes(bitDocVector);
diff --git a/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/DefaultESVectorUtilSupport.java b/libs/simdvec/src/main/java/org/elasticsearch/simdvec/internal/vectorization/DefaultESVectorUtilSupport.java
@@ -69,13 +69,17 @@ public static int ipByteBitImpl(byte[] q, byte[] d) {
     }
 
     public static float ipFloatBitImpl(float[] q, byte[] d) {
+        return ipFloatBitImpl(q, d, 0); // start == 0: iterate over every byte of d
+    }
+
+    static float ipFloatBitImpl(float[] q, byte[] d, int start) {
         assert q.length == d.length * Byte.SIZE;
         float acc0 = 0;
         float acc1 = 0;
         float acc2 = 0;
         float acc3 = 0;
         // now combine the two vectors, summing the byte dimensions where the bit in d is `1`
-        for (int i = 0; i < d.length; i++) {
+        for (int i = start; i < d.length; i++) {
             byte mask = d[i];
             acc0 = fma(q[i * Byte.SIZE + 0], (mask >> 7) & 1, acc0);
             acc1 = fma(q[i * Byte.SIZE + 1], (mask >> 6) & 1, acc1);
diff --git a/libs/simdvec/src/main21/java/org/elasticsearch/simdvec/internal/vectorization/PanamaESVectorUtilSupport.java b/libs/simdvec/src/main21/java/org/elasticsearch/simdvec/internal/vectorization/PanamaESVectorUtilSupport.java
@@ -10,8 +10,10 @@
 package org.elasticsearch.simdvec.internal.vectorization;
 
 import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.FloatVector;
 import jdk.incubator.vector.IntVector;
 import jdk.incubator.vector.LongVector;
+import jdk.incubator.vector.VectorMask;
 import jdk.incubator.vector.VectorOperators;
 import jdk.incubator.vector.VectorShape;
 import jdk.incubator.vector.VectorSpecies;
@@ -55,6 +57,13 @@ public int ipByteBit(byte[] q, byte[] d) {
 
     @Override
     public float ipFloatBit(float[] q, byte[] d) {
+        // the vector kernels are only chosen when q has 16+ floats; everything else uses the default implementation
+        final boolean vectorizable = q.length >= 16;
+        if (vectorizable && VECTOR_BITSIZE >= 512) {
+            return ipFloatBit512(q, d);
+        } else if (vectorizable && VECTOR_BITSIZE == 256) {
+            return ipFloatBit256(q, d);
+        }
         return DefaultESVectorUtilSupport.ipFloatBitImpl(q, d);
     }
 
@@ -165,4 +174,56 @@ public static long ipByteBin128(byte[] q, byte[] d) {
         }
         return subRet0 + (subRet1 << 1) + (subRet2 << 2) + (subRet3 << 3);
     }
+
+    private static final VectorSpecies<Float> FLOAT_SPECIES_8 = FloatVector.SPECIES_256; // 256 bits = 8 float lanes
+    private static final VectorSpecies<Float> FLOAT_SPECIES_16 = FloatVector.SPECIES_512; // 512 bits = 16 float lanes
+
+    private static long reverse(byte b) {
+        // reverses the bits of b, result in [0, 255]; see https://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
+        return ((((b & 0xff) * 0x80200802L) & 0x0884422110L) * 0x0101010101L >> 32) & 0xff;
+    }
+
+    /**
+     * Sums the entries of {@code q} whose bit in {@code d} is set, 16 floats (two bytes of {@code d}) per step.
+     * Bit 7 of {@code d[k]} selects {@code q[8 * k]}; requires {@code q.length == 8 * d.length}.
+     */
+    static float ipFloatBit512(float[] q, byte[] d) {
+        assert q.length == d.length * Byte.SIZE;
+        FloatVector acc = FloatVector.zero(FLOAT_SPECIES_16);
+        int i = 0;
+        for (; i < FLOAT_SPECIES_16.loopBound(q.length); i += FLOAT_SPECIES_16.length()) {
+            FloatVector floats = FloatVector.fromArray(FLOAT_SPECIES_16, q, i);
+            // bytes are MSB-first but mask bit 0 is lane 0, so reverse each byte; the second byte covers lanes 8-15
+            long maskBits = reverse(d[i / Byte.SIZE]) | (reverse(d[i / Byte.SIZE + 1]) << Byte.SIZE);
+            acc = acc.add(floats, VectorMask.fromLong(FLOAT_SPECIES_16, maskBits));
+        }
+        float sum = acc.reduceLanes(VectorOperators.ADD);
+        if (i < q.length) {
+            // tail: the scalar helper's start is a byte index into d, so convert from the float index
+            sum += DefaultESVectorUtilSupport.ipFloatBitImpl(q, d, i / Byte.SIZE);
+        }
+        return sum;
+    }
+
+    /**
+     * Sums the entries of {@code q} whose bit in {@code d} is set, 8 floats (one byte of {@code d}) per step.
+     * Bit 7 of {@code d[k]} selects {@code q[8 * k]}; requires {@code q.length == 8 * d.length}.
+     */
+    static float ipFloatBit256(float[] q, byte[] d) {
+        assert q.length == d.length * Byte.SIZE;
+        FloatVector acc = FloatVector.zero(FLOAT_SPECIES_8);
+        int i = 0;
+        for (; i < FLOAT_SPECIES_8.loopBound(q.length); i += FLOAT_SPECIES_8.length()) {
+            FloatVector floats = FloatVector.fromArray(FLOAT_SPECIES_8, q, i);
+            // bytes are MSB-first but mask bit 0 is lane 0, so reverse the byte before using it as the lane mask
+            long maskBits = reverse(d[i / Byte.SIZE]);
+            acc = acc.add(floats, VectorMask.fromLong(FLOAT_SPECIES_8, maskBits));
+        }
+        float sum = acc.reduceLanes(VectorOperators.ADD);
+        if (i < q.length) {
+            // tail: the scalar helper's start is a byte index into d, so convert from the float index
+            sum += DefaultESVectorUtilSupport.ipFloatBitImpl(q, d, i / Byte.SIZE);
+        }
+        return sum;
+    }
 }