Generalize SIMD distance implementation to n-length vectors

pierd · pierd · commit 3b4fde8594b1 · 2023-02-22T20:25:28.000+01:00
diff --git a/instant-distance-py/Cargo.toml b/instant-distance-py/Cargo.toml
@@ -15,8 +15,8 @@ name = "instant_distance"
 crate-type = ["cdylib"]
 
 [dependencies]
+aligned-vec = { version = "0.5.0", features = ["serde"] }
 bincode = "1.3.1"
 instant-distance = { version = "0.6", path = "../instant-distance", features = ["with-serde"] }
 pyo3 = { version = "0.18.0", features = ["extension-module"] }
 serde = { version = "1", features = ["derive"] }
-serde-big-array = "0.4.1"
diff --git a/instant-distance-py/src/lib.rs b/instant-distance-py/src/lib.rs
@@ -6,14 +6,14 @@ use std::fs::File;
 use std::io::{BufReader, BufWriter};
 use std::iter::FromIterator;
 
+use aligned_vec::AVec;
 use instant_distance::Point;
 use pyo3::conversion::IntoPy;
-use pyo3::exceptions::{PyTypeError, PyValueError};
+use pyo3::exceptions::PyValueError;
 use pyo3::types::{PyList, PyModule, PyString};
 use pyo3::{pyclass, pymethods, pymodule};
 use pyo3::{Py, PyAny, PyErr, PyObject, PyRef, PyRefMut, PyResult, Python};
 use serde::{Deserialize, Serialize};
-use serde_big_array::BigArray;
 
 #[pymodule]
 #[pyo3(name = "instant_distance")]
@@ -87,8 +87,7 @@ impl HnswMap {
 
 /// An instance of hierarchical navigable small worlds
 ///
-/// For now, this is specialized to only support 300-element (32-bit) float vectors
-/// with a squared Euclidean distance metric.
+/// For now, this uses a squared Euclidean distance metric.
 #[pyclass]
 struct Hnsw {
     inner: instant_distance::Hnsw<FloatArray>,
@@ -346,35 +345,32 @@ impl Neighbor {
     }
 }
 
-#[repr(align(32))]
 #[derive(Clone, Deserialize, Serialize)]
-struct FloatArray(#[serde(with = "BigArray")] [f32; DIMENSIONS]);
+struct FloatArray(AVec<f32>);
 
 impl TryFrom<&PyAny> for FloatArray {
     type Error = PyErr;
 
     fn try_from(value: &PyAny) -> Result<Self, Self::Error> {
-        let mut new = FloatArray([0.0; DIMENSIONS]);
-        for (i, val) in value.iter()?.enumerate() {
-            match i >= DIMENSIONS {
-                true => return Err(PyTypeError::new_err("point array too long")),
-                false => new.0[i] = val?.extract::<f32>()?,
-            }
+        let mut new = FloatArray(AVec::with_capacity(32, value.len()?));
+        for val in value.iter()? {
+            new.0.push(val?.extract()?);
         }
         Ok(new)
     }
 }
 
 impl Point for FloatArray {
     fn distance(&self, rhs: &Self) -> f32 {
+        debug_assert_eq!(self.0.len(), rhs.0.len());
+
         #[cfg(target_arch = "x86_64")]
         {
             use std::arch::x86_64::{
                 _mm256_castps256_ps128, _mm256_extractf128_ps, _mm256_fmadd_ps, _mm256_load_ps,
                 _mm256_setzero_ps, _mm256_sub_ps, _mm_add_ps, _mm_add_ss, _mm_cvtss_f32,
                 _mm_fmadd_ps, _mm_load_ps, _mm_movehl_ps, _mm_shuffle_ps, _mm_sub_ps,
             };
-            debug_assert_eq!(self.0.len() % 8, 4);
 
             unsafe {
                 let mut acc_8x = _mm256_setzero_ps();
@@ -389,16 +385,36 @@ impl Point for FloatArray {
                 let right = _mm256_castps256_ps128(acc_8x); // lower half
                 acc_4x = _mm_add_ps(acc_4x, right); // sum halves
 
-                let lh_4x = _mm_load_ps(self.0[DIMENSIONS - 4..].as_ptr());
-                let rh_4x = _mm_load_ps(rhs.0[DIMENSIONS - 4..].as_ptr());
-                let diff = _mm_sub_ps(lh_4x, rh_4x);
-                acc_4x = _mm_fmadd_ps(diff, diff, acc_4x);
+                // count of already processed dimensions
+                let mut processed_count = self.0.len() - self.0.len() % 8;
+
+                if self.0.len() % 8 >= 4 {
+                    // at least 4 dimensions are still unprocessed;
+                    // handle the next 4 in one 128-bit (4 x f32) step
+                    let lh_4x = _mm_load_ps(self.0[processed_count..].as_ptr());
+                    let rh_4x = _mm_load_ps(rhs.0[processed_count..].as_ptr());
+                    let diff = _mm_sub_ps(lh_4x, rh_4x);
+                    acc_4x = _mm_fmadd_ps(diff, diff, acc_4x);
+                    processed_count += 4;
+                }
 
+                // sum up the registers
                 let lower = _mm_movehl_ps(acc_4x, acc_4x);
                 acc_4x = _mm_add_ps(acc_4x, lower);
                 let upper = _mm_shuffle_ps(acc_4x, acc_4x, 0x1);
                 acc_4x = _mm_add_ss(acc_4x, upper);
-                _mm_cvtss_f32(acc_4x)
+                let mut distance = _mm_cvtss_f32(acc_4x);
+
+                // process the leftover dimensions (if any are left)
+                if processed_count < self.0.len() {
+                    distance += self.0[processed_count..]
+                        .iter()
+                        .zip(rhs.0[processed_count..].iter())
+                        .map(|(&a, &b)| (a - b).powi(2))
+                        .sum::<f32>()
+                }
+
+                distance
             }
         }
         #[cfg(not(target_arch = "x86_64"))]
@@ -430,5 +446,3 @@ impl IntoPy<Py<PyAny>> for &'_ MapValue {
         }
     }
 }
-
-const DIMENSIONS: usize = 300;