99
// Aligned step using svptrue_b8()
// Accumulates the squared-L2 contribution of one full SVE vector of uint8
// elements read from pVect1/pVect2 at `offset` into `sum`, then advances
// `offset` by `chunk` (the number of uint8 lanes per SVE vector).
inline void L2SquareStep(const uint8_t *&pVect1, const uint8_t *&pVect2, size_t &offset,
                         svuint32_t &sum, const size_t chunk) {
    svbool_t pg = svptrue_b8();
    // Note: Because all the bits are 1, the extension to 16 and 32 bits does not make a difference
    // Otherwise, pg would have to be recalculated for the 16- and 32-bit operations

    svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); // Load uint8 vectors from pVect1
    svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); // Load uint8 vectors from pVect2

    // (a - b)^2 == |a - b|^2, and |a - b| of two uint8 values fits in uint8
    // (max 255), so the unsigned absolute difference replaces the old
    // widen-subtract-square sequence without any intermediate overflow.
    svuint8_t abs_diff = svabd_u8_x(pg, v1_ui8, v2_ui8);

    // UDOT: each uint32 lane accumulates four u8*u8 products (max 4 * 255^2,
    // well inside uint32), i.e. sum += dot(abs_diff, abs_diff).
    sum = svdot_u32(sum, abs_diff, abs_diff);

    offset += chunk; // Move to the next set of uint8 elements
}
@@ -72,10 +41,10 @@ float UINT8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimen
7241 // We can safely assume that the dimension is smaller than that
7342 // So using int32_t is safe
7443
75- svint32_t sum0 = svdup_s32 (0 );
76- svint32_t sum1 = svdup_s32 (0 );
77- svint32_t sum2 = svdup_s32 (0 );
78- svint32_t sum3 = svdup_s32 (0 );
44+ svuint32_t sum0 = svdup_u32 (0 );
45+ svuint32_t sum1 = svdup_u32 (0 );
46+ svuint32_t sum2 = svdup_u32 (0 );
47+ svuint32_t sum3 = svdup_u32 (0 );
7948
8049 size_t offset = 0 ;
8150 size_t num_main_blocks = dimension / chunk_size;
@@ -105,38 +74,13 @@ float UINT8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimen
10574 svuint8_t v1_ui8 = svld1_u8 (pg, pVect1 + offset); // Load uint8 vectors from pVect1
10675 svuint8_t v2_ui8 = svld1_u8 (pg, pVect2 + offset); // Load uint8 vectors from pVect2
10776
108- svbool_t pg32 = svwhilelt_b32 (offset, dimension);
109-
110- svint16_t v1_16_l = svreinterpret_s16 (svunpklo_u16 (v1_ui8));
111- svint16_t v1_16_h = svreinterpret_s16 (svunpkhi_u16 (v1_ui8));
112- svint16_t v2_16_l = svreinterpret_s16 (svunpklo_u16 (v2_ui8));
113- svint16_t v2_16_h = svreinterpret_s16 (svunpkhi_u16 (v2_ui8));
114-
115- // Calculate difference and square for low part
116- svint16_t diff_l = svsub_s16_x (svwhilelt_b16 (offset, dimension), v1_16_l, v2_16_l);
117-
118- svint32_t diff32_l_l = svunpklo_s32 (diff_l);
119- svint32_t diff32_l_h = svunpkhi_s32 (diff_l);
120-
121- // Result register is the same as the accumulator for better performance
122- svint32_t sq_l = svmul_s32_x (pg32, diff32_l_l, diff32_l_l);
123- sq_l = svmla_s32_x (pg32, sq_l, diff32_l_h, diff32_l_h);
124-
125- svint16_t diff_h = svsub_s16_x (pg32, v1_16_h, v2_16_h);
126-
127- svint32_t diff32_h_l = svunpklo_s32 (diff_h);
128- svint32_t diff32_h_h = svunpkhi_s32 (diff_h);
129-
130- // Result register is the same as the accumulator for better performance
131- svint32_t sq_h = svmul_s32_x (pg32, diff32_h_l, diff32_h_l);
132- sq_h = svmla_s32_x (pg32, sq_h, diff32_h_h, diff32_h_h);
77+ svuint8_t abs_diff = svabd_u8_x (pg, v1_ui8, v2_ui8);
13378
134- sum3 = svadd_s32_m (pg32, sum3, sq_l);
135- sum3 = svadd_s32_m (pg32, sum3, sq_h);
79+ sum3 = svdot_u32 (sum3, abs_diff, abs_diff);
13680 }
13781
138- sum0 = svadd_s32_x (all, sum0, sum1);
139- sum2 = svadd_s32_x (all, sum2, sum3);
140- svint32_t sum_all = svadd_s32_x (all, sum0, sum2);
141- return svaddv_s32 (svptrue_b32 (), sum_all);
82+ sum0 = svadd_u32_x (all, sum0, sum1);
83+ sum2 = svadd_u32_x (all, sum2, sum3);
84+ svuint32_t sum_all = svadd_u32_x (all, sum0, sum2);
85+ return svaddv_u32 (svptrue_b32 (), sum_all);
14286}
0 commit comments