@@ -268,6 +268,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
268
268
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
269
269
270
270
#if defined(__ARM_NEON )
271
+
272
// ggml_vld1q_u32(w, x, y, z)
//
// Expands to a brace-enclosed initializer list built from four 32-bit lane
// values; it is meant to be used as the initializer of a 128-bit vector
// object, e.g.
//
//     const uint32x4_t v = ggml_vld1q_u32(a, b, c, d);
//
// Two spellings are provided:
//   - _MSC_VER: the four values are packed pairwise into two 64-bit
//     initializers, with w (resp. y) in the low 32 bits and x (resp. z) in
//     the high 32 bits.
//     NOTE(review): presumably needed because MSVC's NEON vector type is
//     brace-initialized through 64-bit members - confirm against MSVC's
//     arm_neon.h / arm64_neon.h.
//   - otherwise: the four values are listed directly, one per lane.
//
// Each argument is evaluated exactly once. For the MSVC form the arguments
// are expected to be unsigned 32-bit values; a wider or negative argument
// would spill into the neighbouring lane when the halves are added.
//
// There must be no whitespace between the macro name and '(' in the
// definition, otherwise the preprocessor defines an object-like macro.
#ifdef _MSC_VER

#define ggml_vld1q_u32(w, x, y, z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }

#else

#define ggml_vld1q_u32(w, x, y, z) { (w), (x), (y), (z) }

#endif
281
+
271
282
#if !defined(__aarch64__ )
272
283
273
284
// 64-bit compatibility
@@ -8698,10 +8709,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
8698
8709
for (int ib32 = 0 ; ib32 < QK_K /32 ; ib32 += 2 ) {
8699
8710
q8b = ggml_vld1q_s8_x4 (q8 ); q8 += 64 ;
8700
8711
memcpy (aux32 , gas , 2 * sizeof (uint32_t )); gas += 2 * sizeof (uint32_t );
8701
- const uint32x4_t aux32x4_0 = { iq3xxs_grid [q3 [ 0 ]], iq3xxs_grid [q3 [ 1 ]], iq3xxs_grid [q3 [ 2 ]], iq3xxs_grid [q3 [ 3 ]]} ;
8702
- const uint32x4_t aux32x4_1 = { iq3xxs_grid [q3 [ 4 ]], iq3xxs_grid [q3 [ 5 ]], iq3xxs_grid [q3 [ 6 ]], iq3xxs_grid [q3 [ 7 ]]} ;
8703
- const uint32x4_t aux32x4_2 = { iq3xxs_grid [q3 [ 8 ]], iq3xxs_grid [q3 [ 9 ]], iq3xxs_grid [q3 [10 ]], iq3xxs_grid [q3 [11 ]]} ;
8704
- const uint32x4_t aux32x4_3 = { iq3xxs_grid [q3 [12 ]], iq3xxs_grid [q3 [13 ]], iq3xxs_grid [q3 [14 ]], iq3xxs_grid [q3 [15 ]]} ;
8712
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32 ( iq3xxs_grid [q3 [ 0 ]], iq3xxs_grid [q3 [ 1 ]], iq3xxs_grid [q3 [ 2 ]], iq3xxs_grid [q3 [ 3 ]]) ;
8713
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32 ( iq3xxs_grid [q3 [ 4 ]], iq3xxs_grid [q3 [ 5 ]], iq3xxs_grid [q3 [ 6 ]], iq3xxs_grid [q3 [ 7 ]]) ;
8714
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32 ( iq3xxs_grid [q3 [ 8 ]], iq3xxs_grid [q3 [ 9 ]], iq3xxs_grid [q3 [10 ]], iq3xxs_grid [q3 [11 ]]) ;
8715
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32 ( iq3xxs_grid [q3 [12 ]], iq3xxs_grid [q3 [13 ]], iq3xxs_grid [q3 [14 ]], iq3xxs_grid [q3 [15 ]]) ;
8705
8716
q3 += 16 ;
8706
8717
q3s .val [0 ] = vcombine_s8 (vld1_s8 ((const void * )(signs64 + ((aux32 [0 ] >> 0 ) & 127 ))), vld1_s8 ((const void * )(signs64 + ((aux32 [0 ] >> 7 ) & 127 ))));
8707
8718
q3s .val [1 ] = vcombine_s8 (vld1_s8 ((const void * )(signs64 + ((aux32 [0 ] >> 14 ) & 127 ))), vld1_s8 ((const void * )(signs64 + ((aux32 [0 ] >> 21 ) & 127 ))));
0 commit comments