@@ -268,6 +268,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
268268#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
269269
270270#if defined(__ARM_NEON )
271+
// ggml_vld1q_u32(w, x, y, z) expands to a brace initializer built from four
// 32-bit lane values, listed in lane order (w first, z last). It is meant to
// be used only as a declaration initializer, e.g.
//     const uint32x4_t v = ggml_vld1q_u32(a, b, c, d);
//
// The parameter list must follow the macro name with no whitespace in between;
// otherwise the preprocessor defines an object-like macro and call sites break.
#ifdef _MSC_VER

// Under MSVC the initializer has two 64-bit elements, each packing a pair of
// lanes: the first lane of the pair in the low 32 bits, the second in the high
// 32 bits. NOTE(review): presumably this matches MSVC's layout of its NEON
// vector type as 64-bit halves -- confirm against the MSVC ARM64 headers.
// Every lane is truncated to 32 bits before packing so that a signed or wider
// argument cannot spill into the neighbouring lane.
#define ggml_vld1q_u32(w,x,y,z) { ((uint64_t)(uint32_t)(w) | ((uint64_t)(uint32_t)(x) << 32)), ((uint64_t)(uint32_t)(y) | ((uint64_t)(uint32_t)(z) << 32)) }

#else

// Other compilers accept the four lanes directly as a vector initializer list.
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }

#endif
281+
271282#if !defined(__aarch64__ )
272283
273284// 64-bit compatibility
@@ -8698,10 +8709,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
86988709 for (int ib32 = 0 ; ib32 < QK_K /32 ; ib32 += 2 ) {
86998710 q8b = ggml_vld1q_s8_x4 (q8 ); q8 += 64 ;
87008711 memcpy (aux32 , gas , 2 * sizeof (uint32_t )); gas += 2 * sizeof (uint32_t );
8701- const uint32x4_t aux32x4_0 = { iq3xxs_grid [q3 [ 0 ]], iq3xxs_grid [q3 [ 1 ]], iq3xxs_grid [q3 [ 2 ]], iq3xxs_grid [q3 [ 3 ]]} ;
8702- const uint32x4_t aux32x4_1 = { iq3xxs_grid [q3 [ 4 ]], iq3xxs_grid [q3 [ 5 ]], iq3xxs_grid [q3 [ 6 ]], iq3xxs_grid [q3 [ 7 ]]} ;
8703- const uint32x4_t aux32x4_2 = { iq3xxs_grid [q3 [ 8 ]], iq3xxs_grid [q3 [ 9 ]], iq3xxs_grid [q3 [10 ]], iq3xxs_grid [q3 [11 ]]} ;
8704- const uint32x4_t aux32x4_3 = { iq3xxs_grid [q3 [12 ]], iq3xxs_grid [q3 [13 ]], iq3xxs_grid [q3 [14 ]], iq3xxs_grid [q3 [15 ]]} ;
8712+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32 ( iq3xxs_grid [q3 [ 0 ]], iq3xxs_grid [q3 [ 1 ]], iq3xxs_grid [q3 [ 2 ]], iq3xxs_grid [q3 [ 3 ]]) ;
8713+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32 ( iq3xxs_grid [q3 [ 4 ]], iq3xxs_grid [q3 [ 5 ]], iq3xxs_grid [q3 [ 6 ]], iq3xxs_grid [q3 [ 7 ]]) ;
8714+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32 ( iq3xxs_grid [q3 [ 8 ]], iq3xxs_grid [q3 [ 9 ]], iq3xxs_grid [q3 [10 ]], iq3xxs_grid [q3 [11 ]]) ;
8715+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32 ( iq3xxs_grid [q3 [12 ]], iq3xxs_grid [q3 [13 ]], iq3xxs_grid [q3 [14 ]], iq3xxs_grid [q3 [15 ]]) ;
87058716 q3 += 16 ;
87068717 q3s .val [0 ] = vcombine_s8 (vld1_s8 ((const void * )(signs64 + ((aux32 [0 ] >> 0 ) & 127 ))), vld1_s8 ((const void * )(signs64 + ((aux32 [0 ] >> 7 ) & 127 ))));
87078718 q3s .val [1 ] = vcombine_s8 (vld1_s8 ((const void * )(signs64 + ((aux32 [0 ] >> 14 ) & 127 ))), vld1_s8 ((const void * )(signs64 + ((aux32 [0 ] >> 21 ) & 127 ))));
0 commit comments