@@ -268,6 +268,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
268268#endif  // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) 
269269
270270#if  defined(__ARM_NEON )
271+ // ggml_vld1q_u32(w, x, y, z): expands to a brace-enclosed initializer built from four scalar values (used to initialize uint32x4_t variables in ggml_vec_dot_iq3_xxs_q8_K); it is an initializer list, not an expression, and despite the name it performs no memory load.
272+ #ifdef  _MSC_VER 
273+ // MSVC variant: two 64-bit elements, each packing a pair as low + ((uint64_t)high << 32) -- (w, x) first, then (y, z). w and y must fit in 32 bits, otherwise the addition spills into the high half. NOTE(review): presumably needed because MSVC's uint32x4_t is initialized through 64-bit members -- confirm against the MSVC ARM64 NEON header.
274+ #define  ggml_vld1q_u32 (w ,x ,y ,z ) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
275+ 
276+ #else 
277+ // Other compilers: the four values are listed directly as four initializer elements, in the order w, x, y, z.
278+ #define  ggml_vld1q_u32 (w ,x ,y ,z ) { (w), (x), (y), (z) }
279+ 
280+ #endif 
281+ 
271282#if  !defined(__aarch64__ )
272283
273284// 64-bit compatibility 
@@ -8698,10 +8709,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
86988709        for  (int  ib32  =  0 ; ib32  <  QK_K /32 ; ib32  +=  2 ) {
86998710            q8b  =  ggml_vld1q_s8_x4 (q8 ); q8  +=  64 ;
87008711            memcpy (aux32 , gas , 2 * sizeof (uint32_t )); gas  +=  2 * sizeof (uint32_t );
8701-             const  uint32x4_t  aux32x4_0  =  { iq3xxs_grid [q3 [ 0 ]], iq3xxs_grid [q3 [ 1 ]], iq3xxs_grid [q3 [ 2 ]], iq3xxs_grid [q3 [ 3 ]]} ;
8702-             const  uint32x4_t  aux32x4_1  =  { iq3xxs_grid [q3 [ 4 ]], iq3xxs_grid [q3 [ 5 ]], iq3xxs_grid [q3 [ 6 ]], iq3xxs_grid [q3 [ 7 ]]} ;
8703-             const  uint32x4_t  aux32x4_2  =  { iq3xxs_grid [q3 [ 8 ]], iq3xxs_grid [q3 [ 9 ]], iq3xxs_grid [q3 [10 ]], iq3xxs_grid [q3 [11 ]]} ;
8704-             const  uint32x4_t  aux32x4_3  =  { iq3xxs_grid [q3 [12 ]], iq3xxs_grid [q3 [13 ]], iq3xxs_grid [q3 [14 ]], iq3xxs_grid [q3 [15 ]]} ;
8712+             const  uint32x4_t  aux32x4_0  =  ggml_vld1q_u32 ( iq3xxs_grid [q3 [ 0 ]], iq3xxs_grid [q3 [ 1 ]], iq3xxs_grid [q3 [ 2 ]], iq3xxs_grid [q3 [ 3 ]]) ;
8713+             const  uint32x4_t  aux32x4_1  =  ggml_vld1q_u32 ( iq3xxs_grid [q3 [ 4 ]], iq3xxs_grid [q3 [ 5 ]], iq3xxs_grid [q3 [ 6 ]], iq3xxs_grid [q3 [ 7 ]]) ;
8714+             const  uint32x4_t  aux32x4_2  =  ggml_vld1q_u32 ( iq3xxs_grid [q3 [ 8 ]], iq3xxs_grid [q3 [ 9 ]], iq3xxs_grid [q3 [10 ]], iq3xxs_grid [q3 [11 ]]) ;
8715+             const  uint32x4_t  aux32x4_3  =  ggml_vld1q_u32 ( iq3xxs_grid [q3 [12 ]], iq3xxs_grid [q3 [13 ]], iq3xxs_grid [q3 [14 ]], iq3xxs_grid [q3 [15 ]]) ;
87058716            q3  +=  16 ;
87068717            q3s .val [0 ] =  vcombine_s8 (vld1_s8 ((const  void  * )(signs64  +  ((aux32 [0 ] >>  0 ) &  127 ))), vld1_s8 ((const  void  * )(signs64  +  ((aux32 [0 ] >>  7 ) &  127 ))));
87078718            q3s .val [1 ] =  vcombine_s8 (vld1_s8 ((const  void  * )(signs64  +  ((aux32 [0 ] >> 14 ) &  127 ))), vld1_s8 ((const  void  * )(signs64  +  ((aux32 [0 ] >> 21 ) &  127 ))));
0 commit comments