@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
5353
5454#if defined(__VXE__ ) || defined(__VXE2__ )
5555 for (int i = 0 ; i < nb ; i ++ ) {
56- __vector float srcv [8 ];
57- __vector float asrcv [8 ];
58- __vector float amaxv [8 ];
56+ float32x4_t srcv [8 ];
57+ float32x4_t asrcv [8 ];
58+ float32x4_t amaxv [8 ];
5959
6060 for (int j = 0 ; j < 8 ; j ++ ) srcv [j ] = vec_xl (0 , x + i * 32 + 4 * j );
6161 for (int j = 0 ; j < 8 ; j ++ ) asrcv [j ] = vec_abs (srcv [j ]);
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
7474 y [i ].d = GGML_CPU_FP32_TO_FP16 (d );
7575
7676 for (int j = 0 ; j < 8 ; j ++ ) {
77- const __vector float v = vec_mul (srcv [j ], vec_splats (id ));
78- const __vector int32_t vi = vec_signed (v );
77+ const float32x4_t v = vec_mul (srcv [j ], vec_splats (id ));
78+ const int32x4_t vi = vec_signed (v );
7979
8080 y [i ].qs [4 * j + 0 ] = vec_extract (vi , 0 );
8181 y [i ].qs [4 * j + 1 ] = vec_extract (vi , 1 );
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
9898
9999#if defined(__VXE__ ) || defined(__VXE2__ )
100100 for (int i = 0 ; i < nb ; i ++ ) {
101- __vector float srcv [8 ];
102- __vector float asrcv [8 ];
103- __vector float amaxv [8 ];
101+ float32x4_t srcv [8 ];
102+ float32x4_t asrcv [8 ];
103+ float32x4_t amaxv [8 ];
104104
105105 for (int j = 0 ; j < 8 ; j ++ ) srcv [j ] = vec_xl (0 , x + i * 32 + 4 * j );
106106 for (int j = 0 ; j < 8 ; j ++ ) asrcv [j ] = vec_abs (srcv [j ]);
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
118118
119119 y [i ].d = GGML_CPU_FP32_TO_FP16 (d );
120120
121- __vector int32_t acc = vec_splats (0 );
121+ int32x4_t acc = vec_splats (0 );
122122
123123 for (int j = 0 ; j < 8 ; j ++ ) {
124- const __vector float v = vec_mul (srcv [j ], vec_splats (id ));
125- const __vector int32_t vi = vec_signed (v );
124+ const float32x4_t v = vec_mul (srcv [j ], vec_splats (id ));
125+ const int32x4_t vi = vec_signed (v );
126126
127127 y [i ].qs [4 * j + 0 ] = vec_extract (vi , 0 );
128128 y [i ].qs [4 * j + 1 ] = vec_extract (vi , 1 );
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
162162 float sumf = 0 ;
163163
164164#if defined(__VXE__ ) || defined(__VXE2__ )
165- __vector float acc = vec_splats (0.0f );
165+ float32x4_t acc = vec_splats (0.0f );
166166
167- const __vector uint8_t v_m = vec_splats ((const uint8_t )0x0F );
168- const __vector int8_t v_s = vec_splats ( (const int8_t )0x08 );
167+ const uint8x16_t v_m = vec_splats ((const uint8_t )0x0F );
168+ const int8x16_t v_s = vec_splats ( (const int8_t )0x08 );
169169
170170 for (; ib < nb ; ++ ib ) {
171- const __vector uint8_t v_x = vec_xl (0 , x [ib ].qs );
172- const __vector int8_t v_xl = (const __vector int8_t )(v_x & v_m );
173- const __vector int8_t v_xh = (const __vector int8_t )(v_x >> 4 );
171+ const uint8x16_t v_x = vec_xl (0 , x [ib ].qs );
172+ const int8x16_t v_xl = (const int8x16_t )(v_x & v_m );
173+ const int8x16_t v_xh = (const int8x16_t )(v_x >> 4 );
174174
175- const __vector int8_t v_xls = vec_sub (v_xl , v_s );
176- const __vector int8_t v_xhs = vec_sub (v_xh , v_s );
175+ const int8x16_t v_xls = vec_sub (v_xl , v_s );
176+ const int8x16_t v_xhs = vec_sub (v_xh , v_s );
177177
178- const __vector int8_t v_yl = vec_xl (0 , y [ib ].qs );
179- const __vector int8_t v_yh = vec_xl (QK8_0 /2 , y [ib ].qs );
178+ const int8x16_t v_yl = vec_xl (0 , y [ib ].qs );
179+ const int8x16_t v_yh = vec_xl (QK8_0 /2 , y [ib ].qs );
180180
181- const __vector int16_t v_xylso = vec_mulo (v_xls , v_yl );
182- const __vector int16_t v_xylse = vec_mule (v_xls , v_yl );
183- const __vector int16_t v_xyhso = vec_mulo (v_xhs , v_yh );
184- const __vector int16_t v_xyhse = vec_mule (v_xhs , v_yh );
181+ const int16x8_t v_xylso = vec_mulo (v_xls , v_yl );
182+ const int16x8_t v_xylse = vec_mule (v_xls , v_yl );
183+ const int16x8_t v_xyhso = vec_mulo (v_xhs , v_yh );
184+ const int16x8_t v_xyhse = vec_mule (v_xhs , v_yh );
185185
186- __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse ; v_xy_ += vec_reve (v_xy_ );
186+ int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse ; v_xy_ += vec_reve (v_xy_ );
187187
188- const __vector float v_xy = vec_float (vec_unpackh (v_xy_ ));
189- const __vector float v_d = vec_splats (GGML_CPU_FP16_TO_FP32 (x [ib ].d ) * GGML_CPU_FP16_TO_FP32 (y [ib ].d ));
188+ const float32x4_t v_xy = vec_float (vec_unpackh (v_xy_ ));
189+ const float32x4_t v_d = vec_splats (GGML_CPU_FP16_TO_FP32 (x [ib ].d ) * GGML_CPU_FP16_TO_FP32 (y [ib ].d ));
190190
191191 acc = vec_madd (v_xy , v_d , acc );
192192 }
193193
194- sumf = acc [0 ] + acc [1 ] + acc [2 ] + acc [3 ];
195-
194+ sumf = vec_hsum_f32x4 (acc );
196195 * s = sumf ;
197196#else
198197 UNUSED (nb );
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
249248 acc = vec_madd (v_xy , v_d , acc );
250249 }
251250
252- sumf = acc [0 ] + acc [1 ] + acc [2 ] + acc [3 ] + summs ;
253-
251+ sumf = vec_hsum_f32x4 (acc ) + summs ;
254252 * s = sumf ;
255253#else
256254 UNUSED (nb );
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
351349 v_sum1 = vec_madd (v_xy1f , v_d1 , v_sum1 );
352350 }
353351
354- sumf += vec_hsum (v_sum0 ) + vec_hsum (v_sum1 );
352+ sumf += vec_hsum_f32x4 (v_sum0 ) + vec_hsum_f32x4 (v_sum1 );
355353
356354 #pragma GCC unroll 4
357355 for (; ib < nb ; ++ ib ) {
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
390388 const float32x4_t v_d = vec_splats (GGML_CPU_FP16_TO_FP32 (x0 -> d ) * GGML_CPU_FP16_TO_FP32 (y0 -> d ));
391389 const float32x4_t v_acc = vec_madd (v_xyf , v_d , vec_splats (0.0f ));
392390
393- sumf += vec_hsum (v_acc );
391+ sumf += vec_hsum_f32x4 (v_acc );
394392 }
395393
396394 * s = sumf ;
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
502500 v_sum1 = vec_madd (v_xy1f , v_d1 , v_sum1 );
503501 }
504502
505- sumf += vec_hsum (v_sum0 ) + vec_hsum (v_sum1 ) + summs0 + summs1 ;
503+ sumf += vec_hsum_f32x4 (v_sum0 ) + vec_hsum_f32x4 (v_sum1 ) + summs0 + summs1 ;
506504
507505 #pragma GCC unroll 4
508506 for (; ib < nb ; ++ ib ) {
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
543541 const float32x4_t v_d = vec_splats (GGML_CPU_FP16_TO_FP32 (x0 -> d ) * GGML_CPU_FP16_TO_FP32 (y0 -> d ));
544542 const float32x4_t v_acc = vec_madd (v_xyf , v_d , vec_splats (0.0f )); // zero addend: v_acc must not be read inside its own initializer (indeterminate value)
545543
546- sumf += vec_hsum (v_acc ) + summs ;
544+ sumf += vec_hsum_f32x4 (v_acc ) + summs ;
547545 }
548546
549547 * s = sumf ;
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
575573 float sumf = 0 ;
576574
577575#if defined(__VXE__ ) || defined(__VXE2__ )
578- __vector float acc = vec_splats (0.0f );
576+ float32x4_t acc = vec_splats (0.0f );
579577
580578#pragma GCC unroll 8
581579 for (; ib < nb ; ++ ib ) {
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
594592 acc = vec_madd (v_xy , v_d , acc );
595593 }
596594
597- sumf = acc [ 0 ] + acc [ 1 ] + acc [ 2 ] + acc [ 3 ] ;
595+ sumf = vec_hsum_f32x4 ( acc ) ;
598596
599597 * s = sumf ;
600598#else
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
718716 isum2 = ggml_vec_dot (v_z , q3bytes [2 ], q8bytes [6 ]);
719717 isum3 = ggml_vec_dot (v_z , q3bytes [3 ], q8bytes [7 ]);
720718
721- isum += (isum0 [ 0 ] + isum0 [ 1 ] + isum0 [ 2 ] + isum0 [ 3 ] ) * scale [0 ];
722- isum += (isum1 [ 0 ] + isum1 [ 1 ] + isum1 [ 2 ] + isum1 [ 3 ] ) * scale [1 ];
723- isum += (isum2 [ 0 ] + isum2 [ 1 ] + isum2 [ 2 ] + isum2 [ 3 ] ) * scale [2 ];
724- isum += (isum3 [ 0 ] + isum3 [ 1 ] + isum3 [ 2 ] + isum3 [ 3 ] ) * scale [3 ];
719+ isum += vec_hsum_i32x4 (isum0 ) * scale [0 ];
720+ isum += vec_hsum_i32x4 (isum1 ) * scale [1 ];
721+ isum += vec_hsum_i32x4 (isum2 ) * scale [2 ];
722+ isum += vec_hsum_i32x4 (isum3 ) * scale [3 ];
725723
726724 scale += 4 ;
727725
@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
819817 v_xl [1 ] = (int8x16_t )vec_and (v_x [1 ], v_lm );
820818
821819 const int32x4_t p1 = ggml_vec_dot (ggml_vec_dot (v_z , v_xl [0 ], v_y [0 ]), v_xl [1 ], v_y [1 ]);
822- sumi1 += (p1 [ 0 ] + p1 [ 1 ] + p1 [ 2 ] + p1 [ 3 ] ) * scales [2 * j + 0 ];
820+ sumi1 += vec_hsum_i32x4 (p1 ) * scales [2 * j + 0 ];
823821
824822 v_y [0 ] = vec_xl (0 , y0 );
825823 v_y [1 ] = vec_xl (16 , y0 );
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
829827 v_xl [1 ] = (int8x16_t )vec_sr (v_x [1 ], 4 );
830828
831829 const int32x4_t p2 = ggml_vec_dot (ggml_vec_dot (v_z , v_xl [0 ], v_y [0 ]), v_xl [1 ], v_y [1 ]);
832- sumi2 += (p2 [ 0 ] + p2 [ 1 ] + p2 [ 2 ] + p2 [ 3 ] ) * scales [2 * j + 1 ];
830+ sumi2 += vec_hsum_i32x4 (p2 ) * scales [2 * j + 1 ];
833831 }
834832
835833 sumf += d * (sumi1 + sumi2 );
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
911909 const int32x4_t v_minsho = vec_mulo (v_ysums , v_minsh );
912910 const int32x4_t v_minshe = vec_mule (v_ysums , v_minsh );
913911 const int32x4_t v_mins = vec_add (v_minsho , v_minshe );
914- const int32_t mins = v_mins [ 0 ] + v_mins [ 1 ] + v_mins [ 2 ] + v_mins [ 3 ] ;
912+ const int32_t mins = vec_hsum_i32x4 ( v_mins ) ;
915913
916914 const uint8_t * scales = (const uint8_t * )utmp ;
917915 const uint8_t * GGML_RESTRICT x0l = x [i ].qs ;
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
948946 int32x4_t sumi0 = ggml_vec_dot (ggml_vec_dot (v_z , q5b [0 ], v_y [0 ]), q5b [1 ], v_y [1 ]);
949947 int32x4_t sumi1 = ggml_vec_dot (ggml_vec_dot (v_z , q5b [2 ], v_y [2 ]), q5b [3 ], v_y [3 ]);
950948
951- sumi += (sumi0 [ 0 ] + sumi0 [ 1 ] + sumi0 [ 2 ] + sumi0 [ 3 ] ) * * scales ++ ;
952- sumi += (sumi1 [ 0 ] + sumi1 [ 1 ] + sumi1 [ 2 ] + sumi1 [ 3 ] ) * * scales ++ ;
949+ sumi += vec_hsum_i32x4 (sumi0 ) * * scales ++ ;
950+ sumi += vec_hsum_i32x4 (sumi1 ) * * scales ++ ;
953951 }
954952
955953 sumf += d * sumi - dmin * mins ;
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
10201018 const int32x4_t v_minshe = vec_mule (v_ysumsh , v_scaleh );
10211019 const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe ;
10221020
1023- const int32_t mins = v_mins [ 0 ] + v_mins [ 1 ] + v_mins [ 2 ] + v_mins [ 3 ] ;
1021+ const int32_t mins = vec_hsum_i32x4 ( v_mins ) ;
10241022
10251023 int32_t isum = 0 ;
10261024 for (int j = 0 ; j < QK_K /128 ; ++ j ) {
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
10601058 int32x4_t summs2 = ggml_vec_dot (v_z , q6b [2 ], v_y [2 ]);
10611059 int32x4_t summs3 = ggml_vec_dot (v_z , q6b [3 ], v_y [3 ]);
10621060
1063- isum += (summs0 [ 0 ] + summs0 [ 1 ] + summs0 [ 2 ] + summs0 [ 3 ] ) * scale [0 ] +
1064- (summs1 [ 0 ] + summs1 [ 1 ] + summs1 [ 2 ] + summs1 [ 3 ] ) * scale [1 ] +
1065- (summs2 [ 0 ] + summs2 [ 1 ] + summs2 [ 2 ] + summs2 [ 3 ] ) * scale [2 ] +
1066- (summs3 [ 0 ] + summs3 [ 1 ] + summs3 [ 2 ] + summs3 [ 3 ] ) * scale [3 ];
1061+ isum += vec_hsum_i32x4 (summs0 ) * scale [0 ] +
1062+ vec_hsum_i32x4 (summs1 ) * scale [1 ] +
1063+ vec_hsum_i32x4 (summs2 ) * scale [2 ] +
1064+ vec_hsum_i32x4 (summs3 ) * scale [3 ];
10671065
10681066 scale += 4 ;
10691067
@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
10941092 summs2 = ggml_vec_dot (v_z , q6b [2 ], v_y [2 ]);
10951093 summs3 = ggml_vec_dot (v_z , q6b [3 ], v_y [3 ]);
10961094
1097- isum += (summs0 [ 0 ] + summs0 [ 1 ] + summs0 [ 2 ] + summs0 [ 3 ] ) * scale [0 ] +
1098- (summs1 [ 0 ] + summs1 [ 1 ] + summs1 [ 2 ] + summs1 [ 3 ] ) * scale [1 ] +
1099- (summs2 [ 0 ] + summs2 [ 1 ] + summs2 [ 2 ] + summs2 [ 3 ] ) * scale [2 ] +
1100- (summs3 [ 0 ] + summs3 [ 1 ] + summs3 [ 2 ] + summs3 [ 3 ] ) * scale [3 ];
1095+ isum += vec_hsum_i32x4 (summs0 ) * scale [0 ] +
1096+ vec_hsum_i32x4 (summs1 ) * scale [1 ] +
1097+ vec_hsum_i32x4 (summs2 ) * scale [2 ] +
1098+ vec_hsum_i32x4 (summs3 ) * scale [3 ];
11011099
11021100 scale += 4 ;
11031101 }
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
12851283 const int8x16_t v_yh = vec_xl (QK8_0 /2 , y0 -> qs );
12861284 const int32x4_t v_xy = ggml_vec_dot (ggml_vec_dot (vec_splats (0 ), v_xl , v_yl ), v_xh , v_yh );
12871285
1288- sumf += GGML_CPU_FP16_TO_FP32 (x0 -> d ) * GGML_CPU_FP16_TO_FP32 (y0 -> d ) * (v_xy [ 0 ] + v_xy [ 1 ] + v_xy [ 2 ] + v_xy [ 3 ] );
1286+ sumf += GGML_CPU_FP16_TO_FP32 (x0 -> d ) * GGML_CPU_FP16_TO_FP32 (y0 -> d ) * vec_hsum_i32x4 (v_xy );
12891287 }
12901288
12911289 * s = sumf ;
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
13541352
13551353 h >>= 4 ;
13561354
1357- sumi1 += (vsumi0 [ 0 ] + vsumi0 [ 1 ] + vsumi0 [ 2 ] + vsumi0 [ 3 ] ) * ls1 ;
1358- sumi2 += (vsumi1 [ 0 ] + vsumi1 [ 1 ] + vsumi1 [ 2 ] + vsumi1 [ 3 ] ) * ls2 ;
1355+ sumi1 += vec_hsum_i32x4 (vsumi0 ) * ls1 ;
1356+ sumi2 += vec_hsum_i32x4 (vsumi1 ) * ls2 ;
13591357 }
13601358
13611359 sumf += GGML_CPU_FP16_TO_FP32 (x [ibl ].d ) * y [ibl ].d * (sumi1 + sumi2 );
0 commit comments