@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__VXE__) || defined(__VXE2__)
     for (int i = 0; i < nb; i++) {
-        __vector float srcv [8];
-        __vector float asrcv[8];
-        __vector float amaxv[8];
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
 
         for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
         for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
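Note: the NEON-style type names on the + side (float32x4_t, int32x4_t, int16x8_t, int8x16_t, uint8x16_t) are assumed to be plain typedefs over the s390x VX __vector types, defined once in a shared ggml CPU header rather than in this file. A minimal sketch of what such aliases would look like under that assumption (requires GCC/Clang on s390x with -mzvector):

```c
#include <stdint.h>

// Hypothetical aliases, not part of this diff: each maps a fixed-width
// NEON-style name onto the corresponding 128-bit s390x VX vector type.
typedef __vector float    float32x4_t;   //  4 x 32-bit float lanes
typedef __vector int32_t  int32x4_t;     //  4 x 32-bit signed lanes
typedef __vector int16_t  int16x8_t;     //  8 x 16-bit signed lanes
typedef __vector int8_t   int8x16_t;     // 16 x  8-bit signed lanes
typedef __vector uint8_t  uint8x16_t;    // 16 x  8-bit unsigned lanes
```

The aliases change nothing about code generation; they only make the VXE path read consistently with the fixed-width names used by the other SIMD backends.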
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
         for (int j = 0; j < 8; j++) {
-            const __vector float v = vec_mul(srcv[j], vec_splats(id));
-            const __vector int32_t vi = vec_signed(v);
+            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+            const int32x4_t vi = vec_signed(v);
 
             y[i].qs[4*j + 0] = vec_extract(vi, 0);
             y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__VXE__) || defined(__VXE2__)
     for (int i = 0; i < nb; i++) {
-        __vector float srcv [8];
-        __vector float asrcv[8];
-        __vector float amaxv[8];
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
 
         for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
         for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
         y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
-        __vector int32_t acc = vec_splats(0);
+        int32x4_t acc = vec_splats(0);
 
         for (int j = 0; j < 8; j++) {
-            const __vector float v = vec_mul(srcv[j], vec_splats(id));
-            const __vector int32_t vi = vec_signed(v);
+            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+            const int32x4_t vi = vec_signed(v);
 
             y[i].qs[4*j + 0] = vec_extract(vi, 0);
             y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;
 
 #if defined(__VXE__) || defined(__VXE2__)
-    __vector float acc = vec_splats(0.0f);
+    float32x4_t acc = vec_splats(0.0f);
 
-    const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
-    const __vector int8_t  v_s = vec_splats((const int8_t)0x08);
+    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
+    const int8x16_t  v_s = vec_splats((const int8_t)0x08);
 
     for (; ib < nb; ++ib) {
-        const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
-        const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
-        const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+        const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+        const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+        const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
 
-        const __vector int8_t v_xls = vec_sub(v_xl, v_s);
-        const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+        const int8x16_t v_xls = vec_sub(v_xl, v_s);
+        const int8x16_t v_xhs = vec_sub(v_xh, v_s);
 
-        const __vector int8_t v_yl = vec_xl(0, y[ib].qs);
-        const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+        const int8x16_t v_yl = vec_xl(0, y[ib].qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
 
-        const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
-        const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
-        const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
-        const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+        const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
+        const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
+        const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
+        const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
 
-        __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+        int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
 
-        const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
-        const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
+        const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
+        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
 
         acc = vec_madd(v_xy, v_d, acc);
     }
 
-    sumf = acc[0] + acc[1] + acc[2] + acc[3];
-
+    sumf = vec_hsum_f32x4(acc);
     *s = sumf;
 #else
     UNUSED(nb);
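Note: vec_hsum_f32x4 and vec_hsum_i32x4 replace the open-coded lane sums (acc[0] + acc[1] + acc[2] + acc[3]) used throughout the dot-product kernels below. Their definitions are not part of this diff; assuming they are thin inline helpers built on the GCC/Clang vector-extension element access, they would look roughly like the following sketch:

```c
#include <stdint.h>

// Hypothetical inline helpers; the real definitions are expected to live in a
// shared VXE header. Each reduces the four lanes of a 128-bit vector to a
// single scalar, exactly what the removed open-coded sums did.
static inline float vec_hsum_f32x4(float32x4_t v) {
    return v[0] + v[1] + v[2] + v[3];
}

static inline int32_t vec_hsum_i32x4(int32x4_t v) {
    return v[0] + v[1] + v[2] + v[3];
}
```

Naming the reduction removes the repeated four-term sums from every q*_K and iq4 kernel and leaves one place to swap in a faster horizontal-add later if one becomes available.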
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = vec_madd(v_xy, v_d, acc);
     }
 
-    sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
-
+    sumf = vec_hsum_f32x4(acc) + summs;
     *s = sumf;
 #else
     UNUSED(nb);
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
     }
 
-    sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
+    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
 
     #pragma GCC unroll 4
     for (; ib < nb; ++ib) {
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
         const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
 
-        sumf += vec_hsum(v_acc);
+        sumf += vec_hsum_f32x4(v_acc);
     }
 
     *s = sumf;
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
     }
 
-    sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
+    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
 
     #pragma GCC unroll 4
     for (; ib < nb; ++ib) {
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
         const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
 
-        sumf += vec_hsum(v_acc) + summs;
+        sumf += vec_hsum_f32x4(v_acc) + summs;
     }
 
     *s = sumf;
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;
 
 #if defined(__VXE__) || defined(__VXE2__)
-    __vector float acc = vec_splats(0.0f);
+    float32x4_t acc = vec_splats(0.0f);
 
     #pragma GCC unroll 8
     for (; ib < nb; ++ib) {
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = vec_madd(v_xy, v_d, acc);
     }
 
-    sumf = acc[0] + acc[1] + acc[2] + acc[3];
+    sumf = vec_hsum_f32x4(acc);
 
     *s = sumf;
 #else
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
             isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
 
-            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
-            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
-            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
-            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+            isum += vec_hsum_i32x4(isum0) * scale[0];
+            isum += vec_hsum_i32x4(isum1) * scale[1];
+            isum += vec_hsum_i32x4(isum2) * scale[2];
+            isum += vec_hsum_i32x4(isum3) * scale[3];
 
             scale += 4;
 
@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
 
             const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-            sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
+            sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
 
             v_y[0] = vec_xl(0,  y0);
             v_y[1] = vec_xl(16, y0);
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
 
             const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-            sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
+            sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
         }
 
         sumf += d * (sumi1 + sumi2);
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
         const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
         const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
-        const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+        const int32_t mins = vec_hsum_i32x4(v_mins);
 
         const uint8_t * scales = (const uint8_t *)utmp;
         const uint8_t * GGML_RESTRICT x0l = x[i].qs;
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
             int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
 
-            sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
-            sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
+            sumi += vec_hsum_i32x4(sumi0) * *scales++;
+            sumi += vec_hsum_i32x4(sumi1) * *scales++;
         }
 
         sumf += d * sumi - dmin * mins;
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
         const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
 
-        const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+        const int32_t mins = vec_hsum_i32x4(v_mins);
 
         int32_t isum = 0;
         for (int j = 0; j < QK_K/128; ++j) {
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
             int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
 
-            isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
-                    (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
-                    (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
-                    (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+            isum += vec_hsum_i32x4(summs0) * scale[0] +
+                    vec_hsum_i32x4(summs1) * scale[1] +
+                    vec_hsum_i32x4(summs2) * scale[2] +
+                    vec_hsum_i32x4(summs3) * scale[3];
 
             scale += 4;
 
@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
             summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
 
-            isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
-                    (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
-                    (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
-                    (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+            isum += vec_hsum_i32x4(summs0) * scale[0] +
+                    vec_hsum_i32x4(summs1) * scale[1] +
+                    vec_hsum_i32x4(summs2) * scale[2] +
+                    vec_hsum_i32x4(summs3) * scale[3];
 
             scale += 4;
         }
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
         const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
         const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
 
-        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
+        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
     }
 
     *s = sumf;
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 
             h >>= 4;
 
-            sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
-            sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
+            sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
+            sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
         }
 
         sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);