@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
740740 return vgetq_lane_f32 (v , 0 ) + vgetq_lane_f32 (v , 1 ) + vgetq_lane_f32 (v , 2 ) + vgetq_lane_f32 (v , 3 );
741741}
742742
743- float vminvq_f32 (float32x4_t v ) {
743+ inline static float vminvq_f32 (float32x4_t v ) {
744744 return
745745 MIN (MIN (vgetq_lane_f32 (v , 0 ), vgetq_lane_f32 (v , 1 )),
746746 MIN (vgetq_lane_f32 (v , 2 ), vgetq_lane_f32 (v , 3 )));
747747}
748748
749- float vmaxvq_f32 (float32x4_t v ) {
749+ inline static float vmaxvq_f32 (float32x4_t v ) {
750750 return
751751 MAX (MAX (vgetq_lane_f32 (v , 0 ), vgetq_lane_f32 (v , 1 )),
752752 MAX (vgetq_lane_f32 (v , 2 ), vgetq_lane_f32 (v , 3 )));
753753}
754754
755- int32x4_t vcvtnq_s32_f32 (float32x4_t v ) {
755+ inline static int32x4_t vcvtnq_s32_f32 (float32x4_t v ) {
756756 int32x4_t res ;
757757
758758 res [0 ] = roundf (vgetq_lane_f32 (v , 0 ));
@@ -766,7 +766,6 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
766766#endif
767767#endif
768768
769-
770769#define QK4_0 32
771770typedef struct {
772771 ggml_fp16_t d ; // delta
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
10561055 y [i ].qs [4 * j + 3 ] = vgetq_lane_s32 (vi , 3 );
10571056 }
10581057 }
1058+ #elif defined(__wasm_simd128__ )
1059+ for (int i = 0 ; i < nb ; i ++ ) {
1060+ v128_t srcv [8 ];
1061+ v128_t asrcv [8 ];
1062+ v128_t amaxv [8 ];
1063+
1064+ for (int j = 0 ; j < 8 ; j ++ ) srcv [j ] = wasm_v128_load (x + i * 32 + 4 * j );
1065+ for (int j = 0 ; j < 8 ; j ++ ) asrcv [j ] = wasm_f32x4_abs (srcv [j ]);
1066+
1067+ for (int j = 0 ; j < 4 ; j ++ ) amaxv [2 * j ] = wasm_f32x4_max (asrcv [2 * j ], asrcv [2 * j + 1 ]);
1068+ for (int j = 0 ; j < 2 ; j ++ ) amaxv [4 * j ] = wasm_f32x4_max (amaxv [4 * j ], amaxv [4 * j + 2 ]);
1069+ for (int j = 0 ; j < 1 ; j ++ ) amaxv [8 * j ] = wasm_f32x4_max (amaxv [8 * j ], amaxv [8 * j + 4 ]);
1070+
1071+ const float amax = MAX (MAX (wasm_f32x4_extract_lane (amaxv [0 ], 0 ),
1072+ wasm_f32x4_extract_lane (amaxv [0 ], 1 )),
1073+ MAX (wasm_f32x4_extract_lane (amaxv [0 ], 2 ),
1074+ wasm_f32x4_extract_lane (amaxv [0 ], 3 )));
1075+
1076+ const float d = amax / ((1 << 7 ) - 1 );
1077+ const float id = d ? 1.0f /d : 0.0f ;
1078+
1079+ y [i ].d = GGML_FP32_TO_FP16 (d );
1080+
1081+ for (int j = 0 ; j < 8 ; j ++ ) {
1082+ const v128_t v = wasm_f32x4_mul (srcv [j ], wasm_f32x4_splat (id ));
1083+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4 (v );
1084+
1085+ y [i ].qs [4 * j + 0 ] = wasm_i32x4_extract_lane (vi , 0 );
1086+ y [i ].qs [4 * j + 1 ] = wasm_i32x4_extract_lane (vi , 1 );
1087+ y [i ].qs [4 * j + 2 ] = wasm_i32x4_extract_lane (vi , 2 );
1088+ y [i ].qs [4 * j + 3 ] = wasm_i32x4_extract_lane (vi , 3 );
1089+ }
1090+ }
10591091#elif defined(__AVX2__ ) || defined(__AVX__ )
10601092 for (int i = 0 ; i < nb ; i ++ ) {
10611093 // Load elements into 4 AVX vectors
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
12241256
12251257 y [i ].s = d * vaddvq_s32 (accv );
12261258 }
1259+ #elif defined(__wasm_simd128__ )
1260+ for (int i = 0 ; i < nb ; i ++ ) {
1261+ v128_t srcv [8 ];
1262+ v128_t asrcv [8 ];
1263+ v128_t amaxv [8 ];
1264+
1265+ for (int j = 0 ; j < 8 ; j ++ ) srcv [j ] = wasm_v128_load (x + i * 32 + 4 * j );
1266+ for (int j = 0 ; j < 8 ; j ++ ) asrcv [j ] = wasm_f32x4_abs (srcv [j ]);
1267+
1268+ for (int j = 0 ; j < 4 ; j ++ ) amaxv [2 * j ] = wasm_f32x4_max (asrcv [2 * j ], asrcv [2 * j + 1 ]);
1269+ for (int j = 0 ; j < 2 ; j ++ ) amaxv [4 * j ] = wasm_f32x4_max (amaxv [4 * j ], amaxv [4 * j + 2 ]);
1270+ for (int j = 0 ; j < 1 ; j ++ ) amaxv [8 * j ] = wasm_f32x4_max (amaxv [8 * j ], amaxv [8 * j + 4 ]);
1271+
1272+ const float amax = MAX (MAX (wasm_f32x4_extract_lane (amaxv [0 ], 0 ),
1273+ wasm_f32x4_extract_lane (amaxv [0 ], 1 )),
1274+ MAX (wasm_f32x4_extract_lane (amaxv [0 ], 2 ),
1275+ wasm_f32x4_extract_lane (amaxv [0 ], 3 )));
1276+
1277+ const float d = amax / ((1 << 7 ) - 1 );
1278+ const float id = d ? 1.0f /d : 0.0f ;
1279+
1280+ y [i ].d = d ;
1281+
1282+ v128_t accv = wasm_i32x4_splat (0 );
1283+
1284+ for (int j = 0 ; j < 8 ; j ++ ) {
1285+ const v128_t v = wasm_f32x4_mul (srcv [j ], wasm_f32x4_splat (id ));
1286+ const v128_t vi = wasm_i32x4_trunc_sat_f32x4 (v );
1287+
1288+ y [i ].qs [4 * j + 0 ] = wasm_i32x4_extract_lane (vi , 0 );
1289+ y [i ].qs [4 * j + 1 ] = wasm_i32x4_extract_lane (vi , 1 );
1290+ y [i ].qs [4 * j + 2 ] = wasm_i32x4_extract_lane (vi , 2 );
1291+ y [i ].qs [4 * j + 3 ] = wasm_i32x4_extract_lane (vi , 3 );
1292+
1293+ accv = wasm_i32x4_add (accv , vi );
1294+ }
1295+
1296+ y [i ].s = d * (wasm_i32x4_extract_lane (accv , 0 ) +
1297+ wasm_i32x4_extract_lane (accv , 1 ) +
1298+ wasm_i32x4_extract_lane (accv , 2 ) +
1299+ wasm_i32x4_extract_lane (accv , 3 ));
1300+ }
12271301#elif defined(__AVX2__ ) || defined(__AVX__ )
12281302 for (int i = 0 ; i < nb ; i ++ ) {
12291303 // Load elements into 4 AVX vectors
@@ -2598,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
25982672 const block_q8_0 * restrict y0 = & y [i ];
25992673
26002674 const v128_t m4b = wasm_i8x16_splat (0x0F );
2601- const v128_t s16b = wasm_i8x16_splat (0x10 );
26022675
26032676 // extract the 5th bit
26042677 memcpy (& qh , x0 -> qh , sizeof (qh ));
@@ -2636,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
26362709 const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h );
26372710 const v128_t v1hh = wasm_i16x8_extend_high_i8x16 (v1h );
26382711
2639- const float x0d = GGML_FP16_TO_FP32 (x0 -> d );
2640-
26412712 // dot product
26422713 sumv = wasm_f32x4_add (sumv , wasm_f32x4_mul (wasm_f32x4_convert_i32x4 (
26432714 wasm_i32x4_add (
26442715 wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0lfl , v1ll ),
26452716 wasm_i32x4_dot_i16x8 (v0lfh , v1lh )),
26462717 wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0hfl , v1hl ),
2647- wasm_i32x4_dot_i16x8 (v0hfh , v1hh )))), wasm_f32x4_splat (x0d * y0 -> d )));
2718+ wasm_i32x4_dot_i16x8 (v0hfh , v1hh )))),
2719+ wasm_f32x4_splat (GGML_FP16_TO_FP32 (x0 -> d ) * GGML_FP16_TO_FP32 (y0 -> d ))));
26482720 }
26492721
26502722 * s = wasm_f32x4_extract_lane (sumv , 0 ) + wasm_f32x4_extract_lane (sumv , 1 ) +
@@ -2868,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
28682940 const v128_t v0l = wasm_v128_and (v0 , m4b );
28692941 const v128_t v0h = wasm_u8x16_shr (v0 , 4 );
28702942
2871- static bool x = true;
2872-
28732943 // add high bit
28742944 const v128_t v0lf = wasm_v128_or (v0l , qhl );
28752945 const v128_t v0hf = wasm_v128_or (v0h , qhh );
@@ -2892,11 +2962,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
28922962 // dot product
28932963 sumv = wasm_f32x4_add (sumv ,
28942964 wasm_f32x4_mul (wasm_f32x4_convert_i32x4 (wasm_i32x4_add (
2895- wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0lfl , v1ll ),
2896- wasm_i32x4_dot_i16x8 (v0lfh , v1lh )),
2897- wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0hfl , v1hl ),
2898- wasm_i32x4_dot_i16x8 (v0hfh , v1hh )))),
2899- wasm_f32x4_splat (GGML_FP16_TO_FP32 (x0 -> d ) * y0 -> d ));
2965+ wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0lfl , v1ll ),
2966+ wasm_i32x4_dot_i16x8 (v0lfh , v1lh )),
2967+ wasm_i32x4_add (wasm_i32x4_dot_i16x8 (v0hfl , v1hl ),
2968+ wasm_i32x4_dot_i16x8 (v0hfh , v1hh )))),
2969+ wasm_f32x4_splat (GGML_FP16_TO_FP32 (x0 -> d ) * y0 -> d ))) ;
29002970 }
29012971
29022972 * s = wasm_f32x4_extract_lane (sumv , 0 ) + wasm_f32x4_extract_lane (sumv , 1 ) +
0 commit comments