@@ -48,8 +48,10 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
4848 const int sve_register_length = ggml_cpu_get_sve_cnt () * 8 ;
4949 const int ggml_f32_epr = sve_register_length / 32 ;
5050 const svfloat32_t inactive1 = svdup_n_f32 (0.0f );
51- const svint32_t inactive2 = svdup_n_s32 (0 );
5251 const svbool_t pg = svptrue_b32 ();
52+ svfloat32_t zero = svdup_f32 (0.0f );
53+ svfloat32_t half = svdup_f32 (0.5f );
54+
5355 for (int i = 0 ; i < nb ; i += 1 ) {
5456 svfloat32_t srcv1 , asrcv1 ;
5557 svfloat32_t sv_max = svdup_n_f32 (0.0f );
@@ -67,8 +69,14 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
6769 for (int j = 0 ; j < QK8_0 ; j += ggml_f32_epr ) {
6870 srcv1 = svld1_f32 (pg , x + i * 32 + j );
6971 const svfloat32_t v1 = svmul_n_f32_m (pg , srcv1 , id );
70- const svint32_t vi1 = svcvt_s32_f32_m (inactive2 , pg , v1 );
71- svst1b_s32 (pg , & y [i ].qs [j ], vi1 );
72+
73+ svbool_t ge_zero = svcmpge_f32 (pg , v1 , zero );
74+ svfloat32_t v_pos = svadd_f32_m (pg , v1 , half );
75+ svfloat32_t v_neg = svsub_f32_m (pg , v1 , half );
76+
77+ svfloat32_t v_rounded = svsel_f32 (ge_zero , v_pos , v_neg );
78+ svint32_t result = svcvt_s32_f32_x (pg , v_rounded );
79+ svst1b_s32 (pg , & y [i ].qs [j ], result );
7280 }
7381 }
7482#elif defined(__ARM_NEON )
0 commit comments