@@ -284,6 +284,45 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
284284
285285    float32x4_t  v_acc  =  vec_splats (0.0f );
286286
287+     #pragma  GCC unroll 8
288+     for  (; ib  +  1  <  nb ; ib  +=  2 ) {
289+         const  block_mxfp4  *  GGML_RESTRICT  x0  =  & x [ib  +  0 ];
290+         const  block_mxfp4  *  GGML_RESTRICT  x1  =  & x [ib  +  1 ];
291+         const  block_q8_0   *  GGML_RESTRICT  y0  =  & y [ib  +  0 ];
292+         const  block_q8_0   *  GGML_RESTRICT  y1  =  & y [ib  +  1 ];
293+ 
294+         const  uint8x16_t  v_x0  =  vec_xl (0 , x0 -> qs );
295+         const  uint8x16_t  v_x1  =  vec_xl (0 , x1 -> qs );
296+ 
297+         int8x16_t  v_x0l  =  (int8x16_t )vec_and (v_x0 , v_m );
298+         int8x16_t  v_x0h  =  (int8x16_t )vec_sr (v_x0 , 4 );
299+         int8x16_t  v_x1l  =  (int8x16_t )vec_and (v_x1 , v_m );
300+         int8x16_t  v_x1h  =  (int8x16_t )vec_sr (v_x1 , 4 );
301+ 
302+         v_x0l  =  vec_perm (v_k , v_k , (uchar8x16_t )v_x0l );
303+         v_x0h  =  vec_perm (v_k , v_k , (uchar8x16_t )v_x0h );
304+         v_x1l  =  vec_perm (v_k , v_k , (uchar8x16_t )v_x1l );
305+         v_x1h  =  vec_perm (v_k , v_k , (uchar8x16_t )v_x1h );
306+ 
307+         const  int8x16_t  v_y0l  =  vec_xl (0 ,       y0 -> qs );
308+         const  int8x16_t  v_y0h  =  vec_xl (QK8_0 /2 , y0 -> qs );
309+         const  int8x16_t  v_y1l  =  vec_xl (0 ,       y1 -> qs );
310+         const  int8x16_t  v_y1h  =  vec_xl (QK8_0 /2 , y1 -> qs );
311+ 
312+         const  int32x4_t  v_xy0  =  ggml_vec_dot (ggml_vec_dot (vec_splats (0 ), v_x0l , v_y0l ), v_x0h , v_y0h );
313+         const  int32x4_t  v_xy1  =  ggml_vec_dot (ggml_vec_dot (vec_splats (0 ), v_x1l , v_y1l ), v_x1h , v_y1h );
314+ 
315+         const  float32x4_t  v_xy0f  =  vec_float (v_xy0 );
316+         const  float32x4_t  v_xy1f  =  vec_float (v_xy1 );
317+ 
318+         const  float32x4_t  v_d0  =  vec_splats (GGML_E8M0_TO_FP32_HALF (x0 -> e ) *  GGML_CPU_FP16_TO_FP32 (y0 -> d ));
319+         const  float32x4_t  v_d1  =  vec_splats (GGML_E8M0_TO_FP32_HALF (x1 -> e ) *  GGML_CPU_FP16_TO_FP32 (y1 -> d ));
320+ 
321+         v_acc  =  vec_madd (v_xy0f , v_d0 , v_acc );
322+         v_acc  =  vec_madd (v_xy1f , v_d1 , v_acc );
323+     }
324+ 
325+     #pragma  GCC unroll 8
287326    for  (; ib  <  nb ; ++ ib ) {
288327        const  block_mxfp4  *  GGML_RESTRICT  x0  =  & x [ib  +  0 ];
289328        const  block_q8_0   *  GGML_RESTRICT  y0  =  & y [ib  +  0 ];
0 commit comments