@@ -2700,7 +2700,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
27002700 const __m128i row_scale_f16 = _mm_shuffle_epi32 (_mm_maskload_epi32 ((int const *)(a_ptrs[rp][b].d ), loadMask), 68 );
27012701 const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD (row_scale_f16);
27022702
2703- // Multiply with appropiate scales and accumulate
2703+ // Multiply with appropriate scales and accumulate
27042704 acc_rows[rp * 4 ] = _mm512_fmadd_ps (_mm512_cvtepi32_ps (iacc_row_0), _mm512_mul_ps (col_scale_f32, _mm512_shuffle_ps (row_scale_f32, row_scale_f32, 0 )), acc_rows[rp * 4 ]);
27052705 acc_rows[rp * 4 + 1 ] = _mm512_fmadd_ps (_mm512_cvtepi32_ps (iacc_row_1), _mm512_mul_ps (col_scale_f32, _mm512_shuffle_ps (row_scale_f32, row_scale_f32, 85 )), acc_rows[rp * 4 + 1 ]);
27062706 acc_rows[rp * 4 + 2 ] = _mm512_fmadd_ps (_mm512_cvtepi32_ps (iacc_row_2), _mm512_mul_ps (col_scale_f32, _mm512_shuffle_ps (row_scale_f32, row_scale_f32, 170 )), acc_rows[rp * 4 + 2 ]);
@@ -2891,7 +2891,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
28912891 const __m128i row_scale_f16 = _mm_shuffle_epi32 (_mm_maskload_epi32 ((int const *)(a_ptr[b].d ), loadMask), 68 );
28922892 const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD (row_scale_f16);
28932893
2894- // Multiply with appropiate scales and accumulate
2894+ // Multiply with appropriate scales and accumulate
28952895 acc_rows[0 ] = _mm512_fmadd_ps (_mm512_cvtepi32_ps (iacc_row_0), _mm512_mul_ps (col_scale_f32, _mm512_shuffle_ps (row_scale_f32, row_scale_f32, 0 )), acc_rows[0 ]);
28962896 acc_rows[1 ] = _mm512_fmadd_ps (_mm512_cvtepi32_ps (iacc_row_1), _mm512_mul_ps (col_scale_f32, _mm512_shuffle_ps (row_scale_f32, row_scale_f32, 85 )), acc_rows[1 ]);
28972897 acc_rows[2 ] = _mm512_fmadd_ps (_mm512_cvtepi32_ps (iacc_row_2), _mm512_mul_ps (col_scale_f32, _mm512_shuffle_ps (row_scale_f32, row_scale_f32, 170 )), acc_rows[2 ]);
@@ -3064,7 +3064,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
30643064 // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
30653065 const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD (a_ptrs[rp][b].d , loadMask);
30663066
3067- // Multiply with appropiate scales and accumulate
3067+ // Multiply with appropriate scales and accumulate
30683068 acc_rows[rp * 4 ] = _mm256_fmadd_ps (_mm256_cvtepi32_ps (iacc_row_0), _mm256_mul_ps (col_scale_f32, _mm256_shuffle_ps (row_scale_f32, row_scale_f32, 0 )), acc_rows[rp * 4 ]);
30693069 acc_rows[rp * 4 + 1 ] = _mm256_fmadd_ps (_mm256_cvtepi32_ps (iacc_row_1), _mm256_mul_ps (col_scale_f32, _mm256_shuffle_ps (row_scale_f32, row_scale_f32, 85 )), acc_rows[rp * 4 + 1 ]);
30703070 acc_rows[rp * 4 + 2 ] = _mm256_fmadd_ps (_mm256_cvtepi32_ps (iacc_row_2), _mm256_mul_ps (col_scale_f32, _mm256_shuffle_ps (row_scale_f32, row_scale_f32, 170 )), acc_rows[rp * 4 + 2 ]);
@@ -3229,7 +3229,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
32293229 // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
32303230 const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD (a_ptr[b].d , loadMask);
32313231
3232- // Multiply with appropiate scales and accumulate
3232+ // Multiply with appropriate scales and accumulate
32333233 acc_rows[0 ] = _mm256_fmadd_ps (_mm256_cvtepi32_ps (iacc_row_0), _mm256_mul_ps (col_scale_f32, _mm256_shuffle_ps (row_scale_f32, row_scale_f32, 0 )), acc_rows[0 ]);
32343234 acc_rows[1 ] = _mm256_fmadd_ps (_mm256_cvtepi32_ps (iacc_row_1), _mm256_mul_ps (col_scale_f32, _mm256_shuffle_ps (row_scale_f32, row_scale_f32, 85 )), acc_rows[1 ]);
32353235 acc_rows[2 ] = _mm256_fmadd_ps (_mm256_cvtepi32_ps (iacc_row_2), _mm256_mul_ps (col_scale_f32, _mm256_shuffle_ps (row_scale_f32, row_scale_f32, 170 )), acc_rows[2 ]);
0 commit comments