@@ -1003,11 +1003,10 @@ inline __m256i accum_q4_0_quants(const __m256i * v, const int8_t * qs) {
 template <int nrc_y>
 static void mul_mat_q4_0_r8_q8_2_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     GGML_ASSERT(nrc_x%8 == 0);
-    Q8<nrc_y, block_q8_1_x4> q8(info);
+    Q8<nrc_y, block_q8_2_x4> q8(info);
     auto m4 = _mm256_set1_epi8(0xf);
     int nb = n / QK4_NL;
     __m256i v[8];
-    GGML_ASSERT(nb%4 == 0);
     if constexpr (nrc_y == 1) {
         union { __m256 vec; float val[8]; } helper;
         for (int ix = 0; ix < nrc_x; ix += 8) {
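The substantive change in this first hunk is twofold: activations are now read as `block_q8_2_x4` instead of `block_q8_1_x4`, and `GGML_ASSERT(nb%4 == 0)` is dropped because the tail loops below now handle a block count that is not a multiple of 4. For orientation, here is a minimal sketch of the assumed `block_q8_2` layout; it is inferred from this diff (the old code was already reinterpreting the q8_1 scale fields as bf16), not copied from the repository's definition:

```cpp
// Hypothetical sketch, inferred from this diff -- the authoritative typedefs
// live elsewhere in the repository. block_q8_2 has the same size and field
// names as block_q8_1, but d and s hold bf16 bit patterns instead of fp16.
typedef struct {
    uint16_t d;        // block scale as bf16 bits
    uint16_t s;        // d * sum(qs) as bf16 bits, used for offset correction
    int8_t   qs[32];   // 32 signed 8-bit quants
} block_q8_2;          // block_q8_2_x4 packs four such blocks, scales first
```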
@@ -1026,14 +1025,14 @@ static void mul_mat_q4_0_r8_q8_2_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
                 }
             }
             for (int ib = 4*(nb/4); ib < nb; ++ib) {
-                auto qy = (const block_q8_1 *)q8.y[0];
+                auto qy = (const block_q8_2 *)q8.y[0];
                 auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq4[ib].d));
                 prepare_q4_0_quants_avx2(iq4[ib].qs, v, m4);
                 auto sumi = accum_q4_0_quants(v, qy[ib].qs);
-                ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-                auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d)));
+                auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+                auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(d8));
                 acc1 = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc1);
-                acc2 = _mm256_fmadd_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(s)), acc2);
+                acc2 = _mm256_fmadd_ps(scales, _mm256_set1_ps(m8), acc2);
             }
             acc1 = _mm256_fmadd_ps(acc2, _mm256_set1_ps(-8.f), acc1);
             info.store(ix, 0, acc1);
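The repeated `ggml_bf16_t d{...}, s{...}` plus `GGML_BF16_TO_FP32` boilerplate is replaced by one call to `ScaleHelperQ8_2::prepare1`, consumed through a structured binding. A plausible sketch of what that helper does, under the layout assumption above; the real implementation lives elsewhere in the iqk code and the signature is assumed from usage:

```cpp
#include <utility>

// Hedged sketch of ScaleHelperQ8_2::prepare1: widen the two bf16 scale
// fields of one activation block to float. Templated on the block type,
// which would also let it accept a block_q8_1-typed pointer to the same bytes.
struct ScaleHelperQ8_2 {
    template <typename Block>
    static inline std::pair<float, float> prepare1(const Block * y) {
        ggml_bf16_t d{y->d}, s{y->s};   // reinterpret the raw 16-bit fields
        return { GGML_BF16_TO_FP32(d), GGML_BF16_TO_FP32(s) };
    }
};
// Usage, as in the hunk above:
//   auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
```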
@@ -1077,12 +1076,12 @@ static void mul_mat_q4_0_r8_q8_2_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
                 auto scales_m = _mm256_mul_ps(scales, _mm256_set1_ps(-8.f));
                 prepare_q4_0_quants_avx2(iq4[ib].qs, v, m4);
                 for (int iy = 0; iy < nrc_y; ++iy) {
-                    auto qy = (const block_q8_1 *)q8.y[iy];
+                    auto qy = (const block_q8_2 *)q8.y[iy];
                     auto sumi = accum_q4_0_quants(v, qy[ib].qs);
-                    ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-                    auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d)));
+                    auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+                    auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(d8));
                     acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
-                    acc[iy] = _mm256_fmadd_ps(scales_m, _mm256_set1_ps(GGML_BF16_TO_FP32(s)), acc[iy]);
+                    acc[iy] = _mm256_fmadd_ps(scales_m, _mm256_set1_ps(m8), acc[iy]);
                 }
             }
             for (int iy = 0; iy < nrc_y; ++iy) {
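Why every variant carries the extra `m8` FMA: Q4_0 stores unsigned nibbles q ∈ [0,15] that decode as d₄·(q−8), while `accum_q4_0_quants` takes the integer dot product against the raw nibbles. Writing d₈ for the activation scale and m₈ = s = d₈·Σⱼ q⁽⁸⁾ⱼ for the Q8_2 sum term (assuming the standard q8_1-style meaning of `s`), the per-block contribution is

$$\sum_j d_4 (q_j - 8)\, d_8\, q^{(8)}_j \;=\; d_4 d_8 \sum_j q_j\, q^{(8)}_j \;-\; 8\, d_4 \Big(d_8 \sum_j q^{(8)}_j\Big) \;=\; d_4 d_8\,\mathrm{sumi} \;-\; 8\, d_4\, m_8,$$

which is exactly the pair of FMAs in these hunks: one with `d4d8 = scales*d8` against `sumi`, one with `scales_m = -8.f*scales` against `m8`. The `nrc_y == 1` branch instead accumulates `scales*m8` into `acc2` and applies a single deferred `-8.f` multiply at the end.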
@@ -1101,7 +1100,7 @@ static void mul_mat_q4_0_r8_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
         return;
     }
     GGML_ASSERT(nrc_x%16 == 0);
-    Q8<nrc_y, block_q8_1_x4> q8(info);
+    Q8<nrc_y, block_q8_2_x4> q8(info);
     auto m4 = _mm512_set1_epi8(0xf);
     int nb = n / QK4_NL;
     __m512 acc[2*nrc_y] = {};
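`Q8<nrc_y, block_q8_2_x4>` is the same activation accessor as before, just instantiated with the new block type. A rough, hypothetical sketch of its assumed shape (the real template carries more machinery): it keeps one row pointer per right-hand-side column, which is what the tail loops cast via `q8.y[iy]`. Casting to a plain per-block type works because, under the layout sketched earlier, four packed blocks and one `_x4` group occupy the same number of bytes, so `qy[ib]` indexes the trailing blocks correctly provided the quantizer emits them in the plain layout.

```cpp
// Hypothetical, simplified sketch of the Q8 accessor used in these kernels;
// src1_row is assumed from the DataInfo interface seen elsewhere in the code.
template <int nrc_y, typename Block>
struct Q8 {
    explicit Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) {
            y[iy] = (const Block *)info.src1_row(iy);  // quantized activation rows
        }
    }
    const Block * y[nrc_y];  // one pointer per y column handled by this kernel
};
```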
@@ -1159,10 +1158,10 @@ static void mul_mat_q4_0_r8_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
         for (int iy = 0; iy < nrc_y; ++iy) {
             auto qy = (const block_q8_1 *)q8.y[iy];
             auto sumi = dot(qy[ib].qs);
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto dy = _mm512_set1_ps(d8);
             acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]);
-            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]);
+            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(m8), acc[2*iy+1]);
         }
     }
     for (int iy = 0; iy < nrc_y; ++iy) {
@@ -1245,12 +1244,12 @@ static void mul_mat_q5_0_r4_q8_2_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     for (int ib = 4*(nb/4); ib < nb; ++ib) {
         auto scales = prepare(iq5[ib]);
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_1 *)q8.y[iy];
+            auto qy = (const block_q8_2 *)q8.y[iy];
             auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs));
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d)));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(d8));
             acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
-            acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-8.f*GGML_BF16_TO_FP32(s)), acc[iy]);
+            acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-8.f*m8), acc[iy]);
         }
     }
     for (int iy = 0; iy < nrc_y; ++iy) {
@@ -1325,12 +1324,12 @@ static void mul_mat_q5_0_r4_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     for (int ib = 4*(nb/4); ib < nb; ++ib) {
         auto scales = prepare(iq5l[ib], iq5h[ib]);
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_1 *)q8.y[iy];
+            auto qy = (const block_q8_2 *)q8.y[iy];
             auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs));
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto dy = _mm512_set1_ps(d8);
             acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]);
-            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]);
+            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(m8), acc[2*iy+1]);
         }
     }
     for (int iy = 0; iy < nrc_y; ++iy) {
@@ -1415,12 +1414,12 @@ static void mul_mat_q6_0_r4_q8_2_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     for (int ib = 4*(nb/4); ib < nb; ++ib) {
         auto scales = prepare(iq6[ib]);
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_1 *)q8.y[iy];
+            auto qy = (const block_q8_2 *)q8.y[iy];
             auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs));
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d)));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(d8));
             acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
-            acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-16.f*GGML_BF16_TO_FP32(s)), acc[iy]);
+            acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-16.f*m8), acc[iy]);
         }
     }
 
@@ -1495,12 +1494,12 @@ static void mul_mat_q6_0_r4_q8_2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     for (int ib = 4*(nb/4); ib < nb; ++ib) {
         auto scales = prepare(iq6l[ib], iq6h[ib]);
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_1 *)q8.y[iy];
+            auto qy = (const block_q8_2 *)q8.y[iy];
             auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs));
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto dy = _mm512_set1_ps(d8);
             acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]);
-            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]);
+            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(m8), acc[2*iy+1]);
         }
     }
     for (int iy = 0; iy < nrc_y; ++iy) {