@@ -3957,19 +3957,19 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c
39573957 // Shuffle pattern one - right side input
39583958 const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32 (rhs_mat_0145_00, 136 ); // B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3)
39593959 const __m256i rhs_mat_2367_00_sp1 = _mm256_shuffle_epi32 (rhs_mat_2367_00, 136 ); // B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3)
3960-
3960+
39613961 const __m256i rhs_mat_0145_01_sp1 = _mm256_shuffle_epi32 (rhs_mat_0145_01, 136 ); // B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11)
39623962 const __m256i rhs_mat_2367_01_sp1 = _mm256_shuffle_epi32 (rhs_mat_2367_01, 136 ); // B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11)
3963-
3963+
39643964 const __m256i rhs_mat_0145_02_sp1 = _mm256_shuffle_epi32 (rhs_mat_0145_02, 136 ); // B00(16-19) B01(16-19) B00(16-19) B01(16-19) B04(16-19) B05(16-19) B04(16-19) B05(16-19)
39653965 const __m256i rhs_mat_2367_02_sp1 = _mm256_shuffle_epi32 (rhs_mat_2367_02, 136 ); // B02(16-19) B03(16-19) B02(16-19) B03(16-19) B06(16-19) B07(16-19) B06(16-19) B07(16-19)
3966-
3966+
39673967 const __m256i rhs_mat_0145_03_sp1 = _mm256_shuffle_epi32 (rhs_mat_0145_03, 136 ); // B00(24-27) B01(24-27) B00(24-27) B01(24-27) B04(24-27) B05(24-27) B04(24-27) B05(24-27)
39683968 const __m256i rhs_mat_2367_03_sp1 = _mm256_shuffle_epi32 (rhs_mat_2367_03, 136 ); // B02(24-27) B03(24-27) B02(24-27) B03(24-27) B06(24-27) B07(24-27) B06(24-27) B07(24-27)
3969-
3969+
39703970 const __m256i rhs_mat_0145_10_sp1 = _mm256_shuffle_epi32 (rhs_mat_0145_10, 136 ); // B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3)
39713971 const __m256i rhs_mat_2367_10_sp1 = _mm256_shuffle_epi32 (rhs_mat_2367_10, 136 ); // B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3)
3972-
3972+
39733973 const __m256i rhs_mat_0145_11_sp1 = _mm256_shuffle_epi32 (rhs_mat_0145_11, 136 ); // B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11)
39743974 const __m256i rhs_mat_2367_11_sp1 = _mm256_shuffle_epi32 (rhs_mat_2367_11, 136 ); // B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11)
39753975
@@ -3978,30 +3978,30 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c
39783978
39793979 const __m256i rhs_mat_0145_13_sp1 = _mm256_shuffle_epi32 (rhs_mat_0145_13, 136 ); // B10(24-27) B11(24-27) B10(24-27) B11(24-27) B14(24-27) B15(24-27) B14(24-27) B15(24-27)
39803980 const __m256i rhs_mat_2367_13_sp1 = _mm256_shuffle_epi32 (rhs_mat_2367_13, 136 ); // B12(24-27) B13(24-27) B12(24-27) B13(24-27) B16(24-27) B17(24-27) B16(24-27) B17(24-27)
3981-
3981+
39823982
39833983 // Shuffle pattern two - right side input
39843984 const __m256i rhs_mat_0145_00_sp2 = _mm256_shuffle_epi32 (rhs_mat_0145_00, 221 ); // B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7)
39853985 const __m256i rhs_mat_2367_00_sp2 = _mm256_shuffle_epi32 (rhs_mat_2367_00, 221 ); // B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7)
3986-
3986+
39873987 const __m256i rhs_mat_0145_01_sp2 = _mm256_shuffle_epi32 (rhs_mat_0145_01, 221 ); // B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15)
39883988 const __m256i rhs_mat_2367_01_sp2 = _mm256_shuffle_epi32 (rhs_mat_2367_01, 221 ); // B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15)
3989-
3989+
39903990 const __m256i rhs_mat_0145_02_sp2 = _mm256_shuffle_epi32 (rhs_mat_0145_02, 221 ); // B00(20-23) B01(20-23) B00(20-23) B01(20-23) B04(20-23) B05(20-23) B04(20-23) B05(20-23)
39913991 const __m256i rhs_mat_2367_02_sp2 = _mm256_shuffle_epi32 (rhs_mat_2367_02, 221 ); // B02(20-23) B03(20-23) B02(20-23) B03(20-23) B06(20-23) B07(20-23) B06(20-23) B07(20-23)
3992-
3992+
39933993 const __m256i rhs_mat_0145_03_sp2 = _mm256_shuffle_epi32 (rhs_mat_0145_03, 221 ); // B00(28-31) B01(28-31) B00(28-31) B01(28-31) B04(28-31) B05(28-31) B04(28-31) B05(28-31)
39943994 const __m256i rhs_mat_2367_03_sp2 = _mm256_shuffle_epi32 (rhs_mat_2367_03, 221 ); // B02(28-31) B03(28-31) B02(28-31) B03(28-31) B06(28-31) B07(28-31) B06(28-31) B07(28-31)
3995-
3995+
39963996 const __m256i rhs_mat_0145_10_sp2 = _mm256_shuffle_epi32 (rhs_mat_0145_10, 221 ); // B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7)
39973997 const __m256i rhs_mat_2367_10_sp2 = _mm256_shuffle_epi32 (rhs_mat_2367_10, 221 ); // B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7)
3998-
3998+
39993999 const __m256i rhs_mat_0145_11_sp2 = _mm256_shuffle_epi32 (rhs_mat_0145_11, 221 ); // B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15)
40004000 const __m256i rhs_mat_2367_11_sp2 = _mm256_shuffle_epi32 (rhs_mat_2367_11, 221 ); // B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15)
4001-
4001+
40024002 const __m256i rhs_mat_0145_12_sp2 = _mm256_shuffle_epi32 (rhs_mat_0145_12, 221 ); // B10(20-23) B11(20-23) B10(20-23) B11(20-23) B14(20-23) B15(20-23) B14(20-23) B15(20-23)
40034003 const __m256i rhs_mat_2367_12_sp2 = _mm256_shuffle_epi32 (rhs_mat_2367_12, 221 ); // B12(20-23) B13(20-23) B12(20-23) B13(20-23) B16(20-23) B17(20-23) B16(20-23) B17(20-23)
4004-
4004+
40054005 const __m256i rhs_mat_0145_13_sp2 = _mm256_shuffle_epi32 (rhs_mat_0145_13, 221 ); // B10(28-31) B11(28-31) B10(28-31) B11(28-31) B14(28-31) B15(28-31) B14(28-31) B15(28-31)
40064006 const __m256i rhs_mat_2367_13_sp2 = _mm256_shuffle_epi32 (rhs_mat_2367_13, 221 ); // B12(28-31) B13(28-31) B12(28-31) B13(28-31) B16(28-31) B17(28-31) B16(28-31) B17(28-31)
40074007
@@ -4095,7 +4095,7 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c
40954095 const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32 (lhs_mat_23_11, 160 ); // A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11)
40964096
40974097 const __m256i lhs_mat_01_12_sp1 = _mm256_shuffle_epi32 (lhs_mat_01_12, 160 ); // A10(16-19) A10(16-19) A11(16-19) A11(16-19) A10(16-19) A10(16-19) A11(16-19) A11(16-19)
4098- const __m256i lhs_mat_23_12_sp1 = _mm256_shuffle_epi32 (lhs_mat_23_12, 160 ); // A12(16-19) A13(16-19) A12(16-19) A13(16-19) A12(16-19) A13(16-19) A12(16-19) A13(16-19)
4098+ const __m256i lhs_mat_23_12_sp1 = _mm256_shuffle_epi32 (lhs_mat_23_12, 160 ); // A12(16-19) A13(16-19) A12(16-19) A13(16-19) A12(16-19) A13(16-19) A12(16-19) A13(16-19)
40994099
41004100 const __m256i lhs_mat_01_13_sp1 = _mm256_shuffle_epi32 (lhs_mat_01_13, 160 ); // A10(24-27) A10(24-27) A11(24-27) A11(24-27) A10(24-27) A10(24-27) A11(24-27) A11(24-27)
41014101 const __m256i lhs_mat_23_13_sp1 = _mm256_shuffle_epi32 (lhs_mat_23_13, 160 ); // A12(24-27) A13(24-27) A12(24-27) A13(24-27) A12(24-27) A13(24-27) A12(24-27) A13(24-27)
@@ -4212,7 +4212,7 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c
42124212 }
42134213 for (; y < nr / 4 ; y++) {
42144214
4215- const block_q8_Kx4 * a_ptr = a_ptr_start + (y * nb);;
4215+ const block_q8_Kx4 * a_ptr = a_ptr_start + (y * nb);
42164216
42174217 for (int64_t x = 0 ; x < nc / 8 ; x++) {
42184218
@@ -4592,7 +4592,7 @@ static void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, c
45924592 for (int sb = 0 ; sb < 8 ; sb++) {
45934593 uint8_t *mins = (uint8_t *) utmp + 8 + sb * 16 ;
45944594 for (int m = 0 ; m < 4 ; m++) {
4595- int16_t *bsums = ( int16_t *) a_ptr[l].bsums + (sb * 8 ) + (m * 4 ) - ((sb % 2 ) * 6 );
4595+ int16_t *bsums = a_ptr[l].bsums + (sb * 8 ) + (m * 4 ) - ((sb % 2 ) * 6 );
45964596 for (int j = 0 ; j < ncols_interleaved; j++) {
45974597 sum_minf[m][j] += mins[j] * (bsums[0 ] + bsums[1 ]) * GGML_FP16_TO_FP32 (b_ptr[l].dmin [j]) * a_ptr[l].d [m];
45984598 }
0 commit comments