@@ -2489,11 +2489,11 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
24892489 __m256i signextendlut = _mm256_castsi128_si256 (_mm_set_epi8 (-1 , -2 , -3 , -4 , -5 , -6 , -7 , -8 , 7 , 6 , 5 , 4 , 3 , 2 , 1 , 0 ));
24902490 signextendlut = _mm256_permute2f128_si256 (signextendlut , signextendlut , 0 );
24912491 // Permute mask used for easier vector processing at later stages
2492- __m256i requiredOrder = _mm256_set_epi32 (3 , 2 , 1 , 0 , 7 , 6 , 5 , 4 );
2492+ __m256i requiredOrder = _mm256_set_epi32 (3 , 2 , 1 , 0 , 7 , 6 , 5 , 4 );
24932493 int64_t xstart = 0 ;
2494- int anr = nr - nr %16 ; // Used to align nr with boundary of 16
2494+ int anr = nr - nr %16 ; // Used to align nr with boundary of 16
24952495#ifdef __AVX512F__
2496- int anc = nc - nc %16 ; // Used to align nc with boundary of 16
2496+ int anc = nc - nc %16 ; // Used to align nc with boundary of 16
24972497 // Mask to mask out nibbles from packed bytes expanded to 512 bit length
24982498 const __m512i m4bexpanded = _mm512_set1_epi8 (0x0F );
24992499 // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
@@ -2510,9 +2510,9 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
25102510 }
25112511
25122512 // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
2513- for (int64_t x = xstart ; x < anc / 8 ; x += 2 ) {
2513+ for (int64_t x = xstart ; x < anc / 8 ; x += 2 ) {
25142514
2515- const block_q4_0x8 * b_ptr_0 = b_ptr_start + (x * b_nb );
2515+ const block_q4_0x8 * b_ptr_0 = b_ptr_start + (( x ) * b_nb );
25162516 const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1 ) * b_nb );
25172517
25182518 // Master FP accumulators
@@ -2703,9 +2703,9 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
27032703 const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb );
27042704
27052705 // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
2706- for (int64_t x = 0 ; x < anc / 8 ; x += 2 ) {
2706+ for (int64_t x = 0 ; x < anc / 8 ; x += 2 ) {
27072707
2708- const block_q4_0x8 * b_ptr_0 = b_ptr_start + (x * b_nb );
2708+ const block_q4_0x8 * b_ptr_0 = b_ptr_start + (( x ) * b_nb );
27092709 const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1 ) * b_nb );
27102710
27112711 // Master FP accumulators
@@ -2887,11 +2887,11 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
28872887 }
28882888 }
28892889 }
2890- if (anc != nc ) {
2890+ if (anc != nc ) {
28912891 xstart = anc /8 ;
28922892 y = 0 ;
28932893 }
2894- #endif
2894+ #endif // __AVX512F__
28952895
28962896 // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
28972897
0 commit comments