@@ -1337,28 +1337,28 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
13371337 // Index : 32 - 39, 96 - 103
13381338 const __m256i rhs_vec_0123_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_4, m4b), rhs_hbit_0123_20);
13391339 const __m256i rhs_vec_0123_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_4, 4), m4b), rhs_hbit_0123_60);
1340-
1340+
13411341 const __m256i rhs_vec_4567_20 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_4, m4b), rhs_hbit_4567_20);
13421342 const __m256i rhs_vec_4567_60 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_4, 4), m4b), rhs_hbit_4567_60);
13431343
13441344 // Index : 40 - 47, 104 - 111
13451345 const __m256i rhs_vec_0123_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_5, m4b), rhs_hbit_0123_21);
13461346 const __m256i rhs_vec_0123_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_5, 4), m4b), rhs_hbit_0123_61);
1347-
1347+
13481348 const __m256i rhs_vec_4567_21 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_5, m4b), rhs_hbit_4567_21);
13491349 const __m256i rhs_vec_4567_61 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_5, 4), m4b), rhs_hbit_4567_61);
13501350
13511351 // Index : 48 - 55, 112 - 119
13521352 const __m256i rhs_vec_0123_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_6, m4b), rhs_hbit_0123_30);
13531353 const __m256i rhs_vec_0123_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_6, 4), m4b), rhs_hbit_0123_70);
1354-
1354+
13551355 const __m256i rhs_vec_4567_30 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_6, m4b), rhs_hbit_4567_30);
13561356 const __m256i rhs_vec_4567_70 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_6, 4), m4b), rhs_hbit_4567_70);
13571357
13581358 // Index : 56 - 63, 120 - 127
13591359 const __m256i rhs_vec_0123_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_0123_7, m4b), rhs_hbit_0123_31);
13601360 const __m256i rhs_vec_0123_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_0123_7, 4), m4b), rhs_hbit_0123_71);
1361-
1361+
13621362 const __m256i rhs_vec_4567_31 = _mm256_or_si256(_mm256_and_si256(rhs_raw_lbit_4567_7, m4b), rhs_hbit_4567_31);
13631363 const __m256i rhs_vec_4567_71 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(rhs_raw_lbit_4567_7, 4), m4b), rhs_hbit_4567_71);
13641364
@@ -1441,7 +1441,7 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
14411441 __m256i iacc_7 = _mm256_setzero_si256();
14421442
14431443 // Dot product done within 32 bit lanes and accumulated in the same vector
1444- // First done for 0th sub block and then for seven (1st - 7th) other sub blocks processed for each sb (sb < QK_K/128 loop)
1444+ // First done for 0th sub block and then for seven (1st - 7th) other sub blocks processed for each sb (sb < QK_K/128 loop)
14451445 // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
14461446 // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
14471447 // B0(8-11) B4(8-11) B1(8-11) B5(8-11) B2(8-11) B6(8-11) B3(8-11) B7(8-11) with A0(8-11)
@@ -1524,7 +1524,7 @@ void ggml_gemv_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
15241524 // Accumulated output values permuted so as to be stored in appropriate order post accumulation
15251525 acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask);
15261526 _mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
1527-
1527+
15281528 }
15291529 }
15301530#else
@@ -7636,7 +7636,7 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
76367636 _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
76377637
76387638 }
7639- }
7639+ }
76407640 }
76417641
76427642 for (; y < nr / 4; y ++){
@@ -8589,15 +8589,15 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
85898589 for (int i = 0; i < 4; i++) {
85908590 _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
85918591 }
8592- }
8592+ }
85938593 }
85948594
85958595 if (anc != nc) {
85968596 xstart = anc/8;
85978597 y = 0;
85988598 }
85998599
8600- #endif
8600+ #endif
86018601
86028602 // Take a group of four block_q8_Kx4 structures at each pass of the loop and perform the dot product operation
86038603 for (; y < anr / 4; y += 4){
@@ -9412,8 +9412,8 @@ void ggml_gemm_q6_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
94129412 // Store the accumulated values
94139413 for (int i = 0; i < 16; i++) {
94149414 _mm256_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
9415- }
9416- }
9415+ }
9416+ }
94179417 }
94189418
94199419 for (; y < nr / 4; y ++) {
0 commit comments