@@ -446,10 +446,10 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
         }
         for (int l = 0; l < nb; l++) {
             for (int k = 0; k < (qk / (4 * blocklen)); k++) {
-                uint8_t *scales_0 = (uint8_t *) b_ptr[l].scales + (k / 4) * 64;
-                uint8_t *scales_1 = (uint8_t *) b_ptr[l].scales + (k / 4) * 64 + 16;
-                uint8_t *scales_2 = (uint8_t *) b_ptr[l].scales + (k / 4) * 64 + 32;
-                uint8_t *scales_3 = (uint8_t *) b_ptr[l].scales + (k / 4) * 64 + 48;
+                const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64;
+                const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
+                const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
+                const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                 for (int j = 0; j < ncols_interleaved; j++) {
                     sumi1 = 0;
                     sumi2 = 0;
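This hunk drops the redundant `(uint8_t *)` casts and marks the pointers `const`: the `scales` field is already a plain `uint8_t` array (128 bytes, per the comments further down in this diff), so the pointer arithmetic works on it directly, and `const` documents that the kernel only reads the packed scales. Below is a minimal standalone sketch of the addressing, using a hypothetical stand-in struct rather than the real `block_q2_Kx8` definition (which also carries `d`, `dmin`, and the packed 2-bit quants):

```c
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for block_q2_Kx8: only the scales field this
 * diff touches is modeled here. */
typedef struct {
    uint8_t scales[128]; /* 8 groups of 16 interleaved scale/min bytes */
} q2_kx8_demo;

int main(void) {
    q2_kx8_demo b = {{0}};
    /* No cast needed: .scales already decays to uint8_t *.  Each chunk k
     * selects a 64-byte group of four 16-byte scale rows via (k / 4) * 64. */
    for (int k = 0; k < 8; k++) {
        const uint8_t *scales_0 = b.scales + (k / 4) * 64;
        const uint8_t *scales_1 = b.scales + (k / 4) * 64 + 16;
        const uint8_t *scales_2 = b.scales + (k / 4) * 64 + 32;
        const uint8_t *scales_3 = b.scales + (k / 4) * 64 + 48;
        printf("k=%d -> byte offsets %td %td %td %td\n", k,
               scales_0 - b.scales, scales_1 - b.scales,
               scales_2 - b.scales, scales_3 - b.scales);
    }
    return 0;
}
```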
@@ -471,13 +471,13 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
                         sumi2 = sumi2 * (scales_1[offset] & 0xF);
                         sumi3 = sumi3 * (scales_2[offset] & 0xF);
                         sumi4 = sumi4 * (scales_3[offset] & 0xF);
-                        sumi += sumi1 + sumi2 + sumi3 + sumi4;
+                        sumi += sumi1 + sumi2 + sumi3 + sumi4;
                     }
-                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                 }
             }
             for (int sb = 0; sb < 8; sb++) {
-                uint8_t *mins = (uint8_t *) b_ptr[l].scales + sb * 16;
+                const uint8_t *mins = b_ptr[l].scales + sb * 16;
                 for (int j = 0; j < ncols_interleaved; j++){
                     sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2) + 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                 }
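The `sum_minf` loop above folds the Q2_K per-sub-block mins into the result through Q8_K's precomputed block sums: since each dequantized weight is roughly `d * scale * q - dmin * min`, the min term of the dot product depends only on the sum of the int8 activations, which `bsums` caches per sub-block. Below is a minimal standalone sketch of that identity with toy values of my own choosing, not the ggml API:

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch: for a 16-element sub-block with weights w[i] = d*sc*q[i] - dmin*mn,
 * the dot product with activations a[i] splits into
 *   d*sc * dot(q, a)  -  dmin*mn * sum(a)
 * and sum(a) is exactly what Q8_K precomputes in bsums. */
int main(void) {
    const float d = 0.5f, dmin = 0.25f;
    const int sc = 7, mn = 3;   /* low/high nibble of one packed scale byte */
    int8_t  a[16];
    uint8_t q[16];
    for (int i = 0; i < 16; i++) {
        a[i] = (int8_t)(i - 8);
        q[i] = (uint8_t)(i & 3); /* 2-bit weight codes */
    }

    float direct = 0.0f;
    int dot_qa = 0, bsum = 0;
    for (int i = 0; i < 16; i++) {
        direct += (d * sc * q[i] - dmin * mn) * a[i];
        dot_qa += q[i] * a[i];
        bsum   += a[i];          /* the cached block sum */
    }
    float factored = d * sc * dot_qa - dmin * mn * bsum;
    printf("direct=%f factored=%f\n", direct, factored);
    return 0;
}
```

Both prints agree up to float rounding, which is why the kernel can accumulate `sumf` and `sum_minf` in separate passes and combine them at the end.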
@@ -826,10 +826,10 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
         for (int l = 0; l < nb; l++) {
             for (int k = 0; k < (qk / (4 * blocklen)); k++) {

-                uint8_t *scales_0 = (uint8_t *) b_ptr[l].scales + (k / 4) * 64;
-                uint8_t *scales_1 = (uint8_t *) b_ptr[l].scales + (k / 4) * 64 + 16;
-                uint8_t *scales_2 = (uint8_t *) b_ptr[l].scales + (k / 4) * 64 + 32;
-                uint8_t *scales_3 = (uint8_t *) b_ptr[l].scales + (k / 4) * 64 + 48;
+                const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64;
+                const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
+                const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
+                const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                 for (int m = 0; m < 4; m++) {
                     for (int j = 0; j < ncols_interleaved; j++) {
                         sumi1 = 0;
@@ -858,7 +858,7 @@ void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
                 }
             }
             for (int sb = 0; sb < 8; sb++) {
-                uint8_t *mins = (uint8_t *) b_ptr[l].scales + sb * 16;
+                const uint8_t *mins = b_ptr[l].scales + sb * 16;
                 for (int m = 0; m < 4; m++) {
                     const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                     for (int j = 0; j < ncols_interleaved; j++) {
@@ -1112,7 +1112,6 @@ static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_in
     // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
     // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
     // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
-    uint8_t s[128], m[128];

     for (int i = 0; i < 128; i++){

@@ -1121,9 +1120,6 @@ static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_in
         // Index for selecting scale
         int src2 = ((i / 16) * 2) + (i % 2);

-        s[i] = in[src1].scales[src2] & 15;
-        m[i] = in[src1].scales[src2] & 240;
-
        out.scales[i] = in[src1].scales[src2];
     }
     return out;
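The removed `s[128]`/`m[128]` arrays were dead stores: each packed scale byte keeps the 4-bit scale in its low nibble (`& 15`) and the 4-bit min in its high nibble (`& 240`), and the GEMV/GEMM kernels earlier in this diff already unpack those nibbles in place with `& 0xF` and `>> 4`, so nothing ever read the local copies. A minimal sketch of that byte layout, with an example value of my own choosing:

```c
#include <stdint.h>
#include <stdio.h>

/* One packed Q2_K scale byte: low nibble = 4-bit scale, high nibble = 4-bit
 * min.  The repacked kernels decode it in place instead of staging copies. */
int main(void) {
    uint8_t packed = 0xB5;        /* example byte: min = 0xB, scale = 0x5 */
    uint8_t scale  = packed & 0xF; /* what the dot-product loops apply      */
    uint8_t min    = packed >> 4;  /* what the sum_minf loops apply         */
    printf("scale=%u min=%u\n", scale, min);
    return 0;
}
```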