@@ -190,8 +190,8 @@ static __global__ void mul_mat_vec_q(
 
     const uint32_t channel_bias = ids ? channel_x : channel_dst;
 
-    float x_biases[ncols_dst][rows_per_cuda_block]    = { { 0.0f } };
-    float gate_biases[ncols_dst][rows_per_cuda_block] = { { 0.0f } };
+    float x_biases[ncols_dst]    = { 0.0f };
+    float gate_biases[ncols_dst] = { 0.0f };
     if constexpr (has_fusion) {
         if (use_bias) {
             x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
@@ -200,7 +200,7 @@ static __global__ void mul_mat_vec_q(
             if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
                 (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
                 for (int j = 0; j < ncols_dst; ++j) {
-                    x_biases[j][threadIdx.x] = x_bias[j*stride_col_dst + threadIdx.x];
+                    x_biases[j] = x_bias[j*stride_col_dst + threadIdx.x];
                 }
             }
         }
@@ -209,7 +209,7 @@ static __global__ void mul_mat_vec_q(
             if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
                 (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
                 for (int j = 0; j < ncols_dst; ++j) {
-                    gate_biases[j][threadIdx.x] = gate_bias[j*stride_col_dst + threadIdx.x];
+                    gate_biases[j] = gate_bias[j*stride_col_dst + threadIdx.x];
                 }
             }
         }
@@ -299,12 +299,12 @@ static __global__ void mul_mat_vec_q(
             float result = tmp[j][threadIdx.x];
             if constexpr (has_fusion) {
                 if (use_bias) {
-                    result += x_biases[j][threadIdx.x];
+                    result += x_biases[j];
                 }
                 if (use_gate) {
                     float gate_value = tmp_gate[j][threadIdx.x];
                     if (use_gate_bias) {
-                        gate_value += gate_biases[j][threadIdx.x];
+                        gate_value += gate_biases[j];
                     }
                     switch (active_glu) {
                         case GGML_GLU_OP_SWIGLU:
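
The diff drops the per-row dimension from the fused-bias arrays: each thread only ever reads the slot for its own row (`threadIdx.x`), so a `float[ncols_dst]` held in registers per thread is sufficient, and the old `float[ncols_dst][rows_per_cuda_block]` layout only wasted registers. Below is a minimal, self-contained sketch of the same pattern, assuming a toy column-major layout and made-up names (`add_bias_rows`, `rows_per_block`); it is illustrative only and not taken from the file changed in this diff.

```cuda
// Illustrative sketch only (names, sizes, and layout are assumptions, not from this diff).
// Each thread owns one row, so it keeps one bias value per output column in a
// small register array -- a 2D [ncols][rows_per_block] array would be redundant,
// since the thread only ever indexes the threadIdx.x slot of the second axis.
#include <cstdio>
#include <cuda_runtime.h>

template <int ncols, int rows_per_block>
__global__ void add_bias_rows(const float * x, const float * bias, float * dst, const int nrows) {
    const int row = blockIdx.x*rows_per_block + threadIdx.x;
    if (row >= nrows) {
        return;
    }

    // One bias value per column, for this thread's row only.
    float b[ncols] = { 0.0f };
    for (int j = 0; j < ncols; ++j) {
        b[j] = bias[j*nrows + row]; // column-major: column j, row `row`
    }

    for (int j = 0; j < ncols; ++j) {
        dst[j*nrows + row] = x[j*nrows + row] + b[j];
    }
}

int main() {
    constexpr int    ncols          = 2;
    constexpr int    rows_per_block = 32;
    constexpr int    nrows          = 48;
    constexpr size_t n              = (size_t) ncols*nrows;

    float * x    = nullptr;
    float * bias = nullptr;
    float * dst  = nullptr;
    cudaMallocManaged(&x,    n*sizeof(float));
    cudaMallocManaged(&bias, n*sizeof(float));
    cudaMallocManaged(&dst,  n*sizeof(float));

    for (size_t i = 0; i < n; ++i) {
        x[i]    = (float) i;
        bias[i] = 0.5f;
    }

    const int nblocks = (nrows + rows_per_block - 1) / rows_per_block;
    add_bias_rows<ncols, rows_per_block><<<nblocks, rows_per_block>>>(x, bias, dst, nrows);
    cudaDeviceSynchronize();

    printf("dst[0] = %.1f (expected %.1f)\n", dst[0], x[0] + bias[0]);

    cudaFree(x);
    cudaFree(bias);
    cudaFree(dst);
    return 0;
}
```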