@@ -811,7 +811,7 @@ static void quantize_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRIC
811811 // i.e. the first four bsums from the first super block, followed by the first four bsums from the second super block, and so on
812812 for (int j = 0 ; j < QK_K * 4 ; j++) {
813813 int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
814- int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
814+ int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
815815 src_offset += (j % blck_size_interleave);
816816 int index = (((j & 31 ) >> 3 ) << 2 ) + ((j >> 8 ) << 4 ) + ((j >> 6 ) & 3 );
817817
@@ -5295,8 +5295,7 @@ template <> void gemv<block_q4_K, 8, 8>(int n, float * s, size_t bs, const void
52955295 ggml_gemv_q4_K_8x8_q8_K (n, s, bs, vx, vy, nr, nc);
52965296}
52975297
5298- template <>
5299- void gemv<block_iq4_nl, 4 , 4 >(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
5298+ template <> void gemv<block_iq4_nl, 4 , 4 >(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
53005299 ggml_gemv_iq4_nl_4x4_q8_0 (n, s, bs, vx, vy, nr, nc);
53015300}
53025301
@@ -5320,8 +5319,7 @@ template <> void gemm<block_q4_K, 8, 8>(int n, float * s, size_t bs, const void
53205319 ggml_gemm_q4_K_8x8_q8_K (n, s, bs, vx, vy, nr, nc);
53215320}
53225321
5323- template <>
5324- void gemm<block_iq4_nl, 4 , 4 >(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
5322+ template <> void gemm<block_iq4_nl, 4 , 4 >(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
53255323 ggml_gemm_iq4_nl_4x4_q8_0 (n, s, bs, vx, vy, nr, nc);
53265324}
53275325
@@ -5335,17 +5333,17 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
53355333 bool work_size (int /* n_threads */ , const struct ggml_tensor * op, size_t & size) override {
53365334 // not really a GGML_TYPE_Q8_0 but same size.
53375335 switch (op->op ) {
5338- case GGML_OP_MUL_MAT:
5339- size = ggml_row_size (PARAM_TYPE, ggml_nelements (op->src [1 ]));
5340- return true ;
5341- case GGML_OP_MUL_MAT_ID:
5342- size = ggml_row_size (PARAM_TYPE, ggml_nelements (op->src [1 ]));
5343- size = GGML_PAD (size, sizeof (int64_t )); // + padding for next bloc.
5344- size += sizeof (int64_t ) * (1 +op->src [0 ]->ne [2 ]) * op->src [1 ]->ne [2 ];
5345- return true ;
5346- default :
5347- // GGML_ABORT("fatal error");
5348- break ;
5336+ case GGML_OP_MUL_MAT:
5337+ size = ggml_row_size (PARAM_TYPE, ggml_nelements (op->src [1 ]));
5338+ return true ;
5339+ case GGML_OP_MUL_MAT_ID:
5340+ size = ggml_row_size (PARAM_TYPE, ggml_nelements (op->src [1 ]));
5341+ size = GGML_PAD (size, sizeof (int64_t )); // + padding for next bloc.
5342+ size += sizeof (int64_t ) * (1 +op->src [0 ]->ne [2 ]) * op->src [1 ]->ne [2 ];
5343+ return true ;
5344+ default :
5345+ // GGML_ABORT("fatal error");
5346+ break ;
53495347 }
53505348 return false ;
53515349 }
@@ -5399,12 +5397,13 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
53995397 const ggml_from_float_t from_float = ggml_get_type_traits_cpu (PARAM_TYPE)->from_float ;
54005398
54015399 int64_t i11_processed = 0 ;
5402- if (PARAM_TYPE == GGML_TYPE_Q8_K) {
5400+ if (PARAM_TYPE == GGML_TYPE_Q8_K) {
54035401 for (int64_t i11 = ith * 4 ; i11 < ne11 - ne11 % 4 ; i11 += nth * 4 ) {
54045402 quantize_mat_q8_K ((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4 , ne10,
54055403 INTER_SIZE);
54065404 }
54075405 } else {
5406+ GGML_ASSERT (PARAM_TYPE == GGML_TYPE_Q8_0);
54085407 for (int64_t i11 = ith * 4 ; i11 < ne11 - ne11 % 4 ; i11 += nth * 4 ) {
54095408 quantize_mat_q8_0 ((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4 , ne10,
54105409 INTER_SIZE);
@@ -5422,7 +5421,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
54225421 int64_t src0_start = (ith * ne01) / nth;
54235422 int64_t src0_end = ((ith + 1 ) * ne01) / nth;
54245423 src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
5425- src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
5424+ src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
54265425 if (src0_start >= src0_end) {
54275426 return ;
54285427 }
@@ -5452,7 +5451,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
54525451 const int ith = params->ith ;
54535452 const int nth = params->nth ;
54545453
5455- const ggml_from_float_t from_float = ggml_get_type_traits_cpu (GGML_TYPE_Q8_0 )->from_float ;
5454+ const ggml_from_float_t from_float = ggml_get_type_traits_cpu (PARAM_TYPE )->from_float ;
54565455
54575456 // we don't support permuted src0 or src1
54585457 GGML_ASSERT (nb00 == ggml_type_size (src0->type ));
@@ -5474,7 +5473,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
54745473 const int n_ids = ids->ne [0 ]; // n_expert_used
54755474 const int n_as = ne02; // n_expert
54765475
5477- const size_t nbw1 = ggml_row_size (GGML_TYPE_Q8_0 , ne10);
5476+ const size_t nbw1 = ggml_row_size (PARAM_TYPE , ne10);
54785477 const size_t nbw2 = nbw1*ne11;
54795478 const size_t nbw3 = nbw2*ne12;
54805479
@@ -5486,12 +5485,13 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
54865485 GGML_ASSERT (params->wsize >= (GGML_PAD (nbw3, sizeof (int64_t )) + n_as * sizeof (int64_t ) +
54875486 n_as * ne12 * sizeof (mmid_row_mapping)));
54885487
5489- auto wdata = (char *) params->wdata ;
5490- auto wdata_src1_end = (char *) wdata + GGML_PAD (nbw3, sizeof (int64_t ));
5491- int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
5488+ auto wdata = (char *) params->wdata ;
5489+ auto wdata_src1_end = (char *) wdata + GGML_PAD (nbw3, sizeof (int64_t ));
5490+ int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
5491+
54925492 struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
54935493
5494- // src1: float32 => block_q8_0
5494+ // src1: float32 => param type
54955495 for (int64_t i12 = 0 ; i12 < ne12; ++i12) {
54965496 for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
54975497 from_float ((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
@@ -5537,21 +5537,22 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
55375537
55385538 int64_t src0_cur_start = (ith * ne01) / nth;
55395539 int64_t src0_cur_end = ((ith + 1 ) * ne01) / nth;
5540- src0_cur_start =
5541- (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
5542- src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
5540+
5541+ src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
5542+ src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
55435543
55445544 if (src0_cur_start >= src0_cur_end) return ;
55455545
55465546 for (int ir1 = 0 ; ir1 < nr1; ir1++) {
55475547 struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW (cur_a, ir1);
5548- const int id = row_mapping.i1 ; // selected expert index
55495548
5550- const int64_t i11 = id % ne11;
5551- const int64_t i12 = row_mapping.i2 ; // row index in src1
5549+ const int id = row_mapping.i1 ; // selected expert index
5550+
5551+ const int64_t i11 = id % ne11;
5552+ const int64_t i12 = row_mapping.i2 ; // row index in src1
55525553
5553- const int64_t i1 = id; // selected expert index
5554- const int64_t i2 = i12; // row
5554+ const int64_t i1 = id; // selected expert index
5555+ const int64_t i2 = i12; // row
55555556
55565557 auto src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
55575558
@@ -5578,7 +5579,7 @@ static const tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
55785579static const tensor_traits<block_q4_K, 8 , 8 , GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
55795580
55805581// instance for IQ4
5581- static const tensor_traits<block_iq4_nl, 4 , 4 , GGML_TYPE_IQ4_NL > iq4_nl_4x4_q8_0;
5582+ static const tensor_traits<block_iq4_nl, 4 , 4 , GGML_TYPE_Q8_0 > iq4_nl_4x4_q8_0;
55825583
55835584} // namespace ggml::cpu::aarch64
55845585
0 commit comments