@@ -14115,28 +14115,28 @@ UseGgmlGemm1:;
1411514115 assert(params->wsize >= ne13*nbw3);
1411614116 GGML_ASSERT(src1->type == GGML_TYPE_F32);
1411714117
14118- #ifdef GGML_USE_IQK_MULMAT
14119- int ts = type_traits[vec_dot_type].type_size;
14120- int bs = type_traits[vec_dot_type].blck_size;
14121- int64_t blocks_per_row = ne10/bs;
14122- int64_t num_blocks = ne11*ne12*ne13*blocks_per_row;
14123- int gcd = simple_gcd(128, ts); // 128 is to cover cache line sizes for common architectures without getting involved
14124- // with trying to get it from ggml
14125- int64_t num_blocks_gcd = (num_blocks + gcd - 1)/gcd;
14126- int64_t block_per_thread = ((num_blocks_gcd + nth - 1)/nth)*gcd;
14127- int64_t first_block = ith*block_per_thread;
14128- int64_t last_block = MIN(num_blocks, first_block + block_per_thread);
14129- while (first_block < last_block) {
14130- int64_t i13 = first_block/(ne11*ne12*blocks_per_row);
14131- int64_t i12 = (first_block - i13*ne11*ne12*blocks_per_row)/(ne11*blocks_per_row);
14132- int64_t i11 = (first_block - (i13*ne12 + i12)*ne11*blocks_per_row)/blocks_per_row;
14133- int64_t i10 = first_block % blocks_per_row;
14134- int64_t blocks_to_do = MIN(blocks_per_row - i10, last_block - first_block);
14135- from_float((float *)((char *)src1->data + i13*nb13 + i12*nb12 + i11*nb11) + i10*bs,
14136- (void *)(wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + i10*ts), blocks_to_do*bs);
14137- first_block += blocks_to_do;
14138- }
14139- #else
14118+ // #ifdef GGML_USE_IQK_MULMAT
14119+ // int ts = type_traits[vec_dot_type].type_size;
14120+ // int bs = type_traits[vec_dot_type].blck_size;
14121+ // int64_t blocks_per_row = ne10/bs;
14122+ // int64_t num_blocks = ne11*ne12*ne13*blocks_per_row;
14123+ // int gcd = simple_gcd(128, ts); // 128 is to cover cache line sizes for common architectures without getting involved
14124+ // // with trying to get it from ggml
14125+ // int64_t num_blocks_gcd = (num_blocks + gcd - 1)/gcd;
14126+ // int64_t block_per_thread = ((num_blocks_gcd + nth - 1)/nth)*gcd;
14127+ // int64_t first_block = ith*block_per_thread;
14128+ // int64_t last_block = MIN(num_blocks, first_block + block_per_thread);
14129+ // while (first_block < last_block) {
14130+ // int64_t i13 = first_block/(ne11*ne12*blocks_per_row);
14131+ // int64_t i12 = (first_block - i13*ne11*ne12*blocks_per_row)/(ne11*blocks_per_row);
14132+ // int64_t i11 = (first_block - (i13*ne12 + i12)*ne11*blocks_per_row)/blocks_per_row;
14133+ // int64_t i10 = first_block % blocks_per_row;
14134+ // int64_t blocks_to_do = MIN(blocks_per_row - i10, last_block - first_block);
14135+ // from_float((float *)((char *)src1->data + i13*nb13 + i12*nb12 + i11*nb11) + i10*bs,
14136+ // (void *)(wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + i10*ts), blocks_to_do*bs);
14137+ // first_block += blocks_to_do;
14138+ // }
14139+ // #else
1414014140
1414114141 for (int64_t i13 = 0; i13 < ne13; ++i13) {
1414214142 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -14158,7 +14158,7 @@ UseGgmlGemm1:;
1415814158 }
1415914159 }
1416014160 }
14161- #endif
14161+ // #endif
1416214162
1416314163 ggml_barrier(params->shared);
1416414164
0 commit comments