diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 314a38fb2..bd340fadd 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -111,7 +111,6 @@ option(GGML_BLAS "ggml: use BLAS" set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING "ggml: BLAS library vendor") option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF) -option(GGML_IQK_MUL_MAT "ggml: use optimized iqk matrix multiplications" ON) option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_MUSA "ggml: use MUSA" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index b0db417de..ff29fab59 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -255,44 +255,41 @@ endif() set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp) set (GGML_HEADERS_IQK iqk/iqk_config.h) -if (GGML_IQK_MUL_MAT) - message(STATUS "Using optimized iqk matrix multiplications") - add_compile_definitions(GGML_USE_IQK_MULMAT) - set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp - iqk/iqk_flash_attn.cpp - iqk/fa/iqk_fa_576_512.cpp - iqk/fa/iqk_fa_192_128.cpp - iqk/fa/iqk_fa_256_256.cpp - iqk/fa/iqk_fa_128_128.cpp - iqk/fa/iqk_fa_96_96.cpp - iqk/fa/iqk_fa_64_64.cpp - iqk/iqk_gemm_floats.cpp - iqk/iqk_gemm_kquants.cpp - iqk/iqk_gemm_ktquants.cpp - iqk/iqk_gemm_iquants.cpp - iqk/iqk_gemm_iqk_quants.cpp - iqk/iqk_gemm_1bit.cpp - iqk/iqk_gemm_legacy_quants.cpp) - set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h - iqk/iqk_flash_impl.h - iqk/fa/iqk_fa_templates.h - iqk/iqk_gemm_floats.h - iqk/iqk_gemm_kquants.h - iqk/iqk_gemm_ktquants.h - iqk/iqk_gemm_iquants.h - iqk/iqk_gemm_iqk_quants.h - iqk/iqk_gemm_1bit.h - iqk/iqk_gemm_legacy_quants.h) - if (GGML_IQK_FLASH_ATTENTION) - message(STATUS "Enabling IQK Flash Attention kernels") - add_compile_definitions(GGML_IQK_FLASH_ATTENTION) - if (GGML_IQK_FA_ALL_QUANTS) - message(STATUS "Including all IQK FA kernels") - add_compile_definitions(GGML_IQK_FA_ALL_QUANTS) - endif() - else() - message(STATUS "Disabling IQK Flash Attention kernels") +message(STATUS "Using optimized iqk matrix multiplications") +set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp + iqk/iqk_flash_attn.cpp + iqk/fa/iqk_fa_576_512.cpp + iqk/fa/iqk_fa_192_128.cpp + iqk/fa/iqk_fa_256_256.cpp + iqk/fa/iqk_fa_128_128.cpp + iqk/fa/iqk_fa_96_96.cpp + iqk/fa/iqk_fa_64_64.cpp + iqk/iqk_gemm_floats.cpp + iqk/iqk_gemm_kquants.cpp + iqk/iqk_gemm_ktquants.cpp + iqk/iqk_gemm_iquants.cpp + iqk/iqk_gemm_iqk_quants.cpp + iqk/iqk_gemm_1bit.cpp + iqk/iqk_gemm_legacy_quants.cpp) +set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h + iqk/iqk_flash_impl.h + iqk/fa/iqk_fa_templates.h + iqk/iqk_gemm_floats.h + iqk/iqk_gemm_kquants.h + iqk/iqk_gemm_ktquants.h + iqk/iqk_gemm_iquants.h + iqk/iqk_gemm_iqk_quants.h + iqk/iqk_gemm_1bit.h + iqk/iqk_gemm_legacy_quants.h) +if (GGML_IQK_FLASH_ATTENTION) + message(STATUS "Enabling IQK Flash Attention kernels") + add_compile_definitions(GGML_IQK_FLASH_ATTENTION) + if (GGML_IQK_FA_ALL_QUANTS) + message(STATUS "Including all IQK FA kernels") + add_compile_definitions(GGML_IQK_FA_ALL_QUANTS) endif() +else() + message(STATUS "Disabling IQK Flash Attention kernels") endif() if (GGML_LLAMAFILE) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 220c0c997..05b764199 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -10,11 +10,9 @@ #include "ggml-quants.h" #include "ggml-impl.h" -#if GGML_USE_IQK_MULMAT #include "iqk/iqk_config.h" #include "iqk/iqk_mul_mat.h" #include "iqk/iqk_quantize.h" -#endif #include @@ -3933,11 +3931,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int6 } void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) { -#ifdef GGML_USE_IQK_MULMAT iqk_quantize_row_q8_K(x, y, k); -#else - quantize_row_q8_K_ref(x, y, k); -#endif } //===================================== Dot ptoducts ================================= @@ -4023,11 +4017,9 @@ static inline __m128i get_scale_shuffle(int i) { #endif void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q4_0, vx, bx, GGML_TYPE_Q8_0, vy, by, s, bs, 0, 1)) { return; } -#endif const int qk = QK8_0; const int nb = n / qk; @@ -4510,11 +4502,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r } void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q4_1, vx, bx, GGML_TYPE_Q8_1, vy, by, s, bs, 0, 1)) { return; } -#endif const int qk = QK8_1; const int nb = n / qk; @@ -4802,7 +4792,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r } void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT #ifdef __AVX2__ const enum ggml_type vec_dot_type = GGML_TYPE_Q8_1; #else @@ -4811,7 +4800,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q5_0, vx, bx, vec_dot_type, vy, by, s, bs, 0, 1)) { return; } -#endif const int qk = QK8_0; const int nb = n / qk; @@ -5167,11 +5155,9 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r } void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q5_1, vx, bx, GGML_TYPE_Q8_1, vy, by, s, bs, 0, 1)) { return; } -#endif const int qk = QK8_1; const int nb = n / qk; @@ -5546,7 +5532,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r } void ggml_vec_dot_q6_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT #ifdef __AVX2__ const enum ggml_type vec_dot_type = GGML_TYPE_Q8_1; #else @@ -5555,13 +5540,11 @@ void ggml_vec_dot_q6_0_q8_0(int n, float * restrict s, size_t bs, const void * r if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q6_0, vx, bx, vec_dot_type, vy, by, s, bs, 0, 1)) { return; } -#endif // TODO *s = 0; } void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT #ifdef HAVE_FANCY_SIMD enum ggml_type dot_type = GGML_TYPE_Q8_1_X4; #else @@ -5570,7 +5553,6 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q8_0, vx, bx, dot_type, vy, by, s, bs, 0, 1)) { return; } -#endif const int qk = QK8_0; const int nb = n / qk; @@ -11940,11 +11922,9 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void } void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_IQ4_NL, vx, bx, GGML_TYPE_Q8_0, vy, by, s, bs, 0, 1)) { return; } -#endif assert(nrc == 1); UNUSED(nrc); UNUSED(bx); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d8025a5a4..6b2136205 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -12,10 +12,8 @@ #include "ggml.h" #include "ggml-aarch64.h" #include "iqk/iqk_quantize.h" -#if GGML_USE_IQK_MULMAT #include "iqk/iqk_mul_mat.h" #include "iqk/iqk_config.h" -#endif #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -715,15 +713,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q4_0, .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref, .vec_dot = ggml_vec_dot_q4_0_q8_0, -#if GGML_USE_IQK_MULMAT #if defined __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, -#endif #if defined (__ARM_FEATURE_MATMUL_INT8) .nrows = 2, #else @@ -740,15 +734,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q4_1, .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref, .vec_dot = ggml_vec_dot_q4_1_q8_1, -#if GGML_USE_IQK_MULMAT #if defined __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_1_X4, #endif -#else - .vec_dot_type = GGML_TYPE_Q8_1, -#endif #if defined (__ARM_FEATURE_MATMUL_INT8) .nrows = 2, #else @@ -791,14 +781,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q5_0, .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref, .vec_dot = ggml_vec_dot_q5_0_q8_0, -#if GGML_USE_IQK_MULMAT #if defined __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, -#endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, #endif .nrows = 1, .row_meta_size = 0, @@ -812,14 +798,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q5_1, .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref, .vec_dot = ggml_vec_dot_q5_1_q8_1, -#if GGML_USE_IQK_MULMAT #ifdef __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_1_X4, -#endif -#else - .vec_dot_type = GGML_TYPE_Q8_1, #endif .nrows = 1, .row_meta_size = 0, @@ -833,14 +815,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q6_0, .from_float_ref = (ggml_from_float_t) quantize_row_q6_0_ref, .vec_dot = ggml_vec_dot_q6_0_q8_0, -#if GGML_USE_IQK_MULMAT #if defined __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, -#endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, #endif .nrows = 1, .row_meta_size = 0, @@ -855,7 +833,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref, .from_float_to_mat = quantize_mat_q8_0, .vec_dot = ggml_vec_dot_q8_0_q8_0, -#if GGML_USE_IQK_MULMAT #ifdef HAVE_FANCY_SIMD // Remember: we cannot add 128 to the Q8 quants and use iblock sum in Q8_1 to subtract as we do on Zen4 for pure AVX2 // because there the result of the _mm256_maddubs_epi16() instruction may overflow the int16_t range @@ -864,9 +841,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { #else .vec_dot_type = GGML_TYPE_Q8_0_X4, #endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, -#endif #if defined (__ARM_FEATURE_MATMUL_INT8) .nrows = 2, #else @@ -1288,14 +1262,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_iq4_nl, .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref, .vec_dot = ggml_vec_dot_iq4_nl_q8_0, -#if GGML_USE_IQK_MULMAT #if defined HAVE_FANCY_SIMD .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, -#endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, #endif .nrows = 1, .row_meta_size = 0, @@ -1713,14 +1683,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_iq4_nl_r4, .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_r4_ref, .vec_dot = vec_dot_iq4_nl_r4_q8_0, -#if GGML_USE_IQK_MULMAT #if defined __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, -#endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, #endif .nrows = 1, .row_meta_size = 0, @@ -1747,14 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q4_0_r8, .from_float_ref = (ggml_from_float_t)quantize_row_q4_0_r8_ref, .vec_dot = vec_dot_q4_0_r8_q8_0, -#if GGML_USE_IQK_MULMAT #if defined __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, -#endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, #endif .nrows = 1, .row_meta_size = 0, @@ -1768,14 +1730,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_0_r8, .from_float_ref = (ggml_from_float_t)quantize_row_q8_0_r8_ref, .vec_dot = vec_dot_q8_0_r8_q8_0, -#if GGML_USE_IQK_MULMAT #if defined __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, -#endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, #endif .nrows = 1, .row_meta_size = 0, @@ -1789,14 +1747,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q5_0_r4, .from_float_ref = (ggml_from_float_t)quantize_row_q5_0_r4_ref, .vec_dot = vec_dot_q5_0_r4_q8_0, -#if GGML_USE_IQK_MULMAT #if defined __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, -#endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, #endif .nrows = 1, .row_meta_size = 0, @@ -1810,14 +1764,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q6_0_r4, .from_float_ref = (ggml_from_float_t)quantize_row_q6_0_r4_ref, .vec_dot = vec_dot_q6_0_r4_q8_0, -#if GGML_USE_IQK_MULMAT #if defined __AVX2__ .vec_dot_type = GGML_TYPE_Q8_2_X4, #else .vec_dot_type = GGML_TYPE_Q8_0_X4, -#endif -#else - .vec_dot_type = GGML_TYPE_Q8_0, #endif .nrows = 1, .row_meta_size = 0, @@ -14501,10 +14451,6 @@ static void ggml_compute_forward_mul_mat( ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float; int64_t const vec_dot_num_rows = type_traits[type].nrows; int64_t const matmul_num_cols = type_traits[type].ncols; -#if !GGML_USE_IQK_MULMAT - ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat; - int64_t const blck_size_interleave = type_traits[type].blck_size_interleave; -#endif ggml_gemv_t const gemv = type_traits[type].gemv; ggml_gemm_t const gemm = type_traits[type].gemm; @@ -14532,7 +14478,6 @@ static void ggml_compute_forward_mul_mat( const int64_t r3 = ne13 / ne03; #endif -#if GGML_USE_IQK_MULMAT if (dst->type == GGML_TYPE_F32) { if (iqk_mul_mat_4d(ne01, ne11, ne00, ne02, ne03, ne12, ne13, nb02, nb03, nb12, nb13, nb2/sizeof(float), nb3/sizeof(float), @@ -14540,7 +14485,6 @@ static void ggml_compute_forward_mul_mat( src1->type, src1->data, nb11, (float *)dst->data, nb1/sizeof(float), ith, nth)) return; } -#endif #if GGML_USE_LLAMAFILE @@ -14579,62 +14523,23 @@ UseGgmlGemm1:; assert(params->wsize >= ne13*nbw3); if (src1->type != GGML_TYPE_F32) { -#if GGML_USE_IQK_MULMAT char * work_buffer = wdata + ne13*nbw3 + ith*ne10*sizeof(float); GGML_ASSERT(params->wsize >= ne13*nbw3 + nth*ne10*sizeof(float)); iqk_quantize_any(src1->type, vec_dot_type, ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13, src1->data, wdata, work_buffer, type_traits[src1->type].to_float, from_float, ith, nth); -#else - GGML_ABORT("fatal error"); -#endif } else { - -//#ifdef GGML_USE_IQK_MULMAT -// int ts = type_traits[vec_dot_type].type_size; -// int bs = type_traits[vec_dot_type].blck_size; -// int64_t blocks_per_row = ne10/bs; -// int64_t num_blocks = ne11*ne12*ne13*blocks_per_row; -// int gcd = simple_gcd(128, ts); // 128 is to cover cache line sizes for common architectures without getting involved -// // with trying to get it from ggml -// int64_t num_blocks_gcd = (num_blocks + gcd - 1)/gcd; -// int64_t block_per_thread = ((num_blocks_gcd + nth - 1)/nth)*gcd; -// int64_t first_block = ith*block_per_thread; -// int64_t last_block = MIN(num_blocks, first_block + block_per_thread); -// while (first_block < last_block) { -// int64_t i13 = first_block/(ne11*ne12*blocks_per_row); -// int64_t i12 = (first_block - i13*ne11*ne12*blocks_per_row)/(ne11*blocks_per_row); -// int64_t i11 = (first_block - (i13*ne12 + i12)*ne11*blocks_per_row)/blocks_per_row; -// int64_t i10 = first_block % blocks_per_row; -// int64_t blocks_to_do = MIN(blocks_per_row - i10, last_block - first_block); -// from_float((float *)((char *)src1->data + i13*nb13 + i12*nb12 + i11*nb11) + i10*bs, -// (void *)(wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + i10*ts), blocks_to_do*bs); -// first_block += blocks_to_do; -// } -//#else - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - int64_t i11_processed = 0; -#if !GGML_USE_IQK_MULMAT - if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { - for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { - from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - 4, ne10, blck_size_interleave); + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + int64_t i11_processed = 0; + for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), + ne10); } - i11_processed = ne11 - ne11 % 4; - } -#endif - for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - ne10); } } } -//#endif - } ggml_barrier(params->shared); @@ -14653,7 +14558,6 @@ UseGgmlGemm1:; const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; -#if GGML_USE_IQK_MULMAT if (src1->type != vec_dot_type && dst->type == GGML_TYPE_F32) { const size_t row_size = ggml_row_size(vec_dot_type, ne10); if (iqk_mul_mat_4d(ne01, ne11, ne00, @@ -14663,7 +14567,6 @@ UseGgmlGemm1:; vec_dot_type, wdata, row_size, (float *)dst->data, nb1/sizeof(float), ith, nth)) return; } -#endif #if GGML_USE_LLAMAFILE if (src1->type != vec_dot_type) { @@ -14901,7 +14804,6 @@ static void ggml_compute_forward_mul_mat_id( const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = cne1; // src1 rows // -#if GGML_USE_IQK_MULMAT if (ne13 == 1 && dst->type == GGML_TYPE_F32) { if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, src0->type, (const char *)src0_cur, nb01, ///ggml_type_size(src0->type), @@ -14911,7 +14813,6 @@ static void ggml_compute_forward_mul_mat_id( continue; } IQK_MulMat_Not_Available:; -#endif if (((ggml_n_dims(src0) - 1) == 2) && gemv) { int64_t src0_cur_start = (ith * ne01) / nth; @@ -15041,7 +14942,6 @@ IQK_MulMat_Not_Available:; #undef MMID_MATRIX_ROW } -#if GGML_USE_IQK_MULMAT static void ggml_compute_forward_mul_mat_id_up_gate( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -15176,32 +15076,10 @@ static void ggml_compute_forward_mul_mat_id_up_gate( (float *)dst->data, nb1, nb2, matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error"); -// if (nth%2 == 0) { -// const char * src0_d = ith%2 == 0 ? src0_1_cur : src0_2_cur; -// void * dst_d = ith%2 == 0 ? dst1->data : dst2->data; -// if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, -// type, src0_d, nb01, -// vec_dot_type, (const char *)wdata, row_size, -// (float *)dst_d, nb1, nb2, -// matrix_rows + cur_a*ne12, ith/2, nth/2)) GGML_ABORT("fatal error"); -// -// } else { -// if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, -// src0_1->type, (const char *)src0_1_cur, nb01, -// vec_dot_type, (const char *)wdata, row_size, -// (float *)dst1->data, nb1, nb2, -// matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error"); -// if (!iqk_mul_mat_moe(nr0, nr1, ne00, ne11, -// src0_2->type, (const char *)src0_2_cur, nb01, -// vec_dot_type, (const char *)wdata, row_size, -// (float *)dst2->data, nb1, nb2, -// matrix_rows + cur_a*ne12, ith, nth)) GGML_ABORT("fatal error"); -// } } #undef MMID_MATRIX_ROW } -#endif // ggml_compute_forward_out_prod @@ -18251,7 +18129,6 @@ static void ggml_compute_forward_flash_attn_ext_f16( scale /= softcap; } -#if GGML_USE_IQK_MULMAT if (iqk_flash_attn_noalibi(q->type, mask->type, max_bias, q->ne[3], q->ne[2], q->nb[3], q->nb[2], k->ne[3], k->ne[2], k->nb[3], k->nb[2], @@ -18303,7 +18180,6 @@ static void ggml_compute_forward_flash_attn_ext_f16( //IQK_Flash_Attn_NotAvailable:; // printf("iqk_flash was rejected\n"); // } -#endif const uint32_t n_head = neq2; const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); @@ -21930,7 +21806,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa const int64_t D = MAX(Dk, Dv); cur = 3*sizeof(float)*D*n_tasks; // 3x head size/thread -#if GGML_USE_IQK_MULMAT +#if GGML_IQK_FLASH_ATTENTION size_t qsize = 0; const struct ggml_tensor * q = node->src[0]; const struct ggml_tensor * k = node->src[1]; diff --git a/ggml/src/iqk/iqk_mul_mat.cpp b/ggml/src/iqk/iqk_mul_mat.cpp index 43be08857..b0c7c7a67 100644 --- a/ggml/src/iqk/iqk_mul_mat.cpp +++ b/ggml/src/iqk/iqk_mul_mat.cpp @@ -902,6 +902,7 @@ bool iqk_flash_attn_impl(int int_type_k, // type of k #else // IQK_IMPLEMENT extern "C" IQK_API bool iqk_mul_mat(int, long, long, long, int, const void *, long, int, const void *, long, float *, long, int, int) { + GGML_ABORT("=============================== Unsupported CPU"); return false; } @@ -911,11 +912,13 @@ extern "C" IQK_API bool iqk_mul_mat_4d(long /*Nx*/, long /*Ny*/, long /*ne00*/, int /*typeA*/, const void * /*A*/, long /*strideA*/, int /*typeB*/, const void * /*B*/, long /*strideB*/, float * /*C*/, long /*stride_C*/, int /*ith*/, int /*nth*/) { + GGML_ABORT("=============================== Unsupported CPU"); return false; } extern "C" IQK_API bool iqk_mul_mat_moe(long, long, long, int, int, const void *, long, int, const void *, long, float *, long, long, const void *, int, int) { + GGML_ABORT("=============================== Unsupported CPU"); return false; } @@ -923,7 +926,8 @@ extern "C" IQK_API bool iqk_moe_fused_up_gate(long /*Nx*/, long /*Ny*/, long /*n int /*typeA*/, const void * /*Aup*/, const void * /*Agate*/, long /*strideA*/, int /*typeB*/, const void * /*B*/, long /*strideB*/, float * /*C*/, long /*nb1*/, long /*nb2*/, const void * /*vrow_mapping*/, int /*ith*/, int /*nth*/) { + GGML_ABORT("=============================== Unsupported CPU"); return false; } -#endif \ No newline at end of file +#endif diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index c1f7a8e4e..2d81c9d51 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -4,9 +4,7 @@ // SPDX-License-Identifier: MIT // -#if GGML_USE_IQK_MULMAT #include "iqk_mul_mat.h" -#endif #include "ggml-quants.h" #include "ggml-impl.h" #define GGML_COMMON_IMPL_C @@ -356,11 +354,9 @@ void ggml_vec_dot_iq1_bn_q8_K64(int n, float * s, size_t bs, const void * vx, si static_assert(QK_IQ1BN == 64, "This dot product implementation for iq1_bn requires a block size of 64"); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ1_BN, vx, 0, GGML_TYPE_Q8_K64, vy, 0, s, 0, 0, 1)) { return; } -#endif const block_iq1_bn * x = (const block_iq1_bn *)vx; @@ -407,11 +403,9 @@ void vec_dot_iq2_bn_q8_K64(int n, float * s, size_t bs, const void * vx, size_t static_assert(QK_IQ1BN == 64, "This dot product implementation for iq2_bn requires a block size of 64"); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_BN, vx, 0, GGML_TYPE_Q8_K64, vy, 0, s, 0, 0, 1)) { return; } -#endif constexpr int Nj = QK_IQ1BN/4; @@ -1187,11 +1181,9 @@ void vec_dot_iq2_k_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, GGML_UNUSED(by); GGML_UNUSED(bs); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_K, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ABORT("not implemented"); @@ -1492,11 +1484,9 @@ void vec_dot_iq2_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx GGML_UNUSED(by); GGML_UNUSED(bs); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_KS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif const ggml_half * dptr = (const ggml_half *)vx; const float d = GGML_FP16_TO_FP32(*dptr); @@ -1834,11 +1824,9 @@ void vec_dot_iq3_k_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, GGML_UNUSED(by); GGML_UNUSED(bs); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ3_K, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ABORT("not implemented"); } @@ -1883,11 +1871,9 @@ void vec_dot_iq4_k_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, GGML_UNUSED(by); GGML_UNUSED(bs); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_K, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif const int nb = n / QK_K; @@ -2183,11 +2169,9 @@ void vec_dot_iq5_k_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, GGML_UNUSED(by); GGML_UNUSED(bs); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ5_K, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif const int nb = n / QK_K; @@ -2524,11 +2508,9 @@ void vec_dot_iq6_k_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, GGML_UNUSED(by); GGML_UNUSED(bs); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ6_K, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ABORT("not implemented"); @@ -3379,11 +3361,9 @@ void dequantize_row_iq4_ks(const block_iq4_ks * x, float * y, int64_t k) { void vec_dot_iq4_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { constexpr int kBlockSize = 32; -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_KS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK_K == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -3624,11 +3604,9 @@ void dequantize_row_iq5_ks(const block_iq5_ks * x, float * y, int64_t k) { void vec_dot_iq5_ks_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { constexpr int kBlockSize = 32; -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ5_KS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK_K == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -4079,11 +4057,9 @@ void dequantize_row_iq4_kss(const block_iq4_kss * x, float * y, int64_t k) { } void vec_dot_iq4_kss_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_KSS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK_K == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -4163,11 +4139,9 @@ void dequantize_row_iq4_nl_r4(const block_iq4_nl_r4 * x, float * y, int64_t k) { } void vec_dot_iq4_nl_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_NL_R4, vx, 0, GGML_TYPE_Q8_0, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -4268,11 +4242,9 @@ void dequantize_row_q4_0_r8(const block_iq4_nl_r8 * x, float * y, int64_t k) { } void vec_dot_q4_0_r8_q8_0(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q4_0_R8, vx, 0, GGML_TYPE_Q8_0, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -4368,11 +4340,9 @@ void dequantize_row_q8_0_r8(const block_q8_0_r8 * x, float * y, int64_t k) { } void vec_dot_q8_0_r8_q8_0(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q8_0_R8, vx, 0, GGML_TYPE_Q8_0, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -4469,11 +4439,9 @@ void dequantize_row_q5_0_r4(const block_q5_0_r4 * x, float * y, int64_t k) { } void vec_dot_q5_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q5_0_R4, vx, 0, GGML_TYPE_Q8_0, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -4566,11 +4534,9 @@ void dequantize_row_q6_0_r4(const block_q6_0_r4 * x, float * y, int64_t k) { } void vec_dot_q6_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q6_0_R4, vx, 0, GGML_TYPE_Q8_0, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -4658,11 +4624,9 @@ void dequantize_row_iq4_xs_r8(const block_iq4_xs_r8 * x, float * y, int64_t k) { } void vec_dot_iq4_xs_r8_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_XS_R8, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -4759,11 +4723,9 @@ void dequantize_row_iq4_ks_r4(const block_iq4_ks_r4 * x, float * y, int64_t k) { } void vec_dot_iq4_ks_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_KS_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -4881,11 +4843,9 @@ void dequantize_row_iq2_bn_r4(const block_iq2_bn * x, float * y, int64_t k) { } void vec_dot_iq2_bn_r4_q8_K64(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_BN_R4, vx, 0, GGML_TYPE_Q8_K64, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -5002,11 +4962,9 @@ void dequantize_row_q4_k_r4(const block_q4_k_r4 * x, float * y, int64_t k) { } void vec_dot_q4_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q4_K_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -5120,11 +5078,9 @@ void dequantize_row_q6_k_r4(const block_q6_k_r4 * x, float * y, int64_t k) { } void vec_dot_q6_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q6_K_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -5240,11 +5196,9 @@ void dequantize_row_q5_k_r4(const block_q5_k_r4 * x, float * y, int64_t k) { } void vec_dot_q5_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q5_K_R4, vx, 0, GGML_TYPE_Q8_K32, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -5376,11 +5330,9 @@ void dequantize_row_q3_k_r4(const block_q3_k_r4 * x, float * y, int64_t k) { } void vec_dot_q3_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q3_K_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -5491,11 +5443,9 @@ void dequantize_row_q2_k_r4(const block_q2_k_r4 * x, float * y, int64_t k) { } void vec_dot_q2_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q2_K_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -5605,11 +5555,9 @@ void dequantize_row_iq4_k_r4(const block_iq4_k_r4 * x, float * y, int64_t k) { } void vec_dot_iq4_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_K_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -5742,11 +5690,9 @@ void dequantize_row_iq5_k_r4(const block_iq5_k_r4 * x, float * y, int64_t k) { } void vec_dot_iq5_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ5_K_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -5862,11 +5808,9 @@ void dequantize_row_iq5_ks_r4(const block_iq5_ks_r4 * x, float * y, int64_t k) { } void vec_dot_iq5_ks_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ5_KS_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -5960,11 +5904,9 @@ void dequantize_row_q8_k_r8(const block_q8_k_r8 * x, float * y, int64_t k) { } void vec_dot_q8_k_r8_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q8_K_R8, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -6104,11 +6046,9 @@ void dequantize_row_q8_KV_r8(const void * vx, float * y, int64_t k) { } void vec_dot_q8_KV_r8_q8_KV(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q8_KV_R8, vx, 0, GGML_TYPE_Q8_KV, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -6277,11 +6217,9 @@ void dequantize_row_iq3_k_r4(const block_iq3_k_r4 * x, float * y, int64_t k) { } void vec_dot_iq3_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ3_K_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -6400,11 +6338,9 @@ void dequantize_row_iq2_k_r4(const block_iq2_k_r4 * x, float * y, int64_t k) { } void vec_dot_iq2_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_K_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -6516,11 +6452,9 @@ void dequantize_row_iq2_xxs_r4(const block_iq2_xxs_r4 * x, float * y, int64_t k) } void vec_dot_iq2_xxs_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_XXS_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -6603,11 +6537,9 @@ void dequantize_row_iq2_xs_r4(const block_iq2_xs_r4 * x, float * y, int64_t k) { } void vec_dot_iq2_xs_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_XS_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -6690,11 +6622,9 @@ void dequantize_row_iq2_s_r4(const block_iq2_s_r4 * x, float * y, int64_t k) { } void vec_dot_iq2_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_S_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -6793,11 +6723,9 @@ void dequantize_row_iq3_xxs_r4(const block_iq3_xxs_r4 * x, float * y, int64_t k) } void vec_dot_iq3_xxs_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ3_XXS_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -6894,11 +6822,9 @@ void dequantize_row_iq3_s_r4(const block_iq3_s_r4 * x, float * y, int64_t k) { } void vec_dot_iq3_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ3_S_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -7025,11 +6951,9 @@ void dequantize_row_iq1_s_r4(const block_iq1_s_r4 * x, float * y, int64_t n) { } void vec_dot_iq1_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ1_S_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -7171,11 +7095,9 @@ void dequantize_row_iq1_m_r4(const block_iq1_m_r4 * x, float * y, int64_t n) { } void vec_dot_iq1_m_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ1_M_R4, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -7211,11 +7133,9 @@ void dequantize_row_q8_KV(const void * x, float * y, int64_t k) { } void vec_dot_q8_KV_q8_KV(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q8_KV, vx, 0, GGML_TYPE_Q8_KV, vy, 0, s, 0, 0, 1)) { return; } -#endif GGML_ASSERT(n%QK4_NL == 0); GGML_ASSERT(nrc == 1); GGML_UNUSED(bs); @@ -8274,11 +8194,9 @@ void vec_dot_iq2_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx GGML_UNUSED(by); GGML_UNUSED(bs); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ2_KT, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif } @@ -8536,11 +8454,9 @@ void vec_dot_iq3_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx GGML_UNUSED(by); GGML_UNUSED(bs); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ3_KT, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif } @@ -8787,10 +8703,8 @@ void vec_dot_iq4_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx GGML_UNUSED(by); GGML_UNUSED(bs); -#if GGML_USE_IQK_MULMAT if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_KT, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { return; } -#endif }