Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ option(GGML_BLAS "ggml: use BLAS"
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
option(GGML_IQK_MUL_MAT "ggml: use optimized iqk matrix multiplications" ON)

option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
Expand Down
71 changes: 34 additions & 37 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -255,44 +255,41 @@ endif()

set (GGML_SOURCES_IQK iqk/iqk_quantize.cpp)
set (GGML_HEADERS_IQK iqk/iqk_config.h)
if (GGML_IQK_MUL_MAT)
message(STATUS "Using optimized iqk matrix multiplications")
add_compile_definitions(GGML_USE_IQK_MULMAT)
set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp
iqk/iqk_flash_attn.cpp
iqk/fa/iqk_fa_576_512.cpp
iqk/fa/iqk_fa_192_128.cpp
iqk/fa/iqk_fa_256_256.cpp
iqk/fa/iqk_fa_128_128.cpp
iqk/fa/iqk_fa_96_96.cpp
iqk/fa/iqk_fa_64_64.cpp
iqk/iqk_gemm_floats.cpp
iqk/iqk_gemm_kquants.cpp
iqk/iqk_gemm_ktquants.cpp
iqk/iqk_gemm_iquants.cpp
iqk/iqk_gemm_iqk_quants.cpp
iqk/iqk_gemm_1bit.cpp
iqk/iqk_gemm_legacy_quants.cpp)
set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h
iqk/iqk_flash_impl.h
iqk/fa/iqk_fa_templates.h
iqk/iqk_gemm_floats.h
iqk/iqk_gemm_kquants.h
iqk/iqk_gemm_ktquants.h
iqk/iqk_gemm_iquants.h
iqk/iqk_gemm_iqk_quants.h
iqk/iqk_gemm_1bit.h
iqk/iqk_gemm_legacy_quants.h)
if (GGML_IQK_FLASH_ATTENTION)
message(STATUS "Enabling IQK Flash Attention kernels")
add_compile_definitions(GGML_IQK_FLASH_ATTENTION)
if (GGML_IQK_FA_ALL_QUANTS)
message(STATUS "Including all IQK FA kernels")
add_compile_definitions(GGML_IQK_FA_ALL_QUANTS)
endif()
else()
message(STATUS "Disabling IQK Flash Attention kernels")
message(STATUS "Using optimized iqk matrix multiplications")
set(GGML_SOURCES_IQK_MM iqk/iqk_mul_mat.cpp
iqk/iqk_flash_attn.cpp
iqk/fa/iqk_fa_576_512.cpp
iqk/fa/iqk_fa_192_128.cpp
iqk/fa/iqk_fa_256_256.cpp
iqk/fa/iqk_fa_128_128.cpp
iqk/fa/iqk_fa_96_96.cpp
iqk/fa/iqk_fa_64_64.cpp
iqk/iqk_gemm_floats.cpp
iqk/iqk_gemm_kquants.cpp
iqk/iqk_gemm_ktquants.cpp
iqk/iqk_gemm_iquants.cpp
iqk/iqk_gemm_iqk_quants.cpp
iqk/iqk_gemm_1bit.cpp
iqk/iqk_gemm_legacy_quants.cpp)
set(GGML_HEADERS_IQK_MM iqk/iqk_mul_mat.h
iqk/iqk_flash_impl.h
iqk/fa/iqk_fa_templates.h
iqk/iqk_gemm_floats.h
iqk/iqk_gemm_kquants.h
iqk/iqk_gemm_ktquants.h
iqk/iqk_gemm_iquants.h
iqk/iqk_gemm_iqk_quants.h
iqk/iqk_gemm_1bit.h
iqk/iqk_gemm_legacy_quants.h)
if (GGML_IQK_FLASH_ATTENTION)
message(STATUS "Enabling IQK Flash Attention kernels")
add_compile_definitions(GGML_IQK_FLASH_ATTENTION)
if (GGML_IQK_FA_ALL_QUANTS)
message(STATUS "Including all IQK FA kernels")
add_compile_definitions(GGML_IQK_FA_ALL_QUANTS)
endif()
else()
message(STATUS "Disabling IQK Flash Attention kernels")
endif()

if (GGML_LLAMAFILE)
Expand Down
20 changes: 0 additions & 20 deletions ggml/src/ggml-quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,9 @@

#include "ggml-quants.h"
#include "ggml-impl.h"
#if GGML_USE_IQK_MULMAT
#include "iqk/iqk_config.h"
#include "iqk/iqk_mul_mat.h"
#include "iqk/iqk_quantize.h"
#endif


#include <math.h>
Expand Down Expand Up @@ -3933,11 +3931,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int6
}

void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
#ifdef GGML_USE_IQK_MULMAT
iqk_quantize_row_q8_K(x, y, k);
#else
quantize_row_q8_K_ref(x, y, k);
#endif
}

//===================================== Dot ptoducts =================================
Expand Down Expand Up @@ -4023,11 +4017,9 @@ static inline __m128i get_scale_shuffle(int i) {
#endif

void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
#if GGML_USE_IQK_MULMAT
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q4_0, vx, bx, GGML_TYPE_Q8_0, vy, by, s, bs, 0, 1)) {
return;
}
#endif
const int qk = QK8_0;
const int nb = n / qk;

Expand Down Expand Up @@ -4510,11 +4502,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
}

void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
#if GGML_USE_IQK_MULMAT
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q4_1, vx, bx, GGML_TYPE_Q8_1, vy, by, s, bs, 0, 1)) {
return;
}
#endif
const int qk = QK8_1;
const int nb = n / qk;

Expand Down Expand Up @@ -4802,7 +4792,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
}

void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
#if GGML_USE_IQK_MULMAT
#ifdef __AVX2__
const enum ggml_type vec_dot_type = GGML_TYPE_Q8_1;
#else
Expand All @@ -4811,7 +4800,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q5_0, vx, bx, vec_dot_type, vy, by, s, bs, 0, 1)) {
return;
}
#endif
const int qk = QK8_0;
const int nb = n / qk;

Expand Down Expand Up @@ -5167,11 +5155,9 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
}

void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
#if GGML_USE_IQK_MULMAT
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q5_1, vx, bx, GGML_TYPE_Q8_1, vy, by, s, bs, 0, 1)) {
return;
}
#endif
const int qk = QK8_1;
const int nb = n / qk;

Expand Down Expand Up @@ -5546,7 +5532,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
}

void ggml_vec_dot_q6_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
#if GGML_USE_IQK_MULMAT
#ifdef __AVX2__
const enum ggml_type vec_dot_type = GGML_TYPE_Q8_1;
#else
Expand All @@ -5555,13 +5540,11 @@ void ggml_vec_dot_q6_0_q8_0(int n, float * restrict s, size_t bs, const void * r
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q6_0, vx, bx, vec_dot_type, vy, by, s, bs, 0, 1)) {
return;
}
#endif
// TODO
*s = 0;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
#if GGML_USE_IQK_MULMAT
#ifdef HAVE_FANCY_SIMD
enum ggml_type dot_type = GGML_TYPE_Q8_1_X4;
#else
Expand All @@ -5570,7 +5553,6 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q8_0, vx, bx, dot_type, vy, by, s, bs, 0, 1)) {
return;
}
#endif
const int qk = QK8_0;
const int nb = n / qk;

Expand Down Expand Up @@ -11940,11 +11922,9 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
}

void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
#if GGML_USE_IQK_MULMAT
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_IQ4_NL, vx, bx, GGML_TYPE_Q8_0, vy, by, s, bs, 0, 1)) {
return;
}
#endif
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
Expand Down
Loading