Skip to content

Commit f4b014c

Browse files
committed
IK_Llama partial activation WIP
Forgotten changes, more convergence & updated refs, to understand how those vec_dot / repacking combos work. Update ggml-cpu-quants.c. Update iqk_mul_mat.cpp. Revert "Update iqk_mul_mat.cpp" (this reverts commit 8cae909). Revert "Update ggml-cpu.c" (this reverts commit 4cd7494). Add quantize row declarations for IQ quants. IK_Llama partial activation WIP. Forgotten changes, more convergence & updated refs, to understand how those vec_dot / repacking combos work.
1 parent 99d12de commit f4b014c

File tree

3 files changed

+247
-45
lines changed

3 files changed

+247
-45
lines changed

ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1478,23 +1478,11 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1
14781478

14791479
//===================================== Q8_K ==============================================
14801480

1481-
// void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
1482-
// quantize_row_q8_K_ref(x, y, k);
1483-
// }
1484-
1485-
// void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
1486-
// #ifdef GGML_USE_IQK_MULMAT
1487-
// iqk_quantize_row_q8_K(x, y, k);
1488-
// #else
1489-
// quantize_row_q8_K_ref(x, y, k);
1490-
// #endif
1491-
// }
1492-
1493-
// void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
1494-
// quantize_row_q8_K_ref(x, y, k);
1495-
// }
14961481

14971482
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
1483+
#ifdef GGML_USE_IQK_MULMAT
1484+
iqk_quantize_row_q8_K(x, y, k);
1485+
#else
14981486
#ifdef __wasm_simd128__
14991487
assert(k % QK_K == 0);
15001488
const int64_t nb = k / QK_K;
@@ -1576,6 +1564,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
15761564
#else
15771565
quantize_row_q8_K_ref(x, y, k);
15781566
#endif
1567+
#endif
15791568
}
15801569

15811570
//===================================== Dot products =================================
@@ -1661,6 +1650,11 @@ static inline __m128i get_scale_shuffle(int i) {
16611650
#endif
16621651

16631652
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1653+
#if GGML_USE_IQK_MULMAT
1654+
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q4_0, vx, bx, GGML_TYPE_Q8_0, vy, by, s, bs, 0, 1)) {
1655+
return;
1656+
}
1657+
#endif
16641658
const int qk = QK8_0;
16651659
const int nb = n / qk;
16661660

@@ -2359,6 +2353,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
23592353
}
23602354

23612355
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2356+
#if GGML_USE_IQK_MULMAT
2357+
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q4_1, vx, bx, GGML_TYPE_Q8_1, vy, by, s, bs, 0, 1)) {
2358+
return;
2359+
}
2360+
#endif
23622361
const int qk = QK8_1;
23632362
const int nb = n / qk;
23642363

@@ -2679,6 +2678,16 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
26792678
}
26802679

26812680
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2681+
#if GGML_USE_IQK_MULMAT
2682+
#ifdef __AVX2__
2683+
const enum ggml_type vec_dot_type = GGML_TYPE_Q8_1;
2684+
#else
2685+
const enum ggml_type vec_dot_type = GGML_TYPE_Q8_0;
2686+
#endif
2687+
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q5_0, vx, bx, vec_dot_type, vy, by, s, bs, 0, 1)) {
2688+
return;
2689+
}
2690+
#endif
26822691
const int qk = QK8_0;
26832692
const int nb = n / qk;
26842693

@@ -3002,6 +3011,11 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
30023011
}
30033012

30043013
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3014+
#if GGML_USE_IQK_MULMAT
3015+
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q5_1, vx, bx, GGML_TYPE_Q8_1, vy, by, s, bs, 0, 1)) {
3016+
return;
3017+
}
3018+
#endif
30053019
const int qk = QK8_1;
30063020
const int nb = n / qk;
30073021

@@ -3361,6 +3375,16 @@ void ggml_vec_dot_q6_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
33613375
}
33623376

33633377
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
3378+
#if GGML_USE_IQK_MULMAT
3379+
#ifdef HAVE_FANCY_SIMD
3380+
enum ggml_type dot_type = GGML_TYPE_Q8_1_X4;
3381+
#else
3382+
enum ggml_type dot_type = GGML_TYPE_Q8_0_X4;
3383+
#endif
3384+
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_Q8_0, vx, bx, dot_type, vy, by, s, bs, 0, 1)) {
3385+
return;
3386+
}
3387+
#endif
33643388
const int qk = QK8_0;
33653389
const int nb = n / qk;
33663390

@@ -13086,6 +13110,11 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
1308613110
}
1308713111

1308813112
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
13113+
#if GGML_USE_IQK_MULMAT
13114+
if (iqk_mul_mat(nrc, nrc, n, GGML_TYPE_IQ4_NL, vx, bx, GGML_TYPE_Q8_0, vy, by, s, bs, 0, 1)) {
13115+
return;
13116+
}
13117+
#endif
1308913118
assert(nrc == 1);
1309013119
UNUSED(nrc);
1309113120
UNUSED(bx);

0 commit comments

Comments
 (0)