Skip to content

Commit 7f61b30

Browse files
ikawrakow (Iwan Kawrakow) and a co-author authored
IQ1_M_R4: better 1.75 bpw quants (#187)
* iq1_m_r4: basics (quantize/dequantize)
* iq1_m_r4: Zen4 gemm
* iq1_m_r4: neon gemm
* iq1_m_r4: switch to q8_0_x4 also on AVX2/Zen4. With the deltas being per group of 8, we cannot make use of the q8 sums stored in q8_1, so we get a tiny gain by using q8_0_x4.
* iq1_m_r4: rename mul_mat_iq1_m_r4_q8_1 to mul_mat_iq1_m_r4_q8_0

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent a6f9f2e commit 7f61b30

File tree

11 files changed

+553
-230
lines changed

11 files changed

+553
-230
lines changed

examples/quantize/quantize.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
3030
{ "IQ2_M_R4", LLAMA_FTYPE_MOSTLY_IQ2_M_R4, " 2.7 bpw quantization", },
3131
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
3232
{ "IQ1_S_R4", LLAMA_FTYPE_MOSTLY_IQ1_S_R4, " 1.5 bpw quantization", },
33+
{ "IQ1_M_R4", LLAMA_FTYPE_MOSTLY_IQ1_M_R4, " 1.75 bpw quantization", },
3334
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
3435
{ "IQ1_BN", LLAMA_FTYPE_MOSTLY_IQ1_BN, " 1.62 bpw quantization (Bitnet)", },
3536
{ "IQ2_BN", LLAMA_FTYPE_MOSTLY_IQ2_BN, " 2.00 bpw quantization (Bitnet)", },
@@ -512,6 +513,7 @@ int main(int argc, char ** argv) {
512513
params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS_R4 ||
513514
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
514515
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S_R4 ||
516+
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M_R4 ||
515517
params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) {
516518
fprintf(stderr, "\n==========================================================================================================\n");
517519
fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");

ggml/include/ggml.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,7 @@ extern "C" {
432432
GGML_TYPE_IQ3_S_R4 = 221,
433433
GGML_TYPE_IQ2_S_R4 = 222,
434434
GGML_TYPE_IQ4_XS_R4 = 223,
435+
GGML_TYPE_IQ1_M_R4 = 229,
435436
GGML_TYPE_BF16_R16 = 230,
436437
GGML_TYPE_Q6_0_R4 = 233,
437438
GGML_TYPE_IQ2_BN_R4 = 335,
@@ -516,6 +517,7 @@ extern "C" {
516517
GGML_FTYPE_MOSTLY_IQ3_S_R4 = 220, // except 1d tensors
517518
GGML_FTYPE_MOSTLY_IQ2_S_R4 = 221, // except 1d tensors
518519
GGML_FTYPE_MOSTLY_IQ4_XS_R4 = 222, // except 1d tensors
520+
GGML_FTYPE_MOSTLY_IQ1_M_R4 = 223, // except 1d tensors
519521
GGML_FTYPE_MOSTLY_BF16_R16 = 224, // except 1d tensors
520522
GGML_FTYPE_MOSTLY_Q6_0_R4 = 227, // except 1d tensors
521523
GGML_FTYPE_MOSTLY_IQ2_BN_R4 = 329, // except 1d tensors

ggml/src/ggml-common.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,14 @@ typedef struct {
499499
} block_iq1_m;
500500
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
501501

502+
// 1.75 bpw - blocks of 32 with 4 interleaved rows = 128 quants
// NOTE(review): unlike block_iq1_m there is no scale/delta field in this struct;
// the type_traits entry for iq1_m_r4 uses row_meta_size = 2, which presumably
// carries the per-row (fp16) scale instead - confirm against quantize/dequantize.
503+
typedef struct {
504+
uint8_t qs[16]; // grid index, low 8 bits
505+
uint8_t qh[ 8]; // grid index, high 3 bits + grid shift bits (for two groups of 8)
506+
uint8_t scales[4]; // 4-bit block scales
507+
} block_iq1_m_r4;
508+
static_assert(sizeof(block_iq1_m_r4) == 28, "wrong iq1_m_r4 block size/padding");
509+
502510
//
503511
// Bitnet and TriLM - implemented as 1.625 bpw
504512
//

ggml/src/ggml-quants.c

Lines changed: 179 additions & 224 deletions
Large diffs are not rendered by default.

ggml/src/ggml-quants.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGM
4343
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
4444
void quantize_row_iq1_bn_ref (const float * GGML_RESTRICT x, block_iq1_bn * GGML_RESTRICT y, int64_t k);
4545
void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGML_RESTRICT y, int64_t k);
46+
void quantize_row_iq1_m_ref (const float * GGML_RESTRICT x, block_iq1_m * GGML_RESTRICT y, int64_t k);
4647

4748
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
4849
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -68,6 +69,7 @@ void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y,
6869
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
6970
void quantize_row_iq1_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
7071
void quantize_row_iq1_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
72+
void quantize_row_iq1_m (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
7173

7274
// Dequantization
7375
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
@@ -152,6 +154,8 @@ void iq3xs_free_impl(int grid_size);
152154

153155
void iq1s_process_1block(int block_size, const float * xb, const float * weight, int8_t * L,
154156
float * the_scale, uint16_t * the_index, int * the_shift, float * pairs, float * sumx, float * sumw);
157+
void iq1m_process_1block(const float * xb, const float * weight, int8_t * L,
158+
float * the_scale, uint16_t * the_index, int * the_shift, float * pairs);
155159

156160
#if defined(__ARM_FEATURE_SVE)
157161
extern int ggml_sve_cnt_b;

ggml/src/ggml.c

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,13 +1202,26 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
12021202
.type_size = sizeof(block_iq1_m),
12031203
.is_quantized = true,
12041204
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
1205-
.from_float = NULL,
1206-
.from_float_ref = NULL,
1205+
.from_float = quantize_row_iq1_m,
1206+
.from_float_ref = (ggml_from_float_t)quantize_row_iq1_m_ref,
12071207
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
12081208
.vec_dot_type = GGML_TYPE_Q8_K,
12091209
.nrows = 1,
12101210
.row_meta_size = 0,
12111211
},
1212+
[GGML_TYPE_IQ1_M_R4] = {
1213+
.type_name = "iq1_m_r4",
1214+
.blck_size = 32,
1215+
.type_size = sizeof(block_iq1_m_r4)/4,
1216+
.is_quantized = true,
1217+
.to_float = (ggml_to_float_t) dequantize_row_iq1_m_r4,
1218+
.from_float = quantize_row_iq1_m_r4,
1219+
.from_float_ref = (ggml_from_float_t)quantize_row_iq1_m_r4_ref,
1220+
.vec_dot = vec_dot_iq1_m_r4_q8_k,
1221+
.vec_dot_type = GGML_TYPE_Q8_0_X4,
1222+
.nrows = 1,
1223+
.row_meta_size = 2,
1224+
},
12121225
[GGML_TYPE_IQ1_BN] = {
12131226
.type_name = "iq1_bn",
12141227
.blck_size = QK_IQ1BN,
@@ -4401,6 +4414,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
44014414
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
44024415
case GGML_FTYPE_MOSTLY_IQ2_S_R4: wtype = GGML_TYPE_IQ2_S_R4; break;
44034416
case GGML_FTYPE_MOSTLY_IQ1_S_R4: wtype = GGML_TYPE_IQ1_S_R4; break;
4417+
case GGML_FTYPE_MOSTLY_IQ1_M_R4: wtype = GGML_TYPE_IQ1_M_R4; break;
44044418
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
44054419
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
44064420
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
@@ -10949,6 +10963,7 @@ static void ggml_compute_forward_add(
1094910963
case GGML_TYPE_IQ2_S:
1095010964
case GGML_TYPE_IQ2_S_R4:
1095110965
case GGML_TYPE_IQ1_S_R4:
10966+
case GGML_TYPE_IQ1_M_R4:
1095210967
case GGML_TYPE_Q4_0_4_4:
1095310968
case GGML_TYPE_Q4_0_4_8:
1095410969
case GGML_TYPE_Q4_0_8_8:
@@ -11418,6 +11433,7 @@ static void ggml_compute_forward_add1(
1141811433
case GGML_TYPE_IQ2_S:
1141911434
case GGML_TYPE_IQ2_S_R4:
1142011435
case GGML_TYPE_IQ1_S_R4:
11436+
case GGML_TYPE_IQ1_M_R4:
1142111437
case GGML_TYPE_Q4_0_4_4:
1142211438
case GGML_TYPE_Q4_0_4_8:
1142311439
case GGML_TYPE_Q4_0_8_8:
@@ -11584,6 +11600,7 @@ static void ggml_compute_forward_acc(
1158411600
case GGML_TYPE_IQ2_S:
1158511601
case GGML_TYPE_IQ2_S_R4:
1158611602
case GGML_TYPE_IQ1_S_R4:
11603+
case GGML_TYPE_IQ1_M_R4:
1158711604
case GGML_TYPE_Q4_0_4_4:
1158811605
case GGML_TYPE_Q4_0_4_8:
1158911606
case GGML_TYPE_Q4_0_8_8:
@@ -14823,6 +14840,7 @@ static void ggml_compute_forward_out_prod(
1482314840
case GGML_TYPE_IQ2_S:
1482414841
case GGML_TYPE_IQ2_S_R4:
1482514842
case GGML_TYPE_IQ1_S_R4:
14843+
case GGML_TYPE_IQ1_M_R4:
1482614844
case GGML_TYPE_Q4_0_4_4:
1482714845
case GGML_TYPE_Q4_0_4_8:
1482814846
case GGML_TYPE_Q4_0_8_8:
@@ -15229,6 +15247,7 @@ static void ggml_compute_forward_set(
1522915247
case GGML_TYPE_IQ2_S:
1523015248
case GGML_TYPE_IQ2_S_R4:
1523115249
case GGML_TYPE_IQ1_S_R4:
15250+
case GGML_TYPE_IQ1_M_R4:
1523215251
case GGML_TYPE_Q4_0_4_4:
1523315252
case GGML_TYPE_Q4_0_4_8:
1523415253
case GGML_TYPE_Q4_0_8_8:
@@ -15529,6 +15548,7 @@ static void ggml_compute_forward_get_rows(
1552915548
case GGML_TYPE_IQ2_S:
1553015549
case GGML_TYPE_IQ2_S_R4:
1553115550
case GGML_TYPE_IQ1_S_R4:
15551+
case GGML_TYPE_IQ1_M_R4:
1553215552
case GGML_TYPE_Q4_0_4_4:
1553315553
case GGML_TYPE_Q4_0_4_8:
1553415554
case GGML_TYPE_Q4_0_8_8:
@@ -16158,6 +16178,7 @@ static void ggml_compute_forward_clamp(
1615816178
case GGML_TYPE_IQ2_S:
1615916179
case GGML_TYPE_IQ2_S_R4:
1616016180
case GGML_TYPE_IQ1_S_R4:
16181+
case GGML_TYPE_IQ1_M_R4:
1616116182
case GGML_TYPE_Q8_K:
1616216183
case GGML_TYPE_Q8_K64:
1616316184
case GGML_TYPE_Q8_K16:
@@ -22914,6 +22935,7 @@ void ggml_quantize_init(enum ggml_type type) {
2291422935
case GGML_TYPE_IQ2_S:
2291522936
case GGML_TYPE_IQ1_S:
2291622937
case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
22938+
case GGML_TYPE_IQ1_M_R4:iq2xs_init_impl(GGML_TYPE_IQ1_M); break;
2291722939
case GGML_TYPE_IQ1_S_R4:iq2xs_init_impl(GGML_TYPE_IQ1_S); break;
2291822940
case GGML_TYPE_IQ3_XXS_R4:
2291922941
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
@@ -22998,6 +23020,7 @@ size_t ggml_quantize_chunk(
2299823020
case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2299923021
case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2300023022
case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
23023+
case GGML_TYPE_IQ1_M_R4:result = quantize_iq1_m_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2300123024
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2300223025
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
2300323026
case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;

0 commit comments

Comments (0)