Skip to content

Commit 3788953

Browse files
committed
Revert "PreMerge IK quants and Treillis"
This reverts commit 946c6503dfd1dc5fd5f4f232918d76323ce77e9d.
1 parent 0298c10 commit 3788953

File tree

18 files changed

+16
-3748
lines changed

18 files changed

+16
-3748
lines changed

examples/quantize/quantize.cpp

Lines changed: 0 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
3333
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
3434
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
3535
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
36-
{ "IQ3_KT", LLAMA_FTYPE_MOSTLY_IQ3_KT, " 3.125 bpw trellis quantization", },
37-
{ "IQ4_KT", LLAMA_FTYPE_MOSTLY_IQ4_KT, " 4.0 bpw trellis quantization", },
3836
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
3937
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
4038
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
@@ -52,16 +50,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
5250
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
5351
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
5452
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
55-
{ "IQ4_KS", LLAMA_FTYPE_MOSTLY_IQ4_KS, " 4.25 bpw non-linear quantization", },
56-
{ "IQ4_KSS", LLAMA_FTYPE_MOSTLY_IQ4_KSS, " 4.0 bpw non-linear quantization", },
57-
{ "IQ2_K", LLAMA_FTYPE_MOSTLY_IQ2_K, " 2.375 bpw non-linear quantization",},
58-
{ "IQ2_KS", LLAMA_FTYPE_MOSTLY_IQ2_KS, " 2.1875 bpw non-linear quantization",},
59-
{ "IQ2_KT", LLAMA_FTYPE_MOSTLY_IQ2_KT, " 2.125 bpw trellis quantization", },
60-
{ "IQ3_K", LLAMA_FTYPE_MOSTLY_IQ3_K, " 3.44 bpw non-linear quantization", },
61-
{ "IQ3_KL", LLAMA_FTYPE_MOSTLY_IQ3_KL, " 4 bpw non-linear quantization mix",},
62-
{ "IQ4_K", LLAMA_FTYPE_MOSTLY_IQ4_K, " 4.5 bpw non-linear quantization", },
63-
{ "IQ5_K", LLAMA_FTYPE_MOSTLY_IQ5_K, " 5.5 bpw non-linear quantization", },
64-
{ "IQ6_K", LLAMA_FTYPE_MOSTLY_IQ6_K, " 6.6 bpw non-linear quantization", },
6553
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
6654
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
6755
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },

ggml/include/ggml.h

Lines changed: 0 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -397,22 +397,6 @@ extern "C" {
397397
GGML_TYPE_TQ2_0 = 35,
398398
//
399399
GGML_TYPE_Q6_0 = 133,
400-
GGML_TYPE_IQ1_BN = 134,
401-
GGML_TYPE_IQ2_BN = 135,
402-
GGML_TYPE_Q8_K64 = 136,
403-
GGML_TYPE_IQ2_K = 137,
404-
GGML_TYPE_IQ3_K = 138,
405-
GGML_TYPE_IQ4_K = 139,
406-
GGML_TYPE_IQ5_K = 140,
407-
GGML_TYPE_IQ6_K = 141,
408-
// depricated: GGML_TYPE_IQ2_TN = 142,
409-
// depricated: GGML_TYPE_IQ1_TN = 143,
410-
GGML_TYPE_IQ4_KS = 144,
411-
GGML_TYPE_IQ2_KS = 145,
412-
GGML_TYPE_IQ4_KSS = 146,
413-
GGML_TYPE_IQ2_KT = 147,
414-
GGML_TYPE_IQ3_KT = 148,
415-
GGML_TYPE_IQ4_KT = 149,
416400
GGML_TYPE_COUNT,
417401
};
418402

@@ -459,21 +443,6 @@ extern "C" {
459443
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
460444
//
461445
GGML_FTYPE_MOSTLY_Q6_0 = 127, // except 1d tensors
462-
GGML_FTYPE_MOSTLY_IQ1_BN = 128, // except 1d tensors
463-
GGML_FTYPE_MOSTLY_IQ2_BN = 129, // except 1d tensors
464-
GGML_FTYPE_MOSTLY_IQ2_K = 130, // except 1d tensors
465-
GGML_FTYPE_MOSTLY_IQ3_K = 131, // except 1d tensors
466-
GGML_FTYPE_MOSTLY_IQ4_K = 132, // except 1d tensors
467-
GGML_FTYPE_MOSTLY_IQ5_K = 133, // except 1d tensors
468-
GGML_FTYPE_MOSTLY_IQ6_K = 134, // except 1d tensors
469-
// depricated: GGML_FTYPE_MOSTLY_IQ2_TN = 135, // except 1d tensors
470-
// depricated: GGML_FTYPE_MOSTLY_IQ1_TN = 136, // except 1d tensors
471-
GGML_FTYPE_MOSTLY_IQ4_KS = 137, // except 1d tensors
472-
GGML_FTYPE_MOSTLY_IQ2_KS = 138, // except 1d tensors
473-
GGML_FTYPE_MOSTLY_IQ4_KSS = 139, // except 1d tensors
474-
GGML_FTYPE_MOSTLY_IQ2_KT = 140, // except 1d tensors
475-
GGML_FTYPE_MOSTLY_IQ3_KT = 141, // except 1d tensors
476-
GGML_FTYPE_MOSTLY_IQ4_KT = 142, // except 1d tensors
477446
};
478447

479448
// available tensor operations:

ggml/src/ggml-common.h

Lines changed: 0 additions & 83 deletions
Original file line number | Diff line number | Diff line change
@@ -429,89 +429,6 @@ typedef struct {
429429
} block_iq4_xs;
430430
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
431431

432-
typedef struct {
433-
uint8_t scales[QK_K/32];
434-
uint8_t qs[QK_K/2];
435-
} block_iq4_ks;
436-
static_assert(sizeof(block_iq4_ks) == QK_K/32 + QK_K/2, "wrong iq4_ks block size/padding");
437-
438-
typedef struct {
439-
uint32_t qs[QK_K/8];
440-
} block_iq4_kss;
441-
static_assert(sizeof(block_iq4_kss) == QK_K/8*sizeof(uint32_t), "wrong iq4_kss block size/padding");
442-
443-
typedef struct {
444-
ggml_half d;
445-
uint16_t extra;
446-
uint8_t scales[QK_K/32];
447-
uint8_t qs[QK_K/4];
448-
} block_iq2_k;
449-
static_assert(sizeof(block_iq2_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/32 + QK_K/4, "wrong iq2_k block size/padding");
450-
451-
typedef struct {
452-
uint16_t extra;
453-
uint8_t scales[QK_K/64];
454-
uint8_t qs[QK_K/4];
455-
} block_iq2_ks;
456-
static_assert(sizeof(block_iq2_ks) == sizeof(uint16_t) + QK_K/64 + QK_K/4, "wrong iq2_ks block size/padding");
457-
458-
typedef struct {
459-
uint8_t scales[QK_K/64];
460-
uint8_t ql[QK_K/4];
461-
} block_iq2_kt;
462-
static_assert(sizeof(block_iq2_kt) == QK_K/4 + QK_K/64, "wrong iq2_kt block size/padding");
463-
464-
typedef struct {
465-
uint8_t scales[QK_K/64];
466-
uint8_t ql[QK_K/4];
467-
uint8_t qh[QK_K/8];
468-
} block_iq3_kt;
469-
static_assert(sizeof(block_iq3_kt) == QK_K/4 + QK_K/8 + QK_K/64, "wrong iq3_kt block size/padding");
470-
471-
typedef struct {
472-
uint32_t qs[QK_K/8];
473-
} block_iq4_kt;
474-
static_assert(sizeof(block_iq4_kt) == QK_K/2, "wrong iq4_kt block size/padding");
475-
476-
typedef struct {
477-
ggml_half d;
478-
uint16_t extra;
479-
uint16_t scales_h;
480-
uint8_t scales_l[QK_K/32];
481-
uint8_t qs[QK_K/4];
482-
uint8_t qh[QK_K/8];
483-
} block_iq3_k;
484-
static_assert(sizeof(block_iq3_k) == sizeof(ggml_half) + 2*sizeof(uint16_t) + QK_K/32 + QK_K/4 + QK_K/8, "wrong iq3_k block size/padding");
485-
486-
typedef struct {
487-
ggml_half d;
488-
uint16_t extra;
489-
uint8_t scales_h[QK_K/64];
490-
uint8_t scales_l[QK_K/32];
491-
uint8_t qs[QK_K/2];
492-
} block_iq4_k;
493-
static_assert(sizeof(block_iq4_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/2 + 3*QK_K/64, "wrong iq4_k block size/padding");
494-
495-
typedef struct {
496-
ggml_half d;
497-
uint16_t extra;
498-
uint8_t scales_h[QK_K/64];
499-
uint8_t scales_l[QK_K/32];
500-
uint8_t qs[QK_K/2];
501-
uint8_t qh[QK_K/8];
502-
} block_iq5_k;
503-
static_assert(sizeof(block_iq5_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/2 + QK_K/8 + 3*QK_K/64, "wrong iq5_k block size/padding");
504-
505-
typedef struct {
506-
ggml_half d;
507-
uint16_t extra;
508-
int8_t scales[QK_K/16];
509-
uint8_t qs[QK_K/2];
510-
uint8_t qh[QK_K/4];
511-
} block_iq6_k;
512-
static_assert(sizeof(block_iq6_k) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/2 + QK_K/4 + QK_K/16, "wrong iq6_k block size/padding");
513-
514-
515432
#endif // GGML_COMMON_DECL
516433
#endif // GGML_COMMON_DECL
517434

0 commit comments

Comments (0)