Commit 3a0324b

Reimplant missing bits of PRs & IK_Llama commits.
1 parent: c038eaa

4 files changed: +64 −2 lines changed
src/llama-kv-cache.h

Lines changed: 7 additions & 0 deletions

@@ -77,6 +77,13 @@ struct llama_kv_cache {
     }
 };

+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
 // a structure holds information about the slot found in llama_kv_cache_find_slot
 struct llama_kv_cache_slot_info {
     std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
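
The added llama_kv_defrag_move record describes one contiguous run of KV cells to relocate while compacting the cache. As a rough illustration only (a minimal sketch assuming a flat per-buffer layout; the apply_defrag_moves helper is hypothetical and not part of this commit):

    #include <cstddef>   // size_t
    #include <cstdint>   // uint32_t, uint8_t
    #include <cstring>   // std::memmove
    #include <vector>

    // re-declared here so the sketch is self-contained; matches the diff above
    struct llama_kv_defrag_move {
        uint32_t src;
        uint32_t dst;
        uint32_t len;
    };

    // Hypothetical helper: copy `len` KV slots from `src` to `dst` in one flat
    // buffer where each slot occupies `slot_size` bytes.
    static void apply_defrag_moves(uint8_t * buf, size_t slot_size,
                                   const std::vector<llama_kv_defrag_move> & moves) {
        for (const auto & m : moves) {
            // source and destination runs may overlap when compacting
            // toward the front of the buffer, so use memmove, not memcpy
            std::memmove(buf + (size_t) m.dst * slot_size,
                         buf + (size_t) m.src * slot_size,
                         (size_t) m.len * slot_size);
        }
    }

Grouping moves into (src, dst, len) runs rather than moving cells one at a time keeps the number of copies (or graph ops, on GPU backends) proportional to the number of fragments instead of the number of cells.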

src/llama-model-loader.cpp

Lines changed: 14 additions & 0 deletions

@@ -517,6 +517,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
         case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
         case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
         case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
+        case GGML_TYPE_Q6_0: ftype = LLAMA_FTYPE_MOSTLY_Q6_0; break;
         case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
         case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
         case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
@@ -531,8 +532,21 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
         case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
         case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
         case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
+        // case GGML_TYPE_IQ1_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ1_BN; break;
+        // case GGML_TYPE_IQ2_BN: ftype = LLAMA_FTYPE_MOSTLY_IQ2_BN; break;
         case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
         case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
+        case GGML_TYPE_IQ4_KS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_KS; break;
+        case GGML_TYPE_IQ4_KSS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_KSS; break;
+        case GGML_TYPE_IQ2_KS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_KS; break;
+        case GGML_TYPE_IQ2_KT: ftype = LLAMA_FTYPE_MOSTLY_IQ2_KT; break;
+        case GGML_TYPE_IQ2_K: ftype = LLAMA_FTYPE_MOSTLY_IQ2_K; break;
+        case GGML_TYPE_IQ3_K: ftype = LLAMA_FTYPE_MOSTLY_IQ3_K; break;
+        case GGML_TYPE_IQ3_KT: ftype = LLAMA_FTYPE_MOSTLY_IQ3_KT; break;
+        case GGML_TYPE_IQ4_KT: ftype = LLAMA_FTYPE_MOSTLY_IQ4_KT; break;
+        case GGML_TYPE_IQ4_K: ftype = LLAMA_FTYPE_MOSTLY_IQ4_K; break;
+        case GGML_TYPE_IQ5_K: ftype = LLAMA_FTYPE_MOSTLY_IQ5_K; break;
+        case GGML_TYPE_IQ6_K: ftype = LLAMA_FTYPE_MOSTLY_IQ6_K; break;
         case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
         default:
             {
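
This switch is part of file-type inference at load time: the loader tallies the tensor types found in the GGUF and maps the most frequent one to a LLAMA_FTYPE_MOSTLY_* value, which is why every newly supported GGML type needs a case here. A minimal, self-contained sketch of that counting idiom (plain int ids stand in for ggml_type; this is not the loader's exact code):

    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
        // pretend type ids collected while scanning tensor metadata
        std::vector<int> tensor_types = {8, 8, 8, 60, 8, 8};

        std::map<int, int> counts;
        for (int t : tensor_types) {
            counts[t]++;
        }

        // pick the dominant type; it would then go through a switch like the
        // one above to select the matching LLAMA_FTYPE_MOSTLY_* value
        int type_max = -1;
        int n_max    = 0;
        for (const auto & kv : counts) {
            if (kv.second > n_max) {
                n_max    = kv.second;
                type_max = kv.first;
            }
        }
        std::printf("dominant tensor type id: %d (%d of %zu tensors)\n",
                    type_max, n_max, tensor_types.size());
        return 0;
    }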

src/llama-model.cpp

Lines changed: 15 additions & 0 deletions

@@ -99,6 +99,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q6_0: return "Q6_0";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
         case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
@@ -122,6 +123,20 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_KS: return "IQ4_KS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_KSS: return "IQ4_KSS - 4.0 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_KS: return "IQ2_KS - 2.1875 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_KT: return "IQ2_KT - 2.125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_K: return "IQ2_K - 2.375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_K: return "IQ3_K - 3.4325 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_KL: return "IQ3_KL - 4 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_KT: return "IQ3_KT - 3.125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_KT: return "IQ4_KT - 4.0 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_K: return "IQ4_K - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ5_K: return "IQ5_K - 5.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ6_K: return "IQ6_K - 6.6 bpw";
+        // case LLAMA_FTYPE_MOSTLY_IQ1_BN: return "IQ1_BN - 1.625 bpw Bitnet";
+        // case LLAMA_FTYPE_MOSTLY_IQ2_BN: return "IQ2_BN - 2.00 bpw Bitnet";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
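
The bpw figures in these labels are the nominal bits per weight of each quantization format. A quick way to sanity-check a label against a real file is to compare it with 8 * file_size / n_params; a minimal sketch with made-up numbers:

    #include <cstdio>

    int main() {
        // hypothetical values: an 8B-parameter model quantized to a ~4.2 GB file
        const double file_bytes = 4.2e9;
        const double n_params   = 8.0e9;
        // effective bits per weight; expect it somewhat above the nominal bpw,
        // since some tensors are usually kept at higher precision
        std::printf("effective bpw: %.2f\n", 8.0 * file_bytes / n_params);
        return 0;
    }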

src/llama-quant.cpp

Lines changed: 28 additions & 2 deletions

@@ -394,9 +394,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
-        case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_Q4_0; break;
+        case GGML_TYPE_IQ2_KS:
+        case GGML_TYPE_IQ2_K:
+        case GGML_TYPE_IQ3_K:
+        case GGML_TYPE_IQ2_KT:
+        case GGML_TYPE_IQ3_KT:
+        case GGML_TYPE_IQ4_KSS:
+        case GGML_TYPE_IQ4_KS:
+        case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+        case GGML_TYPE_IQ4_K:
+        case GGML_TYPE_IQ4_KT:
         case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
-        case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+        case GGML_TYPE_IQ5_K:
+        case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q6_0; break;
+        case GGML_TYPE_IQ6_K:
         case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
         default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
     }
@@ -473,6 +484,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
+        case LLAMA_FTYPE_MOSTLY_Q6_0: default_type = GGML_TYPE_Q6_0; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
@@ -494,13 +506,27 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_KS: default_type = GGML_TYPE_IQ2_KS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_KT: default_type = GGML_TYPE_IQ2_KT; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_KT: default_type = GGML_TYPE_IQ3_KT; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_KT: default_type = GGML_TYPE_IQ4_KT; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
+        // case LLAMA_FTYPE_MOSTLY_IQ1_BN: default_type = GGML_TYPE_IQ1_BN; break;
+        // case LLAMA_FTYPE_MOSTLY_IQ2_BN: default_type = GGML_TYPE_IQ2_BN; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_KS: default_type = GGML_TYPE_IQ4_KS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_KSS: default_type = GGML_TYPE_IQ4_KSS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_K: default_type = GGML_TYPE_IQ2_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_K: default_type = GGML_TYPE_IQ3_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_KL: default_type = GGML_TYPE_IQ3_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_K: default_type = GGML_TYPE_IQ4_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ5_K: default_type = GGML_TYPE_IQ5_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ6_K: default_type = GGML_TYPE_IQ6_K; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
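
The first hunk above extends the fallback table in llama_tensor_get_type: when a tensor cannot take the requested quant (typically because its row length is not a multiple of the super-block width used by k- and i-quants), each new type now degrades to a legacy type of similar bit width (IQ4_NL, Q5_0, Q6_0, Q8_0) instead of Q4_0/Q5_1. A minimal sketch of that shape check (block sizes are ggml's usual values, but the enum and helper are stand-ins, not this file's code):

    #include <cstdint>

    // ggml's usual block widths: 256 for k-/i-quant super-blocks, 32 for IQ4_NL
    constexpr int64_t QK_K   = 256;
    constexpr int64_t QK4_NL = 32;

    enum class qtype { iq4_ks, iq4_nl, q8_0 };

    // pick a fallback when `n_cols` (row length) is incompatible with `wanted`
    static qtype pick_compatible_type(qtype wanted, int64_t n_cols) {
        if (n_cols % QK_K != 0) {
            // row not divisible by 256: degrade per the table above, e.g.
            // IQ4_KS -> IQ4_NL, whose 32-wide blocks still divide the row
            if (wanted == qtype::iq4_ks && n_cols % QK4_NL == 0) {
                return qtype::iq4_nl;
            }
            return qtype::q8_0; // last-resort fallback
        }
        return wanted; // requested type fits as-is
    }

    int main() {
        // 96 is not a multiple of 256 but is a multiple of 32 -> IQ4_NL
        return pick_compatible_type(qtype::iq4_ks, 96) == qtype::iq4_nl ? 0 : 1;
    }

Falling back to a same-width type keeps the file size close to what the user asked for, which is the point of the changed mappings in this hunk.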
