Skip to content

Commit 03c5167

Browse files
committed
Adapt token embeddings and output.weight to vocab size
Because the token-embedding and output-weight tensors grow substantially in models with very large vocabularies, they appear to tolerate quantization with less loss; this change selects quantization types accordingly.
1 parent 97c0ae0 commit 03c5167

File tree

1 file changed

+28
-5
lines changed

1 file changed

+28
-5
lines changed

src/llama.cpp

Lines changed: 28 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -15878,15 +15878,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1587815878
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
1587915879
new_type = GGML_TYPE_Q8_0;
1588015880
}
15881-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
15882-
ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
15883-
new_type = GGML_TYPE_Q4_K;
15881+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
15882+
if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS;
15883+
else new_type = GGML_TYPE_Q4_K;
15884+
}
15885+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
15886+
if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K;
15887+
else new_type = GGML_TYPE_Q5_K;
1588415888
}
1588515889
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
1588615890
ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
1588715891
ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
1588815892
new_type = GGML_TYPE_Q5_K;
1588915893
}
15894+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
15895+
if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K;
15896+
else new_type = GGML_TYPE_Q6_K;
15897+
}
1589015898
else if (new_type != GGML_TYPE_Q8_0) {
1589115899
new_type = GGML_TYPE_Q6_K;
1589215900
}
@@ -15895,10 +15903,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
1589515903
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
1589615904
new_type = qs.params->token_embedding_type;
1589715905
} else {
15898-
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
15899-
ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
15906+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
15907+
if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_XS;
15908+
else new_type = GGML_TYPE_IQ2_S;
15909+
}
15910+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
1590015911
new_type = GGML_TYPE_IQ2_S;
1590115912
}
15913+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
15914+
if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_S;
15915+
else new_type = GGML_TYPE_IQ3_XXS;
15916+
}
15917+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
15918+
if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
15919+
else new_type = GGML_TYPE_IQ3_S;
15920+
}
15921+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
15922+
if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_S;
15923+
else new_type = GGML_TYPE_IQ4_XS;
15924+
}
1590215925
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
1590315926
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
1590415927
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;

0 commit comments

Comments
 (0)