@@ -39,10 +39,10 @@ index b16c462f..6d1568f1 100644
 const int idx = gguf_find_tensor(ctx, name);
 if (idx < 0) {
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 24e1f1f0..39045ca5 100644
+index 24e1f1f0..9957ea30 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -205,6 +205,15 @@ enum llm_arch {
+@@ -205,6 +205,16 @@ enum llm_arch {
 LLM_ARCH_GRANITE,
 LLM_ARCH_GRANITE_MOE,
 LLM_ARCH_CHAMELEON,
@@ -55,10 +55,11 @@ index 24e1f1f0..39045ca5 100644
 + LLM_ARCH_HYVID,
 + LLM_ARCH_WAN,
 + LLM_ARCH_HIDREAM,
++ LLM_ARCH_COSMOS,
 LLM_ARCH_UNKNOWN,
 };
 
-@@ -258,6 +267,15 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+@@ -258,6 +268,16 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
 { LLM_ARCH_GRANITE, "granite" },
 { LLM_ARCH_GRANITE_MOE, "granitemoe" },
 { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -71,10 +72,11 @@ index 24e1f1f0..39045ca5 100644
 + { LLM_ARCH_HYVID, "hyvid" },
 + { LLM_ARCH_WAN, "wan" },
 + { LLM_ARCH_HIDREAM, "hidream" },
++ { LLM_ARCH_COSMOS, "cosmos" },
 { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
-@@ -1531,6 +1549,15 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -1531,6 +1551,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
 { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
 },
 },
@@ -87,10 +89,11 @@ index 24e1f1f0..39045ca5 100644
 + { LLM_ARCH_HYVID, {}},
 + { LLM_ARCH_WAN, {}},
 + { LLM_ARCH_HIDREAM, {}},
++ { LLM_ARCH_COSMOS, {}},
 {
 LLM_ARCH_UNKNOWN,
 {
-@@ -5403,6 +5430,23 @@ static void llm_load_hparams(
+@@ -5403,6 +5433,24 @@ static void llm_load_hparams(
 // get general kv
 ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
@@ -105,6 +108,7 @@ index 24e1f1f0..39045ca5 100644
 + case LLM_ARCH_HYVID:
 + case LLM_ARCH_WAN:
 + case LLM_ARCH_HIDREAM:
++ case LLM_ARCH_COSMOS:
 + model.ftype = ml.ftype;
 + return;
 + default:
@@ -114,7 +118,7 @@ index 24e1f1f0..39045ca5 100644
 // get hparams kv
 ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
 
-@@ -18016,6 +18060,129 @@ static void llama_tensor_dequantize_internal(
+@@ -18016,6 +18064,132 @@ static void llama_tensor_dequantize_internal(
 workers.clear();
 }
 
@@ -149,7 +153,8 @@ index 24e1f1f0..39045ca5 100644
 + (name.find(".to_v.weight") != std::string::npos) ||
 + (name.find(".v.weight") != std::string::npos) ||
 + (name.find(".attn.w1v.weight") != std::string::npos) ||
-+ (name.find(".attn.w2v.weight") != std::string::npos)
++ (name.find(".attn.w2v.weight") != std::string::npos) ||
++ (name.find("_attn.v_proj.weight") != std::string::npos)
 + ){
 + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
 + new_type = GGML_TYPE_Q3_K;
@@ -184,7 +189,9 @@ index 24e1f1f0..39045ca5 100644
 + (name.find("ffn_down") != std::string::npos) ||
 + ((name.find("experts.") != std::string::npos) && (name.find(".w2.weight") != std::string::npos)) ||
 + (name.find(".ffn.2.weight") != std::string::npos) || // is this even the right way around?
-+ (name.find(".ff.net.2.weight") != std::string::npos)
++ (name.find(".ff.net.2.weight") != std::string::npos) ||
++ (name.find(".mlp.layer2.weight") != std::string::npos) ||
++ (name.find(".adaln_modulation_mlp.2.weight") != std::string::npos)
 + ) {
 + // TODO: add back `layer_info` with some model specific logic + logic further down
 + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -244,7 +251,7 @@ index 24e1f1f0..39045ca5 100644
 static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
 const std::string name = ggml_get_name(tensor);
 
-@@ -18513,7 +18680,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -18513,7 +18687,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 if (llama_model_has_encoder(&model)) {
 n_attn_layer *= 3;
 }
@@ -255,7 +262,7 @@ index 24e1f1f0..39045ca5 100644
 }
 
 size_t total_size_org = 0;
-@@ -18547,6 +18716,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -18547,6 +18723,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 ctx_outs[i_split] = gguf_init_empty();
 }
 gguf_add_tensor(ctx_outs[i_split], tensor);
@@ -307,7 +314,7 @@ index 24e1f1f0..39045ca5 100644
 }
 
 // Set split info if needed
-@@ -18647,6 +18861,92 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -18647,6 +18868,101 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 // do not quantize relative position bias (T5)
 quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
@@ -392,6 +399,15 @@ index 24e1f1f0..39045ca5 100644
 + quantize &= name.find(".ff_i.gate.weight") == std::string::npos;
 + quantize &= name.find("caption_projection.") == std::string::npos;
 + }
++ if (model.arch == LLM_ARCH_COSMOS) {
++ image_model = true;
++ quantize &= name.find("p_embedder.") == std::string::npos;
++ quantize &= name.find("t_embedder.") == std::string::npos;
++ quantize &= name.find("t_embedding_norm.") == std::string::npos;
++ quantize &= name.find("x_embedder.") == std::string::npos;
++ quantize &= name.find("pos_embedder.") == std::string::npos;
++ quantize &= name.find("final_layer.") == std::string::npos;
++ }
 + // ignore 3D/4D tensors for image models as the code was never meant to handle these
 + if (image_model) {
 + quantize &= ggml_n_dims(tensor) == 2;
@@ -400,7 +416,7 @@ index 24e1f1f0..39045ca5 100644
 enum ggml_type new_type;
 void * new_data;
 size_t new_size;
-@@ -18655,6 +18955,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -18655,6 +18971,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 new_type = default_type;
 
 // get more optimal quantization type based on the tensor shape, layer, etc.
@@ -410,7 +426,7 @@ index 24e1f1f0..39045ca5 100644
 if (!params->pure && ggml_is_quantized(default_type)) {
 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
 }
-@@ -18664,6 +18967,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -18664,6 +18983,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
 new_type = params->output_tensor_type;
 }
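
For orientation only, and not part of the commit itself: the COSMOS handling added to the quantizer above amounts to substring checks on GGUF tensor names (leave the embedder, norm, and final-layer tensors unquantized, and only quantize 2-D tensors). A minimal standalone sketch of that rule follows; the helper name and example tensor names are illustrative and not taken from llama.cpp.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Illustrative helper: returns true when a COSMOS tensor would be quantized
// under the rules in the hunk above (skip embedders/norm/final layer, 2-D only).
static bool cosmos_should_quantize(const std::string & name, int n_dims) {
    static const std::vector<std::string> keep_in_full_precision = {
        "p_embedder.", "t_embedder.", "t_embedding_norm.",
        "x_embedder.", "pos_embedder.", "final_layer.",
    };
    for (const auto & pattern : keep_in_full_precision) {
        if (name.find(pattern) != std::string::npos) {
            return false; // tensor is left at its original type
        }
    }
    return n_dims == 2; // image models: only 2-D tensors go through quantization
}

int main() {
    // Example names are hypothetical, for demonstration only.
    std::printf("%d\n", cosmos_should_quantize("blocks.0.self_attn.v_proj.weight", 2)); // 1
    std::printf("%d\n", cosmos_should_quantize("x_embedder.proj.weight", 2));           // 0
    std::printf("%d\n", cosmos_should_quantize("blocks.0.mlp.layer1.weight", 3));       // 0
    return 0;
}
```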