Commit 5c19d77

Add CogVLM to conversion script
1 parent c4cf462 commit 5c19d77

File tree

convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/tensor_mapping.py
src/llama-arch.cpp

4 files changed: +182 -30 lines

convert_hf_to_gguf.py

Lines changed: 53 additions & 3 deletions
@@ -577,9 +577,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        # DEBIAN_FRONTEND=noninteractive means that the script is running in a non-interactive environment (i.e. CI), so we cannot answer Y/N when it asks for user input
-        is_cli_non_interactive = os.environ.get("DEBIAN_FRONTEND", "") == "noninteractive"
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=is_cli_non_interactive)
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size

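Note: this drops the earlier CI-only workaround, so trust_remote_code is no longer passed to AutoTokenizer.from_pretrained and falls back to its transformers default.
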
@@ -5186,6 +5184,58 @@ def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
         data_torch = data_torch.repeat_interleave(n_heads, 0)
         return data_torch
 
+@Model.register("CogAgentForCausalLM")
+class CogVLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.COGVLM
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.ftype = gguf.LlamaFileType.ALL_F32
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip boi and eoi tensors for now
+        if name.endswith("boi"):
+            return []
+        if name.endswith("eoi"):
+            return []
+        if name.startswith("model.vision"):
+            return []
+        if name.startswith("model.cross_vision"):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
+        vocab_size = len(tokenizer.vocab.items())
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
 
 ###### CONVERSION LOGIC ######
 
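For reference, the gap-filling loop in set_vocab above keeps the GGUF token list dense: any id missing from the tokenizer vocab becomes a [PADi] placeholder of type UNUSED. A minimal standalone sketch of the same idea, using a hypothetical toy vocab with no transformers dependency:

# Toy vocab with a hole at id 2 (hypothetical data, for illustration only)
vocab = {"hello": 0, "world": 1, "<s>": 3}
vocab_size = 4

reverse_vocab = {id_: tok for tok, id_ in vocab.items()}
# Missing ids are filled with [PADi] placeholders so indices stay contiguous
tokens = [reverse_vocab.get(i, f"[PAD{i}]") for i in range(vocab_size)]
assert tokens == ["hello", "world", "[PAD2]", "<s>"]
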
gguf-py/gguf/constants.py

Lines changed: 49 additions & 0 deletions
@@ -308,6 +308,7 @@ class MODEL_ARCH(IntEnum):
     GRANITE_MOE      = auto()
     CHAMELEON        = auto()
     WAVTOKENIZER_DEC = auto()
+    COGVLM           = auto()
     # vision models
     VISION_LLAVA     = auto()
     VISION_MOBILEVLM = auto()
@@ -441,6 +442,19 @@ class MODEL_TENSOR(IntEnum):
     POSNET_ATTN_K    = auto()
     POSNET_ATTN_V    = auto()
     POSNET_ATTN_OUT  = auto()
+    ATTN_TXT_QKV     = auto()
+    ATTN_IMG_QKV     = auto()
+    ATTN_TXT_DENSE   = auto()
+    ATTN_IMG_DENSE   = auto()
+    CROSS_ATTN_Q     = auto()
+    CROSS_ATTN_KV    = auto()
+    CROSS_ATTN_DENSE = auto()
+    FFN_TXT_UP       = auto()
+    FFN_TXT_GATE     = auto()
+    FFN_TXT_DOWN     = auto()
+    FFN_IMG_UP       = auto()
+    FFN_IMG_GATE     = auto()
+    FFN_IMG_DOWN     = auto()
     # vision
     V_MMPROJ         = auto()
     V_MMPROJ_FC      = auto()
@@ -533,6 +547,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GRANITE:          "granite",
     MODEL_ARCH.GRANITE_MOE:      "granitemoe",
     MODEL_ARCH.CHAMELEON:        "chameleon",
+    MODEL_ARCH.COGVLM:           "cogvlm",
     MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
     # vision
     MODEL_ARCH.VISION_LLAVA:     "llava",
@@ -666,6 +681,19 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.POSNET_ATTN_K:    "posnet.{bid}.attn_k",
     MODEL_TENSOR.POSNET_ATTN_V:    "posnet.{bid}.attn_v",
     MODEL_TENSOR.POSNET_ATTN_OUT:  "posnet.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_TXT_QKV:     "blk.{bid}.attn_txt_qkv",
+    MODEL_TENSOR.ATTN_IMG_QKV:     "blk.{bid}.attn_img_qkv",
+    MODEL_TENSOR.ATTN_TXT_DENSE:   "blk.{bid}.attn_txt_dense",
+    MODEL_TENSOR.ATTN_IMG_DENSE:   "blk.{bid}.attn_img_dense",
+    MODEL_TENSOR.CROSS_ATTN_Q:     "blk.{bid}.cross_attn_q",
+    MODEL_TENSOR.CROSS_ATTN_KV:    "blk.{bid}.cross_attn_kv",
+    MODEL_TENSOR.CROSS_ATTN_DENSE: "blk.{bid}.cross_attn_dense",
+    MODEL_TENSOR.FFN_TXT_UP:       "blk.{bid}.ffn_txt_up",
+    MODEL_TENSOR.FFN_TXT_GATE:     "blk.{bid}.ffn_txt_gate",
+    MODEL_TENSOR.FFN_TXT_DOWN:     "blk.{bid}.ffn_txt_down",
+    MODEL_TENSOR.FFN_IMG_UP:       "blk.{bid}.ffn_img_up",
+    MODEL_TENSOR.FFN_IMG_GATE:     "blk.{bid}.ffn_img_gate",
+    MODEL_TENSOR.FFN_IMG_DOWN:     "blk.{bid}.ffn_img_down",
     # vision
     MODEL_TENSOR.V_MMPROJ:         "v.mmproj_{bid}",
     MODEL_TENSOR.V_MMPROJ_FC:      "v.mmproj.fc",
@@ -1621,6 +1649,27 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.COGVLM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_TXT_QKV,
+        MODEL_TENSOR.ATTN_IMG_QKV,
+        MODEL_TENSOR.ATTN_TXT_DENSE,
+        MODEL_TENSOR.ATTN_IMG_DENSE,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.CROSS_ATTN_Q,
+        MODEL_TENSOR.CROSS_ATTN_KV,
+        MODEL_TENSOR.CROSS_ATTN_DENSE,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_TXT_UP,
+        MODEL_TENSOR.FFN_TXT_GATE,
+        MODEL_TENSOR.FFN_TXT_DOWN,
+        MODEL_TENSOR.FFN_IMG_UP,
+        MODEL_TENSOR.FFN_IMG_GATE,
+        MODEL_TENSOR.FFN_IMG_DOWN,
+    ],
     MODEL_ARCH.WAVTOKENIZER_DEC: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
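
The new TENSOR_NAMES entries are {bid} templates that gguf-py expands once per transformer block. A quick sketch of the expansion, assuming the standard gguf-py layout where TENSOR_NAMES and the enums are re-exported at package level:

import gguf

# Each template carries a {bid} placeholder that is filled in per block
name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_IMG_QKV].format(bid=0)
print(name)  # -> blk.0.attn_img_qkv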

gguf-py/gguf/tensor_mapping.py

Lines changed: 61 additions & 8 deletions
@@ -28,6 +28,7 @@ class TensorNameMap:
             "transformer.token_embeddings", # openelm
             "shared",                       # t5
             "rwkv.embeddings",              # rwkv
+            "model.embed_tokens",           # cogvlm
         ),
 
         # Token type embeddings
@@ -55,7 +56,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe cogvlm
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -68,7 +69,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm", # gptneox
             "transformer.ln_f",          # gpt2 gpt-j falcon jais exaone
-            "model.norm",                # llama-hf baichuan internlm2 olmoe olmo2 phimoe
+            "model.norm",                # llama-hf baichuan internlm2 olmoe olmo2 phimoe nemotron cogvlm
             "norm",                      # llama-pth
             "transformer.norm_f",        # mpt dbrx
             "ln_f",                      # refact bloom qwen gpt2
@@ -80,7 +81,6 @@ class TensorNameMap:
             "transformer.rms_norm",      # Grok
             "encoder.final_layernorm",   # chatglm
             "transformer.norm",          # openelm
-            "model.norm",                # nemotron
             "rwkv.ln_out",               # rwkv
             "backbone.final_layer_norm", # wavtokenizer
         ),
@@ -108,7 +108,7 @@ class TensorNameMap:
             "transformer.h.{bid}.input_layernorm", # falcon7b
             "h.{bid}.input_layernorm",             # bloom
             "transformer.h.{bid}.ln_mlp",          # falcon40b
-            "model.layers.{bid}.input_layernorm",  # llama-hf nemotron olmoe phimoe
+            "model.layers.{bid}.input_layernorm",  # llama-hf nemotron olmoe phimoe cogvlm
             "layers.{bid}.attention_norm",         # llama-pth
             "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
             "model.layers.{bid}.ln1",              # yi
@@ -127,9 +127,10 @@ class TensorNameMap:
 
         # Attention norm 2
        MODEL_TENSOR.ATTN_NORM_2: (
-            "transformer.h.{bid}.ln_attn",                  # falcon40b
-            "encoder.layer.{bid}.layer_norm_1",             # jina-v2-code
-            "rwkv.blocks.{bid}.ln2",                        # rwkv
+            "transformer.h.{bid}.ln_attn",                        # falcon40b
+            "encoder.layer.{bid}.layer_norm_1",                   # jina-v2-code
+            "rwkv.blocks.{bid}.ln2",                              # rwkv
+            "model.layers.{bid}.post_cross_attention_layernorm",  # cogvlm
         ),
 
         # Attention query-key-value
@@ -242,7 +243,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln_2",                      # gpt2 refact qwen jais exaone
             "h.{bid}.post_attention_layernorm",              # bloom
             "transformer.blocks.{bid}.norm_2",               # mpt
-            "model.layers.{bid}.post_attention_layernorm",   # llama-hf nemotron olmoe phimoe
+            "model.layers.{bid}.post_attention_layernorm",   # llama-hf nemotron olmoe phimoe cogvlm
             "layers.{bid}.ffn_norm",                         # llama-pth
             "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
             "model.layers.{bid}.ln2",                        # yi
@@ -788,6 +789,58 @@ class TensorNameMap:
             "backbone.posnet.{bid}.proj_out", # wavtokenizer
         ),
 
+        MODEL_TENSOR.ATTN_TXT_QKV: (
+            "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm
+        ),
+
+        MODEL_TENSOR.ATTN_IMG_QKV: (
+            "model.layers.{bid}.self_attn.vision_expert_query_key_value", # cogvlm
+        ),
+
+        MODEL_TENSOR.ATTN_TXT_DENSE: (
+            "model.layers.{bid}.self_attn.language_expert_dense", # cogvlm
+        ),
+
+        MODEL_TENSOR.ATTN_IMG_DENSE: (
+            "model.layers.{bid}.self_attn.vision_expert_dense", # cogvlm
+        ),
+
+        MODEL_TENSOR.CROSS_ATTN_Q: (
+            "model.layers.{bid}.cross_attn.query", # cogvlm
+        ),
+
+        MODEL_TENSOR.CROSS_ATTN_KV: (
+            "model.layers.{bid}.cross_attn.key_value", # cogvlm
+        ),
+
+        MODEL_TENSOR.CROSS_ATTN_DENSE: (
+            "model.layers.{bid}.cross_attn.dense", # cogvlm
+        ),
+
+        MODEL_TENSOR.FFN_TXT_UP: (
+            "model.layers.{bid}.mlp.language_mlp.up_proj", # cogvlm
+        ),
+
+        MODEL_TENSOR.FFN_TXT_GATE: (
+            "model.layers.{bid}.mlp.language_mlp.gate_proj", # cogvlm
+        ),
+
+        MODEL_TENSOR.FFN_TXT_DOWN: (
+            "model.layers.{bid}.mlp.language_mlp.down_proj", # cogvlm
+        ),
+
+        MODEL_TENSOR.FFN_IMG_UP: (
+            "model.layers.{bid}.mlp.vision_mlp.up_proj", # cogvlm
+        ),
+
+        MODEL_TENSOR.FFN_IMG_GATE: (
+            "model.layers.{bid}.mlp.vision_mlp.gate_proj", # cogvlm
+        ),
+
+        MODEL_TENSOR.FFN_IMG_DOWN: (
+            "model.layers.{bid}.mlp.vision_mlp.down_proj", # cogvlm
+        ),
+
         #############################################################################
 
         MODEL_TENSOR.V_MMPROJ: (
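
With these mappings in place, the converter's map_tensor_name can translate Hugging Face checkpoint names into the GGUF names defined in constants.py. A quick sketch of the lookup, where the block count of 32 is an assumption for illustration:

import gguf

# Build the HF-name -> GGUF-name map for CogVLM (32 blocks assumed)
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.COGVLM, 32)
name = tmap.get_name("model.layers.0.mlp.vision_mlp.up_proj.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # -> blk.0.ffn_img_up.weight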

src/llama-arch.cpp

Lines changed: 19 additions & 19 deletions
@@ -1302,25 +1302,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_COGVLM,
         {
-            { LLM_TENSOR_TOKEN_EMBD,       "embed_tokens" },
-            { LLM_TENSOR_OUTPUT_NORM,      "norm" },
-            { LLM_TENSOR_OUTPUT,           "lm_head" },
-            { LLM_TENSOR_ATTN_NORM,        "layers.%d.input_layernorm" },                          // input_norm_w
-            { LLM_TENSOR_ATTN_TXT_QKV,     "layers.%d.self_attn.language_expert_query_key_value" }, // language_qkv_w
-            { LLM_TENSOR_ATTN_IMG_QKV,     "layers.%d.self_attn.vision_expert_query_key_value" },  // vision_qkv_w
-            { LLM_TENSOR_ATTN_TXT_DENSE,   "layers.%d.self_attn.language_expert_dense" },          // language_dense_w
-            { LLM_TENSOR_ATTN_IMG_DENSE,   "layers.%d.self_attn.vision_expert_dense" },            // vision_dense_w
-            { LLM_TENSOR_ATTN_NORM_2,      "layers.%d.post_cross_attention_layernorm" },           // self_attn_norm_w
-            { LLM_TENSOR_CROSS_ATTN_Q,     "layers.%d.cross_attn.query" },                         // cross_query_w
-            { LLM_TENSOR_CROSS_ATTN_KV,    "layers.%d.cross_attn.key_value" },                     // cross_query_kv
-            { LLM_TENSOR_CROSS_ATTN_DENSE, "layers.%d.cross_attn.dense" },                         // cross_dense_w
-            { LLM_TENSOR_FFN_NORM,         "layers.%d.post_attention_layernorm" },                 // attn_norm_w
-            { LLM_TENSOR_FFN_TXT_UP,       "layers.%d.mlp.language_mlp.up_proj" },                 // language_up_proj_w
-            { LLM_TENSOR_FFN_TXT_GATE,     "layers.%d.mlp.language_mlp.gate_proj" },               // language_gate_proj_w
-            { LLM_TENSOR_FFN_TXT_DOWN,     "layers.%d.mlp.language_mlp.down_proj" },               // language_down_proj_w
-            { LLM_TENSOR_FFN_IMG_UP,       "layers.%d.mlp.vision_mlp.up_proj" },                   // vision_up_proj_w
-            { LLM_TENSOR_FFN_IMG_GATE,     "layers.%d.mlp.vision_mlp.gate_proj" },                 // vision_gate_proj_w
-            { LLM_TENSOR_FFN_IMG_DOWN,     "layers.%d.mlp.vision_mlp.down_proj" }                  // vision_down_proj_w
+            { LLM_TENSOR_TOKEN_EMBD,       "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,      "output_norm" },
+            { LLM_TENSOR_OUTPUT,           "output" },
+            { LLM_TENSOR_ATTN_NORM,        "blk.%d.attn_norm" },        // input_norm_w
+            { LLM_TENSOR_ATTN_TXT_QKV,     "blk.%d.attn_txt_qkv" },     // language_qkv_w
+            { LLM_TENSOR_ATTN_IMG_QKV,     "blk.%d.attn_img_qkv" },     // vision_qkv_w
+            { LLM_TENSOR_ATTN_TXT_DENSE,   "blk.%d.attn_txt_dense" },   // language_dense_w
+            { LLM_TENSOR_ATTN_IMG_DENSE,   "blk.%d.attn_img_dense" },   // vision_dense_w
+            { LLM_TENSOR_ATTN_NORM_2,      "blk.%d.attn_norm_2" },      // self_attn_norm_w
+            { LLM_TENSOR_CROSS_ATTN_Q,     "blk.%d.cross_attn_q" },     // cross_query_w
+            { LLM_TENSOR_CROSS_ATTN_KV,    "blk.%d.cross_attn_kv" },    // cross_query_kv
+            { LLM_TENSOR_CROSS_ATTN_DENSE, "blk.%d.cross_attn_dense" }, // cross_dense_w
+            { LLM_TENSOR_FFN_NORM,         "blk.%d.ffn_norm" },         // attn_norm_w
+            { LLM_TENSOR_FFN_TXT_UP,       "blk.%d.ffn_txt_up" },       // language_up_proj_w
+            { LLM_TENSOR_FFN_TXT_GATE,     "blk.%d.ffn_txt_gate" },     // language_gate_proj_w
+            { LLM_TENSOR_FFN_TXT_DOWN,     "blk.%d.ffn_txt_down" },     // language_down_proj_w
+            { LLM_TENSOR_FFN_IMG_UP,       "blk.%d.ffn_img_up" },       // vision_up_proj_w
+            { LLM_TENSOR_FFN_IMG_GATE,     "blk.%d.ffn_img_gate" },     // vision_gate_proj_w
+            { LLM_TENSOR_FFN_IMG_DOWN,     "blk.%d.ffn_img_down" }      // vision_down_proj_w
         },
     },
     {

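This table is what the C++ loader matches against tensor names stored in the GGUF file, so the entries must use the canonical blk.%d.* names emitted by gguf-py (see constants.py above) rather than the original Hugging Face checkpoint names; with the old HF-style entries, the loader would never find the converted tensors.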