
Commit bacddc0

Tianyue-Zhao and ngxson authored
model: Add support for CogVLM model (#15002)
* Added GGUF mappings for CogVLM model
* Add tensor mapping for CogVLM visual encoder
* Add CogVLM to conversion script, no vision part yet
* Added CogVLM vision model to conversion script
* Add graph for CogVLM CLIP model
* Add graph for CogVLM
* Fixes for CogVLM. Now compiles.
* Model now runs
* Fixes for cogvlm graph
* Account for graph context change after rebase
* Changes for whitespace
* Changes in convert script according to comments
* Switch CogVLM LLM graph to merged QKV tensor (sketched below)
* Use rope_type variable instead of direct definition
* Change CogVLM CLIP encoder to use SWIGLU
* Switch CogVLM CLIP to use merged QKV
* Apply rebase edits and remove ggml_cont call that is now unnecessary
* clean up

---------

Co-authored-by: Xuan Son Nguyen <[email protected]>
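The merged-QKV bullets refer to computing attention with one fused projection instead of three separate matmuls. A minimal PyTorch sketch of the pattern with toy dimensions, not llama.cpp's actual ggml graph code:

import torch

# Fused weight whose rows stack [Wq; Wk; Wv]; shapes are illustrative.
n_embd, n_tokens = 8, 4
w_qkv = torch.randn(3 * n_embd, n_embd)
x = torch.randn(n_tokens, n_embd)

qkv = x @ w_qkv.T                    # one matmul yields Q, K and V together
q, k, v = qkv.split(n_embd, dim=-1)  # then split back into three views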
1 parent 229bf68 commit bacddc0

9 files changed (+501, −26 lines)

convert_hf_to_gguf.py

Lines changed: 32 additions & 1 deletion
@@ -1528,7 +1528,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
         self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
         self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
-        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
+        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
 
         # preprocessor config
         image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]

@@ -9493,6 +9493,37 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 
         return []  # skip other tensors
 
+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMVisionModel(MmprojModel):
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if not name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.COGVLM
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # block vision tensors
+        if name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
 ###### CONVERSION LOGIC ######
 
 
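For orientation (not part of the diff): both classes register for the same CogVLMForCausalLM architecture, and their modify_tensors filters split one checkpoint into two GGUF files by name prefix. A minimal sketch of that routing; the second tensor name below is a hypothetical stand-in:

# CogVLMVisionModel keeps only "model.vision.*" tensors (the mmproj GGUF);
# CogVLMModel drops those and keeps the rest (the LLM GGUF).
def route(name: str) -> str:
    return "mmproj" if name.startswith("model.vision.") else "llm"

print(route("model.vision.patch_embedding.proj.weight"))  # mmproj
print(route("model.layers.0.mlp.up_proj.weight"))         # llm (hypothetical name)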

gguf-py/gguf/constants.py

Lines changed: 52 additions & 0 deletions
@@ -420,6 +420,7 @@ class MODEL_ARCH(IntEnum):
     SEED_OSS = auto()
     GROVEMOE = auto()
     APERTUS = auto()
+    COGVLM = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):

@@ -430,6 +431,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
     GLM_EDGE = auto()
     MERGER = auto()
     GEMMA3 = auto()
+    COGVLM = auto()
 
 
 class MODEL_TENSOR(IntEnum):

@@ -600,6 +602,11 @@ class MODEL_TENSOR(IntEnum):
     SHORTCONV_CONV = auto()
     SHORTCONV_INPROJ = auto()
     SHORTCONV_OUTPROJ = auto()
+    VISEXP_ATTN_QKV = auto()
+    VISEXP_ATTN_OUT = auto()
+    VISEXP_GATE = auto()
+    VISEXP_DOWN = auto()
+    VISEXP_UP = auto()
     # vision
     V_MMPROJ = auto()
     V_MMPROJ_FC = auto()

@@ -609,6 +616,7 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_EMBD_PATCH = auto()
     V_ENC_EMBD_POS = auto()
     V_ENC_INPUT_NORM = auto()
+    V_ENC_ATTN_QKV = auto()
     V_ENC_ATTN_Q = auto()
     V_ENC_ATTN_Q_NORM = auto()
     V_ENC_ATTN_K = auto()

@@ -640,6 +648,12 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_QUERY = auto()  # minicpmv
     V_TOK_EMBD_IMG_BREAK = auto()  # pixtral
     V_MM_PATCH_MERGER = auto()  # mistral small 3.1
+    V_MM_POST_FC_NORM = auto()  # cogvlm
+    V_MM_UP = auto()  # cogvlm
+    V_MM_DOWN = auto()  # cogvlm
+    V_MM_GATE = auto()  # cogvlm
+    V_TOK_BOI = auto()  # cogvlm
+    V_TOK_EOI = auto()  # cogvlm
     # audio (mtmd)
     A_ENC_EMBD_POS = auto()
     A_ENC_CONV1D = auto()

@@ -766,6 +780,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.SEED_OSS: "seed_oss",
     MODEL_ARCH.GROVEMOE: "grovemoe",
     MODEL_ARCH.APERTUS: "apertus",
+    MODEL_ARCH.COGVLM: "cogvlm",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {

@@ -946,6 +961,11 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv",
     MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj",
     MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj",
+    MODEL_TENSOR.VISEXP_ATTN_QKV: "blk.{bid}.vis_attn_qkv",
+    MODEL_TENSOR.VISEXP_ATTN_OUT: "blk.{bid}.vis_attn_output",
+    MODEL_TENSOR.VISEXP_GATE: "blk.{bid}.vis_gate",
+    MODEL_TENSOR.VISEXP_DOWN: "blk.{bid}.vis_down",
+    MODEL_TENSOR.VISEXP_UP: "blk.{bid}.vis_up",
     # vision
     MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
     MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",

@@ -954,6 +974,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
+    MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv",
     MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
     MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm",
     MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",

@@ -986,6 +1007,12 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break",  # pixtral
     MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger",  # mistral small 3.1
+    MODEL_TENSOR.V_MM_POST_FC_NORM: "mm.post_fc_norm",  # cogvlm
+    MODEL_TENSOR.V_MM_UP: "mm.up",
+    MODEL_TENSOR.V_MM_DOWN: "mm.down",
+    MODEL_TENSOR.V_MM_GATE: "mm.gate",
+    MODEL_TENSOR.V_TOK_BOI: "v.boi",
+    MODEL_TENSOR.V_TOK_EOI: "v.eoi",
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",

@@ -1023,6 +1050,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
         MODEL_TENSOR.V_ENC_EMBD_POS,
         MODEL_TENSOR.V_ENC_INPUT_NORM,
+        MODEL_TENSOR.V_ENC_ATTN_QKV,
         MODEL_TENSOR.V_ENC_ATTN_Q,
         MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
         MODEL_TENSOR.V_ENC_ATTN_K,

@@ -1054,6 +1082,12 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_RESMPL_QUERY,
         MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
         MODEL_TENSOR.V_MM_PATCH_MERGER,
+        MODEL_TENSOR.V_MM_POST_FC_NORM,
+        MODEL_TENSOR.V_MM_UP,
+        MODEL_TENSOR.V_MM_DOWN,
+        MODEL_TENSOR.V_MM_GATE,
+        MODEL_TENSOR.V_TOK_BOI,
+        MODEL_TENSOR.V_TOK_EOI,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
         MODEL_TENSOR.A_ENC_CONV1D,

@@ -2837,6 +2871,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_CHEXP,
         MODEL_TENSOR.FFN_UP_CHEXP,
     ],
+    MODEL_ARCH.COGVLM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.VISEXP_ATTN_QKV,
+        MODEL_TENSOR.VISEXP_ATTN_OUT,
+        MODEL_TENSOR.VISEXP_GATE,
+        MODEL_TENSOR.VISEXP_UP,
+        MODEL_TENSOR.VISEXP_DOWN,
+    ],
     # TODO
 }

@@ -3063,6 +3114,7 @@ class VisionProjectorType:
     LFM2 = "lfm2"
     KIMIVL = "kimivl"
     LIGHTONOCR = "lightonocr"
+    COGVLM = "cogvlm"
 
 
 # Items here are (block size, type size)
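A small usage note grounded in the diff above: gguf-py resolves each MODEL_TENSOR entry to its serialized name through the TENSOR_NAMES table, so the new per-block vision-expert tensors come out under the vis_ prefix:

import gguf

# "{bid}" in the template is the block index.
template = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.VISEXP_ATTN_QKV]
print(template)                # blk.{bid}.vis_attn_qkv
print(template.format(bid=0))  # blk.0.vis_attn_qkv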
