Commit c912c67

wip llama 4 conversion
1 parent 2af6880

File tree (3 files changed: +58, −10 lines):
- convert_hf_to_gguf.py
- gguf-py/gguf/constants.py
- gguf-py/gguf/tensor_mapping.py

convert_hf_to_gguf.py

Lines changed: 26 additions & 0 deletions
@@ -2044,6 +2044,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         return super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("Llama4ForConditionalGeneration")
+class Llama4VisionModel(VisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
+        self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
+        self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
+        self.gguf_writer.add_vision_projector_scale_factor((1.0 / self.hparams["pixel_shuffle_ratio"]) // 1)
+        assert self.hparams["hidden_act"] == "gelu"
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        if "multi_modal_projector" in name or "vision_model" in name:
+            # process vision tensors
+            if "positional_embedding_vlm" in name:
+                name += ".weight"
+        return []
+
+
 @ModelBase.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
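
Note: two details in Llama4VisionModel are easy to miss. First, Llama 4's HF config reuses "intermediate_size" for the MoE expert width and stores the dense MLP width under "intermediate_size_mlp", so __init__ swaps them back before the shared hparam handling runs. Second, the projector scale factor is the floored reciprocal of "pixel_shuffle_ratio". A minimal sketch of both, with illustrative config values (the numbers are assumptions, not taken from the commit):

    # Illustrative values only; the real ones come from the model's config.json.
    hparams = {
        "intermediate_size":     8192,   # holds the MoE width in the HF config
        "intermediate_size_mlp": 16384,  # holds the dense MLP width
        "pixel_shuffle_ratio":   0.5,
    }

    # Undo the rename so downstream code sees the usual "intermediate_size":
    hparams["intermediate_size_moe"] = hparams["intermediate_size"]      # 8192
    hparams["intermediate_size"]     = hparams["intermediate_size_mlp"]  # 16384

    # Scale factor is floor(1 / ratio): 1 / 0.5 = 2.0, and "// 1" floors it.
    scale_factor = (1.0 / hparams["pixel_shuffle_ratio"]) // 1
    print(scale_factor)  # 2.0

Also note that modify_tensors only fixes up the positional-embedding name and then returns an empty list, so no vision tensors are written yet; that matches the "wip" commit message.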

gguf-py/gguf/constants.py

Lines changed: 15 additions & 8 deletions
@@ -481,15 +481,17 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_EMBD_CLS       = auto()
     V_ENC_EMBD_PATCH     = auto()
     V_ENC_EMBD_POS       = auto()
+    V_ENC_INPUT_NORM     = auto()
     V_ENC_ATTN_Q         = auto()
     V_ENC_ATTN_K         = auto()
     V_ENC_ATTN_V         = auto()
-    V_ENC_INPUT_NORM     = auto()
-    V_ENC_OUTPUT         = auto()
-    V_ENC_OUTPUT_NORM    = auto()
+    V_ENC_ATTN_O         = auto()
+    V_ENC_ATTN_O_NORM    = auto()
+    V_ENC_POST_ATTN_NORM = auto()
     V_ENC_FFN_UP         = auto()
     V_ENC_FFN_GATE       = auto()
     V_ENC_FFN_DOWN       = auto()
+    V_ENC_FFN_POST_NORM  = auto()
     V_PRE_NORM           = auto()
     V_POST_NORM          = auto()
     V_MM_INP_NORM        = auto()

@@ -742,11 +744,13 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_ATTN_K:         "v.blk.{bid}.attn_k",
     MODEL_TENSOR.V_ENC_ATTN_V:         "v.blk.{bid}.attn_v",
     MODEL_TENSOR.V_ENC_INPUT_NORM:     "v.blk.{bid}.ln1",
-    MODEL_TENSOR.V_ENC_OUTPUT:         "v.blk.{bid}.attn_out",
-    MODEL_TENSOR.V_ENC_OUTPUT_NORM:    "v.blk.{bid}.ln2",
+    MODEL_TENSOR.V_ENC_ATTN_O:         "v.blk.{bid}.attn_out",
+    MODEL_TENSOR.V_ENC_ATTN_O_NORM:    "v.blk.{bid}.attn_out_norm",
+    MODEL_TENSOR.V_ENC_POST_ATTN_NORM: "v.blk.{bid}.ln2",
     MODEL_TENSOR.V_ENC_FFN_UP:         "v.blk.{bid}.ffn_up",
     MODEL_TENSOR.V_ENC_FFN_GATE:       "v.blk.{bid}.ffn_gate",
     MODEL_TENSOR.V_ENC_FFN_DOWN:       "v.blk.{bid}.ffn_down",
+    MODEL_TENSOR.V_ENC_FFN_POST_NORM:  "v.blk.{bid}.ffn_post_norm",
     MODEL_TENSOR.V_PRE_NORM:           "v.pre_ln",
     MODEL_TENSOR.V_POST_NORM:          "v.post_ln",
     MODEL_TENSOR.V_MM_INP_PROJ:        "mm.input_projection",

@@ -776,15 +780,17 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_ENC_EMBD_CLS,
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
         MODEL_TENSOR.V_ENC_EMBD_POS,
+        MODEL_TENSOR.V_ENC_INPUT_NORM,
         MODEL_TENSOR.V_ENC_ATTN_Q,
         MODEL_TENSOR.V_ENC_ATTN_K,
         MODEL_TENSOR.V_ENC_ATTN_V,
-        MODEL_TENSOR.V_ENC_INPUT_NORM,
-        MODEL_TENSOR.V_ENC_OUTPUT,
-        MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.V_ENC_ATTN_O,
+        MODEL_TENSOR.V_ENC_ATTN_O_NORM,
+        MODEL_TENSOR.V_ENC_POST_ATTN_NORM,
         MODEL_TENSOR.V_ENC_FFN_UP,
         MODEL_TENSOR.V_ENC_FFN_GATE,
         MODEL_TENSOR.V_ENC_FFN_DOWN,
+        MODEL_TENSOR.V_ENC_FFN_POST_NORM,
         MODEL_TENSOR.V_PRE_NORM,
         MODEL_TENSOR.V_POST_NORM,
         MODEL_TENSOR.V_MM_INP_PROJ,

@@ -2162,6 +2168,7 @@ class VisionProjectorType:
     GEMMA3   = "gemma3"
     IDEFICS3 = "idefics3"
     PIXTRAL  = "pixtral"
+    LLAMA4   = "llama4"
 
 
 # Items here are (block size, type size)
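
Note: the enum changes amount to a rename plus two additions: V_ENC_OUTPUT becomes V_ENC_ATTN_O (the attention output projection) and V_ENC_OUTPUT_NORM becomes V_ENC_POST_ATTN_NORM, while V_ENC_ATTN_O_NORM and V_ENC_FFN_POST_NORM are new norm tensors used by Llama 4. Each member maps to a GGUF name template whose {bid} placeholder is the encoder block index. A minimal sketch of the expansion, with a plain dict standing in for the TENSOR_NAMES table above:

    # Stand-in for the TENSOR_NAMES entries above (keys abbreviated to strings).
    tensor_names = {
        "V_ENC_ATTN_O":         "v.blk.{bid}.attn_out",
        "V_ENC_ATTN_O_NORM":    "v.blk.{bid}.attn_out_norm",
        "V_ENC_POST_ATTN_NORM": "v.blk.{bid}.ln2",
        "V_ENC_FFN_POST_NORM":  "v.blk.{bid}.ffn_post_norm",
    }

    # Concrete per-block tensor names for the first two encoder blocks:
    for bid in range(2):
        for key, template in tensor_names.items():
            print(key, "->", template.format(bid=bid))
    # V_ENC_ATTN_O -> v.blk.0.attn_out
    # V_ENC_ATTN_O_NORM -> v.blk.0.attn_out_norm
    # ...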

gguf-py/gguf/tensor_mapping.py

Lines changed: 17 additions & 2 deletions
@@ -900,10 +900,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj", # SmolVLM
+            "multi_modal_projector.linear_1", # llama 4
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (
             "model.mm_projector.mlp.mlp.{bid}",
+            "vision_model.vision_adapter.mlp.fc{bid}.weight", # llama 4
         ),
 
         MODEL_TENSOR.V_MMPROJ_PEG: (

@@ -912,39 +914,45 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_EMBD_CLS: (
             "vision_tower.vision_model.embeddings.class_embedding",
+            "vision_model.class_embedding", # llama 4
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_PATCH: (
             "vision_tower.vision_model.embeddings.patch_embedding",
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
             "vision_tower.patch_conv", # pixtral
+            "vision_model.patch_embedding.linear", # llama 4
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_POS: (
             "vision_tower.vision_model.embeddings.position_embedding",
             "vpm.embeddings.position_embedding",
             "model.vision_model.embeddings.position_embedding", # SmolVLM
+            "vision_model.positional_embedding_vlm", # llama 4
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_V: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
         ),
 
@@ -953,19 +961,22 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
+            "vision_model.model.layers.{bid}.input_layernorm", # llama4
         ),
 
-        MODEL_TENSOR.V_ENC_OUTPUT: (
+        MODEL_TENSOR.V_ENC_ATTN_O: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+            "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
         ),
 
-        MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
+        MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
+            "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
             "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
         ),
 
@@ -974,6 +985,7 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped)
             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
+            "vision_model.model.layers.{bid}.mlp.fc1", # llama4
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (

@@ -985,16 +997,19 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped)
             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
+            "vision_model.model.layers.{bid}.mlp.fc2", # llama4
         ),
 
         MODEL_TENSOR.V_PRE_NORM: (
             "vision_tower.vision_model.pre_layrnorm",
             "vision_tower.ln_pre", # pixtral
+            "vision_model.layernorm_pre", # llama4
         ),
 
         MODEL_TENSOR.V_POST_NORM: (
             "vision_tower.vision_model.post_layernorm",
             "model.vision_model.post_layernorm", # SmolVLM
+            "vision_model.layernorm_post", # llama4
         ),
 
         MODEL_TENSOR.V_MM_INP_PROJ: (
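
Note: each tuple above lists the per-architecture HF checkpoint names that should all land on one GGUF tensor. A simplified sketch of the lookup direction (not the actual TensorNameMap implementation in gguf-py), showing how a Llama 4 vision tensor name would resolve:

    import re

    # One entry from the table above; {bid} marks the block index.
    mappings = {
        "vision_model.model.layers.{bid}.self_attn.o_proj": "v.blk.{bid}.attn_out",  # llama4
    }

    def resolve(hf_name):
        # Turn the HF-side template into a regex and capture the block index.
        for src, dst in mappings.items():
            pattern = re.escape(src).replace(re.escape("{bid}"), r"(\d+)")
            m = re.fullmatch(pattern, hf_name)
            if m:
                return dst.format(bid=int(m.group(1)))
        return None  # unmapped names are left for the caller to handle

    print(resolve("vision_model.model.layers.3.self_attn.o_proj"))  # v.blk.3.attn_out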
