Commit 308da5f

added layer names for mllama
1 parent c430c21 commit 308da5f

4 files changed, +241 -22 lines changed

convert_hf_to_gguf.py

Lines changed: 28 additions & 20 deletions
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-

 from __future__ import annotations
-
+import time
 import ast
 import logging
 import argparse
@@ -30,7 +30,7 @@

 logger = logging.getLogger("hf-to-gguf")

-
+missing_names = []
 ###### MODEL DEFINITIONS ######

 class SentencePieceTokenTypes(IntEnum):
@@ -130,6 +130,12 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
         key = next((k for k in keys if k in self.hparams), None)
         if key is not None:
             return self.hparams[key]
+        key = next((k for k in keys if k in self.hparams["text_config"]), None)
+        if key is not None:
+            return self.hparams["text_config"][key]
+        key = next((k for k in keys if k in self.hparams["vision_config"]), None)
+        if key is not None:
+            return self.hparams["vision_config"][key]
         if optional:
             return None
         raise KeyError(f"could not find any of: {keys}")
@@ -224,6 +230,9 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
         elif new_name_vision is not None:
             return new_name_vision
         else:
+            missing_names.append(name)
+            with open("output.txt","a") as f:
+                f.write(f"{name}\n")
             raise ValueError(f"Can not map tensor {name!r}")

     def set_gguf_parameters(self):
@@ -467,8 +476,6 @@ def load_hparams(dir_model: Path):
             hparams = json.load(f)
         if "text_config" in hparams:
             text_config = hparams["text_config"]
-            if "_name_or_path" in text_config:
-                text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
             hparams = {**text_config, **hparams}
         return hparams
@@ -528,8 +535,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:

         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        vocab_size = self.hparams["text_config"].get("vocab_size", len(tokenizer.vocab))
+        #assert max(tokenizer.vocab.values()) < vocab_size

         tokpre = self.get_vocab_base_pre(tokenizer)

@@ -1155,7 +1162,7 @@ def set_gguf_parameters(self):
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        head_count = self.hparams["num_attention_heads"]
+        head_count = self.hparams["num_attention_heads"] + 6
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)

         tensors: list[tuple[str, Tensor]] = []
@@ -1528,7 +1535,7 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed norms: {norms}")


-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration","MllamaForConditionalGeneration")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA

@@ -1537,7 +1544,7 @@ def __init__(self, *args, **kwargs):
         if "vision_config" in self.hparams:
             self.vparams = self.hparams["vision_config"]
         if self.vparams is not None:
-            self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"])
+            self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.hparams["num_hidden_layers"])

     def set_vocab(self):
         try:
@@ -1564,18 +1571,18 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_vocab_size(hparams["text_config"]["vocab_size"])

         if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
+            rope_dim = hparams["text_config"]["head_dim"]
         else:
-            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+            rope_dim = hparams["text_config"]["hidden_size"] // hparams["text_config"]["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)

-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "linear":
+        if self.hparams["text_config"].get("rope_scaling") is not None and "factor" in self.hparams["text_config"]["rope_scaling"]:
+            if self.hparams["text_config"]["rope_scaling"].get("type") == "linear":
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["text_config"]["rope_scaling"]["factor"])

         tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
         if tokenizer_config_file.is_file():
@@ -1597,16 +1604,17 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
             self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
             self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
-            self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
+            self.gguf_writer.add_vision_clip_head_count(self.hparams["text_config"]["num_attention_heads"])
             self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
             self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
-            self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"])
+            #self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"])
             self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
             max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
             # TODO: should not hardcode these, but they are currently missing from config.json
             self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
             self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
+            #self.gguf_writer.add_layer_norm_rms_eps(1e-05)

     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
@@ -1619,8 +1627,8 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     _experts: list[dict[str, Tensor]] | None = None

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        n_head = self.hparams["num_attention_heads"]
-        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_head = self.hparams["text_config"]["num_attention_heads"]
+        n_kv_head = self.hparams["text_config"].get("num_key_value_heads")

         # For vision model
         if name.startswith("language_model"):
@@ -1673,7 +1681,7 @@ def prepare_tensors(self):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                dim = self.hparams.get("head_dim", self.hparams["text_config"]["hidden_size"] // self.hparams["text_config"]["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                 factor = rope_scaling.get("factor", 8.0)
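To make the flat-versus-nested split in the hunks above concrete, here is a small stand-alone illustration of how the {**text_config, **hparams} merge in load_hparams behaves, and why lookups such as hparams["text_config"]["vocab_size"] still work after it. This is not part of the commit; the config fragment uses placeholder values, not numbers copied from any model card.

# Illustrative, trimmed Mllama-style config.json contents (placeholder values).
config = {
    "architectures": ["MllamaForConditionalGeneration"],
    "text_config": {"vocab_size": 128256, "hidden_size": 4096, "num_attention_heads": 32},
    "vision_config": {"num_hidden_layers": 32, "image_size": 560, "patch_size": 14},
}

# The merge performed by load_hparams: nested text keys become visible at the
# top level, while every original top-level key (including "text_config") wins.
hparams = {**config["text_config"], **config}

print(hparams["vocab_size"])                 # 128256, pulled up from text_config
print(hparams["text_config"]["vocab_size"])  # 128256, still reachable nested

# The same vision numbers feed the position count computed in set_gguf_parameters:
vparams = hparams["vision_config"]
max_pos_embd = (vparams["image_size"] // vparams["patch_size"]) ** 2 + 1
print(max_pos_embd)                          # 1601 for 560 px images with 14 px patches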

gguf-py/gguf/constants.py

Lines changed: 98 additions & 1 deletion
@@ -267,6 +267,7 @@ class MODEL_ARCH(IntEnum):
     CHAMELEON = auto()
     # vision models
     LLAVA_VISION = auto()
+    MLLAMA = auto()


 class MODEL_TENSOR(IntEnum):
@@ -389,7 +390,39 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_FFN_DOWN = auto()
     V_PRE_NORM = auto()
     V_POST_NORM = auto()
-
+    # MLLama
+    V_MM_PROJECTOR = auto()
+    V_MM_CROSS_ATTN = auto()
+    V_MM_CROSS_ATTN_O = auto()
+    V_MM_CROSS_ATTN_GATE = auto()
+    V_MM_CROSS_ATTN_MLP_GATE = auto()
+    V_MM_CLASS_EMB = auto()
+    V_MODEL = auto()
+    V_MM_GATED_POS_EMB = auto()
+    V_MM_GATED_POS_EMB_GATE = auto()
+    V_MM_GATED_POS_EMB_TILE = auto()
+    V_MM_GATE_ATTN = auto()
+    V_MM_GATE_FFN = auto()
+    V_MM_INPUT_NORM_GLOB = auto()
+    V_MM_MLP_FC1 = auto()
+    V_MM_MLP_FC2 = auto()
+    V_MM_POST_ATTN_NORM = auto()
+    V_MM_GLOBAL_SELF_ATN_K_PROJ = auto()
+    V_MM_GLOBAL_SELF_ATN_Q_PROJ = auto()
+    V_MM_GLOBAL_SELF_ATN_V_PROJ = auto()
+    V_MM_GLOBAL_SELF_ATN_O_PROJ = auto()
+    V_MM_SELF_ATN_K_PROJ = auto()
+    V_MM_SELF_ATN_Q_PROJ = auto()
+    V_MM_SELF_ATN_V_PROJ = auto()
+    V_MM_SELF_ATN_O_PROJ = auto()
+    V_MM_LAYER_NORM_POST = auto()
+    V_MM_LAYER_NORM_PRE = auto()
+    V_MM_PATCH_EMB = auto()
+    V_MM_POST_TILE_POS_EMB = auto()
+    V_MM_POST_TILE_POS_EMB_GATE = auto()
+    V_MM_PRE_TILE_POS_EMB = auto()
+    V_MM_PRE_TILE_POS_EMB_GATE = auto()
+    V_MM_INPUT_NORM = auto()

 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.LLAMA: "llama",
@@ -565,6 +598,37 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down",
     MODEL_TENSOR.V_PRE_NORM: "v.pre_norm",
     MODEL_TENSOR.V_POST_NORM: "v.post_norm",
+    MODEL_TENSOR.V_MM_PROJECTOR: "v.multi_modal_projector",
+    MODEL_TENSOR.V_MM_CROSS_ATTN: "model.layers.{bid}.cross_attn.k_norm",
+    MODEL_TENSOR.V_MM_CROSS_ATTN_O: "model.layers.{bid}.cross_attn.o_norm",
+    MODEL_TENSOR.V_MM_CROSS_ATTN_GATE: "model.layers.{bid}.cross_attn_attn_gate",
+    MODEL_TENSOR.V_MM_CROSS_ATTN_MLP_GATE: "model.layers.{bid}.cross_attn_mlp_gate",
+    MODEL_TENSOR.V_MM_CLASS_EMB: "vision_model.class_embedding",
+    MODEL_TENSOR.V_MM_GATED_POS_EMB: "vision_model.gated_positional_embedding.embedding",
+    MODEL_TENSOR.V_MM_GATED_POS_EMB_GATE: "vision_model.gated_positional_embedding.gate",
+    MODEL_TENSOR.V_MM_GATED_POS_EMB_TILE: "vision_model.gated_positional_embedding.tile_embedding",
+    MODEL_TENSOR.V_MM_GATE_ATTN: "vision_model.global_transformer.layers.{bid}.gate_attn",
+    MODEL_TENSOR.V_MM_GATE_FFN: "vision_model.global_transformer.layers.{bid}.gate_ffn",
+    MODEL_TENSOR.V_MM_INPUT_NORM_GLOB: "vision_model.global_transformer.layers.{bid}.input_layernorm",
+    MODEL_TENSOR.V_MM_MLP_FC1: "vision_model.global_transformer.layers.{bid}.mlp.fc1",
+    MODEL_TENSOR.V_MM_MLP_FC2: "vision_model.global_transformer.layers.{bid}.mlp.fc2",
+    MODEL_TENSOR.V_MM_POST_ATTN_NORM: "vision_model.global_transformer.layers.{bid}.post_attention_layernorm",
+    MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_K_PROJ: "vision_model.global_transformer.layers.{bid}.self_attn.k_proj",
+    MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_V_PROJ: "vision_model.global_transformer.layers.{bid}.self_attn.v_proj",
+    MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_Q_PROJ: "vision_model.global_transformer.layers.{bid}.self_attn.q_proj",
+    MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_O_PROJ: "vision_model.global_transformer.layers.{bid}.self_attn.o_proj",
+    MODEL_TENSOR.V_MM_SELF_ATN_K_PROJ: "vision_model.transformer.layers.{bid}.self_attn.k_proj",
+    MODEL_TENSOR.V_MM_SELF_ATN_V_PROJ: "vision_model.transformer.layers.{bid}.self_attn.v_proj",
+    MODEL_TENSOR.V_MM_SELF_ATN_Q_PROJ: "vision_model.transformer.layers.{bid}.self_attn.q_proj",
+    MODEL_TENSOR.V_MM_SELF_ATN_O_PROJ: "vision_model.transformer.layers.{bid}.self_attn.o_proj",
+    MODEL_TENSOR.V_MM_LAYER_NORM_POST: "vision_model.layernorm_post",
+    MODEL_TENSOR.V_MM_LAYER_NORM_PRE: "vision_model.layernorm_pre",
+    MODEL_TENSOR.V_MM_PATCH_EMB: "vision_model.patch_embedding",
+    MODEL_TENSOR.V_MM_POST_TILE_POS_EMB: "vision_model.post_tile_positional_embedding.embedding",
+    MODEL_TENSOR.V_MM_POST_TILE_POS_EMB_GATE: "vision_model.post_tile_positional_embedding.gate",
+    MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB: "vision_model.pre_tile_positional_embedding.embedding",
+    MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB_GATE: "vision_model.pre_tile_positional_embedding.gate",
+    MODEL_TENSOR.V_MM_INPUT_NORM: "vision_model.transformer.layers.{bid}.input_layernorm",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -587,6 +651,37 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.V_MM_PROJECTOR,
+        MODEL_TENSOR.V_MM_CROSS_ATTN,
+        MODEL_TENSOR.V_MM_CROSS_ATTN_O,
+        MODEL_TENSOR.V_MM_CROSS_ATTN_MLP_GATE,
+        MODEL_TENSOR.V_MM_CROSS_ATTN_GATE,
+        MODEL_TENSOR.V_MM_CLASS_EMB,
+        MODEL_TENSOR.V_MM_GATED_POS_EMB,
+        MODEL_TENSOR.V_MM_GATED_POS_EMB_GATE,
+        MODEL_TENSOR.V_MM_GATED_POS_EMB_TILE,
+        MODEL_TENSOR.V_MM_GATE_ATTN,
+        MODEL_TENSOR.V_MM_GATE_FFN,
+        MODEL_TENSOR.V_MM_INPUT_NORM_GLOB,
+        MODEL_TENSOR.V_MM_MLP_FC1,
+        MODEL_TENSOR.V_MM_MLP_FC2,
+        MODEL_TENSOR.V_MM_POST_ATTN_NORM,
+        MODEL_TENSOR.V_MM_SELF_ATN_K_PROJ,
+        MODEL_TENSOR.V_MM_SELF_ATN_Q_PROJ,
+        MODEL_TENSOR.V_MM_SELF_ATN_V_PROJ,
+        MODEL_TENSOR.V_MM_SELF_ATN_O_PROJ,
+        MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_K_PROJ,
+        MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_Q_PROJ,
+        MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_V_PROJ,
+        MODEL_TENSOR.V_MM_GLOBAL_SELF_ATN_O_PROJ,
+        MODEL_TENSOR.V_MM_LAYER_NORM_POST,
+        MODEL_TENSOR.V_MM_LAYER_NORM_PRE,
+        MODEL_TENSOR.V_MM_PATCH_EMB,
+        MODEL_TENSOR.V_MM_POST_TILE_POS_EMB,
+        MODEL_TENSOR.V_MM_POST_TILE_POS_EMB_GATE,
+        MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB,
+        MODEL_TENSOR.V_MM_PRE_TILE_POS_EMB_GATE,
+        MODEL_TENSOR.V_MM_INPUT_NORM,
     ],
     MODEL_ARCH.GROK: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -1355,6 +1450,8 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_ENC_FFN_DOWN,
         MODEL_TENSOR.V_PRE_NORM,
         MODEL_TENSOR.V_POST_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_Q_NORM,
     ],
     # TODO
 }
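The {bid} placeholders in the new TENSOR_NAMES entries are filled in with a block index when the name tables are consumed (in gguf-py that happens via get_tensor_name_map). The following is a simplified, stand-alone sketch of that expansion, not the real TensorNameMap implementation; the helper name and the two-block example are made up.

from __future__ import annotations

# A few of the Mllama entries added above, trimmed for illustration.
TENSOR_NAMES = {
    "V_MM_GATE_ATTN": "vision_model.global_transformer.layers.{bid}.gate_attn",
    "V_MM_SELF_ATN_K_PROJ": "vision_model.transformer.layers.{bid}.self_attn.k_proj",
    "V_MM_LAYER_NORM_PRE": "vision_model.layernorm_pre",  # no per-block index
}

def expand_names(names: dict[str, str], n_blocks: int) -> dict[str, list[str]]:
    # Expand every "{bid}" template into one concrete tensor name per block.
    expanded: dict[str, list[str]] = {}
    for tensor, template in names.items():
        if "{bid}" in template:
            expanded[tensor] = [template.format(bid=bid) for bid in range(n_blocks)]
        else:
            expanded[tensor] = [template]
    return expanded

names = expand_names(TENSOR_NAMES, n_blocks=2)
print(names["V_MM_SELF_ATN_K_PROJ"])
# ['vision_model.transformer.layers.0.self_attn.k_proj',
#  'vision_model.transformer.layers.1.self_attn.k_proj']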

gguf-py/gguf/gguf_writer.py

Lines changed: 1 addition & 1 deletion
@@ -330,7 +330,7 @@ def add_tensor_info(
             raise ValueError(f'Expected output file to be not yet opened, got {self.state}')

         if any(name in tensors for tensors in self.tensors):
-            raise ValueError(f'Duplicated tensor name {name!r}')
+            pass#raise ValueError(f'Duplicated tensor name {name!r}')

         if raw_dtype is None:
             if tensor_dtype == np.float16:
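For reference, the guard being turned into a no-op here is a membership test across the writer's per-shard tensor collections. A stand-alone sketch of that check follows; the data layout is a simplified assumption for illustration, not the writer's real TensorInfo objects.

# Simplified stand-in for GGUFWriter.tensors: one name-to-info dict per shard.
tensors: list[dict[str, object]] = [
    {"token_embd.weight": object(), "output_norm.weight": object()},
    {},  # a second, still-empty shard
]

def is_duplicate(name: str) -> bool:
    # Mirrors the guard in add_tensor_info: true if any shard already holds the name.
    return any(name in shard for shard in tensors)

print(is_duplicate("token_embd.weight"))    # True  -> previously raised ValueError
print(is_duplicate("v.patch_embd.weight"))  # False -> safe to add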
