
Commit cd806a7

add llava to conversion
1 parent 1b2f992 commit cd806a7

4 files changed: +200 −4 lines


convert_hf_to_gguf.py

Lines changed: 37 additions & 4 deletions
@@ -66,6 +66,11 @@ class Model:
     dir_model_card: Path
     is_lora: bool
 
+    # for vision model
+    vparams: dict[str, Any] | None = None
+    v_tensor_map: gguf.TensorNameMap
+    v_tensor_names: set[str] | None
+
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH

@@ -210,9 +215,13 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int |
 
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
-        if new_name is None:
+        new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes)
+        if new_name is not None:
+            return new_name
+        elif new_name_vision is not None:
+            return new_name_vision
+        else:
             raise ValueError(f"Can not map tensor {name!r}")
-        return new_name
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
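The reworked lookup tries the text-model map first and only then falls back to the vision map. A minimal sketch of that order, using plain dicts as stand-ins for the two gguf.TensorNameMap objects (names below are illustrative):

import sys

# Stand-in for map_tensor_name's two-stage lookup (illustrative only;
# the real maps come from gguf.get_tensor_name_map).
def map_name(text_map: dict, vision_map: dict, name: str) -> str:
    new_name = text_map.get(name)           # text-model mapping first
    if new_name is not None:
        return new_name
    new_name_vision = vision_map.get(name)  # vision fallback second
    if new_name_vision is not None:
        return new_name_vision
    raise ValueError(f"Can not map tensor {name!r}")

text_map   = {"model.embed_tokens": "token_embd"}
vision_map = {"vision_tower.vision_model.post_layernorm": "v.post_norm"}
print(map_name(text_map, vision_map, "vision_tower.vision_model.post_layernorm"))
# -> v.post_norm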
@@ -452,7 +461,10 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
     @staticmethod
     def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-            return json.load(f)
+            hparams = json.load(f)
+            if "text_config" in hparams:
+                hparams = {**hparams, **hparams["text_config"]}
+            return hparams
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
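LLaVA-style configs nest the language model's hyperparameters under text_config; the merge above hoists them to the top level so the existing LLaMA code paths see a flat dict. A small self-contained sketch with a made-up, abbreviated config:

import json

# Hypothetical, abbreviated LLaVA config.json for illustration.
config = json.loads("""
{
  "architectures": ["LlavaForConditionalGeneration"],
  "text_config":   {"hidden_size": 4096, "num_attention_heads": 32},
  "vision_config": {"image_size": 336, "patch_size": 14}
}
""")

if "text_config" in config:
    config = {**config, **config["text_config"]}

print(config["hidden_size"])    # 4096 — now reachable at the top level
print(config["vision_config"])  # untouched; picked up later as vparams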
@@ -1501,10 +1513,17 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if "vision_config" in self.hparams:
+            self.vparams = self.hparams["vision_config"]
+        if self.vparams is not None:
+            self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"])
+
     def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
@@ -1554,6 +1573,17 @@ def set_gguf_parameters(self):
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)
 
+        # For vision model
+        if self.vparams is not None:
+            self.gguf_writer.add_vision_type("clip")
+            self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
+            self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
+            self.gguf_writer.add_vision_clip_architecture("llava")
+            self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
+            self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
+            self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
+            self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
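For a LLaVA-1.5-style checkpoint whose vision_config matches the common CLIP ViT-L/14-336 tower, the block above would write KV pairs along these lines (values illustrative, taken from the usual HF vision_config rather than from this diff):

vision.type                      = "clip"
vision.image_size                = 336
vision.patch_size                = 14
vision.clip.architecture         = "llava"
vision.clip.block_count          = 24
vision.clip.embedding_length     = 1024
vision.clip.feed_forward_length  = 4096
vision.clip.attention.head_count = 16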
@@ -1568,6 +1598,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
+        if name.startswith("language_model"):
+            name = name.replace("language_model.", "")
+
         if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight", "k_proj.bias")):

gguf-py/gguf/constants.py

Lines changed: 73 additions & 0 deletions
@@ -178,6 +178,26 @@ class Adapter:
         TYPE       = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"
 
+    class Vision:
+        # only support vision.type = "clip" for now
+        TYPE       = "vision.type"
+        IMAGE_SIZE = "vision.image_size"
+        PATCH_SIZE = "vision.patch_size"
+        IMAGE_MEAN = "vision.image_mean"
+        IMAGE_STD  = "vision.image_std"
+
+        class Clip:
+            ARCHITECTURE        = "vision.clip.architecture"
+            CONTEXT_LENGTH      = "vision.clip.context_length"
+            EMBEDDING_LENGTH    = "vision.clip.embedding_length"
+            BLOCK_COUNT         = "vision.clip.block_count"
+            FEED_FORWARD_LENGTH = "vision.clip.feed_forward_length"
+            PROJECTION_TYPE     = "vision.clip.projection_type"
+            PROJECTION_DIM      = "vision.clip.projection_dim"
+            USE_GELU            = "vision.clip.use_gelu"
+            HEAD_COUNT          = "vision.clip.attention.head_count"
+            LAYERNORM_EPS       = "vision.clip.attention.layer_norm_epsilon"
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
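These strings define the GGUF KV namespace for vision metadata; the nested Clip class mirrors the vision.clip.* prefix and is what the new GGUFWriter setters reference. A trivial check, assuming the patched gguf-py is importable:

from gguf.constants import Keys

print(Keys.Vision.TYPE)             # vision.type
print(Keys.Vision.Clip.HEAD_COUNT)  # vision.clip.attention.head_count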
@@ -238,6 +258,8 @@ class MODEL_ARCH(IntEnum):
     GRANITE      = auto()
     GRANITE_MOE  = auto()
     CHAMELEON    = auto()
+    # vision models
+    LLAVA_VISION = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -345,6 +367,22 @@ class MODEL_TENSOR(IntEnum):
     ENC_FFN_DOWN      = auto()
     ENC_FFN_UP        = auto()
     ENC_OUTPUT_NORM   = auto()
+    # vision
+    V_MMPROJ_A        = auto()
+    V_MMPROJ_B        = auto()
+    V_ENC_EMBD_CLS    = auto()
+    V_ENC_EMBD_PATCH  = auto()
+    V_ENC_EMBD_POS    = auto()
+    V_ENC_ATTN_Q      = auto()
+    V_ENC_ATTN_K      = auto()
+    V_ENC_ATTN_V      = auto()
+    V_ENC_INPUT_NORM  = auto()
+    V_ENC_OUTPUT      = auto()
+    V_ENC_OUTPUT_NORM = auto()
+    V_ENC_FFN_UP      = auto()
+    V_ENC_FFN_DOWN    = auto()
+    V_PRE_NORM        = auto()
+    V_POST_NORM       = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -397,6 +435,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GRANITE:      "granite",
     MODEL_ARCH.GRANITE_MOE:  "granitemoe",
     MODEL_ARCH.CHAMELEON:    "chameleon",
+    # vision
+    MODEL_ARCH.LLAVA_VISION: "llava",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -504,6 +544,22 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ENC_FFN_DOWN:      "enc.blk.{bid}.ffn_down",
     MODEL_TENSOR.ENC_FFN_UP:        "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM:   "enc.output_norm",
+    # vision
+    MODEL_TENSOR.V_MMPROJ_A:        "v.mmproj_a",
+    MODEL_TENSOR.V_MMPROJ_B:        "v.mmproj_b",
+    MODEL_TENSOR.V_ENC_EMBD_CLS:    "v.enc.embd.cls",
+    MODEL_TENSOR.V_ENC_EMBD_PATCH:  "v.enc.embd.patch",
+    MODEL_TENSOR.V_ENC_EMBD_POS:    "v.enc.embd.pos",
+    MODEL_TENSOR.V_ENC_ATTN_Q:      "v.enc.blk.{bid}.attn_q",
+    MODEL_TENSOR.V_ENC_ATTN_K:      "v.enc.blk.{bid}.attn_k",
+    MODEL_TENSOR.V_ENC_ATTN_V:      "v.enc.blk.{bid}.attn_v",
+    MODEL_TENSOR.V_ENC_INPUT_NORM:  "v.enc.blk.{bid}.input_norm",
+    MODEL_TENSOR.V_ENC_OUTPUT:      "v.enc.blk.{bid}.output",
+    MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.enc.blk.{bid}.output_norm",
+    MODEL_TENSOR.V_ENC_FFN_UP:      "v.enc.blk.{bid}.ffn_up",
+    MODEL_TENSOR.V_ENC_FFN_DOWN:    "v.enc.blk.{bid}.ffn_down",
+    MODEL_TENSOR.V_PRE_NORM:        "v.pre_norm",
+    MODEL_TENSOR.V_POST_NORM:       "v.post_norm",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
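TENSOR_NAMES entries are templates: block-level names carry a {bid} placeholder that is filled with the block index when names are materialized. A small sketch, again assuming the patched gguf-py:

from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

# Per-block templates are ordinary format strings keyed by block index.
print(TENSOR_NAMES[MODEL_TENSOR.V_ENC_ATTN_Q].format(bid=3))  # v.enc.blk.3.attn_q
print(TENSOR_NAMES[MODEL_TENSOR.V_POST_NORM])                 # v.post_norm (no block index)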
@@ -1279,6 +1335,23 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.LLAVA_VISION: [
+        MODEL_TENSOR.V_MMPROJ_A,
+        MODEL_TENSOR.V_MMPROJ_B,
+        MODEL_TENSOR.V_ENC_EMBD_CLS,
+        MODEL_TENSOR.V_ENC_EMBD_PATCH,
+        MODEL_TENSOR.V_ENC_EMBD_POS,
+        MODEL_TENSOR.V_ENC_ATTN_Q,
+        MODEL_TENSOR.V_ENC_ATTN_K,
+        MODEL_TENSOR.V_ENC_ATTN_V,
+        MODEL_TENSOR.V_ENC_INPUT_NORM,
+        MODEL_TENSOR.V_ENC_OUTPUT,
+        MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.V_ENC_FFN_UP,
+        MODEL_TENSOR.V_ENC_FFN_DOWN,
+        MODEL_TENSOR.V_PRE_NORM,
+        MODEL_TENSOR.V_POST_NORM,
+    ],
     # TODO
 }

gguf-py/gguf/gguf_writer.py

Lines changed: 30 additions & 0 deletions
@@ -814,6 +814,36 @@ def add_remove_extra_whitespaces(self, value: bool) -> None:
     def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
         self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
 
+    def add_vision_type(self, value: str) -> None:
+        self.add_string(Keys.Vision.TYPE, value)
+
+    def add_vision_image_size(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.IMAGE_SIZE, value)
+
+    def add_vision_patch_size(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.PATCH_SIZE, value)
+
+    def add_vision_clip_architecture(self, value: str) -> None:
+        self.add_string(Keys.Vision.Clip.ARCHITECTURE, value)
+
+    def add_vision_clip_context_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.CONTEXT_LENGTH, value)
+
+    def add_vision_clip_embedding_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.EMBEDDING_LENGTH, value)
+
+    def add_vision_clip_block_count(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.BLOCK_COUNT, value)
+
+    def add_vision_clip_feed_forward_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.FEED_FORWARD_LENGTH, value)
+
+    def add_vision_clip_head_count(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value)
+
+    def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
+        self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)
+
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
             template_default = None
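Each setter is a thin typed wrapper over add_string/add_uint32/add_float32 with a key from Keys.Vision. A hedged usage sketch (the output path and values are illustrative, and the minimal write sequence shown assumes no tensor data):

from gguf import GGUFWriter

# Illustrative only: write just the vision KV metadata to a small GGUF file.
writer = GGUFWriter("vision-meta.gguf", arch="llama")  # hypothetical output path
writer.add_vision_type("clip")
writer.add_vision_image_size(336)
writer.add_vision_patch_size(14)
writer.add_vision_clip_architecture("llava")
writer.add_vision_clip_block_count(24)
writer.add_vision_clip_layer_norm_epsilon(1e-5)

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()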

gguf-py/gguf/tensor_mapping.py

Lines changed: 60 additions & 0 deletions
@@ -679,6 +679,66 @@ class TensorNameMap:
         MODEL_TENSOR.ENC_OUTPUT_NORM: (
             "encoder.final_layer_norm",  # t5
         ),
+
+        MODEL_TENSOR.V_MMPROJ_A: (
+            "multi_modal_projector.linear_1",
+        ),
+
+        MODEL_TENSOR.V_MMPROJ_B: (
+            "multi_modal_projector.linear_2",
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_CLS: (
+            "vision_tower.vision_model.embeddings.class_embedding",
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
+            "vision_tower.vision_model.embeddings.patch_embedding",
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_POS: (
+            "vision_tower.vision_model.embeddings.position_embedding",
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_Q: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_K: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+        ),
+
+        MODEL_TENSOR.V_ENC_ATTN_V: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
+        ),
+
+        MODEL_TENSOR.V_ENC_INPUT_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
+        ),
+
+        MODEL_TENSOR.V_ENC_OUTPUT: (
+            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
+        ),
+
+        MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
+        ),
+
+        MODEL_TENSOR.V_ENC_FFN_UP: (
+            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
+        ),
+
+        MODEL_TENSOR.V_ENC_FFN_DOWN: (
+            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
+        ),
+
+        MODEL_TENSOR.V_PRE_NORM: (
+            "vision_tower.vision_model.pre_layrnorm",
+        ),
+
+        MODEL_TENSOR.V_POST_NORM: (
+            "vision_tower.vision_model.post_layernorm",
+        ),
     }
 
     # architecture-specific block mappings
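At conversion time these tuples are compiled (with {bid} expanded per block) into the lookup consulted by map_tensor_name. Note that pre_layrnorm is the actual attribute spelling in HF's CLIP implementation, not a typo in this table. A minimal resolution sketch, assuming the patched gguf-py:

import gguf

# Build the LLaVA vision name map for a hypothetical 24-block CLIP tower.
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, 24)

hf_name = "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight"
print(tmap.get_name(hf_name, try_suffixes=(".weight", ".bias")))
# -> v.enc.blk.7.attn_k.weight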
