17 | 17 | from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast |
18 | 18 | from itertools import chain |
19 | 19 |
| 20 | +from transformers import AutoConfig |
20 | 21 | import math |
21 | 22 | import numpy as np |
22 | 23 | import torch |
@@ -66,6 +67,12 @@ class Model: |
66 | 67 | metadata_override: Path | None |
67 | 68 | dir_model_card: Path |
68 | 69 |
| 70 | + # for vision model |
| 71 | + preprocessor_config: dict[str, Any] | None = None |
| 72 | + vparams: dict[str, Any] | None = None |
| 73 | + v_tensor_map: gguf.TensorNameMap |
| 74 | + v_tensor_names: set[str] | None |
| 75 | + |
69 | 76 | # subclasses should define this! |
70 | 77 | model_arch: gguf.MODEL_ARCH |
71 | 78 |
@@ -95,6 +102,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, |
95 | 102 | self.metadata_override = metadata_override |
96 | 103 | self.model_name = model_name |
97 | 104 | self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py |
| 105 | + self.preprocessor_config = self.load_preprocessor_config(self.dir_model) |
98 | 106 |
99 | 107 | # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type |
100 | 108 | if self.ftype == gguf.LlamaFileType.GUESSED: |
@@ -210,9 +218,13 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | |
210 | 218 |
211 | 219 | def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: |
212 | 220 | new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) |
213 | | - if new_name is None: |
| 221 | + new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes) |
| 222 | + if new_name is not None: |
| 223 | + return new_name |
| 224 | + elif new_name_vision is not None: |
| 225 | + return new_name_vision |
| 226 | + else: |
214 | 227 | raise ValueError(f"Can not map tensor {name!r}") |
215 | | - return new_name |
216 | 228 |
217 | 229 | def set_gguf_parameters(self): |
218 | 230 | self.gguf_writer.add_block_count(self.block_count) |
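
As a quick aside on the hunk above: the lookup now tries the language-model tensor map first and only falls back to the vision map when that misses. A minimal stand-alone sketch of that ordering, using plain dicts as hypothetical stand-ins for gguf.TensorNameMap:

def map_name(name: str, text_map: dict[str, str], vision_map: dict[str, str]) -> str:
    # Prefer the language-model mapping; the vision mapping is only a fallback.
    new_name = text_map.get(name)
    if new_name is not None:
        return new_name
    new_name_vision = vision_map.get(name)
    if new_name_vision is not None:
        return new_name_vision
    raise ValueError(f"Can not map tensor {name!r}")
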
@@ -466,7 +478,24 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] |
466 | 478 | @staticmethod |
467 | 479 | def load_hparams(dir_model: Path): |
468 | 480 | with open(dir_model / "config.json", "r", encoding="utf-8") as f: |
469 | | - return json.load(f) |
| 481 | + hparams = json.load(f) |
| 482 | + if "text_config" in hparams: |
| 483 | + text_config = hparams["text_config"] |
| 484 | + # for example, llava-1.5-7b-hf is missing the language model config; retrieve it via the model ID |
| 485 | + if "_name_or_path" in text_config: |
| 486 | + text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict() |
| 487 | + hparams = {**text_config, **hparams} |
| 488 | + return hparams |
| 489 | + |
| 490 | + @staticmethod |
| 491 | + def load_preprocessor_config(dir_model: Path): |
| 492 | + # TODO: this varies vastly among models, need to handle more cases in the future |
| 493 | + file_path = dir_model / "preprocessor_config.json" |
| 494 | + if os.path.exists(file_path): |
| 495 | + with open(file_path, "r", encoding="utf-8") as f: |
| 496 | + return json.load(f) |
| 497 | + else: |
| 498 | + return None |
470 | 499 |
471 | 500 | @classmethod |
472 | 501 | def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: |
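
A small self-contained illustration of what the {**text_config, **hparams} merge in load_hparams does (the values below are made up): nested language-model keys get lifted to the top level, and any key that already exists at the top level of config.json takes precedence because hparams is unpacked last.

text_config = {"model_type": "llama", "hidden_size": 4096, "rope_theta": 10000.0}
hparams = {"model_type": "llava", "text_config": text_config, "vision_config": {"image_size": 336}}
merged = {**text_config, **hparams}
assert merged["rope_theta"] == 10000.0     # lifted from the nested text_config
assert merged["model_type"] == "llava"     # the top-level value wins because hparams is unpacked last
assert merged["hidden_size"] == 4096       # the language model's sizes stay reachable at the top level
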
@@ -1557,10 +1586,17 @@ def prepare_tensors(self): |
1557 | 1586 | raise ValueError(f"Unprocessed norms: {norms}") |
1558 | 1587 |
|
1559 | 1588 |
|
1560 | | -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") |
| 1589 | +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration") |
1561 | 1590 | class LlamaModel(Model): |
1562 | 1591 | model_arch = gguf.MODEL_ARCH.LLAMA |
1563 | 1592 |
| 1593 | + def __init__(self, *args, **kwargs): |
| 1594 | + super().__init__(*args, **kwargs) |
| 1595 | + if "vision_config" in self.hparams: |
| 1596 | + self.vparams = self.hparams["vision_config"] |
| 1597 | + if self.vparams is not None: |
| 1598 | + self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"]) |
| 1599 | + |
1564 | 1600 | def set_vocab(self): |
1565 | 1601 | try: |
1566 | 1602 | self._set_vocab_sentencepiece() |
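
For context, this is roughly the shape of config.json that makes the new __init__ branch populate vparams and v_tensor_map; the values are illustrative, loosely modeled on llava-1.5-7b-hf rather than copied from it.

example_hparams = {
    "architectures": ["LlavaForConditionalGeneration"],
    "text_config": {"_name_or_path": "lmsys/vicuna-7b-v1.5"},  # language model fetched via AutoConfig
    "vision_config": {                                          # picked up as self.vparams
        "image_size": 336,
        "patch_size": 14,
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
    },
    "vision_feature_layer": -2,
}
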
@@ -1594,6 +1630,26 @@ def set_vocab(self): |
1594 | 1630 | if self.hparams.get("vocab_size", 32000) == 49152: |
1595 | 1631 | self.gguf_writer.add_add_bos_token(False) |
1596 | 1632 |
| 1633 | + # For vision model |
| 1634 | + if self.vparams is not None and self.preprocessor_config is not None: |
| 1635 | + self.gguf_writer.add_vision_type("clip-vit") |
| 1636 | + self.gguf_writer.add_vision_image_size(self.vparams["image_size"]) |
| 1637 | + self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"]) |
| 1638 | + self.gguf_writer.add_vision_clip_architecture("llava") |
| 1639 | + self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"]) |
| 1640 | + self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"]) |
| 1641 | + self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"]) |
| 1642 | + self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"]) |
| 1643 | + self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"]) |
| 1644 | + self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"]) |
| 1645 | + self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"]) |
| 1646 | + self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) |
| 1647 | + max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 |
| 1648 | + self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) |
| 1649 | + # TODO: should not hardcode these, but they are currently missing from config.json |
| 1650 | + self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) |
| 1651 | + self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05) |
| 1652 | + |
1597 | 1653 | def set_gguf_parameters(self): |
1598 | 1654 | super().set_gguf_parameters() |
1599 | 1655 | hparams = self.hparams |
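
A quick sanity check of the max_pos_embd formula above, using the vision tower values typically seen with LLaVA-1.5 (336-pixel images, 14-pixel patches): the image becomes a 24 x 24 grid of patches, plus one position for the class token.

image_size, patch_size = 336, 14                   # illustrative CLIP ViT-L/14-336 values
max_pos_embd = (image_size // patch_size) ** 2 + 1
assert max_pos_embd == 577                         # 576 patch positions + 1 CLS token
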
@@ -1624,6 +1680,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter |
1624 | 1680 | n_head = self.hparams["num_attention_heads"] |
1625 | 1681 | n_kv_head = self.hparams.get("num_key_value_heads") |
1626 | 1682 |
| 1683 | + # For vision model |
| 1684 | + if name.startswith("language_model"): |
| 1685 | + name = name.replace("language_model.", "") |
| 1686 | + if "post_layernorm" in name: |
| 1687 | + return [] # skip post_layernorm |
| 1688 | + |
1627 | 1689 | if name.endswith(("q_proj.weight", "q_proj.bias")): |
1628 | 1690 | data_torch = LlamaModel.permute(data_torch, n_head, n_head) |
1629 | 1691 | if name.endswith(("k_proj.weight", "k_proj.bias")): |
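
And a stripped-down sketch of the renaming rule added at the top of modify_tensors, with hypothetical LLaVA-style tensor names: the language_model. prefix is dropped so the usual LLaMA mapping applies, and the vision tower's post_layernorm is skipped entirely.

def preprocess_name(name: str) -> str | None:
    # Strip the HF "language_model." prefix so the existing LLaMA tensor names match.
    if name.startswith("language_model"):
        name = name.replace("language_model.", "")
    # post_layernorm is not converted; returning None stands in for "emit no tensors".
    if "post_layernorm" in name:
        return None
    return name

assert preprocess_name("language_model.model.layers.0.self_attn.q_proj.weight") == "model.layers.0.self_attn.q_proj.weight"
assert preprocess_name("vision_tower.vision_model.post_layernorm.weight") is None
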