|
17 | 17 | from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast |
18 | 18 | from itertools import chain |
19 | 19 |
|
20 | | -from transformers import AutoConfig |
| 20 | +from transformers import AutoConfig, AutoImageProcessor |
21 | 21 | import math |
22 | 22 | import numpy as np |
23 | 23 | import torch |
@@ -68,9 +68,10 @@ class Model: |
68 | 68 | dir_model_card: Path |
69 | 69 |
|
70 | 70 | # for vision model |
| 71 | + vision_arch: gguf.MODEL_ARCH | None = None |
71 | 72 | preprocessor_config: dict[str, Any] | None = None |
72 | 73 | vparams: dict[str, Any] | None = None |
73 | | - v_tensor_map: gguf.TensorNameMap |
| 74 | + v_tensor_map: gguf.TensorNameMap | None = None |
74 | 75 | v_tensor_names: set[str] | None |
75 | 76 |
|
76 | 77 | # subclasses should define this! |
@@ -102,7 +103,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, |
102 | 103 | self.metadata_override = metadata_override |
103 | 104 | self.model_name = model_name |
104 | 105 | self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py |
105 | | - self.preprocessor_config = self.load_preprocessor_config(self.dir_model) |
106 | 106 |
|
107 | 107 | # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type |
108 | 108 | if self.ftype == gguf.LlamaFileType.GUESSED: |
@@ -218,7 +218,7 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | |
218 | 218 |
|
219 | 219 | def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: |
220 | 220 | new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) |
221 | | - new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes) |
| 221 | + new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes) if self.v_tensor_map is not None else None |
222 | 222 | if new_name is not None: |
223 | 223 | return new_name |
224 | 224 | elif new_name_vision is not None: |
@@ -488,14 +488,17 @@ def load_hparams(dir_model: Path): |
488 | 488 | return hparams |
489 | 489 |
|
490 | 490 | @staticmethod |
491 | | - def load_preprocessor_config(dir_model: Path): |
| 491 | + def load_preprocessor_config(dir_or_model_id: Path | str): |
492 | 492 | # TODO: this varies vastly among models, need to handle more cases in the future |
493 | | - file_path = dir_model / "preprocessor_config.json" |
494 | | - if os.path.exists(file_path): |
495 | | - with open(file_path, "r", encoding="utf-8") as f: |
496 | | - return json.load(f) |
| 493 | + if isinstance(dir_or_model_id, Path): |
| 494 | + file_path = dir_or_model_id / "preprocessor_config.json" |
| 495 | + if os.path.exists(file_path): |
| 496 | + with open(file_path, "r", encoding="utf-8") as f: |
| 497 | + return json.load(f) |
| 498 | + else: |
| 499 | + raise Exception(f"Preprocessor config not found at {file_path}") |
497 | 500 | else: |
498 | | - return None |
| 501 | + return AutoImageProcessor.from_pretrained(dir_or_model_id).to_dict() |
499 | 502 |
|
500 | 503 | @classmethod |
501 | 504 | def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: |
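
For orientation, a minimal sketch of how the reworked load_preprocessor_config is meant to be called after this hunk, assuming a locally downloaded llava-1.5-7b-hf checkpoint directory and using openai/clip-vit-large-patch14-336 purely as an example model id (neither name is taken from the PR):

from pathlib import Path

# Path argument: read preprocessor_config.json from the checkpoint directory
# (raises if the file is missing).
local_cfg = Model.load_preprocessor_config(Path("./llava-1.5-7b-hf"))

# String argument: treated as a Hugging Face model id and resolved through
# AutoImageProcessor.from_pretrained(...), then flattened to a plain dict.
remote_cfg = Model.load_preprocessor_config("openai/clip-vit-large-patch14-336")

print(remote_cfg["image_mean"], remote_cfg["image_std"])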
@@ -1586,16 +1589,31 @@ def prepare_tensors(self): |
1586 | 1589 | raise ValueError(f"Unprocessed norms: {norms}") |
1587 | 1590 |
|
1588 | 1591 |
|
1589 | | -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration") |
| 1592 | +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM") |
1590 | 1593 | class LlamaModel(Model): |
1591 | 1594 | model_arch = gguf.MODEL_ARCH.LLAMA |
1592 | 1595 |
|
1593 | 1596 | def __init__(self, *args, **kwargs): |
1594 | 1597 | super().__init__(*args, **kwargs) |
1595 | | - if "vision_config" in self.hparams: |
| 1598 | + |
| 1599 | + model_type = self.hparams.get("model_type", None) |
| 1600 | + self.vision_arch = None |
| 1601 | + |
| 1602 | + # only tested with https://huggingface.co/llava-hf/llava-1.5-7b-hf |
| 1603 | + if "vision_config" in self.hparams and model_type == "llava": |
1596 | 1604 | self.vparams = self.hparams["vision_config"] |
1597 | | - if self.vparams is not None: |
1598 | | - self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"]) |
| 1605 | + self.preprocessor_config = self.load_preprocessor_config(self.dir_model) |
| 1606 | + self.vision_arch = gguf.MODEL_ARCH.VISION_LLAVA |
| 1607 | + |
| 1608 | + # only tested with https://huggingface.co/mtgv/MobileVLM_V2-1.7B |
| 1609 | + if "mm_vision_tower" in self.hparams and model_type == "mobilevlm": |
| 1610 | + vision_model_id = self.hparams["mm_vision_tower"] |
| 1611 | + self.vparams = AutoConfig.from_pretrained(vision_model_id).to_dict()["vision_config"] |
| 1612 | + self.preprocessor_config = self.load_preprocessor_config(vision_model_id) |
| 1613 | + self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM |
| 1614 | + |
| 1615 | + if self.vparams is not None and self.vision_arch is not None: |
| 1616 | + self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"]) |
1599 | 1617 |
|
1600 | 1618 | def set_vocab(self): |
1601 | 1619 | try: |
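
A rough illustration of what the MobileVLM branch above resolves at construction time, assuming the checkpoint's config.json carries an mm_vision_tower entry that names a CLIP checkpoint (the id below is only an example value, not taken from the PR):

from transformers import AutoConfig

vision_model_id = "openai/clip-vit-large-patch14-336"  # example value of hparams["mm_vision_tower"]
vparams = AutoConfig.from_pretrained(vision_model_id).to_dict()["vision_config"]

# set_vocab() later reads num_hidden_layers, hidden_size, intermediate_size,
# num_attention_heads, image_size and patch_size from this dict.
print(vparams["num_hidden_layers"], vparams["image_size"], vparams["patch_size"])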
@@ -1631,23 +1649,31 @@ def set_vocab(self): |
1631 | 1649 | self.gguf_writer.add_add_bos_token(False) |
1632 | 1650 |
|
1633 | 1651 | # For vision model |
1634 | | - if self.vparams is not None and self.preprocessor_config is not None: |
| 1652 | + if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None: |
1635 | 1653 | self.gguf_writer.add_vision_type("clip-vit") |
1636 | 1654 | self.gguf_writer.add_vision_image_size(self.vparams["image_size"]) |
1637 | 1655 | self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"]) |
1638 | | - self.gguf_writer.add_vision_clip_architecture("llava") |
| 1656 | + self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch]) |
1639 | 1657 | self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"]) |
1640 | 1658 | self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"]) |
1641 | 1659 | self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"]) |
1642 | 1660 | self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"]) |
1643 | 1661 | self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"]) |
1644 | 1662 | self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"]) |
1645 | | - self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"]) |
1646 | 1663 | self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT) |
1647 | 1664 | max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1 |
1648 | 1665 | self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd) |
| 1666 | + if "vision_feature_layer" in self.hparams: |
| 1667 | + self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"]) |
| 1668 | + elif "mm_vision_select_layer" in self.hparams: |
| 1669 | + self.gguf_writer.add_vision_clip_select_layer(self.hparams["mm_vision_select_layer"]) |
| 1670 | + else: |
| 1671 | + raise ValueError("gguf: can not find vision_feature_layer parameter.") |
1649 | 1672 | # TODO: should not hardcode these, but they are currently missing from config.json |
1650 | | - self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) |
| 1673 | + if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA: |
| 1674 | + self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP) |
| 1675 | + if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM: |
| 1676 | + self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2) |
1651 | 1677 | self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05) |
1652 | 1678 |
|
1653 | 1679 | def set_gguf_parameters(self): |
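
As a sanity check on the max_pos_embd formula above: for the 336-pixel, patch-size-14 CLIP ViT-L tower shipped with llava-1.5-7b-hf, it evaluates to (336 // 14)**2 + 1 = 24**2 + 1 = 577, i.e. 576 image-patch positions plus one class-token position.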
@@ -1683,6 +1709,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter |
1683 | 1709 | # For vision model |
1684 | 1710 | if name.startswith("language_model"): |
1685 | 1711 | name = name.replace("language_model.", "") |
| 1712 | + else: |
| 1713 | + name = name.replace("model.vision_tower.", "") |
1686 | 1714 | if "post_layernorm" in name: |
1687 | 1715 | return [] # skip post_layernorm |
1688 | 1716 |
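
A small self-contained sketch of the prefix handling above; the example tensor names follow the usual LLaVA-HF and MobileVLM checkpoint layouts and are assumptions, not values from the PR:

def strip_prefix(name: str) -> str:
    # mirrors the branch added in modify_tensors
    if name.startswith("language_model"):
        return name.replace("language_model.", "")
    return name.replace("model.vision_tower.", "")

print(strip_prefix("language_model.model.layers.0.self_attn.q_proj.weight"))
# -> model.layers.0.self_attn.q_proj.weight
print(strip_prefix("model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight"))
# -> vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight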
|
@@ -2101,7 +2129,7 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: |
2101 | 2129 | return n_dims > 1 |
2102 | 2130 |
|
2103 | 2131 |
|
2104 | | -@Model.register("MiniCPMForCausalLM") |
| 2132 | +@Model.register("MiniCPMForCausalLM", "MiniCPMV") |
2105 | 2133 | class MiniCPMModel(Model): |
2106 | 2134 | model_arch = gguf.MODEL_ARCH.MINICPM |
2107 | 2135 |
|
|