Commit 6cabdda

add back convert hf to gguf

1 parent 0a81051 commit 6cabdda

7 files changed: +266 -6 lines changed

convert_hf_to_gguf.py

Lines changed: 66 additions & 4 deletions
@@ -17,6 +17,7 @@
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
 from itertools import chain
 
+from transformers import AutoConfig
 import math
 import numpy as np
 import torch
@@ -66,6 +67,12 @@ class Model:
     metadata_override: Path | None
     dir_model_card: Path
 
+    # for vision model
+    preprocessor_config: dict[str, Any] | None = None
+    vparams: dict[str, Any] | None = None
+    v_tensor_map: gguf.TensorNameMap
+    v_tensor_names: set[str] | None
+
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
@@ -95,6 +102,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+        self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
 
         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -210,9 +218,13 @@ def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int |
 
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
-        if new_name is None:
+        new_name_vision = self.v_tensor_map.get_name(key=name, try_suffixes=try_suffixes)
+        if new_name is not None:
+            return new_name
+        elif new_name_vision is not None:
+            return new_name_vision
+        else:
             raise ValueError(f"Can not map tensor {name!r}")
-        return new_name
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
@@ -466,7 +478,24 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
     @staticmethod
     def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-            return json.load(f)
+            hparams = json.load(f)
+            if "text_config" in hparams:
+                text_config = hparams["text_config"]
+                # for example, llava-1.5-7b-hf misses the language model config, need to retrieve it via model ID
+                if "_name_or_path" in text_config:
+                    text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
+                hparams = {**text_config, **hparams}
+            return hparams
+
+    @staticmethod
+    def load_preprocessor_config(dir_model: Path):
+        # TODO: this varies vastly among models, need to handle more cases in the future
+        file_path = dir_model / "preprocessor_config.json"
+        if os.path.exists(file_path):
+            with open(file_path, "r", encoding="utf-8") as f:
+                return json.load(f)
+        else:
+            return None
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
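 
For context on the hunk above: the `{**text_config, **hparams}` merge lifts the nested language-model settings to the top level while letting any key that already exists in the outer config.json win on conflicts. A minimal sketch with placeholder values (not taken from a real model card):

    # illustrative only; keys and values are placeholders
    text_config = {"hidden_size": 4096, "num_hidden_layers": 32}
    hparams = {"model_type": "llava", "hidden_size": 4096, "text_config": text_config}
    merged = {**text_config, **hparams}
    assert merged["num_hidden_layers"] == 32                 # lifted from text_config
    assert merged["hidden_size"] == hparams["hidden_size"]   # outer config wins on conflicts
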
@@ -1557,10 +1586,17 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if "vision_config" in self.hparams:
+            self.vparams = self.hparams["vision_config"]
+        if self.vparams is not None:
+            self.v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, self.vparams["num_hidden_layers"])
+
     def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
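 
The constructor added above keys off a `vision_config` block in config.json. As a rough, abbreviated sketch of the shape of a LLaVA-style config this code expects (illustrative values, not copied from an actual model card):

    # abbreviated / illustrative; real configs carry many more keys
    hparams = {
        "architectures": ["LlavaForConditionalGeneration"],
        "vision_feature_layer": -2,
        "text_config": {"_name_or_path": "..."},  # resolved via AutoConfig in load_hparams
        "vision_config": {
            "image_size": 336,
            "patch_size": 14,
            "hidden_size": 1024,
            "intermediate_size": 4096,
            "num_hidden_layers": 24,
            "num_attention_heads": 16,
        },
    }
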
@@ -1594,6 +1630,26 @@ def set_vocab(self):
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)
 
+        # For vision model
+        if self.vparams is not None and self.preprocessor_config is not None:
+            self.gguf_writer.add_vision_type("clip-vit")
+            self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
+            self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
+            self.gguf_writer.add_vision_clip_architecture("llava")
+            self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
+            self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
+            self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
+            self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
+            self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
+            self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
+            self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"])
+            self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
+            max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
+            self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
+            # TODO: should not hardcode these, but they are currently missing from config.json
+            self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
+            self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
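 
As a sanity check on the `max_pos_embd` formula above: taking the common CLIP ViT-L/14 336 px settings as an example (image_size 336, patch_size 14), it yields a 24x24 patch grid plus the CLS position:

    max_pos_embd = (336 // 14) ** 2 + 1   # 24*24 + 1 = 577 positions
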
@@ -1624,6 +1680,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
+        # For vision model
+        if name.startswith("language_model"):
+            name = name.replace("language_model.", "")
+        if "post_layernorm" in name:
+            return [] # skip post_layernorm
+
         if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
         if name.endswith(("k_proj.weight", "k_proj.bias")):
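 
To make the renaming concrete, a hypothetical HF-style tensor name and what the block above does with it (only a sketch; the actual names come from the Transformers LLaVA checkpoint):

    # hypothetical tensor name, for illustration
    name = "language_model.model.layers.0.self_attn.q_proj.weight"
    name = name.replace("language_model.", "")  # -> "model.layers.0.self_attn.q_proj.weight"
    # vision-tower tensors keep their own prefix and fall through to v_tensor_map,
    # while anything containing "post_layernorm" is skipped entirely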

examples/server/server.cpp

Lines changed: 1 addition & 0 deletions
@@ -2949,6 +2949,7 @@ struct server_context {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
+                nullptr,
             };
 
             const int ret = llama_decode(ctx, batch_view);

gguf-py/gguf/constants.py

Lines changed: 86 additions & 0 deletions
@@ -202,6 +202,9 @@ class Tokenizer:
         FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
         FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
         FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
+        # Vision models
+        IMAGE_START_ID = "tokenizer.ggml.image_start_token_id"
+        IMAGE_END_ID = "tokenizer.ggml.image_end_token_id"
         # deprecated:
         PREFIX_ID = "tokenizer.ggml.prefix_token_id"
         SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
@@ -211,6 +214,31 @@ class Adapter:
         TYPE = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"
 
+    class Vision:
+        # only support vision.type = "clip-vit" for now
+        TYPE = "vision.type"
+        IMAGE_SIZE = "vision.image_size"
+        PATCH_SIZE = "vision.patch_size"
+        IMAGE_MEAN = "vision.image_mean"
+        IMAGE_STD = "vision.image_std"
+
+        class Clip:
+            ARCHITECTURE = "vision.clip.architecture"
+            CONTEXT_LENGTH = "vision.clip.context_length"
+            EMBEDDING_LENGTH = "vision.clip.embedding_length"
+            BLOCK_COUNT = "vision.clip.block_count"
+            FEED_FORWARD_LENGTH = "vision.clip.feed_forward_length"
+            PROJECTION_TYPE = "vision.clip.projection_type"
+            PROJECTION_DIM = "vision.clip.projection_dim"
+            USE_GELU = "vision.clip.use_gelu"
+            MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
+            MAX_SLICES = "vision.clip.max_slices"
+            PROJECTOR_TYPE = "vision.clip.projector_type"
+            SELECT_LAYER = "vision.clip.select_layer"
+            PATCH_MERGE_TYPE = "vision.clip.patch_merge_type"
+            HEAD_COUNT = "vision.clip.attention.head_count"
+            LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"
+
 #
 # recommended mapping of model tensor names for storage in gguf
 #
@@ -279,6 +307,8 @@ class MODEL_ARCH(IntEnum):
     GRANITE_MOE = auto()
     CHAMELEON = auto()
     WAVTOKENIZER_DEC = auto()
+    # vision models
+    LLAVA_VISION = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -390,6 +420,7 @@ class MODEL_TENSOR(IntEnum):
     ENC_OUTPUT_NORM = auto()
     CLS = auto() # classifier
     CLS_OUT = auto() # classifier output projection
+    # wavtokenizer
     CONV1D = auto()
    CONVNEXT_DW = auto()
    CONVNEXT_NORM = auto()
@@ -406,6 +437,21 @@
     POSNET_ATTN_K = auto()
     POSNET_ATTN_V = auto()
     POSNET_ATTN_OUT = auto()
+    # vision
+    V_MMPROJ = auto()
+    V_ENC_EMBD_CLS = auto()
+    V_ENC_EMBD_PATCH = auto()
+    V_ENC_EMBD_POS = auto()
+    V_ENC_ATTN_Q = auto()
+    V_ENC_ATTN_K = auto()
+    V_ENC_ATTN_V = auto()
+    V_ENC_INPUT_NORM = auto()
+    V_ENC_OUTPUT = auto()
+    V_ENC_OUTPUT_NORM = auto()
+    V_ENC_FFN_UP = auto()
+    V_ENC_FFN_DOWN = auto()
+    V_PRE_NORM = auto()
+    V_POST_NORM = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -593,6 +639,21 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
     MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
     MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
+    # vision
+    MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}",
+    MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls",
+    MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.enc.embd.patch",
+    MODEL_TENSOR.V_ENC_EMBD_POS: "v.enc.embd.pos",
+    MODEL_TENSOR.V_ENC_ATTN_Q: "v.enc.blk.{bid}.attn_q",
+    MODEL_TENSOR.V_ENC_ATTN_K: "v.enc.blk.{bid}.attn_k",
+    MODEL_TENSOR.V_ENC_ATTN_V: "v.enc.blk.{bid}.attn_v",
+    MODEL_TENSOR.V_ENC_INPUT_NORM: "v.enc.blk.{bid}.input_norm",
+    MODEL_TENSOR.V_ENC_OUTPUT: "v.enc.blk.{bid}.output",
+    MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.enc.blk.{bid}.output_norm",
+    MODEL_TENSOR.V_ENC_FFN_UP: "v.enc.blk.{bid}.ffn_up",
+    MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down",
+    MODEL_TENSOR.V_PRE_NORM: "v.pre_norm",
+    MODEL_TENSOR.V_POST_NORM: "v.post_norm",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1534,6 +1595,22 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.POSNET_ATTN_V,
         MODEL_TENSOR.POSNET_ATTN_OUT,
     ],
+    MODEL_ARCH.LLAVA_VISION: [
+        MODEL_TENSOR.V_MMPROJ,
+        MODEL_TENSOR.V_ENC_EMBD_CLS,
+        MODEL_TENSOR.V_ENC_EMBD_PATCH,
+        MODEL_TENSOR.V_ENC_EMBD_POS,
+        MODEL_TENSOR.V_ENC_ATTN_Q,
+        MODEL_TENSOR.V_ENC_ATTN_K,
+        MODEL_TENSOR.V_ENC_ATTN_V,
+        MODEL_TENSOR.V_ENC_INPUT_NORM,
+        MODEL_TENSOR.V_ENC_OUTPUT,
+        MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.V_ENC_FFN_UP,
+        MODEL_TENSOR.V_ENC_FFN_DOWN,
+        MODEL_TENSOR.V_PRE_NORM,
+        MODEL_TENSOR.V_POST_NORM,
+    ],
     # TODO
 }
 
@@ -1615,6 +1692,15 @@ class PoolingType(IntEnum):
     CLS = 2
 
 
+class CLIPProjectorType(Enum):
+    MLP = 'mlp'
+
+
+class CLIPPatchMergeType(Enum):
+    FLAT = 'flat'
+    SPATIAL_UNPAD = 'spatial_unpad'
+
+
 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1
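 
A short usage sketch tying the new constants together, mirroring what LlamaModel.__init__ does in convert_hf_to_gguf.py above (the HF-side name aliases live in gguf-py/gguf/tensor_mapping.py, one of the seven changed files but not shown in this excerpt; the block count of 24 is illustrative):

    import gguf

    v_tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAVA_VISION, 24)
    # get_name() returns the canonical "v.enc.blk.{bid}..." / "v.mmproj_{bid}" name,
    # or None if the checkpoint name has no registered alias
    new_name = v_tensor_map.get_name(key="some.hf.tensor.name", try_suffixes=(".weight", ".bias"))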

gguf-py/gguf/gguf_writer.py

Lines changed: 53 additions & 0 deletions
@@ -27,6 +27,8 @@
     PoolingType,
     TokenType,
     ExpertGatingFuncType,
+    CLIPPatchMergeType,
+    CLIPProjectorType,
 )
 
 from .quants import quant_shape_from_byte_shape
@@ -874,6 +876,57 @@ def add_remove_extra_whitespaces(self, value: bool) -> None:
 
     def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None:
         self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap)
+
+    def add_vision_type(self, value: str) -> None:
+        self.add_string(Keys.Vision.TYPE, value)
+
+    def add_vision_image_size(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.IMAGE_SIZE, value)
+
+    def add_vision_patch_size(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.PATCH_SIZE, value)
+
+    def add_vision_clip_architecture(self, value: str) -> None:
+        self.add_string(Keys.Vision.Clip.ARCHITECTURE, value)
+
+    def add_vision_clip_context_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.CONTEXT_LENGTH, value)
+
+    def add_vision_clip_embedding_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.EMBEDDING_LENGTH, value)
+
+    def add_vision_clip_block_count(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.BLOCK_COUNT, value)
+
+    def add_vision_clip_feed_forward_length(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.FEED_FORWARD_LENGTH, value)
+
+    def add_vision_clip_head_count(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value)
+
+    def add_vision_clip_max_position_embeddings(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)
+
+    def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
+        self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
+
+    def add_vision_clip_max_slices(self, value: int) -> None:
+        self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value)
+
+    def add_vision_clip_select_layer(self, value: int) -> None:
+        self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value)
+
+    def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
+        self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value)
+
+    def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
+        self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)
+
+    def add_vision_clip_image_mean(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Vision.IMAGE_MEAN, value)
+
+    def add_vision_clip_image_std(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Vision.IMAGE_STD, value)
 
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
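 
A rough usage sketch of the new writer helpers (file name, architecture string, and values are placeholders; the real call sites are in LlamaModel.set_vocab in convert_hf_to_gguf.py above):

    import gguf
    from gguf.constants import CLIPPatchMergeType, CLIPProjectorType

    writer = gguf.GGUFWriter("out.gguf", "llama")
    writer.add_vision_type("clip-vit")
    writer.add_vision_image_size(336)
    writer.add_vision_patch_size(14)
    writer.add_vision_clip_architecture("llava")
    writer.add_vision_clip_block_count(24)
    writer.add_vision_clip_projector_type(CLIPProjectorType.MLP)
    writer.add_vision_clip_patch_merge_type(CLIPPatchMergeType.FLAT)
    writer.add_vision_clip_layer_norm_epsilon(1e-05)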
