Skip to content

Commit 431bb08

Browse files
committed
change gguf KV from clip to vit
1 parent 4a7ab89 commit 431bb08

File tree

6 files changed

+103
-103
lines changed

6 files changed

+103
-103
lines changed

convert_hf_to_gguf.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -281,17 +281,17 @@ def set_gguf_parameters(self):
281281

282282
# Vision model parameters
283283
if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None:
284-
self.gguf_writer.add_vision_type("clip-vit")
284+
self.gguf_writer.add_vision_type("vit")
285285
self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
286286
self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
287-
self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
288-
self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
289-
self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
290-
self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
291-
self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
292-
self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
293-
self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
294-
self.gguf_writer.add_vision_clip_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
287+
self.gguf_writer.add_vision_vit_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
288+
self.gguf_writer.add_vision_vit_block_count(self.vparams["num_hidden_layers"])
289+
self.gguf_writer.add_vision_vit_embedding_length(self.vparams["hidden_size"])
290+
self.gguf_writer.add_vision_vit_feed_forward_length(self.vparams["intermediate_size"])
291+
self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"])
292+
self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"])
293+
self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"])
294+
self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
295295

296296
self.gguf_writer.add_file_type(self.ftype)
297297
logger.info(f"gguf: file type = {self.ftype}")
@@ -1690,15 +1690,15 @@ def set_gguf_parameters(self):
16901690

16911691
# For vision model
16921692
if self.vparams is not None:
1693-
self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
1693+
self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
16941694
# TODO: should not hardcode these, but they are currently missing from config.json
16951695
if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
1696-
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
1696+
self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
16971697
if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
1698-
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
1699-
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
1698+
self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
1699+
self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05)
17001700
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
1701-
self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
1701+
self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
17021702

17031703
@staticmethod
17041704
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
@@ -2193,11 +2193,11 @@ def set_gguf_parameters(self):
21932193

21942194
# For vision model
21952195
if self.vparams is not None and self.proj_type is not None:
2196-
self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
2197-
self.gguf_writer.add_vision_clip_projector_type(self.proj_type)
2198-
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-06)
2196+
self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
2197+
self.gguf_writer.add_vision_vit_projector_type(self.proj_type)
2198+
self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06)
21992199
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
2200-
self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
2200+
self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
22012201

22022202

22032203
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:

gguf-py/gguf/constants.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -215,29 +215,29 @@ class Adapter:
215215
LORA_ALPHA = "adapter.lora.alpha"
216216

217217
class Vision:
218-
# only support vision.type = "clip-vit" for now
218+
# only support vision.type = "vit" for now
219219
TYPE = "vision.type"
220220
IMAGE_SIZE = "vision.image_size"
221221
PATCH_SIZE = "vision.patch_size"
222222
IMAGE_MEAN = "vision.image_mean"
223223
IMAGE_STD = "vision.image_std"
224224

225-
class Clip:
226-
ARCHITECTURE = "vision.clip.architecture"
227-
CONTEXT_LENGTH = "vision.clip.context_length"
228-
EMBEDDING_LENGTH = "vision.clip.embedding_length"
229-
BLOCK_COUNT = "vision.clip.block_count"
230-
FEED_FORWARD_LENGTH = "vision.clip.feed_forward_length"
231-
PROJECTION_TYPE = "vision.clip.projection_type"
232-
PROJECTION_DIM = "vision.clip.projection_dim"
233-
USE_GELU = "vision.clip.use_gelu"
234-
MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
235-
MAX_SLICES = "vision.clip.max_slices"
236-
PROJECTOR_TYPE = "vision.clip.projector_type"
237-
SELECT_LAYER = "vision.clip.select_layer"
238-
PATCH_MERGE_TYPE = "vision.clip.patch_merge_type"
239-
HEAD_COUNT = "vision.clip.attention.head_count"
240-
LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"
225+
class Vit:
226+
ARCHITECTURE = "vision.vit.architecture"
227+
CONTEXT_LENGTH = "vision.vit.context_length"
228+
EMBEDDING_LENGTH = "vision.vit.embedding_length"
229+
BLOCK_COUNT = "vision.vit.block_count"
230+
FEED_FORWARD_LENGTH = "vision.vit.feed_forward_length"
231+
PROJECTION_TYPE = "vision.vit.projection_type"
232+
PROJECTION_DIM = "vision.vit.projection_dim"
233+
USE_GELU = "vision.vit.use_gelu"
234+
MAX_POS_EMBEDDING = "vision.vit.max_position_embeddings"
235+
MAX_SLICES = "vision.vit.max_slices"
236+
PROJECTOR_TYPE = "vision.vit.projector_type"
237+
SELECT_LAYER = "vision.vit.select_layer"
238+
PATCH_MERGE_TYPE = "vision.vit.patch_merge_type"
239+
HEAD_COUNT = "vision.vit.attention.head_count"
240+
LAYERNORM_EPS = "vision.vit.attention.layer_norm_epsilon"
241241

242242
#
243243
# recommended mapping of model tensor names for storage in gguf

gguf-py/gguf/gguf_writer.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -886,46 +886,46 @@ def add_vision_image_size(self, value: int) -> None:
886886
def add_vision_patch_size(self, value: int) -> None:
887887
self.add_uint32(Keys.Vision.PATCH_SIZE, value)
888888

889-
def add_vision_clip_architecture(self, value: str) -> None:
890-
self.add_string(Keys.Vision.Clip.ARCHITECTURE, value)
889+
def add_vision_vit_architecture(self, value: str) -> None:
890+
self.add_string(Keys.Vision.Vit.ARCHITECTURE, value)
891891

892-
def add_vision_clip_context_length(self, value: int) -> None:
893-
self.add_uint32(Keys.Vision.Clip.CONTEXT_LENGTH, value)
892+
def add_vision_vit_context_length(self, value: int) -> None:
893+
self.add_uint32(Keys.Vision.Vit.CONTEXT_LENGTH, value)
894894

895-
def add_vision_clip_embedding_length(self, value: int) -> None:
896-
self.add_uint32(Keys.Vision.Clip.EMBEDDING_LENGTH, value)
895+
def add_vision_vit_embedding_length(self, value: int) -> None:
896+
self.add_uint32(Keys.Vision.Vit.EMBEDDING_LENGTH, value)
897897

898-
def add_vision_clip_block_count(self, value: int) -> None:
899-
self.add_uint32(Keys.Vision.Clip.BLOCK_COUNT, value)
898+
def add_vision_vit_block_count(self, value: int) -> None:
899+
self.add_uint32(Keys.Vision.Vit.BLOCK_COUNT, value)
900900

901-
def add_vision_clip_feed_forward_length(self, value: int) -> None:
902-
self.add_uint32(Keys.Vision.Clip.FEED_FORWARD_LENGTH, value)
901+
def add_vision_vit_feed_forward_length(self, value: int) -> None:
902+
self.add_uint32(Keys.Vision.Vit.FEED_FORWARD_LENGTH, value)
903903

904-
def add_vision_clip_head_count(self, value: int) -> None:
905-
self.add_uint32(Keys.Vision.Clip.HEAD_COUNT, value)
904+
def add_vision_vit_head_count(self, value: int) -> None:
905+
self.add_uint32(Keys.Vision.Vit.HEAD_COUNT, value)
906906

907-
def add_vision_clip_max_position_embeddings(self, value: int) -> None:
908-
self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)
907+
def add_vision_vit_max_position_embeddings(self, value: int) -> None:
908+
self.add_uint32(Keys.Vision.Vit.MAX_POS_EMBEDDING, value)
909909

910-
def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
911-
self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
910+
def add_vision_vit_projector_type(self, value: CLIPProjectorType) -> None:
911+
self.add_string(Keys.Vision.Vit.PROJECTOR_TYPE, value.value)
912912

913-
def add_vision_clip_max_slices(self, value: int) -> None:
914-
self.add_uint32(Keys.Vision.Clip.MAX_SLICES, value)
913+
def add_vision_vit_max_slices(self, value: int) -> None:
914+
self.add_uint32(Keys.Vision.Vit.MAX_SLICES, value)
915915

916-
def add_vision_clip_select_layer(self, value: int) -> None:
917-
self.add_int32(Keys.Vision.Clip.SELECT_LAYER, value)
916+
def add_vision_vit_select_layer(self, value: int) -> None:
917+
self.add_int32(Keys.Vision.Vit.SELECT_LAYER, value)
918918

919-
def add_vision_clip_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
920-
self.add_string(Keys.Vision.Clip.PATCH_MERGE_TYPE, value.value)
919+
def add_vision_vit_patch_merge_type(self, value: CLIPPatchMergeType) -> None:
920+
self.add_string(Keys.Vision.Vit.PATCH_MERGE_TYPE, value.value)
921921

922-
def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
923-
self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)
922+
def add_vision_vit_layer_norm_epsilon(self, value: float) -> None:
923+
self.add_float32(Keys.Vision.Vit.LAYERNORM_EPS, value)
924924

925-
def add_vision_clip_image_mean(self, value: Sequence[float]) -> None:
925+
def add_vision_vit_image_mean(self, value: Sequence[float]) -> None:
926926
self.add_array(Keys.Vision.IMAGE_MEAN, value)
927927

928-
def add_vision_clip_image_std(self, value: Sequence[float]) -> None:
928+
def add_vision_vit_image_std(self, value: Sequence[float]) -> None:
929929
self.add_array(Keys.Vision.IMAGE_STD, value)
930930

931931
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:

src/llama-arch.cpp

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -195,21 +195,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
195195
{ LLM_KV_VISION_PATCH_SIZE, "vision.patch_size" },
196196
{ LLM_KV_VISION_IMAGE_MEAN, "vision.image_mean" },
197197
{ LLM_KV_VISION_IMAGE_STD, "vision.image_std" },
198-
{ LLM_KV_VISION_CLIP_ARCHITECTURE, "vision.clip.architecture" },
199-
{ LLM_KV_VISION_CLIP_CONTEXT_LENGTH, "vision.clip.context_length" },
200-
{ LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, "vision.clip.embedding_length" },
201-
{ LLM_KV_VISION_CLIP_BLOCK_COUNT, "vision.clip.block_count" },
202-
{ LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, "vision.clip.feed_forward_length" },
203-
{ LLM_KV_VISION_CLIP_PROJECTION_TYPE, "vision.clip.projection_type" },
204-
{ LLM_KV_VISION_CLIP_PROJECTION_DIM, "vision.clip.projection_dim" },
205-
{ LLM_KV_VISION_CLIP_USE_GELU, "vision.clip.use_gelu" },
206-
{ LLM_KV_VISION_CLIP_MAX_POS_EMBD, "vision.clip.max_position_embeddings" },
207-
{ LLM_KV_VISION_CLIP_MAX_SLICES, "vision.clip.max_slices" },
208-
{ LLM_KV_VISION_CLIP_PROJECTOR_TYPE, "vision.clip.projector_type" },
209-
{ LLM_KV_VISION_CLIP_SELECT_LAYER, "vision.clip.select_layer" },
210-
{ LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, "vision.clip.patch_merge_type" },
211-
{ LLM_KV_VISION_CLIP_HEAD_COUNT, "vision.clip.attention.head_count" },
212-
{ LLM_KV_VISION_CLIP_LAYERNORM_EPS, "vision.clip.attention.layer_norm_epsilon" },
198+
{ LLM_KV_VISION_VIT_ARCHITECTURE, "vision.vit.architecture" },
199+
{ LLM_KV_VISION_VIT_CONTEXT_LENGTH, "vision.vit.context_length" },
200+
{ LLM_KV_VISION_VIT_EMBEDDING_LENGTH, "vision.vit.embedding_length" },
201+
{ LLM_KV_VISION_VIT_BLOCK_COUNT, "vision.vit.block_count" },
202+
{ LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, "vision.vit.feed_forward_length" },
203+
{ LLM_KV_VISION_VIT_PROJECTION_TYPE, "vision.vit.projection_type" },
204+
{ LLM_KV_VISION_VIT_PROJECTION_DIM, "vision.vit.projection_dim" },
205+
{ LLM_KV_VISION_VIT_USE_GELU, "vision.vit.use_gelu" },
206+
{ LLM_KV_VISION_VIT_MAX_POS_EMBD, "vision.vit.max_position_embeddings" },
207+
{ LLM_KV_VISION_VIT_MAX_SLICES, "vision.vit.max_slices" },
208+
{ LLM_KV_VISION_VIT_PROJECTOR_TYPE, "vision.vit.projector_type" },
209+
{ LLM_KV_VISION_VIT_SELECT_LAYER, "vision.vit.select_layer" },
210+
{ LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, "vision.vit.patch_merge_type" },
211+
{ LLM_KV_VISION_VIT_HEAD_COUNT, "vision.vit.attention.head_count" },
212+
{ LLM_KV_VISION_VIT_LAYERNORM_EPS, "vision.vit.attention.layer_norm_epsilon" },
213213

214214
// deprecated
215215
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },

src/llama-arch.h

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -205,21 +205,21 @@ enum llm_kv {
205205
LLM_KV_VISION_PATCH_SIZE,
206206
LLM_KV_VISION_IMAGE_MEAN,
207207
LLM_KV_VISION_IMAGE_STD,
208-
LLM_KV_VISION_CLIP_ARCHITECTURE,
209-
LLM_KV_VISION_CLIP_CONTEXT_LENGTH,
210-
LLM_KV_VISION_CLIP_EMBEDDING_LENGTH,
211-
LLM_KV_VISION_CLIP_BLOCK_COUNT,
212-
LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH,
213-
LLM_KV_VISION_CLIP_PROJECTION_TYPE,
214-
LLM_KV_VISION_CLIP_PROJECTION_DIM,
215-
LLM_KV_VISION_CLIP_USE_GELU,
216-
LLM_KV_VISION_CLIP_MAX_POS_EMBD,
217-
LLM_KV_VISION_CLIP_MAX_SLICES,
218-
LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
219-
LLM_KV_VISION_CLIP_SELECT_LAYER,
220-
LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE,
221-
LLM_KV_VISION_CLIP_HEAD_COUNT,
222-
LLM_KV_VISION_CLIP_LAYERNORM_EPS,
208+
LLM_KV_VISION_VIT_ARCHITECTURE,
209+
LLM_KV_VISION_VIT_CONTEXT_LENGTH,
210+
LLM_KV_VISION_VIT_EMBEDDING_LENGTH,
211+
LLM_KV_VISION_VIT_BLOCK_COUNT,
212+
LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH,
213+
LLM_KV_VISION_VIT_PROJECTION_TYPE,
214+
LLM_KV_VISION_VIT_PROJECTION_DIM,
215+
LLM_KV_VISION_VIT_USE_GELU,
216+
LLM_KV_VISION_VIT_MAX_POS_EMBD,
217+
LLM_KV_VISION_VIT_MAX_SLICES,
218+
LLM_KV_VISION_VIT_PROJECTOR_TYPE,
219+
LLM_KV_VISION_VIT_SELECT_LAYER,
220+
LLM_KV_VISION_VIT_PATCH_MERGE_TYPE,
221+
LLM_KV_VISION_VIT_HEAD_COUNT,
222+
LLM_KV_VISION_VIT_LAYERNORM_EPS,
223223

224224
// deprecated:
225225
LLM_KV_TOKENIZER_PREFIX_ID,

src/llama-model.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1251,36 +1251,36 @@ void llama_model::load_hparams(llama_model_loader & ml) {
12511251
auto & vparams = clip.hparams;
12521252
std::string vision_type;
12531253
ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
1254-
if (vision_type == "clip-vit") {
1255-
LLAMA_LOG_INFO("%s: loading clip-vit vision model\n", __func__);
1254+
if (vision_type == "vit") {
1255+
LLAMA_LOG_INFO("%s: loading ViT vision model\n", __func__);
12561256
has_vision = true;
12571257
ml.get_key(LLM_KV_VISION_IMAGE_SIZE, vparams.image_size, true);
12581258
ml.get_key(LLM_KV_VISION_PATCH_SIZE, vparams.patch_size, true);
12591259
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, vparams.image_mean, 3, true);
12601260
ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, vparams.image_std, 3, true);
1261-
ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, vparams.hidden_size, true);
1262-
ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, vparams.n_layer, true);
1263-
ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
1264-
ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, vparams.n_head, true);
1265-
ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, vparams.eps, true);
1266-
ml.get_key(LLM_KV_VISION_CLIP_SELECT_LAYER, vparams.select_layer, true);
1267-
ml.get_key(LLM_KV_VISION_CLIP_MAX_POS_EMBD, vparams.max_pos_embd, true);
1261+
ml.get_key(LLM_KV_VISION_VIT_EMBEDDING_LENGTH, vparams.hidden_size, true);
1262+
ml.get_key(LLM_KV_VISION_VIT_BLOCK_COUNT, vparams.n_layer, true);
1263+
ml.get_key(LLM_KV_VISION_VIT_FEED_FORWARD_LENGTH, vparams.n_intermediate, true);
1264+
ml.get_key(LLM_KV_VISION_VIT_HEAD_COUNT, vparams.n_head, true);
1265+
ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS, vparams.eps, true);
1266+
ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER, vparams.select_layer, true);
1267+
ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD, vparams.max_pos_embd, true);
12681268
{
12691269
std::string name;
1270-
ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, name, true);
1270+
ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);
12711271
vparams.proj_type = clip_projector_type_from_name(name);
12721272
if (vparams.proj_type == CLIP_PROJECTOR_TYPE_UNKNOWN) {
12731273
throw std::runtime_error(format("unsupported clip projector type: %s", name.c_str()));
12741274
}
12751275
}
12761276
{
12771277
std::string name;
1278-
ml.get_key(LLM_KV_VISION_CLIP_PATCH_MERGE_TYPE, name, false);
1278+
ml.get_key(LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, name, false);
12791279
vparams.mm_patch_merge_type = mm_patch_merge_from_name(name);
12801280
}
12811281
{
12821282
std::string arch;
1283-
ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
1283+
ml.get_key(LLM_KV_VISION_VIT_ARCHITECTURE, arch, true);
12841284
vparams.arch = vision_arch_from_string(arch);
12851285
if (vparams.arch == VISION_ARCH_UNKNOWN) {
12861286
throw std::runtime_error(format("unsupported vision arch: %s", arch.c_str()));

0 commit comments

Comments
 (0)