
Commit c3a654c: add SmolVLM
1 parent 25a97ce

File tree: 9 files changed, +171 -10 lines changed

convert_hf_to_gguf.py

Lines changed: 28 additions & 9 deletions

@@ -292,7 +292,10 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_vit_head_count(self.vparams["num_attention_heads"])
         self.gguf_writer.add_vision_vit_image_mean(self.preprocessor_config["image_mean"])
         self.gguf_writer.add_vision_vit_image_std(self.preprocessor_config["image_std"])
-        self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
+        try:
+            self.gguf_writer.add_vision_vit_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))
+        except KeyError:
+            self.gguf_writer.add_vision_vit_select_layer(0)
 
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")

@@ -506,8 +509,9 @@ def load_hparams(dir_model: Path):
             hparams = json.load(f)
         if "text_config" in hparams:
             text_config = hparams["text_config"]
+            model_id = text_config.get("_name_or_path", None)
             # for example, llava-1.5-7b-hf misses the language model config, need to retrieve it via model ID
-            if "_name_or_path" in text_config:
+            if model_id is not None and model_id != "None" and model_id != "":
                 text_config = AutoConfig.from_pretrained(text_config["_name_or_path"]).to_dict()
             hparams = {**text_config, **hparams}
         return hparams

@@ -1616,7 +1620,7 @@ def prepare_tensors(self):
                raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "LlavaForConditionalGeneration", "MobileLlamaForCausalLM", "Idefics3ForConditionalGeneration")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 

@@ -1640,6 +1644,11 @@ def __init__(self, *args, **kwargs):
             self.preprocessor_config = AutoImageProcessor.from_pretrained(vision_model_id).to_dict()
             self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM
 
+        if "vision_config" in self.hparams and model_type == "idefics3":
+            self.vparams = self.hparams["vision_config"]
+            self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
+            self.vision_arch = gguf.MODEL_ARCH.VISION_IDEFICS3
+
         if self.vparams is not None and self.vision_arch is not None:
             self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
 

@@ -1694,14 +1703,20 @@ def set_gguf_parameters(self):
 
         # For vision model
         if self.vparams is not None:
+            max_pos_embd = -1
             self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
             # TODO: should not hardcode these, but they are currently missing from config.json
             if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
                 self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
+                max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
                 self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
+                max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
+            if self.vision_arch == gguf.MODEL_ARCH.VISION_IDEFICS3:
+                self.gguf_writer.add_vision_vit_projector_type(gguf.constants.CLIPProjectorType.MLP)
+                self.gguf_writer.add_vision_vit_scale_factor(self.hparams["scale_factor"])
+                max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
             self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-05)
-            max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
             self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
 
     @staticmethod

@@ -1717,19 +1732,23 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name
 
         # For vision model
         if name.startswith("language_model"):
            name = name.replace("language_model.", "")
+        if name.startswith("model.text_model"):
+           name = name.replace("text_model.", "") # for SmolVLM
         else:
            name = name.replace("model.vision_tower.", "")
-        if "post_layernorm" in name:
+        if "post_layernorm" in name and self.vision_arch != gguf.MODEL_ARCH.VISION_IDEFICS3:
            return [] # skip post_layernorm
 
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+        if not is_vision_tensor:
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately
         if name.find("block_sparse_moe.experts") != -1:
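
Note on the max_pos_embd arithmetic above: the LLaVA and MobileVLM paths reserve one extra position for a leading class token, while the Idefics3/SmolVLM path counts patch positions only. A minimal sketch of that computation, using illustrative config values (image_size=384 and patch_size=14 are assumptions, not values read from this commit):

    # Sketch of the max_pos_embd computation from set_gguf_parameters above.
    # The vision_config values below are illustrative placeholders.
    vparams = {"image_size": 384, "patch_size": 14}

    patches_per_side = vparams["image_size"] // vparams["patch_size"]  # 27
    max_pos_embd_clip = patches_per_side ** 2 + 1   # LLaVA / MobileVLM: +1 for the class token
    max_pos_embd_idefics3 = patches_per_side ** 2   # Idefics3 / SmolVLM: patch positions only

    print(max_pos_embd_clip, max_pos_embd_idefics3)  # 730 729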

gguf-py/gguf/constants.py

Lines changed: 19 additions & 0 deletions

@@ -238,6 +238,7 @@ class Vit:
         PATCH_MERGE_TYPE = "vision.vit.patch_merge_type"
         HEAD_COUNT = "vision.vit.attention.head_count"
         LAYERNORM_EPS = "vision.vit.attention.layer_norm_epsilon"
+        SCALE_FACTOR = "vision.vit.scale_factor" # only used by idefics3 for now
 
 #
 # recommended mapping of model tensor names for storage in gguf

@@ -311,6 +312,7 @@ class MODEL_ARCH(IntEnum):
     VISION_LLAVA = auto()
     VISION_MOBILEVLM = auto()
     VISION_MINICPMV = auto()
+    VISION_IDEFICS3 = auto()
 
 
 class MODEL_TENSOR(IntEnum):

@@ -441,6 +443,7 @@ class MODEL_TENSOR(IntEnum):
     POSNET_ATTN_OUT = auto()
     # vision
     V_MMPROJ = auto()
+    V_MMPROJ_FC = auto()
     V_MMPROJ_MLP = auto()
     V_MMPROJ_PEG = auto()
     V_ENC_EMBD_CLS = auto()

@@ -535,6 +538,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.VISION_LLAVA: "llava",
     MODEL_ARCH.VISION_MOBILEVLM: "mobilevlm",
     MODEL_ARCH.VISION_MINICPMV: "minicpmv",
+    MODEL_ARCH.VISION_IDEFICS3: "idefics3",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@@ -664,6 +668,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
     # vision
     MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}",
+    MODEL_TENSOR.V_MMPROJ_FC: "v.mmproj.fc",
     MODEL_TENSOR.V_MMPROJ_MLP: "v.mmproj.mlp.{bid}",
     MODEL_TENSOR.V_MMPROJ_PEG: "v.mmproj.peg.{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls",

@@ -1695,6 +1700,20 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_TOK_EMBD_SLICE,
         MODEL_TENSOR.V_TOK_EMBD_END_SLICE,
     ],
+    MODEL_ARCH.VISION_IDEFICS3: [
+        MODEL_TENSOR.V_MMPROJ_FC,
+        MODEL_TENSOR.V_ENC_EMBD_PATCH,
+        MODEL_TENSOR.V_ENC_EMBD_POS,
+        MODEL_TENSOR.V_ENC_ATTN_Q,
+        MODEL_TENSOR.V_ENC_ATTN_K,
+        MODEL_TENSOR.V_ENC_ATTN_V,
+        MODEL_TENSOR.V_ENC_INPUT_NORM,
+        MODEL_TENSOR.V_ENC_OUTPUT,
+        MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.V_ENC_FFN_UP,
+        MODEL_TENSOR.V_ENC_FFN_DOWN,
+        MODEL_TENSOR.V_POST_NORM,
+    ],
     # TODO
 }
 
gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions

@@ -928,6 +928,9 @@ def add_vision_vit_image_mean(self, value: Sequence[float]) -> None:
     def add_vision_vit_image_std(self, value: Sequence[float]) -> None:
         self.add_array(Keys.Vision.IMAGE_STD, value)
 
+    def add_vision_vit_scale_factor(self, value: int) -> None:
+        self.add_int32(Keys.Vision.Vit.SCALE_FACTOR, value)
+
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
             template_default = None
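
For context, the new setter is used like any other GGUF metadata call. A hypothetical usage sketch (the output path, arch string, and the value 3 are illustrative; only add_vision_vit_scale_factor comes from this change):

    # Hypothetical usage sketch for the new "vision.vit.scale_factor" key.
    import gguf

    writer = gguf.GGUFWriter("smolvlm-test.gguf", arch="llama")  # illustrative path and arch
    writer.add_vision_vit_scale_factor(3)  # stored as int32 under Keys.Vision.Vit.SCALE_FACTOR
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()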

gguf-py/gguf/tensor_mapping.py

Lines changed: 15 additions & 0 deletions

@@ -794,6 +794,10 @@ class TensorNameMap:
             "multi_modal_projector.linear_{bid}",
         ),
 
+        MODEL_TENSOR.V_MMPROJ_FC: (
+            "model.connector.modality_projection.proj", # SmolVLM
+        ),
+
         MODEL_TENSOR.V_MMPROJ_MLP: (
             "model.mm_projector.mlp.mlp.{bid}",
         ),

@@ -809,51 +813,61 @@ class TensorNameMap:
         MODEL_TENSOR.V_ENC_EMBD_PATCH: (
             "vision_tower.vision_model.embeddings.patch_embedding",
             "vpm.embeddings.patch_embedding",
+            "model.vision_model.embeddings.patch_embedding", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_POS: (
             "vision_tower.vision_model.embeddings.position_embedding",
             "vpm.embeddings.position_embedding",
+            "model.vision_model.embeddings.position_embedding", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_V: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
             "vpm.encoder.layers.{bid}.layer_norm1",
+            "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_OUTPUT: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
             "vpm.encoder.layers.{bid}.layer_norm2",
+            "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
             "vpm.encoder.layers.{bid}.mlp.fc1",
+            "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_FFN_DOWN: (
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
             "vpm.encoder.layers.{bid}.mlp.fc2",
+            "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM
         ),
 
         MODEL_TENSOR.V_PRE_NORM: (

@@ -862,6 +876,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_POST_NORM: (
             "vision_tower.vision_model.post_layernorm",
+            "model.vision_model.post_layernorm", # SmolVLM
         ),
 
         MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
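
The entries above are name templates, with {bid} standing for the encoder block index; the GGUF-side names they resolve to are listed per architecture in src/llama-arch.cpp below (e.g. "v.enc.blk.%d.attn_q"). A self-contained sketch of that resolution step, not the actual TensorNameMap implementation (the 27-block count is illustrative):

    # Minimal sketch: resolving a SmolVLM checkpoint tensor name to its GGUF name.
    HF_PATTERNS = {
        "model.vision_model.encoder.layers.{bid}.self_attn.q_proj": "v.enc.blk.{bid}.attn_q",
        "model.connector.modality_projection.proj": "v.mmproj.fc",
    }

    def map_name(hf_name: str, n_blocks: int) -> str | None:
        for pattern, gguf_name in HF_PATTERNS.items():
            for bid in range(n_blocks):
                if hf_name == pattern.format(bid=bid):
                    return gguf_name.format(bid=bid)
        return None

    print(map_name("model.vision_model.encoder.layers.0.self_attn.q_proj", n_blocks=27))
    # -> v.enc.blk.0.attn_q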

src/llama-arch.cpp

Lines changed: 21 additions & 0 deletions

@@ -66,6 +66,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_VISION_LLAVA, "llava" },
     { LLM_ARCH_VISION_MOBILEVLM, "mobilevlm" },
     { LLM_ARCH_VISION_MINICPMV, "minicpmv" },
+    { LLM_ARCH_VISION_IDEFICS3, "idefics3" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 

@@ -214,6 +215,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VISION_VIT_PATCH_MERGE_TYPE, "vision.vit.patch_merge_type" },
     { LLM_KV_VISION_VIT_HEAD_COUNT, "vision.vit.attention.head_count" },
     { LLM_KV_VISION_VIT_LAYERNORM_EPS, "vision.vit.attention.layer_norm_epsilon" },
+    { LLM_KV_VISION_VIT_SCALE_FACTOR, "vision.vit.scale_factor" },
 
     // deprecated
     { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },

@@ -1388,6 +1390,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_V_TOK_EMBD_END_SLICE, "v.tok_embd.end_slice" },
         }
     },
+    {
+        LLM_ARCH_VISION_IDEFICS3,
+        {
+            { LLM_TENSOR_V_MMPROJ_FC, "v.mmproj.fc" },
+            { LLM_TENSOR_V_ENC_EMBD_CLS, "v.enc.embd.cls" },
+            { LLM_TENSOR_V_ENC_EMBD_PATCH, "v.enc.embd.patch" },
+            { LLM_TENSOR_V_ENC_EMBD_POS, "v.enc.embd.pos" },
+            { LLM_TENSOR_V_ENC_ATTN_Q, "v.enc.blk.%d.attn_q" },
+            { LLM_TENSOR_V_ENC_ATTN_K, "v.enc.blk.%d.attn_k" },
+            { LLM_TENSOR_V_ENC_ATTN_V, "v.enc.blk.%d.attn_v" },
+            { LLM_TENSOR_V_ENC_INPUT_NORM, "v.enc.blk.%d.input_norm" },
+            { LLM_TENSOR_V_ENC_OUTPUT, "v.enc.blk.%d.output" },
+            { LLM_TENSOR_V_ENC_OUTPUT_NORM, "v.enc.blk.%d.output_norm" },
+            { LLM_TENSOR_V_ENC_FFN_UP, "v.enc.blk.%d.ffn_up" },
+            { LLM_TENSOR_V_ENC_FFN_DOWN, "v.enc.blk.%d.ffn_down" },
+            { LLM_TENSOR_V_PRE_NORM, "v.pre_norm" },
+            { LLM_TENSOR_V_POST_NORM, "v.post_norm" },
+        }
+    },
     {
         LLM_ARCH_UNKNOWN,
         {

src/llama-arch.h

Lines changed: 3 additions & 0 deletions

@@ -70,6 +70,7 @@ enum llm_arch {
     LLM_ARCH_VISION_LLAVA,
     LLM_ARCH_VISION_MOBILEVLM,
     LLM_ARCH_VISION_MINICPMV,
+    LLM_ARCH_VISION_IDEFICS3,
     LLM_ARCH_UNKNOWN,
 };
 

@@ -218,6 +219,7 @@ enum llm_kv {
     LLM_KV_VISION_VIT_PATCH_MERGE_TYPE,
     LLM_KV_VISION_VIT_HEAD_COUNT,
     LLM_KV_VISION_VIT_LAYERNORM_EPS,
+    LLM_KV_VISION_VIT_SCALE_FACTOR,
 
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,

@@ -354,6 +356,7 @@ enum llm_tensor {
     LLM_TENSOR_POS_NET_ATTN_OUT,
     // vision
     LLM_TENSOR_V_MMPROJ,
+    LLM_TENSOR_V_MMPROJ_FC,
     LLM_TENSOR_V_MMPROJ_MLP,
     LLM_TENSOR_V_MMPROJ_PEG,
     LLM_TENSOR_V_ENC_EMBD_CLS,

src/llama-model.cpp

Lines changed: 38 additions & 0 deletions

@@ -1265,6 +1265,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         ml.get_key(LLM_KV_VISION_VIT_LAYERNORM_EPS, vparams.eps, true);
         ml.get_key(LLM_KV_VISION_VIT_SELECT_LAYER, vparams.select_layer, true);
         ml.get_key(LLM_KV_VISION_VIT_MAX_POS_EMBD, vparams.max_pos_embd, true);
+        ml.get_key(LLM_KV_VISION_VIT_SCALE_FACTOR, vparams.scale_factor, false);
         {
             std::string name;
             ml.get_key(LLM_KV_VISION_VIT_PROJECTOR_TYPE, name, true);

@@ -3555,6 +3556,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    vit.mm_tok_embd_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_SLICE, "weight"), {n_embd});
                    vit.mm_tok_embd_end_slice = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_TOK_EMBD_END_SLICE, "weight"), {n_embd});
 
+                    for (int i = 0; i < n_vlayer; ++i) {
+                        auto & layer = vit.layers[i];
+
+                        layer.k_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "weight", i), {n_vembd, n_vembd}, 0);
+                        layer.k_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_K, "bias" , i), {n_vembd}, 0);
+                        layer.v_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "weight", i), {n_vembd, n_vembd}, 0);
+                        layer.v_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_V, "bias" , i), {n_vembd}, 0);
+                        layer.q_w = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "weight", i), {n_vembd, n_vembd}, 0);
+                        layer.q_b = create_tensor(tn(LLM_TENSOR_V_ENC_ATTN_Q, "bias" , i), {n_vembd}, 0);
+
+                        layer.ffn_up_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "weight", i), {n_vembd, n_vff}, 0);
+                        layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_UP, "bias" , i), {n_vff}, 0);
+                        layer.ffn_down_w = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "weight", i), {n_vff, n_vembd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_V_ENC_FFN_DOWN, "bias" , i), {n_vembd}, 0);
+
+                        layer.norm_in_w = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "weight", i), {n_vembd}, 0);
+                        layer.norm_in_b = create_tensor(tn(LLM_TENSOR_V_ENC_INPUT_NORM, "bias" , i), {n_vembd}, 0);
+                        layer.norm_out_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "weight", i), {n_vembd}, 0);
+                        layer.norm_out_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT_NORM, "bias" , i), {n_vembd}, 0);
+
+                        layer.output_w = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "weight", i), {n_vembd, n_vembd}, 0);
+                        layer.output_b = create_tensor(tn(LLM_TENSOR_V_ENC_OUTPUT, "bias" , i), {n_vembd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_VISION_IDEFICS3:
+                {
+                    int scale_factor = vit.hparams.scale_factor;
+                    vit.projection = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_MMPROJ_FC, "weight"), {n_vembd * scale_factor * scale_factor, n_embd});
+
+                    vit.patch_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_vembd});
+                    vit.patch_bias = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_PATCH, "bias" ), {n_vembd});
+                    vit.position_embeddings = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_ENC_EMBD_POS, "weight"), {n_vembd, max_pos_embd});
+
+                    vit.post_norm_w = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "weight"), {n_vembd});
+                    vit.post_norm_b = ml.create_tensor(ctx_vision, tn(LLM_TENSOR_V_POST_NORM, "bias" ), {n_vembd});
+
                    for (int i = 0; i < n_vlayer; ++i) {
                        auto & layer = vit.layers[i];
 

@@ -4085,6 +4122,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
         case LLM_ARCH_VISION_LLAVA:
         case LLM_ARCH_VISION_MOBILEVLM:
         case LLM_ARCH_VISION_MINICPMV:
+        case LLM_ARCH_VISION_IDEFICS3:
             GGML_ABORT("vision arch does not use RoPE");
 
         // all model arches should be listed explicitly here
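
The v.mmproj.fc shape created above, {n_vembd * scale_factor * scale_factor, n_embd}, implies that scale_factor^2 neighbouring patch embeddings are concatenated before the single linear projection into the language model's embedding space, consistent with Idefics3's pixel-shuffle connector. A small sketch of the shape arithmetic, with illustrative sizes (n_vembd=1152, n_embd=2048, scale_factor=3 and the 27x27 grid are assumptions, not values from this commit):

    # Illustrative shape check for the v.mmproj.fc tensor.
    n_vembd = 1152        # vision encoder hidden size (assumed)
    n_embd = 2048         # language model hidden size (assumed)
    scale_factor = 3      # vision.vit.scale_factor    (assumed)

    fc_in = n_vembd * scale_factor * scale_factor   # 10368: scale_factor^2 patch embeddings concatenated
    fc_out = n_embd                                  # one LLM-sized embedding per merged group
    print(fc_in, fc_out)                             # 10368 2048

    # Visual tokens per 27x27 patch grid before/after merging:
    patches = 27 * 27                                # 729
    print(patches // (scale_factor * scale_factor))  # 81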
