
Commit 85c7cda

mtmd: fix vision model processing
1 parent b6b9f02 commit 85c7cda

4 files changed: 111 additions & 47 deletions

convert_hf_to_gguf.py

Lines changed: 39 additions & 24 deletions
@@ -1445,7 +1445,7 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
 
-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "width.clip-l-14-224.layers", "sam_vit_b.layers"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers"]
 
     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False
@@ -1494,8 +1494,8 @@ def __init__(self, *args, **kwargs):
         # FIXME: DeepseekOCRVisionModel specific hack
         if self.block_count is None:
             if isinstance(self, DeepseekOCRVisionModel):
-                clip_block_count = self.hparams['width']['clip-l-14-224']['layers']
-                sam_block_count = self.hparams['width']['sam_vit_b']['layers']
+                print(self.hparams)
+                clip_block_count = self.hparams['layers']
                 if clip_block_count is not None:
                     self.block_count = clip_block_count
                 if sam_block_count is not None:
@@ -5793,6 +5793,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 @ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        proc_fname = self.dir_model / "processor_config.json"
+
+        if proc_fname.is_file():
+            with open(proc_fname, "r") as f:
+                self.preprocessor_config = json.load(f)
+
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -5811,10 +5821,25 @@ def set_gguf_parameters(self):
             # in this case, we are converting a test model
            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
 
+        # SAM configuration
+        sam_hparams = hparams['sam']
+        self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers'])
+        self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width'])
+
     def get_vision_config(self) -> dict[str, Any]:
-        orig_vision_config = self.global_config.get("vision_config")
+        vision_config: dict[str, Any] | None = self.global_config.get("vision_config")
+
+        if not vision_config:
+            raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
+
+        vision_config['sam'] = vision_config['width']['sam_vit_b']
+        vision_config.update(vision_config['width']['clip-l-14-224'])
+        vision_config['hidden_size'] = vision_config['width']
+        vision_config['num_heads'] = vision_config['heads']
+        vision_config['intermediate_size'] = vision_config['heads'] * 4
+
+        return vision_config
 
-        super().get_vision_config()
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         # related to https://github.com/ggml-org/llama.cpp/issues/13025
@@ -5825,27 +5850,17 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
         return super().tensor_force_quant(name, new_name, bid, n_dims)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        if "vision_model.head." in name:
-            return []  # skip redundant tensors for tinygemma3
-
-        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
-            or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
-            # process vision tensors
-            name = name.replace("_weight", ".weight")
-
-            # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector
-            # the other norm values are part of SigLIP model, and they are already correct
-            # ref code: Gemma3RMSNorm
-            if "soft_emb_norm.weight" in name:
-                logger.info(f"Correcting norm value for '{name}'")
-                data_torch = data_torch + 1
-
-            return [(self.map_tensor_name(name), data_torch)]
+        # Only process vision-related tensors, skip language model tensors
+        # Vision components: sam_model, vision_model, projector, image_newline, view_seperator
+        # Language model components to skip: lm_head, embed_tokens, layers, norm
+        if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")):
+            return []
 
-        return []  # skip other tensors
+        if ".attn.rel_pos_h" in name or ".attn.rel_pos_w" in name:
+            return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)]
 
+        return [(self.map_tensor_name(name), data_torch)]
+
 
 @ModelBase.register("Gemma3nForConditionalGeneration")
 class Gemma3NModel(Gemma3Model):
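Note: a minimal sketch (not part of the commit) of the flattening that the new get_vision_config() performs on a DeepSeek-OCR style vision_config. The nested entries and numeric values below are illustrative placeholders, not the real config.

# Illustrative stand-in for global_config["vision_config"]; values are made up.
vision_config = {
    "width": {
        "clip-l-14-224": {"layers": 24, "width": 1024, "heads": 16},
        "sam_vit_b":     {"layers": 12, "width": 768,  "heads": 12},
    },
}

vision_config["sam"] = vision_config["width"]["sam_vit_b"]      # SAM hparams kept under their own key
vision_config.update(vision_config["width"]["clip-l-14-224"])   # hoist CLIP hparams to the top level
vision_config["hidden_size"] = vision_config["width"]           # "width" is now the CLIP width (1024), not the nested dict
vision_config["num_heads"] = vision_config["heads"]
vision_config["intermediate_size"] = vision_config["heads"] * 4

print(vision_config["hidden_size"], vision_config["num_heads"], vision_config["sam"]["layers"])  # 1024 16 12

This flattened dict is what lets hparams['sam'] in set_gguf_parameters() and the new 'layers' entry in n_block_keys resolve after the change.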

gguf-py/gguf/constants.py

Lines changed: 36 additions & 14 deletions
@@ -287,6 +287,10 @@ class Attention:
         class Projector:
             SCALE_FACTOR = "clip.vision.projector.scale_factor"
 
+        class SAM:
+            BLOCK_COUNT = "clip.vision.sam.block_count"
+            EMBEDDING_LENGTH = "clip.vision.sam.embedding_length"
+
     class ClipAudio:
         NUM_MEL_BINS = "clip.audio.num_mel_bins"
         EMBEDDING_LENGTH = "clip.audio.embedding_length"
@@ -664,20 +668,21 @@ class MODEL_TENSOR(IntEnum):
     V_MM_GATE = auto()  # cogvlm
     V_TOK_BOI = auto()  # cogvlm
     V_TOK_EOI = auto()  # cogvlm
-    # DeepSeek-OCR sam_model
-    V_SAM_POS_EMBD = auto()
-    V_SAM_PATCH_EMBD = auto()
-    V_SAM_PRE_NORM = auto()
-    V_SAM_POST_NORM = auto()
-    V_SAM_ATTN_POS_H = auto()
-    V_SAM_ATTN_POS_W = auto()
-    V_SAM_ATTN_QKV = auto()
-    V_SAM_ATTN_OUT = auto()
-    V_SAM_MLP_LIN_1 = auto()
-    V_SAM_MLP_LIN_2 = auto()
-    V_SAM_NECK = auto()
-    V_SAM_NET_2 = auto()
-    V_SAM_NET_3 = auto()
+    V_SAM_POS_EMBD = auto()  # Deepseek-OCR
+    V_SAM_PATCH_EMBD = auto()  # Deepseek-OCR
+    V_SAM_PRE_NORM = auto()  # Deepseek-OCR
+    V_SAM_POST_NORM = auto()  # Deepseek-OCR
+    V_SAM_ATTN_POS_H = auto()  # Deepseek-OCR
+    V_SAM_ATTN_POS_W = auto()  # Deepseek-OCR
+    V_SAM_ATTN_QKV = auto()  # Deepseek-OCR
+    V_SAM_ATTN_OUT = auto()  # Deepseek-OCR
+    V_SAM_MLP_LIN_1 = auto()  # Deepseek-OCR
+    V_SAM_MLP_LIN_2 = auto()  # Deepseek-OCR
+    V_SAM_NECK = auto()  # Deepseek-OCR
+    V_SAM_NET_2 = auto()  # Deepseek-OCR
+    V_SAM_NET_3 = auto()  # Deepseek-OCR
+    V_ENC_EMBD_IMGNL = auto()  # Deepseek-OCR
+    V_ENC_EMBD_VSEP = auto()  # Deepseek-OCR
 
     # audio (mtmd)
     A_ENC_EMBD_POS = auto()
@@ -1059,6 +1064,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}",
     MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2",
     MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
+    MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline_embd",  # Deepseek-OCR
+    MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_separator_embd",  # Deepseek-OCR
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
@@ -1095,6 +1102,8 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_ENC_EMBD_CLS,
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
         MODEL_TENSOR.V_ENC_EMBD_POS,
+        MODEL_TENSOR.V_ENC_EMBD_IMGNL,
+        MODEL_TENSOR.V_ENC_EMBD_VSEP,
         MODEL_TENSOR.V_ENC_INPUT_NORM,
         MODEL_TENSOR.V_ENC_ATTN_QKV,
         MODEL_TENSOR.V_ENC_ATTN_Q,
@@ -1137,6 +1146,19 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_MM_GATE,
         MODEL_TENSOR.V_TOK_BOI,
         MODEL_TENSOR.V_TOK_EOI,
+        MODEL_TENSOR.V_SAM_POS_EMBD,
+        MODEL_TENSOR.V_SAM_PATCH_EMBD,
+        MODEL_TENSOR.V_SAM_PRE_NORM,
+        MODEL_TENSOR.V_SAM_POST_NORM,
+        MODEL_TENSOR.V_SAM_ATTN_POS_H,
+        MODEL_TENSOR.V_SAM_ATTN_POS_W,
+        MODEL_TENSOR.V_SAM_ATTN_QKV,
+        MODEL_TENSOR.V_SAM_ATTN_OUT,
+        MODEL_TENSOR.V_SAM_MLP_LIN_1,
+        MODEL_TENSOR.V_SAM_MLP_LIN_2,
+        MODEL_TENSOR.V_SAM_NECK,
+        MODEL_TENSOR.V_SAM_NET_2,
+        MODEL_TENSOR.V_SAM_NET_3,
         # audio
         MODEL_TENSOR.A_ENC_EMBD_POS,
         MODEL_TENSOR.A_ENC_CONV1D,
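As a sanity-check sketch (not from the commit), this is how the new constants surface through gguf-py. It assumes the list extended in the last hunk is MODEL_TENSORS[MODEL_ARCH.MMPROJ], which the hunk context does not show explicitly.

import gguf

# new GGUF tensor names for the DeepSeek-OCR embedding tensors
print(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_IMGNL])  # v.image_newline_embd
print(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_VSEP])   # v.view_separator_embd

# the SAM tensors are now whitelisted for the mmproj arch (assumed to be the edited list)
assert gguf.MODEL_TENSOR.V_SAM_ATTN_QKV in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.MMPROJ]

# new metadata keys under Keys.ClipVision.SAM
print(gguf.Keys.ClipVision.SAM.BLOCK_COUNT)       # clip.vision.sam.block_count
print(gguf.Keys.ClipVision.SAM.EMBEDDING_LENGTH)  # clip.vision.sam.embedding_length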

gguf-py/gguf/gguf_writer.py

Lines changed: 6 additions & 0 deletions
@@ -1077,6 +1077,12 @@ def add_vision_n_wa_pattern(self, value: int) -> None:
     def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
         self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)
 
+
+    def add_vision_sam_layers_count(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value)
+
+    def add_vision_sam_embedding_length(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.SAM.EMBEDDING_LENGTH, value)
     # audio models
 
     def add_audio_projection_dim(self, value: int) -> None:
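Usage illustration (not part of the commit): the two new setters emit the clip.vision.sam.* keys defined in constants.py. The file name, values, and the arch string below are assumptions for the sketch, not taken from the commit.

import gguf

# Hypothetical standalone use of the new SAM metadata setters.
writer = gguf.GGUFWriter("mmproj-deepseek-ocr.gguf", arch="clip")
writer.add_vision_sam_layers_count(12)       # -> clip.vision.sam.block_count
writer.add_vision_sam_embedding_length(768)  # -> clip.vision.sam.embedding_length

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()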

gguf-py/gguf/tensor_mapping.py

Lines changed: 30 additions & 9 deletions
@@ -1179,6 +1179,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj",  # SmolVLM
             "model.vision.linear_proj.linear_proj",  # cogvlm
+            "model.projector.layers",  # Deepseek-OCR
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (
@@ -1197,6 +1198,7 @@ class TensorNameMap:
             "model.vision_tower.embeddings.cls_token",  # Intern-S1
             "vision_model.class_embedding",  # llama 4
             "model.vision.patch_embedding.cls_embedding",  # cogvlm
+            "model.vision_model.embeddings.class_embedding",  # Deepseek-OCR
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_PATCH: (
@@ -1210,6 +1212,7 @@ class TensorNameMap:
             "visual.patch_embed.proj",  # qwen2vl
             "vision_tower.patch_embed.proj",  # kimi-vl
             "model.vision.patch_embedding.proj",  # cogvlm
+            "model.vision_model.embeddings.patch_embedding",  # Deepseek-OCR CLIP
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_POS: (
@@ -1222,10 +1225,19 @@ class TensorNameMap:
             "visual.pos_embed",  # qwen3vl
             "model.vision.patch_embedding.position_embedding",  # cogvlm
         ),
+
+        MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
+            "model.image_newline",  # Deepseek-OCR
+        ),
+
+        MODEL_TENSOR.V_ENC_EMBD_VSEP: (
+            "model.view_seperator",  # Deepseek-OCR
+        ),
 
         MODEL_TENSOR.V_ENC_ATTN_QKV: (
             "visual.blocks.{bid}.attn.qkv",  # qwen3vl
             "model.vision.transformer.layers.{bid}.attention.query_key_value",  # cogvlm
+            "model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj",  # Deepseek-OCR CLIP
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1238,6 +1250,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wq",  # pixtral
             "visual.blocks.{bid}.attn.q",  # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wq",  # kimi-vl, generated
+            "model.vision_model.transformer.layers.{bid}.self_attn.q_proj",  # Deepseek-OCR CLIP, generated
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1255,6 +1268,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wk",  # pixtral
             "visual.blocks.{bid}.attn.k",  # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wk",  # kimi-vl, generated
+            "model.vision_model.transformer.layers.{bid}.self_attn.k_proj",  # Deepseek-OCR CLIP, generated
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1272,6 +1286,7 @@ class TensorNameMap:
             "vision_encoder.transformer.layers.{bid}.attention.wv",  # pixtral
             "visual.blocks.{bid}.attn.v",  # qwen2vl, generated
             "vision_tower.encoder.blocks.{bid}.wv",  # kimi-vl, generated
+            "model.vision_model.transformer.layers.{bid}.self_attn.v_proj",  # Deepseek-OCR CLIP, generated
         ),
 
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1286,6 +1301,7 @@ class TensorNameMap:
             "visual.blocks.{bid}.norm1",  # qwen2vl
             "vision_tower.encoder.blocks.{bid}.norm0",  # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.input_layernorm",  # cogvlm
+            "model.vision_model.transformer.layers.{bid}.layer_norm1",  # Deepseek-OCR CLIP
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1301,6 +1317,7 @@ class TensorNameMap:
             "visual.blocks.{bid}.attn.proj",  # qwen2vl
             "vision_tower.encoder.blocks.{bid}.wo",  # kimi-vl
             "model.vision.transformer.layers.{bid}.attention.dense",  # cogvlm
+            "model.vision_model.transformer.layers.{bid}.self_attn.out_proj",  # Deepseek-OCR CLIP
         ),
 
         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1315,6 +1332,7 @@ class TensorNameMap:
             "visual.blocks.{bid}.norm2",  # qwen2vl
             "vision_tower.encoder.blocks.{bid}.norm1",  # kimi-vl (norm0/norm1)
             "model.vision.transformer.layers.{bid}.post_attention_layernorm",  # cogvlm
+            "model.vision_model.transformer.layers.{bid}.layer_norm2",  # Deepseek-OCR CLIP
         ),
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1329,6 +1347,7 @@ class TensorNameMap:
             "visual.blocks.{bid}.mlp.up_proj",  # qwen2.5vl
             "visual.blocks.{bid}.mlp.linear_fc1",  # qwen3vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc0",  # kimi-vl (fc0/fc1)
+            "model.vision_model.transformer.layers.{bid}.mlp.fc1",  # Deepseek-OCR CLIP
             "model.vision.transformer.layers.{bid}.mlp.fc1",  # cogvlm
         ),
 
@@ -1351,6 +1370,7 @@ class TensorNameMap:
             "visual.blocks.{bid}.mlp.linear_fc2",  # qwen3vl
             "vision_tower.encoder.blocks.{bid}.mlp.fc1",  # kimi-vl (fc0/fc1)
             "model.vision.transformer.layers.{bid}.mlp.fc2",  # cogvlm
+            "model.vision_model.transformer.layers.{bid}.mlp.fc2",  # Deepseek-OCR CLIP
         ),
 
         MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1368,6 +1388,7 @@ class TensorNameMap:
             "vision_tower.ln_pre",  # pixtral-hf
             "vision_encoder.ln_pre",  # pixtral
             "vision_model.layernorm_pre",  # llama4
+            "model.vision_model.pre_layrnorm",  # Deepseek-OCR CLIP
         ),
 
         MODEL_TENSOR.V_POST_NORM: (
@@ -1460,11 +1481,11 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_SAM_POS_EMBD: (
-            "model.sam_model.pos_embed"
+            "model.sam_model.pos_embed",
         ),
 
         MODEL_TENSOR.V_SAM_PATCH_EMBD: (
-            "model.sam_model.patch_embed.proj"
+            "model.sam_model.patch_embed.proj",
         ),
 
         MODEL_TENSOR.V_SAM_PRE_NORM: (
@@ -1476,19 +1497,19 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_SAM_ATTN_POS_H: (
-            "model.sam_model.blocks.{bid}.attn.rel_pos_h"
+            "model.sam_model.blocks.{bid}.attn.rel_pos_h",
         ),
 
         MODEL_TENSOR.V_SAM_ATTN_POS_W: (
-            "model.sam_model.blocks.{bid}.attn.rel_pos_w"
+            "model.sam_model.blocks.{bid}.attn.rel_pos_w",
         ),
 
         MODEL_TENSOR.V_SAM_ATTN_QKV: (
-            "model.sam_model.blocks.{bid}.attn.qkv"
+            "model.sam_model.blocks.{bid}.attn.qkv",
        ),
 
         MODEL_TENSOR.V_SAM_ATTN_OUT: (
-            "model.sam_model.blocks.{bid}.attn.proj"
+            "model.sam_model.blocks.{bid}.attn.proj",
         ),
 
         MODEL_TENSOR.V_SAM_MLP_LIN_1: (
@@ -1500,15 +1521,15 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_SAM_NECK: (
-            "model.sam_model.neck.{bid}"
+            "model.sam_model.neck.{bid}",
         ),
 
         MODEL_TENSOR.V_SAM_NET_2: (
-            "model.sam_model.net_2"
+            "model.sam_model.net_2",
         ),
 
         MODEL_TENSOR.V_SAM_NET_3: (
-            "model.sam_model.net_3"
+            "model.sam_model.net_3",
         ),
 
         MODEL_TENSOR.V_MM_POST_FC_NORM: (
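The trailing commas added above matter: ("model.sam_model.pos_embed") is just a parenthesized string, while ("model.sam_model.pos_embed",) is the one-element tuple the mapping expects, so without the comma the entry is iterated character by character. Below is a hedged sketch (not from the commit) of resolving the new DeepSeek-OCR names; the use of MODEL_ARCH.MMPROJ and the block count are assumptions here.

import gguf

# Build the name map for the multimodal projector arch (assumed); 24 blocks is illustrative.
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, 24)

# CLIP encoder weights carry a ".weight"/".bias" suffix
print(tmap.get_name("model.vision_model.transformer.layers.0.self_attn.qkv_proj.weight",
                    try_suffixes=(".weight", ".bias")))

# SAM relative-position tables have no suffix, hence try_suffixes=("",)
# in the modify_tensors() hunk of convert_hf_to_gguf.py
print(tmap.get_name("model.sam_model.blocks.0.attn.rel_pos_h", try_suffixes=("",)))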

0 commit comments
