Commit 43a130b

mtmd: llama.cpp DeepSeekOCR support
init commit
1 parent 7f3e9d3 commit 43a130b

File tree

5 files changed: +696 -21 lines changed

convert_hf_to_gguf.py

Lines changed: 95 additions & 12 deletions
@@ -620,6 +620,9 @@ def load_hparams(dir_model: Path, is_mistral_format: bool):
         if "thinker_config" in config:
             # rename for Qwen2.5-Omni
             config["text_config"] = config["thinker_config"]["text_config"]
+        if "language_config" in config:
+            # rename for DeepSeekOCR
+            config["text_config"] = config["language_config"]
         return config

     @classmethod
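As a quick illustration of what this rename buys downstream code, here is a minimal sketch; the dict contents are invented for illustration, only the key names mirror the hunk above:

# Editor's sketch: the converter reads text-model hparams from "text_config",
# so DeepSeek-OCR's "language_config" is aliased to that key.
config = {
    "model_type": "deepseek-ocr",                  # invented for illustration
    "language_config": {"num_hidden_layers": 12},  # invented value
}
if "language_config" in config:
    # rename for DeepSeekOCR (same statement as in the hunk above)
    config["text_config"] = config["language_config"]
assert config["text_config"]["num_hidden_layers"] == 12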
@@ -1442,7 +1445,7 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]

-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "width.clip-l-14-224.layers", "sam_vit_b.layers"]

     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False
@@ -1488,13 +1491,31 @@ def __init__(self, *args, **kwargs):
         # TODO @ngxson : this is a hack to support both vision and audio encoders
         have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
         self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
+        # FIXME: DeepseekOCRVisionModel specific hack
+        if self.block_count is None:
+            if isinstance(self, DeepseekOCRVisionModel):
+                clip_block_count = self.hparams['width']['clip-l-14-224']['layers']
+                sam_block_count = self.hparams['width']['sam_vit_b']['layers']
+                if clip_block_count is not None:
+                    self.block_count = clip_block_count
+                if sam_block_count is not None:
+                    self.block_count = sam_block_count if self.block_count is None else self.block_count + sam_block_count
+            if self.block_count is None:
+                raise KeyError(f"could not find block count using any of: {self.n_block_keys}")
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

         # load preprocessor config
         self.preprocessor_config = {}
         if not self.is_mistral_format:
-            with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
-                self.preprocessor_config = json.load(f)
+            # use preprocessor_config.json if it exists
+            if (self.dir_model / "preprocessor_config.json").is_file():
+                with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+                    self.preprocessor_config = json.load(f)
+            else:
+                # otherwise, fall back to processing_config.json if present
+                if (self.dir_model / "processing_config.json").is_file():
+                    with open(self.dir_model / "processing_config.json", "r", encoding="utf-8") as f:
+                        self.preprocessor_config = json.load(f)

     def get_vision_config(self) -> dict[str, Any] | None:
         config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
@@ -5770,6 +5791,61 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

         return []  # skip other tensors


+@ModelBase.register("DeepseekOCRForCausalLM")
+class DeepseekOCRVisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
+        # default values below are taken from HF transformers code
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
+        # calculate proj_scale_factor (logic carried over from the Gemma3 converter)
+        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
+        n_per_side = int(image_seq_length ** 0.5)
+        image_size = self.hparams["image_size"]
+        patch_size = self.hparams["patch_size"]
+        proj_scale_factor = (image_size // patch_size) // n_per_side
+        if proj_scale_factor > 0 and proj_scale_factor != 4:
+            # we only need to write this if it's not the default value
+            # in this case, we are converting a test model
+            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
+
+    def get_vision_config(self) -> dict[str, Any]:
+        # prefer a top-level vision_config if present, otherwise defer to the base class
+        orig_vision_config = self.global_config.get("vision_config")
+        return orig_vision_config if orig_vision_config is not None else super().get_vision_config()
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        # related to https://github.com/ggml-org/llama.cpp/issues/13025
+        if "input_projection" in name:
+            return gguf.GGMLQuantizationType.F16
+        if ".embeddings." in name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if "vision_model.head." in name:
+            return []  # skip redundant head tensors (check carried over from the Gemma3 converter)
+
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
+                or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
+            # process vision tensors
+            name = name.replace("_weight", ".weight")
+
+            # correct norm value; only "soft_emb_norm" needs correcting, as it's part of the Gemma projector
+            # the other norm values are part of the SigLIP model, and they are already correct
+            # ref code: Gemma3RMSNorm
+            if "soft_emb_norm.weight" in name:
+                logger.info(f"Correcting norm value for '{name}'")
+                data_torch = data_torch + 1
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+

 @ModelBase.register("Gemma3nForConditionalGeneration")
 class Gemma3NModel(Gemma3Model):
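To make the proj_scale_factor arithmetic concrete: with the default image_seq_length of 256 the projected grid is 16 tokens per side, and the factor is how many ViT patches per side collapse into one projected token. The image_size/patch_size values below are placeholders for illustration, not confirmed DeepSeek-OCR hparams:

image_seq_length = 256
n_per_side = int(image_seq_length ** 0.5)        # 16 projected tokens per side

image_size, patch_size = 640, 16                 # placeholder values
n_patches_per_side = image_size // patch_size    # 40 ViT patches per side

proj_scale_factor = n_patches_per_side // n_per_side   # 40 // 16 == 2
# 2 != 4 (the assumed default), so it would be written to the GGUF here
assert proj_scale_factor == 2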
@@ -6943,6 +7019,7 @@ def prepare_tensors(self):
 @ModelBase.register(
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
+    "DeepseekOCRForCausalLM",
     "KimiVLForConditionalGeneration",
 )
 class DeepseekV2Model(TextModel):
@@ -7009,31 +7086,35 @@ def set_gguf_parameters(self):

         super().set_gguf_parameters()
         hparams = self.hparams
+        kv_lora_rank = hparams["kv_lora_rank"] if hparams.get("kv_lora_rank") is not None else 512
+        routed_scaling_factor = hparams.get("routed_scaling_factor", 1.0)
+        norm_topk_prob = hparams.get("norm_topk_prob", False)
+        scoring_func = hparams.get("scoring_func", "softmax")

         self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(kv_lora_rank)

         # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(kv_lora_rank)
         self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
         self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])

         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
-        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+        self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+        self.gguf_writer.add_expert_weights_norm(norm_topk_prob)

-        if hparams["scoring_func"] == "sigmoid":
+        if scoring_func == "sigmoid":
             self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        elif hparams["scoring_func"] == "softmax":
+        elif scoring_func == "softmax":
             self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
         else:
-            raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
+            raise ValueError(f"Unsupported scoring_func value: {scoring_func}")

         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
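As a sanity check on the MQA head sizes written above, plugging in the usual DeepSeek-V2/V3 values (kv_lora_rank = 512, matching the fallback introduced here, and qk_rope_head_dim = 64):

kv_lora_rank = 512       # MLA latent width; also the fallback default above
qk_rope_head_dim = 64    # rotary sub-head width

# MLA runs as MQA over compressed KV: the key carries latent + rotary part,
# the value carries only the latent.
key_length = kv_lora_rank + qk_rope_head_dim   # 576
value_length = kv_lora_rank                    # 512
assert (key_length, value_length) == (576, 512)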

@@ -7043,12 +7124,14 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
             self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))

     _experts: list[dict[str, Tensor]] | None = None

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # skip vision tensors and remove "language_model." for Kimi-VL
-        if "vision_tower" in name or "multi_modal_projector" in name:
+        if "vision_" in name or "multi_modal_projector" in name \
+                or "image_newline" in name or "model.projector" in name or "sam_model" in name or "view_seperator" in name:
             return []

         if name.startswith("language_model."):
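For a feel of the widened filter above, a small sketch with sample checkpoint names (the names are illustrative, modeled on the patterns appearing in this diff):

def is_vision_side(name: str) -> bool:
    # mirror of the widened skip condition above
    return ("vision_" in name or "multi_modal_projector" in name
            or "image_newline" in name or "model.projector" in name
            or "sam_model" in name or "view_seperator" in name)

assert is_vision_side("model.sam_model.blocks.0.attn.qkv")
assert is_vision_side("model.view_seperator")  # spelling as in the checkpoint
assert not is_vision_side("model.layers.0.self_attn.q_proj.weight")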

gguf-py/gguf/constants.py

Lines changed: 32 additions & 0 deletions
@@ -664,6 +664,21 @@ class MODEL_TENSOR(IntEnum):
     V_MM_GATE = auto() # cogvlm
     V_TOK_BOI = auto() # cogvlm
     V_TOK_EOI = auto() # cogvlm
+    # DeepSeek-OCR sam_model
+    V_SAM_POS_EMBD = auto()
+    V_SAM_PATCH_EMBD = auto()
+    V_SAM_PRE_NORM = auto()
+    V_SAM_POST_NORM = auto()
+    V_SAM_ATTN_POS_H = auto()
+    V_SAM_ATTN_POS_W = auto()
+    V_SAM_ATTN_QKV = auto()
+    V_SAM_ATTN_OUT = auto()
+    V_SAM_MLP_LIN_1 = auto()
+    V_SAM_MLP_LIN_2 = auto()
+    V_SAM_NECK = auto()
+    V_SAM_NET_2 = auto()
+    V_SAM_NET_3 = auto()
     # audio (mtmd)
     A_ENC_EMBD_POS = auto()
     A_ENC_CONV1D = auto()
@@ -1030,6 +1045,20 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_MM_GATE: "mm.gate",
     MODEL_TENSOR.V_TOK_BOI: "v.boi",
     MODEL_TENSOR.V_TOK_EOI: "v.eoi",
+    # DeepSeek-OCR sam_model
+    MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd",
+    MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd",
+    MODEL_TENSOR.V_SAM_PRE_NORM: "v.sam.blk.{bid}.pre_ln",
+    MODEL_TENSOR.V_SAM_POST_NORM: "v.sam.blk.{bid}.post_ln",
+    MODEL_TENSOR.V_SAM_ATTN_POS_H: "v.sam.blk.{bid}.attn.pos_h",
+    MODEL_TENSOR.V_SAM_ATTN_POS_W: "v.sam.blk.{bid}.attn.pos_w",
+    MODEL_TENSOR.V_SAM_ATTN_QKV: "v.sam.blk.{bid}.attn.qkv",
+    MODEL_TENSOR.V_SAM_ATTN_OUT: "v.sam.blk.{bid}.attn.out",
+    MODEL_TENSOR.V_SAM_MLP_LIN_1: "v.sam.blk.{bid}.mlp.lin1",
+    MODEL_TENSOR.V_SAM_MLP_LIN_2: "v.sam.blk.{bid}.mlp.lin2",
+    MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}",
+    MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2",
+    MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
     # audio (mtmd)
     MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
     MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
@@ -2247,7 +2276,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.ATTN_Q_B,
     MODEL_TENSOR.ATTN_KV_A_MQA,
     MODEL_TENSOR.ATTN_KV_B,
+    MODEL_TENSOR.ATTN_K,
     MODEL_TENSOR.ATTN_K_B,
+    MODEL_TENSOR.ATTN_V,
     MODEL_TENSOR.ATTN_V_B,
     MODEL_TENSOR.ATTN_Q_A_NORM,
     MODEL_TENSOR.ATTN_KV_A_NORM,
@@ -3207,6 +3238,7 @@ class VisionProjectorType:
     LIGHTONOCR = "lightonocr"
     COGVLM = "cogvlm"
     JANUS_PRO = "janus_pro"
+    DEEPSEEKOCR = "deepseekocr"


 # Items here are (block size, type size)
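The `{bid}` placeholders in the new names are expanded per block when the tensor map is built. A minimal sketch with a stand-in dict (the real entries live in TENSOR_NAMES keyed by MODEL_TENSOR):

# Editor's sketch: stand-in for two of the TENSOR_NAMES entries added above.
SAM_NAMES = {
    "V_SAM_ATTN_QKV": "v.sam.blk.{bid}.attn.qkv",
    "V_SAM_NECK": "v.sam.neck.{bid}",
}

assert SAM_NAMES["V_SAM_ATTN_QKV"].format(bid=3) == "v.sam.blk.3.attn.qkv"
assert SAM_NAMES["V_SAM_NECK"].format(bid=0) == "v.sam.neck.0"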

gguf-py/gguf/tensor_mapping.py

Lines changed: 54 additions & 0 deletions

@@ -1457,6 +1459,58 @@ class TensorNameMap:
         "model.visual.deepstack_merger_list.{bid}.linear_fc2", # deepstack in qwen3vl
     ),

+    MODEL_TENSOR.V_SAM_POS_EMBD: (
+        "model.sam_model.pos_embed",
+    ),
+
+    MODEL_TENSOR.V_SAM_PATCH_EMBD: (
+        "model.sam_model.patch_embed.proj",
+    ),
+
+    MODEL_TENSOR.V_SAM_PRE_NORM: (
+        "model.sam_model.blocks.{bid}.norm1",
+    ),
+
+    MODEL_TENSOR.V_SAM_POST_NORM: (
+        "model.sam_model.blocks.{bid}.norm2",
+    ),
+
+    MODEL_TENSOR.V_SAM_ATTN_POS_H: (
+        "model.sam_model.blocks.{bid}.attn.rel_pos_h",
+    ),
+
+    MODEL_TENSOR.V_SAM_ATTN_POS_W: (
+        "model.sam_model.blocks.{bid}.attn.rel_pos_w",
+    ),
+
+    MODEL_TENSOR.V_SAM_ATTN_QKV: (
+        "model.sam_model.blocks.{bid}.attn.qkv",
+    ),
+
+    MODEL_TENSOR.V_SAM_ATTN_OUT: (
+        "model.sam_model.blocks.{bid}.attn.proj",
+    ),
+
+    MODEL_TENSOR.V_SAM_MLP_LIN_1: (
+        "model.sam_model.blocks.{bid}.mlp.lin1",
+    ),
+
+    MODEL_TENSOR.V_SAM_MLP_LIN_2: (
+        "model.sam_model.blocks.{bid}.mlp.lin2",
+    ),
+
+    MODEL_TENSOR.V_SAM_NECK: (
+        "model.sam_model.neck.{bid}",
+    ),
+
+    MODEL_TENSOR.V_SAM_NET_2: (
+        "model.sam_model.net_2",
+    ),
+
+    MODEL_TENSOR.V_SAM_NET_3: (
+        "model.sam_model.net_3",
+    ),
+
     MODEL_TENSOR.V_MM_POST_FC_NORM: (
         "model.vision.linear_proj.norm1", # cogvlm
     ),
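A simplified sketch of how these entries are consumed: for each block id, the HF pattern and the corresponding GGUF template from constants.py are both expanded, producing a checkpoint-name to GGUF-name table (this mirrors the general TensorNameMap mechanism rather than its exact implementation):

hf_pattern = "model.sam_model.blocks.{bid}.attn.qkv"   # source pattern above
gguf_template = "v.sam.blk.{bid}.attn.qkv"             # template from constants.py

block_count = 12  # assumed SAM ViT-B depth, for illustration
mapping = {hf_pattern.format(bid=i): gguf_template.format(bid=i)
           for i in range(block_count)}

assert mapping["model.sam_model.blocks.3.attn.qkv"] == "v.sam.blk.3.attn.qkv"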

tools/mtmd/clip-impl.h

Lines changed: 20 additions & 0 deletions
@@ -129,6 +129,24 @@
 #define TN_TOK_BOI "v.boi"
 #define TN_TOK_EOI "v.eoi"

+// deepseek-ocr
+#define TN_SAM_POS_EMBD   "sam.pos_embd"
+#define TN_SAM_PATCH_EMBD "sam.patch_embd"
+#define TN_SAM_PRE_NORM   "sam.blk.%d.pre_ln"
+#define TN_SAM_POST_NORM  "sam.blk.%d.post_ln"
+#define TN_SAM_ATTN_POS_H "sam.blk.%d.attn.pos_h"
+#define TN_SAM_ATTN_POS_W "sam.blk.%d.attn.pos_w"
+#define TN_SAM_ATTN_QKV   "sam.blk.%d.attn.qkv"
+#define TN_SAM_ATTN_OUT   "sam.blk.%d.attn.out"
+#define TN_SAM_MLP_LIN_1  "sam.blk.%d.mlp.lin1"
+#define TN_SAM_MLP_LIN_2  "sam.blk.%d.mlp.lin2"
+#define TN_SAM_NECK       "sam.neck.%d"
+#define TN_SAM_NET_2      "sam.net_2"
+#define TN_SAM_NET_3      "sam.net_3"
+
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

@@ -156,6 +174,7 @@ enum projector_type {
     PROJECTOR_TYPE_LIGHTONOCR,
     PROJECTOR_TYPE_COGVLM,
     PROJECTOR_TYPE_JANUS_PRO,
+    PROJECTOR_TYPE_DEEPSEEK_OCR,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -182,6 +201,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LIGHTONOCR,   "lightonocr"},
     { PROJECTOR_TYPE_COGVLM,       "cogvlm"},
     { PROJECTOR_TYPE_JANUS_PRO,    "janus_pro"},
+    { PROJECTOR_TYPE_DEEPSEEK_OCR, "deepseekocr"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
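These lookups only work if the C++ strings match what the converter wrote: the projector name must equal the Python-side VisionProjectorType.DEEPSEEKOCR value ("deepseekocr") for clip_projector_type_from_string to resolve it, and the TN_SAM_* patterns must line up with the gguf-py templates. A small consistency check is cheap insurance; note the TN_SAM_* patterns above omit gguf-py's "v." prefix, and whether clip.cpp prepends it when formatting is an assumption of this sketch:

# Editor's sketch: gguf-py templates vs. the C++ printf-style patterns above.
pairs = [
    ("v.sam.blk.{bid}.attn.qkv", "sam.blk.%d.attn.qkv"),
    ("v.sam.blk.{bid}.mlp.lin1", "sam.blk.%d.mlp.lin1"),
    ("v.sam.neck.{bid}", "sam.neck.%d"),
]
for py_name, cpp_name in pairs:
    # After normalizing placeholders and the "v." prefix, names must agree.
    assert py_name.replace("{bid}", "0").removeprefix("v.") == cpp_name.replace("%d", "0")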
