Skip to content

Commit 13dc6fb

Browse files
authored
Merge branch 'sf/deepseek-ocr' into sf/deepseek-ocr
2 parents e8b2610 + 97e0907 commit 13dc6fb

File tree

3 files changed

+34
-25
lines changed

3 files changed

+34
-25
lines changed

convert_hf_to_gguf.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5790,16 +5790,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
57905790

57915791
@ModelBase.register("DeepseekOCRForCausalLM")
57925792
class DeepseekOCRVisionModel(MmprojModel):
5793-
def __init__(self, *args, **kwargs):
5793+
def __init__(self, *args, **kwargs):
57945794
super().__init__(*args, **kwargs)
5795-
5795+
57965796
proc_fname = self.dir_model / "processor_config.json"
5797-
5797+
57985798
if proc_fname.is_file():
57995799
with open(proc_fname, "r") as f:
58005800
self.preprocessor_config = json.load(f)
5801-
5802-
5801+
5802+
58035803
def set_gguf_parameters(self):
58045804
super().set_gguf_parameters()
58055805
hparams = self.hparams
@@ -5857,7 +5857,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
58575857
return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)]
58585858

58595859
return [(self.map_tensor_name(name), data_torch)]
5860-
5860+
58615861

58625862
@ModelBase.register("Gemma3nForConditionalGeneration")
58635863
class Gemma3NModel(Gemma3Model):

tools/mtmd/clip-impl.h

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -130,18 +130,18 @@
130130
#define TN_TOK_EOI "v.eoi"
131131

132132
// deepseek-ocr
133-
#define TN_SAM_POS_EMBD "sam.pos_embd"
134-
#define TN_SAM_PATCH_EMBD "sam.patch_embd.%s"
135-
#define TN_SAM_PRE_NORM "sam.blk.%d.pre_ln.%s"
136-
#define TN_SAM_POST_NORM "sam.blk.%d.post_ln"
137-
#define TN_SAM_ATTN_POS_H "sam.blk.%d.attn.pos_h"
138-
#define TN_SAM_ATTN_POS_W "sam.blk.%d.attn.pos_w"
139-
#define TN_SAM_ATTN_QKV "sam.blk.%d.attn.qkv.%s"
140-
#define TN_SAM_ATTN_OUT "sam.blk.%d.attn.out.%s"
141-
#define TN_SAM_FFN_UP "sam.blk.%d.mlp.lin1.%s"
142-
#define TN_SAM_FFN_DOWN "sam.blk.%d.mlp.lin2.%s"
143-
#define TN_SAM_NECK "sam.neck.%d.%s"
144-
#define TN_SAM_NET "sam.net_%d.%s"
133+
#define TN_SAM_POS_EMBD "v.sam.pos_embd"
134+
#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s"
135+
#define TN_SAM_PRE_NORM "v.sam.blk.%d.pre_ln.%s"
136+
#define TN_SAM_POST_NORM "v.sam.blk.%d.post_ln"
137+
#define TN_SAM_ATTN_POS_H "v.sam.blk.%d.attn.pos_h"
138+
#define TN_SAM_ATTN_POS_W "v.sam.blk.%d.attn.pos_w"
139+
#define TN_SAM_ATTN_QKV "v.sam.blk.%d.attn.qkv.%s"
140+
#define TN_SAM_ATTN_OUT "v.sam.blk.%d.attn.out.%s"
141+
#define TN_SAM_FFN_UP "v.sam.blk.%d.mlp.lin1.%s"
142+
#define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s"
143+
#define TN_SAM_NECK "v.sam.neck.%d.%s"
144+
#define TN_SAM_NET "v.sam.net_%d.%s"
145145

146146
// align x to upper multiple of n
147147
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
@@ -170,7 +170,7 @@ enum projector_type {
170170
PROJECTOR_TYPE_LIGHTONOCR,
171171
PROJECTOR_TYPE_COGVLM,
172172
PROJECTOR_TYPE_JANUS_PRO,
173-
PROJECTOR_TYPE_DEEPSEEK_OCR,
173+
PROJECTOR_TYPE_DEEPSEEKOCR,
174174
PROJECTOR_TYPE_UNKNOWN,
175175
};
176176

@@ -197,7 +197,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
197197
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
198198
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
199199
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
200-
{ PROJECTOR_TYPE_DEEPSEEK_OCR,"deepseek_orc"},
200+
{ PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
201201
};
202202

203203
static projector_type clip_projector_type_from_string(const std::string & str) {

tools/mtmd/clip.cpp

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -682,8 +682,8 @@ struct clip_graph {
682682

683683
const int enc_n_patches = enc_image_size / enc_patch_size; // 64
684684

685-
ggml_tensor * inpL = build_enc_inp(inp_raw, enc_patch_size, enc_image_size, enc_n_embd);
686-
ggml_tensor * cur = ggml_add(ctx0, inpL, model.position_embeddings);
685+
ggml_tensor * inpL = build_enc_inp(inp_raw, enc_patch_size, enc_n_patches, enc_n_embd);
686+
ggml_tensor * cur = ggml_add(ctx0, inpL, model.pos_embed);
687687

688688
// loop over layers
689689
for (int il = 0; il < _depth; il++) {
@@ -842,7 +842,7 @@ struct clip_graph {
842842
ggml_tensor * inp_raw = build_inp_raw();
843843

844844

845-
ggml_tensor * global_features_1 = build_sam_enc(inp_raw);
845+
ggml_tensor * global_features_1 = build_sam_enc(inp_raw, std::max(img.nx, img.ny));
846846

847847
ggml_tensor * global_features_2 = build_dp_ocr_clip(inp_raw, global_features_1);
848848

@@ -2862,6 +2862,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
28622862
{
28632863
res = graph.build_cogvlm();
28642864
} break;
2865+
case PROJECTOR_TYPE_DEEPSEEKOCR:
2866+
{
2867+
res = graph.build_deepseek_ocr();
2868+
} break;
28652869
default:
28662870
{
28672871
res = graph.build_llava();
@@ -3187,6 +3191,11 @@ struct clip_model_loader {
31873191
hparams.ffn_op = FFN_GELU_ERF;
31883192
log_ffn_op = "gelu_erf"; // temporary solution for logging
31893193
} break;
3194+
case PROJECTOR_TYPE_DEEPSEEKOCR:
3195+
{
3196+
hparams.set_limit_image_tokens(8, 1024);
3197+
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
3198+
} break;
31903199
default:
31913200
break;
31923201
}
@@ -3574,7 +3583,7 @@ struct clip_model_loader {
35743583
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
35753584
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
35763585
} break;
3577-
case PROJECTOR_TYPE_DEEPSEEK_OCR:
3586+
case PROJECTOR_TYPE_DEEPSEEKOCR:
35783587
{
35793588
model.pos_embed = get_tensor(TN_SAM_POS_EMBD);
35803589
model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
@@ -4830,7 +4839,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
48304839
}
48314840
}
48324841
} break;
4833-
case PROJECTOR_TYPE_DEEPSEEK_OCR:
4842+
case PROJECTOR_TYPE_DEEPSEEKOCR:
48344843
{
48354844
// configurable, or read from params
48364845
const int min_num = 2;

0 commit comments

Comments
 (0)