@@ -682,8 +682,8 @@ struct clip_graph {

    const int enc_n_patches = enc_image_size / enc_patch_size; // 64

-   ggml_tensor * inpL = build_enc_inp(inp_raw, enc_patch_size, enc_image_size, enc_n_embd);
-   ggml_tensor * cur  = ggml_add(ctx0, inpL, model.position_embeddings);
+   ggml_tensor * inpL = build_enc_inp(inp_raw, enc_patch_size, enc_n_patches, enc_n_embd);
+   ggml_tensor * cur  = ggml_add(ctx0, inpL, model.pos_embed);

    // loop over layers
    for (int il = 0; il < _depth; il++) {
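The corrected call builds the encoder input from the patch grid (`enc_n_patches`) rather than the pixel size, and the position embedding is now referenced as `model.pos_embed`. For orientation, here is a minimal sketch of what a `build_enc_inp`-style helper typically does in a ggml vision graph (patchify via a strided conv, then flatten to a token sequence); the helper name and kernel tensor are illustrative assumptions, not the actual implementation:

```cpp
// Sketch only: assumes a conv kernel of shape [P, P, 3, n_embd] that projects
// non-overlapping P x P patches, producing [n_embd, n_patches^2] so a position
// embedding of matching length can be added with ggml_add.
static ggml_tensor * build_enc_inp_sketch(ggml_context * ctx0,
                                          ggml_tensor  * inp_raw, // [W, H, 3]
                                          ggml_tensor  * proj_w,  // [P, P, 3, n_embd]
                                          int patch_size, int n_patches, int n_embd) {
    // strided conv == non-overlapping patch projection
    ggml_tensor * cur = ggml_conv_2d(ctx0, proj_w, inp_raw,
                                     patch_size, patch_size, 0, 0, 1, 1);
    // [n_patches, n_patches, n_embd] -> [n_patches^2, n_embd] -> [n_embd, n_patches^2]
    cur = ggml_reshape_2d(ctx0, cur, n_patches * n_patches, n_embd);
    cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
    return cur;
}
```

Under this reading, the flatten step needs the patch count, which is why passing the pixel size (`enc_image_size`) was wrong.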
@@ -842,7 +842,7 @@ struct clip_graph {
    ggml_tensor * inp_raw = build_inp_raw();


-   ggml_tensor * global_features_1 = build_sam_enc(inp_raw);
+   ggml_tensor * global_features_1 = build_sam_enc(inp_raw, std::max(img.nx, img.ny));

    ggml_tensor * global_features_2 = build_dp_ocr_clip(inp_raw, global_features_1);
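Passing `std::max(img.nx, img.ny)` lets the SAM encoder size itself to the actual input instead of assuming a fixed square. One plausible use, sketched under the assumption that ggml's `ggml_interpolate` with `GGML_SCALE_MODE_BILINEAR` is available, is resizing the pretrained position-embedding grid to the effective patch grid; the function and parameter names here are hypothetical:

```cpp
// Sketch only: derive the effective patch grid from the longest image side
// and bilinearly resize a pretrained square position embedding to match.
static ggml_tensor * resize_sam_pos_embed(ggml_context * ctx0,
                                          ggml_tensor  * pos_embed, // [n_embd, pos_hw*pos_hw]
                                          int n_embd, int pos_hw,
                                          int longest_side, int patch_size) {
    const int target_hw = longest_side / patch_size; // actual patch grid
    if (target_hw == pos_hw) {
        return pos_embed; // pretrained grid already matches
    }
    // [n_embd, hw*hw] -> [hw, hw, n_embd] image layout for interpolation
    ggml_tensor * cur = ggml_reshape_3d(ctx0, pos_embed, n_embd, pos_hw, pos_hw);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
    cur = ggml_interpolate(ctx0, cur, target_hw, target_hw, n_embd, 1,
                           GGML_SCALE_MODE_BILINEAR);
    // back to [n_embd, target_hw*target_hw]
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
    return ggml_reshape_2d(ctx0, cur, n_embd, target_hw * target_hw);
}
```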
@@ -2862,6 +2862,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                res = graph.build_cogvlm();
            } break;
+       case PROJECTOR_TYPE_DEEPSEEKOCR:
+           {
+               res = graph.build_deepseek_ocr();
+           } break;
        default:
            {
                res = graph.build_llava();
@@ -3187,6 +3191,11 @@ struct clip_model_loader {
                hparams.ffn_op = FFN_GELU_ERF;
                log_ffn_op = "gelu_erf"; // temporary solution for logging
            } break;
+       case PROJECTOR_TYPE_DEEPSEEKOCR:
+           {
+               hparams.set_limit_image_tokens(8, 1024);
+               hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+           } break;
        default:
            break;
    }
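The two hparams calls bound DeepSeek-OCR's image-token budget (8 to 1024 tokens) and shrink the warmup graph so the initial allocation stays small. A rough sketch of what such setters can look like; the field names and the patch-size constant are assumptions for illustration, not the real definitions:

```cpp
#include <cmath>

// Sketch only: bound how many image tokens preprocessing may produce, and
// derive a smaller warmup image size so the first graph build does not OOM.
struct clip_hparams_sketch {
    int image_min_tokens  = -1;
    int image_max_tokens  = -1;
    int warmup_image_size = 0; // 0 = use full size

    void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
        image_min_tokens = n_tokens_min;
        image_max_tokens = n_tokens_max;
    }
    void set_warmup_n_tokens(int n_tokens) {
        // pick a square image whose patch grid yields roughly n_tokens
        const int n_tok_per_side = (int) std::sqrt((double) n_tokens);
        warmup_image_size = n_tok_per_side * /* patch_size, assumed */ 16;
    }
};
```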
@@ -3574,7 +3583,7 @@ struct clip_model_loader {
                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
            } break;
-       case PROJECTOR_TYPE_DEEPSEEK_OCR:
+       case PROJECTOR_TYPE_DEEPSEEKOCR:
            {
                model.pos_embed = get_tensor(TN_SAM_POS_EMBD);
                model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
@@ -4830,7 +4839,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                }
            }
        } break;
-       case PROJECTOR_TYPE_DEEPSEEK_OCR:
+       case PROJECTOR_TYPE_DEEPSEEKOCR:
        {
            // configurable, or read from params
            const int min_num = 2;
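`min_num` is the lower bound on how many tiles the preprocessor may cut the image into. A hedged sketch of the usual dynamic-tiling selection for this family of models: enumerate w x h grids with min_num <= w*h <= max_num and keep the one whose aspect ratio best matches the input (the helper name is illustrative):

```cpp
#include <cmath>
#include <utility>

// Sketch only: choose a tile grid whose aspect ratio is closest to the
// input image's; the image is then resized to (w*tile, h*tile) and sliced.
static std::pair<int,int> pick_tile_grid(int img_w, int img_h,
                                         int min_num, int max_num) {
    const double ar = (double) img_w / img_h;
    std::pair<int,int> best = {1, 1};
    double best_diff = 1e9;
    for (int w = 1; w <= max_num; w++) {
        for (int h = 1; h <= max_num; h++) {
            const int n = w * h;
            if (n < min_num || n > max_num) continue;
            const double diff = std::fabs((double) w / h - ar);
            // prefer the closest aspect ratio; break ties toward more tiles
            if (diff < best_diff || (diff == best_diff && n > best.first * best.second)) {
                best_diff = diff;
                best = {w, h};
            }
        }
    }
    return best;
}
```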