@@ -837,34 +837,32 @@ struct clip_graph {
     ggml_cgraph * build_deepseek_ocr() {
         // patch embedding
         ggml_tensor * inp_raw = build_inp_raw();
-
-
         ggml_tensor * global_features_1 = build_sam_enc(inp_raw, std::max(img.nx, img.ny));
-
         ggml_tensor * global_features_2 = build_dp_ocr_clip(global_features_1);
-
+
         // FIXME remove n_patches is hardcoded
-
+
         // torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
-        global_features_1 = ggml_cont(ctx0, ggml_permute(ctx0, global_features_1, 2, 1, 0, 3));
+        global_features_1 = ggml_cont(ctx0, ggml_permute(ctx0, global_features_1, 1, 2, 0, 3));
         int clip_n_patches = global_features_1->ne[1] * global_features_1->ne[2];
-
+
         // flatten 2nd and 3rd dims
         global_features_1 = ggml_reshape_2d(ctx0, global_features_1, global_features_1->ne[0], clip_n_patches);
-
+
         // remove CLS token
-        global_features_2 = ggml_view_2d(ctx0, global_features_2,
-                n_embd, clip_n_patches,
-                ggml_row_size(global_features_2->type, n_embd), 0);
-
-        ggml_tensor * global_features = ggml_concat(ctx0, global_features_2, global_features_1, 1);
+        global_features_2 = ggml_view_2d(ctx0, global_features_2, n_embd, clip_n_patches,
+                global_features_2->nb[1], global_features_2->nb[1]);
+
+        ggml_tensor * global_features = ggml_concat(ctx0, global_features_2, global_features_1, 0);
         global_features = ggml_reshape_2d(ctx0, global_features, 2 * n_embd, clip_n_patches);
         global_features = ggml_cont(ctx0, global_features);
         global_features = ggml_mul_mat(ctx0, model.fc_w, global_features);
         global_features = ggml_add(ctx0, global_features, model.fc_b);

         global_features = build_global_local_features(ctx0, global_features);
-        global_features = ggml_cont(ctx0, ggml_permute(ctx0, global_features, 1, 0, 2, 3));
+
+        cb(global_features, "dsocr_output", -1);
+
         ggml_build_forward_expand(gf, global_features);
         return gf;
     }
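Note on the ggml_concat change: ggml's dim 0 corresponds to torch's last dim, so concatenating on dim 0 matches the torch.cat(..., dim=-1) reference and yields one 2*n_embd-wide row per token; with the old dim 1, the SAM features were appended as extra tokens and the subsequent 2*n_embd reshape paired adjacent rows instead of pairing CLIP with SAM features. The ggml_view_2d offset change from 0 to nb[1] is what actually skips the CLS row. A minimal standalone sketch of the fixed per-token layout, using plain vectors and toy sizes (nothing below is the real model code):

#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4, n_tokens = 2;               // toy sizes, not the real dims
    std::vector<float> clip(n_embd * n_tokens, 1.0f); // stands in for global_features_2
    std::vector<float> sam(n_embd * n_tokens, 2.0f);  // stands in for global_features_1
    // concat along the feature dim: each token row becomes [clip_t ; sam_t], 2*n_embd wide
    std::vector<float> fused(2 * n_embd * n_tokens);
    for (int t = 0; t < n_tokens; t++) {
        for (int i = 0; i < n_embd; i++) {
            fused[t*2*n_embd + i]          = clip[t*n_embd + i];
            fused[t*2*n_embd + n_embd + i] = sam[t*n_embd + i];
        }
    }
    printf("token 0: %.0f %.0f ... %.0f %.0f\n",
           fused[0], fused[1], fused[n_embd], fused[n_embd + 1]); // 1 1 ... 2 2
    return 0;
}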
@@ -878,30 +876,23 @@ struct clip_graph {
         GGML_ASSERT(model.image_newline != nullptr);
         GGML_ASSERT(model.view_seperator != nullptr);

-        // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
         const auto h = static_cast<int>(std::sqrt(static_cast<float>(global_features->ne[1])));
         const auto w = h;
         const auto n_dim = global_features->ne[0];
-        ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, n_dim, h, w, 1); // (n_dim, w, h)
-        t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 1, 0, 3)); // (h, w, n_dim)
-        ggml_tensor * nl = ggml_cont(ctx0, ggml_permute(ctx0, model.image_newline, 2, 1, 0, 3));
-        nl = ggml_repeat_4d(ctx0, nl, h, 1, n_dim, 1); // n_pos rows

+        ggml_tensor * cur;
+        ggml_tensor * imgnl;
+        ggml_tensor * vs;

-        // 2) image_newline: [n_dim] -> [1, 1, n_dim] -> repeat to [h, 1, n_dim]
-        t = ggml_concat(ctx0, t, nl, 1); // (h, w+1, n_dim)
-
-        t = ggml_reshape_2d(ctx0, t, n_dim, h*(h + 1)); // (n_dim, h*(w+1))
-
-
-        // 5) append view_separator as an extra "token":
-        //    view_separator: [n_dim] -> [n_dim, 1]
-        ggml_tensor * vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1)
-
-        // concat along token dimension (dim=1):
-        t = ggml_concat(ctx0, t, vs, 1); // (n_dim, h*(w+1) + 1)
+        cur = ggml_reshape_3d(ctx0, global_features, n_dim, w, h);
+        imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
+        cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1)*h);
+        cb(cur, "insert_imgnl", -1);
+        vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1)
+        cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1)
+        cb(cur, "insert_vs", -1);

-        return t;
+        return cur;
     }


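For the record, the rewritten helper produces the same layout the removed code was building: one image_newline token after each row of the h x w grid, then a single trailing view_seperator, for h*(w+1) + 1 tokens total. A toy sketch of that token order (h = w = 3 purely for illustration):

#include <cstdio>
#include <string>

int main() {
    const int h = 3, w = 3; // hypothetical grid; the real one is sqrt(n_patches) square
    std::string layout;
    for (int r = 0; r < h; r++) {
        layout.append(w, 'T'); // T = image token
        layout += 'N';         // N = image_newline after each row
    }
    layout += 'S';             // S = trailing view_seperator
    printf("%s -> %zu tokens = h*(w+1) + 1\n", layout.c_str(), layout.size()); // 13 for h = w = 3
    return 0;
}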
@@ -1596,8 +1587,8 @@ struct clip_graph {
         ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32);
         ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions);

-        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, learned_pos_embd,
-                nullptr); // shape [1024, 16, 16]
+        ggml_tensor * cur = build_vit(inp, n_pos, norm_t, ffn_op_type::FFN_GELU_QUICK,
+                learned_pos_embd, nullptr); // shape [1024, 16, 16]

         ggml_build_forward_expand(gf, cur);

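This hunk pins the FFN activation to FFN_GELU_QUICK rather than trusting hparams.ffn_op. For reference, quick-GELU is the sigmoid approximation used by CLIP-style ViTs; a standalone sketch of the formula (my restatement, not the ggml kernel):

#include <cmath>
#include <cstdio>

// quick GELU: x * sigmoid(1.702 * x)
static float gelu_quick(float x) {
    return x / (1.0f + std::exp(-1.702f * x));
}

int main() {
    for (float x : {-2.0f, -1.0f, 0.0f, 1.0f, 2.0f}) {
        printf("gelu_quick(%+.1f) = %+.4f\n", x, gelu_quick(x));
    }
    return 0;
}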
@@ -2394,7 +2385,7 @@ struct clip_graph {
         // pre-layernorm
         if (model.pre_ln_w) {
             inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
-            cb(inpL, "vit_pre_ln", -1);
+            cb(inpL, "pre_ln", -1);
         }

         // loop over layers
@@ -5411,12 +5402,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
             {
-                int x_patch = img->nx / (params.patch_size);
-
-                n_patches += x_patch + 1;
-                n_patches = 1280;
-
-
+                // SAM encoder applies two stride-2 convolutions (net_2 and net_3),
+                // which reduce spatial dimensions by 4x in each direction (16x total),
+                // e.g. 64x64 -> 16x16 patches
+                n_patches /= 16;
+
+                // build_global_local_features adds image newlines and a view separator
+                // formula: h*(w+1) + 1 where h = w = sqrt(n_patches)
+                int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
+                n_patches = h * (h + 1) + 1;
             } break;
         default:
             GGML_ABORT("unsupported projector type");
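Quick arithmetic check of the new count, assuming for illustration a 1024x1024 input with 16px patches so the base grid is 64x64 (these numbers are mine, not from the patch):

#include <cmath>
#include <cstdio>

int main() {
    int n_patches = (1024 / 16) * (1024 / 16);              // 64x64 base grid = 4096
    n_patches /= 16;                                        // SAM's two stride-2 convs: 4096 -> 256
    int h = static_cast<int>(std::sqrt((float) n_patches)); // 16x16 grid after downsampling
    n_patches = h * (h + 1) + 1;                            // newline per row + view separator
    printf("%d output tokens\n", n_patches);                // prints "273 output tokens"
    return 0;
}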
@@ -5807,7 +5801,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     bool is_stored = false;
     std::vector<std::string> patterns = {
         /* Add tensor names here to dump (e.g. "sam_output") */
-        "vit_pre_ln"
     };

     for (auto & p : patterns) {
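With "vit_pre_ln" gone, the "dsocr_output" name registered in build_deepseek_ocr above is the natural entry if this debug list is ever needed again, e.g.:

std::vector<std::string> patterns = {
    /* Add tensor names here to dump (e.g. "sam_output") */
    "dsocr_output", // final DeepSeek-OCR projector output, registered via cb()
};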