@@ -739,8 +739,8 @@ struct clip_graph {
739739
740740 struct ggml_tensor * q_r = ggml_reshape_4d (ctx0, Qcur, enc_d_heads, W, H, B * enc_n_heads);
741741
742- struct ggml_tensor * rel_w = ggml_cont (ctx0,ggml_permute (ctx0,
743- ggml_mul_mat (ctx0,
742+ struct ggml_tensor * rel_w = ggml_cont (ctx0,ggml_permute (ctx0,
743+ ggml_mul_mat (ctx0,
744744 rw,
745745 ggml_cont (ctx0, ggml_permute (ctx0, q_r, 0 , 2 , 1 , 3 ))),
746746 0 , 2 , 1 , 3 ));
@@ -801,9 +801,8 @@ struct clip_graph {
801801
802802 cur = sam_layer_norm_2d (ctx0, cur, 256 , model.neck_3_w , model.neck_3_b , hparams.eps );
803803
804- // TODO : check conv padding
805- cur = ggml_conv_2d_s1_ph (ctx0, model.net_2 , cur);
806- cur = ggml_conv_2d_s1_ph (ctx0, model.net_3 , cur);
804+ cur = ggml_conv_2d (ctx0, model.net_2 , cur, 2 ,2 ,1 ,1 , 1 ,1 );
805+ cur = ggml_conv_2d (ctx0, model.net_3 , cur, 2 ,2 ,1 ,1 , 1 ,1 );
807806
808807 ggml_build_forward_expand (gf, cur);
809808 return cur;
@@ -838,22 +837,27 @@ struct clip_graph {
838837
839838 ggml_tensor * global_features_2 = build_dp_ocr_clip (global_features_1);
840839
840+ // FIXME: remove this once n_patches is no longer hardcoded
841+ int clip_n_patches = 256 ; // FIXME hardcoded for sam 1024x1024 with 16x16 patches
842+
841843 // torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
842844 global_features_1 = ggml_cont (ctx0,ggml_permute (ctx0, global_features_1,2 ,1 ,0 ,3 ));
843- global_features_1 = ggml_reshape_2d (ctx0, global_features_1, n_embd, n_patches);
845+ // flatten 2nd and 3rd dims
846+ global_features_1 = ggml_reshape_2d (ctx0, global_features_1, global_features_1->ne [0 ], clip_n_patches);
844847
845848 // remove CLS token
846849 global_features_2 = ggml_view_2d (ctx0, global_features_2,
847- n_embd, n_patches ,
850+ n_embd, clip_n_patches ,
848851 ggml_row_size (global_features_2->type , n_embd), 0 );
849852
850853 ggml_tensor * global_features = ggml_concat (ctx0, global_features_2, global_features_1, 1 );
851- global_features = ggml_reshape_2d (ctx0, global_features, 2 * n_embd, n_patches );
854+ global_features = ggml_reshape_2d (ctx0, global_features, 2 * n_embd,clip_n_patches );
852855 global_features = ggml_cont (ctx0, global_features);
853856 global_features = ggml_mul_mat (ctx0, model.fc_w , global_features);
854857 global_features = ggml_add (ctx0, global_features, model.fc_b );
855858
856859 global_features = build_global_local_features (ctx0,global_features);
860+ global_features = ggml_cont (ctx0, ggml_permute (ctx0, global_features, 1 , 0 , 2 , 3 ));
857861 ggml_build_forward_expand (gf, global_features);
858862 return gf;
859863 }
@@ -868,16 +872,16 @@ struct clip_graph {
868872 GGML_ASSERT (model.view_seperator != nullptr );
869873
870874 // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
871- ggml_tensor * t = ggml_reshape_4d (ctx0, global_features, 1280 , 64 , 64 , 1 ); // (n_dim, w, h)
875+ ggml_tensor * t = ggml_reshape_4d (ctx0, global_features, 1280 , 16 , 16 , 1 ); // (n_dim, w, h)
872876 t = ggml_cont (ctx0, ggml_permute (ctx0, t, 2 , 1 , 0 , 3 )); // (h, w, n_dim)
873877 ggml_tensor * nl = ggml_cont (ctx0,ggml_permute (ctx0, model.image_newline , 2 , 1 , 0 , 3 ));
874- nl = ggml_repeat_4d (ctx0, nl, 64 , 1 , 1280 , 1 ); // n_pos rows
878+ nl = ggml_repeat_4d (ctx0, nl, 16 , 1 , 1280 , 1 ); // n_pos rows
875879
876880
877881 // 2) image_newline: [n_dim] -> [1, 1, n_dim] -> repeat to [h, 1, n_dim]
878882 t = ggml_concat (ctx0, t, nl, 1 ); // (h, w+1, n_dim)
879883
880- t = ggml_reshape_2d (ctx0, t, 1280 , 64 * (64 + 1 )); // (n_dim, h*(w+1))
884+ t = ggml_reshape_2d (ctx0, t, 1280 , 16 * (16 + 1 )); // (n_dim, h*(w+1))
881885
882886
883887 // 5) append view_separator as an extra "token":
@@ -1538,9 +1542,12 @@ struct clip_graph {
15381542 GGML_ASSERT (model.class_embedding != nullptr );
15391543 GGML_ASSERT (model.position_embeddings != nullptr );
15401544
1541- const int n_pos = n_patches + 1 ;
1542- ggml_tensor * inp = ggml_cont (ctx0,ggml_permute (ctx0, patch_embeds,2 ,1 ,0 ,3 ));
1543- inp = ggml_reshape_2d (ctx0, inp, n_embd, n_patches);
1545+ ggml_tensor * inp = ggml_cpy (ctx0, patch_embeds, ggml_dup_tensor (ctx0, patch_embeds));
1546+
1547+
1548+ const int n_pos = 257 ; // +1 for [CLS]
1549+ inp = ggml_cont (ctx0,ggml_permute (ctx0, inp,2 ,1 ,0 ,3 ));
1550+ inp = ggml_reshape_2d (ctx0, inp, n_embd, inp->ne [1 ]*inp->ne [2 ]*inp->ne [3 ]);
15441551
15451552
15461553
@@ -1552,7 +1559,9 @@ struct clip_graph {
15521559
15531560 // for selecting learned pos embd, used by ViT
15541561 ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_pos);
1555- cb (positions, " positions" , -1 );
1562+ ggml_set_name (positions, " positions" );
1563+ ggml_set_input (positions);
1564+
15561565 ggml_tensor * learned_pos_embd = ggml_get_rows (ctx0, model.position_embeddings , positions);
15571566
15581567
@@ -2525,7 +2534,7 @@ struct clip_graph {
25252534 ggml_tensor * q_coord = ggml_arange (ctx, 0 .0f , static_cast <float >(q_size), 1 .0f ); // [q_size]
25262535 ggml_tensor * k_coord = ggml_arange (ctx, 0 .0f , static_cast <float >(k_size), 1 .0f ); // [k_size]
25272536 ggml_tensor * rel = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, k_size, q_size);
2528-
2537+
25292538 // broadcast reshape:
25302539 q_coord = ggml_cont (ctx,
25312540 ggml_repeat (ctx,
@@ -2538,8 +2547,8 @@ struct clip_graph {
25382547 float q_scale = std::max ((float )k_size/q_size, 1 .0f );
25392548 float k_scale = std::max ((float )q_size/k_size, 1 .0f );
25402549
2541- // This wouldn't be triggered in DeepSeek-OCR. Just for compatibility with
2542- // the original implementation.
2550+ // This wouldn't be triggered in DeepSeek-OCR. Just for compatibility with
2551+ // the original implementation.
25432552 if (q_size != k_size) {
25442553 q_coord = ggml_scale_inplace (ctx, q_coord, q_scale);
25452554 k_coord = ggml_scale_inplace (ctx, k_coord, k_scale);
@@ -2548,7 +2557,7 @@ struct clip_graph {
25482557 // -------------------------------------------------
25492558 // relative_coords = q - k + (k_size - 1) // SAME as PyTorch when no scaling
25502559 // -------------------------------------------------
2551-
2560+
25522561 rel = ggml_sub (ctx, q_coord, k_coord); // [q_size, k_size]
25532562 rel = ggml_scale_bias (ctx, rel, 1 .0f , (k_size - 1 .0f )*k_scale); // [q_size, k_size]
25542563 // Clamp to [0, L-1] range for valid indexing
@@ -2559,10 +2568,10 @@ struct clip_graph {
25592568 // -------------------------------------------------
25602569
25612570 ggml_tensor * idx_2d = ggml_cast (ctx, rel, GGML_TYPE_I32); // [q_size, k_size]
2562-
2571+
25632572 // Gather from rel_pos → [qk, C]
25642573 // -------------------------------------------------
2565-
2574+
25662575 // flatten to 1D for ggml_get_rows
25672576 int qk = q_size * k_size;
25682577 ggml_tensor * idx_flat = ggml_reshape_1d (ctx, idx_2d, qk); // [qk]
@@ -5237,9 +5246,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
52375246 } break ;
52385247 case PROJECTOR_TYPE_DEEPSEEKOCR:
52395248 {
5240- int x_patch = img->nx / (params.patch_size );
5241-
5242- n_patches += x_patch + 1 ;
5249+ n_patches = 1280 ;
52435250
52445251 } break ;
52455252 default :
@@ -5573,10 +5580,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
55735580 case PROJECTOR_TYPE_VOXTRAL:
55745581 case PROJECTOR_TYPE_JANUS_PRO:
55755582 case PROJECTOR_TYPE_COGVLM:
5576- case PROJECTOR_TYPE_DEEPSEEKOCR:
55775583 {
55785584 // do nothing
55795585 } break ;
5586+ case PROJECTOR_TYPE_DEEPSEEKOCR:
5587+ {
5588+ // FIXME: we need to correct this once all model configs are set correctly;
5589+ // n_patch is not correct right now
5590+ int32_t n_pos = 16 * 16 + 1 ; // hardcode for now
5591+ std::vector<int32_t > positions (n_pos);
5592+ for (int i = 0 ; i < n_pos; i++) {
5593+ positions[i] = i;
5594+ }
5595+ set_input_i32 (" positions" , positions);
5596+ } break ;
55805597 case PROJECTOR_TYPE_LLAMA4:
55815598 {
55825599 // set the 2D positions
0 commit comments