@@ -819,6 +819,7 @@ struct clip_graph {
819819 // TODO: better implementation
820820 layer = ggml_permute (ctx0, ggml_norm (ctx0, ggml_cont (ctx0, ggml_permute (ctx0, layer, 1 , 2 , 0 , 3 )), eps), 2 , 0 ,
821821 1 , 3 );
822+ layer = ggml_cont (ctx0, layer);
822823
823824 layer =
824825 ggml_add (ctx0, ggml_mul (ctx0, ggml_repeat (ctx0, ggml_reshape_3d (ctx0, w, 1 , 1 , n_channels), layer), layer),
@@ -1537,8 +1538,7 @@ struct clip_graph {
15371538 GGML_ASSERT (model.position_embeddings != nullptr );
15381539
15391540 const int n_pos = n_patches + 1 ;
1540- ggml_tensor * inp = ggml_permute (ctx0, patch_embeds,2 ,1 ,0 ,3 );
1541- inp = ggml_cont (ctx0, inp);
1541+ ggml_tensor * inp = ggml_cont (ctx0,ggml_permute (ctx0, patch_embeds,2 ,1 ,0 ,3 ));
15421542 inp = ggml_reshape_2d (ctx0, inp, n_embd, n_patches);
15431543
15441544
@@ -1550,7 +1550,7 @@ struct clip_graph {
15501550 norm_type norm_t = NORM_TYPE_NORMAL;
15511551
15521552 // for selecting learned pos embd, used by ViT
1553- struct ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_pos);
1553+ ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_pos);
15541554 cb (positions, " positions" , -1 );
15551555 ggml_tensor * learned_pos_embd = ggml_get_rows (ctx0, model.position_embeddings , positions);
15561556
@@ -5218,7 +5218,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
52185218 } break ;
52195219 case PROJECTOR_TYPE_DEEPSEEKOCR:
52205220 {
5221- n_patches += 2 ;
5221+ int x_patch = img->nx / (params.patch_size );
5222+
5223+ n_patches += x_patch + 1 ;
5224+
52225225 } break ;
52235226 default :
52245227 GGML_ABORT (" unsupported projector type" );
0 commit comments