@@ -342,10 +342,6 @@ struct clip_model {
     ggml_tensor * mm_model_ln_kv_b = nullptr;
     ggml_tensor * mm_model_ln_post_w = nullptr;
     ggml_tensor * mm_model_ln_post_b = nullptr;
-    ggml_tensor * mm_model_ffn_up_w = nullptr;
-    ggml_tensor * mm_model_ffn_up_b = nullptr;
-    ggml_tensor * mm_model_ffn_down_w = nullptr;
-    ggml_tensor * mm_model_ffn_down_b = nullptr;
 
     // gemma3
     ggml_tensor * mm_input_proj_w = nullptr;
@@ -1169,38 +1165,33 @@ struct clip_graph {
         cb(cur, "vit_out", -1);
 
         {
-            // SiglipMultiheadAttentionPoolingHead
-            int64_t n_pos = cur->ne[1];
-            ggml_tensor * Qcur = ggml_repeat(ctx0, model.mm_model_query, cur);
-            ggml_tensor * Kcur = cur;
-            ggml_tensor * Vcur = cur;
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
-
-            cb(Qcur, "resampl_Qcur", -1);
-            cb(Kcur, "resampl_Kcur", -1);
-            cb(Vcur, "resampl_Vcur", -1);
-
-            float kq_scale = 1.0f / sqrtf((float)(d_head));
-            cur = build_attn(model.mm_model_attn_o_w, model.mm_model_attn_o_b,
-                Qcur, Kcur, Vcur, nullptr, kq_scale, -1);
-
-            cb(cur, "resampl_attn_out", -1);
-
-            cur = build_norm(cur, model.mm_model_ln_post_w, model.mm_model_ln_post_b,
-                NORM_TYPE_NORMAL, eps, -1);
-
-            cb(cur, "resampl_out", -1);
-        }
+            // mlp_AR
+            float proj_norm_eps = 1e-5; // PaddleOCR uses hard-coded value eps=1e-5 for Projector
+            cur = build_norm(cur,
+                model.mm_input_norm_w, model.mm_input_norm_b,
+                NORM_TYPE_NORMAL, proj_norm_eps, -1);
+            //cur = build_patch_merge_permute(cur, hparams.proj_scale_factor);
+
+            // stack and padding
+            int64_t stride = hparams.proj_scale_factor * hparams.proj_scale_factor;
+            int64_t n_embd = cur->ne[0];
+            int64_t n_tokens = cur->ne[1];
+            int64_t n_tokens_padded = CLIP_ALIGN(n_tokens, stride);
+            int64_t n_pad = n_tokens_padded - n_tokens;
+            if (n_pad > 0) {
+                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
+                cur = ggml_pad(ctx0, cur, n_pad * n_embd, 0, 0, 0);
+            }
+            cur = ggml_view_2d(ctx0, cur,
+                n_embd * stride,
+                n_tokens_padded / stride,
+                ggml_row_size(cur->type, n_embd * stride), 0);
+            cb(cur, "after_stacked", -1);
 
-        {
-            // SiglipMLP
             cur = build_ffn(cur,
-                model.mm_model_ffn_up_w, model.mm_model_ffn_up_b,
+                model.mm_1_w, model.mm_1_b,
                 nullptr, nullptr,
-                model.mm_model_ffn_down_w, model.mm_model_ffn_down_b,
+                model.mm_2_w, model.mm_2_b,
                 hparams.ffn_op, -1);
             cb(cur, "mlp_out", -1);
         }
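
The new `mlp_AR` path replaces the Siglip attention-pooling resampler: after a LayerNorm over the ViT output, every `stride = proj_scale_factor * proj_scale_factor` consecutive patch embeddings are concatenated channel-wise into a single token (zero-padding the token count up to a multiple of `stride` first), and the result goes through the two-layer `mm_1`/`mm_2` projector FFN. A minimal standalone sketch of the shape arithmetic, with hypothetical sizes and `align_up` standing in for `CLIP_ALIGN`:

```cpp
#include <cstdint>
#include <cstdio>

// stands in for CLIP_ALIGN: round x up to the next multiple of n
static int64_t align_up(int64_t x, int64_t n) { return ((x + n - 1) / n) * n; }

int main() {
    const int64_t proj_scale_factor = 2;                            // set in load_hparams below
    const int64_t stride   = proj_scale_factor * proj_scale_factor; // 4
    const int64_t n_embd   = 1152;                                  // hypothetical ViT width
    const int64_t n_tokens = 1023;                                  // hypothetical patch count

    const int64_t n_tokens_padded = align_up(n_tokens, stride);     // 1024
    const int64_t n_pad           = n_tokens_padded - n_tokens;     // 1 zero token appended
    // shape after the ggml_view_2d: [n_embd * stride, n_tokens_padded / stride]
    std::printf("n_pad = %lld, view = [%lld, %lld]\n",
                (long long) n_pad,
                (long long) (n_embd * stride),
                (long long) (n_tokens_padded / stride)); // n_pad = 1, view = [4608, 256]
    return 0;
}
```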
@@ -2521,7 +2512,7 @@ struct clip_model_loader {
                 } break;
             case PROJECTOR_TYPE_PADDLEOCR:
                 {
-                    hparams.proj_scale_factor = 1;
+                    hparams.proj_scale_factor = 2;
                 } break;
             default:
                 break;
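
Bumping `proj_scale_factor` from 1 to 2 is what makes the stacking above do real work: the stride becomes 2 × 2 = 4, so the projector folds four patch embeddings into each output token instead of passing them through one-to-one; e.g. 1023 patches pad to 1024 and come out as 256 projector tokens. Presumably because the graph pads the flat token count rather than merging spatially, `clip_n_output_tokens` also stops sharing the LFM2/KimiVL width-by-height formula and gets its own case in the last hunk below.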
@@ -2862,11 +2853,6 @@ struct clip_model_loader {
                 model.mm_model_attn_o_b = get_tensor(string_format(TN_RESAMPL_ATTN, "out", "bias"));
                 model.mm_model_ln_post_w = get_tensor(string_format(TN_RESAMPL_LN, "post", "weight"));
                 model.mm_model_ln_post_b = get_tensor(string_format(TN_RESAMPL_LN, "post", "bias"));
-                // resampler ffn
-                model.mm_model_ffn_up_w = get_tensor(string_format(TN_RESAMPL_FFN_UP, "weight"));
-                model.mm_model_ffn_up_b = get_tensor(string_format(TN_RESAMPL_FFN_UP, "bias"));
-                model.mm_model_ffn_down_w = get_tensor(string_format(TN_RESAMPL_FFN_DOWN, "weight"));
-                model.mm_model_ffn_down_b = get_tensor(string_format(TN_RESAMPL_FFN_DOWN, "bias"));
                 // projector ffn
                 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
@@ -3967,7 +3953,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             } break;
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
-        case PROJECTOR_TYPE_PADDLEOCR:
             {
                 // dynamic size
                 int scale_factor = ctx->model.hparams.proj_scale_factor;
@@ -3976,6 +3961,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
                 n_patches = x_patch * y_patch;
             } break;
+        case PROJECTOR_TYPE_PADDLEOCR:
+            {
+                // dynamic size
+                int scale_factor = ctx->model.hparams.proj_scale_factor;
+                int stride = scale_factor * scale_factor;
+                n_patches = CLIP_ALIGN(n_patches, stride) / stride;
+            } break;
         case PROJECTOR_TYPE_PIXTRAL:
             {
                 // dynamic size
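
The dedicated `PROJECTOR_TYPE_PADDLEOCR` case mirrors the graph's padding: it aligns the flat patch count up to a multiple of `stride` and divides, rather than deriving the count from image width/height as LFM2/KimiVL do. A quick standalone consistency check, with hypothetical patch counts and `align_up` again standing in for `CLIP_ALIGN`:

```cpp
#include <cassert>
#include <cstdint>

static int64_t align_up(int64_t x, int64_t n) { return ((x + n - 1) / n) * n; }

// token count as computed by the new PROJECTOR_TYPE_PADDLEOCR case
static int64_t paddleocr_tokens(int64_t n_patches, int scale_factor) {
    const int64_t stride = (int64_t) scale_factor * scale_factor;
    return align_up(n_patches, stride) / stride;
}

int main() {
    // must equal n_tokens_padded / stride from the graph code above
    assert(paddleocr_tokens(1023, 2) == 256); // 1023 -> padded to 1024
    assert(paddleocr_tokens(1024, 2) == 256); // already aligned, no padding
    assert(paddleocr_tokens(1025, 2) == 257); // one extra block of 4
    return 0;
}
```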