@@ -342,6 +342,10 @@ struct clip_model {
     ggml_tensor * mm_model_ln_kv_b = nullptr;
     ggml_tensor * mm_model_ln_post_w = nullptr;
     ggml_tensor * mm_model_ln_post_b = nullptr;
+    ggml_tensor * mm_model_ffn_up_w = nullptr;
+    ggml_tensor * mm_model_ffn_up_b = nullptr;
+    ggml_tensor * mm_model_ffn_down_w = nullptr;
+    ggml_tensor * mm_model_ffn_down_b = nullptr;

     // gemma3
     ggml_tensor * mm_input_proj_w = nullptr;
@@ -1136,6 +1140,77 @@ struct clip_graph {
         return gf;
     }

+    ggml_cgraph * build_paddleocr() {
+        // 2D input positions
+        ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+        ggml_set_name(pos_h, "pos_h");
+        ggml_set_input(pos_h);
+
+        ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+        ggml_set_name(pos_w, "pos_w");
+        ggml_set_input(pos_w);
+
+        ggml_tensor * learned_pos_embd = resize_position_embeddings();
+
+        // build ViT with 2D position embeddings
+        auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+            // first half is X axis and second half is Y axis
+            return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+        };
+
+        ggml_tensor * inp = build_inp();
+        ggml_tensor * cur = build_vit(
+            inp, n_patches,
+            NORM_TYPE_NORMAL,
+            hparams.ffn_op,
+            learned_pos_embd,
+            add_pos);
+
+        cb(cur, "vit_out", -1);
+
+        {
+            // SiglipMultiheadAttentionPoolingHead
+            int64_t n_pos = cur->ne[1];
+            ggml_tensor * Qcur = model.mm_model_query;
+            ggml_tensor * Kcur = cur;
+            ggml_tensor * Vcur = cur;
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+            cb(Qcur, "resampl_Qcur", -1);
+            cb(Kcur, "resampl_Kcur", -1);
+            cb(Vcur, "resampl_Vcur", -1);
+
+            float kq_scale = 1.0f / sqrtf((float)(d_head));
+            cur = build_attn(model.mm_model_attn_o_w, model.mm_model_attn_o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, -1);
+
+            cb(cur, "resampl_attn_out", -1);
+
+            cur = build_norm(cur, model.mm_model_ln_post_w, model.mm_model_ln_post_b,
+                NORM_TYPE_NORMAL, eps, -1);
+
+            cb(cur, "resampl_out", -1);
+        }
+
+        {
+            // SiglipMLP
+            cur = build_ffn(cur,
+                model.mm_model_ffn_up_w, model.mm_model_ffn_up_b,
+                nullptr, nullptr,
+                model.mm_model_ffn_down_w, model.mm_model_ffn_down_b,
+                hparams.ffn_op, -1);
+            cb(cur, "mlp_out", -1);
+        }
+
+        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // this graph is used by llava, granite and glm
     // due to having embedding_stack (used by granite), we cannot reuse build_vit
     ggml_cgraph * build_llava() {
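Note: the SiglipMultiheadAttentionPoolingHead block above cross-attends the learned query tensor mm_model_query over the ViT output tokens before the post-norm and MLP. As a rough orientation aid only (not this file's implementation: the per-head Q/K/V and output projections, their biases, and the multi-head split are all omitted), single-query attention pooling reduces to the following self-contained sketch:

    #include <cmath>
    #include <vector>

    // Illustrative sketch: one query vector attends over n token embeddings
    // of dimension d (tokens stored row-major, element tokens[i*d + j]).
    static std::vector<float> attn_pool(const std::vector<float> & tokens,
                                        const std::vector<float> & query,
                                        int n, int d) {
        const float scale = 1.0f / std::sqrt((float) d); // same role as kq_scale above
        std::vector<float> w(n);
        float maxw = -1e30f;
        for (int i = 0; i < n; i++) {
            float dot = 0.0f;
            for (int j = 0; j < d; j++) {
                dot += tokens[i*d + j] * query[j];
            }
            w[i] = dot * scale;
            if (w[i] > maxw) maxw = w[i];
        }
        // softmax over the attention scores
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {
            w[i] = std::exp(w[i] - maxw);
            sum += w[i];
        }
        // pooled output = attention-weighted sum of the tokens
        std::vector<float> out(d, 0.0f);
        for (int i = 0; i < n; i++) {
            const float a = w[i] / sum;
            for (int j = 0; j < d; j++) {
                out[j] += a * tokens[i*d + j];
            }
        }
        return out;
    }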
@@ -2125,6 +2200,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 res = graph.build_kimivl();
             } break;
+        case PROJECTOR_TYPE_PADDLEOCR:
+            {
+                res = graph.build_paddleocr();
+            } break;
         default:
             {
                 res = graph.build_llava();
@@ -2440,6 +2519,10 @@ struct clip_model_loader {
                 hparams.ffn_op = FFN_GELU_ERF;
                 log_ffn_op = "gelu_erf"; // temporary solution for logging
             } break;
+        case PROJECTOR_TYPE_PADDLEOCR:
+            {
+                hparams.proj_scale_factor = 1;
+            } break;
         default:
             break;
     }
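In the other dynamic-size projectors, proj_scale_factor is the side length of the window that merges neighbouring patch embeddings into one token; setting it to 1 here makes that merge a no-op for PaddleOCR. A hypothetical sketch of what a scale factor s does to an H x W grid of d-dim embeddings (names and layout are illustrative, not this file's API):

    #include <vector>

    // Merge each s x s block of patch embeddings into a single token of
    // dimension d*s*s (assumes H and W are divisible by s).
    // With s == 1 the output is identical to the input.
    static std::vector<float> pixel_shuffle(const std::vector<float> & in,
                                            int H, int W, int d, int s) {
        std::vector<float> out(in.size());
        size_t o = 0;
        for (int y = 0; y < H; y += s) {
            for (int x = 0; x < W; x += s) {
                for (int dy = 0; dy < s; dy++) {
                    for (int dx = 0; dx < s; dx++) {
                        const int idx = ((y + dy) * W + (x + dx)) * d;
                        for (int c = 0; c < d; c++) {
                            out[o++] = in[idx + c];
                        }
                    }
                }
            }
        }
        return out;
    }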
@@ -2650,25 +2733,25 @@ struct clip_model_loader {
             } break;
         case PROJECTOR_TYPE_MINICPMV:
             {
-                // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
-                model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
-                model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
-                model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
-                model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
-                model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
-                model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
-                model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
-                model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
-                model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
-                model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
-                model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
-                model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
-                model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
-                model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
-                model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
-                model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
-                model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
-                model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
+                // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_RESAMPL_POS_EMBD);
+                model.mm_model_pos_embed_k = get_tensor(TN_RESAMPL_POS_EMBD_K);
+                model.mm_model_query = get_tensor(TN_RESAMPL_QUERY);
+                model.mm_model_proj = get_tensor(TN_RESAMPL_PROJ);
+                model.mm_model_kv_proj = get_tensor(TN_RESAMPL_KV_PROJ);
+                model.mm_model_attn_q_w = get_tensor(string_format(TN_RESAMPL_ATTN, "q", "weight"));
+                model.mm_model_attn_k_w = get_tensor(string_format(TN_RESAMPL_ATTN, "k", "weight"));
+                model.mm_model_attn_v_w = get_tensor(string_format(TN_RESAMPL_ATTN, "v", "weight"));
+                model.mm_model_attn_q_b = get_tensor(string_format(TN_RESAMPL_ATTN, "q", "bias"));
+                model.mm_model_attn_k_b = get_tensor(string_format(TN_RESAMPL_ATTN, "k", "bias"));
+                model.mm_model_attn_v_b = get_tensor(string_format(TN_RESAMPL_ATTN, "v", "bias"));
+                model.mm_model_attn_o_w = get_tensor(string_format(TN_RESAMPL_ATTN, "out", "weight"));
+                model.mm_model_attn_o_b = get_tensor(string_format(TN_RESAMPL_ATTN, "out", "bias"));
+                model.mm_model_ln_q_w = get_tensor(string_format(TN_RESAMPL_LN, "q", "weight"));
+                model.mm_model_ln_q_b = get_tensor(string_format(TN_RESAMPL_LN, "q", "bias"));
+                model.mm_model_ln_kv_w = get_tensor(string_format(TN_RESAMPL_LN, "kv", "weight"));
+                model.mm_model_ln_kv_b = get_tensor(string_format(TN_RESAMPL_LN, "kv", "bias"));
+                model.mm_model_ln_post_w = get_tensor(string_format(TN_RESAMPL_LN, "post", "weight"));
+                model.mm_model_ln_post_b = get_tensor(string_format(TN_RESAMPL_LN, "post", "bias"));
             } break;
         case PROJECTOR_TYPE_GLM_EDGE:
             {
@@ -2766,6 +2849,32 @@ struct clip_model_loader {
                 model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
                 model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
             } break;
+        case PROJECTOR_TYPE_PADDLEOCR:
+            {
+                model.mm_model_query = get_tensor(TN_RESAMPL_QUERY);
+                model.mm_model_attn_q_w = get_tensor(string_format(TN_RESAMPL_ATTN, "q", "weight"));
+                model.mm_model_attn_k_w = get_tensor(string_format(TN_RESAMPL_ATTN, "k", "weight"));
+                model.mm_model_attn_v_w = get_tensor(string_format(TN_RESAMPL_ATTN, "v", "weight"));
+                model.mm_model_attn_q_b = get_tensor(string_format(TN_RESAMPL_ATTN, "q", "bias"));
+                model.mm_model_attn_k_b = get_tensor(string_format(TN_RESAMPL_ATTN, "k", "bias"));
+                model.mm_model_attn_v_b = get_tensor(string_format(TN_RESAMPL_ATTN, "v", "bias"));
+                model.mm_model_attn_o_w = get_tensor(string_format(TN_RESAMPL_ATTN, "out", "weight"));
+                model.mm_model_attn_o_b = get_tensor(string_format(TN_RESAMPL_ATTN, "out", "bias"));
+                model.mm_model_ln_post_w = get_tensor(string_format(TN_RESAMPL_LN, "post", "weight"));
+                model.mm_model_ln_post_b = get_tensor(string_format(TN_RESAMPL_LN, "post", "bias"));
+                // resampler ffn
+                model.mm_model_ffn_up_w = get_tensor(string_format(TN_RESAMPL_FFN_UP, "weight"));
+                model.mm_model_ffn_up_b = get_tensor(string_format(TN_RESAMPL_FFN_UP, "bias"));
+                model.mm_model_ffn_down_w = get_tensor(string_format(TN_RESAMPL_FFN_DOWN, "weight"));
+                model.mm_model_ffn_down_b = get_tensor(string_format(TN_RESAMPL_FFN_DOWN, "bias"));
+                // projector ffn
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+                model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
+                model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
+            } break;
         default:
             GGML_ASSERT(false && "unknown projector type");
     }
@@ -3856,6 +3965,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             } break;
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_PADDLEOCR:
             {
                 // dynamic size
                 int scale_factor = ctx->model.hparams.proj_scale_factor;
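Worked example of the dynamic token count (hypothetical numbers; the real values come from hparams): with a 14-pixel patch size, a 448 x 448 image has 448 / 14 = 32 patches per side. With proj_scale_factor = 1, the PaddleOCR setting above, that yields 32 * 32 = 1024 output tokens; a scale factor of 2 would merge them down to 16 * 16 = 256.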
@@ -4247,6 +4357,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_PADDLEOCR:
             {
                 // set the 2D positions
                 int n_patches_per_col = image_size_width / patch_size;
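Here the pos_h/pos_w graph inputs declared in build_paddleocr() get filled with each patch's row and column index, in row-major patch order, mirroring the PIXTRAL/KIMIVL path. A minimal sketch of the expected layout (assuming a set_input_i32-style helper like the one used elsewhere in this function):

    std::vector<int32_t> pos(n_patches);
    for (int i = 0; i < n_patches; i++) {
        pos[i] = i / n_patches_per_col;    // row index of patch i -> "pos_h"
    }
    set_input_i32("pos_h", pos);
    for (int i = 0; i < n_patches; i++) {
        pos[i] = i % n_patches_per_col;    // column index of patch i -> "pos_w"
    }
    set_input_i32("pos_w", pos);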
@@ -4402,6 +4513,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.mm_fc_w->ne[1];
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_PADDLEOCR:
             return ctx->model.mm_2_w->ne[1];
         default:
             GGML_ABORT("Unknown projector type");