Commit ddfaca7: model load ok
Parent: be80289

File tree: 3 files changed, +145 −25 lines

tools/mtmd/clip-impl.h

Lines changed: 10 additions & 6 deletions
@@ -95,12 +95,14 @@
 #define TN_TOK_GLM_EOI      "adapter.eoi" // glm-edge (these embeddings are not in text model)

 // mimicpmv
-#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
-#define TN_MINICPMV_QUERY      "resampler.query"
-#define TN_MINICPMV_PROJ       "resampler.proj.weight"
-#define TN_MINICPMV_KV_PROJ    "resampler.kv.weight"
-#define TN_MINICPMV_ATTN       "resampler.attn.%s.%s"
-#define TN_MINICPMV_LN         "resampler.ln_%s.%s"
+#define TN_RESAMPL_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_RESAMPL_QUERY      "resampler.query"
+#define TN_RESAMPL_PROJ       "resampler.proj.weight"
+#define TN_RESAMPL_KV_PROJ    "resampler.kv.weight"
+#define TN_RESAMPL_ATTN       "resampler.attn.%s.%s"
+#define TN_RESAMPL_LN         "resampler.ln_%s.%s"
+#define TN_RESAMPL_FFN_UP     "resampler.ffn_up.%s"
+#define TN_RESAMPL_FFN_DOWN   "resampler.ffn_down.%s"

 #define TN_GLM_ADAPER_CONV      "adapter.conv.%s"
 #define TN_GLM_ADAPTER_LINEAR   "adapter.linear.linear.%s"

@@ -139,6 +141,7 @@ enum projector_type {
     PROJECTOR_TYPE_VOXTRAL,
     PROJECTOR_TYPE_LFM2,
     PROJECTOR_TYPE_KIMIVL,
+    PROJECTOR_TYPE_PADDLEOCR,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -161,6 +164,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
     { PROJECTOR_TYPE_LFM2,      "lfm2"},
     { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
+    { PROJECTOR_TYPE_PADDLEOCR, "paddleocr"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
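The rename from TN_MINICPMV_* to TN_RESAMPL_* generalizes the tensor-name templates so the new PaddleOCR projector can share the resampler naming with MiniCPM-V. These macros are printf-style format strings that the loader expands with string_format. A minimal sketch of the expansion, assuming a simple snprintf-based helper (the real string_format lives elsewhere in the tree):

#include <cstdio>
#include <string>

// Illustrative stand-in for the project's string_format helper.
template <typename... Args>
static std::string string_format(const char * fmt, Args... args) {
    char buf[256];
    snprintf(buf, sizeof(buf), fmt, args...);
    return std::string(buf);
}

#define TN_RESAMPL_ATTN "resampler.attn.%s.%s"

int main() {
    // prints "resampler.attn.q.weight"
    printf("%s\n", string_format(TN_RESAMPL_ATTN, "q", "weight").c_str());
    return 0;
}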

tools/mtmd/clip.cpp

Lines changed: 131 additions & 19 deletions
@@ -342,6 +342,10 @@ struct clip_model {
     ggml_tensor * mm_model_ln_kv_b = nullptr;
     ggml_tensor * mm_model_ln_post_w = nullptr;
     ggml_tensor * mm_model_ln_post_b = nullptr;
+    ggml_tensor * mm_model_ffn_up_w = nullptr;
+    ggml_tensor * mm_model_ffn_up_b = nullptr;
+    ggml_tensor * mm_model_ffn_down_w = nullptr;
+    ggml_tensor * mm_model_ffn_down_b = nullptr;

     // gemma3
     ggml_tensor * mm_input_proj_w = nullptr;
@@ -1136,6 +1140,77 @@ struct clip_graph {
         return gf;
     }

+    ggml_cgraph * build_paddleocr() {
+        // 2D input positions
+        ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+        ggml_set_name(pos_h, "pos_h");
+        ggml_set_input(pos_h);
+
+        ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+        ggml_set_name(pos_w, "pos_w");
+        ggml_set_input(pos_w);
+
+        ggml_tensor * learned_pos_embd = resize_position_embeddings();
+
+        // build ViT with 2D position embeddings
+        auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+            // first half is X axis and second half is Y axis
+            return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+        };
+
+        ggml_tensor * inp = build_inp();
+        ggml_tensor * cur = build_vit(
+            inp, n_patches,
+            NORM_TYPE_NORMAL,
+            hparams.ffn_op,
+            learned_pos_embd,
+            add_pos);
+
+        cb(cur, "vit_out", -1);
+
+        {
+            // SiglipMultiheadAttentionPoolingHead
+            int64_t n_pos = cur->ne[1];
+            ggml_tensor * Qcur = model.mm_model_query;
+            ggml_tensor * Kcur = cur;
+            ggml_tensor * Vcur = cur;
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+            cb(Qcur, "resampl_Qcur", -1);
+            cb(Kcur, "resampl_Kcur", -1);
+            cb(Vcur, "resampl_Vcur", -1);
+
+            float kq_scale = 1.0f / sqrtf((float)(d_head));
+            cur = build_attn(model.mm_model_attn_o_w, model.mm_model_attn_o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, -1);
+
+            cb(cur, "resampl_attn_out", -1);
+
+            cur = build_norm(cur, model.mm_model_ln_post_w, model.mm_model_ln_post_b,
+                NORM_TYPE_NORMAL, eps, -1);
+
+            cb(cur, "resampl_out", -1);
+        }
+
+        {
+            // SiglipMLP
+            cur = build_ffn(cur,
+                model.mm_model_ffn_up_w, model.mm_model_ffn_up_b,
+                nullptr, nullptr,
+                model.mm_model_ffn_down_w, model.mm_model_ffn_down_b,
+                hparams.ffn_op, -1);
+            cb(cur, "mlp_out", -1);
+        }
+
+        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // this graph is used by llava, granite and glm
     // due to having embedding_stack (used by granite), we cannot reuse build_vit
     ggml_cgraph * build_llava() {
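For orientation, the SiglipMultiheadAttentionPoolingHead block above pools a variable-length patch sequence with cross-attention from a learned query rather than mean pooling. In a simplified single-head view, with learned query q (model.mm_model_query) and ViT output X (one row per patch):

    out = W_o * softmax(q X^T / sqrt(d_head)) X + b_o

followed by the post LayerNorm (mm_model_ln_post_w/_b). The graph performs the same computation per head after the ggml_reshape_3d splits, with kq_scale = 1/sqrt(d_head).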
@@ -2125,6 +2200,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 res = graph.build_kimivl();
             } break;
+        case PROJECTOR_TYPE_PADDLEOCR:
+            {
+                res = graph.build_paddleocr();
+            } break;
         default:
             {
                 res = graph.build_llava();
@@ -2440,6 +2519,10 @@ struct clip_model_loader {
                 hparams.ffn_op = FFN_GELU_ERF;
                 log_ffn_op = "gelu_erf"; // temporary solution for logging
             } break;
+        case PROJECTOR_TYPE_PADDLEOCR:
+            {
+                hparams.proj_scale_factor = 1;
+            } break;
         default:
             break;
     }
@@ -2650,25 +2733,25 @@
             } break;
         case PROJECTOR_TYPE_MINICPMV:
             {
-                // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
-                model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
-                model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
-                model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
-                model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
-                model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
-                model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
-                model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
-                model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
-                model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
-                model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
-                model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
-                model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
-                model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
-                model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
-                model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
-                model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
-                model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
-                model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
+                // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_RESAMPL_POS_EMBD);
+                model.mm_model_pos_embed_k = get_tensor(TN_RESAMPL_POS_EMBD_K);
+                model.mm_model_query = get_tensor(TN_RESAMPL_QUERY);
+                model.mm_model_proj = get_tensor(TN_RESAMPL_PROJ);
+                model.mm_model_kv_proj = get_tensor(TN_RESAMPL_KV_PROJ);
+                model.mm_model_attn_q_w = get_tensor(string_format(TN_RESAMPL_ATTN, "q", "weight"));
+                model.mm_model_attn_k_w = get_tensor(string_format(TN_RESAMPL_ATTN, "k", "weight"));
+                model.mm_model_attn_v_w = get_tensor(string_format(TN_RESAMPL_ATTN, "v", "weight"));
+                model.mm_model_attn_q_b = get_tensor(string_format(TN_RESAMPL_ATTN, "q", "bias"));
+                model.mm_model_attn_k_b = get_tensor(string_format(TN_RESAMPL_ATTN, "k", "bias"));
+                model.mm_model_attn_v_b = get_tensor(string_format(TN_RESAMPL_ATTN, "v", "bias"));
+                model.mm_model_attn_o_w = get_tensor(string_format(TN_RESAMPL_ATTN, "out", "weight"));
+                model.mm_model_attn_o_b = get_tensor(string_format(TN_RESAMPL_ATTN, "out", "bias"));
+                model.mm_model_ln_q_w = get_tensor(string_format(TN_RESAMPL_LN, "q", "weight"));
+                model.mm_model_ln_q_b = get_tensor(string_format(TN_RESAMPL_LN, "q", "bias"));
+                model.mm_model_ln_kv_w = get_tensor(string_format(TN_RESAMPL_LN, "kv", "weight"));
+                model.mm_model_ln_kv_b = get_tensor(string_format(TN_RESAMPL_LN, "kv", "bias"));
+                model.mm_model_ln_post_w = get_tensor(string_format(TN_RESAMPL_LN, "post", "weight"));
+                model.mm_model_ln_post_b = get_tensor(string_format(TN_RESAMPL_LN, "post", "bias"));
             } break;
         case PROJECTOR_TYPE_GLM_EDGE:
             {
@@ -2766,6 +2849,32 @@ struct clip_model_loader {
                 model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
                 model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
             } break;
+        case PROJECTOR_TYPE_PADDLEOCR:
+            {
+                model.mm_model_query = get_tensor(TN_RESAMPL_QUERY);
+                model.mm_model_attn_q_w = get_tensor(string_format(TN_RESAMPL_ATTN, "q", "weight"));
+                model.mm_model_attn_k_w = get_tensor(string_format(TN_RESAMPL_ATTN, "k", "weight"));
+                model.mm_model_attn_v_w = get_tensor(string_format(TN_RESAMPL_ATTN, "v", "weight"));
+                model.mm_model_attn_q_b = get_tensor(string_format(TN_RESAMPL_ATTN, "q", "bias"));
+                model.mm_model_attn_k_b = get_tensor(string_format(TN_RESAMPL_ATTN, "k", "bias"));
+                model.mm_model_attn_v_b = get_tensor(string_format(TN_RESAMPL_ATTN, "v", "bias"));
+                model.mm_model_attn_o_w = get_tensor(string_format(TN_RESAMPL_ATTN, "out", "weight"));
+                model.mm_model_attn_o_b = get_tensor(string_format(TN_RESAMPL_ATTN, "out", "bias"));
+                model.mm_model_ln_post_w = get_tensor(string_format(TN_RESAMPL_LN, "post", "weight"));
+                model.mm_model_ln_post_b = get_tensor(string_format(TN_RESAMPL_LN, "post", "bias"));
+                // resampler ffn
+                model.mm_model_ffn_up_w = get_tensor(string_format(TN_RESAMPL_FFN_UP, "weight"));
+                model.mm_model_ffn_up_b = get_tensor(string_format(TN_RESAMPL_FFN_UP, "bias"));
+                model.mm_model_ffn_down_w = get_tensor(string_format(TN_RESAMPL_FFN_DOWN, "weight"));
+                model.mm_model_ffn_down_b = get_tensor(string_format(TN_RESAMPL_FFN_DOWN, "bias"));
+                // projector ffn
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+                model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
+                model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
+            } break;
         default:
             GGML_ASSERT(false && "unknown projector type");
     }
@@ -3856,6 +3965,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
             } break;
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_PADDLEOCR:
             {
                 // dynamic size
                 int scale_factor = ctx->model.hparams.proj_scale_factor;
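Since proj_scale_factor is set to 1 for PaddleOCR above, the dynamic token count reduces to the raw patch grid. As a worked example with illustrative numbers (not taken from this commit), assuming the same pixel-shuffle arithmetic as the LFM2/KIMIVL path: a 448x448 input with 14-pixel patches gives a 32x32 grid, i.e. 1024 output tokens, while a scale factor of 2 would shrink that by 2^2 = 4x to 256 tokens.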
@@ -4247,6 +4357,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
             } break;
         case PROJECTOR_TYPE_PIXTRAL:
         case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_PADDLEOCR:
            {
                // set the 2D positions
                int n_patches_per_col = image_size_width / patch_size;
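The pos_w/pos_h graph inputs declared in build_paddleocr are filled host-side in this branch. A minimal sketch of how 2D positions for a row-major patch grid are typically derived (variable names are illustrative, not necessarily the ones used later in this function):

// Illustrative: per-patch row/column indices for a row-major patch grid.
std::vector<int32_t> pos_h_host(n_patches);
std::vector<int32_t> pos_w_host(n_patches);
for (int i = 0; i < n_patches; i++) {
    pos_h_host[i] = i / n_patches_per_col; // patch row (Y)
    pos_w_host[i] = i % n_patches_per_col; // patch column (X)
}
// the vectors are then copied into the "pos_h" / "pos_w" graph inputs
// (e.g. via ggml_backend_tensor_set)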
@@ -4402,6 +4513,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->model.mm_fc_w->ne[1];
     case PROJECTOR_TYPE_LFM2:
     case PROJECTOR_TYPE_KIMIVL:
+    case PROJECTOR_TYPE_PADDLEOCR:
         return ctx->model.mm_2_w->ne[1];
     default:
         GGML_ABORT("Unknown projector type");

tools/mtmd/mtmd.cpp

Lines changed: 4 additions & 0 deletions
@@ -275,6 +275,10 @@ struct mtmd_context {
             img_beg = "<img>";
             img_end = "</img>";

+        } else if (proj == PROJECTOR_TYPE_PADDLEOCR) {
+            // <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
+            img_beg = "<|IMAGE_START|>";
+            img_end = "<|IMAGE_END|>";
         }
     }
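With these markers, mtmd wraps each image slot in the prompt text roughly like this (illustrative; the surrounding chat-template handling lives elsewhere in mtmd.cpp):

Describe the text in this image: <|IMAGE_START|>(image embeddings)<|IMAGE_END|>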
