@@ -249,9 +249,11 @@ struct clip_vision_model {
     struct ggml_tensor * mm_4_w = nullptr;
     struct ggml_tensor * mm_4_b = nullptr;
 
-    // GLMV-Edge projection
+    // GLMV-Edge projection
     struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
     struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
+    struct ggml_tensor * mm_glm_tok_boi = nullptr;
+    struct ggml_tensor * mm_glm_tok_eoi = nullptr;
 
     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w = nullptr;
@@ -1559,6 +1561,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
             embeddings = ggml_mul(ctx0, embeddings,x);
             embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
         }
+        // arrangement of BOI/EOI token embeddings
+        // note: these embeddings are not present in text model, hence we cannot process them as text tokens
+        // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
+        {
+            embeddings = ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI
+            embeddings = ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI
+        }
     }
 
     else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
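The two ggml_concat calls in the hunk above grow the embedding tensor along its token dimension (ne[1]): the learned BOI embedding is placed before the projected patch embeddings and the EOI embedding after them. Below is a minimal standalone sketch of that shape arithmetic, not part of the patch; the sizes n_embd = 4 and n_tokens = 9 are hypothetical stand-ins, only the ggml_concat(ctx, a, b, dim) usage mirrors the code above.

// sketch.cpp -- illustrative only, toy sizes
#include "ggml.h"
#include <cstdio>

int main() {
    ggml_init_params params = { /*mem_size =*/ 16u * 1024 * 1024, /*mem_buffer =*/ nullptr, /*no_alloc =*/ false };
    ggml_context * ctx0 = ggml_init(params);

    // ne[0] = embedding dim, ne[1] = number of tokens (toy values)
    ggml_tensor * boi        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 4, 1); // stands in for mm_glm_tok_boi
    ggml_tensor * eoi        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 4, 1); // stands in for mm_glm_tok_eoi
    ggml_tensor * embeddings = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 4, 9); // projector output

    // same pattern as the patch: concat along dim 1, the token dimension
    embeddings = ggml_concat(ctx0, boi, embeddings, 1); // BOI first
    embeddings = ggml_concat(ctx0, embeddings, eoi, 1); // EOI last

    // prints ne = [4, 11]: the two extra "tokens" are the BOI/EOI embeddings
    printf("ne = [%lld, %lld]\n", (long long) embeddings->ne[0], (long long) embeddings->ne[1]);

    ggml_free(ctx0);
    return 0;
}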
@@ -1972,12 +1981,14 @@ struct clip_model_loader {
             {
                 vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
                 vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
-                vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR,"weight"));
-                vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"weight"));
-                vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1,"bias"));
-                vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
-                vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
-                vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
+                vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
+                vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
+                vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
+                vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
+                vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
+                vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
+                vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight"));
+                vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight"));
             } break;
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
@@ -2948,6 +2959,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
 
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) {
         n_patches /= 4;
+        n_patches += 2; // for BOI and EOI token embeddings
     } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
        if (ctx->minicpmv_version == 2) {
            n_patches = 96;
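The `n_patches += 2` keeps the reported token count in sync with the graph change above, where the BOI/EOI concat adds exactly two token embeddings. A standalone illustration of the arithmetic follows; the 24x24 patch grid is a hypothetical figure chosen for the example, not taken from the GLM-Edge config.

#include <cstdio>

int main() {
    int n_patches = 24 * 24; // hypothetical ViT patch grid -> 576 patches
    n_patches /= 4;          // matches the n_patches /= 4 above -> 144
    n_patches += 2;          // + BOI and EOI embeddings         -> 146
    printf("n_patches = %d\n", n_patches);
    return 0;
}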