@@ -104,7 +104,6 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD   "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
 #define TN_PATCH_EMBD "v.patch_embd.weight"
-#define TN_PATCH_BIAS "v.patch_embd.bias"
 #define TN_ATTN_K     "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q     "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V     "%s.blk.%d.attn_v.%s"
@@ -426,7 +425,6 @@ struct clip_vision_model {
     // embeddings
     struct ggml_tensor * class_embedding;
     struct ggml_tensor * patch_embeddings;
-    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;

     struct ggml_tensor * pre_ln_w;
@@ -503,11 +501,6 @@ struct clip_ctx {
     bool use_gelu = false;
     int32_t ftype = 1;

-    bool has_class_embedding = true;
-    bool has_pre_norm = true;
-    bool has_post_norm = false;
-    bool has_patch_bias = false;
-
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;
@@ -533,7 +526,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions = num_patches + 1;
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
@@ -564,23 +557,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

-    if (ctx->has_patch_bias) {
-        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
     // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = inp;
-    if (ctx->has_class_embedding) {
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
     ggml_set_name(embeddings, "embeddings");
     ggml_set_input(embeddings);

+    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+    embeddings = ggml_acc(ctx0, embeddings, inp,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
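With the `has_patch_bias` and `has_class_embedding` branches removed, the two `ggml_acc` calls above always run: the first writes `model.class_embedding` into row 0 of the freshly allocated `embeddings` tensor (byte offset 0), and the second writes the patch embeddings starting at byte offset `model.class_embedding->nb[1]`, i.e. one row in. A rough plain-C++ sketch of the equivalent copy (hypothetical helper, ignoring the batch dimension):

```cpp
#include <cstring>

// Illustration of what the two ggml_acc calls amount to for batch_size == 1:
// row 0 holds the class token, rows 1..num_patches hold the patch embeddings.
static void concat_class_and_patches(const float * class_embd,  // [hidden_size]
                                     const float * patches,     // [num_patches * hidden_size]
                                     float       * embeddings,  // [(num_patches + 1) * hidden_size]
                                     int num_patches, int hidden_size) {
    // first ggml_acc: class token at offset 0
    std::memcpy(embeddings, class_embd, hidden_size * sizeof(float));
    // second ggml_acc: patches at offset class_embedding->nb[1] (one row)
    std::memcpy(embeddings + hidden_size, patches,
                (size_t) num_patches * hidden_size * sizeof(float));
}
```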
@@ -590,7 +576,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

     // pre-layernorm
-    if (ctx->has_pre_norm) {
+    {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");

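The pre-layernorm block is now unconditional. `ggml_norm` normalizes each embedding vector over the hidden dimension without learned parameters; the subsequent `ggml_mul`/`ggml_add` with `model.pre_ln_w` and `model.pre_ln_b` apply the affine part. A scalar sketch of the per-row computation, for reference (illustrative only; ggml operates on whole tensors):

```cpp
#include <cmath>

// Per-row LayerNorm matching ggml_norm followed by mul (weight) and add (bias).
static void layer_norm_row(float * x, const float * w, const float * b,
                           int hidden_size, float eps) {
    float mean = 0.0f;
    for (int i = 0; i < hidden_size; i++) mean += x[i];
    mean /= hidden_size;

    float var = 0.0f;
    for (int i = 0; i < hidden_size; i++) {
        const float d = x[i] - mean;
        var += d * d;
    }
    var /= hidden_size;

    const float inv = 1.0f / std::sqrt(var + eps);
    for (int i = 0; i < hidden_size; i++) {
        x[i] = (x[i] - mean) * inv * w[i] + b[i]; // normalize, then affine
    }
}
```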
@@ -678,14 +664,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = cur;
     }

-    // post-layernorm
-    if (ctx->has_post_norm) {
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "post_ln");
-
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-    }
-
     // llava projector
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1170,39 +1148,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

     }

-    try {
-        vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-        new_clip->has_class_embedding = true;
-    } catch (const std::exception& e) {
-        new_clip->has_class_embedding = false;
-    }
-
-    try {
-        vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-        vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-        new_clip->has_pre_norm = true;
-    } catch (std::exception & e) {
-        new_clip->has_pre_norm = false;
-    }
-
-    try {
-        vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
-        vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
-        new_clip->has_post_norm = true;
-    } catch (std::exception & e) {
-        new_clip->has_post_norm = false;
-    }
-
-    try {
-        vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
-        new_clip->has_patch_bias = true;
-    } catch (std::exception & e) {
-        new_clip->has_patch_bias = false;
-    }
-
     try {
         vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+        vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
         vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+        vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+        vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
     } catch (const std::exception& e) {
         LOG_TEE("%s: failed to load vision model tensors\n", __func__);
     }
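This hunk folds the per-tensor try/catch blocks (which toggled feature flags) back into the single `try` above, so a missing tensor now just logs a warning. The pattern works because the `get_tensor` helper throws when a name is absent from the GGUF file; a minimal sketch of such a helper, assuming the standard `ggml_get_tensor` lookup:

```cpp
#include <stdexcept>
#include <string>
#include "ggml.h"

// Sketch of a throwing lookup in the style of clip.cpp's get_tensor helper:
// return the named tensor, or throw so the caller's catch block can react.
static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
    if (!cur) {
        throw std::runtime_error("unable to find tensor: " + name);
    }
    return cur;
}
```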