@@ -343,11 +343,6 @@ struct clip_ctx {
     bool use_silu = false;
     int32_t ftype = 1;

-    bool has_class_embedding = true;
-    bool has_pre_norm = true;
-    bool has_post_norm = false;
-    bool has_patch_bias = false;
-
     struct gguf_context * ctx_gguf = nullptr;
     struct ggml_context * ctx_data = nullptr;

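The four removed flags only mirrored whether the corresponding tensors had been loaded, so the commit tests the tensor pointers directly. Below is a minimal sketch of that pattern; the `vision_model_sketch` struct and `count_positions` helper are illustrative stand-ins rather than code from clip.cpp, and they assume (as the removed loader code further down confirms) that optional tensors stay nullptr when absent from the GGUF file.

struct ggml_tensor; // opaque ggml type, forward-declared only to keep the sketch self-contained

// Illustrative stand-in for the vision model: optional weights default to
// nullptr and are only assigned if the corresponding tensor exists in the file.
struct vision_model_sketch {
    struct ggml_tensor * class_embedding = nullptr;
    struct ggml_tensor * pre_ln_w        = nullptr;
    struct ggml_tensor * post_ln_w       = nullptr;
    struct ggml_tensor * patch_bias      = nullptr;
};

// Presence is the pointer itself, so no separately maintained has_* boolean
// can drift out of sync with the loaded weights.
static int count_positions(const vision_model_sketch & model, int num_patches) {
    return num_patches + (model.class_embedding ? 1 : 0); // +1 for the CLS slot
}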
@@ -510,7 +505,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
     }

     // post-layernorm
-    if (ctx->has_post_norm) {
+    if (model.post_ln_w) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "post_ln");

@@ -586,7 +581,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int patches_w = image_size_width / patch_size;
     const int patches_h = image_size_height / patch_size;
-    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
     const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
@@ -638,7 +633,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
     }

-    if (ctx->has_patch_bias) {
+    if (model.patch_bias) {
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }
@@ -647,7 +642,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im

     if (ctx->has_llava_projector) {
         // concat class_embeddings and patch_embeddings
-        if (ctx->has_class_embedding) {
+        if (model.class_embedding) {
             embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
             ggml_set_name(embeddings, "embeddings");
             ggml_set_input(embeddings);
@@ -684,7 +679,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     }

     // pre-layernorm
-    if (ctx->has_pre_norm) {
+    if (model.pre_ln_w) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");

@@ -794,7 +789,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     }

     // post-layernorm
-    if (ctx->has_post_norm) {
+    if (model.post_ln_w) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "post_ln");

@@ -1470,12 +1465,6 @@ struct clip_model_loader {
                GGML_ASSERT(false && "unknown projector type");
        }

-        // TODO @ngxson : this is legacy code, need to be removed
-        ctx_clip.has_class_embedding = vision_model.class_embedding != nullptr;
-        ctx_clip.has_pre_norm = vision_model.pre_ln_w != nullptr;
-        ctx_clip.has_post_norm = vision_model.post_ln_w != nullptr;
-        ctx_clip.has_patch_bias = vision_model.patch_bias != nullptr;
-
         // load data
         {
             std::vector<uint8_t> read_buf;
@@ -2506,7 +2495,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
-    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
     if (ctx->load_image_size == nullptr) {
         ctx->load_image_size = clip_image_size_init();
     }
@@ -2591,16 +2580,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             free(pos_embed_data);
         }
     }
-    else {
-        {
-            if (ctx->has_class_embedding) {
-                struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
+    else {
+        if (model.class_embedding) {
+            struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");

-                void * zero_mem = malloc(ggml_nbytes(embeddings));
-                memset(zero_mem, 0, ggml_nbytes(embeddings));
-                ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-                free(zero_mem);
-            }
+            void * zero_mem = malloc(ggml_nbytes(embeddings));
+            memset(zero_mem, 0, ggml_nbytes(embeddings));
+            ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+            free(zero_mem);
         }

         if (ctx->has_qwen2vl_merger) {
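The zero-fill of the "embeddings" input that moves one nesting level up in this hunk is unchanged in behaviour. For reference, the same step can be written without manual malloc/memset/free; this is an illustrative sketch only, not code from the patch, and assumes the real ggml headers are available.

#include "ggml.h"
#include "ggml-backend.h"

#include <cstdint>
#include <vector>

// Illustrative only: zero the graph's "embeddings" input, as the block above
// does when a class embedding is present, using std::vector as the scratch buffer.
static void zero_embeddings_input(struct ggml_cgraph * gf) {
    struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
    std::vector<uint8_t> zeros(ggml_nbytes(embeddings), 0);
    ggml_backend_tensor_set(embeddings, zeros.data(), 0, ggml_nbytes(embeddings));
}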
@@ -2648,7 +2635,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // The patches vector is used to get rows to index into the embeds with;
     // we should skip dim 0 only if we have CLS to avoid going out of bounds
     // when retrieving the rows.
-    int patch_offset = ctx->has_class_embedding ? 1 : 0;
+    int patch_offset = model.class_embedding ? 1 : 0;
     int * patches_data = (int *)malloc(ggml_nbytes(patches));
     for (int i = 0; i < num_patches; i++) {
         patches_data[i] = i + patch_offset;