@@ -835,14 +835,23 @@ struct clip_graph {
835835 ggml_tensor * global_features_2 = build_dp_ocr_clip (inp_raw, global_features_1);
836836
837837 // torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
838- ggml_tensor * global_features = ggml_concat (ctx0, global_features_1, global_features_2, 0 );
838+ global_features_1 = ggml_permute (ctx0, global_features_1,2 ,1 ,0 ,3 );
839+ global_features_1 = ggml_cont (ctx0, global_features_1);
840+ global_features_1 = ggml_reshape_2d (ctx0, global_features_1, n_embd, n_patches);
841+ // remove CLS token
842+ global_features_2 = ggml_view_2d (ctx0, global_features_2,
843+ n_embd, n_patches,
844+ ggml_row_size (global_features_2->type , n_embd), 0 );
845+
846+ ggml_tensor * global_features = ggml_concat (ctx0, global_features_2, global_features_1, 1 );
839847 global_features = build_global_local_features (
840848 ctx0,
841849 global_features,
842850 n_patches_y,
843851 n_patches_x,
844852 n_embd
845853 );
854+ ggml_build_forward_expand (gf, global_features);
846855
847856 return gf;
848857 }
@@ -858,8 +867,8 @@ struct clip_graph {
858867 int n_dim) {
859868 GGML_ASSERT (model.image_newline != nullptr );
860869 GGML_ASSERT (model.view_seperator != nullptr );
861- GGML_ASSERT (global_features->ne [0 ] == ( int64_t ) n_dim);
862- GGML_ASSERT (global_features->ne [1 ] == ( int64_t ) (h * w));
870+ GGML_ASSERT (global_features->ne [0 ] == static_cast < int64_t >( n_dim) );
871+ GGML_ASSERT (global_features->ne [1 ] == static_cast < int64_t >( 2 * (h * w) ));
863872
864873 // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
865874 ggml_tensor * t = ggml_reshape_3d (ctx0, global_features, n_dim, w, h); // (n_dim, w, h)
@@ -1552,8 +1561,7 @@ struct clip_graph {
15521561
15531562 // for selecting learned pos embd, used by ViT
15541563 struct ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_pos);
1555- ggml_set_name (positions, " positions" );
1556- ggml_set_input (positions);
1564+ cb (positions, " positions" , -1 );
15571565 ggml_tensor * learned_pos_embd = ggml_get_rows (ctx0, model.position_embeddings , positions);
15581566
15591567
@@ -3607,6 +3615,9 @@ struct clip_model_loader {
36073615 model.net_2 = get_tensor (string_format (TN_SAM_NET, 2 , " weight" ));
36083616 model.net_3 = get_tensor (string_format (TN_SAM_NET, 3 , " weight" ));
36093617 }
3618+ model.image_newline = get_tensor (TN_IMAGE_NEWLINE, false );
3619+ model.view_seperator = get_tensor (TN_IMAGE_SEPERATOR, false );
3620+
36103621 break ;
36113622 default :
36123623 GGML_ASSERT (false && " unknown projector type" );
0 commit comments