@@ -1555,8 +1555,8 @@ struct clip_graph {
15551555 ggml_tensor * inp = ggml_cpy (ctx0, patch_embeds, ggml_dup_tensor (ctx0, patch_embeds));
15561556
15571557
1558- inp = ggml_cont (ctx0,ggml_permute (ctx0, inp,2 , 1 , 0 , 3 ) );
1559- inp = ggml_reshape_2d (ctx0, inp, n_embd, inp-> ne [ 1 ]*inp-> ne [ 2 ]*inp-> ne [ 3 ] );
1558+ inp = ggml_reshape_2d (ctx0, inp, inp-> ne [ 0 ]*inp-> ne [ 1 ], inp-> ne [ 2 ] );
1559+ inp = ggml_cont (ctx0, ggml_permute (ctx0, inp, 1 , 0 , 2 , 3 ) );
15601560
15611561 ggml_tensor * new_pos_embd = ggml_cpy (ctx0, model.position_embeddings , ggml_dup_tensor (ctx0, model.position_embeddings ));
15621562
@@ -1587,7 +1587,7 @@ struct clip_graph {
15871587
15881588
15891589 // add CLS token
1590- inp = ggml_concat (ctx0, inp, model.class_embedding , 1 );
1590+ inp = ggml_concat (ctx0, model.class_embedding , inp , 1 );
15911591
15921592 // TODO : check norm type for dp-ocr-clip
15931593 norm_type norm_t = NORM_TYPE_NORMAL;
@@ -1596,7 +1596,6 @@ struct clip_graph {
15961596 ggml_tensor * positions = ggml_cast (ctx0, ggml_arange (ctx0, 0 , n_pos, 1 ), GGML_TYPE_I32);
15971597 ggml_tensor * learned_pos_embd = ggml_get_rows (ctx0, new_pos_embd, positions);
15981598
1599-
16001599 ggml_tensor * cur = build_vit (inp, n_pos, norm_t , hparams.ffn_op , learned_pos_embd,
16011600 nullptr ); // shape [1024, 16, 16]
16021601
@@ -2395,7 +2394,7 @@ struct clip_graph {
23952394 // pre-layernorm
23962395 if (model.pre_ln_w ) {
23972396 inpL = build_norm (inpL, model.pre_ln_w , model.pre_ln_b , norm_t , eps, -1 );
2398- cb (inpL, " pre_ln " , -1 );
2397+ cb (inpL, " vit_pre_ln " , -1 );
23992398 }
24002399
24012400 // loop over layers
@@ -5808,7 +5807,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
58085807 bool is_stored = false ;
58095808 std::vector<std::string> patterns = {
58105809 /* Add tensor names here to dump (e.g. "sam_output") */
5811- " sam_output "
5810+ " vit_pre_ln "
58125811 };
58135812
58145813 for (auto & p : patterns) {
0 commit comments