@@ -667,9 +667,9 @@ struct clip_graph {
667667 constexpr int _depth = 12 ;
668668 constexpr int enc_n_heads = 12 ;
669669 constexpr int enc_d_heads = enc_n_embd / enc_n_heads;
670- constexpr int _prompt_n_embd = 256 ;
670+ // constexpr int _prompt_n_embd = 256;
671671 constexpr int enc_patch_size = 16 ;
672- constexpr int _window_size = 14 ;
672+ // constexpr int _window_size = 14;
673673
674674 const int enc_n_patches = enc_image_size / enc_patch_size; // 64
675675
@@ -834,7 +834,7 @@ struct clip_graph {
834834
835835 ggml_tensor * global_features_1 = build_sam_enc (inp_raw, std::max (img.nx , img.ny ));
836836
837- ggml_tensor * global_features_2 = build_dp_ocr_clip (inp_raw, global_features_1);
837+ ggml_tensor * global_features_2 = build_dp_ocr_clip (global_features_1);
838838
839839 // torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
840840 global_features_1 = ggml_permute (ctx0, global_features_1,2 ,1 ,0 ,3 );
@@ -1532,7 +1532,7 @@ struct clip_graph {
15321532 return gf;
15331533 }
15341534
1535- ggml_tensor * build_dp_ocr_clip (ggml_tensor * inpL, ggml_tensor * patch_embeds) {
1535+ ggml_tensor * build_dp_ocr_clip (ggml_tensor * patch_embeds) {
15361536 GGML_ASSERT (model.class_embedding != nullptr );
15371537 GGML_ASSERT (model.position_embeddings != nullptr );
15381538
@@ -2466,6 +2466,8 @@ struct clip_graph {
24662466 return inpL;
24672467 }
24682468
2469+ // Implementation based on approach suggested by Acly
2470+ // See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
24692471 static ggml_tensor* window_partition (ggml_context* ctx, ggml_tensor* x, int window) {
24702472 auto [c, w, h, b] = x->ne ;
24712473 // same as
@@ -2486,6 +2488,8 @@ struct clip_graph {
24862488 return x;
24872489 }
24882490
2491+ // Implementation based on approach suggested by Acly
2492+ // See: https://github.com/ggml-org/llama.cpp/pull/17383#issuecomment-3554227091
24892493 static ggml_tensor* window_unpartition (ggml_context* m, ggml_tensor* x, int w, int h, int window) {
24902494 int64_t c = x->ne [0 ];
24912495 // same as
@@ -4881,7 +4885,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
48814885 const int min_num = 2 ;
48824886 const int max_num = 9 ;
48834887 const int image_size = params.image_size ; // typically 640
4884- const bool use_thumbnail = true ; // mimic python's use_thumbnail
4888+ // const bool use_thumbnail = true; // mimic python's use_thumbnail
48854889
48864890 // original image size
48874891 const int orig_w = original_size.width ;
0 commit comments