99#include " ggml-backend.h"
1010#include " gguf.h"
1111
12- // #ifdef GGML_USE_CUDA
13- // #include "ggml-cuda.h"
14- // #endif
15- //
16- // #ifdef GGML_USE_SYCL
17- // #include "ggml-sycl.h"
18- // #endif
19- //
20- // #ifdef GGML_USE_METAL
21- // #include "ggml-metal.h"
22- // #endif
23- //
24- // #ifdef GGML_USE_CANN
25- // #include "ggml-cann.h"
26- // #endif
27- //
28- // #ifdef GGML_USE_VULKAN
29- // #include "ggml-vulkan.h"
30- // #endif
12+ #ifdef GGML_USE_CUDA
13+ #include " ggml-cuda.h"
14+ #endif
15+
16+ #ifdef GGML_USE_SYCL
17+ #include " ggml-sycl.h"
18+ #endif
19+
20+ #ifdef GGML_USE_METAL
21+ #include " ggml-metal.h"
22+ #endif
23+
24+ #ifdef GGML_USE_CANN
25+ #include " ggml-cann.h"
26+ #endif
27+
28+ #ifdef GGML_USE_VULKAN
29+ #include " ggml-vulkan.h"
30+ #endif
3131
3232#define STB_IMAGE_IMPLEMENTATION
3333#include " stb_image.h"
@@ -106,6 +106,8 @@ static std::string format(const char * fmt, ...) {
// GGUF metadata key names.
// A key only matches when it is byte-identical to the name stored in the file,
// so these literals must not carry any leading or trailing whitespace.
#define KEY_HAS_GLM_PROJ        "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER  "clip.has_qwen2vl_merger"
// Optional boolean flag; the loader looks it up with gguf_find_key() and reads
// it with gguf_get_val_bool() to populate clip_ctx::is_qwen2_5.
#define KEY_IS_QWEN2_5          "clip.is_qwen2_5"
// printf-style template: the caller substitutes %s before the lookup
// (presumably with the encoder name, e.g. "vision" - TODO confirm at call sites).
#define KEY_RMS_NORM_EPS        "clip.%s.attention.rms_norm_epsilon"
#define KEY_USE_GELU            "clip.use_gelu"
#define KEY_USE_SILU            "clip.use_silu"
// printf-style template, same %s convention as KEY_RMS_NORM_EPS.
#define KEY_N_EMBD              "clip.%s.embedding_length"
@@ -583,6 +585,7 @@ struct clip_ctx {
583585 bool has_minicpmv_projector = false ;
584586 bool has_glm_projector = false ;
585587 bool has_qwen2vl_merger = false ;
588+ bool is_qwen2_5 = false ;
586589 int minicpmv_version = 2 ;
587590
588591 struct clip_vision_model vision_model;
@@ -734,7 +737,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
734737 if (ctx->has_minicpmv_projector ) {
735738 int pos_w = image_size_width/patch_size;
736739 int pos_h = image_size_height/patch_size;
737- if (ctx->minicpmv_version == 2 ) {
740+ if (ctx->is_qwen2_5 ) {
741+ pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 2048 , pos_w * pos_h, 1 );
742+ }
743+ else if (ctx->minicpmv_version == 2 ) {
738744 pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 4096 , pos_w * pos_h, 1 );
739745 }
740746 else if (ctx->minicpmv_version == 3 ) {
@@ -774,8 +780,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
774780 {
775781 cur = ggml_norm (ctx0, cur, eps);
776782
777- cur = ggml_add (ctx0, ggml_mul (ctx0, cur, model.layers [il].ln_1_w ),
778- model.layers [il].ln_1_b );
783+ if (ctx->is_qwen2_5 ) {
784+ // RMSNorm for Qwen2.5 (no bias)
785+ cur = ggml_mul (ctx0, cur, model.layers [il].ln_1_w );
786+ } else {
787+ // Standard LayerNorm with bias
788+ cur = ggml_add (ctx0, ggml_mul (ctx0, cur, model.layers [il].ln_1_w ),
789+ model.layers [il].ln_1_b );
790+ }
779791 }
780792
781793 // self-attention
@@ -834,22 +846,47 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
834846 {
835847 cur = ggml_norm (ctx0, cur, eps);
836848
837- cur = ggml_add (ctx0, ggml_mul (ctx0, cur, model.layers [il].ln_2_w ), model.layers [il].ln_2_b );
849+ if (ctx->is_qwen2_5 ) {
850+ // RMSNorm for Qwen2.5 (no bias)
851+ cur = ggml_mul (ctx0, cur, model.layers [il].ln_2_w );
852+ } else {
853+ // Standard LayerNorm with bias
854+ cur = ggml_add (ctx0, ggml_mul (ctx0, cur, model.layers [il].ln_2_w ),
855+ model.layers [il].ln_2_b );
856+ }
838857 }
839858
840- cur = ggml_mul_mat (ctx0, model.layers [il].ff_i_w , cur);
841- cur = ggml_add (ctx0, cur, model.layers [il].ff_i_b );
859+ // For Qwen2.5, the MLP uses SiLU gated activation
860+ if (ctx->is_qwen2_5 ) {
861+ // Qwen2.5 uses SiLU gated activation
862+ // ffn_down is the gate_proj, ffn_up is the up_proj
863+ struct ggml_tensor * gate = ggml_mul_mat (ctx0, model.layers [il].ff_i_w , cur);
864+ struct ggml_tensor * up = ggml_mul_mat (ctx0, model.layers [il].ff_i_b , cur); // using ff_i_b as up_proj weight
865+
866+ // Apply SiLU to the gate
867+ gate = ggml_silu_inplace (ctx0, gate);
868+
869+ // Multiply gate and up
870+ cur = ggml_mul (ctx0, gate, up);
842871
843- if (ctx->use_gelu ) {
844- cur = ggml_gelu_inplace (ctx0, cur);
845- } else if (ctx->use_silu ) {
846- cur = ggml_silu_inplace (ctx0, cur);
872+ // Apply down projection
873+ cur = ggml_mul_mat (ctx0, model.layers [il].ff_o_w , cur);
847874 } else {
848- cur = ggml_gelu_quick_inplace (ctx0, cur);
849- }
875+ // Original MLP
876+ cur = ggml_mul_mat (ctx0, model.layers [il].ff_i_w , cur);
877+ cur = ggml_add (ctx0, cur, model.layers [il].ff_i_b );
878+
879+ if (ctx->use_gelu ) {
880+ cur = ggml_gelu_inplace (ctx0, cur);
881+ } else if (ctx->use_silu ) {
882+ cur = ggml_silu_inplace (ctx0, cur);
883+ } else {
884+ cur = ggml_gelu_quick_inplace (ctx0, cur);
885+ }
850886
851- cur = ggml_mul_mat (ctx0, model.layers [il].ff_o_w , cur);
852- cur = ggml_add (ctx0, cur, model.layers [il].ff_o_b );
887+ cur = ggml_mul_mat (ctx0, model.layers [il].ff_o_w , cur);
888+ cur = ggml_add (ctx0, cur, model.layers [il].ff_o_b );
889+ }
853890
854891 // residual 2
855892 cur = ggml_add (ctx0, embeddings, cur);
@@ -1085,7 +1122,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
10851122 const int d_head = 128 ;
10861123 int n_head = hidden_size/d_head;
10871124 int num_query = 96 ;
1088- if (ctx->minicpmv_version == 2 ) {
1125+ if (ctx->is_qwen2_5 ) {
1126+ hidden_size = 2048 ;
1127+ n_head = hidden_size/d_head;
1128+ num_query = 64 ;
1129+ }
1130+ else if (ctx->minicpmv_version == 2 ) {
10891131 hidden_size = 4096 ;
10901132 n_head = hidden_size/d_head;
10911133 num_query = 96 ;
@@ -1296,30 +1338,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
12961338 }
12971339 }
12981340
1299- // #ifdef GGML_USE_CUDA
1300- // new_clip->backend = ggml_backend_cuda_init(0);
1301- // LOG_INF("%s: CLIP using CUDA backend\n", __func__);
1302- // #endif
1303- //
1304- // #ifdef GGML_USE_METAL
1305- // new_clip->backend = ggml_backend_metal_init();
1306- // LOG_INF("%s: CLIP using Metal backend\n", __func__);
1307- // #endif
1308- //
1309- // #ifdef GGML_USE_CANN
1310- // new_clip->backend = ggml_backend_cann_init(0);
1311- // LOG_INF("%s: CLIP using CANN backend\n", __func__);
1312- // #endif
1313- //
1314- // #ifdef GGML_USE_VULKAN
1315- // new_clip->backend = ggml_backend_vk_init(0);
1316- // LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
1317- // #endif
1318- //
1319- // #ifdef GGML_USE_SYCL
1320- // new_clip->backend = ggml_backend_sycl_init(0);
1321- // LOG_INF("%s: CLIP using SYCL backend\n", __func__);
1322- // #endif
1341+ #ifdef GGML_USE_CUDA
1342+ new_clip->backend = ggml_backend_cuda_init (0 );
1343+ LOG_INF (" %s: CLIP using CUDA backend\n " , __func__);
1344+ #endif
1345+
1346+ #ifdef GGML_USE_METAL
1347+ new_clip->backend = ggml_backend_metal_init ();
1348+ LOG_INF (" %s: CLIP using Metal backend\n " , __func__);
1349+ #endif
1350+
1351+ #ifdef GGML_USE_CANN
1352+ new_clip->backend = ggml_backend_cann_init (0 );
1353+ LOG_INF (" %s: CLIP using CANN backend\n " , __func__);
1354+ #endif
1355+
1356+ #ifdef GGML_USE_VULKAN
1357+ new_clip->backend = ggml_backend_vk_init (0 );
1358+ LOG_INF (" %s: CLIP using Vulkan backend\n " , __func__);
1359+ #endif
1360+
1361+ #ifdef GGML_USE_SYCL
1362+ new_clip->backend = ggml_backend_sycl_init (0 );
1363+ LOG_INF (" %s: CLIP using SYCL backend\n " , __func__);
1364+ #endif
13231365
13241366 if (!new_clip->backend ) {
13251367 new_clip->backend = ggml_backend_cpu_init ();
@@ -1360,6 +1402,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
13601402 }
13611403 // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
13621404
1405+ idx = gguf_find_key (ctx, KEY_IS_QWEN2_5);
1406+ if (idx != -1 ) {
1407+ new_clip->is_qwen2_5 = gguf_get_val_bool (ctx, idx);
1408+ }
1409+
13631410 GGML_ASSERT (new_clip->has_vision_encoder );
13641411 GGML_ASSERT (!new_clip->has_text_encoder );
13651412
@@ -2942,7 +2989,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
29422989 return ctx->vision_model .mm_3_b ->ne [0 ];
29432990 }
29442991 if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
2945- if (ctx->minicpmv_version == 2 ) {
2992+ if (ctx->is_qwen2_5 ) {
2993+ return 2048 ;
2994+ }
2995+ else if (ctx->minicpmv_version == 2 ) {
29462996 return 4096 ;
29472997 }
29482998 else if (ctx->minicpmv_version == 3 ) {
@@ -2956,6 +3006,11 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
29563006 return ctx->vision_model .mm_model_mlp_3_w ->ne [1 ];
29573007 }
29583008 if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
3009+ // For Qwen2.5, the output dimension is 2048 instead of 3584
3010+ if (ctx->is_qwen2_5 ) {
3011+ LOG_INF (" %s: Qwen2.5 detected, using output dimension 2048\n " , __func__);
3012+ return 2048 ;
3013+ }
29593014 return ctx->vision_model .mm_1_b ->ne [0 ];
29603015 }
29613016
@@ -2976,6 +3031,9 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
29763031bool clip_is_qwen2vl (const struct clip_ctx * ctx) {
29773032 return ctx->has_qwen2vl_merger ;
29783033}
3034+ bool clip_is_qwen2_5vl (const struct clip_ctx * ctx) {
3035+ return ctx->is_qwen2_5 ;
3036+ }
29793037
29803038// Determine the number of encoder layers to iterate over
29813039int get_deepest_feature_layer (const struct clip_ctx * ctx) {
0 commit comments