Commit f2e4eea

WIP: Support for Qwen2.5VL in clip.cpp
1 parent b9f01e9 commit f2e4eea

File tree: 3 files changed (+138, -62 lines)

examples/llava/clip.cpp

Lines changed: 117 additions & 59 deletions
@@ -9,25 +9,25 @@
 #include "ggml-backend.h"
 #include "gguf.h"

-//#ifdef GGML_USE_CUDA
-//#include "ggml-cuda.h"
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//#include "ggml-sycl.h"
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//#include "ggml-metal.h"
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//#include "ggml-cann.h"
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//#include "ggml-vulkan.h"
-//#endif
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif

 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
@@ -106,6 +106,8 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
+#define KEY_IS_QWEN2_5 "clip.is_qwen2_5"
+#define KEY_RMS_NORM_EPS "clip.%s.attention.rms_norm_epsilon"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_USE_SILU "clip.use_silu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
@@ -583,6 +585,7 @@ struct clip_ctx {
     bool has_minicpmv_projector = false;
     bool has_glm_projector = false;
     bool has_qwen2vl_merger = false;
+    bool is_qwen2_5 = false;
     int minicpmv_version = 2;

     struct clip_vision_model vision_model;
@@ -734,7 +737,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->is_qwen2_5) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2048, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 2) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
         }
         else if (ctx->minicpmv_version == 3) {
@@ -774,8 +780,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         {
             cur = ggml_norm(ctx0, cur, eps);

-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
-                           model.layers[il].ln_1_b);
+            if (ctx->is_qwen2_5) {
+                // RMSNorm for Qwen2.5 (no bias)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w);
+            } else {
+                // Standard LayerNorm with bias
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
+                               model.layers[il].ln_1_b);
+            }
         }

         // self-attention
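
Note: LayerNorm centers activations on their mean, divides by the standard deviation, and applies weight and bias, while RMSNorm only rescales by the root mean square and applies a weight with no bias. As a reference point for the branch above, here is a minimal scalar sketch of RMSNorm (a hypothetical helper for illustration, not code from this commit); observe that the branch above keeps the ggml_norm (mean-centering) call and only drops the bias, whereas ggml also exposes a true RMSNorm directly as ggml_rms_norm:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Reference RMSNorm: y_i = x_i / sqrt(mean(x^2) + eps) * w_i
    // (LayerNorm would additionally subtract mean(x), divide by the
    //  standard deviation, and add a bias b_i.)
    std::vector<float> rms_norm_ref(const std::vector<float> & x,
                                    const std::vector<float> & w, float eps) {
        float sum_sq = 0.0f;
        for (float v : x) sum_sq += v * v;
        const float scale = 1.0f / std::sqrt(sum_sq / x.size() + eps);
        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            y[i] = x[i] * scale * w[i];
        }
        return y;
    }
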
@@ -834,22 +846,47 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         {
             cur = ggml_norm(ctx0, cur, eps);

-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
+            if (ctx->is_qwen2_5) {
+                // RMSNorm for Qwen2.5 (no bias)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w);
+            } else {
+                // Standard LayerNorm with bias
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w),
+                               model.layers[il].ln_2_b);
+            }
         }

-        cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
-        cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+        // For Qwen2.5, the MLP uses SiLU gated activation
+        if (ctx->is_qwen2_5) {
+            // Qwen2.5 uses SiLU gated activation:
+            // ff_i_w acts as the gate_proj, ff_i_b carries the up_proj weight
+            struct ggml_tensor * gate = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+            struct ggml_tensor * up   = ggml_mul_mat(ctx0, model.layers[il].ff_i_b, cur); // using ff_i_b as up_proj weight
+
+            // Apply SiLU to the gate
+            gate = ggml_silu_inplace(ctx0, gate);
+
+            // Multiply gate and up
+            cur = ggml_mul(ctx0, gate, up);

-        if (ctx->use_gelu) {
-            cur = ggml_gelu_inplace(ctx0, cur);
-        } else if (ctx->use_silu) {
-            cur = ggml_silu_inplace(ctx0, cur);
+            // Apply down projection
+            cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
         } else {
-            cur = ggml_gelu_quick_inplace(ctx0, cur);
-        }
+            // Original MLP
+            cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+
+            if (ctx->use_gelu) {
+                cur = ggml_gelu_inplace(ctx0, cur);
+            } else if (ctx->use_silu) {
+                cur = ggml_silu_inplace(ctx0, cur);
+            } else {
+                cur = ggml_gelu_quick_inplace(ctx0, cur);
+            }

-        cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
-        cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+            cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
+            cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+        }

         // residual 2
         cur = ggml_add(ctx0, embeddings, cur);
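
The Qwen2.5 branch above implements a SwiGLU-style feed-forward, y = W_down * (SiLU(W_gate * x) ⊙ (W_up * x)); since clip_layer has no dedicated gate/up tensor pair, the WIP reuses the ff_i_b slot to carry the up-projection weight. A self-contained scalar sketch of that computation (hypothetical reference code with illustrative row-major layouts, not part of the commit):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    // SwiGLU FFN: y = W_down * (silu(W_gate * x) ⊙ (W_up * x))
    // w_gate/w_up are [n_ff x n_embd], w_down is [n_embd x n_ff], row-major.
    std::vector<float> swiglu_ffn(const std::vector<float> & x,
                                  const std::vector<float> & w_gate,
                                  const std::vector<float> & w_up,
                                  const std::vector<float> & w_down) {
        const size_t n_embd = x.size();
        const size_t n_ff   = w_gate.size() / n_embd;
        std::vector<float> h(n_ff);
        for (size_t i = 0; i < n_ff; ++i) {
            float g = 0.0f, u = 0.0f;
            for (size_t j = 0; j < n_embd; ++j) {
                g += w_gate[i*n_embd + j] * x[j];
                u += w_up  [i*n_embd + j] * x[j];
            }
            h[i] = silu(g) * u; // the gate modulates the up projection
        }
        std::vector<float> y(n_embd, 0.0f);
        for (size_t j = 0; j < n_embd; ++j) {
            for (size_t i = 0; i < n_ff; ++i) {
                y[j] += w_down[j*n_ff + i] * h[i];
            }
        }
        return y;
    }
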
@@ -1085,7 +1122,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int d_head = 128;
     int n_head = hidden_size/d_head;
     int num_query = 96;
-    if (ctx->minicpmv_version == 2) {
+    if (ctx->is_qwen2_5) {
+        hidden_size = 2048;
+        n_head = hidden_size/d_head;
+        num_query = 64;
+    }
+    else if (ctx->minicpmv_version == 2) {
         hidden_size = 4096;
         n_head = hidden_size/d_head;
         num_query = 96;
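
With these values, the Qwen2.5 resampler runs 2048/128 = 16 attention heads over 64 learned queries, versus 4096/128 = 32 heads over 96 queries for minicpmv version 2.
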
@@ -1296,30 +1338,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }

-//#ifdef GGML_USE_CUDA
-//    new_clip->backend = ggml_backend_cuda_init(0);
-//    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_METAL
-//    new_clip->backend = ggml_backend_metal_init();
-//    LOG_INF("%s: CLIP using Metal backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_CANN
-//    new_clip->backend = ggml_backend_cann_init(0);
-//    LOG_INF("%s: CLIP using CANN backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_VULKAN
-//    new_clip->backend = ggml_backend_vk_init(0);
-//    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
-//#endif
-//
-//#ifdef GGML_USE_SYCL
-//    new_clip->backend = ggml_backend_sycl_init(0);
-//    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
-//#endif
+#ifdef GGML_USE_CUDA
+    new_clip->backend = ggml_backend_cuda_init(0);
+    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+    new_clip->backend = ggml_backend_metal_init();
+    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_CANN
+    new_clip->backend = ggml_backend_cann_init(0);
+    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_VULKAN
+    new_clip->backend = ggml_backend_vk_init(0);
+    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_SYCL
+    new_clip->backend = ggml_backend_sycl_init(0);
+    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+#endif

     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
@@ -1360,6 +1402,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }
        // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search

+       idx = gguf_find_key(ctx, KEY_IS_QWEN2_5);
+       if (idx != -1) {
+           new_clip->is_qwen2_5 = gguf_get_val_bool(ctx, idx);
+       }
+
        GGML_ASSERT(new_clip->has_vision_encoder);
        GGML_ASSERT(!new_clip->has_text_encoder);

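The same find-then-read pattern would serve the new KEY_RMS_NORM_EPS key, whose read side is not in the hunks shown. A hypothetical sketch, assuming the epsilon is stored as a 32-bit float under the vision prefix:

    // Hypothetical read of the RMS-norm epsilon (assumed, not shown in this commit);
    // KEY_RMS_NORM_EPS expands to "clip.vision.attention.rms_norm_epsilon".
    float rms_eps = 1e-6f; // fallback if the key is absent
    {
        const std::string eps_key = format(KEY_RMS_NORM_EPS, "vision");
        const int eps_idx = gguf_find_key(ctx, eps_key.c_str());
        if (eps_idx != -1) {
            rms_eps = gguf_get_val_f32(ctx, eps_idx);
        }
    }
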
@@ -2942,7 +2989,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->is_qwen2_5) {
+            return 2048;
+        }
+        else if (ctx->minicpmv_version == 2) {
             return 4096;
         }
         else if (ctx->minicpmv_version == 3) {
@@ -2956,6 +3006,11 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_model_mlp_3_w->ne[1];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        // For Qwen2.5, the output dimension is 2048 instead of 3584
+        if (ctx->is_qwen2_5) {
+            LOG_INF("%s: Qwen2.5 detected, using output dimension 2048\n", __func__);
+            return 2048;
+        }
         return ctx->vision_model.mm_1_b->ne[0];
     }

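The 3584 in the comment is the merger output width of the Qwen2-VL model this path previously served, read dynamically from mm_1_b; the hardcoded 2048 matches the hidden size of the Qwen2.5-VL 3B checkpoint — an assumption of this note, since the commit itself only states the number.
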
@@ -2976,6 +3031,9 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
 bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
+bool clip_is_qwen2_5vl(const struct clip_ctx * ctx) {
+    return ctx->is_qwen2_5;
+}

 // Determine the number of encoder layers to iterate over
 int get_deepest_feature_layer(const struct clip_ctx * ctx) {
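
Callers can dispatch on the new predicate the same way existing code checks clip_is_qwen2vl. A hypothetical caller-side sketch (illustrative glue, not part of this commit), sizing an image-embedding buffer by querying the merger width instead of hardcoding it:

    #include "clip.h"

    // Hypothetical helper: the merger output width differs between Qwen2-VL
    // and Qwen2.5-VL, so branch on the variant instead of assuming a constant.
    static size_t image_embd_bytes(const struct clip_ctx * ctx_clip, int n_patches) {
        const int n_embd = clip_is_qwen2_5vl(ctx_clip)
            ? 2048                          // Qwen2.5-VL merger output
            : clip_n_mmproj_embd(ctx_clip); // e.g. Qwen2-VL reads mm_1_b
        return sizeof(float) * (size_t) n_embd * (size_t) n_patches;
    }
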

examples/llava/clip.h

Lines changed: 1 addition & 0 deletions
@@ -98,6 +98,7 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
 CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+CLIP_API bool clip_is_qwen2_5vl(const struct clip_ctx * ctx);

 CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);

examples/llava/qwen2_5_vl_surgery.py

Lines changed: 20 additions & 3 deletions
@@ -2,12 +2,10 @@
 from typing import Dict

 import torch
-import numpy as np
 from gguf import *
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     Qwen2_5_VLProcessor,
-    Qwen2_5_VLConfig,
 )

 VISION = "clip.vision"
@@ -137,15 +135,34 @@ def main(args):
     fout.add_bool("clip.use_silu", True)
     fout.add_bool("clip.use_gelu", False)

+    # Add missing keys
+    # 1. mm_patch_merge_type - Qwen2.5 uses a flat merge type
+    fout.add_string("clip.vision.mm_patch_merge_type", "flat")
+
+    # 2. image_grid_pinpoints - For Qwen2.5, we'll provide standard resolution options
+    # These are common grid pinpoints for image processing, defining possible resolutions
+    grid_pinpoints = [224, 224, 336, 336, 448, 448, 560, 560]
+    fout.add_array("clip.vision.image_grid_pinpoints", grid_pinpoints)
+
+    # 3. feature_layer - Typically set to the last layer(s) for feature extraction
+    # For Qwen2.5, we'll use the final layer
+    feature_layers = [vcfg.depth]  # Use the last layer
+    fout.add_array("clip.vision.feature_layer", feature_layers)
+
+    # 4. image_crop_resolution - Set to the same as image_size by default
+    image_size = 14 * 40  # same as used below
+    fout.add_uint32("clip.vision.image_crop_resolution", image_size)
+
     tensor_map = find_vision_tensors(model, np_dtype)
     for name, data in tensor_map.items():
         fout.add_tensor(name, data)

     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
-    fout.add_uint32("clip.vision.image_size", 14 * 40) # reasonable size divisible by (14*2)
+    fout.add_uint32("clip.vision.image_size", image_size) # reasonable size divisible by (14*2)
     fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size)
     fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
+    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
     fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), vcfg.intermediate_size)
     fout.add_name(model_name)
