Skip to content

Commit 0e6f8b2

Browse files
committed
Port of Qwen3-VL for latest ik_llama.cpp
- convert_hf_to_gguf.py - Not touched, use llama.cpp to convert the model instead - SYCL and Metal support for imrope not added - Vulkan support for imrope not tested - Code not tested
1 parent 14760aa commit 0e6f8b2

File tree

21 files changed

+841
-73
lines changed

21 files changed

+841
-73
lines changed

examples/mtmd/clip-impl.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
3737
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
3838
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
39+
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"
3940

4041
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
4142
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -59,6 +60,7 @@
5960
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
6061
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
6162
#define TN_PATCH_BIAS "v.patch_embd.bias"
63+
#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
6264
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
6365
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
6466
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@@ -89,6 +91,9 @@
8991
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
9092
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
9193
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
94+
#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack
95+
#define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack
96+
#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack
9297

9398
// minicpmv
9499
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@@ -123,6 +128,7 @@ enum projector_type {
123128
PROJECTOR_TYPE_MINICPMV,
124129
PROJECTOR_TYPE_GLM_EDGE,
125130
PROJECTOR_TYPE_QWEN2VL,
131+
PROJECTOR_TYPE_QWEN3VL,
126132
PROJECTOR_TYPE_GEMMA3,
127133
PROJECTOR_TYPE_IDEFICS3,
128134
PROJECTOR_TYPE_PIXTRAL,
@@ -146,6 +152,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
146152
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
147153
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
148154
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
155+
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
149156
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
150157
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
151158
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},

examples/mtmd/clip.cpp

Lines changed: 247 additions & 7 deletions
Large diffs are not rendered by default.

examples/mtmd/clip.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
9393
int clip_is_minicpmv(const struct clip_ctx * ctx);
9494
bool clip_is_glm(const struct clip_ctx * ctx);
9595
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
96+
bool clip_is_qwen3vl(const struct clip_ctx * ctx);
9697
bool clip_is_llava(const struct clip_ctx * ctx);
9798
bool clip_is_gemma3(const struct clip_ctx * ctx);
9899

examples/mtmd/mtmd.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ struct mtmd_context {
252252
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
253253
img_end = "[IMG_END]";
254254

255-
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
255+
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
256256
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
257257
img_beg = "<|vision_start|>";
258258
img_end = "<|vision_end|>";

ggml/include/ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@
259259
#define GGML_ROPE_TYPE_NEOX 2
260260
#define GGML_ROPE_TYPE_MROPE 8
261261
#define GGML_ROPE_TYPE_VISION 24
262+
#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
262263

263264
#define GGML_MROPE_SECTIONS 4
264265

ggml/src/ggml-cuda/rope.cu

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ template<bool forward, bool has_ff, typename T>
125125
static __global__ void rope_multi(
126126
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
127127
const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
128-
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) {
128+
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections, const bool is_imrope) {
129129
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
130130

131131
if (i0 >= ne0) {
@@ -152,17 +152,27 @@ static __global__ void rope_multi(
152152
const int sector = (i0 / 2) % sect_dims;
153153

154154
float theta_base = 0.0;
155-
if (sector < sections.v[0]) {
156-
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
157-
}
158-
else if (sector >= sections.v[0] && sector < sec_w) {
159-
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
160-
}
161-
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
162-
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
163-
}
164-
else if (sector >= sec_w + sections.v[2]) {
165-
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
155+
if (is_imrope) {
156+
if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
157+
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
158+
} else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
159+
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
160+
} else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t
161+
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
162+
}
163+
} else {
164+
if (sector < sections.v[0]) {
165+
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
166+
}
167+
else if (sector >= sections.v[0] && sector < sec_w) {
168+
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
169+
}
170+
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
171+
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
172+
}
173+
else if (sector >= sec_w + sections.v[2]) {
174+
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
175+
}
166176
}
167177

168178
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -276,7 +286,7 @@ template<bool forward, typename T>
276286
static void rope_multi_cuda(
277287
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
278288
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
279-
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
289+
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, const bool is_imrope, cudaStream_t stream) {
280290
GGML_ASSERT(ne0 % 2 == 0);
281291
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
282292
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@@ -287,11 +297,11 @@ static void rope_multi_cuda(
287297
if (freq_factors == nullptr) {
288298
rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
289299
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
290-
attn_factor, corr_dims, theta_scale, freq_factors, sections);
300+
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
291301
} else {
292302
rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
293303
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
294-
attn_factor, corr_dims, theta_scale, freq_factors, sections);
304+
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
295305
}
296306
}
297307

@@ -369,6 +379,7 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
369379

370380
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
371381
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
382+
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
372383
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
373384

374385
if (is_mrope) {
@@ -406,11 +417,11 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
406417
if (src0->type == GGML_TYPE_F32) {
407418
rope_multi_cuda<forward>(
408419
(const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
409-
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
420+
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
410421
} else if (src0->type == GGML_TYPE_F16) {
411422
rope_multi_cuda<forward>(
412423
(const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
413-
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
424+
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
414425
} else {
415426
GGML_ABORT("fatal error");
416427
}

ggml/src/ggml-vulkan.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -815,6 +815,7 @@ struct vk_op_rope_push_constants {
815815
uint32_t s1;
816816
uint32_t s2;
817817
int32_t sections[4];
818+
uint32_t is_imrope;
818819
uint32_t is_back;
819820
};
820821

@@ -6754,6 +6755,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
67546755
const int mode = ((const int32_t *) dst->op_params)[2];
67556756
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
67566757
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
6758+
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
67576759
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
67586760

67596761
if (is_neox) {
@@ -6763,7 +6765,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
67636765
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
67646766
return ctx->device->pipeline_rope_neox_f16;
67656767
}
6766-
} else if (is_mrope && !is_vision) {
6768+
} else if ((is_mrope || is_imrope) && !is_vision) {
67676769
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
67686770
return ctx->device->pipeline_rope_multi_f32;
67696771
}
@@ -7970,6 +7972,8 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
79707972
memcpy(sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
79717973
}
79727974

7975+
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
7976+
79737977
float corr_dims[2];
79747978
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
79757979

@@ -7982,7 +7986,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
79827986
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
79837987
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
79847988
src2 != nullptr, (uint32_t)src0->ne[2], s1, s2,
7985-
sections[0], sections[1], sections[2], sections[3], backprop
7989+
sections[0], sections[1], sections[2], sections[3], is_imrope, backprop
79867990
}, dryrun);
79877991
}
79887992

ggml/src/ggml.c

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18339,7 +18339,7 @@ static void ggml_rope_cache_init(
1833918339
}
1834018340

1834118341
static void ggml_mrope_cache_init(
18342-
float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
18342+
float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool is_imrope, bool indep_sects,
1834318343
float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
1834418344
float * cache, float sin_sign, float theta_scale) {
1834518345
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -18374,14 +18374,26 @@ static void ggml_mrope_cache_init(
1837418374
}
1837518375

1837618376
float theta = theta_t;
18377-
if (sector >= sections[0] && sector < sec_w) {
18378-
theta = theta_h;
18379-
}
18380-
else if (sector >= sec_w && sector < sec_w + sections[2]) {
18381-
theta = theta_w;
18382-
}
18383-
else if (sector >= sec_w + sections[2]) {
18384-
theta = theta_e;
18377+
if (is_imrope) { // qwen3vl apply interleaved mrope
18378+
if (sector % 3 == 1 && sector < 3 * sections[1]) {
18379+
theta = theta_h;
18380+
} else if (sector % 3 == 2 && sector < 3 * sections[2]) {
18381+
theta = theta_w;
18382+
} else if (sector % 3 == 0 && sector < 3 * sections[0]) {
18383+
theta = theta_t;
18384+
} else {
18385+
theta = theta_e;
18386+
}
18387+
} else {
18388+
if (sector >= sections[0] && sector < sec_w) {
18389+
theta = theta_h;
18390+
}
18391+
else if (sector >= sec_w && sector < sec_w + sections[2]) {
18392+
theta = theta_w;
18393+
}
18394+
else if (sector >= sec_w + sections[2]) {
18395+
theta = theta_e;
18396+
}
1838518397
}
1838618398

1838718399
rope_yarn(
@@ -18454,6 +18466,7 @@ static void ggml_compute_forward_rope_f32(
1845418466

1845518467
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
1845618468
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
18469+
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
1845718470
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
1845818471

1845918472
if (is_mrope) {
@@ -18492,7 +18505,7 @@ static void ggml_compute_forward_rope_f32(
1849218505
const int64_t p_w = pos[i2 + ne2 * 2];
1849318506
const int64_t p_e = pos[i2 + ne2 * 3];
1849418507
ggml_mrope_cache_init(
18495-
p_t, p_h, p_w, p_e, sections, is_vision,
18508+
p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
1849618509
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
1849718510
}
1849818511

@@ -18640,6 +18653,7 @@ static void ggml_compute_forward_rope_f16(
1864018653

1864118654
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
1864218655
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
18656+
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
1864318657
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
1864418658

1864518659
if (is_mrope) {
@@ -18678,7 +18692,7 @@ static void ggml_compute_forward_rope_f16(
1867818692
const int64_t p_w = pos[i2 + ne2 * 2];
1867918693
const int64_t p_e = pos[i2 + ne2 * 3];
1868018694
ggml_mrope_cache_init(
18681-
p_t, p_h, p_w, p_e, sections, is_vision,
18695+
p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
1868218696
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
1868318697
}
1868418698

ggml/src/vulkan-shaders/rope_head.comp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ layout (push_constant) uniform parameter {
2929
uint s1;
3030
uint s2;
3131
int sections[4];
32+
uint is_imrope;
3233
uint is_back;
3334
} p;
3435

ggml/src/vulkan-shaders/rope_multi.comp

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,29 @@ void main() {
3232
const uint sector = (i0 / 2) % sect_dims;
3333

3434
float theta_base = 0.0;
35-
if (sector < p.sections[0]) {
36-
theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
37-
}
38-
else if (sector >= p.sections[0] && sector < sec_w) {
39-
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
40-
}
41-
else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
42-
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
43-
}
44-
else if (sector >= sec_w + p.sections[2]) {
45-
theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
35+
if (p.is_imrope != 0) {
36+
if (sector % 3 == 1 && sector < 3 * p.sections[1]) {
37+
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
38+
} else if (sector % 3 == 2 && sector < 3 * p.sections[2]) {
39+
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
40+
} else if (sector % 3 == 0 && sector < 3 * p.sections[0]) {
41+
theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
42+
} else {
43+
theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
44+
}
45+
} else {
46+
if (sector < p.sections[0]) {
47+
theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
48+
}
49+
else if (sector >= p.sections[0] && sector < sec_w) {
50+
theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
51+
}
52+
else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
53+
theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
54+
}
55+
else if (sector >= sec_w + p.sections[2]) {
56+
theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
57+
}
4658
}
4759

4860
const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;

0 commit comments

Comments
 (0)