Skip to content

Commit fe4e731

Browse files
committed
add qwen2.5 vl support
1 parent f88daa5 commit fe4e731

File tree

6 files changed

+830
-332
lines changed

6 files changed

+830
-332
lines changed

examples/cli/main.cpp

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,17 +1142,7 @@ int main(int argc, const char* argv[]) {
11421142
SDParams params;
11431143
params.verbose = true;
11441144
sd_set_log_callback(sd_log_cb, (void*)&params);
1145-
auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
1146-
return false;
1147-
};
1148-
// auto tokenizer = CLIPTokenizer();
1149-
auto tokenizer = Qwen::Qwen2Tokenizer();
1150-
std::string text("a lovely cat");
1151-
auto tokens = tokenizer.encode(text, on_new_token_cb);
1152-
for (auto token : tokens) {
1153-
std::cout << token << " ";
1154-
}
1155-
std::cout << std::endl;
1145+
Qwen::Qwen2_5_VLEmbedder::load_from_file_and_test(argv[1]);
11561146
exit(1);
11571147
parse_args(argc, argv, params);
11581148
params.sample_params.guidance.slg.layers = params.skip_layers.data();

ggml_extend.hpp

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1119,9 +1119,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx
11191119
return kqv;
11201120
}
11211121

1122-
// q: [N, L_q, C] or [N*n_head, L_q, d_head]
1123-
// k: [N, L_k, C] or [N*n_head, L_k, d_head]
1124-
// v: [N, L_k, C] or [N, L_k, n_head, d_head]
1122+
// q: [N, L_q, C(n_head*d_head)] or [N*n_head, L_q, d_head]
1123+
// k: [N, L_k, n_kv_head*d_head] or [N*n_kv_head, L_k, d_head]
1124+
// v: [N, L_k, n_kv_head*d_head] or [N, L_k, n_kv_head, d_head]
11251125
// mask: [N, L_q, L_k]
11261126
// return: [N, L_q, C]
11271127
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* ctx,
@@ -1139,27 +1139,31 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
11391139
int64_t C;
11401140
int64_t N;
11411141
int64_t d_head;
1142+
int64_t n_kv_head;
11421143
if (!skip_reshape) {
1143-
L_q = q->ne[1];
1144-
L_k = k->ne[1];
1145-
C = q->ne[0];
1146-
N = q->ne[2];
1147-
d_head = C / n_head;
1148-
q = ggml_reshape_4d(ctx, q, d_head, n_head, L_q, N); // [N, L_q, n_head, d_head]
1149-
q = ggml_nn_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, L_q, d_head]
1150-
q = ggml_reshape_3d(ctx, q, d_head, L_q, n_head * N); // [N * n_head, L_q, d_head]
1151-
1152-
k = ggml_reshape_4d(ctx, k, d_head, n_head, L_k, N); // [N, L_k, n_head, d_head]
1153-
k = ggml_nn_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, L_k, d_head]
1154-
k = ggml_reshape_3d(ctx, k, d_head, L_k, n_head * N); // [N * n_head, L_k, d_head]
1155-
1156-
v = ggml_reshape_4d(ctx, v, d_head, n_head, L_k, N); // [N, L_k, n_head, d_head]
1144+
L_q = q->ne[1];
1145+
L_k = k->ne[1];
1146+
C = q->ne[0];
1147+
N = q->ne[2];
1148+
d_head = C / n_head;
1149+
n_kv_head = k->ne[0] / d_head;
1150+
1151+
q = ggml_reshape_4d(ctx, q, d_head, n_head, L_q, N); // [N, L_q, n_head, d_head]
1152+
q = ggml_nn_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, L_q, d_head]
1153+
q = ggml_reshape_3d(ctx, q, d_head, L_q, n_head * N); // [N * n_head, L_q, d_head]
1154+
1155+
k = ggml_reshape_4d(ctx, k, d_head, n_kv_head, L_k, N); // [N, L_k, n_kv_head, d_head]
1156+
k = ggml_nn_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_kv_head, L_k, d_head]
1157+
k = ggml_reshape_3d(ctx, k, d_head, L_k, n_kv_head * N); // [N * n_kv_head, L_k, d_head]
1158+
1159+
v = ggml_reshape_4d(ctx, v, d_head, n_kv_head, L_k, N); // [N, L_k, n_kv_head, d_head]
11571160
} else {
1158-
L_q = q->ne[1];
1159-
L_k = k->ne[1];
1160-
d_head = v->ne[0];
1161-
N = v->ne[3];
1162-
C = d_head * n_head;
1161+
L_q = q->ne[1];
1162+
L_k = k->ne[1];
1163+
d_head = v->ne[0];
1164+
N = v->ne[3];
1165+
n_kv_head = k->ne[2] / N;
1166+
C = d_head * n_head;
11631167
}
11641168

11651169
float scale = (1.0f / sqrt((float)d_head));
@@ -1174,7 +1178,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
11741178
k_in = ggml_cast(ctx, k_in, GGML_TYPE_F16);
11751179

11761180
v_in = ggml_nn_cont(ctx, ggml_permute(ctx, v_in, 0, 2, 1, 3));
1177-
v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_head * N);
1181+
v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_kv_head * N);
11781182
if (kv_pad != 0) {
11791183
v_in = ggml_pad(ctx, v_in, 0, kv_pad, 0, 0);
11801184
}
@@ -1232,8 +1236,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
12321236
// if (flash_attn) {
12331237
// LOG_DEBUG("fallback to default attention, L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
12341238
// }
1235-
v = ggml_nn_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, L_k]
1236-
v = ggml_reshape_3d(ctx, v, L_k, d_head, n_head * N); // [N * n_head, d_head, L_k]
1239+
v = ggml_nn_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_kv_head, d_head, L_k]
1240+
v = ggml_reshape_3d(ctx, v, L_k, d_head, n_kv_head * N); // [N * n_kv_head, d_head, L_k]
12371241

12381242
auto kq = ggml_mul_mat(ctx, k, q); // [N * n_head, L_q, L_k]
12391243
kq = ggml_scale_inplace(ctx, kq, scale);

model.cpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ const char* unused_tensors[] = {
110110
"embedding_manager",
111111
"denoiser.sigmas",
112112
"text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training
113+
"qwen2vl.output.weight",
114+
"qwen2vl.lm_head.",
115+
"qwen2vl.visual.",
113116
};
114117

115118
bool is_unused_tensor(std::string name) {
@@ -193,6 +196,21 @@ std::unordered_map<std::string, std::string> pmid_v2_name_map = {
193196
"pmid.qformer_perceiver.token_proj.fc2.weight"},
194197
};
195198

199+
// Substring rewrite table for tensor names that contain "qwen2vl".
// convert_cond_model_name() searches the name for each key with
// std::string::find and, when the key is present, replaces that single
// occurrence with the mapped value (e.g. "blk." -> "model.layers.").
// NOTE(review): the keys look like llama.cpp/GGUF-style name fragments and the
// values like Hugging Face-style module paths -- confirm against the loader.
// NOTE(review): std::unordered_map has no defined iteration order, so the
// entries must stay order-independent: no key may match inside another key or
// inside any replacement value.
std::unordered_map<std::string, std::string> qwenvl_name_map{
200+
{"token_embd.", "model.embed_tokens."},
201+
{"blk.", "model.layers."},
202+
{"attn_q.", "self_attn.q_proj."},
203+
{"attn_k.", "self_attn.k_proj."},
204+
{"attn_v.", "self_attn.v_proj."},
205+
{"attn_output.", "self_attn.o_proj."},
206+
{"attn_norm.", "input_layernorm."},
207+
{"ffn_down.", "mlp.down_proj."},
208+
{"ffn_gate.", "mlp.gate_proj."},
209+
{"ffn_up.", "mlp.up_proj."},
210+
{"ffn_norm.", "post_attention_layernorm."},
211+
{"output_norm.", "model.norm."},
212+
};
213+
196214
std::string convert_cond_model_name(const std::string& name) {
197215
std::string new_name = name;
198216
std::string prefix;
@@ -250,6 +268,13 @@ std::string convert_cond_model_name(const std::string& name) {
250268
if (pos != std::string::npos) {
251269
new_name.replace(pos, 11, "layer.0.SelfAttention.relative_attention_bias.");
252270
}
271+
} else if (contains(name, "qwen2vl")) {
272+
for (auto kv : qwenvl_name_map) {
273+
size_t pos = new_name.find(kv.first);
274+
if (pos != std::string::npos) {
275+
new_name.replace(pos, kv.first.size(), kv.second);
276+
}
277+
}
253278
} else if (name == "text_encoders.t5xxl.transformer.token_embd.weight") {
254279
new_name = "text_encoders.t5xxl.transformer.shared.weight";
255280
}
@@ -580,7 +605,11 @@ std::string convert_tensor_name(std::string name) {
580605
// name.replace(pos, strlen("lora_B"), "lora_down");
581606
// }
582607
std::string new_name = name;
583-
if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.") || starts_with(name, "text_encoders.") || ends_with(name, ".vision_model.visual_projection.weight")) {
608+
if (starts_with(name, "cond_stage_model.") ||
609+
starts_with(name, "conditioner.embedders.") ||
610+
starts_with(name, "text_encoders.") ||
611+
ends_with(name, ".vision_model.visual_projection.weight") ||
612+
starts_with(name, "qwen2vl")) {
584613
new_name = convert_cond_model_name(name);
585614
} else if (starts_with(name, "first_stage_model.decoder")) {
586615
new_name = convert_vae_decoder_name(name);

0 commit comments

Comments
 (0)