leejet
diff --git a/‎examples/cli/main.cpp‎
Lines changed: 1 addition & 11 deletions b/‎examples/cli/main.cpp‎
Lines changed: 1 addition & 11 deletions
diff --git a/‎ggml_extend.hpp‎
Lines changed: 29 additions & 25 deletions b/‎ggml_extend.hpp‎
Lines changed: 29 additions & 25 deletions
diff --git a/‎model.cpp‎
Lines changed: 30 additions & 1 deletion b/‎model.cpp‎
Lines changed: 30 additions & 1 deletion
@@ -1142,17 +1142,7 @@ int main(int argc, const char* argv[]) {
     SDParams params;
     params.verbose = true;
     sd_set_log_callback(sd_log_cb, (void*)&params);
-    auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-        return false;
-    };
-    // auto tokenizer = CLIPTokenizer();
-    auto tokenizer = Qwen::Qwen2Tokenizer();
-    std::string text("a lovely cat");
-    auto tokens = tokenizer.encode(text, on_new_token_cb);
-    for (auto token : tokens) {
-        std::cout << token << " ";
-    }
-    std::cout << std::endl;
+    Qwen::Qwen2_5_VLEmbedder::load_from_file_and_test(argv[1]);
     exit(1);
     parse_args(argc, argv, params);
     params.sample_params.guidance.slg.layers                 = params.skip_layers.data();
 
@@ -1119,9 +1119,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx
     return kqv;
 }
 
-// q: [N, L_q, C] or [N*n_head, L_q, d_head]
-// k: [N, L_k, C] or [N*n_head, L_k, d_head]
-// v: [N, L_k, C] or [N, L_k, n_head, d_head]
+// q: [N, L_q, C(n_head*d_head)] or [N*n_head, L_q, d_head]
+// k: [N, L_k, n_kv_head*d_head] or [N*n_kv_head, L_k, d_head]
+// v: [N, L_k, n_kv_head*d_head] or [N, L_k, n_kv_head, d_head]
 // mask: [N, L_q, L_k]
 // return: [N, L_q, C]
 __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* ctx,
@@ -1139,27 +1139,31 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
     int64_t C;
     int64_t N;
     int64_t d_head;
+    int64_t n_kv_head;
     if (!skip_reshape) {
-        L_q    = q->ne[1];
-        L_k    = k->ne[1];
-        C      = q->ne[0];
-        N      = q->ne[2];
-        d_head = C / n_head;
-        q      = ggml_reshape_4d(ctx, q, d_head, n_head, L_q, N);      // [N, L_q, n_head, d_head]
-        q      = ggml_nn_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));  // [N, n_head, L_q, d_head]
-        q      = ggml_reshape_3d(ctx, q, d_head, L_q, n_head * N);     // [N * n_head, L_q, d_head]
-
-        k = ggml_reshape_4d(ctx, k, d_head, n_head, L_k, N);      // [N, L_k, n_head, d_head]
-        k = ggml_nn_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));  // [N, n_head, L_k, d_head]
-        k = ggml_reshape_3d(ctx, k, d_head, L_k, n_head * N);     // [N * n_head, L_k, d_head]
-
-        v = ggml_reshape_4d(ctx, v, d_head, n_head, L_k, N);  // [N, L_k, n_head, d_head]
+        L_q       = q->ne[1];
+        L_k       = k->ne[1];
+        C         = q->ne[0];
+        N         = q->ne[2];
+        d_head    = C / n_head;
+        n_kv_head = k->ne[0] / d_head;
+
+        q = ggml_reshape_4d(ctx, q, d_head, n_head, L_q, N);      // [N, L_q, n_head, d_head]
+        q = ggml_nn_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));  // [N, n_head, L_q, d_head]
+        q = ggml_reshape_3d(ctx, q, d_head, L_q, n_head * N);     // [N * n_head, L_q, d_head]
+
+        k = ggml_reshape_4d(ctx, k, d_head, n_kv_head, L_k, N);   // [N, L_k, n_kv_head, d_head]
+        k = ggml_nn_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));  // [N, n_kv_head, L_k, d_head]
+        k = ggml_reshape_3d(ctx, k, d_head, L_k, n_kv_head * N);  // [N * n_kv_head, L_k, d_head]
+
+        v = ggml_reshape_4d(ctx, v, d_head, n_kv_head, L_k, N);  // [N, L_k, n_kv_head, d_head]
     } else {
-        L_q    = q->ne[1];
-        L_k    = k->ne[1];
-        d_head = v->ne[0];
-        N      = v->ne[3];
-        C      = d_head * n_head;
+        L_q       = q->ne[1];
+        L_k       = k->ne[1];
+        d_head    = v->ne[0];
+        N         = v->ne[3];
+        n_kv_head = k->ne[2] / N;
+        C         = d_head * n_head;
     }
 
     float scale = (1.0f / sqrt((float)d_head));
@@ -1174,7 +1178,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
         k_in = ggml_cast(ctx, k_in, GGML_TYPE_F16);
 
         v_in = ggml_nn_cont(ctx, ggml_permute(ctx, v_in, 0, 2, 1, 3));
-        v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_head * N);
+        v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_kv_head * N);
         if (kv_pad != 0) {
             v_in = ggml_pad(ctx, v_in, 0, kv_pad, 0, 0);
         }
@@ -1232,8 +1236,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
         // if (flash_attn) {
         //     LOG_DEBUG("fallback to default attention, L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
         // }
-        v = ggml_nn_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, n_head, d_head, L_k]
-        v = ggml_reshape_3d(ctx, v, L_k, d_head, n_head * N);     // [N * n_head, d_head, L_k]
+        v = ggml_nn_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, n_kv_head, d_head, L_k]
+        v = ggml_reshape_3d(ctx, v, L_k, d_head, n_kv_head * N);  // [N * n_kv_head, d_head, L_k]
 
         auto kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, L_q, L_k]
         kq      = ggml_scale_inplace(ctx, kq, scale);
 
@@ -110,6 +110,9 @@ const char* unused_tensors[] = {
     "embedding_manager",
     "denoiser.sigmas",
     "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
+    "qwen2vl.output.weight",
+    "qwen2vl.lm_head.",
+    "qwen2vl.visual.",
 };
 
 bool is_unused_tensor(std::string name) {
@@ -193,6 +196,21 @@ std::unordered_map<std::string, std::string> pmid_v2_name_map = {
      "pmid.qformer_perceiver.token_proj.fc2.weight"},
 };
 
+std::unordered_map<std::string, std::string> qwenvl_name_map{
+    {"token_embd.", "model.embed_tokens."},
+    {"blk.", "model.layers."},
+    {"attn_q.", "self_attn.q_proj."},
+    {"attn_k.", "self_attn.k_proj."},
+    {"attn_v.", "self_attn.v_proj."},
+    {"attn_output.", "self_attn.o_proj."},
+    {"attn_norm.", "input_layernorm."},
+    {"ffn_down.", "mlp.down_proj."},
+    {"ffn_gate.", "mlp.gate_proj."},
+    {"ffn_up.", "mlp.up_proj."},
+    {"ffn_norm.", "post_attention_layernorm."},
+    {"output_norm.", "model.norm."},
+};
+
 std::string convert_cond_model_name(const std::string& name) {
     std::string new_name = name;
     std::string prefix;
@@ -250,6 +268,13 @@ std::string convert_cond_model_name(const std::string& name) {
         if (pos != std::string::npos) {
             new_name.replace(pos, 11, "layer.0.SelfAttention.relative_attention_bias.");
         }
+    } else if (contains(name, "qwen2vl")) {
+        for (auto kv : qwenvl_name_map) {
+            size_t pos = new_name.find(kv.first);
+            if (pos != std::string::npos) {
+                new_name.replace(pos, kv.first.size(), kv.second);
+            }
+        }
     } else if (name == "text_encoders.t5xxl.transformer.token_embd.weight") {
         new_name = "text_encoders.t5xxl.transformer.shared.weight";
     }
@@ -580,7 +605,11 @@ std::string convert_tensor_name(std::string name) {
     //     name.replace(pos, strlen("lora_B"), "lora_down");
     // }
     std::string new_name = name;
-    if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.") || starts_with(name, "text_encoders.") || ends_with(name, ".vision_model.visual_projection.weight")) {
+    if (starts_with(name, "cond_stage_model.") ||
+        starts_with(name, "conditioner.embedders.") ||
+        starts_with(name, "text_encoders.") ||
+        ends_with(name, ".vision_model.visual_projection.weight") ||
+        starts_with(name, "qwen2vl")) {
         new_name = convert_cond_model_name(name);
     } else if (starts_with(name, "first_stage_model.decoder")) {
         new_name = convert_vae_decoder_name(name);