
Commit 4c02664

support Qwen2.5-VL; avoid in-place operations.

1 parent: f1cf813

17 files changed, +761 -202 lines

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -31,6 +31,7 @@
 
 **What's New:**
 
+* 2025-09-23: Qwen2.5-VL
 * 2025-09-15: Ling/Ring-mini-2.0
 * 2025-09-08: GroveMoE
 * 2025-09-03: Apertus
```

convert.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -4486,7 +4486,9 @@ def state_dict_pp(cls, config, state_dict):
             if name == 'visual.patch_embed.proj.weight':
                 shape = tensor.shape
                 assert len(shape) == 5
-                r[name] = tensor.view(shape[0], shape[1] * shape[2] * shape[3] * shape[4])
+                assert shape[2] == 2
+                r[name.replace('proj.weight', 'proj.0.weight')] = tensor[:, :, 0, :, :]
+                r[name.replace('proj.weight', 'proj.1.weight')] = tensor[:, :, 1, :, :]
             elif name.endswith('.attn.qkv.bias') or name.endswith('.attn.qkv.weight'):
                 #print(f'shape: {name} = {tensor.shape}')
                 num_heads = config.vision_config['hidden_size']
@@ -4543,7 +4545,8 @@ def get_weight_names(config):
             "visual.merger.mlp.0.weight",
             "visual.merger.mlp.2.bias",
             "visual.merger.mlp.2.weight",
-            "visual.patch_embed.proj.weight",
+            "visual.patch_embed.proj.0.weight",
+            "visual.patch_embed.proj.1.weight",
         ]
 
         return weight_names
```
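Note: the diff above stops flattening the 5-D `visual.patch_embed.proj.weight` (a Conv3d kernel with temporal patch size 2) and instead splits it along the temporal axis into two 4-D projection weights. Below is a minimal sketch of that split, assuming the usual (out_channels, in_channels, 2, patch_h, patch_w) layout; `split_patch_embed` and the example shape are illustrative only, not part of the commit:

```python
import torch

def split_patch_embed(state_dict: dict) -> dict:
    r = {}
    for name, tensor in state_dict.items():
        if name == 'visual.patch_embed.proj.weight':
            # assumed Conv3d kernel layout: (out_channels, in_channels, temporal=2, patch_h, patch_w)
            assert tensor.ndim == 5 and tensor.shape[2] == 2
            # keep one 4-D projection weight per temporal frame instead of flattening everything
            r[name.replace('proj.weight', 'proj.0.weight')] = tensor[:, :, 0, :, :]
            r[name.replace('proj.weight', 'proj.1.weight')] = tensor[:, :, 1, :, :]
        else:
            r[name] = tensor
    return r

# e.g.: split_patch_embed({'visual.patch_embed.proj.weight': torch.randn(1280, 3, 2, 14, 14)})
```

Presumably this lets the runtime apply each temporal frame as an ordinary 2-D projection rather than a fused 5-D kernel, which also explains the two new entries in `get_weight_names`.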

docs/models.md

Lines changed: 3 additions & 1 deletion
```diff
@@ -361,8 +361,10 @@ Please use `--format completion` for these models.
   * `native_resolution`: use native resolution or not, default: `false` (This seems sensitive to quantization, so defaults to `false`).
   * `fps`: Default 1.0.
 
-* Qwen (`Qwen2AudioForConditionalGeneration`)
+* Qwen (`Qwen2AudioForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`)
   * [x] Qwen2-Audio: [7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct/tree/0a095220c30b7b31434169c3086508ef3ea5bf0a)
+  * [x] Qwen2.5-VL: [3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct/tree/66285546d2b821cf421d4f5eb2576359d3770cd3), [7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/tree/cc594898137f460bfe9f0759e9844b3ce807cfb5)
+  * [x] MiMo-VL: [7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL/tree/460c34be0c6cfe79b6b311647ae9112784f80b73), [7B-RL-2508](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL-2508/tree/4bfb270765825d2fa059011deb4c96fdd579be6f)
 
 * SmolVLM2 (`SmolVLMForConditionalGeneration`)
   * [x] [2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct/tree/482adb537c021c86670beed01cd58990d01e72e4)
```

models/llama.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -380,11 +380,11 @@ namespace chatllm::llama::v4
         }
     };
 
-    class LlamaNormedSelfAttention : public QKNormedAttention<L2Norm, BaseAttention>
+    class LlamaNormedSelfAttention : public QKNormedAttention<L2NormInplace, BaseAttention>
    {
    public:
        LlamaNormedSelfAttention(InitContext *ctx, int hidden_size, int num_attention_heads, int num_kv_heads, int head_dim, int max_length)
-            : QKNormedAttention<L2Norm, BaseAttention>(ctx, hidden_size, num_attention_heads, num_kv_heads, head_dim, max_length, false, false)
+            : QKNormedAttention<L2NormInplace, BaseAttention>(ctx, hidden_size, num_attention_heads, num_kv_heads, head_dim, max_length, false, false)
        {
            post_norm = true;
        }
```
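Note: the only change here is the norm functor passed to `QKNormedAttention`, i.e. which L2-normalization variant is applied to the per-head query/key vectors. A rough sketch of the operation itself (plain Python, not the ggml API; the tensor layout and `eps` value are assumptions):

```python
import torch

def l2_norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # scale the last dimension (head_dim) to unit L2 length
    return x / torch.sqrt((x * x).sum(dim=-1, keepdim=True) + eps)

q = torch.randn(2, 8, 16, 64)  # (batch, heads, seq, head_dim) -- assumed layout
print(torch.linalg.vector_norm(l2_norm(q), dim=-1)[0, 0, 0])  # ~= 1.0
```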

models/moonshot.cpp

Lines changed: 5 additions & 3 deletions
```diff
@@ -251,26 +251,28 @@ namespace chatllm::kimi::vit
         void before_forward(ComputeContext *ctx, const int n_past, const int qlen) override
         {
             const int len = grid_h * grid_w;
+            std::vector<int> v_pos_w;
             std::vector<int> v_pos_h;
 
             CHATLLM_CHECK(len <= max_length);
 
             ggml::set_dim(pos, 0, len);
             ggml::set_dim(pos_h, 0, len);
 
+            v_pos_w.resize(len);
             v_pos_h.resize(len);
 
             for (int i = 0; i < grid_h; i++)
             {
                 for (int j = 0; j < grid_w; j++)
                 {
-                    v_pos  [i * grid_w + j] = j;
+                    v_pos_w[i * grid_w + j] = j;
                     v_pos_h[i * grid_w + j] = i;
                 }
             }
 
-            Backend::write_tensor_data(pos, v_pos.data(), 0, len * sizeof(v_pos[0]));
-            Backend::write_tensor_data(pos_h, v_pos_h.data(), 0, len * sizeof(v_pos[0]));
+            Backend::write_tensor_data(pos, v_pos_w.data(), 0, len * sizeof(v_pos_w[0]));
+            Backend::write_tensor_data(pos_h, v_pos_h.data(), 0, len * sizeof(v_pos_h[0]));
         }
 
         ggml::tensor *apply_2d_rope(ComputeContext *ctx, ggml::tensor *hidden, int hidden_size, ggml::tensor *pos_w, ggml::tensor *pos_h) const
```
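Note: this fix fills the column indices into a fresh local `v_pos_w` instead of writing through the previously shared `v_pos` buffer, so the width and height position tables uploaded for the 2-D RoPE stay independent. A small sketch of the grids that loop produces (NumPy, illustrative only; `make_2d_positions` is a hypothetical helper, not code from the repo):

```python
import numpy as np

def make_2d_positions(grid_h: int, grid_w: int):
    # one entry per patch, row-major over the (grid_h x grid_w) image grid
    pos_w = np.empty(grid_h * grid_w, dtype=np.int32)
    pos_h = np.empty(grid_h * grid_w, dtype=np.int32)
    for i in range(grid_h):
        for j in range(grid_w):
            pos_w[i * grid_w + j] = j   # column index -> `pos`
            pos_h[i * grid_w + j] = i   # row index    -> `pos_h`
    return pos_w, pos_h

# make_2d_positions(2, 3) -> ([0, 1, 2, 0, 1, 2], [0, 0, 0, 1, 1, 1])
```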
