
Commit 2011a4d

add SmolVLM2
1 parent: fdbeedc

File tree

12 files changed: +1041 −161 lines

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2025-06-10: SmolVLM2
 * 2025-06-07: MiniCPM4
 * 2025-06-06: Qwen-3 Embedding & Reranker
 * 2025-06-03: Kimi-VL
```

convert.py

Lines changed: 93 additions & 0 deletions
```diff
@@ -212,6 +212,7 @@ class ModelType(Enum):
 
     Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
     KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100
+    SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200
 
     MiniCPM_O = ModelTypeTagChatImageVideoAudioInAudioOut + 0x0000001
```
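For readers unfamiliar with this enum: each `ModelType` value is a capability tag plus a small per-model offset. A minimal sketch of that composition, using a placeholder tag value (the real `ModelTypeTagChatImageVideoIn` constant is defined elsewhere in convert.py):

```python
from enum import Enum

# Placeholder value for illustration only; the real constant lives in convert.py.
ModelTypeTagChatImageVideoIn = 0x10000000

class ModelType(Enum):
    Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
    KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100
    SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200

# All three share the chat + image/video-input capability tag and differ only
# in the low offset bits, so SmolVLM slots in after KimiVL (0x0000100).
assert ModelType.SmolVLM.value - ModelTypeTagChatImageVideoIn == 0x0000200
```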

```diff
@@ -1836,6 +1837,96 @@ def get_weight_names(config):
         r = Llama3Converter.get_weight_names(config)
         return r[:-1]
 
+class SmolVLMConverter(BaseConverter):
+    MODEL_TYPE = ModelType.SmolVLM
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = {}
+        for name in state_dict:
+            tensor: torch.Tensor = state_dict[name]
+
+            if name.startswith('model.text_model.'):
+                name = name.replace('model.text_model.', 'model.')
+                r[name] = SmolLMConverter.pp(SmolVLMConverter.txt_config, name, tensor)
+            elif name.startswith('model.vision_model'):
+                name = name.replace('model.vision_model.', 'vision_model.')
+
+                if 'mlp.fc1.' in name:
+                    name = name.replace('.fc1.', '.fc0.')
+                elif 'mlp.fc2.' in name:
+                    name = name.replace('.fc2.', '.fc1.')
+                elif '.out_proj.' in name:
+                    name = name.replace('.out_proj.', '.o_proj.')
+                elif name.startswith('vision_model.post_layernorm'):
+                    name = name.replace('.post_layernorm.', '.final_layernorm.')
+
+                r[name] = tensor
+            elif name.startswith('vision_tower.'):
+                r[name.replace('vision_tower.', 'vision_model.')] = tensor
+            elif name == 'model.connector.modality_projection.proj.weight':
+                r["multi_modal_projector.proj.weight"] = tensor
+            else:
+                r[name] = tensor
+
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        SmolVLMConverter.txt_config = AttributeDict(config.text_config)
+        if SmolVLMConverter.txt_config.bos_token_id is None:
+            SmolVLMConverter.txt_config.bos_token_id = 128_000
+        if SmolVLMConverter.txt_config.eos_token_id is None:
+            SmolVLMConverter.txt_config.eos_token_id = 128_001
+        if SmolVLMConverter.txt_config.num_attention_heads is None:
+            SmolVLMConverter.txt_config.num_attention_heads = 32
+        if SmolVLMConverter.txt_config.hidden_act is None:
+            SmolVLMConverter.txt_config.hidden_act = 'silu'
+        if SmolVLMConverter.txt_config.num_key_value_heads is None:
+            SmolVLMConverter.txt_config.num_key_value_heads = SmolVLMConverter.txt_config.num_attention_heads
+        if SmolVLMConverter.txt_config.tie_word_embeddings is None:
+            SmolVLMConverter.txt_config.tie_word_embeddings = False
+
+        assert not SmolVLMConverter.txt_config.tie_word_embeddings
+        assert not SmolVLMConverter.txt_config.qk_layer_norms
+        assert not SmolVLMConverter.txt_config.use_resampler
+        SmolLMConverter.dump_config(f, SmolVLMConverter.txt_config, ggml_type)
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = Llama3Converter.get_weight_names(SmolVLMConverter.txt_config)
+
+        for i in range(config.vision_config['num_hidden_layers']):
+            weight_names += [
+                f"vision_model.encoder.layers.{i}.self_attn.q_proj.bias",
+                f"vision_model.encoder.layers.{i}.self_attn.q_proj.weight",
+                f"vision_model.encoder.layers.{i}.self_attn.k_proj.bias",
+                f"vision_model.encoder.layers.{i}.self_attn.k_proj.weight",
+                f"vision_model.encoder.layers.{i}.self_attn.v_proj.bias",
+                f"vision_model.encoder.layers.{i}.self_attn.v_proj.weight",
+                f"vision_model.encoder.layers.{i}.self_attn.o_proj.bias",
+                f"vision_model.encoder.layers.{i}.self_attn.o_proj.weight",
+                f"vision_model.encoder.layers.{i}.mlp.fc0.bias",
+                f"vision_model.encoder.layers.{i}.mlp.fc0.weight",
+                f"vision_model.encoder.layers.{i}.mlp.fc1.bias",
+                f"vision_model.encoder.layers.{i}.mlp.fc1.weight",
+                f"vision_model.encoder.layers.{i}.layer_norm1.bias",
+                f"vision_model.encoder.layers.{i}.layer_norm1.weight",
+                f"vision_model.encoder.layers.{i}.layer_norm2.bias",
+                f"vision_model.encoder.layers.{i}.layer_norm2.weight",
+            ]
+
+        weight_names += [
+            "multi_modal_projector.proj.weight",
+            "vision_model.final_layernorm.bias",
+            "vision_model.final_layernorm.weight",
+            "vision_model.embeddings.position_embedding.weight",
+            "vision_model.embeddings.patch_embedding.bias",
+            "vision_model.embeddings.patch_embedding.weight",
+        ]
+
+        return weight_names
+
 class LlamaMultiConverter(BaseConverter):
     MODEL_TYPE = ModelType.LlaMAMulti
```
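The renaming rules in `state_dict_pp` can be checked in isolation. Below is a self-contained sketch (not part of the commit) that reproduces the vision-side mappings on sample names taken from the diff above; the text-model branch is omitted because it defers to `SmolLMConverter.pp`:

```python
def map_name(name: str) -> str:
    # Vision tower: strip the 'model.' prefix, then normalize layer names to
    # the loader's conventions (fc1/fc2 -> fc0/fc1, out_proj -> o_proj,
    # post_layernorm -> final_layernorm). The elif chain means at most one
    # substitution is applied per name.
    if name.startswith('model.vision_model'):
        name = name.replace('model.vision_model.', 'vision_model.')
        if 'mlp.fc1.' in name:
            name = name.replace('.fc1.', '.fc0.')
        elif 'mlp.fc2.' in name:
            name = name.replace('.fc2.', '.fc1.')
        elif '.out_proj.' in name:
            name = name.replace('.out_proj.', '.o_proj.')
        elif name.startswith('vision_model.post_layernorm'):
            name = name.replace('.post_layernorm.', '.final_layernorm.')
    elif name.startswith('vision_tower.'):
        name = name.replace('vision_tower.', 'vision_model.')
    elif name == 'model.connector.modality_projection.proj.weight':
        name = 'multi_modal_projector.proj.weight'
    return name

assert map_name('model.vision_model.encoder.layers.0.mlp.fc1.weight') \
    == 'vision_model.encoder.layers.0.mlp.fc0.weight'
assert map_name('model.vision_model.encoder.layers.0.self_attn.out_proj.bias') \
    == 'vision_model.encoder.layers.0.self_attn.o_proj.bias'
assert map_name('model.connector.modality_projection.proj.weight') \
    == 'multi_modal_projector.proj.weight'
```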

```diff
@@ -6965,6 +7056,8 @@ def main():
         Llama3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'smollm':
         SmolLMConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'SmolVLMForConditionalGeneration':
+        SmolVLMConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'XverseForCausalLM':
         if config.num_experts is None:
             LlamaConverter.MODEL_TYPE = ModelType.XVERSE
```
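Note on dispatch: assuming the surrounding code follows the standard Hugging Face checkpoint layout, the `arch` string compared here is the first entry of the `architectures` list in the model's config.json, which is why this branch matches the exact Python class name. A minimal sketch of how that string is obtained:

```python
import json

# Read the architecture name from a checkpoint directory's config.json
# (standard Hugging Face layout; the path below is illustrative).
with open('SmolVLM2-2.2B-Instruct/config.json') as f:
    arch = json.load(f)['architectures'][0]

print(arch)  # -> 'SmolVLMForConditionalGeneration' for SmolVLM2 checkpoints
```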

docs/models.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -305,6 +305,9 @@ Please use `--format completion` for these models.
 * Kimi (`KimiVLForConditionalGeneration`)
     * [x] VL: [A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/tree/7a3c132a7b0f1f1677f5a72f258bd3afded7d357), [A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking/commit/16681d8ac24e505088698e4e34ea494dd6e24400)
 
+* SmolVLM2 (`SmolVLMForConditionalGeneration`)
+    * [x] [2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct/tree/482adb537c021c86670beed01cd58990d01e72e4)
+
 ## RAG Models
 
 * Text Embedding (`XLMRobertaModel`)
```

models/kimi.cpp

Lines changed: 2 additions & 69 deletions
```diff
@@ -221,65 +221,6 @@ namespace vit
         Linear linear_2;
     };
 
-    struct merge_patch_param
-    {
-        int grid_h;
-        int grid_w;
-        int merge_kernel_size[2];
-    };
-
-    static void ggml_custom_merge_patch(struct ggml_tensor * dst, const struct ggml_tensor * src, int ith, int nth, const merge_patch_param * param)
-    {
-        const int kernel_height = param->merge_kernel_size[0];
-        const int kernel_width = param->merge_kernel_size[1];
-        const int new_height = param->grid_h / kernel_height;
-        const int new_width = param->grid_w / kernel_width;
-
-        CHATLLM_CHECK(ggml::get_dim(src, 1) == (int64_t)param->grid_h * param->grid_w);
-
-        const int64_t nr = ggml::nrows(dst);
-        const int64_t dr = (nr + nth - 1)/nth;
-        const int64_t ir0 = dr*ith;
-        const int64_t ir1 = MIN(ir0 + dr, nr);
-
-        const int64_t nb1 = src->nb[1];
-        const int64_t nb2 = nb1 * kernel_width;
-        const int64_t nb3 = nb2 * kernel_height;
-        const int64_t nb4 = nb3 * new_width;
-
-        for (int64_t i4 = 0; i4 < new_height; i4++)
-        {
-            for (int64_t i3 = 0; i3 < new_width; i3++)
-            {
-                for (int64_t i2 = 0; i2 < kernel_height; i2++)
-                {
-                    for (int64_t i1 = 0; i1 < kernel_width; i1++)
-                    {
-                        const int64_t ir = (i2 + i4 * kernel_height) * param->grid_w + (i1 + i3 * kernel_width);
-                        if (ir < ir0) continue;
-                        if (ir > ir1) break;
-
-                        const void *src_data = (void *)((char *) src->data + ir*nb1);
-                        void *dst_data = (void *)((char *) dst->data + i4*nb4 + i3*nb3 + i2*nb2 + i1*nb1);
-                        memcpy(dst_data, src_data, nb1);
-                    }
-                }
-            }
-        }
-    }
-
-    static void ggml_custom_merge_patch(struct ggml_tensor * dst, int ith, int nth, void * userdata)
-    {
-        const merge_patch_param *param = (const merge_patch_param *)userdata;
-
-        const struct ggml_tensor * a = dst->src[0];
-        CHATLLM_CHECK(ggml::is_contiguous(a));
-        CHATLLM_CHECK(ggml::get_dim(a, 3) == 1);
-        CHATLLM_CHECK(ggml::get_dim(a, 2) == 1);
-
-        ggml_custom_merge_patch(dst, a, ith, nth, param);
-    }
-
     class VisionTransformer : public Block
     {
     public:
```
```diff
@@ -333,15 +274,7 @@
             merge_param.grid_h = grid_h;
             merge_param.grid_w = grid_w;
 
-            const int64_t kernel_height = merge_param.merge_kernel_size[0];
-            const int64_t kernel_width = merge_param.merge_kernel_size[1];
-            const int64_t new_height = grid_h / kernel_height;
-            const int64_t new_width = grid_w / kernel_width;
-
-            std::vector<ggml::tensor *> params;
-            params.push_back(x);
-            auto reshaped_seq = ggml::custom(ctx, ggml_custom_merge_patch, GGML_N_TASKS_MAX, &merge_param, params, ggml::type_of(x),
-                ggml::get_dim(x, 0), kernel_height * kernel_width * new_height * new_width * ggml::get_dim(x, 2), 1, 1);
+            auto reshaped_seq = ggml::merge_patch(ctx, x, &merge_param);
             return reshaped_seq;
         }
```

```diff
@@ -372,7 +305,7 @@
         MultiModalProjector multi_modal_projector;
     protected:
         bool loaded;
-        merge_patch_param merge_param;
+        ggml::merge_patch_param merge_param;
     };
 
     class VisualEmbeddingGeneration
```
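The deleted `ggml_custom_merge_patch` loop has been promoted to a shared `ggml::merge_patch` op, so Kimi-VL and SmolVLM2 can reuse it. For reference, a NumPy sketch of the regrouping the removed loop performed (inferred from the deleted code; an illustration, not the library implementation):

```python
import numpy as np

def merge_patch(x: np.ndarray, grid_h: int, grid_w: int,
                kernel_h: int, kernel_w: int) -> np.ndarray:
    """Regroup a (grid_h * grid_w, dim) patch sequence so that each
    kernel_h x kernel_w block of neighboring patches becomes contiguous,
    mirroring the row copies in the removed loop."""
    dim = x.shape[-1]
    # (new_h, kernel_h, new_w, kernel_w, dim) view of the patch grid ...
    x = x.reshape(grid_h // kernel_h, kernel_h, grid_w // kernel_w, kernel_w, dim)
    # ... reordered so each kernel block is contiguous, then flattened to rows.
    return x.transpose(0, 2, 1, 3, 4).reshape(-1, dim)

x = np.arange(16 * 4).reshape(16, 4)   # 4x4 grid of 4-dim patch embeddings
y = merge_patch(x, grid_h=4, grid_w=4, kernel_h=2, kernel_w=2)
assert y.shape == (16, 4)
# First merged block is the top-left 2x2 group of patches (rows 0, 1, 4, 5).
assert (y[:4] == x[[0, 1, 4, 5]]).all()
```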
