
Commit 2011a4d

add SmolVLM2
1 parent: fdbeedc

File tree

12 files changed: +1041 −161 lines

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
 
 **What's New:**
 
+* 2025-06-10: SmolVLM2
 * 2025-06-07: MiniCPM4
 * 2025-06-06: Qwen-3 Embedding & Reranker
 * 2025-06-03: Kimi-VL
```

convert.py

Lines changed: 93 additions & 0 deletions
```diff
@@ -212,6 +212,7 @@ class ModelType(Enum):
 
     Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
     KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100
+    SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200
 
     MiniCPM_O = ModelTypeTagChatImageVideoAudioInAudioOut + 0x0000001
```
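For readers unfamiliar with this enum: each `ModelType` value is a capability tag plus a small per-model offset. A minimal sketch of that composition, using a placeholder tag value (the real `ModelTypeTagChatImageVideoIn` constant is defined elsewhere in convert.py):

```python
from enum import Enum

# Placeholder value for illustration only; the real constant lives in convert.py.
ModelTypeTagChatImageVideoIn = 0x10000000

class ModelType(Enum):
    Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
    KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100
    SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200

# All three share the chat + image/video-input capability tag and differ only
# in the low offset bits, so SmolVLM slots in after KimiVL (0x0000100).
assert ModelType.SmolVLM.value - ModelTypeTagChatImageVideoIn == 0x0000200
```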

```diff
@@ -1836,6 +1837,96 @@ def get_weight_names(config):
         r = Llama3Converter.get_weight_names(config)
         return r[:-1]
 
+class SmolVLMConverter(BaseConverter):
+    MODEL_TYPE = ModelType.SmolVLM
+
+    @classmethod
+    def state_dict_pp(cls, config, state_dict):
+        r = {}
+        for name in state_dict:
+            tensor: torch.Tensor = state_dict[name]
+
+            if name.startswith('model.text_model.'):
+                name = name.replace('model.text_model.', 'model.')
+                r[name] = SmolLMConverter.pp(SmolVLMConverter.txt_config, name, tensor)
+            elif name.startswith('model.vision_model'):
+                name = name.replace('model.vision_model.', 'vision_model.')
+
+                if 'mlp.fc1.' in name:
+                    name = name.replace('.fc1.', '.fc0.')
+                elif 'mlp.fc2.' in name:
+                    name = name.replace('.fc2.', '.fc1.')
+                elif '.out_proj.' in name:
+                    name = name.replace('.out_proj.', '.o_proj.')
+                elif name.startswith('vision_model.post_layernorm'):
+                    name = name.replace('.post_layernorm.', '.final_layernorm.')
+
+                r[name] = tensor
+            elif name.startswith('vision_tower.'):
+                r[name.replace('vision_tower.', 'vision_model.')] = tensor
+            elif name == 'model.connector.modality_projection.proj.weight':
+                r["multi_modal_projector.proj.weight"] = tensor
+            else:
+                r[name] = tensor
+
+        return r
+
+    @staticmethod
+    def dump_config(f, config, ggml_type):
+        SmolVLMConverter.txt_config = AttributeDict(config.text_config)
+        if SmolVLMConverter.txt_config.bos_token_id is None:
+            SmolVLMConverter.txt_config.bos_token_id = 128_000
+        if SmolVLMConverter.txt_config.eos_token_id is None:
+            SmolVLMConverter.txt_config.eos_token_id = 128_001
+        if SmolVLMConverter.txt_config.num_attention_heads is None:
+            SmolVLMConverter.txt_config.num_attention_heads = 32
+        if SmolVLMConverter.txt_config.hidden_act is None:
+            SmolVLMConverter.txt_config.hidden_act = 'silu'
+        if SmolVLMConverter.txt_config.num_key_value_heads is None:
+            SmolVLMConverter.txt_config.num_key_value_heads = SmolVLMConverter.txt_config.num_attention_heads
+        if SmolVLMConverter.txt_config.tie_word_embeddings is None:
+            SmolVLMConverter.txt_config.tie_word_embeddings = False
+
+        assert not SmolVLMConverter.txt_config.tie_word_embeddings
+        assert not SmolVLMConverter.txt_config.qk_layer_norms
+        assert not SmolVLMConverter.txt_config.use_resampler
+        SmolLMConverter.dump_config(f, SmolVLMConverter.txt_config, ggml_type)
+
+    @staticmethod
+    def get_weight_names(config):
+        weight_names = Llama3Converter.get_weight_names(SmolVLMConverter.txt_config)
+
+        for i in range(config.vision_config['num_hidden_layers']):
+            weight_names += [
+                f"vision_model.encoder.layers.{i}.self_attn.q_proj.bias",
+                f"vision_model.encoder.layers.{i}.self_attn.q_proj.weight",
+                f"vision_model.encoder.layers.{i}.self_attn.k_proj.bias",
+                f"vision_model.encoder.layers.{i}.self_attn.k_proj.weight",
+                f"vision_model.encoder.layers.{i}.self_attn.v_proj.bias",
+                f"vision_model.encoder.layers.{i}.self_attn.v_proj.weight",
+                f"vision_model.encoder.layers.{i}.self_attn.o_proj.bias",
+                f"vision_model.encoder.layers.{i}.self_attn.o_proj.weight",
+                f"vision_model.encoder.layers.{i}.mlp.fc0.bias",
+                f"vision_model.encoder.layers.{i}.mlp.fc0.weight",
+                f"vision_model.encoder.layers.{i}.mlp.fc1.bias",
+                f"vision_model.encoder.layers.{i}.mlp.fc1.weight",
+                f"vision_model.encoder.layers.{i}.layer_norm1.bias",
+                f"vision_model.encoder.layers.{i}.layer_norm1.weight",
+                f"vision_model.encoder.layers.{i}.layer_norm2.bias",
+                f"vision_model.encoder.layers.{i}.layer_norm2.weight",
+            ]
+
+        weight_names += [
+            "multi_modal_projector.proj.weight",
+            "vision_model.final_layernorm.bias",
+            "vision_model.final_layernorm.weight",
+            "vision_model.embeddings.position_embedding.weight",
+            "vision_model.embeddings.patch_embedding.bias",
+            "vision_model.embeddings.patch_embedding.weight",
+        ]
+
+        return weight_names
+
 class LlamaMultiConverter(BaseConverter):
     MODEL_TYPE = ModelType.LlaMAMulti
```
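The renaming rules in `state_dict_pp` can be checked in isolation. Below is a self-contained sketch (not part of the commit) that reproduces the vision-side mappings on sample names taken from the diff above; the text-model branch is omitted because it defers to `SmolLMConverter.pp`:

```python
def map_name(name: str) -> str:
    # Vision tower: strip the 'model.' prefix, then normalize layer names to
    # the loader's conventions (fc1/fc2 -> fc0/fc1, out_proj -> o_proj,
    # post_layernorm -> final_layernorm). The elif chain means at most one
    # substitution is applied per name.
    if name.startswith('model.vision_model'):
        name = name.replace('model.vision_model.', 'vision_model.')
        if 'mlp.fc1.' in name:
            name = name.replace('.fc1.', '.fc0.')
        elif 'mlp.fc2.' in name:
            name = name.replace('.fc2.', '.fc1.')
        elif '.out_proj.' in name:
            name = name.replace('.out_proj.', '.o_proj.')
        elif name.startswith('vision_model.post_layernorm'):
            name = name.replace('.post_layernorm.', '.final_layernorm.')
    elif name.startswith('vision_tower.'):
        name = name.replace('vision_tower.', 'vision_model.')
    elif name == 'model.connector.modality_projection.proj.weight':
        name = 'multi_modal_projector.proj.weight'
    return name

assert map_name('model.vision_model.encoder.layers.0.mlp.fc1.weight') \
    == 'vision_model.encoder.layers.0.mlp.fc0.weight'
assert map_name('model.vision_model.encoder.layers.0.self_attn.out_proj.bias') \
    == 'vision_model.encoder.layers.0.self_attn.o_proj.bias'
assert map_name('model.connector.modality_projection.proj.weight') \
    == 'multi_modal_projector.proj.weight'
```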

```diff
@@ -6965,6 +7056,8 @@ def main():
         Llama3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'smollm':
         SmolLMConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
+    elif arch == 'SmolVLMForConditionalGeneration':
+        SmolVLMConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
     elif arch == 'XverseForCausalLM':
         if config.num_experts is None:
             LlamaConverter.MODEL_TYPE = ModelType.XVERSE
```
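Note on dispatch: assuming the surrounding code follows the standard Hugging Face checkpoint layout, the `arch` string compared here is the first entry of the `architectures` list in the model's config.json, which is why this branch matches the exact Python class name. A minimal sketch of how that string is obtained:

```python
import json

# Read the architecture name from a checkpoint directory's config.json
# (standard Hugging Face layout; the path below is illustrative).
with open('SmolVLM2-2.2B-Instruct/config.json') as f:
    arch = json.load(f)['architectures'][0]

print(arch)  # -> 'SmolVLMForConditionalGeneration' for SmolVLM2 checkpoints
```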

docs/models.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -305,6 +305,9 @@ Please use `--format completion` for these models.
 * Kimi (`KimiVLForConditionalGeneration`)
     * [x] VL: [A3B-Instruct](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/tree/7a3c132a7b0f1f1677f5a72f258bd3afded7d357), [A3B-Thinking](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking/commit/16681d8ac24e505088698e4e34ea494dd6e24400)
 
+* SmolVLM2 (`SmolVLMForConditionalGeneration`)
+    * [x] [2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct/tree/482adb537c021c86670beed01cd58990d01e72e4)
+
 ## RAG Models
 
 * Text Embedding (`XLMRobertaModel`)
```

models/kimi.cpp

Lines changed: 2 additions & 69 deletions
```diff
@@ -221,65 +221,6 @@ namespace vit
         Linear linear_2;
     };
 
-    struct merge_patch_param
-    {
-        int grid_h;
-        int grid_w;
-        int merge_kernel_size[2];
-    };
-
-    static void ggml_custom_merge_patch(struct ggml_tensor * dst, const struct ggml_tensor * src, int ith, int nth, const merge_patch_param * param)
-    {
-        const int kernel_height = param->merge_kernel_size[0];
-        const int kernel_width = param->merge_kernel_size[1];
-        const int new_height = param->grid_h / kernel_height;
-        const int new_width = param->grid_w / kernel_width;
-
-        CHATLLM_CHECK(ggml::get_dim(src, 1) == (int64_t)param->grid_h * param->grid_w);
-
-        const int64_t nr = ggml::nrows(dst);
-        const int64_t dr = (nr + nth - 1)/nth;
-        const int64_t ir0 = dr*ith;
-        const int64_t ir1 = MIN(ir0 + dr, nr);
-
-        const int64_t nb1 = src->nb[1];
-        const int64_t nb2 = nb1 * kernel_width;
-        const int64_t nb3 = nb2 * kernel_height;
-        const int64_t nb4 = nb3 * new_width;
-
-        for (int64_t i4 = 0; i4 < new_height; i4++)
-        {
-            for (int64_t i3 = 0; i3 < new_width; i3++)
-            {
-                for (int64_t i2 = 0; i2 < kernel_height; i2++)
-                {
-                    for (int64_t i1 = 0; i1 < kernel_width; i1++)
-                    {
-                        const int64_t ir = (i2 + i4 * kernel_height) * param->grid_w + (i1 + i3 * kernel_width);
-                        if (ir < ir0) continue;
-                        if (ir > ir1) break;
-
-                        const void *src_data = (void *)((char *) src->data + ir*nb1);
-                        void *dst_data = (void *)((char *) dst->data + i4*nb4 + i3*nb3 + i2*nb2 + i1*nb1);
-                        memcpy(dst_data, src_data, nb1);
-                    }
-                }
-            }
-        }
-    }
-
-    static void ggml_custom_merge_patch(struct ggml_tensor * dst, int ith, int nth, void * userdata)
-    {
-        const merge_patch_param *param = (const merge_patch_param *)userdata;
-
-        const struct ggml_tensor * a = dst->src[0];
-        CHATLLM_CHECK(ggml::is_contiguous(a));
-        CHATLLM_CHECK(ggml::get_dim(a, 3) == 1);
-        CHATLLM_CHECK(ggml::get_dim(a, 2) == 1);
-
-        ggml_custom_merge_patch(dst, a, ith, nth, param);
-    }
-
     class VisionTransformer : public Block
     {
     public:
```
```diff
@@ -333,15 +274,7 @@
             merge_param.grid_h = grid_h;
             merge_param.grid_w = grid_w;
 
-            const int64_t kernel_height = merge_param.merge_kernel_size[0];
-            const int64_t kernel_width = merge_param.merge_kernel_size[1];
-            const int64_t new_height = grid_h / kernel_height;
-            const int64_t new_width = grid_w / kernel_width;
-
-            std::vector<ggml::tensor *> params;
-            params.push_back(x);
-            auto reshaped_seq = ggml::custom(ctx, ggml_custom_merge_patch, GGML_N_TASKS_MAX, &merge_param, params, ggml::type_of(x),
-                ggml::get_dim(x, 0), kernel_height * kernel_width * new_height * new_width * ggml::get_dim(x, 2), 1, 1);
+            auto reshaped_seq = ggml::merge_patch(ctx, x, &merge_param);
             return reshaped_seq;
         }
```

```diff
@@ -372,7 +305,7 @@
         MultiModalProjector multi_modal_projector;
     protected:
         bool loaded;
-        merge_patch_param merge_param;
+        ggml::merge_patch_param merge_param;
     };
 
     class VisualEmbeddingGeneration
```
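The deleted `ggml_custom_merge_patch` loop has been promoted to a shared `ggml::merge_patch` op, so Kimi-VL and SmolVLM2 can reuse it. For reference, a NumPy sketch of the regrouping the removed loop performed (inferred from the deleted code; an illustration, not the library implementation):

```python
import numpy as np

def merge_patch(x: np.ndarray, grid_h: int, grid_w: int,
                kernel_h: int, kernel_w: int) -> np.ndarray:
    """Regroup a (grid_h * grid_w, dim) patch sequence so that each
    kernel_h x kernel_w block of neighboring patches becomes contiguous,
    mirroring the row copies in the removed loop."""
    dim = x.shape[-1]
    # (new_h, kernel_h, new_w, kernel_w, dim) view of the patch grid ...
    x = x.reshape(grid_h // kernel_h, kernel_h, grid_w // kernel_w, kernel_w, dim)
    # ... reordered so each kernel block is contiguous, then flattened to rows.
    return x.transpose(0, 2, 1, 3, 4).reshape(-1, dim)

x = np.arange(16 * 4).reshape(16, 4)   # 4x4 grid of 4-dim patch embeddings
y = merge_patch(x, grid_h=4, grid_w=4, kernel_h=2, kernel_w=2)
assert y.shape == (16, 4)
# First merged block is the top-left 2x2 group of patches (rows 0, 1, 4, 5).
assert (y[:4] == x[[0, 1, 4, 5]]).all()
```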
