@@ -212,6 +212,7 @@ class ModelType(Enum):
212212
213213 Qwen2_5VL = ModelTypeTagChatImageVideoIn + 0x0000001
214214 KimiVL = ModelTypeTagChatImageVideoIn + 0x0000100
215+ SmolVLM = ModelTypeTagChatImageVideoIn + 0x0000200
215216
216217 MiniCPM_O = ModelTypeTagChatImageVideoAudioInAudioOut + 0x0000001
217218
@@ -1836,6 +1837,96 @@ def get_weight_names(config):
18361837 r = Llama3Converter .get_weight_names (config )
18371838 return r [:- 1 ]
18381839
class SmolVLMConverter(BaseConverter):
    """Converter for SmolVLM checkpoints: a SmolLM text tower plus a
    SigLIP-style vision tower joined by a linear modality projector.

    NOTE(review): `txt_config` is a class attribute cached by `dump_config`
    and read by `state_dict_pp` / `get_weight_names` — `dump_config` must
    run first (presumably guaranteed by BaseConverter.convert; verify).
    """

    MODEL_TYPE = ModelType.SmolVLM

    @classmethod
    def state_dict_pp(cls, config, state_dict):
        """Rename HF tensor keys to this converter's naming scheme.

        Text-tower tensors ('model.text_model.*') are renamed to 'model.*'
        and post-processed by SmolLMConverter.pp; vision-tower tensors get
        fc1/fc2 -> fc0/fc1, out_proj -> o_proj and post_layernorm ->
        final_layernorm remappings. Everything else passes through.
        """
        renamed = {}
        for key in state_dict:
            tensor: torch.Tensor = state_dict[key]

            if key.startswith('model.text_model.'):
                new_key = key.replace('model.text_model.', 'model.')
                renamed[new_key] = SmolLMConverter.pp(SmolVLMConverter.txt_config, new_key, tensor)
            elif key.startswith('model.vision_model'):
                new_key = key.replace('model.vision_model.', 'vision_model.')

                # Exactly one rewrite applies per tensor name.
                if 'mlp.fc1.' in new_key:
                    new_key = new_key.replace('.fc1.', '.fc0.')
                elif 'mlp.fc2.' in new_key:
                    new_key = new_key.replace('.fc2.', '.fc1.')
                elif '.out_proj.' in new_key:
                    new_key = new_key.replace('.out_proj.', '.o_proj.')
                elif new_key.startswith('vision_model.post_layernorm'):
                    new_key = new_key.replace('.post_layernorm.', '.final_layernorm.')

                renamed[new_key] = tensor
            elif key.startswith('vision_tower.'):
                renamed[key.replace('vision_tower.', 'vision_model.')] = tensor
            elif key == 'model.connector.modality_projection.proj.weight':
                renamed["multi_modal_projector.proj.weight"] = tensor
            else:
                renamed[key] = tensor

        return renamed

    @staticmethod
    def dump_config(f, config, ggml_type):
        """Fill in missing text-config defaults, cache the text config on the
        class, and delegate the actual dump to SmolLMConverter.

        Raises AssertionError for unsupported features (tied embeddings,
        qk layer norms, resampler).
        """
        SmolVLMConverter.txt_config = cfg = AttributeDict(config.text_config)
        if cfg.bos_token_id is None:
            cfg.bos_token_id = 128_000
        if cfg.eos_token_id is None:
            cfg.eos_token_id = 128_001
        if cfg.num_attention_heads is None:
            cfg.num_attention_heads = 32
        if cfg.hidden_act is None:
            cfg.hidden_act = 'silu'
        if cfg.num_key_value_heads is None:
            # MHA fallback: one KV head per attention head.
            cfg.num_key_value_heads = cfg.num_attention_heads
        if cfg.tie_word_embeddings is None:
            cfg.tie_word_embeddings = False

        assert not cfg.tie_word_embeddings
        assert not cfg.qk_layer_norms
        assert not cfg.use_resampler
        SmolLMConverter.dump_config(f, cfg, ggml_type)

    @staticmethod
    def get_weight_names(config):
        """Return the text-tower weight names followed by the vision-tower
        and projector names, in serialization order."""
        weight_names = Llama3Converter.get_weight_names(SmolVLMConverter.txt_config)

        # Per-layer vision weights; the (leaf, then bias-before-weight)
        # ordering below reproduces the original emission order exactly.
        per_layer_leaves = (
            "self_attn.q_proj",
            "self_attn.k_proj",
            "self_attn.v_proj",
            "self_attn.o_proj",
            "mlp.fc0",
            "mlp.fc1",
            "layer_norm1",
            "layer_norm2",
        )
        for i in range(config.vision_config['num_hidden_layers']):
            prefix = f"vision_model.encoder.layers.{i}."
            for leaf in per_layer_leaves:
                weight_names.append(f"{prefix}{leaf}.bias")
                weight_names.append(f"{prefix}{leaf}.weight")

        weight_names += [
            "multi_modal_projector.proj.weight",
            "vision_model.final_layernorm.bias",
            "vision_model.final_layernorm.weight",
            "vision_model.embeddings.position_embedding.weight",
            "vision_model.embeddings.patch_embedding.bias",
            "vision_model.embeddings.patch_embedding.weight",
        ]

        return weight_names
1929+
18391930class LlamaMultiConverter (BaseConverter ):
18401931 MODEL_TYPE = ModelType .LlaMAMulti
18411932
@@ -6965,6 +7056,8 @@ def main():
69657056 Llama3Converter .convert (config , model_files , vocab , ggml_type , args .save_path )
69667057 elif arch == 'smollm' :
69677058 SmolLMConverter .convert (config , model_files , vocab , ggml_type , args .save_path )
7059+ elif arch == 'SmolVLMForConditionalGeneration' :
7060+ SmolVLMConverter .convert (config , model_files , vocab , ggml_type , args .save_path )
69687061 elif arch == 'XverseForCausalLM' :
69697062 if config .num_experts is None :
69707063 LlamaConverter .MODEL_TYPE = ModelType .XVERSE
0 commit comments