From 32c6e529d553ebb07a2e901b82d4e3ad862ae73a Mon Sep 17 00:00:00 2001
From: Harry Hu
Date: Mon, 25 Aug 2025 01:52:20 -0400
Subject: [PATCH] [Model] Updated model preset with more models

---
 python/mlc_llm/model/model_preset.py | 389 +++++++++++++++++++++++++++
 1 file changed, 389 insertions(+)

diff --git a/python/mlc_llm/model/model_preset.py b/python/mlc_llm/model/model_preset.py
index c8245366e3..59b1fa101c 100644
--- a/python/mlc_llm/model/model_preset.py
+++ b/python/mlc_llm/model/model_preset.py
@@ -1103,6 +1103,31 @@
         "use_cache": True,
         "vocab_size": 103168,
     },
+    # "gemma_2b": {
+    #     "architectures": ["GemmaForCausalLM"],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 2,
+    #     "eos_token_id": 1,
+    #     "head_dim": 256,
+    #     "hidden_act": "gelu",
+    #     "hidden_size": 2048,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 16384,
+    #     "max_position_embeddings": 8192,
+    #     "model_type": "gemma",
+    #     "num_attention_heads": 8,
+    #     "num_hidden_layers": 18,
+    #     "num_key_value_heads": 1,
+    #     "pad_token_id": 0,
+    #     "rms_norm_eps": 1e-06,
+    #     "rope_scaling": None,
+    #     "rope_theta": 10000.0,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.38.0.dev0",
+    #     "use_cache": True,
+    #     "vocab_size": 256000,
+    # },
     "gemma2_2b": {
         "architectures": ["Gemma2ForCausalLM"],
         "attention_bias": False,
@@ -1196,6 +1221,75 @@
         "vocab_size": 256000,
         "_attn_implementation": "eager",
     },
+    # "gemma2_2b-jpn": {
+    #     "architectures": [
+    #         "Gemma2ForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "attn_logit_softcapping": 50.0,
+    #     "bos_token_id": 2,
+    #     "cache_implementation": "hybrid",
+    #     "dtype": "bfloat16",
+    #     "eos_token_id": 1,
+    #     "final_logit_softcapping": 30.0,
+    #     "head_dim": 256,
+    #     "hidden_activation": "gelu_pytorch_tanh",
+    #     "hidden_size": 2304,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 9216,
+    #     "max_position_embeddings": 8192,
+    #     "model_type": "gemma2",
+    #     "num_attention_heads": 8,
+    #     "num_hidden_layers": 26,
+    #     "num_key_value_heads": 4,
+    #     "pad_token_id": 0,
+    #     "query_pre_attn_scalar": 224,
+    #     "rms_norm_eps": 1e-06,
+    #     "rope_theta": 10000.0,
+    #     "sliding_window": 4096,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.44.2",
+    #     "use_cache": True,
+    #     "vocab_size": 256000,
+    # },
+    # "gemma3_1b_it": {
+    #     "architectures": [
+    #         "Gemma3ForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "attn_logit_softcapping": None,
+    #     "bos_token_id": 2,
+    #     "cache_implementation": "hybrid",
+    #     "eos_token_id": [
+    #         1,
+    #         106
+    #     ],
+    #     "final_logit_softcapping": None,
+    #     "head_dim": 256,
+    #     "hidden_activation": "gelu_pytorch_tanh",
+    #     "hidden_size": 1152,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 6912,
+    #     "max_position_embeddings": 32768,
+    #     "model_type": "gemma3_text",
+    #     "num_attention_heads": 4,
+    #     "num_hidden_layers": 26,
+    #     "num_key_value_heads": 1,
+    #     "pad_token_id": 0,
+    #     "query_pre_attn_scalar": 256,
+    #     "rms_norm_eps": 1e-06,
+    #     "rope_local_base_freq": 10000,
+    #     "rope_scaling": None,
+    #     "rope_theta": 1000000,
+    #     "sliding_window": 512,
+    #     "sliding_window_pattern": 6,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.50.0.dev0",
+    #     "use_cache": True,
+    #     "vocab_size": 262144,
+    # },
     "rwkv5_3b": {
         "architectures": ["RwkvForCausalLM"],
         "auto_map": {
@@ -1664,6 +1758,187 @@
         "use_sliding_window": False,
         "vocab_size": 152064,
     },
+    # commented to save CI time
+    # "qwen3_0.6b": {
+    #     "architectures": [
+    #         "Qwen3ForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 151643,
+    #     "eos_token_id": 151645,
+    #     "head_dim": 128,
+    #     "hidden_act": "silu",
+    #     "hidden_size": 1024,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 3072,
+    #     "max_position_embeddings": 40960,
+    #     "max_window_layers": 28,
+    #     "model_type": "qwen3",
+    #     "num_attention_heads": 16,
+    #     "num_hidden_layers": 28,
+    #     "num_key_value_heads": 8,
+    #     "rms_norm_eps": 1e-06,
+    #     "rope_scaling": None,
+    #     "rope_theta": 1000000,
+    #     "sliding_window": None,
+    #     "tie_word_embeddings": True,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.51.0",
+    #     "use_cache": True,
+    #     "use_sliding_window": False,
+    #     "vocab_size": 151936,
+    # },
+    # "qwen3_1.7b": {
+    #     "architectures": [
+    #         "Qwen3ForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 151643,
+    #     "eos_token_id": 151645,
+    #     "head_dim": 128,
+    #     "hidden_act": "silu",
+    #     "hidden_size": 2048,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 6144,
+    #     "max_position_embeddings": 40960,
+    #     "max_window_layers": 28,
+    #     "model_type": "qwen3",
+    #     "num_attention_heads": 16,
+    #     "num_hidden_layers": 28,
+    #     "num_key_value_heads": 8,
+    #     "rms_norm_eps": 1e-06,
+    #     "rope_scaling": None,
+    #     "rope_theta": 1000000,
+    #     "sliding_window": None,
+    #     "tie_word_embeddings": True,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.51.0",
+    #     "use_cache": True,
+    #     "use_sliding_window": False,
+    #     "vocab_size": 151936,
+    # },
+    # "qwen3_4b": {
+    #     "architectures": [
+    #         "Qwen3ForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 151643,
+    #     "eos_token_id": 151645,
+    #     "head_dim": 128,
+    #     "hidden_act": "silu",
+    #     "hidden_size": 2560,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 9728,
+    #     "max_position_embeddings": 40960,
+    #     "max_window_layers": 36,
+    #     "model_type": "qwen3",
+    #     "num_attention_heads": 32,
+    #     "num_hidden_layers": 36,
+    #     "num_key_value_heads": 8,
+    #     "rms_norm_eps": 1e-06,
+    #     "rope_scaling": None,
+    #     "rope_theta": 1000000,
+    #     "sliding_window": None,
+    #     "tie_word_embeddings": True,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.51.0",
+    #     "use_cache": True,
+    #     "use_sliding_window": False,
+    #     "vocab_size": 151936,
+    # },
+    # "qwen3_4b_thinking_2507": {
+    #     "architectures": [
+    #         "Qwen3ForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 151643,
+    #     "eos_token_id": 151645,
+    #     "head_dim": 128,
+    #     "hidden_act": "silu",
+    #     "hidden_size": 2560,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 9728,
+    #     "max_position_embeddings": 262144,
+    #     "max_window_layers": 36,
+    #     "model_type": "qwen3",
+    #     "num_attention_heads": 32,
+    #     "num_hidden_layers": 36,
+    #     "num_key_value_heads": 8,
+    #     "rms_norm_eps": 1e-06,
+    #     "rope_scaling": None,
+    #     "rope_theta": 5000000,
+    #     "sliding_window": None,
+    #     "tie_word_embeddings": True,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.51.0",
+    #     "use_cache": True,
+    #     "use_sliding_window": False,
+    #     "vocab_size": 151936,
+    # },
+    # "qwen3_4b_instruct_2507": {
+    #     "architectures": [
+    #         "Qwen3ForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 151643,
+    #     "eos_token_id": 151645,
+    #     "head_dim": 128,
+    #     "hidden_act": "silu",
+    #     "hidden_size": 2560,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 9728,
+    #     "max_position_embeddings": 262144,
+    #     "max_window_layers": 36,
+    #     "model_type": "qwen3",
+    #     "num_attention_heads": 32,
+    #     "num_hidden_layers": 36,
+    #     "num_key_value_heads": 8,
+    #     "rms_norm_eps": 1e-06,
+    #     "rope_scaling": None,
+    #     "rope_theta": 5000000,
+    #     "sliding_window": None,
+    #     "tie_word_embeddings": True,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.51.0",
+    #     "use_cache": True,
+    #     "use_sliding_window": False,
+    #     "vocab_size": 151936,
+    # },
+    # "qwen3_8b": {
+    #     "architectures": [
+    #         "Qwen3ForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 151643,
+    #     "eos_token_id": 151645,
+    #     "head_dim": 128,
+    #     "hidden_act": "silu",
+    #     "hidden_size": 4096,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 12288,
+    #     "max_position_embeddings": 40960,
+    #     "max_window_layers": 36,
+    #     "model_type": "qwen3",
+    #     "num_attention_heads": 32,
+    #     "num_hidden_layers": 36,
+    #     "num_key_value_heads": 8,
+    #     "rms_norm_eps": 1e-06,
+    #     "rope_scaling": None,
+    #     "rope_theta": 1000000,
+    #     "sliding_window": None,
+    #     "tie_word_embeddings": False,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.51.0",
+    #     "use_cache": True,
+    #     "use_sliding_window": False,
+    #     "vocab_size": 151936,
+    # },
     "internlm2": {
         "architectures": ["InternLM2ForCausalLM"],
         "attn_implementation": "eager",
@@ -1831,6 +2106,120 @@
         "use_cache": True,
         "vocab_size": 49152,
     },
+    # "smollm2_1_7b": {
+    #     "architectures": [
+    #         "LlamaForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 1,
+    #     "eos_token_id": 2,
+    #     "hidden_act": "silu",
+    #     "hidden_size": 2048,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 8192,
+    #     "max_position_embeddings": 8192,
+    #     "mlp_bias": False,
+    #     "model_type": "llama",
+    #     "num_attention_heads": 32,
+    #     "num_hidden_layers": 24,
+    #     "num_key_value_heads": 32,
+    #     "pad_token_id": 2,
+    #     "pretraining_tp": 1,
+    #     "rms_norm_eps": 1e-05,
+    #     "rope_scaling": None,
+    #     "rope_theta": 130000,
+    #     "tie_word_embeddings": True,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.42.3",
+    #     "transformers.js_config": {
+    #         "dtype": "q4",
+    #         "kv_cache_dtype": {
+    #             "q4f16": "float16",
+    #             "fp16": "float16"
+    #         },
+    #         "use_external_data_format": {
+    #             "model.onnx": True,
+    #             "model_fp16.onnx": True
+    #         }
+    #     },
+    #     "use_cache": True,
+    #     "vocab_size": 49152,
+    # },
+    # "smollm2_360m": {
+    #     "architectures": [
+    #         "LlamaForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 1,
+    #     "eos_token_id": 2,
+    #     "hidden_act": "silu",
+    #     "hidden_size": 960,
+    #     "initializer_range": 0.02,
+    #     "intermediate_size": 2560,
+    #     "is_llama_config": True,
+    #     "max_position_embeddings": 8192,
+    #     "mlp_bias": False,
+    #     "model_type": "llama",
+    #     "num_attention_heads": 15,
+    #     "num_hidden_layers": 32,
+    #     "num_key_value_heads": 5,
+    #     "pad_token_id": 2,
+    #     "pretraining_tp": 1,
+    #     "rms_norm_eps": 1e-05,
+    #     "rope_interleaved": False,
+    #     "rope_scaling": None,
+    #     "rope_theta": 100000,
+    #     "tie_word_embeddings": True,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.42.3",
+    #     "transformers.js_config": {
+    #         "kv_cache_dtype": {
+    #             "q4f16": "float16",
+    #             "fp16": "float16"
+    #         }
+    #     },
+    #     "use_cache": True,
+    #     "vocab_size": 49152,
+    # },
+    # "smollm2_135m": {
+    #     "architectures": [
+    #         "LlamaForCausalLM"
+    #     ],
+    #     "attention_bias": False,
+    #     "attention_dropout": 0.0,
+    #     "bos_token_id": 1,
+    #     "eos_token_id": 2,
+    #     "hidden_act": "silu",
+    #     "hidden_size": 576,
+    #     "initializer_range": 0.041666666666666664,
+    #     "intermediate_size": 1536,
+    #     "is_llama_config": True,
+    #     "max_position_embeddings": 8192,
+    #     "mlp_bias": False,
+    #     "model_type": "llama",
+    #     "num_attention_heads": 9,
+    #     "num_hidden_layers": 30,
+    #     "num_key_value_heads": 3,
+    #     "pad_token_id": 2,
+    #     "pretraining_tp": 1,
+    #     "rms_norm_eps": 1e-05,
+    #     "rope_interleaved": False,
+    #     "rope_scaling": None,
+    #     "rope_theta": 100000,
+    #     "tie_word_embeddings": True,
+    #     "torch_dtype": "bfloat16",
+    #     "transformers_version": "4.42.3",
+    #     "transformers.js_config": {
+    #         "kv_cache_dtype": {
+    #             "q4f16": "float16",
+    #             "fp16": "float16"
+    #         }
+    #     },
+    #     "use_cache": True,
+    #     "vocab_size": 49152,
+    # },
     "aya-23": {
         "architectures": ["CohereForCausalLM"],
         "attention_bias": False,
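
Every preset in this patch lands commented out (the Qwen3 block carries the note "commented to save CI time"), so enabling one is simply a matter of uncommenting its dict entry. The sketch below is a minimal, hypothetical way to sanity-check a re-enabled entry; it assumes the presets live in the MODEL_PRESETS dict defined in python/mlc_llm/model/model_preset.py, and check_preset is an illustrative helper that is not part of this patch.

# Minimal sketch (not part of the patch): sanity-check a preset entry after
# uncommenting it. Assumes model_preset.py exposes the MODEL_PRESETS dict;
# check_preset is a hypothetical helper used only for illustration.
from mlc_llm.model.model_preset import MODEL_PRESETS


def check_preset(name: str) -> None:
    cfg = MODEL_PRESETS[name]
    # Grouped-query attention: query heads must divide evenly over KV heads.
    assert cfg["num_attention_heads"] % cfg["num_key_value_heads"] == 0
    # Qwen-style per-layer window settings must stay within the layer count.
    if cfg.get("max_window_layers") is not None:
        assert cfg["max_window_layers"] <= cfg["num_hidden_layers"]
    # Context length and vocabulary size must both be positive.
    assert cfg["max_position_embeddings"] > 0 and cfg["vocab_size"] > 0


check_preset("qwen3_0.6b")  # raises KeyError until the entry is uncommented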