-
Notifications
You must be signed in to change notification settings - Fork 76
feat: add Kimi-K2.5 (moonshotai/Kimi-K2.5) model support in HYBRID mode #403
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 6 commits
997a02f
5fcd29a
86e8b1d
d921975
5cca3e7
3270e64
71707e0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,7 @@ | |
| def get_context_mla_test_cases(): | ||
| dtype_list = [tensorrt_llm.bindings.DataType.BF16] # not support f8 for trt < v1.1 | ||
| test_cases = [] | ||
| n_list = [128] | ||
| n_list = [64, 128] | ||
|
||
| b_list = [1, 2, 4, 8, 16, 32, 64, 128, 256] | ||
| s_list = [1, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 10240, 12288, 16384, 32768] | ||
| for n in n_list: | ||
|
|
@@ -59,7 +59,7 @@ def get_context_mla_test_cases(): | |
| def get_generation_mla_test_cases(): | ||
| dtype_list = [tensorrt_llm.bindings.DataType.BF16] # not support f8 for trt < v1.1 | ||
| test_cases = [] | ||
| n_list = [128] | ||
| n_list = [64, 128] | ||
|
||
| for n in n_list: | ||
| for b in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]: | ||
| for s in [ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -145,6 +145,12 @@ def _add_default_mode_arguments(parser): | |
| help="Optional end-to-end request latency target (ms). Enables request-latency optimization mode.", | ||
| ) | ||
| parser.add_argument("--prefix", type=int, default=0, help="Prefix cache length. Default to 0.") | ||
| parser.add_argument( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we remove wideep and design it in a separate PR? |
||
| "--enable-wideep", | ||
| action="store_true", | ||
| default=False, | ||
| help="Enable wide expert-parallelism search space (effective for DeepSeek models with trtllm/sglang backends).", | ||
| ) | ||
|
|
||
|
|
||
| def _add_experiments_mode_arguments(parser): | ||
|
|
@@ -576,6 +582,7 @@ def build_default_task_configs( | |
| tpot: float = 30.0, | ||
| request_latency: float | None = None, | ||
| prefix: int = 0, | ||
| enable_wideep: bool = False, | ||
| ) -> dict[str, TaskConfig]: | ||
| """Build agg and disagg task configs for default mode comparison. | ||
|
|
||
|
|
@@ -594,6 +601,7 @@ def build_default_task_configs( | |
| tpot: Time per output token target in ms. | ||
| request_latency: Optional end-to-end request latency target (ms). | ||
| prefix: Prefix cache length. | ||
| enable_wideep: Enable wide expert-parallelism search space. | ||
|
|
||
| Returns: | ||
| Dict with TaskConfig objects. When backend='auto', returns 6 configs | ||
|
|
@@ -621,6 +629,7 @@ def build_default_task_configs( | |
| "request_latency": request_latency, | ||
| "prefix": prefix, | ||
| "database_mode": database_mode, | ||
| "enable_wideep": enable_wideep, | ||
| } | ||
|
|
||
| task_configs: dict[str, TaskConfig] = {} | ||
|
|
@@ -1332,6 +1341,7 @@ def main(args): | |
| tpot=args.tpot, | ||
| request_latency=args.request_latency, | ||
| prefix=args.prefix, | ||
| enable_wideep=args.enable_wideep, | ||
| ) | ||
| elif args.mode == "exp": | ||
| try: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We need to do a full copy-paste; this is missing the quant field. https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/config.json |
||
| "architectures": ["DeepseekV3ForCausalLM"], | ||
| "model_type": "kimi_k2", | ||
| "num_hidden_layers": 61, | ||
| "hidden_size": 7168, | ||
| "num_attention_heads": 64, | ||
| "num_key_value_heads": 64, | ||
| "intermediate_size": 18432, | ||
| "vocab_size": 163840, | ||
| "max_position_embeddings": 131072, | ||
| "n_routed_experts": 384, | ||
| "n_shared_experts": 1, | ||
| "num_experts_per_tok": 8, | ||
| "moe_intermediate_size": 2048, | ||
| "moe_layer_freq": 1, | ||
| "first_k_dense_replace": 1, | ||
| "kv_lora_rank": 512, | ||
| "q_lora_rank": 1536, | ||
| "qk_nope_head_dim": 128, | ||
| "qk_rope_head_dim": 64, | ||
| "v_head_dim": 128, | ||
| "routed_scaling_factor": 2.827, | ||
| "torch_dtype": "bfloat16", | ||
| "use_cache": true, | ||
| "tie_word_embeddings": false | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| { | ||
| "architectures": ["DeepseekV3ForCausalLM"], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Looks like this model employs a 4-bit quant: https://huggingface.co/moonshotai/Kimi-K2-Thinking/blob/main/config.json |
||
| "model_type": "kimi_k2", | ||
| "num_hidden_layers": 61, | ||
| "hidden_size": 7168, | ||
| "num_attention_heads": 64, | ||
| "num_key_value_heads": 64, | ||
| "intermediate_size": 18432, | ||
| "vocab_size": 163840, | ||
| "max_position_embeddings": 262144, | ||
| "n_routed_experts": 384, | ||
| "n_shared_experts": 1, | ||
| "num_experts_per_tok": 8, | ||
| "moe_intermediate_size": 2048, | ||
| "moe_layer_freq": 1, | ||
| "first_k_dense_replace": 1, | ||
| "kv_lora_rank": 512, | ||
| "q_lora_rank": 1536, | ||
| "qk_nope_head_dim": 128, | ||
| "qk_rope_head_dim": 64, | ||
| "v_head_dim": 128, | ||
| "routed_scaling_factor": 2.827, | ||
| "torch_dtype": "bfloat16", | ||
| "use_cache": true, | ||
| "tie_word_embeddings": false | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| { | ||
| "architectures": ["KimiK25ForConditionalGeneration"], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How do we handle the vision encoder?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here — no quantization; needs a full copy-paste. |
||
| "model_type": "kimi_k25", | ||
| "num_hidden_layers": 61, | ||
| "hidden_size": 7168, | ||
| "num_attention_heads": 64, | ||
| "num_key_value_heads": 64, | ||
| "intermediate_size": 18432, | ||
| "vocab_size": 163840, | ||
| "max_position_embeddings": 262144, | ||
| "n_routed_experts": 384, | ||
| "n_shared_experts": 1, | ||
| "num_experts_per_tok": 8, | ||
| "moe_intermediate_size": 2048, | ||
| "moe_layer_freq": 1, | ||
| "first_k_dense_replace": 1, | ||
| "kv_lora_rank": 512, | ||
| "q_lora_rank": 1536, | ||
| "qk_nope_head_dim": 128, | ||
| "qk_rope_head_dim": 64, | ||
| "v_head_dim": 128, | ||
| "routed_scaling_factor": 2.827, | ||
| "torch_dtype": "bfloat16", | ||
| "use_cache": true, | ||
| "tie_word_embeddings": false | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we don't need this; same for the previous explanation. 128 with tp_list [1, 2, 4, ..., 128] naturally covers 64 with tp_list [1, 2, 4, ..., 64].