Skip to content
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions collector/common_test_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ def _get_mla_common_test_cases(is_context: bool):
# num_heads, q_lora_rank, kv_lora_rank, qk_nope_head_dim, qk_rope_head_dim, v_head_dim
model_config_list = [
[128, 1536, 512, 128, 64, 128, "deepseek-ai/DeepSeek-V3"],
[64, 1536, 512, 128, 64, 128, "moonshotai/Kimi-K2.5"],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we don't need this; same for the previous explanation. 128 heads with tp_list [1,2,4,...,128] naturally covers 64 heads with tp_list [1,2,4,...,64].

]

if is_context:
Expand Down
4 changes: 2 additions & 2 deletions collector/trtllm/collect_mla.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
def get_context_mla_test_cases():
dtype_list = [tensorrt_llm.bindings.DataType.BF16] # not support f8 for trt < v1.1
test_cases = []
n_list = [128]
n_list = [64, 128]
Copy link
Contributor

@tianhaox tianhaox Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The latency measurement is based on local num heads = n/tp, so this will produce a lot of duplicate measurements. Removing 64 was a previous fix here, so you don't need to modify this. I would instead suggest fixing it on the sglang side by adding tp 128 to sglang's collect_mla.py.

b_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]
s_list = [1, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 10240, 12288, 16384, 32768]
for n in n_list:
Expand Down Expand Up @@ -59,7 +59,7 @@ def get_context_mla_test_cases():
def get_generation_mla_test_cases():
dtype_list = [tensorrt_llm.bindings.DataType.BF16] # not support f8 for trt < v1.1
test_cases = []
n_list = [128]
n_list = [64, 128]
Copy link
Contributor

@tianhaox tianhaox Feb 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as above.

for n in n_list:
for b in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
for s in [
Expand Down
10 changes: 10 additions & 0 deletions src/aiconfigurator/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,12 @@ def _add_default_mode_arguments(parser):
help="Optional end-to-end request latency target (ms). Enables request-latency optimization mode.",
)
parser.add_argument("--prefix", type=int, default=0, help="Prefix cache length. Default to 0.")
parser.add_argument(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we move wideep and the design changes into a separate PR?

"--enable-wideep",
action="store_true",
default=False,
help="Enable wide expert-parallelism search space (effective for DeepSeek models with trtllm/sglang backends).",
)


def _add_experiments_mode_arguments(parser):
Expand Down Expand Up @@ -576,6 +582,7 @@ def build_default_task_configs(
tpot: float = 30.0,
request_latency: float | None = None,
prefix: int = 0,
enable_wideep: bool = False,
) -> dict[str, TaskConfig]:
"""Build agg and disagg task configs for default mode comparison.

Expand All @@ -594,6 +601,7 @@ def build_default_task_configs(
tpot: Time per output token target in ms.
request_latency: Optional end-to-end request latency target (ms).
prefix: Prefix cache length.
enable_wideep: Enable wide expert-parallelism search space.

Returns:
Dict with TaskConfig objects. When backend='auto', returns 6 configs
Expand Down Expand Up @@ -621,6 +629,7 @@ def build_default_task_configs(
"request_latency": request_latency,
"prefix": prefix,
"database_mode": database_mode,
"enable_wideep": enable_wideep,
}

task_configs: dict[str, TaskConfig] = {}
Expand Down Expand Up @@ -1332,6 +1341,7 @@ def main(args):
tpot=args.tpot,
request_latency=args.request_latency,
prefix=args.prefix,
enable_wideep=args.enable_wideep,
)
elif args.mode == "exp":
try:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need to do a full copy paste. this is missing quant field. https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/config.json

"architectures": ["DeepseekV3ForCausalLM"],
"model_type": "kimi_k2",
"num_hidden_layers": 61,
"hidden_size": 7168,
"num_attention_heads": 64,
"num_key_value_heads": 64,
"intermediate_size": 18432,
"vocab_size": 163840,
"max_position_embeddings": 131072,
"n_routed_experts": 384,
"n_shared_experts": 1,
"num_experts_per_tok": 8,
"moe_intermediate_size": 2048,
"moe_layer_freq": 1,
"first_k_dense_replace": 1,
"kv_lora_rank": 512,
"q_lora_rank": 1536,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"routed_scaling_factor": 2.827,
"torch_dtype": "bfloat16",
"use_cache": true,
"tie_word_embeddings": false
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"architectures": ["DeepseekV3ForCausalLM"],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this model employs a 4-bit quant:

https://huggingface.co/moonshotai/Kimi-K2-Thinking/blob/main/config.json
Are all these fields generated by Claude? I think we need a full copy-paste of the upstream config to avoid misalignment.

"model_type": "kimi_k2",
"num_hidden_layers": 61,
"hidden_size": 7168,
"num_attention_heads": 64,
"num_key_value_heads": 64,
"intermediate_size": 18432,
"vocab_size": 163840,
"max_position_embeddings": 262144,
"n_routed_experts": 384,
"n_shared_experts": 1,
"num_experts_per_tok": 8,
"moe_intermediate_size": 2048,
"moe_layer_freq": 1,
"first_k_dense_replace": 1,
"kv_lora_rank": 512,
"q_lora_rank": 1536,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"routed_scaling_factor": 2.827,
"torch_dtype": "bfloat16",
"use_cache": true,
"tie_word_embeddings": false
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"architectures": ["KimiK25ForConditionalGeneration"],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how we handle the vision encoder?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here — no quantization field; this needs a full copy-paste of the upstream config.

"model_type": "kimi_k25",
"num_hidden_layers": 61,
"hidden_size": 7168,
"num_attention_heads": 64,
"num_key_value_heads": 64,
"intermediate_size": 18432,
"vocab_size": 163840,
"max_position_embeddings": 262144,
"n_routed_experts": 384,
"n_shared_experts": 1,
"num_experts_per_tok": 8,
"moe_intermediate_size": 2048,
"moe_layer_freq": 1,
"first_k_dense_replace": 1,
"kv_lora_rank": 512,
"q_lora_rank": 1536,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"routed_scaling_factor": 2.827,
"torch_dtype": "bfloat16",
"use_cache": true,
"tie_word_embeddings": false
}
5 changes: 5 additions & 0 deletions src/aiconfigurator/sdk/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,10 @@ def get_default_models() -> set[str]:
# DeepSeek Models
"deepseek-ai/DeepSeek-V3",
"nvidia/DeepSeek-V3.1-NVFP4",
# Kimi Models
"moonshotai/Kimi-K2-Instruct",
"moonshotai/Kimi-K2-Thinking",
"moonshotai/Kimi-K2.5",
# Qwen 2.5 Models
"Qwen/Qwen2.5-1.5B",
"Qwen/Qwen2.5-7B",
Expand Down Expand Up @@ -288,6 +292,7 @@ def get_default_models() -> set[str]:
"Qwen3ForCausalLM": "LLAMA",
"DeepSeekForCausalLM": "DEEPSEEK",
"DeepseekV3ForCausalLM": "DEEPSEEK",
"KimiK25ForConditionalGeneration": "DEEPSEEK",
"NemotronForCausalLM": "NEMOTRONNAS",
"DeciLMForCausalLM": "NEMOTRONNAS",
"NemotronHForCausalLM": "NEMOTRONH",
Expand Down
5 changes: 3 additions & 2 deletions src/aiconfigurator/sdk/inference_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,9 +373,10 @@ def get_worker_candidates(
continue
if summary_df.empty:
if exceptions:
last = exceptions[-1]
raise RuntimeError(
f"No results found for any parallel configuration. Showing last exception: {exceptions[-1]}"
) from exceptions[-1]
f"No results found for any parallel configuration. Showing last exception: {last}"
) from last
if all_configs_oom:
raise RuntimeError(
"No results found: the model does not fit in GPU memory for any parallel "
Expand Down
27 changes: 14 additions & 13 deletions src/aiconfigurator/sdk/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1137,27 +1137,27 @@ def __init__(self, topk: int, num_experts: int, moe_inter_size: int, *args) -> N
ops.GEMM(
"context_q_b_proj_gemm",
self._num_layers,
24576 // tp_size,
self._num_heads * 192 // tp_size, # num_heads * (qk_nope_head_dim + qk_rope_head_dim)
1536,
gemm_quant_mode,
),
ops.GEMM(
"context_kv_b_proj_gemm",
self._num_layers,
32768 // tp_size,
self._num_heads * 256 // tp_size, # num_heads * (qk_nope_head_dim + v_head_dim)
512,
gemm_quant_mode,
), # agg ctx attn part
ops.ContextMLA(
"context_attention",
self._num_layers,
128 // tp_size,
self._num_heads // tp_size,
kvcache_quant_mode,
fmha_quant_mode,
), # agg ctx attn part
ops.GEMM(
"context_proj_gemm", self._num_layers, h, 128 * 128 // tp_size, gemm_quant_mode
), # agg ctx attn part
"context_proj_gemm", self._num_layers, h, self._num_heads * 128 // tp_size, gemm_quant_mode
), # agg ctx attn part; 128 = v_head_dim
ops.ElementWise("context_add_norm_2", self._num_layers, 2 * h, 2 * h, 0.8),
]
)
Expand Down Expand Up @@ -1294,7 +1294,7 @@ def __init__(self, topk: int, num_experts: int, moe_inter_size: int, *args) -> N
ops.GEMM(
"generation_q_b_proj_gemm",
self._num_layers * self._mtp_scale_factor,
24576 // tp_size,
self._num_heads * 192 // tp_size, # num_heads * (qk_nope_head_dim + qk_rope_head_dim)
1536,
gemm_quant_mode,
),
Expand All @@ -1308,7 +1308,7 @@ def __init__(self, topk: int, num_experts: int, moe_inter_size: int, *args) -> N
ops.GenerationMLA(
"generation_attention",
self._num_layers * self._mtp_scale_factor,
128 // tp_size,
self._num_heads // tp_size,
kvcache_quant_mode,
), # agg gen attn part
ops.MLABmm(
Expand Down Expand Up @@ -1596,25 +1596,26 @@ def __init__(self, topk: int, num_experts: int, moe_inter_size: int, *args) -> N
ops.GEMM(
"context_q_b_proj_gemm",
self._num_layers,
24576 // tp_size,
self._num_heads * 192 // tp_size, # num_heads * (qk_nope_head_dim + qk_rope_head_dim)
1536,
gemm_quant_mode,
),
ops.GEMM(
"context_kv_b_proj_gemm",
self._num_layers,
32768 // tp_size,
self._num_heads * 256 // tp_size, # num_heads * (qk_nope_head_dim + v_head_dim)
512,
gemm_quant_mode,
),
ops.ContextMLA(
"context_attention",
self._num_layers,
128 // tp_size,
self._num_heads // tp_size,
kvcache_quant_mode,
fmha_quant_mode,
),
ops.GEMM("context_proj_gemm", self._num_layers, h, 128 * 128 // tp_size, gemm_quant_mode),
# 128 = v_head_dim
ops.GEMM("context_proj_gemm", self._num_layers, h, self._num_heads * 128 // tp_size, gemm_quant_mode),
ops.ElementWise("context_add_norm_2", self._num_layers, 2 * h, 2 * h, 0.8),
]
)
Expand Down Expand Up @@ -1755,7 +1756,7 @@ def __init__(self, topk: int, num_experts: int, moe_inter_size: int, *args) -> N
ops.GEMM(
"generation_q_b_proj_gemm",
self._num_layers * self._mtp_scale_factor,
24576 // tp_size,
self._num_heads * 192 // tp_size, # num_heads * (qk_nope_head_dim + qk_rope_head_dim)
1536,
gemm_quant_mode,
),
Expand All @@ -1769,7 +1770,7 @@ def __init__(self, topk: int, num_experts: int, moe_inter_size: int, *args) -> N
ops.GenerationMLA(
"generation_attention",
self._num_layers * self._mtp_scale_factor,
128 // tp_size,
self._num_heads // tp_size,
kvcache_quant_mode,
),
ops.MLABmm(
Expand Down
3 changes: 2 additions & 1 deletion src/aiconfigurator/sdk/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,8 @@ def _mode_layers(cls, ctx: TaskContext) -> list[ConfigLayer]:

@staticmethod
def _base_common_layer(ctx: TaskContext) -> dict:
nextn = 1 if ctx.model_family == "DEEPSEEK" else 0
raw_config = get_model_config_from_model_path(ctx.model_path).get("raw_config", {})
nextn = raw_config.get("num_nextn_predict_layers", 0)
return {
"serving_mode": ctx.serving_mode,
"model_path": ctx.model_path,
Expand Down
34 changes: 23 additions & 11 deletions src/aiconfigurator/sdk/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,21 +442,33 @@ def _parse_hf_config_json(config: dict) -> dict:
f"Supported architectures: {', '.join(ARCHITECTURE_TO_MODEL_FAMILY.keys())}"
)

layers = config["num_hidden_layers"]
hidden_size = config["hidden_size"]
n = config["num_attention_heads"]
vocab = config["vocab_size"]
context = config["max_position_embeddings"]
# For multimodal VLMs (e.g., KimiK25ForConditionalGeneration), model params are
# nested under "text_config"; fall back to top-level config for pure text models.
effective_config = config.get("text_config", config)

layers = effective_config["num_hidden_layers"]
hidden_size = effective_config["hidden_size"]
n = effective_config["num_attention_heads"]
vocab = effective_config["vocab_size"]
context = effective_config["max_position_embeddings"]

# Handle nullable fields (e.g., Nemotron has null for these)
n_kv = config.get("num_key_value_heads") or 0
inter_size = config.get("intermediate_size") or 0
d = config.get("head_dim") or config.get("attention_head_dim") or (hidden_size // n if n > 0 else 0)
n_kv = effective_config.get("num_key_value_heads") or 0
inter_size = effective_config.get("intermediate_size") or 0
d = (
effective_config.get("head_dim")
or effective_config.get("attention_head_dim")
or (hidden_size // n if n > 0 else 0)
)

# MoE parameters
topk = config.get("num_experts_per_tok", 0)
num_experts = config.get("num_local_experts") or config.get("n_routed_experts") or config.get("num_experts", 0)
moe_inter_size = config.get("moe_intermediate_size", 0) or config.get("intermediate_size", 0)
topk = effective_config.get("num_experts_per_tok", 0)
num_experts = (
effective_config.get("num_local_experts")
or effective_config.get("n_routed_experts")
or effective_config.get("num_experts", 0)
)
moe_inter_size = effective_config.get("moe_intermediate_size", 0) or effective_config.get("intermediate_size", 0)

# Handle NemotronH-specific configuration (only fields unique to NemotronH)
extra_params = None
Expand Down