Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/aiconfigurator/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,12 @@ def _add_default_mode_arguments(parser):
help="Optional end-to-end request latency target (ms). Enables request-latency optimization mode.",
)
parser.add_argument("--prefix", type=int, default=0, help="Prefix cache length. Default to 0.")
parser.add_argument(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest we remove the wideep support in this PR and do a more complete design in a separate PR.

"--enable-wideep",
action="store_true",
default=False,
help="Enable wide expert-parallelism search space for MoE models.",
)


def _add_experiments_mode_arguments(parser):
Expand Down Expand Up @@ -568,6 +574,7 @@ def build_default_task_configs(
tpot: float = 30.0,
request_latency: float | None = None,
prefix: int = 0,
enable_wideep: bool = False,
) -> dict[str, TaskConfig]:
"""Build agg and disagg task configs for default mode comparison.

Expand All @@ -586,6 +593,7 @@ def build_default_task_configs(
tpot: Time per output token target in ms.
request_latency: Optional end-to-end request latency target (ms).
prefix: Prefix cache length.
enable_wideep: Enable wide expert-parallelism search space for MoE models.

Returns:
Dict with TaskConfig objects. When backend='auto', returns 6 configs
Expand Down Expand Up @@ -613,6 +621,7 @@ def build_default_task_configs(
"request_latency": request_latency,
"prefix": prefix,
"database_mode": database_mode,
"enable_wideep": enable_wideep,
}

task_configs: dict[str, TaskConfig] = {}
Expand Down Expand Up @@ -1206,6 +1215,7 @@ def main(args):
tpot=args.tpot,
request_latency=args.request_latency,
prefix=args.prefix,
enable_wideep=args.enable_wideep,
)
elif args.mode == "exp":
try:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is incorrect — the config is missing the quant field.

"architectures": ["Glm4MoeForCausalLM"],
"model_type": "glm4_moe",
"num_hidden_layers": 92,
"hidden_size": 5120,
"num_attention_heads": 96,
"num_key_value_heads": 8,
"head_dim": 128,
"intermediate_size": 12288,
"vocab_size": 151552,
"max_position_embeddings": 202752,
"n_routed_experts": 160,
"n_shared_experts": 1,
"num_experts_per_tok": 8,
"moe_intermediate_size": 1536,
"first_k_dense_replace": 3,
"routed_scaling_factor": 2.5,
"torch_dtype": "bfloat16",
"use_cache": true,
"tie_word_embeddings": false
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"architectures": ["Glm4MoeLiteForCausalLM"],
"model_type": "glm4_moe_lite",
"num_hidden_layers": 47,
"hidden_size": 2048,
"num_attention_heads": 20,
"num_key_value_heads": 20,
"head_dim": 64,
"intermediate_size": 10240,
"vocab_size": 154880,
"max_position_embeddings": 202752,
"n_routed_experts": 64,
"n_shared_experts": 1,
"num_experts_per_tok": 4,
"moe_intermediate_size": 1536,
"first_k_dense_replace": 1,
"kv_lora_rank": 512,
"q_lora_rank": 768,
"qk_nope_head_dim": 192,
"qk_rope_head_dim": 64,
"v_head_dim": 256,
"routed_scaling_factor": 1.8,
"torch_dtype": "bfloat16",
"use_cache": true,
"tie_word_embeddings": false
}
27 changes: 27 additions & 0 deletions src/aiconfigurator/model_configs/zai-org--GLM-5-FP8_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest we do a manual copy-paste to avoid hallucination.

"architectures": ["GlmMoeDsaForCausalLM"],
"model_type": "glm_moe_dsa",
"num_hidden_layers": 78,
"hidden_size": 6144,
"num_attention_heads": 64,
"num_key_value_heads": 64,
"head_dim": 64,
"intermediate_size": 12288,
"vocab_size": 154880,
"max_position_embeddings": 202752,
"n_routed_experts": 256,
"n_shared_experts": 1,
"num_experts_per_tok": 8,
"moe_intermediate_size": 2048,
"moe_layer_freq": 1,
"first_k_dense_replace": 3,
"kv_lora_rank": 512,
"q_lora_rank": 2048,
"qk_nope_head_dim": 192,
"qk_rope_head_dim": 64,
"v_head_dim": 256,
"routed_scaling_factor": 2.5,
"torch_dtype": "bfloat16",
"use_cache": true,
"tie_word_embeddings": false
}
27 changes: 27 additions & 0 deletions src/aiconfigurator/sdk/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,26 @@ class NemotronHConfig:
moe_shared_expert_intermediate_size: int = 0 # Optional: 0 for non-MoE NemotronH models


@dataclass(frozen=True)  # frozen: instances are immutable and hashable, safe to share across task configs
class DeepSeekMLAConfig:
    """
    Multi-head Latent Attention (MLA) configuration for DeepSeek-family models.

    Groups the low-rank projection ranks and per-head dimensions that
    characterize an MLA attention layer (as used by DeepSeek-style
    architectures, including the GLM variants mapped to "DEEPSEEK"
    elsewhere in this module).

    Attributes:
        q_lora_rank (int): Rank of the query low-rank projection
        kv_lora_rank (int): Rank of the key/value low-rank projection (compressed KV cache dim)
        qk_nope_head_dim (int): Per-head dimension for the non-RoPE portion of Q/K
        qk_rope_head_dim (int): Per-head dimension for the RoPE portion of Q/K
        v_head_dim (int): Per-head value dimension
    """

    q_lora_rank: int        # query low-rank projection rank (0 typically means no Q compression — TODO confirm caller convention)
    kv_lora_rank: int       # compressed KV-cache dimension per layer
    qk_nope_head_dim: int   # non-rotary portion of each Q/K head
    qk_rope_head_dim: int   # rotary (RoPE) portion of each Q/K head
    v_head_dim: int         # value dimension per head


def _get_support_matrix_resource():
"""Get the support_matrix.csv as a Traversable resource."""
return pkg_resources.files("aiconfigurator") / "systems" / "support_matrix.csv"
Expand Down Expand Up @@ -233,6 +253,10 @@ def get_default_models() -> set[str]:
# DeepSeek Models
"deepseek-ai/DeepSeek-V3",
"nvidia/DeepSeek-V3.1-NVFP4",
# GLM Models
"zai-org/GLM-4.7-Flash",
"zai-org/GLM-4.7-FP8",
"zai-org/GLM-5-FP8",
# Qwen 2.5 Models
"Qwen/Qwen2.5-1.5B",
"Qwen/Qwen2.5-7B",
Expand Down Expand Up @@ -290,6 +314,9 @@ def get_default_models() -> set[str]:
"MixtralForCausalLM": "MOE",
"GptOssForCausalLM": "MOE",
"Qwen3MoeForCausalLM": "MOE",
"GlmMoeDsaForCausalLM": "DEEPSEEK",
"Glm4MoeLiteForCausalLM": "DEEPSEEK",
"Glm4MoeForCausalLM": "MOE",
}

"""
Expand Down
9 changes: 7 additions & 2 deletions src/aiconfigurator/sdk/inference_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,9 +370,14 @@ def get_worker_candidates(
exceptions.append(e)
continue
if summary_df.empty:
if exceptions:
raise RuntimeError(
f"No results found for any parallel configuration. Showing last exception: {exceptions[-1]}"
) from exceptions[-1]
raise RuntimeError(
f"No results found for any parallel configuration. Showing last exception: {exceptions[-1]}"
) from exceptions[-1]
"No results found for any parallel configuration. "
"All configurations resulted in OOM at the smallest batch size."
)
return summary_df

def _pick_autoscale(
Expand Down
Loading