Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/aiconfigurator/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,12 @@ def _add_default_mode_arguments(parser):
help="Optional end-to-end request latency target (ms). Enables request-latency optimization mode.",
)
parser.add_argument("--prefix", type=int, default=0, help="Prefix cache length. Default to 0.")
parser.add_argument(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest we remove the wideep support in this PR and do a more complete design in a separate PR.

"--enable-wideep",
action="store_true",
default=False,
help="Enable wide expert-parallelism search space for MoE models.",
)


def _add_experiments_mode_arguments(parser):
Expand Down Expand Up @@ -568,6 +574,7 @@ def build_default_task_configs(
tpot: float = 30.0,
request_latency: float | None = None,
prefix: int = 0,
enable_wideep: bool = False,
) -> dict[str, TaskConfig]:
"""Build agg and disagg task configs for default mode comparison.

Expand All @@ -586,6 +593,7 @@ def build_default_task_configs(
tpot: Time per output token target in ms.
request_latency: Optional end-to-end request latency target (ms).
prefix: Prefix cache length.
enable_wideep: Enable wide expert-parallelism search space for MoE models.

Returns:
Dict with TaskConfig objects. When backend='auto', returns 6 configs
Expand Down Expand Up @@ -613,6 +621,7 @@ def build_default_task_configs(
"request_latency": request_latency,
"prefix": prefix,
"database_mode": database_mode,
"enable_wideep": enable_wideep,
}

task_configs: dict[str, TaskConfig] = {}
Expand Down Expand Up @@ -1206,6 +1215,7 @@ def main(args):
tpot=args.tpot,
request_latency=args.request_latency,
prefix=args.prefix,
enable_wideep=args.enable_wideep,
)
elif args.mode == "exp":
try:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is incorrect — the config is missing the quant field.

"architectures": ["Glm4MoeForCausalLM"],
"model_type": "glm4_moe",
"num_hidden_layers": 92,
"hidden_size": 5120,
"num_attention_heads": 96,
"num_key_value_heads": 8,
"head_dim": 128,
"intermediate_size": 12288,
"vocab_size": 151552,
"max_position_embeddings": 202752,
"n_routed_experts": 160,
"n_shared_experts": 1,
"num_experts_per_tok": 8,
"moe_intermediate_size": 1536,
"first_k_dense_replace": 3,
"routed_scaling_factor": 2.5,
"torch_dtype": "bfloat16",
"use_cache": true,
"tie_word_embeddings": false
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"architectures": ["Glm4MoeLiteForCausalLM"],
"model_type": "glm4_moe_lite",
"num_hidden_layers": 47,
"hidden_size": 2048,
"num_attention_heads": 20,
"num_key_value_heads": 20,
"head_dim": 64,
"intermediate_size": 10240,
"vocab_size": 154880,
"max_position_embeddings": 202752,
"n_routed_experts": 64,
"n_shared_experts": 1,
"num_experts_per_tok": 4,
"moe_intermediate_size": 1536,
"first_k_dense_replace": 1,
"kv_lora_rank": 512,
"q_lora_rank": 768,
"qk_nope_head_dim": 192,
"qk_rope_head_dim": 64,
"v_head_dim": 256,
"routed_scaling_factor": 1.8,
"torch_dtype": "bfloat16",
"use_cache": true,
"tie_word_embeddings": false
}
27 changes: 27 additions & 0 deletions src/aiconfigurator/model_configs/zai-org--GLM-5-FP8_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest we do a manual copy-paste to avoid hallucination.

"architectures": ["GlmMoeDsaForCausalLM"],
"model_type": "glm_moe_dsa",
"num_hidden_layers": 78,
"hidden_size": 6144,
"num_attention_heads": 64,
"num_key_value_heads": 64,
"head_dim": 64,
"intermediate_size": 12288,
"vocab_size": 154880,
"max_position_embeddings": 202752,
"n_routed_experts": 256,
"n_shared_experts": 1,
"num_experts_per_tok": 8,
"moe_intermediate_size": 2048,
"moe_layer_freq": 1,
"first_k_dense_replace": 3,
"kv_lora_rank": 512,
"q_lora_rank": 2048,
"qk_nope_head_dim": 192,
"qk_rope_head_dim": 64,
"v_head_dim": 256,
"routed_scaling_factor": 2.5,
"torch_dtype": "bfloat16",
"use_cache": true,
"tie_word_embeddings": false
}
27 changes: 27 additions & 0 deletions src/aiconfigurator/sdk/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,26 @@ class NemotronHConfig:
moe_shared_expert_intermediate_size: int = 0 # Optional: 0 for non-MoE NemotronH models


@dataclass(frozen=True)  # frozen: instances are immutable and hashable, safe to share across task configs
class DeepSeekMLAConfig:
    """
    Multi-head Latent Attention (MLA) configuration for DeepSeek-family models.

    Groups the low-rank projection ranks and per-head dimensions that
    characterize an MLA attention layer (as used by DeepSeek-style
    architectures, including the GLM variants mapped to "DEEPSEEK"
    elsewhere in this module).

    Attributes:
        q_lora_rank (int): Rank of the query low-rank projection
        kv_lora_rank (int): Rank of the key/value low-rank projection (compressed KV cache dim)
        qk_nope_head_dim (int): Per-head dimension for the non-RoPE portion of Q/K
        qk_rope_head_dim (int): Per-head dimension for the RoPE portion of Q/K
        v_head_dim (int): Per-head value dimension
    """

    q_lora_rank: int        # query low-rank projection rank (0 typically means no Q compression — TODO confirm caller convention)
    kv_lora_rank: int       # compressed KV-cache dimension per layer
    qk_nope_head_dim: int   # non-rotary portion of each Q/K head
    qk_rope_head_dim: int   # rotary (RoPE) portion of each Q/K head
    v_head_dim: int         # value dimension per head


def _get_support_matrix_resource():
"""Get the support_matrix.csv as a Traversable resource."""
return pkg_resources.files("aiconfigurator") / "systems" / "support_matrix.csv"
Expand Down Expand Up @@ -233,6 +253,10 @@ def get_default_models() -> set[str]:
# DeepSeek Models
"deepseek-ai/DeepSeek-V3",
"nvidia/DeepSeek-V3.1-NVFP4",
# GLM Models
"zai-org/GLM-4.7-Flash",
"zai-org/GLM-4.7-FP8",
"zai-org/GLM-5-FP8",
# Qwen 2.5 Models
"Qwen/Qwen2.5-1.5B",
"Qwen/Qwen2.5-7B",
Expand Down Expand Up @@ -290,6 +314,9 @@ def get_default_models() -> set[str]:
"MixtralForCausalLM": "MOE",
"GptOssForCausalLM": "MOE",
"Qwen3MoeForCausalLM": "MOE",
"GlmMoeDsaForCausalLM": "DEEPSEEK",
"Glm4MoeLiteForCausalLM": "DEEPSEEK",
"Glm4MoeForCausalLM": "MOE",
}

"""
Expand Down
9 changes: 7 additions & 2 deletions src/aiconfigurator/sdk/inference_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,9 +370,14 @@ def get_worker_candidates(
exceptions.append(e)
continue
if summary_df.empty:
if exceptions:
raise RuntimeError(
f"No results found for any parallel configuration. Showing last exception: {exceptions[-1]}"
) from exceptions[-1]
raise RuntimeError(
f"No results found for any parallel configuration. Showing last exception: {exceptions[-1]}"
) from exceptions[-1]
"No results found for any parallel configuration. "
"All configurations resulted in OOM at the smallest batch size."
)
return summary_df

def _pick_autoscale(
Expand Down
Loading