
Commit a1a1d79

refined IR and elaborate comments
Signed-off-by: Balamurugan Marimuthu <246387390+bmarimuthu-nv@users.noreply.github.com>
1 parent f55aa75 commit a1a1d79

File tree

3 files changed: +148 -11 lines changed
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""GLM4-MoE model patches for auto-deploy compatibility.

This module patches the GLM4-MoE model to make it compatible with torch.fx export
by replacing data-dependent operations (torch.where/nonzero) with traceable custom ops.
"""

import types
from typing import Dict

import torch
from transformers import AutoModelForCausalLM


@torch.inference_mode()
def glm4_moe_forward(self, hidden_states):
    """Glm4MoeMoE forward function rewritten to enable torch export.

    Replaces the self.moe() call (which uses torch.where) with the torch_moe custom op.
    """
    residuals = hidden_states
    orig_shape = hidden_states.shape

    # Gate directly returns (topk_indices, topk_weights)
    topk_indices, topk_weights = self.gate(hidden_states)

    # Flatten for MoE processing
    hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

    # Replace self.moe() with the torch_moe custom op.
    # self.experts is a ModuleList of Glm4MoeMLP, each with gate_proj, up_proj, down_proj.
    # Collect weights from each expert.
    w1_weight = [expert.gate_proj.weight for expert in self.experts]  # gate_proj
    w2_weight = [expert.down_proj.weight for expert in self.experts]  # down_proj
    w3_weight = [expert.up_proj.weight for expert in self.experts]  # up_proj

    hidden_states = torch.ops.auto_deploy.torch_moe(
        hidden_states,
        topk_indices,
        topk_weights,
        w1_weight=w1_weight,
        w2_weight=w2_weight,
        w3_weight=w3_weight,
    )

    hidden_states = hidden_states.view(*orig_shape)

    # Add shared experts output
    hidden_states = hidden_states + self.shared_experts(residuals)

    return hidden_states


# Store original from_config
_from_config_original = AutoModelForCausalLM.from_config

# Module patches mapping
CUSTOM_MODULE_PATCHES: Dict[str, callable] = {
    "Glm4MoeMoE": glm4_moe_forward,
}


def get_model_from_config_patched(config, **kwargs):
    """Patched from_config that applies GLM4-MoE module patches."""
    model = _from_config_original(config, **kwargs)

    # Patch modules
    for _, module in model.named_modules():
        if type(module).__name__ in CUSTOM_MODULE_PATCHES.keys():
            module.forward = types.MethodType(CUSTOM_MODULE_PATCHES[type(module).__name__], module)

    return model


# Apply the patch
AutoModelForCausalLM.from_config = get_model_from_config_patched
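
For orientation, here is a plain-PyTorch sketch of the computation that torch.ops.auto_deploy.torch_moe stands in for above, assuming the standard SwiGLU expert formulation implied by the gate_proj/up_proj/down_proj naming; the helper below is illustrative and is not the custom op's actual implementation.

import torch
import torch.nn.functional as F


def torch_moe_reference(hidden_states, topk_indices, topk_weights, w1_weight, w2_weight, w3_weight):
    """Illustrative dense-loop MoE with the same argument layout as the call above.

    hidden_states: [num_tokens, hidden_size]; topk_indices/topk_weights: [num_tokens, top_k].
    """
    out = torch.zeros_like(hidden_states)
    for expert_id, (w1, w2, w3) in enumerate(zip(w1_weight, w2_weight, w3_weight)):
        # Find which (token, slot) pairs routed to this expert
        token_ids, slot_ids = (topk_indices == expert_id).nonzero(as_tuple=True)
        if token_ids.numel() == 0:
            continue
        x = hidden_states[token_ids]
        # SwiGLU expert: down_proj(silu(gate_proj(x)) * up_proj(x))
        expert_out = (F.silu(x @ w1.t()) * (x @ w3.t())) @ w2.t()
        scale = topk_weights[token_ids, slot_ids].unsqueeze(-1).to(expert_out.dtype)
        out.index_add_(0, token_ids, expert_out * scale)
    return out

Wrapping this routing in a custom op keeps the data-dependent indexing out of the traced graph, which is why the torch.where-based path in the stock self.moe() can be bypassed for export.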

tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py

Lines changed: 12 additions & 2 deletions
@@ -1912,8 +1912,18 @@ def _shard_intermediate_attention_weights(
     Shard intermediate weights (e.g. q_norm, k_norm) for attention layers.
 
     For attention layers, there may be intermediate weights (like q_norm.weight, k_norm.weight)
-    that operate element-wise on the sharded output of q_proj/k_proj. These need to be sharded
-    along the same dimension.
+    that operate directly on the q_proj/k_proj output (before reshaping to [batch, seq, num_heads, head_dim]).
+    These need to be sharded along the same head dimension.
+
+    Example 1: norm over all heads, applied directly to the flattened Q/K output [batch, seq, hidden_size] (e.g. MiniMax):
+        self.q_norm = MiniMaxM2RMSNorm(self.head_dim * config.num_attention_heads, eps=config.rms_norm_eps)
+        weight shape: [num_heads * head_dim]
+        Status: needs q/k norm sharding (handled in this function).
+
+    Example 2: norm per head, applied after reshaping to [batch, seq, num_heads, head_dim] (e.g. GLM 4.7):
+        self.q_norm = Glm4MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        weight shape: [head_dim]
+        Status: no need to shard; will be skipped.
 
     Args:
         layer_subgraph: The attention layer subgraph
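
To make the two cases concrete, here is a small illustrative sketch of the sharding decision the docstring describes; the function name and the tp_rank/tp_size parameters are hypothetical and do not reflect the actual signature used in sharding.py.

import torch


def shard_qk_norm_weight(weight: torch.Tensor, num_heads: int, head_dim: int,
                         tp_rank: int, tp_size: int) -> torch.Tensor:
    """Illustrative only: shard a q_norm/k_norm weight along the head dimension.

    Case 1 (MiniMax-style): weight has shape [num_heads * head_dim]; keep the slice
    belonging to this rank's heads, matching the column-sharded q_proj/k_proj output.
    Case 2 (GLM-style): weight has shape [head_dim]; it is applied per head after
    reshaping, so every rank keeps the full weight.
    """
    if weight.numel() == head_dim:  # per-head norm -> no sharding needed
        return weight
    heads_per_rank = num_heads // tp_size
    start = tp_rank * heads_per_rank * head_dim
    end = start + heads_per_rank * head_dim
    return weight[start:end]  # this rank's [heads_per_rank * head_dim] slice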

tensorrt_llm/_torch/auto_deploy/utils/logger.py

Lines changed: 46 additions & 9 deletions
@@ -16,29 +16,66 @@ def _get_dtype_or_type(val):
     return type(val).__name__
 
 
+def _get_shape_str(val):
+    """Get shape as 'dim0xdim1x...' string, or '?' if not available."""
+    if hasattr(val, "shape"):
+        # Handle symbolic dimensions (SymInt) by converting to str
+        dims = [str(int(d)) if str(d).isdigit() else str(d) for d in val.shape]
+        return "x".join(dims) if dims else "scalar"
+    return "?"
+
+
+def _get_shape_dtype_str(val):
+    """Return 'shape : dtype' string for a value."""
+    shape = _get_shape_str(val)
+    dtype = _get_dtype_or_type(val)
+    return f"{shape} : {dtype}"
+
+
 def dump_ssa_with_meta(f, mod):
     for node in mod.graph.nodes:
         # Write out IR in traditional SSA style
         if node.op == "placeholder":
             if "val" in node.meta:
-                dtype = _get_dtype_or_type(node.meta["val"])
+                shape_dtype = _get_shape_dtype_str(node.meta["val"])
             else:
-                dtype = "unknown"
-            f.write(f"%{node.name} : {dtype}\n")
+                shape_dtype = "? : unknown"
+            f.write(f"%{node.name} : {shape_dtype}\n")
         elif node.op in ("call_function", "call_method", "call_module"):
-            # Build inputs list in SSA format
+            # Build inputs list in SSA format with shape:dtype info
            input_vars = []
             for arg in node.args:
                 if hasattr(arg, "name"):
-                    input_vars.append(f"%{arg.name}")
+                    # Look up the arg node's metadata for shape/dtype
+                    if hasattr(arg, "meta") and "val" in arg.meta:
+                        arg_shape_dtype = _get_shape_dtype_str(arg.meta["val"])
+                        input_vars.append(f"%{arg.name} : {arg_shape_dtype}")
+                    else:
+                        input_vars.append(f"%{arg.name} : ? : unknown")
                 else:
                     input_vars.append(str(arg))
+
+            # Handle output shape/dtype (including multi-output)
             if "val" in node.meta:
-                out_dtype = _get_dtype_or_type(node.meta["val"])
+                out_val = node.meta["val"]
+                if isinstance(out_val, (tuple, list)):
+                    # Multi-output: (shape1, shape2) : (dtype1, dtype2)
+                    shapes = []
+                    dtypes = []
+                    for v in out_val:
+                        if v is not None:
+                            shapes.append(_get_shape_str(v))
+                            dtypes.append(str(_get_dtype_or_type(v)))
+                        else:
+                            shapes.append("?")
+                            dtypes.append("None")
+                    out_info = f"({', '.join(shapes)}) : ({', '.join(dtypes)})"
+                else:
+                    out_info = _get_shape_dtype_str(out_val)
             else:
-                out_dtype = "N/A"
-            # Standard SSA notation: %out = op(args) : out_dtype
-            f.write(f"%{node.name} = {node.target}({', '.join(input_vars)}) : {out_dtype}\n")
+                out_info = "? : N/A"
+            # Standard SSA notation: %out = op(args) : shape : dtype
+            f.write(f"%{node.name} = {node.target}({', '.join(input_vars)}) : {out_info}\n")
         elif node.op == "output":
             # Output assignment in SSA IR
             outputs = node.args[0] if isinstance(node.args[0], (tuple, list)) else [node.args[0]]