
Commit 4139524

fixed rebase issue
Signed-off-by: greg-kwasniewski1 <213329731+greg-kwasniewski1@users.noreply.github.com>
1 parent b3146d0 commit 4139524

4 files changed: +98 -93 lines changed


tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py

Lines changed: 24 additions & 69 deletions
@@ -43,7 +43,7 @@
     extract_weight_nodes,
     filtered_nodes,
     get_all_layer_subgraphs,
-    get_layer_after_linear_node,
+    get_all_weights_in_subgraph,
     is_any_attention_op,
     is_any_lin_op,
     is_any_moe_op,
@@ -1060,31 +1060,6 @@ def _resolve_tp_cls_from_node(node: Node):
     return WeightShardingInfo


-def _get_dim0_from_arg(gm: GraphModule, arg: Union[Node, torch.Tensor]) -> int:
-    """Helper to get the first dimension size of an argument (Node or Tensor)."""
-    if isinstance(arg, torch.Tensor):
-        return arg.shape[0]
-    if isinstance(arg, Node):
-        if arg.op == "get_attr":
-            # Traverse attributes to find the tensor
-            obj = gm
-            for atom in arg.target.split("."):
-                obj = getattr(obj, atom)
-            return obj.shape[0]
-        if "val" in arg.meta:
-            return shape(arg)[0]
-    raise ValueError(f"Cannot determine shape[0] for {arg}")
-
-
-def get_all_weights_in_subgraph(
-    sources: list[Node],
-    sinks: list[Node],
-):
-    """Get all weight nodes (get_attr nodes) in the subgraph between sources and sinks."""
-    weight_nodes = subgraph(sources, sinks, include=lambda n: n.op == "get_attr")
-    return weight_nodes
-
-
 def init_process_grid_from_config(
     config: ShardingTransformConfig,
 ) -> Dict[ShardingDim, Dict[str, int]]:
@@ -1247,6 +1222,7 @@ def _shard_parameter_node(

     # Shard weight using the unified function (also updates the parameter)
     weight_nodes = extract_weight_nodes(node)
+
     for weight_node in weight_nodes.weights:
         _, weight_new_shape = shard_weight_tensor(
             gm=gm,
@@ -1532,9 +1508,7 @@ def _insert_sharded_mxfp4_mlp_ep(

     # Add a dist all-reduce after the op (sum partial results across EP ranks)
     with gm.graph.inserting_after(node):
-        red = gm.graph.call_function(
-            torch.ops.auto_deploy.torch_dist_all_reduce, args=(node, config.allreduce_strategy.name)
-        )
+        red = gm.graph.call_function(torch.ops.auto_deploy.torch_dist_all_reduce, args=(node,))
     node.replace_all_uses_with(red)
     # keep dataflow: red(input=node)
     red.replace_input_with(red, node)
@@ -2018,47 +1992,37 @@ def detect_sharding_from_config(
         raise ValueError(f"Unsupported sharding source: {source}")
     tp_plan = config["tp_plan"]

-    # If the node is inside the attention module, we need to set min_local_shape to the
-    # head_dim - otherwise, we would risk splitting the heads into smaller shards.
-    # TODO: is there a better way to check if we are in attention module?
-    attn_names = [
-        "attention",
-        "Attention",
-        "attn",
-        "Attn",
-        "q_proj",
-        "k_proj",
-        "v_proj",
-        "o_proj",
-    ]
-
     num_shards = 0
     num_simple_shards = 0
     num_row_col_shards = 0
     num_attention_shards = 0
     num_ssm_shards = 0
-    head_dim = -1
     linear_nodes = list(filtered_nodes(gm.graph.nodes, is_any_lin_op))

+    # use layer_subgraphs to determine the layer_type
+    # and check the validity of the sharding transform
+    layer_subgraphs, unprocessed_linear_nodes = get_all_layer_subgraphs(gm)
+
     for lin_node in linear_nodes:
         # use node's weight name to get the module name
         weight_name = extract_weight_name(lin_node)
-
-        if any(attn_name in weight_name for attn_name in attn_names):
-            # find the next attention node and infer the head_dim
-            next_attention_node, _ = bfs(
-                lin_node, is_any_attention_op, attr_next="users", include_root=False
-            )
-            if next_attention_node is None:
-                # this is the last attention node in the graph. Take the previously found head_dim
-                assert head_dim != -1, "Head dim not found for the last attention node"
-            else:
-                head_dim = shape(next_attention_node)[-1]
-            min_local_shape = head_dim
-            layer_type = LayerType.ATTENTION
+        # get the parent layer_subgraph
+        layer_subgraph = [
+            layer
+            for layer in layer_subgraphs
+            if lin_node in layer.opening_nodes or lin_node == layer.terminating_node
+        ]
+        if len(layer_subgraph) == 1:
+            layer_subgraph = layer_subgraph[0]
+            layer_type = layer_subgraph.layer_type
         else:
-            min_local_shape = 1
-            layer_type = LayerType.MLP
+            if lin_node in unprocessed_linear_nodes:
+                layer_type = LayerType.UNKNOWN
+            else:
+                ad_logger.warning(
+                    f"Failed to find the parent layer_subgraph for linear node {lin_node}. Skipping."
+                )
+                continue

         # use regex to find if module_name matches any of the keys in sharding_config
         for key in tp_plan.keys():
@@ -2072,11 +2036,6 @@ def detect_sharding_from_config(
             # we have a match. Get the config for this layer
             config = tp_plan[key]

-            if config in ["colwise", "mamba"]:
-                cur_node_index = linear_nodes.index(lin_node)
-                layer_subgraph = get_layer_after_linear_node(
-                    linear_nodes, [cur_node_index - 1], enforce_strict_linear_history=False
-                )
             if config == "colwise":
                 _process_column_sharding(
                     layer_subgraph=layer_subgraph,
@@ -2089,7 +2048,6 @@ def detect_sharding_from_config(
                     split_dim=SplitDimension.ROW,
                     config=transform_container.config,
                     dist_op="all_reduce",
-                    min_local_shape=min_local_shape,
                     layer_type=layer_type,
                 )
             ):
@@ -2116,7 +2074,6 @@ def detect_sharding_from_config(
                     split_dim=SplitDimension.COLUMN,
                     config=transform_container.config,
                     dist_op=None,
-                    min_local_shape=min_local_shape,
                     layer_type=layer_type,
                 )
             )
@@ -2127,7 +2084,6 @@ def detect_sharding_from_config(
                     split_dim=SplitDimension.ROW,
                     config=transform_container.config,
                     dist_op="all_reduce",
-                    min_local_shape=min_local_shape,
                     layer_type=layer_type,
                 )
             ):
@@ -2146,7 +2102,6 @@ def detect_sharding_from_config(
                     split_dim=SplitDimension.COLUMN,
                     config=transform_container.config,
                     dist_op="all_gather",
-                    min_local_shape=1,
                     layer_type=layer_type,
                 )
             ):
@@ -2259,7 +2214,7 @@ def detect_column_row_shard(
     attention_nodes = list(filtered_nodes(layer_subgraph, is_any_attention_op))
     min_local_shape = 1

-    if config.simple_shard_only:
+    if config.simple_shard_only or layer.layer_type == LayerType.UNKNOWN:
         ad_logger.debug(
             f"Forcing Simple Shard on nodes: {nodes_linear} with layer type: {layer.layer_type}"
         )
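
Note on the detect_sharding_from_config change above: instead of guessing the layer type from attention-related substrings in the weight name, each linear node is now matched against the layer subgraphs returned by get_all_layer_subgraphs. The snippet below is a minimal, self-contained sketch of that lookup under simplified assumptions; LayerType, LayerSubgraph, resolve_layer_type, and the string node names are hypothetical stand-ins, not the classes from tensorrt_llm.

# Sketch only: simplified stand-ins for the real LayerSubgraph/LayerType objects.
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import List, Optional, Set


class LayerType(Enum):
    MLP = auto()
    ATTENTION = auto()
    SSM = auto()
    MLA = auto()
    UNKNOWN = auto()


@dataclass
class LayerSubgraph:
    opening_nodes: List[str]
    terminating_node: Optional[str]
    layer_type: LayerType
    subgraph_nodes: List[str] = field(default_factory=list)


def resolve_layer_type(
    lin_node: str,
    layer_subgraphs: List[LayerSubgraph],
    unprocessed_linear_nodes: Set[str],
) -> Optional[LayerType]:
    # Mirror of the new loop body: find the unique parent subgraph of a linear node.
    matches = [
        layer
        for layer in layer_subgraphs
        if lin_node in layer.opening_nodes or lin_node == layer.terminating_node
    ]
    if len(matches) == 1:
        return matches[0].layer_type
    if lin_node in unprocessed_linear_nodes:
        return LayerType.UNKNOWN
    # The real code logs a warning and skips the node in this case.
    return None


# Toy usage with made-up projection names:
subgraphs = [
    LayerSubgraph(["q_proj", "k_proj", "v_proj"], "o_proj", LayerType.ATTENTION),
    LayerSubgraph(["gate_proj", "up_proj"], "down_proj", LayerType.MLP),
]
print(resolve_layer_type("k_proj", subgraphs, set()))         # LayerType.ATTENTION
print(resolve_layer_type("lm_head", subgraphs, {"lm_head"}))  # LayerType.UNKNOWN

The UNKNOWN fallback is what lets detect_column_row_shard force a simple shard for layers it cannot classify, as in the simple_shard_only hunk above.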

tensorrt_llm/_torch/auto_deploy/utils/node_utils.py

Lines changed: 65 additions & 19 deletions
@@ -143,8 +143,19 @@ def get_quantization_params_from_linear_node(linear_op: torch.fx.node.Node):
     return input_params, weight_params, output_params


-def extract_weight_name(node: Node) -> str:
+def get_all_weights_in_subgraph(
+    sources: list[Node],
+    sinks: list[Node],
+):
+    """Get all weight nodes (get_attr nodes) in the subgraph between sources and sinks."""
+    weight_nodes = subgraph(sources, sinks, include=is_weight_node)
+    return weight_nodes
+
+
+def extract_weight_name(node: Node) -> Union[str, bool]:
     weight_nodes = extract_weight_nodes(node)
+    if len(weight_nodes.weights) == 0:
+        return False
     return weight_nodes.weights[0].node_key


@@ -431,6 +442,10 @@ def is_dist_op(node: Node) -> bool:
     return is_op(node, dist_ops)


+def is_weight_node(node: Node) -> bool:
+    return node.op == "get_attr" and node.target and has_shape(node) and len(shape(node)) > 0
+
+
 def get_user_if_pattern_match(node, ops, numusers, user_idx: int = 0):
     """Get a user from a node if the node matches a given op set and num of users."""
     if node is None:
@@ -531,6 +546,9 @@ def get_all_layer_subgraphs(gm: GraphModule) -> List[List[Node]]:
     assert gm.graph.nodes, "Graph is empty"
     layer_subgraphs = []
     linear_nodes = list(filtered_nodes(gm.graph.nodes, is_any_lin_op))
+
+    # find the embedding size of this model. Extract it from the input of the first linear node.
+    embd = get_weight_shape(linear_nodes[0], dim=-1)
     unprocessed_linear_nodes = set(linear_nodes)
     assert len(linear_nodes) > 0, "Could not find any linear nodes in the graph"

@@ -542,7 +560,7 @@ def get_all_layer_subgraphs(gm: GraphModule) -> List[List[Node]]:
         # opening is the list of linear nodes
         # layer_subgraph is the list of nodes between the opening and closing linear nodes
         # closing is the last linear node in the layer
-        layer_subgraph = get_layer_after_linear_node(linear_nodes, terminating_indices)
+        layer_subgraph = get_layer_after_linear_node(linear_nodes, terminating_indices, embd=embd)
         if layer_subgraph.opening_nodes is not None and len(layer_subgraph.opening_nodes) > 0:
             unprocessed_linear_nodes -= (
                 set(layer_subgraph.opening_nodes)
@@ -808,6 +826,7 @@ def get_weight_shape(node: Node, dim: Optional[int] = None) -> Optional[Union[in
 def get_layer_after_linear_node(
     linear_nodes: List[Node],
     terminating_indices: List[int],
+    embd: Optional[int] = None,
     match_on_shapes: bool = True,
     enforce_strict_linear_history: bool = True,
 ) -> LayerSubgraph:
@@ -882,8 +901,9 @@ def filter_condition(node: Node, embd: Optional[int] = None, dim: Optional[int]
             layer_type=LayerType.UNKNOWN,
         )
     if match_on_shapes:
-        # get embedding size of the opening linear node
-        embd = get_weight_shape(linear_nodes[start_lin_index], dim=-1)
+        if embd is None:
+            # get embedding size of the opening linear node
+            embd = get_weight_shape(linear_nodes[start_lin_index], dim=-1)
         # partial init boundary_condition and filter_condition
         boundary_condition = partial(boundary_condition, embd=embd, dim=0)
         filter_condition = partial(filter_condition, embd=embd, dim=0)
@@ -892,6 +912,18 @@ def filter_condition(node: Node, embd: Optional[int] = None, dim: Optional[int]
             sources=[linear_nodes[start_lin_index]], boundary_condition=boundary_condition
         )
         lin_nodes_in_subgraph = list(filtered_nodes(forward_subgraph, filter_condition))
+        if len(lin_nodes_in_subgraph) > 1:
+            # it means that probably we went over the boundary of the layer.
+            # It may happen e.g., with MoLE (latent MoE), with the closing latent fc2 projection,
+            # when the subgraph spanned over fc2 "spills" over consecutive layers.
+            # Then, wrap this single linear node in LayerType.UNKNOWN and return.
+            terminating_indices.append(start_lin_index)
+            return LayerSubgraph(
+                opening_nodes=[linear_nodes[start_lin_index]],
+                subgraph_nodes=[],
+                terminating_node=linear_nodes[start_lin_index],
+                layer_type=LayerType.UNKNOWN,
+            )
         start_lin_index += 1
     start_lin_index -= 1
     terminating_linear_node = lin_nodes_in_subgraph[0]
@@ -924,25 +956,39 @@ def filter_condition(node: Node, embd: Optional[int] = None, dim: Optional[int]
     ssm_nodes = list(filtered_nodes(interior_nodes, is_any_ssm_op))
     attention_nodes = list(filtered_nodes(interior_nodes, is_any_attention_op))
     intermediate_lin_nodes = list(filtered_nodes(interior_nodes, is_any_lin_op))
+    intermediate_weight_nodes = list(
+        filtered_nodes(
+            interior_nodes, lambda n: is_weight_node(n) and not is_any_lin_op(list(n.users)[0])
+        )
+    )

     layer_type = LayerType.MLP
     min_local_shape = 1
     if len(ssm_nodes) > 0:
-        assert len(ssm_nodes) == 1, "SSM layer must have exactly one SSM node"
-        layer_type = LayerType.SSM
-        # determine head size
-        min_local_shape = shape(ssm_nodes[0])[-1]
-    if len(attention_nodes) > 0:
-        assert len(attention_nodes) == 1, "Attention layer must have exactly one attention node"
-        layer_type = LayerType.ATTENTION
-        # determine head size
-        min_local_shape = shape(attention_nodes[0])[-1]
-    if len(intermediate_lin_nodes) > 0:
-        assert len(intermediate_lin_nodes) == 2, (
-            "MLA layer must have exactly two intermediate linear nodes"
-        )
-        assert len(attention_nodes) == 1, "MLA layer must have exactly one attention node"
-        layer_type = LayerType.MLA
+        if len(ssm_nodes) == 1:
+            layer_type = LayerType.SSM
+            # determine head size
+            min_local_shape = shape(ssm_nodes[0])[-1]
+        else:
+            layer_type = LayerType.UNKNOWN
+    if len(attention_nodes) > 0 and layer_type != LayerType.UNKNOWN:
+        if len(attention_nodes) == 1:
+            layer_type = LayerType.ATTENTION
+            # determine head size
+            min_local_shape = shape(attention_nodes[0])[-1]
+        else:
+            layer_type = LayerType.UNKNOWN
+    if len(intermediate_lin_nodes) > 0 and layer_type != LayerType.UNKNOWN:
+        if len(intermediate_lin_nodes) == 2 and len(attention_nodes) == 1:
+            layer_type = LayerType.MLA
+        else:
+            layer_type = LayerType.UNKNOWN
+    # only SSM or MLA layers can have weight nodes in the interior nodes
+    # TODO: Minimax does have RMSNorm inside attention, we need to
+    # support it in the future.
+    if len(intermediate_weight_nodes) > 0:
+        if layer_type not in [LayerType.SSM, LayerType.MLA]:
+            layer_type = LayerType.UNKNOWN

     layer_subgraph = LayerSubgraph(
         opening_nodes=opening_linear_nodes,
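
For context on the new is_weight_node and get_all_weights_in_subgraph helpers: they classify get_attr nodes that carry a tensor with at least one dimension. Below is a rough, self-contained sketch of the same idea on a toy torch.fx graph; it is not the repository's implementation (the has_shape/shape meta helpers are replaced by direct tensor inspection, and TinyMLP/toy_is_weight_node are made-up names for illustration).

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.fx import GraphModule, Node, symbolic_trace


def _resolve_get_attr(gm: GraphModule, node: Node) -> torch.Tensor:
    # Walk the dotted target (e.g. "block.weight") down to the actual attribute.
    obj = gm
    for atom in node.target.split("."):
        obj = getattr(obj, atom)
    return obj


def toy_is_weight_node(gm: GraphModule, node: Node) -> bool:
    # Stand-in for is_weight_node: a get_attr node backed by a tensor with >= 1 dim.
    if node.op != "get_attr" or not node.target:
        return False
    t = _resolve_get_attr(gm, node)
    return isinstance(t, torch.Tensor) and t.dim() > 0


class TinyMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.w_up = nn.Parameter(torch.randn(16, 8))
        self.w_down = nn.Parameter(torch.randn(8, 16))

    def forward(self, x):
        return F.linear(torch.relu(F.linear(x, self.w_up)), self.w_down)


gm = symbolic_trace(TinyMLP())
weight_nodes = [n for n in gm.graph.nodes if toy_is_weight_node(gm, n)]
print([n.target for n in weight_nodes])  # -> ['w_up', 'w_down']

The real helper relies on the graph's shape metadata (has_shape/shape) rather than touching module attributes directly, which keeps it consistent with the rest of the node_utils API.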
