
Commit ca705a9

working SSM sharding
Signed-off-by: greg-kwasniewski1 <[email protected]>
1 parent 00c1d17 commit ca705a9

4 files changed: 123 additions and 173 deletions

tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py

Lines changed: 11 additions & 10 deletions
@@ -14,7 +14,7 @@
 from ...models.factory import ModelFactory
 from ...shim.interface import CachedSequenceInterface
 from ...utils.node_utils import (
-    extract_param_names_from_node,
+    extract_weight_nodes,
     get_quantization_params_from_linear_node,
     is_bmm_op,
     is_linear_op,
@@ -136,13 +136,12 @@ def _insert_quantized_linear(

         The state_dict is also updated to contain the sharded weights.
         """
-        param_name, _ = extract_param_names_from_node(node)
-        original_weight = gm.get_parameter(param_name[0])
-        new_param = nn.Parameter(self.quantize_weight(original_weight), requires_grad=False)
-        modname, _, attrname = param_name.rpartition(".")
+        weight_nodes = extract_weight_nodes(node)
+        lin_weight = weight_nodes.weights[0]
+        new_param = nn.Parameter(self.quantize_weight(lin_weight.tensor), requires_grad=False)
+        modname, _, attrname = lin_weight.node_key.rpartition(".")

-        submod = gm.get_submodule(modname)
-        setattr(submod, attrname, new_param)
+        setattr(lin_weight.submod, attrname, new_param)

         # check modelopt quantizers from graph
         if is_quantized_graph:
@@ -168,10 +167,12 @@ def _insert_quantized_linear(
             )
             # Note: canonicalize_graph() will remove input/weight/output quantizer

-        for scale_name, scale in self.default_scales(original_weight.shape).items():
-            submod.register_buffer(scale_name, scale)
+        for scale_name, scale in self.default_scales(lin_weight.tensor.shape).items():
+            lin_weight.submod.register_buffer(scale_name, scale)

-        gm._register_load_state_dict_pre_hook(partial(self.load_hook, weight_name=param_name))
+        gm._register_load_state_dict_pre_hook(
+            partial(self.load_hook, weight_name=lin_weight.node_key)
+        )

         with gm.graph.inserting_before(node):
             scales = {}
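Note: the refactor above assumes extract_weight_nodes(node) returns a small container whose weight/bias entries carry the parameter tensor, its fully qualified key, and the owning submodule. A minimal sketch of that assumed shape (class names are hypothetical, not the actual node_utils implementation):

# Hypothetical sketch of the accessor shape assumed by the new code above.
from dataclasses import dataclass, field
from typing import List

import torch
import torch.nn as nn


@dataclass
class WeightNodeInfo:
    tensor: torch.Tensor   # the parameter tensor behind the get_attr node
    node_key: str          # fully qualified name, e.g. "model.layers.0.q_proj.weight"
    submod: nn.Module      # module that owns the parameter


@dataclass
class WeightNodes:
    weights: List[WeightNodeInfo] = field(default_factory=list)
    biases: List[WeightNodeInfo] = field(default_factory=list)


# With such a container the quantization pass can swap the parameter in place:
#   lin_weight = extract_weight_nodes(node).weights[0]
#   attrname = lin_weight.node_key.rpartition(".")[2]
#   setattr(lin_weight.submod, attrname, nn.Parameter(q, requires_grad=False))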

tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py

Lines changed: 21 additions & 125 deletions
@@ -38,6 +38,7 @@
     LayerSubgraph,
     LayerType,
     bfs,
+    extract_weight_name,
     extract_weight_nodes,
     filtered_nodes,
     get_all_layer_subgraphs,
@@ -1272,10 +1273,6 @@ def split_fused_tensor(
         fused_dims: list = fused_weight_dims,
         d: int = dim,
     ) -> torch.Tensor:
-        # dim_d = t.shape[d]
-        # num_parts = 1
-        # part_size = dim_d // num_parts
-        # fused_dims = [part_size] * num_parts
         return torch.cat(
             [split_tensor(w) for w in torch.split(t, fused_dims, dim=d)],
             dim=d,
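For context, split_fused_tensor splits each fused segment (e.g. the stacked blocks of a fused projection) separately along the shard dimension and re-concatenates the per-rank slices, so every rank keeps a proportional piece of each segment. A self-contained sketch under that reading, with split_tensor replaced by a simple chunk-per-rank stand-in:

import torch

def shard_fused_tensor(t: torch.Tensor, fused_dims: list, dim: int, rank: int, world_size: int) -> torch.Tensor:
    # Stand-in for the pass's split_tensor: take this rank's chunk along `dim`.
    def split_tensor(w: torch.Tensor) -> torch.Tensor:
        return torch.chunk(w, world_size, dim=dim)[rank]

    # Split each fused segment separately, shard it, then re-concatenate,
    # so e.g. a fused [Q|K|V] weight keeps a slice of Q, K and V on every rank.
    return torch.cat([split_tensor(w) for w in torch.split(t, fused_dims, dim=dim)], dim=dim)

# Example: fused weight with row segments of size 4, 2, 2 sharded across 2 ranks.
w = torch.arange(8 * 3).reshape(8, 3).float()
print(shard_fused_tensor(w, fused_dims=[4, 2, 2], dim=0, rank=0, world_size=2).shape)  # torch.Size([4, 3])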
@@ -1343,23 +1340,35 @@ def _shard_parameter_node(
     # # Shard weight using the unified function (also updates the parameter)
     # original_weight = gm.get_parameter(weight_key)
     weight_nodes = extract_weight_nodes(node)
-    for weight_node, bias_node in weight_nodes:
+    for weight_node in weight_nodes.weights:
         _, weight_new_shape = shard_weight_tensor(
             gm=gm,
-            weight_tensor=weight_node.node,
+            weight_tensor=weight_node.tensor,
             param_key=weight_node.node_key,
             dim=dim,
             rank=rank,
             world_size=world_size,
             min_local_shape=min_local_shape,
             fused_weight_dims=fused_weight_dims,
         )
+        if quantization_cb is not None:
+            quantization_cb(
+                gm=gm,
+                submod=weight_node.submod,
+                node=node,
+                weight_key=weight_node.node_key,
+                weight_new_shape=weight_new_shape,
+                dim=dim,
+                rank=rank,
+                world_size=world_size,
+            )

-        if bias_node is not None and dim == 0:
+    for bias_node in weight_nodes.biases:
+        if dim == 0:
             # update bias for dim 0 --> we can handle it like the weight
             shard_weight_tensor(
                 gm=gm,
-                weight_tensor=bias_node.node,
+                weight_tensor=bias_node.tensor,
                 param_key=bias_node.node_key,
                 dim=dim,
                 rank=rank,
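For reference, the per-entry sharding that shard_weight_tensor performs is, at its core, a slice of the parameter along dim for the local rank; a minimal sketch of that idea (not the actual helper, which also re-registers the parameter and honors min_local_shape and fused_weight_dims):

import torch

def shard_weight(weight: torch.Tensor, dim: int, rank: int, world_size: int) -> torch.Tensor:
    # Column-parallel (dim=0, output rows) or row-parallel (dim=1, input cols) slice for this rank.
    local = weight.shape[dim] // world_size
    return weight.narrow(dim, rank * local, local).contiguous()

w = torch.randn(12, 6)
print(shard_weight(w, dim=0, rank=1, world_size=4).shape)  # torch.Size([3, 6])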
@@ -1381,18 +1390,6 @@ def _shard_parameter_node(
                 partial(_load_hook_remove, param_key=bias_node.node_key)
             )

-    if quantization_cb is not None:
-        quantization_cb(
-            gm=gm,
-            submod=weight_node.submod,
-            node=node,
-            weight_key=weight_node.node_key,
-            weight_new_shape=weight_new_shape,
-            dim=dim,
-            rank=rank,
-            world_size=world_size,
-        )
-
     # # # column shard with no gather: the output is sharded
     if not add_dist:
         return
@@ -1423,107 +1420,6 @@ def _update_node_args(node: Node, args: tuple) -> None:
     )


-def _insert_sharded_moe_stacked(
-    gm: GraphModule,
-    node: Node,
-    rank: int,
-    world_size: int,
-    allreduce_strategy: AllReduceStrategy,
-    scale_names: Sequence[str] = (),
-):
-    """Update the torch_moe node with sliced stacked weight tensors,
-    sharded `selected_experts` and `final_scales(router_logics)`.
-    Add an all_reduce node after the moe node.
-
-    For torch_moe with stacked tensor format (single-element lists containing 3D tensors).
-
-    NOTE: allreduce_strategy is MANDATORY and must be explicitly provided.
-    """
-    if allreduce_strategy is None:
-        raise ValueError(f"allreduce_strategy must be set for MoE sharding on node {node.name}")
-
-    # Extract the stacked tensors from single-element lists
-    # args[3] = w1_weight (Node representing list with one 3D tensor, or direct list)
-    # args[4] = w2_weight (Node representing list with one 3D tensor, or direct list)
-
-    # Helper to extract tensor node from list (handles both Node and direct list)
-    def extract_tensor_from_list_arg(list_arg):
-        if isinstance(list_arg, Node) and list_arg.target is list:
-            # It's a list() call node - extract from its args
-            return list_arg.args[0][0]  # args[0] is the list content, [0] is first element
-        elif isinstance(list_arg, (list, tuple)):
-            # Direct list
-            return list_arg[0]
-        else:
-            raise ValueError(f"Unexpected list format: {type(list_arg)}")
-
-    w3_w1_tensor_node = extract_tensor_from_list_arg(node.args[3])
-    w2_tensor_node = extract_tensor_from_list_arg(node.args[4])
-    num_experts = _get_dim0_from_arg(gm, w3_w1_tensor_node)
-
-    args = list(node.args)
-
-    # -- Handle selected_experts and final_scales sharding --
-    selected_experts = args[1]
-    final_scales = args[2]
-
-    experts_per_rank = num_experts // world_size
-
-    with gm.graph.inserting_before(node):
-        lower = experts_per_rank * rank
-        # selected_experts_local = selected_experts - low
-        selected_experts_local = gm.graph.create_node(
-            "call_function", operator.sub, args=(selected_experts, lower), kwargs={}
-        )
-
-        # For num_experts % world_size != 0 case,
-        # assign the last (num_experts % world_size) experts to the last rank
-        div_node = gm.graph.create_node(
-            "call_function", operator.floordiv, args=(selected_experts, experts_per_rank), kwargs={}
-        )
-
-        comp_op = torch.ge if rank == world_size - 1 else torch.eq
-        rank_mask = gm.graph.create_node("call_function", comp_op, args=(div_node, rank), kwargs={})
-
-        # final_scales_local = final_scales * rank_mask
-        final_scales_local = gm.graph.create_node(
-            "call_function", operator.mul, args=(final_scales, rank_mask), kwargs={}
-        )
-
-    # -- Transform expert weight parameters --
-    local_lo, local_hi = _split_range_last_remainder(num_experts, world_size, rank)
-
-    # Transform w3_w1_stacked: slice experts, swap [W1,W3]->[W3,W1], transpose (E,H,2I)->(E,2I,H)
-    if isinstance(w3_w1_tensor_node, Node):
-        _transform_bmm_moe_weight_param(
-            gm, w3_w1_tensor_node, local_lo, local_hi, swap_gate_up=True
-        )
-
-    # Transform w2_stacked: slice experts, transpose (E,I,H)->(E,H,I)
-    if isinstance(w2_tensor_node, Node):
-        _transform_bmm_moe_weight_param(gm, w2_tensor_node, local_lo, local_hi, swap_gate_up=False)
-
-    # -- Update args (keep same lists/nodes, just with transformed parameters) --
-    args[1] = selected_experts_local
-    args[2] = final_scales_local
-    # args[3] and args[4] stay the same - we modified the parameters in-place
-
-    ad_logger.debug(
-        f"Updated node {node}: replaced original arguments {node.args} with sharded arguments {args}."
-    )
-
-    node.args = tuple(args)
-
-    # -- add an all_reduce node --
-    with gm.graph.inserting_after(node):
-        dist_node = gm.graph.call_function(
-            torch.ops.auto_deploy.torch_dist_all_reduce.default,
-            args=(node, allreduce_strategy),
-        )
-        node.replace_all_uses_with(dist_node)
-        dist_node.replace_input_with(dist_node, node)
-
-
 def _insert_sharded_moe(
     gm: GraphModule,
     node: Node,
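The expert-partitioning arithmetic in the removed helper can be checked in isolation: expert ids are shifted into the local range, and a rank mask built from a floor division zeroes the routing scores of experts owned by other ranks, with torch.ge letting the last rank absorb the remainder when num_experts % world_size != 0. A standalone sketch of that arithmetic:

import torch

num_experts, world_size = 8, 3                  # remainder case: 8 % 3 != 0
experts_per_rank = num_experts // world_size    # 2

selected_experts = torch.tensor([0, 3, 5, 7])   # global expert ids chosen by the router
final_scales = torch.tensor([0.4, 0.3, 0.2, 0.1])

for rank in range(world_size):
    lower = experts_per_rank * rank
    selected_local = selected_experts - lower
    # last rank keeps every expert with id >= its first expert (absorbs the remainder)
    comp_op = torch.ge if rank == world_size - 1 else torch.eq
    rank_mask = comp_op(selected_experts // experts_per_rank, rank)
    scales_local = final_scales * rank_mask
    print(rank, selected_local.tolist(), scales_local.tolist())
# rank 0 owns experts 0-1, rank 1 owns 2-3, rank 2 owns 4-7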
@@ -2251,9 +2147,9 @@ def detect_sharding_from_config(

     for lin_node in linear_nodes:
         # use node's weight name to get the module name
-        module_name = extract_weight_nodes(lin_node)[0].target
+        weight_name = extract_weight_name(lin_node)

-        if any(attn_name in module_name for attn_name in attn_names):
+        if any(attn_name in weight_name for attn_name in attn_names):
             # find the next attention node and infer the head_dim
             next_attention_node, _ = bfs(
                 lin_node, is_any_attention_op, attr_next="users", include_root=False
@@ -2277,7 +2173,7 @@ def detect_sharding_from_config(
             # Then we escape dots, and finally we replace @ with .*
             pattern_string = pattern_string.replace("*", "@")
             pattern_regex = re.escape(pattern_string).replace("@", ".*")
-            if re.match(pattern_regex, module_name):
+            if re.match(pattern_regex, weight_name):
                 # we have a match. Get the config for this layer
                 config = tp_plan[key]

@@ -2316,7 +2212,7 @@ def detect_sharding_from_config(
             elif "local" in config:
                 # Check if this applies to shared experts in EP parallelism.
                 # If yes, apply the TP col-row shard.
-                if "shared" in module_name:
+                if "shared" in weight_name:
                     col_row_action = config.replace("local_", "")
                     if col_row_action == "colwise":
                         transform_container.add(
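The wildcard matching used in detect_sharding_from_config above can be replayed standalone: '*' is first swapped for a placeholder so that re.escape only escapes the literal dots, and the placeholder is then expanded to '.*'. A worked example (the pattern and weight name are made up for illustration):

import re

pattern_string = "model.layers.*.self_attn.q_proj.weight"   # hypothetical tp_plan key
weight_name = "model.layers.7.self_attn.q_proj.weight"      # hypothetical parameter name

# Escape the dots but keep the wildcard: '*' -> '@' -> '.*' after re.escape.
pattern_string = pattern_string.replace("*", "@")
pattern_regex = re.escape(pattern_string).replace("@", ".*")

print(bool(re.match(pattern_regex, weight_name)))  # True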
