Commit 00c1d17

wip
Signed-off-by: greg-kwasniewski1 <[email protected]>
1 parent 93f9a2b commit 00c1d17

4 files changed: +119 -96 lines changed

tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ def _insert_quantized_linear(
         The state_dict is also updated to contain the sharded weights.
         """
         param_name, _ = extract_param_names_from_node(node)
-        original_weight = gm.get_parameter(param_name)
+        original_weight = gm.get_parameter(param_name[0])
         new_param = nn.Parameter(self.quantize_weight(original_weight), requires_grad=False)
         modname, _, attrname = param_name.rpartition(".")
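
Note: the `param_name[0]` indexing follows from the new list-based return of `extract_param_names_from_node` (see the node_utils.py diff below), which now yields a list of weight keys and a list of bias keys. A minimal standalone sketch of that call-site pattern, using a plain traced module rather than the repo helpers (the toy module and the key collection are illustrative, not the actual API):

    import torch
    from torch import nn
    from torch.fx import symbolic_trace

    # Toy stand-in for a module whose linear op is being rewritten.
    gm = symbolic_trace(nn.Linear(8, 4))

    # A list-returning helper would hand back weight keys and bias keys separately;
    # here we collect them directly from the graph's get_attr nodes.
    param_keys = [n.target for n in gm.graph.nodes if n.op == "get_attr"]
    weight_keys = [k for k in param_keys if k.endswith("weight")]

    # The call site then indexes the first weight key, mirroring param_name[0].
    original_weight = gm.get_parameter(weight_keys[0])
    new_param = nn.Parameter(original_weight.detach().clone(), requires_grad=False)
    print(weight_keys[0], tuple(new_param.shape))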

tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py

Lines changed: 59 additions & 59 deletions
@@ -38,8 +38,7 @@
     LayerSubgraph,
     LayerType,
     bfs,
-    extract_param_names_from_node,
-    extract_weight_node,
+    extract_weight_nodes,
     filtered_nodes,
     get_all_layer_subgraphs,
     get_layer_after_linear_node,
@@ -48,7 +47,6 @@
     is_any_moe_op,
     is_any_ssm_op,
     is_op,
-    num_users_of_weight_node,
     shape,
     subgraph,
 )
@@ -1330,68 +1328,70 @@ def _shard_parameter_node(

     rank, world_size = config.rank, config.world_size
     allreduce_strategy = config.allreduce_strategy.name
-    num_users = num_users_of_weight_node(node)
-    if num_users > 1 or num_users == 0:
-        ad_logger.warning(
-            f"Weight node {node} has {num_users} users. This is not supported for sharding. Skipping."
-        )
-        return
-    # get weight and bias key
-    weight_key, bias_key = extract_param_names_from_node(node)
-
-    modname = weight_key.rpartition(".")[0]
-    submod = gm.get_submodule(modname)
-
-    # Shard weight using the unified function (also updates the parameter)
-    original_weight = gm.get_parameter(weight_key)
-
-    _, weight_new_shape = shard_weight_tensor(
-        gm=gm,
-        weight_tensor=original_weight,
-        param_key=weight_key,
-        dim=dim,
-        rank=rank,
-        world_size=world_size,
-        min_local_shape=min_local_shape,
-        fused_weight_dims=fused_weight_dims,
-    )
-
-    if bias_key is not None and dim == 0:
-        # update bias for dim 0 --> we can handle it like the weight
-        original_bias = gm.get_parameter(bias_key)
-        shard_weight_tensor(
+    # num_users = num_users_of_weight_node(node)
+    # if num_users > 1 or num_users == 0:
+    #     ad_logger.warning(
+    #         f"Weight node {node} has {num_users} users. This is not supported for sharding. Skipping."
+    #     )
+    #     return
+    # # get weight and bias key
+    # weight_key, bias_key = extract_param_names_from_node(node)
+
+    # modname = weight_key.rpartition(".")[0]
+    # submod = gm.get_submodule(modname)
+
+    # # Shard weight using the unified function (also updates the parameter)
+    # original_weight = gm.get_parameter(weight_key)
+    weight_nodes = extract_weight_nodes(node)
+    for weight_node, bias_node in weight_nodes:
+        _, weight_new_shape = shard_weight_tensor(
             gm=gm,
-            weight_tensor=original_bias,
-            param_key=bias_key,
+            weight_tensor=weight_node.node,
+            param_key=weight_node.node_key,
             dim=dim,
             rank=rank,
            world_size=world_size,
             min_local_shape=min_local_shape,
             fused_weight_dims=fused_weight_dims,
         )
-    elif bias_key is not None and rank != world_size - 1:
-        # update the bias for dim 1 --> in this case only the last rank gets the bias to avoid
-        # double counting it. For all other we will delete the bias.
-        args = list(node.args)
-        node_bias = args[2]
-        args[2] = None
-        node.args = tuple(args)
-        gm.graph.erase_node(node_bias)
-        bias_param_name = bias_key.rpartition(".")[-1]
-        setattr(submod, bias_param_name, None)
-        gm._register_load_state_dict_pre_hook(partial(_load_hook_remove, param_key=bias_key))
-
-    if quantization_cb is not None:
-        quantization_cb(
-            gm=gm,
-            submod=submod,
-            node=node,
-            weight_key=weight_key,
-            weight_new_shape=weight_new_shape,
-            dim=dim,
-            rank=rank,
-            world_size=world_size,
-        )
+
+        if bias_node is not None and dim == 0:
+            # update bias for dim 0 --> we can handle it like the weight
+            shard_weight_tensor(
+                gm=gm,
+                weight_tensor=bias_node.node,
+                param_key=bias_node.node_key,
+                dim=dim,
+                rank=rank,
+                world_size=world_size,
+                min_local_shape=min_local_shape,
+                fused_weight_dims=fused_weight_dims,
+            )
+        elif bias_node is not None and rank != world_size - 1:
+            # update the bias for dim 1 --> in this case only the last rank gets the bias to avoid
+            # double counting it. For all other we will delete the bias.
+            args = list(node.args)
+            node_bias = args[2]
+            args[2] = None
+            node.args = tuple(args)
+            gm.graph.erase_node(node_bias)
+            bias_param_name = bias_node.node_key.rpartition(".")[-1]
+            setattr(bias_node.submod, bias_param_name, None)
+            gm._register_load_state_dict_pre_hook(
+                partial(_load_hook_remove, param_key=bias_node.node_key)
+            )
+
+        if quantization_cb is not None:
+            quantization_cb(
+                gm=gm,
+                submod=weight_node.submod,
+                node=node,
+                weight_key=weight_node.node_key,
+                weight_new_shape=weight_new_shape,
+                dim=dim,
+                rank=rank,
+                world_size=world_size,
+            )

     # # # column shard with no gather: the output is sharded
     if not add_dist:
@@ -2251,7 +2251,7 @@ def detect_sharding_from_config(

     for lin_node in linear_nodes:
         # use node's weight name to get the module name
-        module_name = extract_weight_node(lin_node).target
+        module_name = extract_weight_nodes(lin_node)[0].target

         if any(attn_name in module_name for attn_name in attn_names):
             # find the next attention node and infer the head_dim
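
Note: the rewritten loop shards each reported weight (and its optional bias) in turn instead of assuming a single weight key per node. As a rough, self-contained illustration of the per-rank split that `shard_weight_tensor` is expected to perform (simplified to plain tensors, with no graph rewriting or distributed setup; `shard` below is a local stand-in, not the repo function):

    import torch

    def shard(t: torch.Tensor, dim: int, rank: int, world_size: int) -> torch.Tensor:
        """Return this rank's slice of t along dim (assumes an even split)."""
        return torch.chunk(t, world_size, dim=dim)[rank]

    weight = torch.randn(16, 8)
    bias = torch.randn(16)
    rank, world_size = 1, 4

    # dim 0 (column-parallel): the bias is sharded exactly like the weight
    w0, b0 = shard(weight, 0, rank, world_size), shard(bias, 0, rank, world_size)

    # dim 1 (row-parallel): only the last rank keeps the bias so it is not
    # double counted when the partial outputs are reduced
    w1 = shard(weight, 1, rank, world_size)
    b1 = bias if rank == world_size - 1 else None
    print(w0.shape, b0.shape, w1.shape, b1)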

tensorrt_llm/_torch/auto_deploy/utils/node_utils.py

Lines changed: 57 additions & 34 deletions
@@ -8,6 +8,7 @@

 import torch
 from pydantic import BaseModel, ConfigDict
+from torch import nn
 from torch._ops import OpOverload, OpOverloadPacket
 from torch.fx import GraphModule, Node

@@ -51,6 +52,13 @@ class LayerSubgraph(BaseModel):
     min_local_shape: int = 1


+class WeightNode(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    node: Node
+    node_key: str
+    submod: nn.Module
+
+
 @dataclass
 class modelopt_quant_params:
     input_node: torch.fx.node.Node = None
@@ -129,10 +137,12 @@ def get_quantization_params_from_linear_node(linear_op: torch.fx.node.Node):
     return input_params, weight_params, output_params


-def extract_weight_node(node: Node) -> int:
-    """Extracts the weight node from the given parametrized node"""
+def extract_weight_nodes(node: Node) -> Tuple[List[WeightNode], List[WeightNode]]:
+    """Extracts the list of weight node and optional bias node from the given parametrized node"""
     gm = node.graph.owning_module
-    param_names = {name for name, _ in gm.named_parameters()}
+    param_names = {name for name, _ in gm.named_parameters()}.union(
+        {name for name, _ in gm.named_buffers()}
+    )

     def find_get_attr_node(weight_node: Node) -> Node:
         """Recursively traverse inputs of allowed nodes to find a node with 'get_attr' op."""
@@ -157,55 +167,68 @@ def find_get_attr_node(weight_node: Node) -> Node:
         return None

     if is_op(node, torch.ops.aten.bmm):
-        weight_node = node.args[1]
+        # no bias for bmm
+        return [WeightNode(node=node.args[1], node_key=node.args[1].target)], []
     # for other parametrized nodes, we need to find the weight node
     else:
-        weight_nodes = [
+        all_weight_nodes = [
             n for n in node.args if isinstance(n, Node) and find_get_attr_node(n) is not None
         ]
-        # can be two weights (if bias weight is present)
-        weight_node = None
-        if weight_nodes:
-            weight_node = weight_nodes[0]
-        # for modelopt quantized graph, there will be a quantize_op
-        _, weight_params, _ = get_quantization_params_from_linear_node(node)
-        weight_node = weight_params.input_node if weight_params else weight_node
-    assert weight_node is not None, "Expected at least one weight node in the parametrized node"
-    return find_get_attr_node(weight_node)
+        # separate weight nodes and bias nodes
+        weight_nodes = [n for n in all_weight_nodes if n.target.endswith("weight")]
+        bias_nodes = [n for n in all_weight_nodes if n.target.endswith("bias")]
+        weight_nodes = [
+            WeightNode(
+                node=n, node_key=n.target, submod=gm.get_submodule(n.target.rpartition(".")[0])
+            )
+            for n in weight_nodes
+        ]
+        bias_nodes = [
+            WeightNode(
+                node=n, node_key=n.target, submod=gm.get_submodule(n.target.rpartition(".")[0])
+            )
+            for n in bias_nodes
+        ]
+        return weight_nodes, bias_nodes


 def num_users_of_weight_node(node: Node) -> int:
     """Returns the number of users of the weight node of the given parametrized node."""
-    weight_node = extract_weight_node(node)
+    weight_node = extract_weight_nodes(node)[0]
     return len(weight_node.users) if weight_node is not None else 0


-def extract_param_names_from_node(node: Node) -> Tuple[str, Optional[str]]:
+def extract_param_names_from_node(node: Node) -> Tuple[List[str], Optional[List[str]]]:
     """Extracts the name of the parameter associated with the given parametrized node.

     Args:
         node: node with weight parameters in the graph.
     """
-    weight_node = extract_weight_node(node)
+    # try:

-    assert weight_node, "Cannot identify weight parameter of linear node."
+    # except:
+    #     a = 1

-    # Map arg to named parameter
-    weight_name = weight_node.target
+    # assert weight_node, "Cannot identify weight parameter of linear node."

-    # check for bias
-    if is_op(node, torch.ops.aten.bmm):
-        bias_node = node.args[2] if len(node.args) > 2 else None
-    else:
-        weight_nodes = [n for n in node.args if isinstance(n, Node) and n.op == "get_attr"]
-        if len(weight_nodes) > 1:
-            bias_node = weight_nodes[1]
-        else:
-            bias_node = None
-    assert bias_node is None or bias_node.op == "get_attr"
-    bias_name = bias_node.target if bias_node is not None else None
+    # # Map arg to named parameter
+    # weight_name = weight_node.target
+
+    # # check for bias
+    # if is_op(node, torch.ops.aten.bmm):
+    #     bias_node = node.args[2] if len(node.args) > 2 else None
+    # else:
+    #     weight_nodes = [n for n in node.args if isinstance(n, Node) and n.op == "get_attr"]
+    #     if len(weight_nodes) > 1:
+    #         bias_node = weight_nodes[1]
+    #     else:
+    #         bias_node = None
+    # assert bias_node is None or bias_node.op == "get_attr"
+    # bias_name = bias_node.target if bias_node is not None else None

-    return weight_name, bias_name
+    # return weight_name, bias_name
+    weight_nodes, bias_nodes = extract_weight_nodes(node)
+    return [n.node_key for n in weight_nodes], [n.node_key for n in bias_nodes]


 def get_op_overload_packet(node: Union[OpOverloadPacket, OpOverload]) -> OpOverloadPacket:
@@ -751,9 +774,9 @@ def get_weight_shape(
     if not is_any_lin_op(node):
         return None
     if dim is None:
-        return shape(extract_weight_node(node))
+        return shape(extract_weight_nodes(node)[0])
     else:
-        return shape(extract_weight_node(node))[dim]
+        return shape(extract_weight_nodes(node)[0])[dim]


 def get_layer_after_linear_node(
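
Note: `WeightNode` bundles a parameter's `get_attr` node, its fully qualified key, and the owning submodule, and `extract_weight_nodes` groups a node's `get_attr` inputs into weight and bias lists by target suffix. A self-contained sketch of that grouping on a traced toy module, with a local `WeightNode` mirror rather than an import from this file (the `Tiny` module and helper below are made up for illustration):

    import torch
    import torch.nn.functional as F
    from torch import nn
    from torch.fx import Node, symbolic_trace
    from pydantic import BaseModel, ConfigDict

    class WeightNode(BaseModel):
        # local mirror of the model added in node_utils.py
        model_config = ConfigDict(arbitrary_types_allowed=True)
        node: Node
        node_key: str
        submod: nn.Module

    class Tiny(nn.Module):
        def __init__(self):
            super().__init__()
            self.weight = nn.Parameter(torch.randn(4, 8))
            self.bias = nn.Parameter(torch.randn(4))

        def forward(self, x):
            return F.linear(x, self.weight, self.bias)

    gm = symbolic_trace(Tiny())

    def as_weight_node(n: Node) -> WeightNode:
        # node_key is the parameter's qualified name; submod is its owning module
        return WeightNode(
            node=n, node_key=n.target, submod=gm.get_submodule(n.target.rpartition(".")[0])
        )

    attrs = [n for n in gm.graph.nodes if n.op == "get_attr"]
    weight_nodes = [as_weight_node(n) for n in attrs if n.target.endswith("weight")]
    bias_nodes = [as_weight_node(n) for n in attrs if n.target.endswith("bias")]
    print([w.node_key for w in weight_nodes], [b.node_key for b in bias_nodes])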

tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py

Lines changed: 2 additions & 2 deletions
@@ -117,8 +117,8 @@ def should_skip_quantization(
     else:
         if not (is_linear_op(node_or_name) or is_bmm_op(node_or_name)):
             return True
-        param_name, _ = extract_param_names_from_node(node_or_name)
-        modname, _, _ = param_name.rpartition(".")
+        param_names, _ = extract_param_names_from_node(node_or_name)
+        modname, _, _ = param_names[0].rpartition(".")

     return any(fnmatch(modname, pattern) for pattern in excluded_patterns)
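
Note: with `extract_param_names_from_node` returning lists, the exclusion check derives the module name from the first weight key. A quick self-contained check of the fnmatch logic (the key and patterns below are made-up examples, not values from the repo):

    from fnmatch import fnmatch

    excluded_patterns = ["*lm_head*", "model.layers.0.*"]
    param_names = ["model.layers.0.self_attn.q_proj.weight"]  # first weight key of the node

    modname, _, _ = param_names[0].rpartition(".")
    skip = any(fnmatch(modname, pattern) for pattern in excluded_patterns)
    print(modname, skip)  # model.layers.0.self_attn.q_proj True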
