
Commit 2eb644e

sharding config seems to work
Signed-off-by: greg-kwasniewski1 <[email protected]>
1 parent 9662d81 commit 2eb644e

File tree: 5 files changed (+300, -28 lines)

tensorrt_llm/_torch/auto_deploy/llm_args.py

Lines changed: 6 additions & 0 deletions

@@ -157,6 +157,12 @@ class AutoDeployConfig(DynamicYamlMixInForSettings, BaseSettings):
         "If False, auto-detect and use column+row (all_reduce) sharding when possible.",
     )
 
+    use_sharding_from_config: bool = Field(
+        default=True,
+        description="If True, use sharding from the model config (if present). "
+        "If False, run heuristics to detect sharding.",
+    )
+
     compile_backend: Literal["torch-simple", "torch-compile", "torch-cudagraph", "torch-opt"] = (
         Field(
             default="torch-compile",
tensorrt_llm/_torch/auto_deploy/models/hf.py

Lines changed: 31 additions & 0 deletions

@@ -174,12 +174,43 @@ def _build_model(self, device: DeviceLikeType) -> nn.Module:
         if hasattr(model, "post_init"):
             model.post_init()
 
+        # if present, initialize sharding config. We need head_dim for colwise sharding.
+        self._sharding_config = {}
+        self._sharding_config["head_dim"] = 1
+        if hasattr(model_config, "base_model_tp_plan"):
+            self._sharding_config["tp_plan"] = model_config.base_model_tp_plan
+        if hasattr(model_config, "head_dim"):
+            self._sharding_config["head_dim"] = model_config.head_dim
+        if hasattr(model_config, "num_hidden_layers"):
+            self._sharding_config["num_hidden_layers"] = model_config.num_hidden_layers
+        # if it is a multi-modal factory, overwrite the sharding config with the
+        # dedicated sub-configs
+        if hasattr(model_config, "sub_configs") and len(model_config.sub_configs) > 0:
+            # for image-text-to-text models, we only support sharding for the text sub-config
+            if isinstance(self, AutoModelForImageTextToTextFactory):
+                text_config = model_config.sub_configs["text_config"]
+                # if text_config is a class, instantiate it
+                if isinstance(text_config, type):
+                    text_config = text_config()
+                if hasattr(text_config, "base_model_tp_plan"):
+                    self._sharding_config["tp_plan"] = text_config.base_model_tp_plan
+                if hasattr(text_config, "head_dim"):
+                    self._sharding_config["head_dim"] = text_config.head_dim
+                if hasattr(text_config, "num_hidden_layers"):
+                    self._sharding_config["num_hidden_layers"] = text_config.num_hidden_layers
+            else:
+                # TODO: support sharding for other multi-modal models
+                pass
+
         # patch forward method
         model.forward = types.MethodType(self._simple_forward, model)
 
         model.eval()
         return model
 
+    def get_sharding_config(self):
+        return self._sharding_config or {}
+
     def get_quant_config(self) -> Dict:
         return self._quant_config or {}

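For reference, a sketch of the kind of dictionary get_sharding_config() would return for a Llama-style text config; the concrete values below (head_dim, layer count, submodule names) are illustrative assumptions, not part of this commit, and depend entirely on the model's base_model_tp_plan.

    # Illustrative only: a typical HF-style wildcard tp_plan keyed by submodule path,
    # plus the head_dim and num_hidden_layers collected in _build_model above.
    example_sharding_config = {
        "head_dim": 128,
        "num_hidden_layers": 32,
        "tp_plan": {
            "layers.*.self_attn.q_proj": "colwise",
            "layers.*.self_attn.k_proj": "colwise",
            "layers.*.self_attn.v_proj": "colwise",
            "layers.*.self_attn.o_proj": "rowwise",
            "layers.*.mlp.gate_proj": "colwise",
            "layers.*.mlp.up_proj": "colwise",
            "layers.*.mlp.down_proj": "rowwise",
        },
    }
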
tensorrt_llm/_torch/auto_deploy/transformations/library/sharding.py

Lines changed: 193 additions & 1 deletion

@@ -18,11 +18,12 @@
 
 import math
 import operator
+import re
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from enum import IntEnum
 from functools import partial
-from typing import Callable, DefaultDict, Dict, List, Literal, Optional, Set
+from typing import Any, Callable, DefaultDict, Dict, List, Literal, Optional, Set
 
 import torch
 import torch.nn as nn

@@ -32,6 +33,7 @@
 from ...utils.logger import ad_logger
 from ...utils.node_utils import (
     extract_param_names_from_lin_node,
+    filtered_nodes,
     identify_regions_between_residuals,
     is_linear_op,
     is_op,

@@ -248,10 +250,200 @@ def apply(self, gm: GraphModule, node: Node) -> None:
 class ShardingConfig(BaseModel):
     """Configuration for sharding the model."""
 
+    rank: int = 0
+    world_size: int = 1
+    predefined_config: Dict[str, Any] = None
     tp_transforms: List[TPShardingInfo] = Field(default_factory=list)
     bmm_transforms: List[BMMShardingInfo] = Field(default_factory=list)
     ep_transforms: List[EPShardingInfo] = Field(default_factory=list)
 
+    def __init__(self, rank: int, world_size: int, sharding_config: Dict[str, Any] = None):
+        super().__init__()
+        self.rank = rank
+        self.world_size = world_size
+        self.predefined_config = sharding_config
+
+    def create_sharding_from_config(
+        self, gm: GraphModule, sharding_config: Dict[str, Any] = None
+    ) -> None:
+        """Create sharding transformations from the predefined config.
+
+        TODO: currently, this applies only to TP sharding.
+
+        Args:
+            gm: Graph module to apply transformations to
+            sharding_config: Predefined sharding configuration
+        """
+        if sharding_config is not None:
+            self.predefined_config = sharding_config
+
+        # Check that the config is valid:
+        # 1. it is a Dict[str, str]
+        # 2. the keys are of the format "module.submodule.subsubmodule..."
+        # 3. the wildcard "*" is allowed in the keys
+        # 4. the allowed values are:
+        #    - "colwise"
+        #    - "rowwise"
+        #    - "sequence_parallel"
+        #    - "local_colwise"
+        #    - "local_rowwise"
+        #    - "local"
+        #    - "gather"
+        # These constraints are based on
+        # https://github.com/huggingface/transformers/blob/d8e05951b8efd4880acca9a3f291e8b65841a86d/src/transformers/models/llama4/configuration_llama4.py#L249
+
+        if not isinstance(self.predefined_config, dict):
+            ad_logger.warning("Sharding config is not a dictionary. Skipping.")
+            return
+
+        if "head_dim" not in self.predefined_config:
+            ad_logger.warning("Sharding config does not contain head_dim. Skipping.")
+            return
+        head_dim = self.predefined_config["head_dim"]
+
+        if "tp_plan" not in self.predefined_config:
+            ad_logger.warning("Sharding config does not contain tp_plan. Skipping.")
+            return
+        tp_plan = self.predefined_config["tp_plan"]
+
+        values = set(tp_plan.values())
+        allowed_values = {
+            "colwise",
+            "rowwise",
+            "sequence_parallel",
+            "local_colwise",
+            "local_rowwise",
+            "local_packed_rowwise",
+            "local",
+            "gather",
+        }
+        if not values.issubset(allowed_values):
+            ad_logger.warning("Sharding config contains invalid values. Skipping.")
+            return
+
+        for lin_node in filtered_nodes(gm.graph.nodes, is_linear_op):
+            module_name = list(lin_node.meta["nn_module_stack"].keys())[-1]
+            # use regex to find if module_name matches any of the keys in sharding_config
+            for key in tp_plan.keys():
+                pattern_string = "*" + key + "*"
+                # Convert it to a regex: escape dots, replace * with .*
+                # WARNING! A very hacky solution: first substitute * with an unlikely
+                # character (e.g. @), then escape the dots, and finally replace @ with .*
+                pattern_string = pattern_string.replace("*", "@")
+                pattern_regex = re.escape(pattern_string).replace("@", ".*")
+                if re.match(pattern_regex, module_name):
+                    # we have a match; get the config for this layer
+                    config = tp_plan[key]
+                    # TODO: @lucaslie: this is SUPER CONFUSING!
+                    # The HF config uses "column" and "row" as if Y = X @ W, so there is an
+                    # all-gather after column and an all-reduce after row.
+                    # But since we assume Y = W @ X^T, the column and row splits are swapped.
+                    if config == "colwise":
+                        # For a colwise split we need to check if we are in an attention
+                        # module. If so, set min_local_shape to head_dim - otherwise we
+                        # would risk splitting the heads into smaller shards.
+                        # TODO: is there a better way to check if we are in an attention module?
+                        attn_names = ["attention", "Attention", "attn", "Attn"]
+                        if any(attn_name in module_name for attn_name in attn_names):
+                            min_local_shape = head_dim
+                        else:
+                            min_local_shape = 1
+                        self.tp_transforms.append(
+                            TPShardingInfo(
+                                target_node=lin_node.name,
+                                split_dim=SplitDimension.ROW,
+                                rank=self.rank,
+                                world_size=self.world_size,
+                                dist_op=None,
+                                min_local_shape=min_local_shape,
+                            )
+                        )
+                    elif config == "rowwise":
+                        self.tp_transforms.append(
+                            TPShardingInfo(
+                                target_node=lin_node.name,
+                                split_dim=SplitDimension.COLUMN,
+                                rank=self.rank,
+                                world_size=self.world_size,
+                                dist_op="all_reduce",
+                                min_local_shape=1,
+                            )
+                        )
+                    elif "sequence" in config:
+                        # TODO: Sequence parallelism is not supported yet.
+                        ad_logger.warning("Sequence parallelism is not supported yet. Skipping.")
+                    elif "local" in config:
+                        # TODO: "local" refers to hybrid EP+TP parallelism. Not supported yet.
+                        ad_logger.warning("Local EP+TP sharding is not supported yet. Skipping.")
+                    elif "gather" in config:
+                        # Simple shard (row split + all_gather)
+                        self.tp_transforms.append(
+                            TPShardingInfo(
+                                target_node=lin_node.name,
+                                split_dim=SplitDimension.ROW,
+                                rank=self.rank,
+                                world_size=self.world_size,
+                                dist_op="all_gather",
+                                min_local_shape=1,
+                            )
+                        )
+                    else:
+                        ad_logger.warning("Invalid sharding config. Skipping.")
+                    # after a successful match, break the loop
+                    break
+
+    def simple_shard_first_n_layers(self, n_layers: int) -> None:
+        """Simple-shard the first n_layers layers.
+
+        1. Take the existing config self.predefined_config,
+        2. Search for entries whose keys contain the wildcard "*",
+        3. Prepend the same entries with "0, 1, ..., n_layers-1" in place of "*"
+           so they take precedence over the wildcard entry.
+        """
+        new_tp_plan = {}
+        for layer_pattern, config in self.predefined_config["tp_plan"].items():
+            if "*" in layer_pattern:
+                # Create the new dict with the first n_layers entries first
+                for i in range(n_layers):
+                    new_tp_plan[layer_pattern.replace("*", str(i))] = "gather"
+
+            # Add the default config after
+            new_tp_plan[layer_pattern] = config
+
+        self.predefined_config["tp_plan"] = new_tp_plan
+
+    def simple_shard_last_n_layers(self, n_layers: int) -> None:
+        """Simple-shard the last n_layers layers.
+
+        1. Take the existing config self.predefined_config,
+        2. Search for entries whose keys contain the wildcard "*",
+        3. Prepend the same entries with the last n_layers layer indices in place of "*"
+           so they take precedence over the wildcard entry.
+        """
+        new_tp_plan = {}
+        num_layers = self.predefined_config["num_hidden_layers"]
+        for layer_pattern, config in self.predefined_config["tp_plan"].items():
+            if "*" in layer_pattern:
+                # Create the new dict with the last n_layers entries first
+                for i in range(num_layers - n_layers, num_layers):
+                    new_tp_plan[layer_pattern.replace("*", str(i))] = "gather"
+
+            # Add the default config after
+            new_tp_plan[layer_pattern] = config
+        self.predefined_config["tp_plan"] = new_tp_plan
+
+    def simple_shard_attention_layers(self) -> None:
+        """If any key in tp_plan contains "attention", replace its value with "gather"."""
+        for layer_pattern, config in self.predefined_config["tp_plan"].items():
+            if any(
+                attn_name in layer_pattern
+                for attn_name in ["attention", "Attention", "attn", "Attn"]
+            ):
+                self.predefined_config["tp_plan"][layer_pattern] = "gather"
 
 
 def sharding_transform_executor(gm: GraphModule, sharding_config: ShardingConfig) -> None:
     """Apply transformations to the graph module.

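A standalone sketch of the wildcard matching used in create_sharding_from_config above: the tp_plan key is wrapped in "*", dots are escaped, and each "*" becomes ".*". The module names in the example are hypothetical nn_module_stack entries, not taken from a real model.

    import re


    def matches(tp_plan_key: str, module_name: str) -> bool:
        # Same trick as in create_sharding_from_config: swap "*" for a placeholder,
        # escape the dots, then turn the placeholder back into ".*".
        pattern_string = ("*" + tp_plan_key + "*").replace("*", "@")
        pattern_regex = re.escape(pattern_string).replace("@", ".*")
        return re.match(pattern_regex, module_name) is not None


    # Hypothetical module names as they might appear in nn_module_stack:
    print(matches("layers.*.mlp.down_proj", "model.layers.3.mlp.down_proj"))  # True
    print(matches("layers.*.mlp.down_proj", "model.layers.3.mlp.up_proj"))    # False
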
tensorrt_llm/_torch/auto_deploy/transformations/transform.py

Lines changed: 28 additions & 13 deletions

@@ -114,19 +114,34 @@ def __call__(self, cm: CachedSequenceInterface) -> nn.Module:
         # see https://github.com/NVIDIA/TensorRT-LLM/pull/3668#discussion_r2052714528
         optimize_rope(egm)
 
-        # TODO: Infer sharding parameters (tp_size, row/column sharding) from the model config.
-        sharding_config = ShardingConfig()
-
-        # run TP sharding across ranks
-        detect_column_row_shard(
-            egm, local_rank, world_size, sharding_config, self.ad_config.simple_shard_only
-        )
-
-        # run EP sharding across ranks
-        detect_ep_shard(egm, local_rank, world_size, sharding_config)
-
-        # run BMM sharding across ranks
-        detect_dp_bmm_shard(egm, local_rank, world_size, sharding_config)
+        sharding_config = ShardingConfig(local_rank, world_size, self.factory.get_sharding_config())
+        self.ad_config.use_sharding_from_config = False
+        if (
+            self.ad_config.use_sharding_from_config
+            and sharding_config.predefined_config is not None
+        ):
+            ad_logger.info("\n\nUsing TP sharding from config\n")
+            # sharding_config.simple_shard_attention_layers()
+            sharding_config.create_sharding_from_config(egm)
+        else:
+            ad_logger.info("\n\nRunning TP sharding detection\n")
+            # run TP sharding across ranks
+            detect_column_row_shard(
+                egm, local_rank, world_size, sharding_config, self.ad_config.simple_shard_only
+            )
+
+            # run EP sharding across ranks
+            detect_ep_shard(egm, local_rank, world_size, sharding_config)
+
+            # run BMM sharding across ranks
+            detect_dp_bmm_shard(egm, local_rank, world_size, sharding_config)
+
+        # print detected transformations
+        ad_logger.info("\n\nTP sharding:")
+        for tp_transform in sharding_config.tp_transforms:
+            ad_logger.info(
+                f"{tp_transform.target_node} {tp_transform.split_dim} {tp_transform.dist_op}"
+            )
 
         sharding_transform_executor(egm, sharding_config)

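When the config-driven branch above runs, create_sharding_from_config stops at the first tp_plan key that matches a module, so insertion order in the plan decides which rule a layer gets; the simple_shard_first_n_layers helper relies on this. A minimal, self-contained illustration with a made-up plan:

    # Mimic simple_shard_first_n_layers(n_layers=2): concrete-layer "gather" entries are
    # inserted before the wildcard entry, so they win the first-match lookup for layers 0-1.
    tp_plan = {"layers.*.mlp.down_proj": "rowwise"}

    new_tp_plan = {}
    for pattern, cfg in tp_plan.items():
        if "*" in pattern:
            for i in range(2):
                new_tp_plan[pattern.replace("*", str(i))] = "gather"
        new_tp_plan[pattern] = cfg

    print(new_tp_plan)
    # {'layers.0.mlp.down_proj': 'gather', 'layers.1.mlp.down_proj': 'gather',
    #  'layers.*.mlp.down_proj': 'rowwise'}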