Commit 59901ba

Cleanup sharding interface
Signed-off-by: greg-kwasniewski1 <[email protected]>
1 parent 4c6dcc1 commit 59901ba

4 files changed: +57 -48 lines changed

tensorrt_llm/_torch/auto_deploy/models/factory.py

Lines changed: 7 additions & 8 deletions
@@ -12,10 +12,8 @@
 from ..custom_ops.attention_interface import CacheConfig
 from ..utils.logger import ad_logger
 
-
 
-
-class FactorySource(Enum):
+class ShardingConfigSource(Enum):
     """Enum for factory source."""
 
     HUGGINGFACE = "huggingface"
@@ -48,6 +46,7 @@ def __init__(
         self.max_seq_len = max_seq_len
         self._prefetched_model_path: Optional[str] = None
         self._prefetched_tokenizer_path: Optional[str] = None
+        self._sharding_config: Dict[str, Any] = {}
 
     @property
     def model(self) -> Optional[str]:
@@ -106,9 +105,9 @@ def get_quant_config(self) -> Dict:
         """Returns the quantization config for this model or None if not quantized."""
         return {}
 
-    def get_sharding_config(self):
-        """Returns the sharding config for this model or None if not sharded."""
-        return {}
+    def get_sharding_config(self) -> Dict:
+        """Returns the sharding config for this model."""
+        return self._sharding_config
 
     def get_cache_config(self) -> CacheConfig:
         """Return the cache configuration for the model.
@@ -118,13 +117,13 @@ def get_cache_config(self) -> CacheConfig:
         """
         return CacheConfig()
 
-    def get_model_source(self) -> FactorySource:
+    def get_sharding_config_source(self) -> ShardingConfigSource:
         """Return the source of the model factory.
 
         Returns:
             The source identifier for this model factory.
         """
-        return FactorySource.UNKNOWN
+        return ShardingConfigSource.UNKNOWN
 
     def init_tokenizer(self) -> Optional[Any]:
         """Initialize the tokenizer for the model.

tensorrt_llm/_torch/auto_deploy/models/hf.py

Lines changed: 15 additions & 20 deletions
@@ -30,7 +30,7 @@
 from ..custom_ops.attention_interface import CacheConfig
 from ..utils._config import deep_merge_dicts
 from ..utils.logger import ad_logger
-from .factory import FactorySource, ModelFactory, ModelFactoryRegistry
+from .factory import ModelFactory, ModelFactoryRegistry, ShardingConfigSource
 
 
 @contextmanager
@@ -175,7 +175,7 @@ def _build_model(self, device: DeviceLikeType) -> nn.Module:
         model.post_init()
 
         # if present, initialize sharding config. We need head_dim for colwise sharding.
-        self._set_sharding_config(model_config)
+        self._set_sharding_config(model.config)
 
         # patch forward method
         model.forward = types.MethodType(self._simple_forward, model)
@@ -185,7 +185,6 @@ def _build_model(self, device: DeviceLikeType) -> nn.Module:
 
     def _set_sharding_config(self, model_config: PretrainedConfig):
         """Set the sharding config for the model."""
-        self._sharding_config = {}
         self._sharding_config["head_dim"] = 1
         if hasattr(model_config, "base_model_tp_plan"):
             self._sharding_config["tp_plan"] = model_config.base_model_tp_plan
@@ -194,9 +193,6 @@ def _set_sharding_config(self, model_config: PretrainedConfig):
         if hasattr(model_config, "num_hidden_layers"):
             self._sharding_config["num_hidden_layers"] = model_config.num_hidden_layers
 
-    def get_sharding_config(self):
-        return self._sharding_config or {}
-
     def get_quant_config(self) -> Dict:
         return self._quant_config or {}
 
@@ -213,13 +209,13 @@ def get_cache_config(self):
         kv_cache_dtype = None
         return CacheConfig(dtype=kv_cache_dtype)
 
-    def get_model_source(self) -> FactorySource:
+    def get_sharding_config_source(self) -> ShardingConfigSource:
         """Return the source of the model factory.
 
         Returns:
             The source identifier for this model factory.
         """
-        return FactorySource.HUGGINGFACE
+        return ShardingConfigSource.HUGGINGFACE
 
     def init_tokenizer(self) -> Optional[Any]:
         """Initialize the tokenizer—either a custom name or the model's default."""
@@ -389,18 +385,17 @@ def _get_max_position_embeddings_config(self) -> Dict[str, Any]:
         }
 
     def _set_sharding_config(self, model_config: PretrainedConfig):
-        """Set the sharding config for the model."""
-        self._sharding_config = {}
-        text_config = model_config.sub_configs["text_config"]
-        # if text_config is a class, instantiate it
-        if isinstance(text_config, type):
-            text_config = text_config()
-        if hasattr(text_config, "base_model_tp_plan"):
-            self._sharding_config["tp_plan"] = text_config.base_model_tp_plan
-        if hasattr(text_config, "head_dim"):
-            self._sharding_config["head_dim"] = text_config.head_dim
-        if hasattr(text_config, "num_hidden_layers"):
-            self._sharding_config["num_hidden_layers"] = text_config.num_hidden_layers
+        """Override the sharding config for the model with text_config."""
+        super()._set_sharding_config(model_config)
+
+        if hasattr(model_config, "text_config"):
+            text_config = model_config.text_config
+            if hasattr(text_config, "base_model_tp_plan"):
+                self._sharding_config["tp_plan"] = text_config.base_model_tp_plan
+            if hasattr(text_config, "head_dim"):
+                self._sharding_config["head_dim"] = text_config.head_dim
+            if hasattr(text_config, "num_hidden_layers"):
+                self._sharding_config["num_hidden_layers"] = text_config.num_hidden_layers
 
     @property
     def automodel_from_config(self):
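
The VLM-style factory now reuses the base logic and only patches in values from text_config when one is present, instead of rebuilding the dict from sub_configs. A rough standalone sketch of that pattern, using SimpleNamespace objects in place of transformers.PretrainedConfig and hypothetical Demo* class names (the real base class may read additional fields not visible in this diff):

# Sketch of the text_config override pattern; all names here are illustrative.
from types import SimpleNamespace
from typing import Any, Dict


class DemoHFFactory:
    def __init__(self) -> None:
        self._sharding_config: Dict[str, Any] = {}

    def _set_sharding_config(self, model_config: Any) -> None:
        # Base behavior: read fields straight off the top-level config.
        self._sharding_config["head_dim"] = 1
        if hasattr(model_config, "base_model_tp_plan"):
            self._sharding_config["tp_plan"] = model_config.base_model_tp_plan
        if hasattr(model_config, "num_hidden_layers"):
            self._sharding_config["num_hidden_layers"] = model_config.num_hidden_layers


class DemoVLMFactory(DemoHFFactory):
    def _set_sharding_config(self, model_config: Any) -> None:
        # Override: start from the base values, then prefer text_config fields.
        super()._set_sharding_config(model_config)
        if hasattr(model_config, "text_config"):
            text_config = model_config.text_config
            if hasattr(text_config, "base_model_tp_plan"):
                self._sharding_config["tp_plan"] = text_config.base_model_tp_plan
            if hasattr(text_config, "head_dim"):
                self._sharding_config["head_dim"] = text_config.head_dim
            if hasattr(text_config, "num_hidden_layers"):
                self._sharding_config["num_hidden_layers"] = text_config.num_hidden_layers


# Illustrative config: the language tower lives under text_config.
vlm_config = SimpleNamespace(
    num_hidden_layers=2,
    text_config=SimpleNamespace(
        base_model_tp_plan={"layers.*.self_attn.q_proj": "colwise"},  # example entry
        head_dim=128,
        num_hidden_layers=4,
    ),
)
factory = DemoVLMFactory()
factory._set_sharding_config(vlm_config)
print(factory._sharding_config)  # text_config values win where present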

tensorrt_llm/_torch/auto_deploy/transformations/library/sharding.py

Lines changed: 32 additions & 14 deletions
@@ -30,7 +30,7 @@
 from pydantic import BaseModel, ConfigDict, Field
 from torch.fx import GraphModule, Node
 
-from ...models.factory import FactorySource
+from ...models.factory import ModelFactory, ShardingConfigSource
 from ...utils.logger import ad_logger
 from ...utils.node_utils import (
     extract_param_names_from_lin_node,
@@ -255,7 +255,7 @@ def apply(self, gm: GraphModule, node: Node) -> None:
 class ShardingConfig(BaseModel):
     """Configuration for sharding the model."""
 
-    factory_source: FactorySource
+    factory_source: ShardingConfigSource
     rank: int
     world_size: int
     _predefined_config: Optional[Dict[str, Any]] = None
@@ -267,7 +267,7 @@ class ShardingConfig(BaseModel):
 
     def __init__(
         self,
-        factory_source: FactorySource,
+        factory_source: ShardingConfigSource,
         rank: int,
         world_size: int,
         sharding_config: Dict[str, Any] = None,
@@ -290,30 +290,30 @@ def __init__(
         self.validate_config()
 
     def validate_config(self) -> bool:
-        if self.factory_source != FactorySource.HUGGINGFACE:
+        if self.factory_source != ShardingConfigSource.HUGGINGFACE:
             ad_logger.warning(
                 "Sharding config is is currently only " + "supported for HuggingFace. Skipping."
             )
             # invalidate the config
-            self._predefined_config = None
+            self._predefined_config = {}
             return False
 
         if not isinstance(self._predefined_config, dict):
             ad_logger.warning("Sharding config is not a dictionary. Skipping.")
             # invalidate the config
-            self._predefined_config = None
+            self._predefined_config = {}
             return False
 
         if "head_dim" not in self._predefined_config:
             ad_logger.warning("Sharding config does not contain head_dim. Skipping.")
             # invalidate the config
-            self._predefined_config = None
+            self._predefined_config = {}
             return False
 
         if "tp_plan" not in self._predefined_config:
             ad_logger.warning("Sharding config does not contain tp_plan. Skipping.")
             # invalidate the config
-            self._predefined_config = None
+            self._predefined_config = {}
             return False
         tp_plan = self._predefined_config["tp_plan"]
 
@@ -333,7 +333,7 @@ def validate_config(self) -> bool:
         if not values.issubset(allowed_values):
             ad_logger.warning("Sharding config contains invalid values. Skipping.")
             # invalidate the config
-            self._predefined_config = None
+            self._predefined_config = {}
             return False
         return True
 
@@ -727,10 +727,26 @@ def _append_simple_shard(
     sharding_config.tp_transforms.extend(tp_shards)
 
 
-def detect_sharding(gm: GraphModule, sharding_config: ShardingConfig) -> None:
+def detect_sharding(
+    gm: GraphModule,
+    factory: ModelFactory,
+    local_rank: int,
+    world_size: int,
+    simple_shard_only: bool,
+    use_sharding_from_factory: bool,
+) -> ShardingConfig:
+    sharding_config = ShardingConfig(
+        factory.get_sharding_config_source(),
+        local_rank,
+        world_size,
+        factory.get_sharding_config(),
+        simple_shard_only,
+        use_sharding_from_factory,
+    )
+
     if (
         sharding_config.use_sharding_from_factory
-        and sharding_config.get_predefined_config() is not None
+        and len(sharding_config.get_predefined_config()) > 0
     ):
         ad_logger.info("Applying sharding from config")
         detect_sharding_from_factory_config(gm, sharding_config)
@@ -746,6 +762,8 @@ def detect_sharding(gm: GraphModule, sharding_config: ShardingConfig) -> None:
     # run BMM sharding across ranks
     detect_dp_bmm_shard(gm, sharding_config)
 
+    return sharding_config
+
 
 def detect_column_row_shard(
     gm: GraphModule,
@@ -771,7 +789,7 @@ def detect_column_row_shard(
 
     rank, world_size = sharding_config.rank, sharding_config.world_size
     if world_size < 2:
-        ad_logger.info("Skipping sharding for single device")
+        ad_logger.info("Skipping TP sharding for single device")
         return
 
     assert isinstance(gm, GraphModule), "Expecting GraphModule"
@@ -937,7 +955,7 @@ def detect_dp_bmm_shard(gm: GraphModule, sharding_config: ShardingConfig) -> None:
     ad_logger.debug("Before sharding graph: " + str(gm))
     rank, world_size = sharding_config.rank, sharding_config.world_size
    if world_size < 2:
-        ad_logger.info("Skipping sharding for single device")
+        ad_logger.info("Skipping DP BMM sharding for single device")
         return
 
     assert isinstance(gm, GraphModule), "Expecting GraphModule"
@@ -1008,7 +1026,7 @@ def detect_ep_shard(gm: GraphModule, sharding_config: ShardingConfig) -> None:
 
     rank, world_size = sharding_config.rank, sharding_config.world_size
     if world_size < 2:
-        ad_logger.info("Skipping sharding for single device")
+        ad_logger.info("Skipping EP sharding for single device")
         return
 
     assert isinstance(gm, GraphModule), "Expecting GraphModule"
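
Two details of this file are worth calling out. First, detect_sharding now builds the ShardingConfig itself from the factory and returns it, rather than taking a pre-built config. Second, validate_config resets an invalid predefined config to {} instead of None, so the caller can use a uniform len(...) > 0 check. A rough standalone sketch of that flow, assuming get_predefined_config() simply returns _predefined_config (its body is not part of this diff) and using simplified stand-ins for the real classes and graph passes:

# Illustrative sketch only; the real detection passes operate on a torch.fx GraphModule.
from typing import Any, Dict


class DemoShardingConfig:
    def __init__(self, source: str, predefined: Dict[str, Any], use_from_factory: bool):
        self.factory_source = source
        self.use_sharding_from_factory = use_from_factory
        self._predefined_config = predefined
        self.validate_config()

    def validate_config(self) -> bool:
        # Every failed check now resets the dict to {} (not None), so downstream
        # emptiness checks never have to special-case None.
        if self.factory_source != "huggingface":
            self._predefined_config = {}
            return False
        for key in ("head_dim", "tp_plan"):
            if key not in self._predefined_config:
                self._predefined_config = {}
                return False
        return True

    def get_predefined_config(self) -> Dict[str, Any]:
        return self._predefined_config


def demo_detect_sharding(source: str, factory_cfg: Dict[str, Any],
                         use_from_factory: bool) -> DemoShardingConfig:
    # New shape of detect_sharding: build the config from factory-provided data,
    # pick a detection path, and hand the config back to the caller.
    cfg = DemoShardingConfig(source, factory_cfg, use_from_factory)
    if cfg.use_sharding_from_factory and len(cfg.get_predefined_config()) > 0:
        print("Applying sharding from config")        # factory tp_plan path
    else:
        print("Falling back to heuristic detection")  # column/row, BMM, EP passes
    return cfg


demo_detect_sharding("huggingface", {}, True)  # empty config -> heuristic path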

tensorrt_llm/_torch/auto_deploy/transformations/transform.py

Lines changed: 3 additions & 6 deletions
@@ -15,7 +15,6 @@
 from ..utils.logger import ad_logger
 from ._graph import canonicalize_graph, lift_to_meta, move_to_device
 from .library import (
-    ShardingConfig,
     detect_sharding,
     eliminate_redundant_transposes,
     fuse_allreduce_residual_rmsnorm,
@@ -112,17 +111,15 @@ def __call__(self, cm: CachedSequenceInterface) -> nn.Module:
         # see https://github.com/NVIDIA/TensorRT-LLM/pull/3668#discussion_r2052714528
         optimize_rope(egm)
 
-        sharding_config = ShardingConfig(
-            self.factory.get_model_source(),
+        sharding_config = detect_sharding(
+            egm,
+            self.factory,
             local_rank,
             world_size,
-            self.factory.get_sharding_config(),
             self.ad_config.simple_shard_only,
             self.ad_config.use_sharding_from_factory,
         )
 
-        detect_sharding(egm, sharding_config)
-
         sharding_transform_executor(egm, sharding_config)
 
         # let's run a shape propagation pass to update the graph with correct meta values for
