Commit e2c9817

[#10063][feat] AutoDeploy attention dp support

Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Parent: 069ad68

4 files changed: +90, -21 lines

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py (63 additions, 19 deletions)
```diff
@@ -351,13 +351,14 @@ def wrapper(
         def _call_func():
             return func(self, scheduled_requests, resource_manager, *args, **kwargs)
 
-        # check if we use cuda graph and we can run it
-        if not (self.cuda_graph_used and scheduled_requests.can_run_cuda_graph):
-            return _call_func()
+        # check conditions for current rank
+        can_run_cuda_graph = self.cuda_graph_used and scheduled_requests.can_run_cuda_graph
+        batch_size = scheduled_requests.batch_size
 
         # generate a persistent dummy request right away to ensure we can reserve the necessary
-        # resources (kv page and slot)
-        if self.padding_dummy_request is None:
+        # resources (kv page and slot) the first time we can actually run cuda graph according to
+        # this rank
+        if can_run_cuda_graph and self.padding_dummy_request is None:
             self.padding_dummy_request = _generate_dummy_request(
                 resource_manager,
                 request_id=CUDA_GRAPH_DUMMY_REQUEST_ID,
@@ -367,20 +368,45 @@ def _call_func():
                 max_beam_width=self.max_beam_width,
             )
 
-        # check closest cuda graph batch size
-        closest_cg_bs = _round_up_to_closest(
-            self.cuda_graph_batch_sizes, scheduled_requests.batch_size
-        )
+        # check if we can pad the batch based on the availability of the dummy request
+        can_pad = self.padding_dummy_request is not None
+
+        # in attention DP mode, we check all ranks
+        if self.enable_attention_dp and self.mapping.tp_size > 1:
+            assert self.dist is not None, "Distributed object is required for attention DP mode"
+            all_rank_info = self.dist.tp_allgather([can_run_cuda_graph, can_pad, batch_size])
+        else:
+            all_rank_info = [[can_run_cuda_graph, can_pad, batch_size]]
+
+        # now let's check if we can run cuda graph and pad the batch for all ranks
+        can_run_cuda_graph_all = all(r_info[0] for r_info in all_rank_info)
+        max_batch_size = max(r_info[2] for r_info in all_rank_info)
+
+        # let's check if all ranks can pad the batch if they need to
+        can_pad_all = all(r_info[1] or (r_info[2] == max_batch_size) for r_info in all_rank_info)
+
+        # fall back if we cannot run cudagraph
+        if not (can_run_cuda_graph_all and can_pad_all):
+            return _call_func()
 
-        # check if we need to pad
-        num_padding = closest_cg_bs - scheduled_requests.batch_size
+        # check if cudagraph batch size is available
+        # NOTE: we assume uniform cudagraph batch sizes across all ranks ensuring all ranks get the
+        # same closest cudagraph batch size here based on the max batch size across all ranks
+        closest_cg_bs = _round_up_to_closest(self.cuda_graph_batch_sizes, max_batch_size)
 
-        if num_padding <= 0:
+        if closest_cg_bs is None:
             return _call_func()
 
-        # check if we have a dummy request to use
-        if self.padding_dummy_request is None:
-            ad_logger.info("No CUDA graph padding possible due to missing dummy request.")
+        # check actual amount of padding needed
+        num_padding = closest_cg_bs - batch_size
+
+        # we should only hit this point for either of these conditions
+        assert num_padding == 0 or (num_padding > 0 and self.padding_dummy_request is not None), (
+            "Padding should not be needed or available at this point"
+        )
+
+        # no padding needed on current rank
+        if num_padding == 0:
             return _call_func()
 
         # pad the scheduled requests with the dummy request
```
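In short, each rank now contributes a triple `[can_run_cuda_graph, can_pad, batch_size]` via `dist.tp_allgather`, and the CUDA graph path plus padding is only taken when every rank agrees. The decision is a pure function of the gathered info; below is a minimal, self-contained sketch of it (the function names and the `round_up_to_closest` re-implementation are illustrative stand-ins, not the code from this file):

```python
from typing import List, Optional, Tuple


def round_up_to_closest(batch_sizes: List[int], batch_size: int) -> Optional[int]:
    """Illustrative stand-in: smallest configured cudagraph batch size >= batch_size, else None."""
    candidates = [bs for bs in sorted(batch_sizes) if bs >= batch_size]
    return candidates[0] if candidates else None


def decide_cuda_graph_padding(
    all_rank_info: List[List],  # one entry per rank: [can_run_cuda_graph, can_pad, batch_size]
    cuda_graph_batch_sizes: List[int],
    local_batch_size: int,
) -> Tuple[bool, int]:
    """Return (use_cuda_graph, num_padding) mirroring the checks in the hunk above."""
    # all ranks must be able to run the cuda graph
    can_run_all = all(info[0] for info in all_rank_info)
    # every rank below the global max batch size must be able to pad (i.e. has a dummy request)
    max_bs = max(info[2] for info in all_rank_info)
    can_pad_all = all(info[1] or info[2] == max_bs for info in all_rank_info)
    if not (can_run_all and can_pad_all):
        return False, 0  # fall back to eager execution

    # uniform cudagraph batch sizes across ranks -> every rank resolves the same target size
    closest_cg_bs = round_up_to_closest(cuda_graph_batch_sizes, max_bs)
    if closest_cg_bs is None:
        return False, 0

    return True, closest_cg_bs - local_batch_size


# rank 0 has 3 requests, rank 1 has 5; both can pad, so both pad up to the cudagraph size 8
print(decide_cuda_graph_padding([[True, True, 3], [True, True, 5]], [1, 2, 4, 8], 3))  # (True, 5)
```
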
```diff
@@ -411,7 +437,12 @@ def _device(self) -> DeviceLikeType:
         return self.cache_seq_interface.device
 
     @classmethod
-    def build_from_config(cls, ad_config: LlmArgs, mapping: Optional[Mapping] = None):
+    def build_from_config(
+        cls,
+        ad_config: LlmArgs,
+        mapping: Optional[Mapping] = None,
+        dist: Optional[Distributed] = None,
+    ):
         """Build the ADEngine using the LlmArgs that gets passed through from the LLM."""
 
         max_batch_size = ad_config.max_batch_size
@@ -453,6 +484,7 @@ def build_from_config(cls, ad_config: LlmArgs, mapping: Optional[Mapping] = None
             device,
             ad_config=ad_config,
             mapping=mapping,
+            dist=dist,
             reporting_info=reporting_info,
         )
 
@@ -464,6 +496,7 @@ def __init__(
         device: DeviceLikeType,
         ad_config: Optional[LlmArgs] = None,
         mapping: Optional[Mapping] = None,
+        dist: Optional[Distributed] = None,
         reporting_info: ReportingInfo = ReportingInfo(),
     ) -> None:
         """Initialize the engine with model and sequence information."""
@@ -484,7 +517,7 @@ def __init__(
         self.iter_states = {}
 
         # NOTE (lucaslie): not a declared base member in the base class; required by PyExecutor...
-        self.enable_attention_dp = False
+        self.enable_attention_dp = mapping.enable_attention_dp if mapping else False
 
         if ad_config is not None:
             self.max_beam_width = ad_config.max_beam_width
@@ -537,6 +570,7 @@ def __init__(
 
         # Reuse _execute_logit_post_processors from PyTorchModelEngine
         self.mapping = mapping
+        self.dist = dist
         self._execute_logit_post_processors = types.MethodType(
             PyTorchModelEngine._execute_logit_post_processors, self
         )
@@ -1005,13 +1039,23 @@ def create_autodeploy_executor(ad_config: LlmArgs, tokenizer: Optional[Tokenizer
     # initialize process groups
     world_size = mpi_world_size()
     rank = mpi_rank()
-    dist_mapping = Mapping(rank=rank, world_size=world_size, tp_size=world_size)
+    enable_attention_dp = ad_config.transforms.get("detect_sharding", {}).get(
+        "enable_attention_dp", False
+    )
+    dist_mapping = Mapping(
+        rank=rank,
+        world_size=world_size,
+        tp_size=world_size,
+        enable_attention_dp=enable_attention_dp,
+    )
     dist = Distributed.get(dist_mapping)
     ad_logger.set_rank(rank)
     torch.cuda.set_device(rank)
     port = dist.broadcast(get_free_port())  # use MPI broadcast to pick a free port
     initialize_or_skip(rank, world_size, port)
 
+    ad_logger.info(f"{dist_mapping=}, {dist=}, {port=}")
+
     # Setup AutoTuner with distributed state for allreduce autotuning
     AutoTuner.get().setup_distributed_state(dist_mapping)
 
@@ -1030,7 +1074,7 @@ def create_autodeploy_executor(ad_config: LlmArgs, tokenizer: Optional[Tokenizer
     )
 
     # initialize model engine
-    engine = ADEngine.build_from_config(ad_config=ad_config, mapping=dist_mapping)
+    engine = ADEngine.build_from_config(ad_config=ad_config, mapping=dist_mapping, dist=dist)
 
     spec_config = ad_config.speculative_config
     if spec_config is not None and not (
```
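The last two hunks show where the flag enters the runtime: `create_autodeploy_executor` reads `enable_attention_dp` from the `detect_sharding` transform options, forwards it into the `Mapping`, and `ADEngine.__init__` (earlier hunk) derives `self.enable_attention_dp` from that mapping. A minimal sketch of this plumbing, using a dataclass stand-in for `Mapping` so it runs standalone (only the fields touched by this diff are modeled):

```python
from dataclasses import dataclass


@dataclass
class MappingStandIn:
    """Illustrative stand-in for tensorrt_llm's Mapping; only the fields used in this diff."""
    rank: int
    world_size: int
    tp_size: int
    enable_attention_dp: bool = False


# transforms config shaped like the one used by the new accuracy test below
transforms = {"detect_sharding": {"enable_attention_dp": True}}

# mirrors create_autodeploy_executor: read the flag with a default of False ...
enable_attention_dp = transforms.get("detect_sharding", {}).get("enable_attention_dp", False)

# ... and forward it into the mapping shared by the distributed setup and the engine
dist_mapping = MappingStandIn(rank=0, world_size=4, tp_size=4, enable_attention_dp=enable_attention_dp)

# mirrors ADEngine.__init__: the engine flag is derived from the mapping (False without one)
engine_enable_attention_dp = dist_mapping.enable_attention_dp if dist_mapping else False
print(engine_enable_attention_dp)  # True
```
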

tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py (8 additions, 2 deletions)
```diff
@@ -150,6 +150,11 @@ class ShardingTransformConfig(TransformConfig):
 
     process_grid: Dict[ShardingDim, int] = Field(default_factory=dict)
 
+    enable_attention_dp: bool = Field(
+        default=False,
+        description="When True, skip TP sharding as attention data parallelism is enabled.",
+    )
+
     def validate_config(self, sources: Union[ShardingSource, List[ShardingSource]] = None) -> bool:
         init_process_grid_from_config(self)
         if sources is None:
@@ -737,8 +742,9 @@ def _apply(
             f"Using allreduce strategy: {config.allreduce_strategy.name}, dist backend: {config.dist_backend}"
         )
 
-        if world_size < 2:
-            ad_logger.info("Skipping sharding for single device")
+        if world_size < 2 or config.enable_attention_dp:
+            reason = "single device" if world_size < 2 else "attention DP enabled"
+            ad_logger.info(f"Skipping sharding: {reason}")
             return gm, TransformInfo(
                 skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
             )
```
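With the flag set, the sharding transform now returns early with `skipped=True`, so the graph is left unsharded on every rank and the skip is logged with its reason. A small runnable sketch of just that early-return condition, with a trimmed pydantic stand-in for `ShardingTransformConfig` (only the new field is modeled):

```python
from typing import Optional

from pydantic import BaseModel, Field


class ShardingConfigSketch(BaseModel):
    """Trimmed stand-in for ShardingTransformConfig carrying only the new field."""
    enable_attention_dp: bool = Field(
        default=False,
        description="When True, skip TP sharding as attention data parallelism is enabled.",
    )


def skip_reason(world_size: int, config: ShardingConfigSketch) -> Optional[str]:
    """Mirror of the early-return check in _apply: return the skip reason, or None to shard."""
    if world_size < 2 or config.enable_attention_dp:
        return "single device" if world_size < 2 else "attention DP enabled"
    return None


print(skip_reason(4, ShardingConfigSketch(enable_attention_dp=True)))  # attention DP enabled
print(skip_reason(1, ShardingConfigSketch()))                          # single device
print(skip_reason(4, ShardingConfigSketch()))                          # None
```
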

tests/integration/defs/accuracy/test_llm_api_autodeploy.py (18 additions, 0 deletions)
```diff
@@ -79,6 +79,24 @@ def test_auto_dtype(self, world_size, enable_chunked_prefill):
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=sampling_params)
 
+    @pytest.mark.skip_less_device_memory(32000)
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.parametrize("world_size", [2, 4])
+    def test_attention_dp(self, world_size):
+        """Test attention data parallelism mode where TP sharding is disabled."""
+        kwargs = self.get_default_kwargs(enable_chunked_prefill=True)
+        # Enable attention DP - this disables TP sharding
+        kwargs["transforms"]["detect_sharding"] = {"enable_attention_dp": True}
+        sampling_params = self.get_default_sampling_params()
+        with AutoDeployLLM(model=self.MODEL_PATH,
+                           tokenizer=self.MODEL_PATH,
+                           world_size=world_size,
+                           **kwargs) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=sampling_params)
+
 
 class TestNemotronH(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-H-8B-Base-8K"
```

tests/integration/test_lists/test-db/l0_dgx_h100.yml (1 addition, 0 deletions)
```diff
@@ -321,4 +321,5 @@ l0_dgx_h100:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
+  - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_attention_dp[4]
   - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
```
