
 from ..attention_backend import AttentionMetadata
 from ..attention_backend.interface import PositionalEmbeddingParams, RopeParams
-from ..distributed import AllReduceParams
 from ..model_config import ModelConfig
 from ..modules.decoder_layer import DecoderLayer
 from ..modules.embedding import Embedding
@@ -83,8 +82,6 @@ def __init__(
             model_config,
             layer_idx=layer_idx,
         )
-        self.mapping = model_config.mapping
-        self.enable_attention_dp = self.mapping.enable_attention_dp

         # Qwen3 has accuracy issues with deep_gemm (see: https://nvbugspro.nvidia.com/bug/5461712
         # and https://nvbugspro.nvidia.com/bug/5505402)
@@ -95,7 +92,6 @@ def __init__(
             intermediate_size=config.intermediate_size,
             bias=config.mlp_bias if hasattr(config, "mlp_bias") else False,
             dtype=config.torch_dtype,
-            overridden_tp_size=1 if self.enable_attention_dp else None,
             config=model_config,
             disable_deep_gemm=disable_deep_gemm,
         )
@@ -106,8 +102,6 @@ def __init__(
         self.post_attention_layernorm = RMSNorm(hidden_size=config.hidden_size,
                                                 eps=config.rms_norm_eps,
                                                 dtype=config.torch_dtype)
-        self.disable_allreduce = (self.mapping.tp_size == 1
-                                  or self.enable_attention_dp)

     def forward(
         self,
@@ -132,22 +126,13 @@ def forward(
             hidden_states=hidden_states,
             attn_metadata=attn_metadata,
             mrope_config=mrope_config,
-            all_reduce_params=AllReduceParams(
-                enable_allreduce=not self.disable_allreduce),
             **kwargs,
         )

         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(
             hidden_states, residual)
-        hidden_states = self.mlp(
-            hidden_states,
-            all_rank_num_tokens=attn_metadata.all_rank_num_tokens,
-            all_rank_max_num_tokens=attn_metadata.all_rank_max_num_tokens,
-            final_all_reduce_params=AllReduceParams(
-                enable_allreduce=not self.disable_allreduce),
-            cutlass_min_latency_mode=False,
-        )
+        hidden_states = self.mlp(hidden_states)

         if spec_metadata is not None:
             spec_metadata.maybe_capture_hidden_states(self.layer_idx,
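
Both normalization calls in the hunk above thread a `residual` tensor alongside `hidden_states`: each RMSNorm adds the incoming residual before normalizing and returns the pre-norm sum for reuse as the next residual. A minimal sketch of that pattern in plain PyTorch (illustrative only, with assumed names; not the TensorRT-LLM `RMSNorm` implementation):

```python
from typing import Optional, Tuple

import torch


def fused_add_rmsnorm(hidden_states: torch.Tensor,
                      residual: Optional[torch.Tensor],
                      weight: torch.Tensor,
                      eps: float = 1e-6) -> Tuple[torch.Tensor, torch.Tensor]:
    # Add the running residual first (skipped when residual is None,
    # i.e. on the very first norm of the layer stack).
    if residual is not None:
        hidden_states = hidden_states + residual
    # The pre-norm sum is handed back so the caller can reuse it as
    # the residual for the next sublayer.
    residual = hidden_states
    # Standard RMSNorm in fp32 for numerical stability.
    variance = hidden_states.float().pow(2).mean(-1, keepdim=True)
    hidden_states = hidden_states.float() * torch.rsqrt(variance + eps)
    return (weight * hidden_states).to(weight.dtype), residual
```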