Commit 69574ad

[TRTLLM-5966][feat] Helix: extend mapping to support different CP types (NVIDIA#6816)
Signed-off-by: Matthias Jouanneaux <mjoux@nvidia.com>
1 parent 96339c6, commit 69574ad

File tree: 10 files changed, +134 -37 lines
examples/llm-api/star_attention.py
Lines changed: 2 additions & 1 deletion

@@ -7,6 +7,7 @@
 import torch

 from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm.mapping import CpType
 from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig


@@ -59,7 +60,7 @@ def generate_llm_outputs(args, data, fp8=False, fp8_kv_cache=False):
                              kv_cache_quant_algo=QuantAlgo.FP8 if fp8_kv_cache
                              else None) if fp8 else QuantConfig()
    cp_config = {
-        "cp_type": "star_attention",
+        "cp_type": CpType.STAR,
        "cp_anchor_size": args.sa_anchor_size,
        "block_size": args.sa_block_size
    }
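
For reference, a minimal sketch of the example's updated cp_config with the enum member in place of the raw string. The anchor and block sizes below are illustrative; the example itself reads them from CLI arguments (args.sa_anchor_size, args.sa_block_size).

```python
from tensorrt_llm.mapping import CpType

# Illustrative values; star_attention.py takes these from its CLI arguments.
cp_config = {
    "cp_type": CpType.STAR,
    "cp_anchor_size": 128,
    "block_size": 256,
}

# CpType is an IntEnum, so members can also be recovered by name or value if needed:
assert CpType["STAR"] is CpType.STAR
assert CpType(1) is CpType.STAR
```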

tensorrt_llm/_torch/pyexecutor/_util.py
Lines changed: 2 additions & 2 deletions

@@ -16,7 +16,7 @@
 from tensorrt_llm.lora_helper import (LoraConfig,
                                       get_default_trtllm_modules_to_hf_modules)
 from tensorrt_llm.lora_manager import load_torch_lora
-from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.mapping import CpType, Mapping

 from ..model_config import ModelConfig
 from ..speculative import get_num_extra_kv_tokens, get_spec_decoder

@@ -589,7 +589,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
        mapping,
        max_seq_len=engine.max_seq_len,
        enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler)
-    if mapping.cp_config.get('cp_type') == 'star_attention':
+    if mapping.cp_config.get('cp_type') == CpType.STAR:
        assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
        return TorchSampler(sampler_args)
    if engine.spec_config is not None and engine.spec_config.spec_dec_mode.has_spec_decoder(
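
One reason every call site switches in the same commit: an IntEnum member compares equal to its integer value but never to the old string literal, so a check left comparing against 'star_attention' would silently stop matching. A self-contained sketch using a local stand-in for the enum defined in tensorrt_llm/mapping.py:

```python
from enum import IntEnum


class CpType(IntEnum):  # local stand-in mirroring tensorrt_llm.mapping.CpType
    ULYSSES = 0
    STAR = 1
    RING = 2
    HELIX = 3


assert CpType.STAR == 1                  # IntEnum members equal their integer value...
assert CpType.STAR != "star_attention"   # ...but never the old string spelling
assert CpType.STAR.name == "STAR"

# A config still carrying the string would no longer satisfy the new checks:
legacy_config = {"cp_type": "star_attention"}
assert legacy_config.get("cp_type") != CpType.STAR
```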

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py
Lines changed: 3 additions & 2 deletions

@@ -11,6 +11,7 @@
 import torch

 from tensorrt_llm._utils import nvtx_range
+from tensorrt_llm.mapping import CpType

 from ..distributed import Distributed
 from .llm_request import (ExecutorRequest, LlmRequest,

@@ -569,9 +570,9 @@ def _merge_requests(
        cp_config = self.dist.cp_config
        if 'cp_type' in cp_config:
            cp_type = cp_config['cp_type']
-            if cp_type == 'star_attention':
+            if cp_type == CpType.STAR:
                return self._merge_star_attention_requests(new_requests)
-            elif cp_type == 'ring_attention':
+            elif cp_type == CpType.RING:
                raise NotImplementedError("ring attention not implemented yet")
            else:
                raise NotImplementedError(f'unsupport cp type {cp_type}')
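
The merge path now dispatches on the enum. A simplified, self-contained sketch of that shape; the STAR branch below just returns its input, standing in for the executor's _merge_star_attention_requests call:

```python
from tensorrt_llm.mapping import CpType


def merge_requests(cp_config: dict, new_requests: list) -> list:
    """Sketch of the post-change dispatch in _merge_requests (simplified)."""
    cp_type = cp_config.get("cp_type")
    if cp_type is None:
        return new_requests          # no context parallelism configured
    if cp_type == CpType.STAR:
        return new_requests          # stand-in for _merge_star_attention_requests(...)
    if cp_type == CpType.RING:
        raise NotImplementedError("ring attention not implemented yet")
    raise NotImplementedError(f"unsupported cp type {cp_type}")
```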

tensorrt_llm/_torch/pyexecutor/model_engine.py
Lines changed: 3 additions & 3 deletions

@@ -29,7 +29,7 @@
 from tensorrt_llm.logger import logger
 from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.lora_manager import LoraModelConfig
-from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.mapping import CpType, Mapping
 from tensorrt_llm.models.modeling_utils import QuantAlgo
 from tensorrt_llm.quantization.utils.fp4_utils import float4_e2m1x2

@@ -666,7 +666,7 @@ def release_batch(result: ScheduledRequests | None):

        # TODO: current warmup_request is not suitable for star attention
        cp_type = self.mapping.cp_config.get('cp_type', None)
-        if cp_type == 'star_attention':
+        if cp_type == CpType.STAR:
            return

        with contextlib.ExitStack() as stack:

@@ -2110,7 +2110,7 @@ def _prepare_inputs(
                        cache_indirection_buffer: Optional[torch.Tensor] = None):
        if self.mapping is not None and 'cp_type' in self.mapping.cp_config:
            cp_type = self.mapping.cp_config['cp_type']
-            if 'star_attention' == cp_type:
+            if CpType.STAR == cp_type:
                return self._prepare_star_attention_inputs(
                    scheduled_requests, kv_cache_manager, attn_metadata)
            else:

tensorrt_llm/_torch/pyexecutor/py_executor.py
Lines changed: 2 additions & 1 deletion

@@ -31,6 +31,7 @@
 from tensorrt_llm.bindings.internal.batch_manager import (LlmRequestType,
                                                           ReqIdsSet)
 from tensorrt_llm.logger import logger
+from tensorrt_llm.mapping import CpType
 from tensorrt_llm.runtime.generation import CUASSERT

 from ..distributed import Distributed

@@ -1460,7 +1461,7 @@ def _update_request_states(self, scheduled_requests: ScheduledRequests):
        cp_config = self.dist.cp_config
        if 'cp_type' in cp_config:
            cp_type = cp_config['cp_type']
-            if cp_type == 'star_attention':
+            if cp_type == CpType.STAR:
                self._update_request_states_star_attention(scheduled_requests)
            else:
                assert False, f'Unsupport cp_type {cp_type}'

tensorrt_llm/_torch/pyexecutor/resource_manager.py
Lines changed: 2 additions & 2 deletions

@@ -16,7 +16,7 @@

 from ..._utils import binding_dtype_size, binding_to_str_dtype, nvtx_range
 from ...logger import logger
-from ...mapping import Mapping
+from ...mapping import CpType, Mapping
 from .llm_request import (LlmRequest, LlmRequestState, SamplingConfig,
                           get_draft_token_length)
 from .scheduler import ScheduledRequests

@@ -402,7 +402,7 @@ def prepare_resources(self, scheduled_batch: ScheduledRequests):
        # allocate KV Cache
        for req in context_batch:
            req_beam_width = req.sampling_config.beam_width
-            if 'cp_type' in self.mapping.cp_config and 'star_attention' == self.mapping.cp_config[
+            if 'cp_type' in self.mapping.cp_config and CpType.STAR == self.mapping.cp_config[
                    'cp_type']:
                if req.ctx_iters == 0:
                    seq_len = sum(

tensorrt_llm/mapping.py
Lines changed: 78 additions & 22 deletions

@@ -12,11 +12,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from enum import IntEnum
 from typing import List

 import torch


+class CpType(IntEnum):
+    # CP type for ulysses parallelism
+    ULYSSES = 0
+    # CP type for star attention
+    STAR = 1
+    # CP type for ring attention
+    RING = 2
+    # CP type for helix parallelism
+    HELIX = 3
+
+
 class Mapping(object):
     '''
     A node with 8 GPUs, tp_size = 4, cp_size = 1, pp_size = 2

@@ -135,58 +147,70 @@ def __init__(
        if moe_cluster_size == -1:
            moe_cluster_size = 1

+        cp_type = CpType.ULYSSES if cp_config is None else cp_config.get(
+            "cp_type", CpType.ULYSSES)
+        moe_world_size = tp_size if cp_type == CpType.ULYSSES else tp_size * cp_size
+
        if moe_tp_size == -1 and moe_ep_size == -1:
-            moe_tp_size = tp_size // moe_cluster_size
+            moe_tp_size = moe_world_size // moe_cluster_size
            moe_ep_size = 1

        elif moe_tp_size == -1:
-            moe_tp_size = tp_size // (moe_ep_size * moe_cluster_size)
+            moe_tp_size = moe_world_size // (moe_ep_size * moe_cluster_size)

        elif moe_ep_size == -1:
-            moe_ep_size = tp_size // (moe_tp_size * moe_cluster_size)
+            moe_ep_size = moe_world_size // (moe_tp_size * moe_cluster_size)

        if attn_tp_size == -1 and attn_cp_size == -1:
-            # fallback to ulysses
-            attn_tp_size = tp_size * cp_size
-            attn_cp_size = 1
+            if cp_type == CpType.ULYSSES:
+                # fallback to ulysses
+                attn_tp_size = tp_size * cp_size
+                attn_cp_size = 1
+            else:
+                # fallback to helix
+                attn_tp_size = tp_size
+                attn_cp_size = cp_size

        elif attn_tp_size == -1:
-            attn_tp_size = cp_size * tp_size // attn_cp_size
+            attn_tp_size = (tp_size * cp_size) // attn_cp_size

        elif attn_cp_size == -1:
-            attn_cp_size = cp_size * tp_size // attn_tp_size
+            attn_cp_size = (tp_size * cp_size) // attn_tp_size

-        if attn_cp_size != 1:
+        if attn_cp_size != 1 and cp_type == CpType.ULYSSES:
            raise ValueError(
-                f"attn_cp_size must be 1 for now, but got {attn_tp_size}, {attn_cp_size}."
+                f"attn_cp_size must be 1 for now for ulysses, but got {attn_tp_size}, {attn_cp_size}."
            )

        if auto_parallel:
-            if tp_size != 1 or pp_size != 1 or tp_size != 1:
+            if tp_size != 1 or pp_size != 1 or cp_size != 1:
                raise ValueError(
-                    f"When auto parallel is enabled, tp_size, pp_size, cp_size must be 1, but got {tp_size}, {pp_size}, {cp_size}."
-                )
+                    "When auto parallel is enabled, tp_size, pp_size, cp_size must be 1, "
+                    f"but got {tp_size}, {pp_size}, {cp_size}.")
        else:
            if tp_size * pp_size * cp_size != world_size:
                raise ValueError(
-                    f"world_size must equal to tp_size * pp_size * cp_size, but got {world_size} != {tp_size} * {pp_size} * {cp_size}."
+                    "world_size must equal to tp_size * pp_size * cp_size, "
+                    f"but got {world_size} != {tp_size} * {pp_size} * {cp_size}."
                )

        moe_tp_ep_size = moe_tp_size * moe_ep_size
        moe_tp_cluster_ep_size = moe_tp_ep_size * moe_cluster_size
-        if moe_tp_cluster_ep_size != tp_size:
+        if moe_tp_cluster_ep_size != moe_world_size:
            raise ValueError(
-                f"tp_size must equal to moe_tp_size * moe_ep_size * moe_cluster_size, but got {tp_size} != {moe_tp_size} * {moe_ep_size} * {moe_cluster_size}"
-            )
+                "moe_tp_size * moe_ep_size * moe_cluster_size must equal to moe_world_size, "
+                f"but got {moe_tp_cluster_ep_size} != {moe_world_size}")

        attn_tp_cp_size = attn_tp_size * attn_cp_size
        if attn_tp_cp_size != tp_size * cp_size:
            raise ValueError(
-                f"tp_size * cp_size must equal to attn_tp_size * attn_cp_size, but got {tp_size} * {cp_size} != {attn_tp_size} * {attn_cp_size}"
+                "tp_size * cp_size must equal to attn_tp_size * attn_cp_size, "
+                f"but got {tp_size} * {cp_size} != {attn_tp_size} * {attn_cp_size}"
            )

-        if moe_ep_size != 1 and cp_size > 1:
-            raise NotImplementedError("CP don't support MoE tp/ep yet")
+        if moe_ep_size != 1 and cp_size > 1 and cp_type != CpType.HELIX:
+            raise NotImplementedError(
+                f"CP {cp_type} doesn't support MoE tp/ep yet")

        self.tp_size = tp_size
        self.cp_size = cp_size

@@ -275,6 +299,7 @@ def __eq__(self, other):
                and self.moe_ep_size == other.moe_ep_size
                and self.attn_tp_size == other.attn_tp_size
                and self.attn_cp_size == other.attn_cp_size
+                and self.cp_config == other.cp_config
                and self.auto_parallel == other.auto_parallel)

    def __hash__(self):

@@ -290,6 +315,8 @@ def __hash__(self):
            self.moe_ep_size,
            self.attn_tp_size,
            self.attn_cp_size,
+            # note: we do not allow updating cp_config after initialization
+            tuple(sorted(self.cp_config.items())),
            self.auto_parallel,
        ))

@@ -376,8 +403,13 @@ def local_rank(self):
    def dp_size(self):
        return self.tp_size if self.enable_attention_dp else 1

-    def has_cp(self):
-        return self.cp_size > 1
+    def has_cp_ulysses(self):
+        return self.cp_size > 1 and self.cp_config.get(
+            "cp_type") == CpType.ULYSSES
+
+    def has_cp_helix(self):
+        return self.cp_size > 1 and self.cp_config.get(
+            "cp_type") == CpType.HELIX

    def get_node_rank(self, rank: int):
        return rank // self.gpus_per_node

@@ -415,6 +447,29 @@ def next_pp_rank(self):
            p = p - self.world_size
        return p

+    def is_last_cp_rank(self):
+        return self.cp_rank == self.cp_size - 1
+
+    def is_first_cp_rank(self):
+        return self.cp_rank == 0
+
+    def has_cp(self):
+        return self.cp_size > 1
+
+    def prev_cp_rank(self):
+        p = self.rank - self.tp_size
+        if p // (self.tp_size * self.cp_size) < self.rank // (self.tp_size *
+                                                              self.cp_size):
+            return p + self.tp_size * self.cp_size
+        return p
+
+    def next_cp_rank(self):
+        p = self.rank + self.tp_size
+        if p // (self.tp_size * self.cp_size) > self.rank // (self.tp_size *
+                                                              self.cp_size):
+            return p - self.tp_size * self.cp_size
+        return p
+
    def has_moe_cluster(self):
        return self.moe_cluster_size > 1

@@ -453,5 +508,6 @@ def to_dict(self):
            'moe_ep_size': self.moe_ep_size,
            'attn_tp_size': self.attn_tp_size,
            'attn_cp_size': self.attn_cp_size,
+            'cp_config': self.cp_config,
            'auto_parallel': self.auto_parallel,
        }
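
To make the new behaviour concrete, here is a small usage sketch built only from what this diff and the tests below show. It assumes a hypothetical 8-GPU helix layout (tp_size=2, pp_size=1, cp_size=4) passed via cp_config; the attention fallback then keeps the tp/cp split, and CP neighbours sit tp_size apart with wrap-around inside each tp_size * cp_size block.

```python
from tensorrt_llm.mapping import CpType, Mapping

# Hypothetical layout: 8 ranks, tp_size=2, pp_size=1, cp_size=4, helix-style CP.
m = Mapping(world_size=8, rank=3, tp_size=2, pp_size=1, cp_size=4,
            cp_config={"cp_type": CpType.HELIX})

# Helix fallback keeps attention split across tp and cp instead of folding CP into TP.
assert (m.attn_tp_size, m.attn_cp_size) == (2, 4)
assert m.has_cp_helix() and not m.has_cp_ulysses()

# Rank 3 sits at cp_rank 1 of cp_group [1, 3, 5, 7]; neighbours are tp_size apart.
assert m.cp_group == [1, 3, 5, 7]
assert (m.prev_cp_rank(), m.next_cp_rank()) == (1, 5)
assert not m.is_first_cp_rank() and not m.is_last_cp_rank()
```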

tests/unittest/_torch/multi_gpu/test_star_attention.py
Lines changed: 2 additions & 1 deletion

@@ -8,6 +8,7 @@
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm.llmapi import KvCacheConfig
 from tensorrt_llm.llmapi.utils import get_total_gpu_memory
+from tensorrt_llm.mapping import CpType
 from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig

 MAX_SEQ_LEN = 4096 + 1024

@@ -54,7 +55,7 @@ def test_model(backend, model_name, quant, sp_size, sa_block_size,

    model_dir = str(llm_models_root() / model_name)
    cp_config = {
-        "cp_type": "star_attention",
+        "cp_type": CpType.STAR,
        "cp_anchor_size": sa_anchor_size,
        "block_size": sa_block_size
    }

tests/unittest/_torch/test_flashinfer_star_attn.py
Lines changed: 3 additions & 3 deletions

@@ -13,7 +13,7 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
-from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.mapping import CpType, Mapping


 class TestingStarAttentionMetadata(StarAttentionMetadata):

@@ -144,7 +144,7 @@ def test_flashinfer_star_attention(self, scenario: Scenario):
        tokens_per_block = 64
        max_seq_len = tokens_per_block * num_blocks
        cp_config = {
-            "cp_type": "star_attention",
+            "cp_type": CpType.STAR,
            "cp_anchor_size": scenario.anchor_size,
            "block_size": scenario.block_size
        }

@@ -579,7 +579,7 @@ def test_attention_with_cuda_graphs(
        max_seq_len = tokens_per_block * num_blocks
        num_layers = 1 if isinstance(num_kv_heads, int) else len(num_kv_heads)
        cp_config = {
-            "cp_type": "star_attention",
+            "cp_type": CpType.STAR,
            "cp_anchor_size": test_scenario.anchor_size,
            "block_size": test_scenario.block_size
        }

tests/unittest/others/test_mapping.py
Lines changed: 37 additions & 0 deletions

@@ -44,3 +44,40 @@ def test_mapping(self):
        self.assertTrue(m.is_last_pp_rank())
        self.assertEqual(m.prev_pp_rank(), 4)
        self.assertEqual(m.next_pp_rank(), 0)
+
+        m = Mapping(world_size=2, rank=0, cp_size=2)
+        self.assertEqual(len(m.tp_groups), 2)
+        self.assertEqual(len(m.pp_groups), 2)
+        self.assertEqual(len(m.cp_groups), 1)
+        self.assertEqual(m.tp_group, [0])
+        self.assertEqual(m.pp_group, [0])
+        self.assertEqual(m.cp_group, [0, 1])
+
+        m = Mapping(world_size=8, rank=3, tp_size=2, pp_size=2, cp_size=2)
+        self.assertEqual(len(m.tp_groups), 4)
+        self.assertEqual(len(m.pp_groups), 4)
+        self.assertEqual(len(m.cp_groups), 4)
+        self.assertEqual(m.tp_group, [2, 3])
+        self.assertEqual(m.pp_group, [3, 7])
+        self.assertEqual(m.cp_group, [1, 3])
+        self.assertTrue(m.is_first_pp_rank())
+        self.assertFalse(m.is_last_pp_rank())
+        self.assertFalse(m.is_first_cp_rank())
+        self.assertTrue(m.is_last_cp_rank())
+        self.assertEqual(m.prev_pp_rank(), 7)
+        self.assertEqual(m.next_pp_rank(), 7)
+        self.assertEqual(m.prev_cp_rank(), 1)
+        self.assertEqual(m.next_cp_rank(), 1)
+
+        m = Mapping(world_size=16, rank=9, tp_size=2, pp_size=2, cp_size=4)
+        self.assertEqual(m.tp_group, [8, 9])
+        self.assertEqual(m.pp_group, [1, 9])
+        self.assertEqual(m.cp_group, [9, 11, 13, 15])
+        self.assertFalse(m.is_first_pp_rank())
+        self.assertTrue(m.is_last_pp_rank())
+        self.assertTrue(m.is_first_cp_rank())
+        self.assertFalse(m.is_last_cp_rank())
+        self.assertEqual(m.prev_pp_rank(), 1)
+        self.assertEqual(m.next_pp_rank(), 1)
+        self.assertEqual(m.prev_cp_rank(), 15)
+        self.assertEqual(m.next_cp_rank(), 11)
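
The world_size=16 case can be checked by hand with the formula from prev_cp_rank/next_cp_rank: ranks are laid out tp-innermost, so CP neighbours sit tp_size apart and wrap within the tp_size * cp_size block owned by one pp rank. A short sketch of that arithmetic (not library code):

```python
tp, cp, rank = 2, 4, 9
block = tp * cp                       # ranks 8..15 all belong to pp_rank 1

prev = rank - tp                      # 7 falls into the previous block...
if prev // block < rank // block:
    prev += block                     # ...so wrap around to 15
assert prev == 15                     # matches m.prev_cp_rank() above

nxt = rank + tp                       # 11 stays inside the same block
if nxt // block > rank // block:
    nxt -= block
assert nxt == 11                      # matches m.next_cp_rank() above
```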
