Commit a9f456e

feat - support normal no quant multi-dp moe strategy
1 parent fc58ee8 commit a9f456e

File tree

7 files changed: +192 −25 lines

rtp_llm/models_py/modules/factory/attention/cuda_impl/flash_infer.py

Lines changed: 3 additions & 7 deletions

@@ -18,9 +18,7 @@
 class FlashInferPrefillImpl(FMHAPrefillImplBase):
 
     def __init__(
-        self,
-        attn_configs: AttentionConfigs,
-        attn_inputs: PyAttentionInputs
+        self, attn_configs: AttentionConfigs, attn_inputs: PyAttentionInputs
     ) -> None:
         super().__init__(
             FlashInferPrefillOp(attn_configs),
@@ -40,16 +38,14 @@ def support_cuda_graph(self) -> bool:
 class FlashInferDecodeImpl(FMHADecodeImplBase):
 
     def __init__(
-        self,
-        attn_configs: AttentionConfigs,
-        attn_inputs: PyAttentionInputs
+        self, attn_configs: AttentionConfigs, attn_inputs: PyAttentionInputs
     ) -> None:
         super().__init__(
             FlashInferDecodeOp(attn_configs),
             FusedRopeKVCacheDecodeOp(attn_configs),
             attn_inputs,
         )
-        self.seq_size_per_block = config.seq_size_per_block
+        self.seq_size_per_block = attn_configs.tokens_per_block
         self.support_ = self.support_ and (not attn_configs.use_mla)
 
     @staticmethod

rtp_llm/models_py/modules/factory/fused_moe/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -61,6 +61,7 @@
     CudaFp8PerTensorEpLowLatencyStrategy,
     CudaFp8PerTensorEpNormalStrategy,
     CudaFp8PerTensorNoDPStrategy,
+    CudaNoQuantDpNormalStrategy,
     CudaNoQuantEpLowLatencyStrategy,
 )
 
@@ -72,5 +73,6 @@
     registry.register(CudaFp8PerBlockNoDPStrategy())
     registry.register(CudaFp8PerTensorNoDPStrategy())
     registry.register(CudaNoQuantEpLowLatencyStrategy())
+    registry.register(CudaNoQuantDpNormalStrategy())
     registry.register(BatchedTritonStrategy())
     FusedMoeFactory.set_registry(registry)
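
For readers skimming the registration above: each strategy is appended to a shared registry, and the factory can later pick a registered strategy whose conditions (see check_conditions on the executors) accept the current config. A minimal, self-contained sketch of one way such a registry could work (hypothetical classes, not the actual FusedMoeFactory/registry API):

# Sketch of a strategy registry (hypothetical names, illustration only).
from typing import Any, List, Optional

class SketchStrategy:
    def supports(self, config: Any) -> bool:
        raise NotImplementedError

class SketchRegistry:
    def __init__(self) -> None:
        self._strategies: List[SketchStrategy] = []

    def register(self, strategy: SketchStrategy) -> None:
        # Keep strategies in registration order.
        self._strategies.append(strategy)

    def select(self, config: Any) -> Optional[SketchStrategy]:
        # Return the first registered strategy that accepts this config.
        for strategy in self._strategies:
            if strategy.supports(config):
                return strategy
        return None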
rtp_llm/models_py/modules/factory/fused_moe/impl/cuda/executors/f16_cpp_executor.py (new file)

Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
+# Adapt from https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/layers/moe/ep_moe/kernels.py
+# but make some modifications for RTP-LLM
+# Licensed under the Apache License, Version 2.0
+from typing import Any, Dict, Optional
+
+import torch
+
+import rtp_llm.models_py.modules.factory.fused_moe.defs.fused_moe as mm
+from rtp_llm.models_py.modules.factory.fused_moe.defs.config_adapter import (
+    MoEConfigAdapter,
+)
+from rtp_llm.models_py.modules.factory.fused_moe.defs.quant_config import (
+    FusedMoEQuantConfig,
+)
+from rtp_llm.models_py.modules.factory.fused_moe.defs.type import ExecutorType
+from rtp_llm.models_py.modules.factory.fused_moe.utils.config_resolver import (
+    MoeConfigResolver,
+)
+from rtp_llm.ops.compute_ops import FusedMoEOp
+from rtp_llm.utils.model_weight import W
+
+
+class CppMoeExecutor(mm.FusedMoeExpertExecutor):
+    @classmethod
+    def executor_type(cls):
+        return ExecutorType.FUSED_MOE
+
+    @classmethod
+    def check_conditions(cls, checker: Any, config: MoEConfigAdapter) -> None:
+        resolver = MoeConfigResolver()
+        checker.check(not resolver.has_quantization(config))
+
+    def __init__(
+        self,
+        config: MoEConfigAdapter,
+        weights: Dict[str, torch.Tensor],
+    ):
+        super().__init__(FusedMoEQuantConfig())
+        self.config = config
+        self.ep_size = config.ep_size
+        self.ep_rank = config.ep_rank
+        self.num_experts = config.expert_num
+        assert self.num_experts % self.ep_size == 0
+        self.num_experts_per_partition = self.num_experts // self.ep_size
+        self.start_expert_id = self.ep_rank * self.num_experts_per_partition
+        self.end_expert_id = self.start_expert_id + self.num_experts_per_partition - 1
+        self.top_k = config.moe_k
+        self.intermediate_size = config.model_config.moe_inter_size
+        self.activation = config.activation_type
+        self.renormalize = True
+        self.use_fp8_w8a8 = True
+        self.use_block_quant = True
+        # Weight initialization
+        self.w13_weight = weights[W.moe_w1]
+        self.w2_weight = weights[W.moe_w2]
+        self.moe_op = FusedMoEOp(config.model_config, config.parallelism_config)
+
+    @property
+    def topk_ids_dtype(self) -> torch.dtype:
+        return torch.int32
+
+    @property
+    def local_num_experts(self) -> int:
+        return self.num_experts_per_partition
+
+    def execute(
+        self,
+        payload: mm.ExpertForwardPayload,
+        activation: str,
+        expert_map: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        apply_router_weight_on_input: bool,
+        extra_expert_args: Optional[dict[str, Any]],
+    ) -> torch.Tensor:
+        output = torch.zeros_like(payload.expert_x)
+        assert payload.expert_topk_weights is not None, "expert_topk_weights is None"
+        assert payload.expert_topk_ids is not None, "expert_topk_ids is None"
+        payload.expert_topk_ids = payload.expert_topk_ids.to(torch.int32)
+        self.moe_op.forward(
+            payload.expert_x,
+            self.w13_weight,
+            self.w2_weight,
+            payload.expert_topk_weights,
+            payload.expert_topk_ids,
+            output,
+        )
+        return output

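The expert-partition bookkeeping in __init__ above is plain integer arithmetic: the global expert table is split evenly across EP ranks and each rank owns one contiguous slice. A standalone restatement of that arithmetic (hypothetical helper, illustration only; it mirrors the fields set in CppMoeExecutor.__init__):

# Sketch of the EP partition arithmetic used in CppMoeExecutor.__init__ above.
def local_expert_range(num_experts: int, ep_size: int, ep_rank: int) -> tuple[int, int]:
    assert num_experts % ep_size == 0, "experts must divide evenly across EP ranks"
    per_rank = num_experts // ep_size   # num_experts_per_partition
    start_id = ep_rank * per_rank       # start_expert_id
    end_id = start_id + per_rank - 1    # end_expert_id (inclusive)
    return start_id, end_id

# Example: 64 experts over ep_size=8 -> rank 3 owns global experts 24..31.
assert local_expert_range(64, 8, 3) == (24, 31)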
rtp_llm/models_py/modules/factory/fused_moe/impl/cuda/routers/deepep_normal_router.py

Lines changed: 14 additions & 2 deletions

@@ -9,6 +9,9 @@
     scaled_fp8_per_token_quant,
     sgl_per_token_group_quant_fp8,
 )
+from rtp_llm.models_py.modules.factory.fused_moe.defs.config_adapter import (
+    MoEConfigAdapter,
+)
 from rtp_llm.models_py.modules.factory.fused_moe.defs.fused_moe import (
     ExpertForwardPayload,
     ExpertTokensMetadata,
@@ -19,7 +22,7 @@
 )
 from rtp_llm.models_py.modules.factory.fused_moe.defs.type import RouterType
 from rtp_llm.ops.compute_ops import trt_fp8_quantize_128
-from rtp_llm.models_py.modules.factory.fused_moe.defs.config_adapter import MoEConfigAdapter
+
 
 class DeepepNormalRouter(FusedMoeDataRouter):
     @classmethod
@@ -157,12 +160,21 @@ def prepare(
             num_recv_tokens_per_expert_list, device=expert_x.device, dtype=torch.int32
         )
 
+        if recv_topk_idx.numel() != 0 and (not self.use_fp8):
+            expert_topk_ids = torch.where(
+                recv_topk_idx == -1,
+                self.expert_num - 1 if self.rank_expert_offset == 0 else 0,
+                recv_topk_idx + self.rank_expert_offset,
+            )
+        else:
+            expert_topk_ids = recv_topk_idx
+
         return ExpertForwardPayload(
            expert_x,
            act_dtype,
            expert_x_scale,
            ExpertTokensMetadata(expert_num_tokens, num_recv_tokens_per_expert_list),
-            recv_topk_idx,
+            expert_topk_ids,
            recv_topk_weights,
        )

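The branch added in prepare() shifts the locally received top-k indices by this rank's expert offset to get global expert ids; the -1 entries that pad unrouted slots are redirected to an expert id outside this rank's own partition (the last expert when the rank hosts partition 0, expert 0 otherwise) so they stay valid indices. A toy reproduction with made-up offsets and indices (illustration only):

# Toy reproduction of the expert_topk_ids remapping added above (assumed values).
import torch

rank_expert_offset = 16   # assumed: this rank hosts global experts 16..23
expert_num = 64           # assumed total expert count
recv_topk_idx = torch.tensor([[0, 3, -1], [7, -1, 2]])  # -1 marks padded slots

expert_topk_ids = torch.where(
    recv_topk_idx == -1,
    # Padded slots point at an expert this rank does not own.
    expert_num - 1 if rank_expert_offset == 0 else 0,
    recv_topk_idx + rank_expert_offset,
)
print(expert_topk_ids)  # tensor([[16, 19,  0], [23,  0, 18]])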
rtp_llm/models_py/modules/factory/fused_moe/impl/cuda/strategy/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -10,11 +10,12 @@
     CudaFp8PerTensorEpNormalStrategy,
     CudaFp8PerTensorNoDPStrategy,
 )
-from .no_quant import CudaNoQuantEpLowLatencyStrategy
+from .no_quant import CudaNoQuantDpNormalStrategy, CudaNoQuantEpLowLatencyStrategy
 
 __all__ = [
     # No quantization
     "CudaNoQuantEpLowLatencyStrategy",
+    "CudaNoQuantDpNormalStrategy",
     # FP8 PerBlock
     "CudaFp8PerBlockNoDPStrategy",
     "CudaFp8PerBlockEpLowLatencyStrategy",

rtp_llm/models_py/modules/factory/fused_moe/impl/cuda/strategy/no_quant.py

Lines changed: 39 additions & 1 deletion

@@ -4,7 +4,9 @@
 
 import torch
 
-from rtp_llm.models_py.modules.factory.fused_moe.defs.config_adapter import MoEConfigAdapter
+from rtp_llm.models_py.modules.factory.fused_moe.defs.config_adapter import (
+    MoEConfigAdapter,
+)
 from rtp_llm.models_py.modules.factory.fused_moe.defs.priority_attributes import (
     StrategyAttributes,
 )
@@ -67,3 +69,39 @@ def get_attributes(self) -> StrategyAttributes:
             router_class=DeepEpLowLatencyRouter,
             executor_class=DeepGemmMaskedExecutor,
         )
+
+
+class CudaNoQuantDpNormalStrategy(MoeStrategy):
+    """CUDA CPP mode without quantization strategy and dp normal mode"""
+
+    def create_router(self, config: MoEConfigAdapter) -> Any:
+        from rtp_llm.models_py.modules.factory.fused_moe.impl.cuda.routers.deepep_normal_router import (
+            DeepepNormalRouter,
+        )
+
+        return DeepepNormalRouter(config, use_fp8=False)
+
+    def create_executor(
+        self, config: MoEConfigAdapter, weights: Dict[str, torch.Tensor]
+    ) -> Any:
+        from rtp_llm.models_py.modules.factory.fused_moe.impl.cuda.executors.f16_cpp_executor import (
+            CppMoeExecutor,
+        )
+
+        return CppMoeExecutor(
+            config,
+            weights,
+        )
+
+    def get_attributes(self) -> StrategyAttributes:
+        from rtp_llm.models_py.modules.factory.fused_moe.impl.cuda.executors.f16_cpp_executor import (
+            CppMoeExecutor,
+        )
+        from rtp_llm.models_py.modules.factory.fused_moe.impl.cuda.routers.deepep_normal_router import (
+            DeepepNormalRouter,
+        )
+
+        return StrategyAttributes(
+            router_class=DeepepNormalRouter,
+            executor_class=CppMoeExecutor,
+        )

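The new strategy only wires existing pieces together: DeepepNormalRouter with use_fp8=False plus the new CppMoeExecutor, with the imports deferred into the methods (presumably to keep package-import time light or avoid import cycles). A hypothetical driver showing how a caller might consume the three methods above (illustration only; the real FusedMoeFactory plumbing is outside this diff):

# Hypothetical driver, illustration only.
def build_no_quant_dp_normal(strategy, config, weights):
    router = strategy.create_router(config)               # DeepepNormalRouter(config, use_fp8=False)
    executor = strategy.create_executor(config, weights)  # CppMoeExecutor(config, weights)
    attrs = strategy.get_attributes()                     # advertises the router/executor classes
    assert isinstance(router, attrs.router_class)
    assert isinstance(executor, attrs.executor_class)
    return router, executor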
rtp_llm/models_py/modules/hybrid/test/mla_reuse_cache_test.py

Lines changed: 45 additions & 14 deletions

@@ -17,12 +17,12 @@
 from rtp_llm.models.rotary_embedding.deepseek_rotary_embedding import (
     DeepseekV3YarnRotaryEmbedding,
 )
-from rtp_llm.ops import ParallelismConfig
 from rtp_llm.models_py.modules import LinearFactory
 from rtp_llm.models_py.modules.factory.attention.cuda_mla_impl.flashinfer_mla_wrapper import (
     MlaFlashInferPrefillImpl,
 )
 from rtp_llm.models_py.modules.hybrid.test.mla_attention_ref import attention_ref
+from rtp_llm.ops import ParallelismConfig
 from rtp_llm.ops.compute_ops import KVCache, PyAttentionInputs
 from rtp_llm.utils.model_weight import W
 
@@ -115,8 +115,11 @@ def _run_mla_test(
         self.config.attn_config.softmax_extra_scale = 1.0
         self.config.attn_config.use_mla = True
         self.config.attn_config.size_per_head = 192
-        self.scaling = (self.config.attn_config.nope_head_dim + self.config.attn_config.rope_head_dim) ** (-0.5)
-
+        self.scaling = (
+            self.config.attn_config.nope_head_dim
+            + self.config.attn_config.rope_head_dim
+        ) ** (-0.5)
+
         self.parallelism_config = ParallelismConfig()
         self.parallelism_config.tp_size = 1
         self.parallelism_config.tp_rank = 0
@@ -146,15 +149,20 @@ def _run_mla_test(
         cos_sin_cache = create_cos_sin_cache()
 
         fmha_impl = MlaFlashInferPrefillImpl(
-            self.config.attn_config, attn_inputs, layer_weights, cos_sin_cache, quant_config=self.config.quant_config
+            self.config.attn_config,
+            attn_inputs,
+            layer_weights,
+            cos_sin_cache,
+            quant_config=self.config.quant_config,
         )
         fmha_impl.prepare(attn_inputs)
 
         q = torch.randn(
             [
                 num_tokens,
                 self.config.attn_config.head_num,
-                self.config.attn_config.nope_head_dim + self.config.attn_config.rope_head_dim,
+                self.config.attn_config.nope_head_dim
+                + self.config.attn_config.rope_head_dim,
             ],
             dtype=torch.bfloat16,
             device=device,
@@ -176,7 +184,8 @@ def _run_mla_test(
             [
                 mock_page_num,
                 page_size,
-                self.config.attn_config.kv_lora_rank + self.config.attn_config.rope_head_dim,
+                self.config.attn_config.kv_lora_rank
+                + self.config.attn_config.rope_head_dim,
             ],
             dtype=torch.bfloat16,
             device=device,
@@ -187,7 +196,10 @@ def _run_mla_test(
 
         k_cache, v_cache = torch.split(
             kv_cache.k_cache_base,
-            [self.config.attn_config.kv_lora_rank, self.config.attn_config.rope_head_dim],
+            [
+                self.config.attn_config.kv_lora_rank,
+                self.config.attn_config.rope_head_dim,
+            ],
             dim=-1,
         )
         page.append_paged_mla_kv_cache(
@@ -197,7 +209,7 @@ def _run_mla_test(
             fmha_impl.rope_params.positions_d,
             k_cache,
             v_cache,
-            fmha_impl.rope_kvcache_impl.cuda_graph_kv_indices,
+            fmha_impl.rope_params.page_indice_d,
             fmha_impl.rope_params.decode_page_indptr_d,
             fmha_impl.rope_params.paged_kv_last_page_len_d,
         )
@@ -228,15 +240,18 @@ def _run_mla_test(
         k_nope = self.k_nope_proj(compressed_kv)
         value_states = self.v_proj(compressed_kv)
 
-        k_nope = k_nope.view(-1, self.config.attn_config.head_num, self.config.attn_config.nope_head_dim)
+        k_nope = k_nope.view(
+            -1, self.config.attn_config.head_num, self.config.attn_config.nope_head_dim
+        )
         value_states = value_states.view(
             -1, self.config.attn_config.head_num, self.config.attn_config.v_head_dim
         )
 
         k = k_pe.new_empty(
             k_pe.size(0),
             self.config.attn_config.head_num,
-            self.config.attn_config.rope_head_dim + self.config.attn_config.nope_head_dim,
+            self.config.attn_config.rope_head_dim
+            + self.config.attn_config.nope_head_dim,
         )
         k[..., : self.config.attn_config.nope_head_dim] = k_nope
         k[..., self.config.attn_config.nope_head_dim :] = k_pe
@@ -285,13 +300,21 @@ def _create_weights(self, config, hidden_size):
         )
 
         weights[W.mla_kc] = torch.randn(
-            [config.attn_config.head_num, config.attn_config.nope_head_dim, config.attn_config.kv_lora_rank],
+            [
+                config.attn_config.head_num,
+                config.attn_config.nope_head_dim,
+                config.attn_config.kv_lora_rank,
+            ],
             dtype=torch.bfloat16,
             device=device,
         )
 
         weights[W.mla_vc] = torch.randn(
-            [config.attn_config.head_num, config.attn_config.kv_lora_rank, config.attn_config.v_head_dim],
+            [
+                config.attn_config.head_num,
+                config.attn_config.kv_lora_rank,
+                config.attn_config.v_head_dim,
+            ],
             dtype=torch.bfloat16,
             device=device,
         )
@@ -310,13 +333,21 @@ def _create_weights(self, config, hidden_size):
 
         weights[W.mla_kc] = (
             weights[W.mla_k_nope_w]
-            .view(config.attn_config.kv_lora_rank, config.attn_config.head_num, config.attn_config.nope_head_dim)
+            .view(
+                config.attn_config.kv_lora_rank,
+                config.attn_config.head_num,
+                config.attn_config.nope_head_dim,
+            )
             .transpose(0, 1)
             .transpose(1, 2)
         )
         weights[W.mla_vc] = (
             weights[W.mla_v_w]
-            .view(config.attn_config.kv_lora_rank, config.attn_config.head_num, config.attn_config.v_head_dim)
+            .view(
+                config.attn_config.kv_lora_rank,
+                config.attn_config.head_num,
+                config.attn_config.v_head_dim,
+            )
             .transpose(0, 1)
         )
