
Commit 03db555

[DeepSeek R1] add chunk moe args on deepseek_r1 (#1834)

Add chunked-MoE arguments for FP8 DeepSeek R1 inference; works together with intel/neural-compressor#2270. Cc @czhu15 @Wei-Lin-Intel @yiliu30 @hlin99.

1 parent 17ef2d1 · commit 03db555
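
In short: when `VLLM_SUPPORT_MOE_CHUNK` is enabled, each fused-MoE call receives a token-count-dependent `chunk_size` (plus a `total_experts` hint) chosen from two parallel comma-separated lists. A minimal configuration sketch using the defaults from the diff (the values are tunables, not requirements):

```python
# Hypothetical usage sketch: the feature is driven purely by environment
# variables, so they must be set before the MoE layers are constructed.
import os

os.environ["VLLM_SUPPORT_MOE_CHUNK"] = "true"  # opt in to chunked MoE
# Parallel lists: a batch of <= token_boundary_list[i] tokens uses
# chunk_size_list[i]; larger batches fall back to the last chunk size.
os.environ["PT_HPU_MOE_CHUNK"] = "64,128,512,1024,1536,2048,4096"
os.environ["PT_HPU_MOE_TOKEN_BOUNDARY"] = "64,64,1536,1536,2048,2048,4096"
```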

File tree: 3 files changed, +91 −1 lines

vllm/model_executor/layers/fused_moe/layer.py (1 addition, 0 deletions)

```diff
@@ -495,6 +495,7 @@ def __init__(
         experts_min, experts_max = 0, self.local_num_experts
         moe_op = VllmMixtureOfExpertsOpFP8(
             num_expert_per_group,
+            self.global_num_experts,
             experts_min + ep_shift,
             experts_max - 1 + ep_shift,
         )
```
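
Because `global_num_experts` was added as the second positional parameter of `VllmMixtureOfExpertsOpFP8.__init__` (see the `vllm_ext_patch.py` change below), this call site must pass it in the same position, otherwise the value would silently shift into `experts_min`/`experts_max`. The same call, annotated (comments are editorial):

```python
# How the updated call site lines up with the new signature:
moe_op = VllmMixtureOfExpertsOpFP8(
    num_expert_per_group,        # num_experts (experts owned by this rank/group)
    self.global_num_experts,     # NEW: global_num_experts (total across all ranks)
    experts_min + ep_shift,      # experts_min
    experts_max - 1 + ep_shift,  # experts_max
)
```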

vllm/model_executor/layers/quantization/fp8.py (36 additions, 0 deletions)
```diff
@@ -516,6 +516,29 @@ def __init__(self, quant_config: Fp8Config):
         self.enable_dmoe_dynamic_scale = os.environ.get("VLLM_DMOE_DYNAMIC_SCALE", False) in ["1", "true"]
         self.use_static_moe = os.environ.get("VLLM_USE_STATIC_MOE", "0") in ["1", "true"]
         self.optimize_with_partial_experts = os.environ.get("VLLM_OPTIMIZE_WITH_PARTIAL_EXPERTS", "0") in ["1", "true"]
+        self.enable_moe_chunk = os.environ.get('VLLM_SUPPORT_MOE_CHUNK',
+                                               'false').lower() == 'true'
+        self.chunk_size_list = [
+            int(x)
+            for x in os.environ.get(
+                "PT_HPU_MOE_CHUNK", "64,128,512,1024,1536,2048,4096"
+            ).split(",")
+            if x.strip()
+        ]
+        self.token_boundary_list = [
+            int(x)
+            for x in os.environ.get(
+                "PT_HPU_MOE_TOKEN_BOUNDARY", "64,64,1536,1536,2048,2048,4096"
+            ).split(",")
+            if x.strip()
+        ]
+        assert len(self.chunk_size_list) == len(self.token_boundary_list), (
+            f"chunk_size_list({len(self.chunk_size_list)}) and "
+            f"token_boundary_list({len(self.token_boundary_list)}) must be the same length"
+        )
+        if self.enable_moe_chunk:
+            logger.info("token_boundary_list is:%s", self.token_boundary_list)
+            logger.info("chunk_size_list is:%s", self.chunk_size_list)

     def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
                        intermediate_size_per_partition: int,
```
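
The two lists must stay in lockstep, which the assert enforces at startup rather than failing mid-inference. A self-contained sketch of the same parsing-and-validation pattern (the env values here are deliberately mismatched, purely for illustration):

```python
import os

# Deliberately one boundary short, to show the failure mode.
os.environ["PT_HPU_MOE_CHUNK"] = "64,128,512"
os.environ["PT_HPU_MOE_TOKEN_BOUNDARY"] = "64,1536"

chunk_size_list = [
    int(x) for x in os.environ["PT_HPU_MOE_CHUNK"].split(",") if x.strip()
]
token_boundary_list = [
    int(x) for x in os.environ["PT_HPU_MOE_TOKEN_BOUNDARY"].split(",") if x.strip()
]
assert len(chunk_size_list) == len(token_boundary_list), (
    f"chunk_size_list({len(chunk_size_list)}) and "
    f"token_boundary_list({len(token_boundary_list)}) must be the same length"
)  # -> AssertionError: chunk_size_list(3) and token_boundary_list(2) ...
```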
```diff
@@ -1043,6 +1066,17 @@ def do_dynamic_moe_with_static_scaling(x, topk_ids, topk_weights, w13_weight_fp8
                                                topk_weights_across_dp)

        batched_tokens = x.shape[0]
+       kwargs = {}
+       if self.enable_moe_chunk:
+           chunk_size = self.chunk_size_list[-1]
+           for idx, threshold in enumerate(self.token_boundary_list):
+               if batched_tokens <= threshold:
+                   chunk_size = self.chunk_size_list[idx]
+                   break
+           kwargs = {
+               "chunk_size": chunk_size,
+               "total_experts": 256,
+           }

        if batched_tokens > self.moe_slice_length:
            final_hidden_states_list = []
```
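
The boundary scan picks the first threshold that the batch fits under and falls back to the largest chunk size otherwise. A runnable sketch of the same loop with this file's defaults (`select_chunk_size` is a hypothetical helper name, not part of the commit):

```python
chunk_size_list = [64, 128, 512, 1024, 1536, 2048, 4096]
token_boundary_list = [64, 64, 1536, 1536, 2048, 2048, 4096]

def select_chunk_size(batched_tokens: int) -> int:
    chunk_size = chunk_size_list[-1]  # fallback for batches past every boundary
    for idx, threshold in enumerate(token_boundary_list):
        if batched_tokens <= threshold:
            chunk_size = chunk_size_list[idx]
            break
    return chunk_size

assert select_chunk_size(32) == 64      # 32 <= 64 at index 0
assert select_chunk_size(100) == 512    # first boundary >= 100 is 1536, index 2
assert select_chunk_size(8192) == 4096  # larger than every boundary -> fallback
```

Note that this path hard-codes `total_experts` to 256 (DeepSeek R1's routed-expert count), whereas the `vllm_ext_patch.py` path below threads the value through as `global_num_experts`.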
```diff
@@ -1066,6 +1100,7 @@ def do_dynamic_moe_with_static_scaling(x, topk_ids, topk_weights, w13_weight_fp8
                    activation="silu",
                    experts_min=ep_shift,
                    experts_max=(num_experts + ep_shift - 1),
+                   **kwargs
                )
                final_hidden_states_list.append(current_hidden_states)
            final_hidden_states = torch.cat(final_hidden_states_list, dim=0)
@@ -1084,6 +1119,7 @@ def do_dynamic_moe_with_static_scaling(x, topk_ids, topk_weights, w13_weight_fp8
                activation="silu",
                experts_min=ep_shift,
                experts_max=(num_experts + ep_shift - 1),
+               **kwargs
            )
        return final_hidden_states.view(-1, x.shape[1])
```

vllm/model_executor/layers/vllm_ext_patch.py (54 additions, 1 deletion)
```diff
@@ -1,10 +1,14 @@
 # ==-------------------------------------------------------------------------==
 # VLLM-HPU-EXT PATCH Start
 # ==-------------------------------------------------------------------------==
+import logging
+import os
 import torch
 from typing import Callable, Optional, Tuple
 import habana_frameworks.torch as htorch

+logging.basicConfig(level=logging.INFO)
+

 class MoeFP8Matmul(torch.nn.Module):
     def __init__(
```
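
One caveat worth noting: calling `logging.basicConfig(...)` at import time configures the process-wide root logger, which can override the host application's logging setup. A module-level logger is the more conventional pattern, sketched below (an editorial aside, not part of the commit):

```python
# Editorial suggestion: a module-level logger leaves the root logger alone
# and lets the embedding application decide handlers and levels.
import logging

logger = logging.getLogger(__name__)
```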
```diff
@@ -66,7 +70,11 @@ def get_dequant_weights_func(

 class VllmMixtureOfExpertsOpFP8(torch.nn.Module):
     def __init__(
-        self, num_experts: int, experts_min: int = 0, experts_max: int = 8
+        self,
+        num_experts: int,
+        global_num_experts: int = 0,
+        experts_min: int = 0,
+        experts_max: int = 8,
     ):
         super().__init__()
         self.w13_list = torch.nn.ModuleList(
```
```diff
@@ -75,10 +83,52 @@ def __init__(
         self.w2_list = torch.nn.ModuleList(
             [MoeFP8Matmul() for _ in range(num_experts)]
         )
+        self.enable_moe_chunk = (
+            os.environ.get("VLLM_SUPPORT_MOE_CHUNK", "false").lower() == "true"
+        )
+        self.chunk_size_list = [
+            int(x)
+            for x in os.environ.get(
+                "PT_HPU_MOE_CHUNK", "64,128,512,1024,1536,2048,4096"
+            ).split(",")
+            if x.strip()
+        ]
+        self.token_boundary_list = [
+            int(x)
+            for x in os.environ.get(
+                "PT_HPU_MOE_TOKEN_BOUNDARY", "64,128,1536,1736,2048,3072,4096"
+            ).split(",")
+            if x.strip()
+        ]
+        assert len(self.chunk_size_list) == len(self.token_boundary_list), (
+            f"chunk_size_list({len(self.chunk_size_list)}) and "
+            f"token_boundary_list({len(self.token_boundary_list)}) must be the same length"
+        )
+        logger = logging.getLogger()
+        if self.enable_moe_chunk:
+            logger.info("token_boundary_list is:%s", self.token_boundary_list)
+            logger.info("chunk_size_list is:%s", self.chunk_size_list)
+
         self.num_experts = num_experts
+        self.global_num_experts = global_num_experts
         self.experts_min = experts_min
         self.experts_max = experts_max

+    def _get_extra_kwargs(self, tokens_num: int):
+        if self.enable_moe_chunk:
+            chunk_size = self.chunk_size_list[-1]
+            for idx, threshold in enumerate(self.token_boundary_list):
+                if tokens_num <= threshold:
+                    chunk_size = self.chunk_size_list[idx]
+                    break
+            kwargs = {
+                "chunk_size": chunk_size,
+                "total_experts": self.global_num_experts,
+            }
+        else:
+            kwargs = {}
+        return kwargs
+
     def forward(
         self,
         x,
```
```diff
@@ -89,6 +139,8 @@ def forward(
         max_expert = self.experts_max
         w13_list_slice = []
         w2_list_slice = []
+        tokens_num, _ = x.shape
+        kwargs = self._get_extra_kwargs(tokens_num)
         for j in range(self.num_experts):
             w13_list_slice.append(self.w13_list[j].get_dequant_weight())
             w2_list_slice.append(self.w2_list[j].get_dequant_weight())
```
```diff
@@ -103,6 +155,7 @@ def forward(
             activation="silu",
             experts_min=min_expert,
             experts_max=max_expert,
+            **kwargs,
         )
         htorch.core.mark_step()
         return final_hidden_states
```
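
Because `_get_extra_kwargs` returns an empty dict when chunking is disabled, the `**kwargs` expansion leaves the fused-MoE call exactly as it was before this commit. A stub-based sketch of that behavior (the class below is a stand-in with the same fields, not the real op; the hidden size is illustrative):

```python
import torch

class ChunkedMoeStub:
    """Stand-in carrying only the fields _get_extra_kwargs reads."""
    def __init__(self, enable: bool):
        self.enable_moe_chunk = enable
        self.chunk_size_list = [64, 128, 512, 1024, 1536, 2048, 4096]
        self.token_boundary_list = [64, 128, 1536, 1736, 2048, 3072, 4096]
        self.global_num_experts = 256

    def _get_extra_kwargs(self, tokens_num: int):
        # Body mirrors the diff above.
        if self.enable_moe_chunk:
            chunk_size = self.chunk_size_list[-1]
            for idx, threshold in enumerate(self.token_boundary_list):
                if tokens_num <= threshold:
                    chunk_size = self.chunk_size_list[idx]
                    break
            kwargs = {
                "chunk_size": chunk_size,
                "total_experts": self.global_num_experts,
            }
        else:
            kwargs = {}
        return kwargs

x = torch.randn(200, 7168)  # 200 tokens; hidden size is illustrative
tokens_num, _ = x.shape
assert ChunkedMoeStub(False)._get_extra_kwargs(tokens_num) == {}
assert ChunkedMoeStub(True)._get_extra_kwargs(tokens_num) == {
    "chunk_size": 512,      # 200 <= 1536 at index 2
    "total_experts": 256,
}
```

One inconsistency worth flagging: the default `PT_HPU_MOE_TOKEN_BOUNDARY` differs between the two files ("64,64,1536,1536,2048,2048,4096" in fp8.py vs "64,128,1536,1736,2048,3072,4096" here), so setting the variable explicitly is the safest way to get consistent behavior.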
