Skip to content

Commit 17b414c

Browse files
MoE defaults to Triton's blockwise FP8 in the TP case (#3678)
1 parent b6edd15 commit 17b414c

File tree

7 files changed

+5
-10
lines changed

7 files changed

+5
-10
lines changed

docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
115115
export FD_LOG_DIR="prefill_log"
116116
117117
quant_type=block_wise_fp8
118-
export FD_USE_DEEP_GEMM=0
119118
120119
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
121120
--max-model-len 131072 \
@@ -135,7 +134,6 @@ export FLAGS_max_partition_size=2048
135134
export FD_LOG_DIR="decode_log"
136135
137136
quant_type=block_wise_fp8
138-
export FD_USE_DEEP_GEMM=0
139137
140138
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
141139
--max-model-len 131072 \

docs/usage/environment_variables.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
7070

7171
# Whether to use DeepGemm for FP8 blockwise MoE.
7272
"FD_USE_DEEP_GEMM":
73-
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
73+
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
7474

7575
}
7676
```

docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
116116
export FD_LOG_DIR="prefill_log"
117117
118118
quant_type=block_wise_fp8
119-
export FD_USE_DEEP_GEMM=0
120119
121120
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
122121
--max-model-len 131072 \
@@ -136,7 +135,6 @@ export FLAGS_max_partition_size=2048
136135
export FD_LOG_DIR="decode_log"
137136
138137
quant_type=block_wise_fp8
139-
export FD_USE_DEEP_GEMM=0
140138
141139
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
142140
--max-model-len 131072 \

docs/zh/usage/environment_variables.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
7070

7171
# 是否使用DeepGemm后端的FP8 blockwise MoE.
7272
"FD_USE_DEEP_GEMM":
73-
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
74-
73+
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
7574
}
7675
```

fastdeploy/envs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
# Whether to use fastsafetensor load weight (0 or 1)
6666
"FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
6767
# Whether to use DeepGemm for FP8 blockwise MoE.
68-
"FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
68+
"FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
6969
# Whether to use aggregate send.
7070
"FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
7171
# Whether to open Trace.

fastdeploy/model_executor/layers/quantization/block_wise_fp8.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
6161
Get quantization method.
6262
"""
6363
if isinstance(layer, FusedMoE):
64-
if self.use_deep_gemm:
64+
if layer.ep_size > 1 or self.use_deep_gemm:
6565
from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
6666
DeepGemmFusedMoeMethod,
6767
)

tests/model_loader/test_common_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def run_with_timeout(target, args, timeout=60 * 5):
174174
{
175175
"quant_type": "block_wise_fp8",
176176
"backend": "triton",
177-
"env": {"FD_USE_DEEP_GEMM": "0", "DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
177+
"env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
178178
},
179179
{"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
180180
],

0 commit comments

Comments (0)