Skip to content

Commit 17b414c

Browse files
MoE defaults to Triton's blockwise FP8 in the TP case (#3678)
1 parent b6edd15 commit 17b414c

File tree

7 files changed

+5
-10
lines changed

7 files changed

+5
-10
lines changed

docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
115115
export FD_LOG_DIR="prefill_log"
116116
117117
quant_type=block_wise_fp8
118-
export FD_USE_DEEP_GEMM=0
119118
120119
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
121120
--max-model-len 131072 \
@@ -135,7 +134,6 @@ export FLAGS_max_partition_size=2048
135134
export FD_LOG_DIR="decode_log"
136135
137136
quant_type=block_wise_fp8
138-
export FD_USE_DEEP_GEMM=0
139137
140138
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
141139
--max-model-len 131072 \

docs/usage/environment_variables.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
7070

7171
# Whether to use DeepGemm for FP8 blockwise MoE.
7272
"FD_USE_DEEP_GEMM":
73-
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
73+
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
7474

7575
}
7676
```

docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
116116
export FD_LOG_DIR="prefill_log"
117117
118118
quant_type=block_wise_fp8
119-
export FD_USE_DEEP_GEMM=0
120119
121120
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
122121
--max-model-len 131072 \
@@ -136,7 +135,6 @@ export FLAGS_max_partition_size=2048
136135
export FD_LOG_DIR="decode_log"
137136
138137
quant_type=block_wise_fp8
139-
export FD_USE_DEEP_GEMM=0
140138
141139
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
142140
--max-model-len 131072 \

docs/zh/usage/environment_variables.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
7070

7171
# 是否使用DeepGemm后端的FP8 blockwise MoE.
7272
"FD_USE_DEEP_GEMM":
73-
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
74-
73+
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
7574
}
7675
```

fastdeploy/envs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
# Whether to use fastsafetensor load weight (0 or 1)
6666
"FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
6767
# Whether to use DeepGemm for FP8 blockwise MoE.
68-
"FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
68+
"FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
6969
# Whether to use aggregate send.
7070
"FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
7171
# Whether to open Trace.

fastdeploy/model_executor/layers/quantization/block_wise_fp8.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
6161
Get quantization method.
6262
"""
6363
if isinstance(layer, FusedMoE):
64-
if self.use_deep_gemm:
64+
if layer.ep_size > 1 or self.use_deep_gemm:
6565
from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
6666
DeepGemmFusedMoeMethod,
6767
)

tests/model_loader/test_common_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def run_with_timeout(target, args, timeout=60 * 5):
174174
{
175175
"quant_type": "block_wise_fp8",
176176
"backend": "triton",
177-
"env": {"FD_USE_DEEP_GEMM": "0", "DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
177+
"env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
178178
},
179179
{"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
180180
],

0 commit comments

Comments (0)