File tree Expand file tree Collapse file tree 2 files changed +12
-3
lines changed
vllm/model_executor/layers/fused_moe Expand file tree Collapse file tree 2 files changed +12
-3
lines changed Original file line number Diff line number Diff line change 32
32
from .fused_batched_moe import BatchedDispatchCombine , BatchedTritonExperts
33
33
from .fused_moe import TritonExperts , fused_experts
34
34
from .modular_kernel import (FusedMoEModularKernel ,
35
+ FusedMoEPermuteExpertsUnpermute ,
35
36
FusedMoEQuantizeDispatchCombine )
36
37
from .pplx_dispatch_combine import PplxDispatchCombine
37
38
else :
@@ -249,6 +250,8 @@ def set_dispatch_combine(
249
250
250
251
#block_m = MOE_DP_CHUNK_SIZE * (self.moe.ep_size // self.moe.dp_size)
251
252
253
+ experts : FusedMoEPermuteExpertsUnpermute = None
254
+
252
255
if isinstance (dispatch_combine ,
253
256
(BatchedDispatchCombine , PplxDispatchCombine )):
254
257
logger .info ("BatchedTritonExperts %s" , self .moe )
@@ -619,6 +622,8 @@ def __init__(
619
622
assert quant_method is not None
620
623
self .quant_method = quant_method
621
624
625
+ dispatch_combine : FusedMoEQuantizeDispatchCombine = None
626
+
622
627
# TODO: move to method?
623
628
if self .dp_size > 1 :
624
629
logger .info ("using pplx dispatch" )
Original file line number Diff line number Diff line change 6
6
import vllm .model_executor .layers .fused_moe .modular_kernel as mk
7
7
from vllm .model_executor .layers .fused_moe .deep_gemm_moe import (
8
8
DeepGemmExperts , _valid_deep_gemm , _valid_deep_gemm_shape )
9
- from vllm .model_executor .layers .fused_moe .fused_moe import TritonExpert
9
+ from vllm .model_executor .layers .fused_moe .fused_moe import TritonExperts
10
10
11
11
12
12
class TritonOrDeepGemmExperts (mk .FusedMoEPermuteExpertsUnpermute ):
13
13
14
14
def __init__ (self ,
15
15
use_fp8_w8a8 : bool ,
16
+ use_int8_w8a8 : bool ,
16
17
use_int8_w8a16 : bool ,
17
18
use_int4_w4a16 : bool ,
19
+ per_channel_quant : bool ,
18
20
block_shape : Optional [List [int ]] = None ,
19
21
block_m : Optional [int ] = None ,
20
22
allow_deep_gemm : bool = False ):
21
23
super ().__init__ ()
22
- self .triton_expert = TritonExpert (use_fp8_w8a8 , use_int4_w4a16 ,
23
- use_int8_w8a16 , block_shape , block_m )
24
+ self .triton_expert = TritonExperts (use_fp8_w8a8 , use_int8_w8a8 ,
25
+ use_int4_w4a16 , use_int8_w8a16 ,
26
+ per_channel_quant , block_shape ,
27
+ block_m )
24
28
self .deep_gemm_expert = DeepGemmExperts ()
25
29
self .allow_deep_gemm = allow_deep_gemm
26
30
self .use_fp8_w8a8 = use_fp8_w8a8
You can’t perform that action at this time.
0 commit comments