from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported,
+    UnquantizedFusedMoEMethod)
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               UnquantizedLinearMethod,
                                               set_weight_attrs)
@@ -141,6 +142,9 @@ def get_quant_method(self, layer: torch.nn.Module,
        elif isinstance(layer, FusedMoE):
            from vllm.model_executor.layers.quantization.moe_wna16 import (
                MoeWNA16Config)
+            if is_layer_skipped_awq(
+                    prefix, getattr(self, "modules_to_not_convert", [])):
+                return UnquantizedFusedMoEMethod(layer.moe_config)
            if not check_moe_marlin_supports_layer(layer, self.group_size):
                logger.warning_once(
                    f"Layer '{prefix}' is not supported by AWQMoeMarlin. "
@@ -520,4 +524,4 @@ def apply(
            expert_map=expert_map,
            w1_zeros=layer.w13_qzeros,
            w2_zeros=layer.w2_qzeros,
-            workspace=layer.workspace)
+            workspace=layer.workspace)
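For context, the new branch in get_quant_method routes an expert layer to the
unquantized fused-MoE path whenever its prefix matches an entry in the config's
modules_to_not_convert list, instead of falling through to the AWQ Marlin MoE
method. Below is a minimal, self-contained sketch of that matching logic; the
real check is vLLM's is_layer_skipped_awq, and the substring-match behaviour
and the helper name used here are assumptions for illustration only.

    from typing import List, Optional

    def is_layer_skipped_awq_sketch(
            prefix: str,
            modules_to_not_convert: Optional[List[str]]) -> bool:
        """Hypothetical stand-in for is_layer_skipped_awq.

        Assumed behaviour: a layer is skipped (left unquantized) when any
        entry of modules_to_not_convert appears in its prefix.
        """
        if not modules_to_not_convert:
            return False
        return any(name in prefix for name in modules_to_not_convert)

    # Example: with a config excluding layer 0's MoE block, that layer would
    # get UnquantizedFusedMoEMethod while other layers keep the Marlin path.
    print(is_layer_skipped_awq_sketch("model.layers.0.mlp.experts",
                                      ["layers.0.mlp"]))   # True  -> unquantized
    print(is_layer_skipped_awq_sketch("model.layers.1.mlp.experts",
                                      ["layers.0.mlp"]))   # False -> AWQ Marlin

The getattr(self, "modules_to_not_convert", []) form keeps the check safe for
configs that do not carry a modules_to_not_convert field at all.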