
Commit 3426af6

Deepseek fix for moe fp8 (#624)
Co-authored-by: baishihao <[email protected]>
1 parent d268517 commit 3426af6

File tree

2 files changed: +2 additions, −2 deletions

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def __init__(
         self.lock = threading.Lock()
 
     def set_quant_method(self, quant_method):
-        if isinstance(self.quant_method, vLLMFP8w8a8QuantizationMethod):
+        if isinstance(quant_method, vLLMFP8w8a8QuantizationMethod):
             self.quant_method = quant_method
             if self.quant_method is not None:
                 self.quant_method.is_moe = True
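For context, the original guard inspected the already-stored self.quant_method rather than the method being passed in, so a vLLM FP8 w8a8 method handed to set_quant_method would never be registered unless one had somehow been stored before. Below is a minimal stand-alone sketch of the corrected control flow; FakeFP8Method and FusedMoeWeightSketch are hypothetical stand-ins for illustration only, not classes from lightllm or vllm.

class FakeFP8Method:
    """Hypothetical stand-in for vLLMFP8w8a8QuantizationMethod."""
    is_moe = False

class FusedMoeWeightSketch:
    """Hypothetical stand-in for the fused MoE weight class; only the fixed method is shown."""
    def __init__(self):
        self.quant_method = None  # nothing registered yet

    def set_quant_method(self, quant_method):
        # Before the fix the guard checked self.quant_method (still None at this
        # point in the sketch), so the isinstance test could never pass and the
        # FP8 method was silently dropped. Checking the incoming argument
        # registers it as intended.
        if isinstance(quant_method, FakeFP8Method):
            self.quant_method = quant_method
            if self.quant_method is not None:
                self.quant_method.is_moe = True

w = FusedMoeWeightSketch()
w.set_quant_method(FakeFP8Method())
assert w.quant_method is not None and w.quant_method.is_moe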

lightllm/common/quantization/vllm_quant.py

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ def quantize_moe(self, weight):
         weight_scales = []
         qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda()
         for i in range(num_experts):
-            qweight, weight_scale = ops.scaled_fp8_quant(weight[0].cuda(), scale=None, use_per_token_if_dynamic=False)
+            qweight, weight_scale = ops.scaled_fp8_quant(weight[i].cuda(), scale=None, use_per_token_if_dynamic=False)
             qweights[i] = qweight
             weight_scales.append(weight_scale)
         weight_scale = torch.cat(weight_scales, dim=0).reshape(-1)
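The loop is meant to quantize each expert's weight slice separately, but it always quantized weight[0], so every expert ended up with expert 0's FP8 weights and scale. Below is a minimal, self-contained sketch of per-expert per-tensor dynamic FP8 quantization in plain PyTorch; the real code calls vllm's ops.scaled_fp8_quant, and quantize_expert / quantize_moe_sketch here are hypothetical helpers written only to illustrate why indexing by i matters.

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # largest finite e4m3 value

def quantize_expert(w: torch.Tensor):
    """Per-tensor dynamic FP8 quantization: choose a scale so max |w| maps to FP8_MAX."""
    scale = w.abs().max().clamp(min=1e-12) / FP8_MAX
    qw = (w / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return qw, scale.reshape(1)

def quantize_moe_sketch(weight: torch.Tensor):
    """weight has shape (num_experts, ...); each expert gets its own scale."""
    num_experts = weight.shape[0]
    qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn)
    weight_scales = []
    for i in range(num_experts):
        # The bug quantized weight[0] on every iteration, so all experts received
        # expert 0's quantized weights and scale; indexing by i is the fix.
        qweight, weight_scale = quantize_expert(weight[i])
        qweights[i] = qweight
        weight_scales.append(weight_scale)
    return qweights, torch.cat(weight_scales, dim=0).reshape(-1)

qweights, scales = quantize_moe_sketch(torch.randn(8, 16, 32))  # 8 experts
assert qweights.dtype == torch.float8_e4m3fn and scales.shape == (8,)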
