
Commit b610fe3

[fix] fix fp8 bug when loading MoE model
1 parent 32f7db7 commit b610fe3

File tree

1 file changed: +2 −2 lines changed


lightllm/common/quantization/w8a8_quant.py

Lines changed: 2 additions & 2 deletions
@@ -93,11 +93,11 @@ def quantize_moe(self, weight):
         qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda(self.device_id_)
         for i in range(num_experts):
             qweight, weight_scale = scaled_fp8_quant(
-                weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=False
+                weight[i].contiguous().cuda(self.device_id_), scale=None, use_per_token_if_dynamic=True
             )
             qweights[i] = qweight
             weight_scales.append(weight_scale)
-        weight_scale = torch.cat(weight_scales, dim=0).reshape(-1)
+        weight_scale = torch.stack(weight_scales, dim=0).contiguous()
         return qweights, weight_scale

     def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True):
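
For context, the change switches scaled_fp8_quant to per-token dynamic scaling and keeps the per-expert structure of the resulting scales. Below is a minimal, self-contained sketch in plain PyTorch (not the lightllm kernels) of why the scale bookkeeping changes; the shapes and the helper per_token_scales are assumptions inferred from the diff, with each expert weight taken as [rows, cols].

import torch

num_experts, rows, cols = 4, 8, 16
weights = torch.randn(num_experts, rows, cols)
FP8_E4M3_MAX = 448.0  # max representable magnitude of float8_e4m3fn

def per_token_scales(w: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for the per-token branch of scaled_fp8_quant:
    # one scale per row of the expert weight, shape [rows, 1].
    return w.abs().amax(dim=-1, keepdim=True) / FP8_E4M3_MAX

weight_scales = [per_token_scales(weights[i]) for i in range(num_experts)]

# Old code: cat + reshape(-1) flattens everything into a 1-D tensor, so the
# per-expert boundaries are lost once the scales are no longer scalars.
flat = torch.cat(weight_scales, dim=0).reshape(-1)
print(flat.shape)      # torch.Size([32])

# New code: stack keeps the expert axis, so downstream MoE code can still
# index the scales per expert.
stacked = torch.stack(weight_scales, dim=0).contiguous()
print(stacked.shape)   # torch.Size([4, 8, 1])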
