Commit 9845569

hiworldwzj and wangzaijun authored
fix fp8 weight quant need contiguous tensor (#632)
Co-authored-by: wangzaijun <[email protected]>
1 parent bd0712e · commit 9845569

File tree: 1 file changed (+6 −2 lines)


lightllm/common/quantization/vllm_quant.py

Lines changed: 6 additions & 2 deletions
@@ -62,7 +62,9 @@ def __init__(self):
     def quantize(self, weight: torch.Tensor):
         if self.is_moe:
             return self.quantize_moe(weight)
-        qweight, weight_scale = ops.scaled_fp8_quant(weight.cuda(), scale=None, use_per_token_if_dynamic=True)
+        qweight, weight_scale = ops.scaled_fp8_quant(
+            weight.contiguous().cuda(), scale=None, use_per_token_if_dynamic=True
+        )
         return qweight.transpose(0, 1), weight_scale
 
     def quantize_moe(self, weight):
@@ -71,7 +73,9 @@ def quantize_moe(self, weight):
         weight_scales = []
         qweights = torch.empty_like(weight, dtype=torch.float8_e4m3fn).cuda()
         for i in range(num_experts):
-            qweight, weight_scale = ops.scaled_fp8_quant(weight[i].cuda(), scale=None, use_per_token_if_dynamic=False)
+            qweight, weight_scale = ops.scaled_fp8_quant(
+                weight[i].contiguous().cuda(), scale=None, use_per_token_if_dynamic=False
+            )
             qweights[i] = qweight
             weight_scales.append(weight_scale)
         weight_scale = torch.cat(weight_scales, dim=0).reshape(-1)
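
The change itself is small: both call sites now run .contiguous() on the weight before handing it to vLLM's ops.scaled_fp8_quant, since the quant kernel expects a dense row-major tensor. Below is a minimal sketch of the underlying issue in plain PyTorch; the transposed view is a hypothetical stand-in for however a non-contiguous weight might reach quantize, and no vLLM call is made:

import torch

# A transposed weight shares storage with the original but has swapped
# strides, so it is NOT laid out as dense row-major memory.
weight = torch.randn(128, 256, dtype=torch.float16)
view = weight.t()

print(view.is_contiguous())       # False: unsafe for kernels that index
                                  # memory as if it were row-major
safe = view.contiguous()          # materializes a dense copy
print(safe.is_contiguous())       # True
assert torch.equal(safe, view)    # same values, different memory layout

Calling .contiguous() on an already-contiguous tensor returns it unchanged, so the extra call is free in the common case and only copies when a copy is actually needed.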
