Skip to content

Commit 6dbd4ce

Browse files
authored
fix
1 parent d615f11 commit 6dbd4ce

File tree

2 files changed

+4
-1
lines changed

2 files changed

+4
-1
lines changed

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -73,15 +73,19 @@ def _post_load_weights(self) -> None:
7373
and (not self.static_activation or self.input_scale is not None)
7474
):
7575
if self.weight_scale.ndim > 1:
76+
# 让 k dim 更连续,大多数split k 算法的算子可能能更快
7677
self.weight_scale = self.weight_scale.cuda(self.device_id_).transpose(0, 1)
7778
self.weight = [
79+
# 让 k dim 更连续,大多数split k 算法的算子可能能更快
7880
self.weight.cuda(self.device_id_).transpose(0, 1),
7981
self.weight_scale,
8082
self.input_scale,
8183
]
8284
else:
8385
self.weight = self.quant_method.quantize(self.weight.to(self.data_type_).cuda(self.device_id_))
8486
return
87+
88+
# 让 k dim 更连续,大多数split k 算法的算子可能能更快
8589
self.weight = self.weight.to(self.data_type_).cuda(self.device_id_).transpose(0, 1)
8690

8791

lightllm/common/quantization/vllm_quant.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -198,7 +198,6 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
198198
dtype=input_tensor.dtype,
199199
)
200200
else:
201-
# qweight = qweight.t().contiguous().t()
202201
input_scale = input_scale.t().contiguous().t()
203202
torch.ops._C.cutlass_scaled_mm(out, qinput_tensor, qweight, input_scale, weight_scale, bias)
204203
return out

0 commit comments

Comments (0)