|
6 | 6 | from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm |
7 | 7 | from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops |
8 | 8 | from typing import Any |
9 | | -from typing import TYPE_CHECKING, Optional |
| 9 | +from typing import TYPE_CHECKING, Optional, Tuple |
| 10 | +from lightllm.utils.dist_utils import get_current_device_id |
10 | 11 |
|
11 | 12 | if TYPE_CHECKING: |
12 | 13 | from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.mm_weight import MMWeightPack |
@@ -118,6 +119,27 @@ def method_name(self): |
118 | 119 | def quantize(self, weight: torch.Tensor): |
119 | 120 | raise NotImplementedError("AWQ online quantization is not supported yet.") |
120 | 121 |
|
| 122 | + def params_need_repack(self) -> bool: |
| 123 | + """ |
| 124 | + Indicates whether the quantized weights need to be repacked after quantization; currently only AWQ supports this.
| 125 | + """ |
| 126 | + return True |
| 127 | + |
| 128 | + def params_repack( |
| 129 | + self, weight: torch.Tensor, weight_scale: torch.Tensor, weight_zero_point: torch.Tensor, dtype_type: torch.dtype |
| 130 | + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: |
| 131 | + """ |
| 132 | + Some quantization methods (e.g. AWQ) repack their parameters after quantization so that the kernels run at optimal performance.
| 133 | + """ |
| 134 | + weight = self._process_weight_after_loading(weight.cuda(get_current_device_id())) |
| 135 | + weight_scale = self._process_weight_scale_after_loading( |
| 136 | + weight_scale.cuda(get_current_device_id()).to(dtype_type) |
| 137 | + ) |
| 138 | + weight_zero_point = self._process_weight_zero_point_after_loading( |
| 139 | + weight_zero_point.cuda(get_current_device_id()) |
| 140 | + ) |
| 141 | + return weight, weight_scale, weight_zero_point |
| 142 | + |
121 | 143 | def _process_weight_after_loading(self, weight: torch.Tensor) -> torch.Tensor: |
122 | 144 | assert self.hf_quantization_config is not None, "hf_quantization_config is not set" |
123 | 145 | self.k = weight.shape[0] |
|
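
For context, here is a minimal usage sketch of how a weight container might call the new hooks during loading. The class `AwqMMWeightPack` and its attributes (`quant_method`, `data_type`, `weight`, `weight_scale`, `weight_zero_point`) are illustrative assumptions, not the actual lightllm API; only `params_need_repack()` and `params_repack()` come from the diff above.

```python
# Minimal usage sketch, assuming a weight container that owns a quant_method.
# AwqMMWeightPack and its attribute names are hypothetical; only
# params_need_repack() / params_repack() are defined in the diff above.
import torch


class AwqMMWeightPack:
    def __init__(self, quant_method, data_type: torch.dtype):
        self.quant_method = quant_method
        self.data_type = data_type
        self.weight = None
        self.weight_scale = None
        self.weight_zero_point = None

    def load_quantized(self, weight: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor):
        # AWQ reports params_need_repack() == True, so the loaded tensors are
        # repacked once (moved to the current CUDA device inside params_repack)
        # before being stored on the container.
        if self.quant_method.params_need_repack():
            weight, scale, zero_point = self.quant_method.params_repack(
                weight, scale, zero_point, self.data_type
            )
        self.weight = weight
        self.weight_scale = scale
        self.weight_zero_point = zero_point
```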