
Commit 358821e
add awq marlin
1 parent 18081a6 commit 358821e

5 files changed: +217 −7 lines changed

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py

Lines changed: 25 additions & 0 deletions

@@ -141,7 +141,32 @@ def _slice_weight_zero_point(self, weight_zero_point: torch.Tensor):
         return weight_zero_point[zero_point_start:zero_point_end, :]
 
 
+class AWQMARLINCOLMMWeight(AWQCOLMMWeight):
+    def __init__(
+        self,
+        weight_name: str,
+        data_type: torch.dtype,
+        bias_name: Optional[str] = None,
+        quant_method: QuantizationMethod = None,
+        tp_rank: int = None,
+        tp_world_size: int = None,
+    ) -> None:
+        super().__init__(weight_name, data_type, bias_name, quant_method, tp_rank, tp_world_size)
+
+    def _process_weight(self, weight: torch.Tensor) -> torch.Tensor:
+        return self.quant_method._process_weight_after_loading(weight.cuda(get_current_device_id()))
+
+    def _process_weight_scale(self, weight_scale: torch.Tensor) -> torch.Tensor:
+        return self.quant_method._process_weight_scale_after_loading(weight_scale.cuda(get_current_device_id()))
+
+    def _process_weight_zero_point(self, weight_zero_point: torch.Tensor) -> torch.Tensor:
+        return self.quant_method._process_weight_zero_point_after_loading(
+            weight_zero_point.cuda(get_current_device_id())
+        )
+
+
 COLBMM_WEIGHT_CLS_MAP = {
     "fp8w8a8b128": W8A8B128COLMMWeight,
     "awq": AWQCOLMMWeight,
+    "awq_marlin": AWQMARLINCOLMMWeight,
 }
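
The new class overrides only the three _process_* hooks; slicing and loading remain in AWQCOLMMWeight, and the actual Marlin repacking is delegated to the quant method. The extra "awq_marlin" entry in COLBMM_WEIGHT_CLS_MAP is what makes the class reachable. A rough usage sketch (illustrative only: the real lookup site and constructor arguments live elsewhere in lightllm, and the checkpoint key below is made up):

    # Illustrative dispatch: the map key mirrors quant_method.get_name().
    cls = COLBMM_WEIGHT_CLS_MAP["awq_marlin"]          # -> AWQMARLINCOLMMWeight
    w = cls(
        weight_name="layers.0.mlp.down_proj.qweight",  # hypothetical checkpoint key
        data_type=torch.float16,
        quant_method=quant_method,                     # an awq_marlin quantization method instance
    )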

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py

Lines changed: 16 additions & 7 deletions

@@ -184,11 +184,20 @@ def verify_load(self) -> bool:
         load_ok = load_ok and self.bias is not None
         return load_ok
 
+    def _process_weight(self, weight: torch.Tensor) -> torch.Tensor:
+        return weight.cuda(get_current_device_id())
+
+    def _process_weight_scale(self, weight_scale: torch.Tensor) -> torch.Tensor:
+        return weight_scale.cuda(get_current_device_id())
+
+    def _process_weight_zero_point(self, weight_zero_point: torch.Tensor) -> torch.Tensor:
+        return weight_zero_point.cuda(get_current_device_id())
+
     def _load_weights(self, weights: Dict[str, torch.Tensor]) -> None:
         if self.weight_name is not None and self.weight_name in weights:
             weight = weights[self.weight_name]
             weight = self._slice_weight(weight)
-            self.weight[0] = weight.cuda(get_current_device_id())
+            self.weight[0] = self._process_weight(weight)
         if self.bias_name is not None and self.bias_name in weights:
             bias = weights[self.bias_name]
             bias = self._slice_bias(bias)

@@ -198,13 +207,13 @@ def _load_scales(self, weights: Dict[str, torch.Tensor]) -> None:
         if self.weight_scale_name is not None and self.weight_scale_name in weights:
             weight_scale = weights[self.weight_scale_name]
             weight_scale = self._slice_weight_scale(weight_scale)
-            self.weight[1] = weight_scale.cuda(get_current_device_id())
+            self.weight[1] = self._process_weight_scale(weight_scale)
 
     def _load_zero_points(self, weights: Dict[str, torch.Tensor]) -> None:
         if self.weight_zero_point_name is not None and self.weight_zero_point_name in weights:
             weight_zero_point = weights[self.weight_zero_point_name]
             weight_zero_point = self._slice_weight_zero_point(weight_zero_point)
-            self.weight[2] = weight_zero_point.cuda(get_current_device_id())
+            self.weight[2] = self._process_weight_zero_point(weight_zero_point)
 
 
 class AWQMultiMMWeightTpl(AWQMMWeightTpl):

@@ -239,18 +248,18 @@ def __init__(
     def _fuse(self) -> None:
         if self.weight[0] is None and (None not in self.weights):
             weight = torch.cat(self.weights, dim=1)
-            self.weight[0] = weight.cuda(get_current_device_id())
+            self.weight[0] = self._process_weight(weight)
             delattr(self, "weights")
 
         if self.weight[1] is None and (None not in self.weight_scales):
             # AWQ stores its quantized params with weight shape in x out, so the cat dim here is 1
             weight_scale = torch.cat(self.weight_scales, dim=1).cuda(get_current_device_id())
-            self.weight[1] = weight_scale.cuda(get_current_device_id())
+            self.weight[1] = self._process_weight_scale(weight_scale)
             delattr(self, "weight_scales")
 
         if self.weight[2] is None and (None not in self.weight_zero_points):
             weight_zero_point = torch.cat(self.weight_zero_points, dim=1)
-            self.weight[2] = weight_zero_point.cuda(get_current_device_id())
+            self.weight[2] = self._process_weight_zero_point(weight_zero_point)
             delattr(self, "weight_zero_points")
 
         if self.has_bias and self.bias is None and (None not in self.biases):

@@ -300,7 +309,7 @@ def _get_quant_method(cls, quant_cfg: Quantcfg, layer_num_: int, name: str) -> Q
         if quant_cfg is None:
             return None, False
         quant_method = quant_cfg.get_quant_method(layer_num_, name)
-        quant_method.hf_quantization_method = quant_cfg.hf_quantization_method
+        quant_method.hf_quantization_config = quant_cfg.hf_quantization_config
         quantized_weight = quant_cfg.quantized_weight
         return quant_method, quantized_weight
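
The template class gains three _process_* hooks whose defaults keep the old behaviour (a plain .cuda() move), so the existing awq and fp8 paths are unchanged; the Marlin subclasses override only these hooks to repack at load time. A minimal standalone sketch of the pattern, using simplified stand-in names rather than the real lightllm classes:

    import torch

    class WeightTpl:
        def _process_weight(self, weight: torch.Tensor) -> torch.Tensor:
            return weight.cuda()  # default: device move only

        def load(self, weight: torch.Tensor) -> None:
            self.weight = self._process_weight(weight)  # the hook is the only variation point

    class MarlinWeightTpl(WeightTpl):
        def __init__(self, repack_fn):
            self.repack_fn = repack_fn  # e.g. a Marlin repack kernel

        def _process_weight(self, weight: torch.Tensor) -> torch.Tensor:
            return self.repack_fn(weight.cuda())  # repack once, at load time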

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py

Lines changed: 50 additions & 0 deletions

@@ -316,12 +316,62 @@ def __init__(
         super().__init__(weight_names, data_type, bias_names, quant_method, tp_rank, tp_world_size)
 
 
+class AWQMARLINROWMMWeight(AWQROWMMWeight):
+    def __init__(
+        self,
+        weight_name: str,
+        data_type: torch.dtype,
+        bias_name: Optional[str] = None,
+        quant_method: QuantizationMethod = None,
+        tp_rank: int = None,
+        tp_world_size: int = None,
+    ) -> None:
+        super().__init__(weight_name, data_type, bias_name, quant_method, tp_rank, tp_world_size)
+
+    def _process_weight(self, weight: torch.Tensor) -> torch.Tensor:
+        return self.quant_method._process_weight_after_loading(weight.cuda(get_current_device_id()))
+
+    def _process_weight_scale(self, weight_scale: torch.Tensor) -> torch.Tensor:
+        return self.quant_method._process_weight_scale_after_loading(weight_scale.cuda(get_current_device_id()))
+
+    def _process_weight_zero_point(self, weight_zero_point: torch.Tensor) -> torch.Tensor:
+        return self.quant_method._process_weight_zero_point_after_loading(
+            weight_zero_point.cuda(get_current_device_id())
+        )
+
+
+class AWQMARLINMultiROWMMWeight(AWQMultiROWMMWeight):
+    def __init__(
+        self,
+        weight_names: List[str],
+        data_type: torch.dtype,
+        bias_names: Optional[List[str]] = None,
+        quant_method: QuantizationMethod = None,
+        tp_rank: int = None,
+        tp_world_size: int = None,
+    ) -> None:
+        super().__init__(weight_names, data_type, bias_names, quant_method, tp_rank, tp_world_size)
+
+    def _process_weight(self, weight: torch.Tensor) -> torch.Tensor:
+        return self.quant_method._process_weight_after_loading(weight.cuda(get_current_device_id()))
+
+    def _process_weight_scale(self, weight_scale: torch.Tensor) -> torch.Tensor:
+        return self.quant_method._process_weight_scale_after_loading(weight_scale.cuda(get_current_device_id()))
+
+    def _process_weight_zero_point(self, weight_zero_point: torch.Tensor) -> torch.Tensor:
+        return self.quant_method._process_weight_zero_point_after_loading(
+            weight_zero_point.cuda(get_current_device_id())
+        )
+
+
 ROWBMM_WEIGHT_CLS_MAP = {
     "fp8w8a8b128": W8A8B128ROWMMWeight,
     "awq": AWQROWMMWeight,
+    "awq_marlin": AWQMARLINROWMMWeight,
 }
 
 MULTI_ROWBMM_WEIGHT_CLS_MAP = {
     "fp8w8a8b128": W8A8B128MultiROWMMWeight,
     "awq": AWQMultiROWMMWeight,
+    "awq_marlin": AWQMARLINMultiROWMMWeight,
 }
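
As the comment in _fuse above notes, AWQ checkpoints store packed weights as in_features x (out_features // pack_factor), so multi-projection weights (e.g. fused gate/up) are concatenated along dim=1. A small shape sketch under illustrative layer sizes (not taken from this commit):

    import torch

    pack_factor = 8  # one int32 holds eight 4-bit values
    k, n = 4096, 11008  # illustrative in/out features of a single projection
    gate = torch.zeros(k, n // pack_factor, dtype=torch.int32)
    up = torch.zeros(k, n // pack_factor, dtype=torch.int32)
    fused = torch.cat([gate, up], dim=1)  # (4096, 2752): output dims stacked side by side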

lightllm/common/quantization/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -46,6 +46,8 @@ def _mapping_quant_method(self):
             logger.info(f"select fp8w8a8-b128 quant way: {self.quant_type}")
         elif self.hf_quantization_method == "awq":
             self.quant_type = "awq"
+            if is_awq_marlin_compatible(self.hf_quantization_config):
+                self.quant_type = "awq_marlin"
             logger.info(f"select awq quant way: {self.quant_type}")
         else:
             # TODO: more quant method
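
With this check in place, an AWQ checkpoint is transparently upgraded to the Marlin kernels whenever its quantization config qualifies; otherwise the plain "awq" path is kept. For example, a typical 4-bit AutoAWQ config like the sketch below (field values are illustrative) would yield quant_type == "awq_marlin" on a supported GPU:

    # Field names follow what is_awq_marlin_compatible reads; values are common AutoAWQ defaults.
    hf_quantization_config = {
        "quant_method": "awq",
        "bits": 4,
        "group_size": 128,
        "zero_point": True,
    }
    # quant_type starts as "awq"; if is_awq_marlin_compatible(hf_quantization_config)
    # returns True, it is switched to "awq_marlin" and the Marlin weight classes are used.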

lightllm/common/quantization/awq_quant.py

Lines changed: 124 additions & 0 deletions

@@ -5,10 +5,25 @@
 import torch.nn.functional as F
 from lightllm.utils.vllm_utils import HAS_VLLM, vllm_ops, cutlass_scaled_mm
 from lightllm.utils.light_utils import HAS_LIGHTLLM_KERNEL, light_ops
+from typing import Any
 
 if HAS_VLLM:
     awq_dequantize = vllm_ops.awq_dequantize
     awq_gemm = vllm_ops.awq_gemm
+    from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+        check_marlin_supported,
+        marlin_permute_scales,
+        awq_to_marlin_zero_points,
+        should_use_atomic_add_reduce,
+        marlin_make_empty_g_idx,
+        marlin_make_workspace_new,
+    )
+    from vllm.scalar_type import scalar_types
+
+    TYPE_MAP = {
+        4: scalar_types.uint4,
+        8: scalar_types.uint8,
+    }
 
 
 class AWQBaseQuantizationMethod(QuantizationMethod):

@@ -56,3 +71,112 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
         if bias is not None:
             out.add_(bias)
         return out
+
+
+@QUANTMETHODS.register("awq_marlin")
+class AWQMARLINW4A16QuantizationMethod(AWQBaseQuantizationMethod):
+    def __init__(self):
+        super().__init__()
+        self.pack_factor = 8
+        self.weight_scale_suffix = "scales"
+        self.weight_zero_point_suffix = "qzeros"
+        self.weight_suffix = "qweight"
+        self.g_idx = marlin_make_empty_g_idx(torch.device("cuda"))
+        self.g_idx_sort_indices = marlin_make_empty_g_idx(torch.device("cuda"))
+        self.workspace = marlin_make_workspace_new(torch.device("cuda"))
+
+    def get_name(self):
+        return "awq_marlin"
+
+    def quantize(self, weight: torch.Tensor):
+        raise NotImplementedError("AWQ online quantization is not supported yet.")
+
+    def _process_weight_after_loading(self, weight: torch.Tensor) -> torch.Tensor:
+        assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
+        self.k = weight.shape[0]
+        self.n = weight.shape[1] * self.pack_factor
+        return vllm_ops.awq_marlin_repack(
+            weight,
+            size_k=weight.shape[0],
+            size_n=weight.shape[1] * self.pack_factor,
+            num_bits=self.hf_quantization_config["bits"],
+        )
+
+    def _process_weight_scale_after_loading(self, weight_scale: torch.Tensor) -> torch.Tensor:
+        assert self.hf_quantization_config is not None, "hf_quantization_config is not set"
+        group_size = self.hf_quantization_config["group_size"]
+        return marlin_permute_scales(
+            weight_scale,
+            size_k=weight_scale.shape[0] * group_size,
+            size_n=weight_scale.shape[1],
+            group_size=self.hf_quantization_config["group_size"],
+        )
+
+    def _process_weight_zero_point_after_loading(self, weight_zero_point: torch.Tensor) -> torch.Tensor:
+        return awq_to_marlin_zero_points(
+            weight_zero_point,
+            size_k=weight_zero_point.shape[0],
+            size_n=weight_zero_point.shape[1] * self.pack_factor,
+            num_bits=self.hf_quantization_config["bits"],
+        )
+
+    def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_custom_tensor_mananger=True):
+        qweight, weight_scale, qzeros = weights
+        reshaped_x = input_tensor.reshape(-1, input_tensor.shape[-1])
+
+        use_atomic_add = should_use_atomic_add_reduce(
+            m=reshaped_x.size(0),
+            n=self.n,
+            k=self.k,
+            device=input_tensor.device,
+            dtype=input_tensor.dtype,
+        )
+
+        out = vllm_ops.gptq_marlin_gemm(
+            reshaped_x,
+            None,
+            qweight,
+            bias,
+            weight_scale,
+            None,
+            qzeros,
+            self.g_idx,
+            self.g_idx_sort_indices,
+            self.workspace,
+            TYPE_MAP[self.hf_quantization_config["bits"]],
+            size_m=reshaped_x.shape[0],
+            size_n=self.n,
+            size_k=self.k,
+            use_atomic_add=use_atomic_add,
+            use_fp32_reduce=True,
+            is_zp_float=False,
+        )
+
+        if bias is not None:
+            out.add_(bias)
+        return out
+
+
+# adapted from
+# https://github.com/vllm-project/vllm/blob/aef368aa08572505b820db01da82e2fbb3d43a72/vllm/model_executor/layers/quantization/awq_marlin.py#L211-L212
+def is_awq_marlin_compatible(quantization_config: dict[str, Any]):
+    # Extract data from quant config.
+    quant_method = quantization_config.get("quant_method", "").lower()
+    num_bits = quantization_config.get("bits")
+    group_size = quantization_config.get("group_size")
+    zero_point = quantization_config.get("zero_point")
+
+    if not torch.cuda.is_available():
+        return False
+
+    if quant_method != "awq":
+        return False
+
+    # If we cannot find the info needed in the config, cannot convert.
+    if num_bits is None or group_size is None or zero_point is None:
+        return False
+
+    if num_bits not in TYPE_MAP:
+        return False
+
+    return check_marlin_supported(quant_type=TYPE_MAP[num_bits], group_size=group_size, has_zp=zero_point)
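
_process_weight_after_loading records the GEMM dimensions from the packed tensor (pack_factor = 8 because one int32 holds eight 4-bit values), and apply later passes them to gptq_marlin_gemm as size_k/size_n. A CPU-only sketch of just that bookkeeping with illustrative sizes; the real method additionally runs vllm_ops.awq_marlin_repack on the GPU tensor:

    import torch

    bits = 4
    pack_factor = 32 // bits  # 8, matching self.pack_factor above
    qweight = torch.zeros(4096, 11008 // pack_factor, dtype=torch.int32)  # illustrative AWQ layout

    size_k = qweight.shape[0]                # 4096  -> reduction dim of the GEMM
    size_n = qweight.shape[1] * pack_factor  # 11008 -> output dim of the GEMM
    assert (size_k, size_n) == (4096, 11008)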
