
Commit e02a362

fix quant
1 parent fc59674 commit e02a362

File tree: 11 files changed, +81 −34 lines changed


lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/colmm_weight.py

Lines changed: 4 additions & 2 deletions

@@ -11,7 +11,7 @@
 from .mm_slicer import ColSliceMixin, QuantizedRowSliceMixin, QuantizedColSliceMixin


-class UnquantizedCOLMMWeight(MMWeightTpl):
+class StandardCOLMMWeight(MMWeightTpl):
     def __init__(
         self,
         weight_names: Union[str, List[str]],
@@ -72,7 +72,9 @@ def __init__(
             tp_world_size=tp_world_size,
         )
         # Note: this is not a mistake, because AWQ weights are stored as in x out
-        self.param_slicer = QuantizedRowSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)
+        self.param_slicer = QuantizedRowSliceMixin(
+            tp_rank=tp_rank, tp_world_size=tp_world_size, bias_div_world_size=True
+        )


 class AWQMARLINCOLMMWeight(AWQCOLMMWeight):
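The surviving comment explains the apparent mismatch: AWQ stores its qweight as in x out rather than the usual out x in, so a column-parallel weight is sliced with a row slicer. A toy sketch of that shape reasoning (stand-in shapes, not LightLLM code):

import torch

# Toy shape check: an AWQ qweight is stored as [in, out], while the
# unquantized convention in this repo is [out, in]. Splitting a
# column-parallel weight therefore slices dim=0 of the AWQ tensor.
in_features, out_features, tp_world_size, tp_rank = 8, 4, 2, 0
awq_qweight = torch.arange(in_features * out_features).reshape(in_features, out_features)

# RowSliceMixin logic: split along dim=0, which for the AWQ layout is the input dim.
tp_size = awq_qweight.shape[0] // tp_world_size
shard = awq_qweight[tp_size * tp_rank : tp_size * (tp_rank + 1)]
assert shard.shape == (in_features // tp_world_size, out_features)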

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_factory.py

Lines changed: 12 additions & 6 deletions

@@ -6,12 +6,12 @@
     BMMWeightTpl,
 )
 from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.rowmm_weight import (
-    UnquantizedROWMMWeight,
+    StandardROWMMWeight,
     UnquantizedROWBMMWeight,
     ROWMM_WEIGHT_CLS_MAP,
 )
 from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.colmm_weight import (
-    UnquantizedCOLMMWeight,
+    StandardCOLMMWeight,
     COLMM_WEIGHT_CLS_MAP,
 )

@@ -61,9 +61,12 @@ class ROWMMWeight(MMWeight):
     @classmethod
     def _get_mmcls(cls, quant_method: QuantizationMethod):
         if quant_method is None:
-            return UnquantizedROWMMWeight
+            return StandardROWMMWeight

-        return ROWMM_WEIGHT_CLS_MAP[quant_method.method_name]
+        return ROWMM_WEIGHT_CLS_MAP.get(
+            quant_method.method_name,
+            StandardROWMMWeight,
+        )


 class ROWBMMWeight(MMWeight):
@@ -80,5 +83,8 @@ class COLMMWeight(MMWeight):
     @classmethod
     def _get_mmcls(cls, quant_method: QuantizationMethod):
         if quant_method is None:
-            return UnquantizedCOLMMWeight
-            return COLMM_WEIGHT_CLS_MAP[quant_method.method_name]
+            return StandardCOLMMWeight
+        return COLMM_WEIGHT_CLS_MAP.get(
+            quant_method.method_name,
+            StandardCOLMMWeight,
+        )
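The switch from direct indexing to dict.get changes behavior for quant methods with no specialized MM class registered: instead of raising KeyError, the factory now falls back to the standard implementation. A toy sketch of the fallback (stand-in classes, not LightLLM code):

# Toy illustration of the new fallback:
class StandardROWMMWeight: ...
class AWQROWMMWeight: ...

ROWMM_WEIGHT_CLS_MAP = {"awq": AWQROWMMWeight}

# Before this commit an unregistered method_name raised KeyError;
# now it falls back to the standard weight class.
mmcls = ROWMM_WEIGHT_CLS_MAP.get("some-new-quant-method", StandardROWMMWeight)
assert mmcls is StandardROWMMWeight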

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_slicer.py

Lines changed: 16 additions & 11 deletions

@@ -7,9 +7,10 @@
 class SliceMixinBase(ABC):
     """Base mixin class for slicing operations."""

-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, bias_div_world_size: bool = False):
         self.tp_rank_ = tp_rank if tp_rank is not None else get_current_rank_in_dp()
         self.tp_world_size_ = tp_world_size if tp_world_size is not None else get_dp_world_size()
+        self.bias_div_world_size_ = bias_div_world_size

     @abstractmethod
     def _slice_weight(self, weight: torch.Tensor):
@@ -21,8 +22,8 @@ def _slice_bias(self, bias):


 class SliceMixinTpl(SliceMixinBase):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, bias_div_world_size: bool = False):
+        super().__init__(tp_rank, tp_world_size, bias_div_world_size)

     def _slice_weight(self, weight: torch.Tensor) -> torch.Tensor:
         raise NotImplementedError("slice_weight must implement this method")
@@ -40,8 +41,8 @@ def _slice_weight_zero_point(self, weight_zero_point: torch.Tensor) -> torch.Ten
 # By default the weight shape is out x in, which is currently the most common convention,
 # so row-wise slicing is along dim=0 and col-wise slicing is along dim=1.
 class RowSliceMixin(SliceMixinTpl):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, bias_div_world_size: bool = False):
+        super().__init__(tp_rank, tp_world_size, bias_div_world_size)

     def _slice_weight(self, weight: torch.Tensor) -> torch.Tensor:
         assert weight.shape[0] % self.tp_world_size_ == 0, f"tp slice error {weight.shape[0]} % {self.tp_world_size_}"
@@ -51,14 +52,16 @@ def _slice_weight(self, weight: torch.Tensor) -> torch.Tensor:
     def _slice_bias(self, bias) -> torch.Tensor:
         assert bias.shape[0] % self.tp_world_size_ == 0, f"tp slice error {bias.shape[0]} % {self.tp_world_size_}"
         tp_size = bias.shape[0] // self.tp_world_size_
+        if self.bias_div_world_size_:
+            return bias[tp_size * self.tp_rank_ : tp_size * (self.tp_rank_ + 1)] / self.tp_world_size_
         return bias[tp_size * self.tp_rank_ : tp_size * (self.tp_rank_ + 1)]


 # The default quantized slicing assumes group-wise quantization, so weight_scale and weight_zero_point
 # have the same ndims as weight. Per-tensor and per-channel quantization can be added later as needed.
 class QuantizedRowSliceMixin(RowSliceMixin):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, bias_div_world_size: bool = False):
+        super().__init__(tp_rank, tp_world_size, bias_div_world_size)

     def _slice_weight_scale(self, weight_scale: torch.Tensor) -> torch.Tensor:
         assert (
@@ -80,8 +83,8 @@ def _slice_weight_zero_point(self, weight_zero_point: torch.Tensor) -> torch.Ten


 class ColSliceMixin(SliceMixinTpl):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, bias_div_world_size: bool = True):
+        super().__init__(tp_rank, tp_world_size, bias_div_world_size)

     def _slice_weight(self, weight: torch.Tensor) -> torch.Tensor:
         assert weight.shape[1] % self.tp_world_size_ == 0, f"tp slice error {weight.shape[1]} % {self.tp_world_size_}"
@@ -91,12 +94,14 @@ def _slice_weight(self, weight: torch.Tensor) -> torch.Tensor:
     def _slice_bias(self, bias) -> torch.Tensor:
         assert bias.shape[0] % self.tp_world_size_ == 0, f"tp slice error {bias.shape[0]} % {self.tp_world_size_}"
         tp_size = bias.shape[0] // self.tp_world_size_
+        if self.bias_div_world_size_:
+            return bias[tp_size * self.tp_rank_ : tp_size * (self.tp_rank_ + 1)] / self.tp_world_size_
         return bias[tp_size * self.tp_rank_ : tp_size * (self.tp_rank_ + 1)]


 class QuantizedColSliceMixin(ColSliceMixin):
-    def __init__(self, tp_rank: int = None, tp_world_size: int = None):
-        super().__init__(tp_rank, tp_world_size)
+    def __init__(self, tp_rank: int = None, tp_world_size: int = None, bias_div_world_size: bool = True):
+        super().__init__(tp_rank, tp_world_size, bias_div_world_size)

     def _slice_weight_scale(self, weight_scale: torch.Tensor) -> torch.Tensor:
         assert (
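The new bias_div_world_size flag appears to address the usual tensor-parallel bias accounting: when each rank computes a partial matmul over its slice of the reduction dimension and the results are summed by an all-reduce, a bias added in full on every rank would be counted tp_world_size times, so each rank contributes bias / tp_world_size instead. A toy numeric check of that invariant (plain tensors, no distributed setup; the real slicer also shards the bias, which this sketch skips):

import torch

tp_world_size = 2
x = torch.randn(3, 8)   # activations
w = torch.randn(8, 4)   # weight, in x out
b = torch.randn(4)      # bias
full = x @ w + b

# Split the reduction (input) dimension across ranks, as a col-wise weight split does.
k = x.shape[1] // tp_world_size
partials = []
for rank in range(tp_world_size):
    x_shard = x[:, k * rank : k * (rank + 1)]
    w_shard = w[k * rank : k * (rank + 1), :]
    partials.append(x_shard @ w_shard + b / tp_world_size)  # bias_div_world_size=True

assert torch.allclose(sum(partials), full, atol=1e-5)  # sum(...) plays the all-reduce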

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/mm_weight.py

Lines changed: 13 additions & 13 deletions

@@ -68,8 +68,6 @@ def __init__(
         quant_method: QuantizationMethod = None,
         tp_rank: int = None,
         tp_world_size: int = None,
-        has_weight_scale: bool = False,
-        has_weight_zero_point: bool = False,
     ) -> None:
         super().__init__(tp_rank, tp_world_size, data_type)
         self.lock = threading.Lock()
@@ -84,13 +82,20 @@ def __init__(
         if bias_names[0] is None:
             bias_names = None

+        if quant_method is not None:
+            has_weight_scale = quant_method.has_weight_scale
+            has_weight_zero_point = quant_method.has_weight_zero_point
+        else:
+            has_weight_scale = False
+            has_weight_zero_point = False
+
         # Both weight_names and quanted_weight_names exist to support the online and offline loading schemes
         self.weight_names = weight_names

         self.bias_names = bias_names
         has_bias = self.bias_names is not None

-        self.gen_weight_quant_param_names(quant_method=quant_method, has_weight_zero_point=has_weight_zero_point)
+        self.gen_weight_quant_param_names(quant_method=quant_method)
         self.quant_method = quant_method
         self.sub_child_mm_params: List[MMWeightPack] = [
             MMWeightPack(
@@ -132,7 +137,7 @@ def mm(
             return torch.mm(input_tensor, self.mm_param.weight, out=out)
         return torch.addmm(self.mm_param.bias, input_tensor, self.mm_param.weight, out=out)

-    def gen_weight_quant_param_names(self, quant_method: QuantizationMethod, has_weight_zero_point: bool):
+    def gen_weight_quant_param_names(self, quant_method: Optional[QuantizationMethod]):
         if quant_method is None:
             self.quanted_weight_names = None
             self.weight_zero_point_names = None
@@ -144,11 +149,10 @@ def gen_weight_quant_param_names(self, quant_method: QuantizationMethod, has_wei
         weight_zero_point_names = []

         for weight_name in self.weight_names:
-            assert quant_method.weight_scale_suffix is not None, "weight_scale_suffix is not set"
-            weight_scale_name = weight_name.replace("weight", quant_method.weight_scale_suffix)
-            weight_scale_names.append(weight_scale_name)
-            if has_weight_zero_point:
-                assert quant_method.weight_zero_point_suffix is not None, "weight_zero_point_suffix is not set"
+            if quant_method.weight_scale_suffix is not None:
+                weight_scale_name = weight_name.replace("weight", quant_method.weight_scale_suffix)
+                weight_scale_names.append(weight_scale_name)
+            if quant_method.weight_zero_point_suffix is not None:
                 weight_zero_point_name = weight_name.replace("weight", quant_method.weight_zero_point_suffix)
                 weight_zero_point_names.append(weight_zero_point_name)
             if quant_method.weight_suffix is not None:
@@ -410,8 +414,6 @@ def __init__(
             quant_method=quant_method,
             tp_rank=tp_rank,
             tp_world_size=tp_world_size,
-            has_weight_scale=True,
-            has_weight_zero_point=False,
         )

     def _to_gpu_device(self) -> None:
@@ -445,8 +447,6 @@ def __init__(
             quant_method=quant_method,
             tp_rank=tp_rank,
             tp_world_size=tp_world_size,
-            has_weight_scale=True,
-            has_weight_zero_point=True,
        )
         self.weight_fused_dim = 1
         self.bias_fused_dim = 0
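With the flags gone from the constructor, the auxiliary tensor names are now derived purely from the suffixes each quant method declares. A minimal sketch of that naming rule (hypothetical helper and example names):

# Suffix-driven naming, as gen_weight_quant_param_names now does it:
def quant_param_names(weight_name, weight_scale_suffix, weight_zero_point_suffix):
    names = {}
    if weight_scale_suffix is not None:
        names["scale"] = weight_name.replace("weight", weight_scale_suffix)
    if weight_zero_point_suffix is not None:
        names["zero_point"] = weight_name.replace("weight", weight_zero_point_suffix)
    return names

# e.g. with the AWQ suffixes from awq_quant.py ("scales", "qzeros"):
print(quant_param_names("model.layers.0.q_proj.weight", "scales", "qzeros"))
# {'scale': 'model.layers.0.q_proj.scales', 'zero_point': 'model.layers.0.q_proj.qzeros'}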

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py

Lines changed: 4 additions & 2 deletions

@@ -12,7 +12,7 @@
 from .mm_slicer import RowSliceMixin, QuantizedRowSliceMixin, QuantizedColSliceMixin


-class UnquantizedROWMMWeight(MMWeightTpl):
+class StandardROWMMWeight(MMWeightTpl):
     def __init__(
         self,
         weight_names: Union[str, List[str]],
@@ -95,7 +95,9 @@ def __init__(
             tp_world_size=tp_world_size,
         )
         # Note: this is not a mistake, because AWQ weights are stored as in x out
-        self.param_slicer = QuantizedColSliceMixin(tp_rank=tp_rank, tp_world_size=tp_world_size)
+        self.param_slicer = QuantizedColSliceMixin(
+            tp_rank=tp_rank, tp_world_size=tp_world_size, bias_div_world_size=False
+        )


 class AWQMARLINROWMMWeight(AWQROWMMWeight):

lightllm/common/quantization/awq_quant.py

Lines changed: 4 additions & 0 deletions

@@ -65,6 +65,8 @@ def __init__(self):
         self.weight_scale_suffix = "scales"
         self.weight_zero_point_suffix = "qzeros"
         self.weight_suffix = "qweight"
+        self.has_weight_scale = True
+        self.has_weight_zero_point = True

     @property
     def method_name(self):
@@ -111,6 +113,8 @@ def __init__(self):
         self.g_idx_sort_indices = marlin_make_empty_g_idx(torch.device("cuda"))
         self.workspace = marlin_make_workspace_new(torch.device("cuda"))
         self.vllm_quant_type = TYPE_MAP[self.nbits]
+        self.has_weight_scale = True
+        self.has_weight_zero_point = True

     @property
     def method_name(self):

lightllm/common/quantization/deepgemm_quant.py

Lines changed: 2 additions & 0 deletions

@@ -53,6 +53,8 @@ def __init__(self):
         self.weight_suffix = None
         self.weight_zero_point_suffix = None
         self.weight_scale_suffix = "weight_scale_inv"
+        self.has_weight_scale = True
+        self.has_weight_zero_point = False

     @property
     def method_name(self):

lightllm/common/quantization/quantize_method.py

Lines changed: 2 additions & 0 deletions

@@ -15,6 +15,8 @@ def __init__(self):
         self.weight_scale_suffix = None
         self.weight_zero_point_suffix = None
         self.act_scale_suffix = None
+        self.has_weight_scale: bool = None
+        self.has_weight_zero_point: bool = None
         # Extra quantization parameters required by some quantization modes, e.g. AWQ
         self.hf_quantization_config = None
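(The committed lines read `self.has_weight_scale: bool = (None,)`, a stray-trailing-comma bug that assigns a tuple; the intent from every subclass in this commit is plainly `None`, as shown above.) The base class defaults both flags to None so that a method that forgets to declare them surfaces as None rather than silently reading as False. A minimal sketch of the contract a subclass is expected to satisfy (modeled on deepgemm_quant.py above; class name and import path are assumptions):

from lightllm.common.quantization.quantize_method import QuantizationMethod  # assumed import path

class ExampleFp8BlockQuantizationMethod(QuantizationMethod):  # hypothetical subclass
    def __init__(self):
        super().__init__()
        self.weight_suffix = None                      # weights keep their original name
        self.weight_zero_point_suffix = None           # symmetric scheme: no zero point
        self.weight_scale_suffix = "weight_scale_inv"  # offline checkpoints store inverse scales
        self.has_weight_scale = True
        self.has_weight_zero_point = False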

lightllm/common/quantization/torchao_quant.py

Lines changed: 16 additions & 0 deletions

@@ -64,6 +64,8 @@ def __init__(self):
         super().__init__()
         self.group_size = 256
         self.quant_func = int4_weight_only(group_size=self.group_size)
+        self.has_weight_scale = False
+        self.has_weight_zero_point = False

     @property
     def method_name(self):
@@ -76,6 +78,8 @@ def __init__(self):
         super().__init__()
         self.group_size = 128
         self.quant_func = int4_weight_only(group_size=self.group_size)
+        self.has_weight_scale = False
+        self.has_weight_zero_point = False

     @property
     def method_name(self):
@@ -88,6 +92,8 @@ def __init__(self):
         super().__init__()
         self.group_size = 64
         self.quant_func = int4_weight_only(group_size=self.group_size)
+        self.has_weight_scale = False
+        self.has_weight_zero_point = False

     @property
     def method_name(self):
@@ -100,6 +106,8 @@ def __init__(self):
         super().__init__()
         self.group_size = 32
         self.quant_func = int4_weight_only(group_size=self.group_size)
+        self.has_weight_scale = False
+        self.has_weight_zero_point = False

     @property
     def method_name(self):
@@ -111,6 +119,8 @@ class AOW8A8QuantizationMethod(AOBaseQuantizationMethod):
     def __init__(self):
         super().__init__()
         self.quant_func = int8_dynamic_activation_int8_weight()
+        self.has_weight_scale = False
+        self.has_weight_zero_point = False

     @property
     def method_name(self):
@@ -122,6 +132,8 @@ class AOW8A16QuantizationMethod(AOBaseQuantizationMethod):
     def __init__(self):
         super().__init__()
         self.quant_func = int8_weight_only()
+        self.has_weight_scale = False
+        self.has_weight_zero_point = False

     @property
     def method_name(self):
@@ -135,6 +147,8 @@ def __init__(self):
         is_cuda_8_9 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)
         assert is_cuda_8_9, "FP8 requires GPU with compute capability >= 8.9"
         self.quant_func = float8_weight_only()
+        self.has_weight_scale = False
+        self.has_weight_zero_point = False

     @property
     def method_name(self):
@@ -147,6 +161,8 @@ def __init__(self):
         super().__init__()
         assert TORCH_VERSION_AT_LEAST_2_5, "torchao fp6 requires torch >=2.5"
         self.quant_func = fpx_weight_only(3, 2)
+        self.has_weight_scale = False
+        self.has_weight_zero_point = False

     @property
     def method_name(self):
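All of the torchao methods set both flags to False, presumably because torchao quantizes weights in-process and keeps scales inside the quantized tensor subclass, so the loader never needs separate scale or zero-point checkpoint entries. An illustrative use of the public torchao API (not LightLLM's loading path; exact entry points vary by torchao version):

import torch
from torchao.quantization import quantize_, int8_weight_only

linear = torch.nn.Linear(128, 128, dtype=torch.bfloat16, device="cuda")
quantize_(linear, int8_weight_only())
# The scale now lives inside the quantized weight tensor subclass itself, so
# there is no separate "*.scales" checkpoint entry to load: has_weight_scale=False.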

lightllm/common/quantization/triton_quant/triton_quant.py

Lines changed: 2 additions & 0 deletions

@@ -41,6 +41,8 @@ def __init__(self):
         self.weight_suffix = None
         self.weight_zero_point_suffix = None
         self.weight_scale_suffix = "weight_scale_inv"
+        self.has_weight_scale = True
+        self.has_weight_zero_point = False

     def quantize(self, weight: torch.Tensor):
         # TODO block-wise quant kernel
