@@ -1,31 +1,39 @@
 import os
 import torch
-from .base_weight import BaseWeight
-from lightllm.utils.dist_utils import get_world_size, get_rank
 import threading
+from typing import Optional, Tuple, List, Dict, Any
+from .base_weight import BaseWeight
 from lightllm.common.quantization import vLLMFP8w8a8QuantizationMethod
-
+from lightllm.common.quantization.quantize_method import QuantizationMethod
+from lightllm.utils.dist_utils import get_world_size, get_rank
 from lightllm.common.vllm_kernel import _custom_ops as ops
 from lightllm.utils.device_utils import get_current_device_id
 
 
 class FusedMoeWeight(BaseWeight):
     def __init__(
         self,
-        gate_proj_name,
-        down_proj_name,
-        up_proj_name,
-        e_score_correction_bias_name,
-        weight_prefix,
-        n_routed_experts,
-        split_inter_size,
-        data_type,
-        network_config,
-    ):
+        gate_proj_name: str,
+        down_proj_name: str,
+        up_proj_name: str,
+        e_score_correction_bias_name: str,
+        weight_prefix: str,
+        n_routed_experts: int,
+        split_inter_size: int,
+        data_type: torch.dtype,
+        network_config: Dict[str, Any],
+        weight_scale_suffix: Optional[str] = None,
+        act_scale_suffix: Optional[str] = None,
+    ) -> None:
         super().__init__()
         self.w1_weight_name = gate_proj_name
         self.w2_weight_name = down_proj_name
         self.w3_weight_name = up_proj_name
+        self.weight_scale_suffix = weight_scale_suffix
+        self.act_scale_suffix = act_scale_suffix
+        self.quantized_weight = weight_scale_suffix is not None
+        self.static_activation = act_scale_suffix is not None
+
         self.e_score_correction_bias_name = e_score_correction_bias_name
         self.weight_prefix = weight_prefix
         self.n_routed_experts = n_routed_experts
@@ -34,15 +42,23 @@ def __init__(
         self.tp_rank_ = get_rank()
         self.experts_up_projs = [None] * self.n_routed_experts
         self.experts_gate_projs = [None] * self.n_routed_experts
+        self.experts_up_proj_scales = [None] * self.n_routed_experts
+        self.experts_gate_proj_scales = [None] * self.n_routed_experts
         self.expert_gate_up_proj_etp = None
         self.expert_down_proj_etp = None
         self.e_score_correction_bias = None
         self.w2_list = [None] * self.n_routed_experts
+        self.w2_scale_list = [None] * self.n_routed_experts
         self.quant_method = None
         self.scoring_func = network_config["scoring_func"]
+        self.w1 = [None, None]  # weight, weight_scale
+        self.w2 = [None, None]  # weight, weight_scale
         self.lock = threading.Lock()
 
-    def set_quant_method(self, quant_method):
+    def set_quant_method(self, quant_method: QuantizationMethod) -> None:
+        if self.quantized_weight:
+            self.quant_method = quant_method
+            return
         if isinstance(quant_method, vLLMFP8w8a8QuantizationMethod):
             self.quant_method = quant_method
             if self.quant_method is not None:
@@ -82,6 +98,8 @@ def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_t
         return
 
     def _fuse(self):
+        if self.quantized_weight:
+            self._fuse_weight_scale()
         with self.lock:
             if (
                 hasattr(self, "experts_up_projs")
@@ -98,21 +116,48 @@ def _fuse(self):
                     w1_list.append(expert_gate_up_proj)
 
                 inter_shape, hidden_size = w1_list[0].shape[0], w1_list[0].shape[1]
-                self.w1 = torch._utils._flatten_dense_tensors(w1_list).view(len(w1_list), inter_shape, hidden_size)
+                w1 = torch._utils._flatten_dense_tensors(w1_list).view(len(w1_list), inter_shape, hidden_size)
                 inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
-                self.w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(
-                    len(self.w2_list), inter_shape, hidden_size
-                )
-                if self.quant_method is not None:
-                    self.w1 = self.quant_method.quantize(self.w1)
-                    self.w2 = self.quant_method.quantize(self.w2)
+                w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
+                if not self.quantized_weight and self.quant_method is not None:
+                    self.w1 = self.quant_method.quantize(w1)
+                    self.w2 = self.quant_method.quantize(w2)
                 else:
-                    self.w1 = [self._cuda(self.w1), None]
-                    self.w2 = [self._cuda(self.w2), None]
+                    self.w1[0] = self._cuda(w1)
+                    self.w2[0] = self._cuda(w2)
                 delattr(self, "w2_list")
                 delattr(self, "experts_up_projs")
                 delattr(self, "experts_gate_projs")
 
+    def _fuse_weight_scale(self):
+        with self.lock:
+            if (
+                hasattr(self, "experts_up_proj_scales")
+                and None not in self.experts_up_proj_scales
+                and None not in self.experts_gate_proj_scales
+                and None not in self.w2_scale_list
+            ):
+                w1_scale_list = []
+                for i_experts in range(self.n_routed_experts):
+                    expert_gate_up_proj_scale = torch.cat(
+                        [self.experts_gate_proj_scales[i_experts], self.experts_up_proj_scales[i_experts]], dim=0
+                    )
+                    w1_scale_list.append(expert_gate_up_proj_scale)
+
+                inter_shape, hidden_size = w1_scale_list[0].shape[0], w1_scale_list[0].shape[1]
+                w1_scale = torch._utils._flatten_dense_tensors(w1_scale_list).view(
+                    len(w1_scale_list), inter_shape, hidden_size
+                )
+                inter_shape, hidden_size = self.w2_scale_list[0].shape[0], self.w2_scale_list[0].shape[1]
+                w2_scale = torch._utils._flatten_dense_tensors(self.w2_scale_list).view(
+                    len(self.w2_scale_list), inter_shape, hidden_size
+                )
+                self.w1[1] = self._cuda(w1_scale)
+                self.w2[1] = self._cuda(w2_scale)
+                delattr(self, "w2_scale_list")
+                delattr(self, "experts_up_proj_scales")
+                delattr(self, "experts_gate_proj_scales")
+
     def _load_hf_weights_etp(self, weights):
         world_size_ = get_world_size()
         assert self.n_routed_experts % world_size_ == 0
@@ -196,11 +241,51 @@ def load_hf_weights(self, weights):
                    self.w2_list[i_experts] = weights[w2_weight][
                        :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)
                    ]
-
+            if self.quant_method is not None:
+                self._load_weight_scale(weights)
             self._fuse()
 
+    def _load_weight_scale(self, weights: Dict[str, torch.Tensor]) -> None:
+        block_size = 1
+        if hasattr(self.quant_method, "block_size"):
+            block_size = self.quant_method.block_size
+        for i_experts in range(self.n_routed_experts):
+            w1_scale = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.{self.weight_scale_suffix}"
+            w2_scale = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.{self.weight_scale_suffix}"
+            w3_scale = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.{self.weight_scale_suffix}"
+            if w1_scale in weights:
+                self.experts_gate_proj_scales[i_experts] = weights[w1_scale][
+                    self.split_inter_size
+                    // block_size
+                    * self.tp_rank_ : self.split_inter_size
+                    // block_size
+                    * (self.tp_rank_ + 1),
+                    :,
+                ]
+            if w3_scale in weights:
+                self.experts_up_proj_scales[i_experts] = weights[w3_scale][
+                    self.split_inter_size
+                    // block_size
+                    * self.tp_rank_ : self.split_inter_size
+                    // block_size
+                    * (self.tp_rank_ + 1),
+                    :,
+                ]
+
+            if w2_scale in weights:
+                self.w2_scale_list[i_experts] = weights[w2_scale][
+                    :,
+                    self.split_inter_size
+                    // block_size
+                    * self.tp_rank_ : self.split_inter_size
+                    // block_size
+                    * (self.tp_rank_ + 1),
+                ]
+
     def _cuda(self, cpu_tensor):
         device_id = get_current_device_id()
+        if self.quantized_weight:
+            return cpu_tensor.contiguous().cuda(device_id)
         return cpu_tensor.contiguous().to(self.data_type_).cuda(device_id)
 
     def verify_load(self):
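
Reviewer note: below is a standalone sketch of the tensor-parallel slicing that the new _load_weight_scale applies to block-wise weight scales. Each rank takes split_inter_size // block_size rows of the scale tensor, mirroring the split_inter_size-row slice taken from the weight itself in load_hf_weights. All shapes, the block size, and the rank count are illustrative assumptions, not values taken from this PR.

import torch

# Illustrative values (assumptions, not from this PR): one expert's gate_proj
# weight of shape [inter_size, hidden_size], quantized block-wise with one
# scale per block_size x block_size tile.
inter_size, hidden_size, block_size = 1024, 512, 128
world_size = 2                               # tensor-parallel ranks
split_inter_size = inter_size // world_size  # weight rows owned by each rank

weight = torch.randn(inter_size, hidden_size)
weight_scale = torch.rand(inter_size // block_size, hidden_size // block_size)

for tp_rank in range(world_size):
    # Weight slice, as in load_hf_weights.
    w = weight[split_inter_size * tp_rank : split_inter_size * (tp_rank + 1), :]
    # Matching scale slice, as in _load_weight_scale: row indices are divided
    # by block_size so the scale rows line up with the weight rows above.
    s = weight_scale[
        split_inter_size // block_size * tp_rank : split_inter_size // block_size * (tp_rank + 1),
        :,
    ]
    assert w.shape[0] == s.shape[0] * block_size
    print(tp_rank, w.shape, s.shape)  # 0 torch.Size([512, 512]) torch.Size([4, 4])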