
Commit 89c248f

refactor deepseekv2
1 parent 7c305e4 commit 89c248f

File tree

7 files changed: +273 -227 lines changed


lightllm/common/basemodel/layer_weights/meta_weights/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -7,8 +7,9 @@
     COLMMWeight,
     MultiROWMMWeight,
     MultiROWMMWeightNoTP,
-    CustomMMWeight,
-    CustomBMMWeight,
+    MultiCOLMMWeight,
+    ROWBMMWeight,
+    COLBMMWeight,
 )
 from .norm_weight import NormWeight, GEMMANormWeight, TpNormWeight
 from .fused_moe_weight import FusedMoeWeight

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight.py

Lines changed: 25 additions & 27 deletions
@@ -2,15 +2,22 @@
 from .base_weight import BaseWeight
 from lightllm.utils.dist_utils import get_world_size, get_rank
 import threading
-from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.fused_moe import fused_experts
+from lightllm.common.quantization import vLLMFP8w8a8QuantizationMethod
+
+try:
+    HAS_VLLM = True
+    from vllm.model_executor.layers.fused_moe import FusedMoE
+    from vllm.model_executor.layers.fused_moe import fused_experts
+except:
+    HAS_VLLM = False
 
 
 class FusedMoeWeight(BaseWeight):
     def __init__(
         self, gate_proj_name, down_proj_name, up_proj_name, weight_prefix, n_routed_experts, split_inter_size, data_type
     ):
         super().__init__()
+        assert HAS_VLLM, "vllm is not installed, you can't use FusedMoeWeight"
         self.w1_weight_name = gate_proj_name
         self.w2_weight_name = down_proj_name
         self.w3_weight_name = up_proj_name
@@ -26,9 +33,10 @@ def __init__(
         self.lock = threading.Lock()
 
     def set_quant_method(self, quant_method):
-        self.quant_method = quant_method
-        if self.quant_method is not None:
-            self.quant_method.is_moe = True
+        if isinstance(self.quant_method, vLLMFP8w8a8QuantizationMethod):
+            self.quant_method = quant_method
+            if self.quant_method is not None:
+                self.quant_method.is_moe = True
 
     def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_topk, topk_group, num_expert_group):
         topk_weights, topk_ids = FusedMoE.select_experts(
@@ -40,32 +48,22 @@ def experts(self, input_tensor, router_logits, top_k, renormalize, use_grouped_t
             topk_group=topk_group,
             num_expert_group=num_expert_group,
         )
-        if self.quant_method is not None:
-            fused_experts(
-                input_tensor,
-                w1=self.w1[0],
-                w2=self.w2[0],
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=False,
-                use_fp8_w8a8=True,
-                use_int8_w8a16=False,
-                w1_scale=self.w1[1],
-                w2_scale=self.w2[1],
-                a1_scale=None,
-                a2_scale=None,
-            )
-            return
+        w1, w1_scale = self.w1
+        w2, w2_scale = self.w2
+        use_fp8_w8a8 = self.quant_method is not None
         fused_experts(
             hidden_states=input_tensor,
-            w1=self.w1,
-            w2=self.w2,
+            w1=w1,
+            w2=w2,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             inplace=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
         )
 
-    def fuse(self):
+    def _fuse(self):
         with self.lock:
             if (
                 hasattr(self, "experts_up_projs")
@@ -91,8 +89,8 @@ def fuse(self):
                 self.w1 = self.quant_method.quantize(self.w1)
                 self.w2 = self.quant_method.quantize(self.w2)
             else:
-                self.w1 = self._cuda(self.w1)
-                self.w2 = self._cuda(self.w2)
+                self.w1 = [self._cuda(self.w1), None]
+                self.w2 = [self._cuda(self.w2), None]
             delattr(self, "w2_list")
             delattr(self, "experts_up_projs")
             delattr(self, "experts_gate_projs")
@@ -117,7 +115,7 @@ def load_hf_weights(self, weights):
                 :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)
             ]
 
-        self.fuse()
+        self._fuse()
 
     def _cuda(self, cpu_tensor):
         if self.tp_rank_ is None:
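After this refactor, self.w1 and self.w2 always hold a two-element [tensor, scale] pair: _fuse() pads the unquantized path with None, so experts() unpacks both paths uniformly and drives a single fused_experts call through the use_fp8_w8a8 flag instead of duplicating it. A minimal sketch of that convention with a dummy tensor; pack_weight is a hypothetical stand-in for the tail of _fuse(), not code from this commit:

    import torch

    def pack_weight(weight, quant_method=None):
        # Quantized branch: quant_method.quantize is assumed to return a
        # [qweight, scale] pair, matching what experts() unpacks.
        if quant_method is not None:
            return quant_method.quantize(weight)
        # Unquantized branch: pad with None so unpacking stays uniform.
        return [weight.cuda(), None]

    w1 = pack_weight(torch.randn(8, 256, 128))  # no quant method configured
    weight, scale = w1                          # uniform unpacking, as in experts()
    use_fp8_w8a8 = scale is not None            # experts() derives this from quant_method; same result here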

lightllm/common/basemodel/layer_weights/meta_weights/mm_weight.py

Lines changed: 90 additions & 77 deletions
@@ -4,11 +4,9 @@
 
 
 class MMWeightTpl(BaseWeightTpl):
-    def __init__(self, data_type, split_n_embed):
+    def __init__(self, data_type):
         super().__init__()
         self.data_type_ = data_type
-        self.start = split_n_embed * self.tp_rank_
-        self.end = split_n_embed * (self.tp_rank_ + 1)
         self.quant_method = None
         self.weight = None
         self.bias = None
@@ -40,7 +38,9 @@ def _post_load_weights(self):
 
 class MMWeight(MMWeightTpl):
     def __init__(self, weight_name, data_type, split_n_embed, bias_name=None):
-        super().__init__(data_type, split_n_embed)
+        super().__init__(data_type)
+        self.start = split_n_embed * self.tp_rank_
+        self.end = split_n_embed * (self.tp_rank_ + 1)
         self.weight_name = weight_name
         self.bias_name = bias_name
 
@@ -72,7 +72,7 @@ def load_hf_weights(self, weights):
         return
 
 
-class ROWMMWeightNoTP(MMWeight):
+class ROWMMWeightNoTP(ROWMMWeight):
     def __init__(self, weight_name, data_type, split_n_embed, bias_name=None):
         super().__init__(weight_name, data_type, split_n_embed, bias_name)
         self.start = 0
@@ -98,13 +98,20 @@ def load_hf_weights(self, weights):
 
 
 class MultiMMWeight(MMWeightTpl):
-    def __init__(self, weight_names, data_type, split_n_embed, bias_names=None):
-        super().__init__(data_type, split_n_embed)
+    def __init__(self, weight_names, data_type, split_n_embeds, bias_names=[]):
+        super().__init__(data_type)
+        if isinstance(split_n_embeds, int):
+            self.split_n_embeds = [split_n_embeds] * len(weight_names)
+        else:
+            self.split_n_embeds = split_n_embeds
+
+        self.starts = [i * self.tp_rank_ for i in self.split_n_embeds]
+        self.ends = [i * (self.tp_rank_ + 1) for i in self.split_n_embeds]
         self.weight_names = weight_names
         self.bias_names = bias_names
         self.weights = [None] * len(self.weight_names)
         self.biases = [None] * len(self.bias_names)
-        self.has_bias = all(b is not None for b in self.bias_names)
+        self.has_bias = all(b is not None for b in self.bias_names) and len(bias_names) > 0
 
     def verify_load(self):
         load_ok = True
@@ -117,7 +124,7 @@ def verify_load(self):
 
 
 class MultiROWMMWeight(MultiMMWeight):
-    def __init__(self, weight_names, data_type, split_n_embed, bias_names=None):
+    def __init__(self, weight_names, data_type, split_n_embed, bias_names=[]):
         super().__init__(weight_names, data_type, split_n_embed, bias_names)
 
     def _fuse(self):
@@ -134,86 +141,48 @@ def load_hf_weights(self, weights):
         for i in range(len(self.weight_names)):
             if self.weight_names[i] in weights:
                 weight = weights[self.weight_names[i]].to(self.data_type_)
-                self.weights[i] = weight[self.start : self.end]
+                self.weights[i] = weight[self.starts[i] : self.ends[i]]
             if self.has_bias and self.bias_names[i] in weights:
                 bias = weights[self.bias_names[i]].to(self.data_type_)
-                self.biases[i] = bias[self.start : self.end]
+                self.biases[i] = bias[self.starts[i] : self.ends[i]]
         self._fuse()
         return
 
 
 class MultiROWMMWeightNoTP(MultiROWMMWeight):
-    def __init__(self, weight_names, data_type, split_n_embed, bias_names=None):
+    def __init__(self, weight_names, data_type, split_n_embed, bias_names=[]):
         super().__init__(weight_names, data_type, split_n_embed, bias_names)
-        self.start = 0
-        self.end = split_n_embed
+        self.starts = [0 for i in self.split_n_embeds]
+        self.ends = [i for i in self.split_n_embeds]
 
 
-class CustomMMWeight(ROWMMWeight):
-    def __init__(
-        self,
-        weight_name,
-        data_type,
-        split_n_embed,
-        bias_name=None,
-        wait_fuse=False,
-        disable_tp=False,
-        custom_load=None,
-        custom_fuse=None,
-    ):
-        super().__init__(weight_name, data_type, split_n_embed, bias_name, wait_fuse=wait_fuse, disable_tp=disable_tp)
-        self.custom_load = custom_load
-        self.custom_fuse = custom_fuse
-
-    def fuse(self, B, op=None):
-        if self.custom_fuse is None:
-            super().fuse(B, op)
-        else:
-            weight = self.custom_fuse(self, B)
-            self.post_load_weights(weight)
+class MultiCOLMMWeight(MultiROWMMWeight):
+    def __init__(self, weight_names, data_type, split_n_embed, bias_names=[]):
+        super().__init__(weight_names, data_type, split_n_embed, bias_names)
 
     def load_hf_weights(self, weights):
-        if self.custom_load is None:
-            super().load_hf_weights(weights)
-        else:
-            weight = None
-            if self.weight_name in weights:
-                weight = self.custom_load(self, self.pre_load_weights(weights[self.weight_name]))
-            if weight is None:
-                return
-            if self.wait_fuse:
-                self.weight = weight
-                return
-            self.post_load_weights(weight)
+        weight = None
+        for i in range(len(self.weight_names)):
+            if self.weight_names[i] in weights:
+                weight = weights[self.weight_names[i]].to(self.data_type_)
+                self.weights[i] = weight[:, self.starts[i] : self.ends[i]]
+            if self.has_bias and self.bias_names[i] in weights:
+                bias = weights[self.bias_names[i]].to(self.data_type_)
+                self.biases[i] = bias[:, self.starts[i] : self.ends[i]]
+        self._fuse()
         return
 
 
-class CustomBMMWeight(CustomMMWeight):
-    def __init__(
-        self,
-        weight_name,
-        data_type,
-        split_n_embed,
-        bias_name=None,
-        wait_fuse=False,
-        disable_tp=False,
-        custom_load=None,
-        custom_fuse=None,
-    ):
-        super().__init__(
-            weight_name,
-            data_type,
-            split_n_embed,
-            bias_name,
-            wait_fuse=wait_fuse,
-            disable_tp=disable_tp,
-            custom_load=custom_load,
-            custom_fuse=custom_fuse,
-        )
+class BMMWeightTpl(BaseWeightTpl):
+    def __init__(self, data_type):
+        super().__init__()
+        self.data_type_ = data_type
+        self.quant_method = None
+        self.weight = None
+        self.bias = None
 
     def set_quant_method(self, quant_method):
-        return
-        raise NotImplementedError("BMM does not currently support quantification")
+        self.quant_method = None
 
     def bmm(self, input_tensor, out=None, use_custom_tensor_mananger=True):
         if self.quant_method is not None:
@@ -230,8 +199,52 @@ def bmm(self, input_tensor, out=None, use_custom_tensor_mananger=True):
             return torch.bmm(input_tensor, self.weight, out=out)
         return torch.addbmm(self.bias, input_tensor, self.weight, out=out)
 
-    def post_load_weights(self, weight):
-        if self.quant_method is not None:
-            self.weight = self.quant_method.quantize(weight.cuda(self.tp_rank_))
-            return
-        self.weight = weight.cuda(self.tp_rank_)
+    def _post_load_weights(self):
+        self.weight = self.weight.cuda(self.tp_rank_)
+
+
+class BMMWeight(BMMWeightTpl):
+    def __init__(self, weight_name, data_type, split_n_embed, bias_name=None):
+        super().__init__(data_type)
+        self.start = split_n_embed * self.tp_rank_
+        self.end = split_n_embed * (self.tp_rank_ + 1)
+        self.weight_name = weight_name
+        self.bias_name = bias_name
+
+    def verify_load(self):
+        load_ok = True
+        # Verify weight. The weight must be not None.
+        load_ok = load_ok and self.weight is not None
+        # Verify bias. If bias_name is set, it must be not None.
+        if self.bias_name is not None:
+            load_ok = load_ok and self.bias is not None
+        return load_ok
+
+
+class ROWBMMWeight(BMMWeight):
+    load_hf_weights = ROWMMWeight.load_hf_weights
+
+    def __init__(
+        self,
+        weight_name,
+        data_type,
+        split_n_embed,
+        bias_name=None,
+    ):
+        super().__init__(weight_name, data_type, split_n_embed, bias_name)
+
+
+class COLBMMWeight(BMMWeight):
+    load_hf_weights = COLMMWeight.load_hf_weights
+
+    def __init__(
+        self,
+        weight_name,
+        data_type,
+        split_n_embed,
+        bias_name=None,
+    ):
+        super().__init__(weight_name, data_type, split_n_embed, bias_name)
+
+    def _post_load_weights(self):
+        self.weight = self.weight.transpose(0, 1).cuda(self.tp_rank_)
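The ROW/COL naming tracks which dimension of the checkpoint tensor each tensor-parallel rank keeps: ROWBMMWeight reuses ROWMMWeight.load_hf_weights (a slice along dim 0), while COLBMMWeight reuses COLMMWeight.load_hf_weights (a slice along dim 1) and then transposes in _post_load_weights so torch.bmm sees the shard in the same orientation. A minimal sketch of the two slicing conventions on a dummy 2-D tensor; the shapes and variable names are illustrative, not taken from the commit:

    import torch

    tp_rank, split_n_embed = 1, 64
    full = torch.randn(256, 256)    # stand-in for a checkpoint tensor

    start = split_n_embed * tp_rank
    end = split_n_embed * (tp_rank + 1)

    row_shard = full[start:end]     # ROW-style slice, as in ROWMMWeight.load_hf_weights
    col_shard = full[:, start:end]  # COL-style slice, as in COLMMWeight.load_hf_weights

    # COLBMMWeight._post_load_weights then transposes the column shard so the
    # batched matmul receives it in the same orientation as a row shard.
    assert col_shard.transpose(0, 1).shape == row_shard.shape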

lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 
 
 class NormWeight(BaseWeightTpl):
-    def __init__(self, weight_name, data_type, bias_name):
+    def __init__(self, weight_name, data_type, bias_name=None):
         super().__init__()
         self.weight_name = weight_name
         self.bias_name = bias_name
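With bias_name now defaulting to None, bias-free norm layers (the common case for RMSNorm-style weights such as DeepSeek-V2's) can be declared without an explicit placeholder. A hypothetical instantiation; the weight name is illustrative:

    import torch
    from lightllm.common.basemodel.layer_weights.meta_weights import NormWeight

    # The third positional argument (bias_name) is no longer required.
    norm = NormWeight("model.layers.0.input_layernorm.weight", torch.float16)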
