
Commit 837576a

Merge pull request #209 from ModelTC/dev_fixbug
Fix gptq bug
2 parents: ee61b0f + e58bfb4

File tree

3 files changed: +36 −14 lines changed


llmc/compression/quantization/base_blockwise_quantization.py

Lines changed: 7 additions & 7 deletions
@@ -109,6 +109,7 @@ def get_replacement_params(self, mode='fake_quant', w_only=False, name=None):
                 'softmax_a_qdq': partial(self.a_qdq, aquantizer=self.aquantizer)
                 if self.quant_softmax else None
             }
+
         elif mode == 'quant_act_fn':
             params_dict = {
                 'a_qdq': partial(self.a_qdq, aquantizer=self.aquantizer)
@@ -324,7 +325,6 @@ def replace_attention(self, block, extra_modules):
     def replace_moe_gate(self, block):
         moe_gate_layer = self.model.get_moe_gate(block)
         if moe_gate_layer is not None:
-            logger.info(moe_gate_layer)
             moe_gate_module = _LLMC_MOE_GATE_MAP_[self.config['model']['type']]
             layers_dict = {'layers': moe_gate_layer}
             self.model.replace_module_subset(
@@ -333,7 +333,7 @@ def replace_moe_gate(self, block):
                 layers_dict,
                 self.block_idx,
                 self.get_replacement_params(
-                    mode='quant_moegate', w_only=self.w_only, name=None
+                    mode=None, w_only=self.w_only, name=None
                 ),
             )

@@ -554,13 +554,13 @@ def register_act_qparams(self, layers_dict, act_tensors):
             ):
                 scales = scales.cuda()
                 dist.all_reduce(scales, op=dist.ReduceOp.SUM)
-                scales = (scales / world_size).cpu()
+                scales = (scales / world_size)

             for name, layer in layers_dict.items():
                 layer.register_buffer(f'buf_act_scales_{i}', scales)
-                layer.register_buffer(f'buf_act_zeros_{i}', zeros)
-                layer.register_buffer(f'buf_act_qmin_{i}', qmin)
-                layer.register_buffer(f'buf_act_qmax_{i}', qmax)
+                layer.register_buffer(f'buf_act_zeros_{i}', zeros.cuda())
+                layer.register_buffer(f'buf_act_qmin_{i}', qmin.cuda())
+                layer.register_buffer(f'buf_act_qmax_{i}', qmax.cuda())

     @torch.no_grad()
     def apply_scale(self, scales, prev_op, layers):
@@ -808,7 +808,7 @@ def deploy(self, quant_format, keep_device=False):
             self.get_replacement_params(mode=quant_format, w_only=self.w_only),
             keep_device=keep_device
         )
-        self.set_non_linear_mode(quant_format, self.model.model, False)
+        self.set_non_linear_mode(quant_format, self.model.model, False)

         if self.model.vlm_model is not None:
             logger.info(f'Now, the vlm_model is: {self.model.vlm_model}')
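
For context on the register_act_qparams change above: the averaged scales now stay on the GPU after the all_reduce, and zeros/qmin/qmax are moved to CUDA before being registered, so every buf_act_* buffer ends up on the same device. Below is a minimal sketch of that pattern, assuming a CUDA device is available; the helper name and the stand-in nn.Linear with made-up qparams are illustrative, not the repository's API.

```python
import torch
import torch.distributed as dist
from torch import nn


def register_act_qparams_on_gpu(layer, i, scales, zeros, qmin, qmax):
    """Average activation scales across ranks (if running distributed) and
    register all activation qparams as CUDA buffers on the target layer."""
    scales = scales.cuda()
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(scales, op=dist.ReduceOp.SUM)
        scales = scales / dist.get_world_size()  # no .cpu(): keep on GPU

    layer.register_buffer(f'buf_act_scales_{i}', scales)
    # Move the remaining qparams to the same device before registering.
    layer.register_buffer(f'buf_act_zeros_{i}', zeros.cuda())
    layer.register_buffer(f'buf_act_qmin_{i}', qmin.cuda())
    layer.register_buffer(f'buf_act_qmax_{i}', qmax.cuda())


# Hypothetical usage with made-up qparams:
layer = nn.Linear(16, 16)
register_act_qparams_on_gpu(
    layer, 0,
    scales=torch.ones(1), zeros=torch.zeros(1),
    qmin=torch.tensor(-128.0), qmax=torch.tensor(127.0),
)
```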

llmc/compression/quantization/gptq.py

Lines changed: 1 addition & 0 deletions
@@ -308,6 +308,7 @@ def collect_model_qparams(self):
         for i in range(len(self.blocks)):
             block = self.blocks[i]
             block = block.cuda()
+            self.replace_moe_gate(block)
             self.collect_block_qparams(block)
             block = block.cpu()
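
The gptq.py change only reorders work: each block's MoE gate is swapped for its LLMC wrapper before the block's quant params are collected, so GPTQ sees the wrapped gate rather than the original module. A rough sketch of that calling order with placeholder stand-ins (none of these classes or helpers are the repository's actual implementations):

```python
import torch
from torch import nn


class ToyBlock(nn.Module):
    """Placeholder transformer block with a toy MoE gate."""
    def __init__(self):
        super().__init__()
        self.gate = nn.Linear(8, 4, bias=False)
        self.proj = nn.Linear(8, 8)


def replace_moe_gate(block):
    # Stand-in for swapping the gate for its LLMC module.
    block.gate = nn.Sequential(block.gate)


def collect_block_qparams(block):
    # Stand-in: the real code records quant params from the block's modules,
    # which must already be in their replaced (LLMC) form.
    return [name for name, _ in block.named_modules()]


blocks = [ToyBlock() for _ in range(2)]
for block in blocks:
    if torch.cuda.is_available():
        block = block.cuda()
    replace_moe_gate(block)        # wrap the gate first ...
    collect_block_qparams(block)   # ... then collect qparams from the wrapped block
    block = block.cpu()
```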

llmc/compression/quantization/module_utils.py

Lines changed: 28 additions & 7 deletions
@@ -818,7 +818,7 @@ def __init__(self, weight, bias, ori_module, w_qdq, a_qdq):
         self.dynamic_quant_weight = False
         self.dynamic_quant_tmp_weight = False

-    def forward(self, x):
+    def forward(self, x, dtype=None):
         if hasattr(self, 'buf_rotate') and self.buf_rotate:
             x = self.rotater.rotate(x)

@@ -837,10 +837,20 @@ def forward(self, x):
         elif self.dynamic_quant_tmp_weight:
             self.tmp_weight = self.w_qdq(self)

+        org_dtype = self.tmp_weight.data.dtype
+        if dtype is not None:
+            self.convert_dtype(dtype)
+
         x = torch.functional.F.linear(x, self.tmp_weight, self.tmp_bias)

+        self.convert_dtype(org_dtype)
         return x

+    def convert_dtype(self, dtype):
+        self.tmp_weight.data = self.tmp_weight.data.to(dtype)
+        if self.bias is not None:
+            self.bias.data = self.bias.data.to(dtype)
+
     @classmethod
     @torch.no_grad()
     def new(cls, module, w_qdq, a_qdq):
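
The new dtype argument lets a caller run a single forward pass at a different precision: the weight (and bias) are cast before the matmul and restored to their original dtype afterwards. A standalone sketch of that cast-compute-restore pattern on a plain nn.Linear; the class and method names are illustrative, not the repository's:

```python
import torch
import torch.nn.functional as F
from torch import nn


class DtypeAwareLinear(nn.Module):
    """Linear layer that can temporarily run one forward pass in another dtype."""

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.inner = nn.Linear(in_features, out_features, bias=bias)

    def _convert_dtype(self, dtype):
        # Cast parameters in place, mirroring the convert_dtype helper above.
        self.inner.weight.data = self.inner.weight.data.to(dtype)
        if self.inner.bias is not None:
            self.inner.bias.data = self.inner.bias.data.to(dtype)

    def forward(self, x, dtype=None):
        org_dtype = self.inner.weight.dtype
        if dtype is not None:
            self._convert_dtype(dtype)
        out = F.linear(x, self.inner.weight, self.inner.bias)
        self._convert_dtype(org_dtype)  # restore the original precision
        return out


# Example: compute logits in fp32 even though the layer itself is fp16.
layer = DtypeAwareLinear(8, 4).half()
x = torch.randn(2, 8)
logits = layer(x, dtype=torch.float32)
```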
@@ -964,21 +974,32 @@ def __init__(self, module):
         # topk selection algorithm
         self.norm_topk_prob = module.config.norm_topk_prob
         self.gating_dim = module.config.hidden_size
-        self.fc = nn.Linear(self.gating_dim, self.n_routed_experts, bias=False)
-        self.fc.weight = module.weight
+        self.fc = getattr(module, 'fc',
+                          nn.Linear(self.gating_dim, self.n_routed_experts, bias=False))
+        if not hasattr(module, 'fc'):
+            self.fc.weight = module.weight

     @property
     def weight(self):
         return self.fc.weight

+    def _fp32_forward(self, hidden_states):
+        if isinstance(self.fc, tuple(_LLMC_LINEAR_TYPES_)):
+            logits = self.fc(hidden_states.type(torch.float32), dtype=torch.float32)
+        else:
+            org_dtype = self.fc.weight.dtype
+            self.fc.weight.data = self.fc.weight.data.to(torch.float32)
+            logits = self.fc(hidden_states.type(torch.float32))
+            self.fc.weight.data = self.fc.weight.data.to(org_dtype)
+        return logits
+
     def forward(self, hidden_states):
         bsz, seq_len, h = hidden_states.shape
         # compute gating score
         hidden_states = hidden_states.view(-1, h)
-        org_dtype = self.fc.weight.dtype
-        self.fc.weight.data = self.fc.weight.data.to(torch.float32)
-        logits = self.fc(hidden_states.type(torch.float32))
-        self.fc.weight.data = self.fc.weight.data.to(org_dtype)
+
+        logits = self._fp32_forward(hidden_states)
+
         if self.scoring_func == 'softmax':
             scores = logits.softmax(dim=-1, dtype=torch.float32)
         else:
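
The new _fp32_forward helper computes the gating logits in float32 whichever form self.fc currently has: if the gate has already been replaced by an LLMC linear (which now accepts a dtype argument), the cast is delegated to it; otherwise the weight is cast around the call and restored, as before. A rough standalone sketch of the same dispatch; LLMC_LINEAR_TYPES here is a hypothetical stand-in for _LLMC_LINEAR_TYPES_:

```python
import torch
from torch import nn

# Hypothetical stand-in for the repository's _LLMC_LINEAR_TYPES_ registry.
LLMC_LINEAR_TYPES = ()


def fp32_gate_logits(fc: nn.Module, hidden_states: torch.Tensor) -> torch.Tensor:
    """Compute MoE gating logits in float32, whatever the gate layer's dtype."""
    if isinstance(fc, LLMC_LINEAR_TYPES):
        # LLMC linears accept a dtype argument and handle the cast themselves.
        return fc(hidden_states.to(torch.float32), dtype=torch.float32)
    # Plain nn.Linear: cast the weight around the call and restore it.
    org_dtype = fc.weight.dtype
    fc.weight.data = fc.weight.data.to(torch.float32)
    logits = fc(hidden_states.to(torch.float32))
    fc.weight.data = fc.weight.data.to(org_dtype)
    return logits


# Example with a plain fp16 gate:
gate = nn.Linear(8, 4, bias=False).half()
hidden = torch.randn(2, 8)
scores = fp32_gate_logits(gate, hidden).softmax(dim=-1, dtype=torch.float32)
```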
