@@ -622,6 +622,8 @@ def get_model_tokenizer_baichuan2_int4(model_dir: str,
     if device_map != 'auto':
         accelerate.infer_auto_device_map = _old_infer_auto_device_map
     if model is not None:
+        model.config.quantization_config = BitsAndBytesConfig(
+            **model.config.quantization_config)
         model.train()
         model._is_quantized_training_enabled = True
         model.is_loaded_in_4bit = True
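The two added lines handle the case where the Baichuan2-int4 checkpoint leaves `model.config.quantization_config` as a plain dict: re-wrapping it in `BitsAndBytesConfig` before training is enabled restores the config-object interface that later quantized-training code relies on. A minimal, self-contained sketch of that conversion (the dict keys below are illustrative assumptions, not read from the actual checkpoint):

from transformers import BitsAndBytesConfig

# Illustrative dict, the shape a checkpoint's config.json might use
# (keys are assumptions for the example, not copied from Baichuan2-int4).
quant_dict = {'load_in_4bit': True, 'bnb_4bit_quant_type': 'nf4'}

# Wrapping the dict restores attribute access and helpers such as to_dict(),
# which downstream training code expects from a quantization config object.
quant_config = BitsAndBytesConfig(**quant_dict)
print(quant_config.load_in_4bit)                      # True
print(quant_config.to_dict()['bnb_4bit_quant_type'])  # 'nf4'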
@@ -1186,52 +1188,15 @@ def get_model_tokenizer_with_flash_attn(model_dir: str,
     function_kwargs={'bits': 8},
     support_flash_attn=True,
     support_vllm=True)
-def get_model_tokenizer_with_flash_attn_intx(model_dir: str,
-                                             torch_dtype: Dtype,
-                                             model_kwargs: Dict[str, Any],
-                                             load_model: bool = True,
-                                             model_config=None,
-                                             **kwargs):
-    if model_config is None:
-        model_config = AutoConfig.from_pretrained(
-            model_dir, trust_remote_code=True)
-    use_flash_attn = kwargs.pop('use_flash_attn', False)
-    if version.parse(transformers.__version__) >= version.parse('4.36'):
-        if use_flash_attn:
-            model_config._attn_implementation = 'flash_attention_2'
-    else:
-        model_config._flash_attn_2_enabled = use_flash_attn
-
-    logger.info('use gptq, ignore bnb arguments')
-    bits = kwargs.pop('bits')
-    if version.parse(transformers.__version__) >= version.parse('4.35'):
-        model_kwargs['quantization_config'] = GPTQConfig(
-            bits=bits, use_exllama=False)
-    else:
-        model_kwargs['quantization_config'] = GPTQConfig(
-            bits=bits, disable_exllama=True)
-
-    # fix quantlinear bug
-    from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear
-    __old_forward = QuantLinear.forward
-
-    def _new_forward(self, x):
-        if not self.training or not self.autogptq_cuda_available:
-            return self.__old_forward(x)
-        # fix sft no grad
-        self.autogptq_cuda_available = False
-        res = self.__old_forward(x)
-        self.autogptq_cuda_available = True
-        return res
+def get_model_tokenizer_with_qwen1half_intx(model_dir: str,
+                                            torch_dtype: Dtype,
+                                            model_kwargs: Dict[str, Any],
+                                            load_model: bool = True,
+                                            **kwargs):
 
-    if not hasattr(QuantLinear, '__old_forward'):  # avoid double patching
-        QuantLinear.__old_forward = __old_forward
-        QuantLinear.forward = _new_forward
-    get_qwen_function = kwargs.pop('get_qwen_function',
-                                   get_model_tokenizer_with_flash_attn)
-    model, tokenizer = get_qwen_function(model_dir, torch_dtype, model_kwargs,
+    kwargs['get_qwen_function'] = get_model_tokenizer_with_flash_attn
+    return get_model_tokenizer_qwen_intx(model_dir, torch_dtype, model_kwargs,
                                          load_model, **kwargs)
-    return model, tokenizer
 
 
 @register_model(
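This hunk removes the duplicated GPTQ setup (quantization config, exllama flags, QuantLinear forward patch) from the Qwen1.5 path: `get_model_tokenizer_with_qwen1half_intx` now only pins the base loader via `kwargs['get_qwen_function']` and delegates to the shared `get_model_tokenizer_qwen_intx`. A minimal sketch of that delegation pattern, with hypothetical stand-in loaders (only the keyword handoff mirrors the diff; the bodies are not the real implementations):

from typing import Any, Dict


def load_with_flash_attn(model_dir: str, **kwargs) -> Dict[str, Any]:
    # Stand-in for get_model_tokenizer_with_flash_attn.
    return {'model_dir': model_dir, 'loader': 'flash_attn', **kwargs}


def load_qwen_intx(model_dir: str, **kwargs) -> Dict[str, Any]:
    # Stand-in for get_model_tokenizer_qwen_intx: the shared GPTQ logic pops
    # the base loader from kwargs, defaulting to the flash-attention loader.
    get_base = kwargs.pop('get_qwen_function', load_with_flash_attn)
    return get_base(model_dir, **kwargs)


def load_qwen1half_intx(model_dir: str, **kwargs) -> Dict[str, Any]:
    # Stand-in for get_model_tokenizer_with_qwen1half_intx: no duplicated
    # GPTQ setup, just pin the base loader and delegate.
    kwargs['get_qwen_function'] = load_with_flash_attn
    return load_qwen_intx(model_dir, **kwargs)


print(load_qwen1half_intx('Qwen/Qwen1.5-7B-Chat-GPTQ-Int8'))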