Skip to content

Commit e4c792f

Browse files
authored
Support qwen1.5-110b awq (#821)
1 parent 052d4f1 commit e4c792f

File tree

4 files changed

+19
-4
lines changed

4 files changed

+19
-4
lines changed

docs/source/LLM/支持的模型和数据集.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-14B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-AWQ)|
7575
|qwen1half-32b-chat-awq|[qwen/Qwen1.5-32B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-32B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-AWQ)|
7676
|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-72B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-AWQ)|
77+
|qwen1half-110b-chat-awq|[qwen/Qwen1.5-110B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-110B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-110B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-110B-Chat-AWQ)|
7778
|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/CodeQwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat-AWQ)|
7879
|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||multi-modal, vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)|
7980
|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||multi-modal, vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)|

docs/source_en/LLM/Supported-models-datasets.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ The table below introduces all models supported by SWIFT:
7474
|qwen1half-14b-chat-awq|[qwen/Qwen1.5-14B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-14B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-14B-Chat-AWQ)|
7575
|qwen1half-32b-chat-awq|[qwen/Qwen1.5-32B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-32B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-32B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-32B-Chat-AWQ)|
7676
|qwen1half-72b-chat-awq|[qwen/Qwen1.5-72B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-72B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-72B-Chat-AWQ)|
77+
|qwen1half-110b-chat-awq|[qwen/Qwen1.5-110B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-110B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/Qwen1.5-110B-Chat-AWQ](https://huggingface.co/Qwen/Qwen1.5-110B-Chat-AWQ)|
7778
|codeqwen1half-7b-chat-awq|[qwen/CodeQwen1.5-7B-Chat-AWQ](https://modelscope.cn/models/qwen/CodeQwen1.5-7B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|[Qwen/CodeQwen1.5-7B-Chat-AWQ](https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat-AWQ)|
7879
|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||multi-modal, vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)|
7980
|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen|✔|✘||multi-modal, vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)|

swift/llm/utils/model.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ class ModelType:
106106
qwen1half_14b_chat_awq = 'qwen1half-14b-chat-awq'
107107
qwen1half_32b_chat_awq = 'qwen1half-32b-chat-awq'
108108
qwen1half_72b_chat_awq = 'qwen1half-72b-chat-awq'
109+
qwen1half_110b_chat_awq = 'qwen1half-110b-chat-awq'
109110
codeqwen1half_7b_chat_awq = 'codeqwen1half-7b-chat-awq'
110111

111112
# qwen-vl
@@ -1994,6 +1995,17 @@ def get_model_tokenizer_with_flash_attn(model_dir: str,
19941995
requires=['transformers>=4.37', 'autoawq'],
19951996
torch_dtype=torch.float16,
19961997
hf_model_id='Qwen/Qwen1.5-72B-Chat-AWQ')
1998+
@register_model(
1999+
ModelType.qwen1half_110b_chat_awq,
2000+
'qwen/Qwen1.5-110B-Chat-AWQ',
2001+
LoRATM.qwen1half,
2002+
TemplateType.qwen,
2003+
support_flash_attn=True,
2004+
support_vllm=True,
2005+
function_kwargs={'is_awq': True},
2006+
requires=['transformers>=4.37', 'autoawq'],
2007+
torch_dtype=torch.float16,
2008+
hf_model_id='Qwen/Qwen1.5-110B-Chat-AWQ')
19972009
@register_model(
19982010
ModelType.codeqwen1half_7b_chat_awq,
19992011
'qwen/CodeQwen1.5-7B-Chat-AWQ',

swift/llm/utils/utils.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -364,14 +364,15 @@ def find_all_linears(model: Module, quantization_bit: int,
364364
else:
365365
linear_cls = [Linear]
366366
if 'int4' in model_type or 'int8' in model_type:
367-
from bitsandbytes.nn import Linear4bit
368367
from peft.utils import get_auto_gptq_quant_linear, get_quantization_config
369368
gptq_quantization_config = get_quantization_config(model, 'gptq')
370369
AutoGPTQQuantLinear = get_auto_gptq_quant_linear(
371370
gptq_quantization_config)
372-
linear_cls = [Linear4bit]
373-
if AutoGPTQQuantLinear is not None:
374-
linear_cls.append(AutoGPTQQuantLinear)
371+
if AutoGPTQQuantLinear is None:
372+
from bitsandbytes.nn import Linear4bit
373+
linear_cls = [Linear4bit]
374+
else:
375+
linear_cls = [AutoGPTQQuantLinear]
375376
if 'awq' in model_type:
376377
from awq.modules.linear import WQLinear_GEMM
377378
linear_cls.append(WQLinear_GEMM)

0 commit comments

Comments
 (0)