
Commit 8ba3cf7

Support llama3 gptq awq model (#758)
1 parent b5f1789 commit 8ba3cf7

File tree

9 files changed: +112 -8 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -441,7 +441,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese<br>English | 2B-102B | instruct model |
 | XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese<br>English | 7B-65B | base model<br>chat model<br>long text model<br>MoE model |
 | LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B<br>including quantized versions | base model<br>chat model |
-| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B | base model<br>chat model |
+| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B<br>including quantized versions | base model<br>chat model |
 | Mistral<br>Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model<br>instruct model<br>MoE model |
 | YI | [01AI's YI series models](https://github.com/01-ai) | Chinese<br>English | 6B-34B<br>including quantized | base model<br>chat model<br>long text model |
 | InternLM<br>InternLM2<br>InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese<br>English | 1.8B-20B | base model<br>chat model<br>math model |

README_CN.md

Lines changed: 1 addition & 1 deletion
@@ -438,7 +438,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Yuan2 | [浪潮源系列模型](https://github.com/IEIT-Yuan) | 中文<br>英文 | 2B-102B | instruct模型 |
 | XVerse | [元象系列模型](https://github.com/xverse-ai) | 中文<br>英文 | 7B-65B | base模型<br>chat模型<br>长文本模型<br>MoE模型 |
 | LLaMA2 | [LLaMA2系列模型](https://github.com/facebookresearch/llama) | 英文 | 7B-70B<br>包含量化版本 | base模型<br>chat模型 |
-| LLaMA3 | [LLaMA3系列模型](https://github.com/meta-llama/llama3) | 英文 | 8B-70B | base模型<br>chat模型 |
+| LLaMA3 | [LLaMA3系列模型](https://github.com/meta-llama/llama3) | 英文 | 8B-70B<br>包含量化版本 | base模型<br>chat模型 |
 | Mistral<br>Mixtral | [Mistral系列模型](https://github.com/mistralai/mistral-src) | 英文 | 7B-8x22B | base模型<br>instruct模型<br>MoE模型 |
 | YI | [01AI的YI系列模型](https://github.com/01-ai) | 中文<br>英文 | 6B-34B<br>包含量化版本 | base模型<br>chat模型<br>长文本模型 |
 | InternLM<br>InternLM2<br>InternLM2-Math | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文<br>英文 | 1.8B-20B | base模型<br>chat模型<br>数学模型 |

docs/source/LLM/LLM量化文档.md

Lines changed: 11 additions & 1 deletion
@@ -14,7 +14,9 @@ GPU设备: A10, 3090, V100, A100均可.
 # 设置pip全局镜像 (加速下载)
 pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
 # 安装ms-swift
-pip install 'ms-swift[llm]' -U
+git clone https://github.com/modelscope/swift.git
+cd swift
+pip install -e '.[llm]'
 
 # 使用awq量化:
 # autoawq和cuda版本有对应关系,请按照`https://github.com/casper-hansen/AutoAWQ`选择版本
@@ -209,6 +211,14 @@ curl http://localhost:8000/v1/chat/completions \
 假设你使用lora微调了qwen1half-4b-chat, 模型权重目录为: `output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx`.
 
 ```shell
+# 推送原始量化模型
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --model_type qwen1half-7b-chat \
+    --model_id_or_path qwen1half-7b-chat-gptq-int4 \
+    --push_to_hub true \
+    --hub_model_id qwen1half-7b-chat-gptq-int4 \
+    --hub_token '<your-sdk-token>'
+
 # 推送lora增量模型
 CUDA_VISIBLE_DEVICES=0 swift export \
     --ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \

docs/source/LLM/支持的模型和数据集.md

Lines changed: 6 additions & 0 deletions
@@ -91,8 +91,14 @@
 |llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
 |llama3-8b|[LLM-Research/Meta-Llama-3-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|
 |llama3-8b-instruct|[LLM-Research/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|
+|llama3-8b-instruct-int4|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-8b-instruct-int8|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-8b-instruct-awq|[huangjintao/Meta-Llama-3-8B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|autoawq|-|-|
 |llama3-70b|[LLM-Research/Meta-Llama-3-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)|
 |llama3-70b-instruct|[LLM-Research/Meta-Llama-3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)|
+|llama3-70b-instruct-int4|[huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-70b-instruct-int8|[huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-70b-instruct-awq|[huangjintao/Meta-Llama-3-70B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|autoawq|-|-|
 |atom-7b|[FlagAlpha/Atom-7B](https://modelscope.cn/models/FlagAlpha/Atom-7B/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[FlagAlpha/Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B)|
 |atom-7b-chat|[FlagAlpha/Atom-7B-Chat](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat/summary)|q_proj, k_proj, v_proj|atom|&#x2714;|&#x2714;||-|[FlagAlpha/Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)|
 |llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|

docs/source_en/LLM/LLM-quantization.md

Lines changed: 11 additions & 1 deletion
@@ -11,7 +11,9 @@ Swift supports using AWQ and GPTQ techniques to quantize models. These two quant
 GPU devices: A10, 3090, V100, A100 are all supported.
 ```bash
 # Install ms-swift
-pip install 'ms-swift[llm]' -U
+git clone https://github.com/modelscope/swift.git
+cd swift
+pip install -e '.[llm]'
 
 # Using AWQ quantization:
 # AutoAWQ and CUDA versions have a corresponding relationship, please select the version according to `https://github.com/casper-hansen/AutoAWQ`
@@ -120,6 +122,14 @@ curl http://localhost:8000/v1/chat/completions \
 Assume you fine-tuned qwen1half-4b-chat using LoRA, and the model weights directory is: `output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx`.
 
 ```shell
+# Push the original quantized model
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --model_type qwen1half-7b-chat \
+    --model_id_or_path qwen1half-7b-chat-gptq-int4 \
+    --push_to_hub true \
+    --hub_model_id qwen1half-7b-chat-gptq-int4 \
+    --hub_token '<your-sdk-token>'
+
 # Push LoRA incremental model
 CUDA_VISIBLE_DEVICES=0 swift export \
     --ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \
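The added block pushes a stock quantized model to the hub without any fine-tuning step. The same push can be done from Python; a minimal sketch, assuming `export_main` and `ExportArguments` are exposed by `swift.llm` (the ids and token are placeholders mirroring the shell snippet):

```python
from swift.llm import ExportArguments, export_main

# Push a pre-quantized model to the ModelScope hub, no ckpt_dir required.
export_main(
    ExportArguments(
        model_type='qwen1half-7b-chat',
        model_id_or_path='qwen1half-7b-chat-gptq-int4',
        push_to_hub=True,
        hub_model_id='qwen1half-7b-chat-gptq-int4',
        hub_token='<your-sdk-token>'))
```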

docs/source_en/LLM/Supported-models-datasets.md

Lines changed: 6 additions & 0 deletions
@@ -91,8 +91,14 @@ The table below introduces all models supported by SWIFT:
 |llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
 |llama3-8b|[LLM-Research/Meta-Llama-3-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|
 |llama3-8b-instruct|[LLM-Research/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|
+|llama3-8b-instruct-int4|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-8b-instruct-int8|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-8b-instruct-awq|[huangjintao/Meta-Llama-3-8B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|autoawq|-|-|
 |llama3-70b|[LLM-Research/Meta-Llama-3-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)|
 |llama3-70b-instruct|[LLM-Research/Meta-Llama-3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)|
+|llama3-70b-instruct-int4|[huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-70b-instruct-int8|[huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-70b-instruct-awq|[huangjintao/Meta-Llama-3-70B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|autoawq|-|-|
 |atom-7b|[FlagAlpha/Atom-7B](https://modelscope.cn/models/FlagAlpha/Atom-7B/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[FlagAlpha/Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B)|
 |atom-7b-chat|[FlagAlpha/Atom-7B-Chat](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat/summary)|q_proj, k_proj, v_proj|atom|&#x2714;|&#x2714;||-|[FlagAlpha/Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)|
 |llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|
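Each new row registers a `model_type` that resolves to a ready-made quantized checkpoint, so the standard entry points can use it directly. A minimal inference sketch, assuming `infer_main`/`InferArguments` from `swift.llm` and `auto_gptq` installed (per the requires column):

```python
from swift.llm import InferArguments, infer_main

# Interactive chat with the GPTQ-Int4 Llama3 checkpoint from the table above.
infer_main(InferArguments(model_type='llama3-8b-instruct-int4'))
```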

swift/llm/export.py

Lines changed: 5 additions & 2 deletions
@@ -151,8 +151,11 @@ def llm_export(args: ExportArguments) -> None:
         args.ckpt_dir = quant_path
 
     if args.push_to_hub:
-        assert args.ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
-        push_to_ms_hub(args.ckpt_dir, args.hub_model_id, args.hub_token,
+        ckpt_dir = args.ckpt_dir
+        if ckpt_dir is None:
+            ckpt_dir = args.model_id_or_path
+        assert ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
+        push_to_ms_hub(ckpt_dir, args.hub_model_id, args.hub_token,
                        args.hub_private_repo, args.commit_message)
 
 
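This is the change that lets `swift export --push_to_hub` work for stock quantized models: when no fine-tuned `ckpt_dir` exists, the push falls back to `model_id_or_path`. The resolution logic, isolated as a self-contained sketch (the helper name is hypothetical):

```python
def resolve_push_dir(ckpt_dir, model_id_or_path):
    """Prefer the fine-tuned checkpoint; fall back to the base model id."""
    if ckpt_dir is None:
        ckpt_dir = model_id_or_path
    assert ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
    return ckpt_dir

# A checkpoint directory still takes precedence when present.
assert resolve_push_dir('output/checkpoint-100', None) == 'output/checkpoint-100'
# Without one, the model id itself is pushed (the new behavior).
assert resolve_push_dir(None, 'qwen1half-7b-chat-gptq-int4') == 'qwen1half-7b-chat-gptq-int4'
```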

swift/llm/utils/model.py

Lines changed: 67 additions & 0 deletions
@@ -128,8 +128,15 @@ class ModelType:
     # llama3
     llama3_8b = 'llama3-8b'
     llama3_8b_instruct = 'llama3-8b-instruct'
+    llama3_8b_instruct_int4 = 'llama3-8b-instruct-int4'
+    llama3_8b_instruct_int8 = 'llama3-8b-instruct-int8'
+    llama3_8b_instruct_awq = 'llama3-8b-instruct-awq'
     llama3_70b = 'llama3-70b'
     llama3_70b_instruct = 'llama3-70b-instruct'
+    llama3_70b_instruct_int4 = 'llama3-70b-instruct-int4'
+    llama3_70b_instruct_int8 = 'llama3-70b-instruct-int8'
+    llama3_70b_instruct_awq = 'llama3-70b-instruct-awq'
+
     # atom
     atom_7b = 'atom-7b'
     atom_7b_chat = 'atom-7b-chat'
@@ -2369,6 +2376,66 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     return model, tokenizer
 
 
+@register_model(
+    ModelType.llama3_70b_instruct_awq,
+    'huangjintao/Meta-Llama-3-70B-Instruct-AWQ',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['autoawq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'is_awq': True},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_70b_instruct_int8,
+    'huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 8},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_70b_instruct_int4,
+    'huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 4},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_8b_instruct_awq,
+    'huangjintao/Meta-Llama-3-8B-Instruct-AWQ',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['autoawq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'is_awq': True},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_8b_instruct_int8,
+    'huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 8},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_8b_instruct_int4,
+    'huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 4},
+    support_flash_attn=True,
+    support_vllm=True)
 @register_model(
     ModelType.llama3_70b_instruct,
     'LLM-Research/Meta-Llama-3-70B-Instruct',
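Each `@register_model` entry ties a `ModelType` constant to its ModelScope repo, LoRA target modules, chat template, dtype, and quantization kwargs, so downstream loaders only need the model type. A loading sketch, assuming `get_model_tokenizer` is swift's public loader (the `device_map` kwarg is an illustrative choice):

```python
import torch
from swift.llm import ModelType, get_model_tokenizer

# Load the AWQ checkpoint registered above; `autoawq` must be installed,
# matching the requires=['autoawq'] declaration.
model, tokenizer = get_model_tokenizer(
    ModelType.llama3_8b_instruct_awq,
    torch.float16,
    model_kwargs={'device_map': 'auto'})
```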

swift/utils/hub.py

Lines changed: 4 additions & 2 deletions
@@ -1,6 +1,7 @@
 import os
 import shutil
 import subprocess
+import tempfile
 import time
 from typing import Optional
 
@@ -46,15 +47,16 @@ def push_to_ms_hub(ckpt_dir: str,
                    hub_private_repo: bool = False,
                    commit_message: str = 'update files'):
     logger.info(f'Starting push to hub. ckpt_dir: {ckpt_dir}.')
+    tmp_file_name = tempfile.TemporaryDirectory().name
     subprocess_run(['git', 'lfs', 'env'],
                    stdout=subprocess.PIPE)  # check git-lfs install
 
     hub_model_id = create_ms_repo(hub_model_id, hub_token, hub_private_repo)
     git_token = ModelScopeConfig.get_token()
     ms_url = f'https://oauth2:{git_token}@www.modelscope.cn/{hub_model_id}.git'
-    subprocess_run(['git', '-C', ckpt_dir, 'clone', ms_url, 'tmp'],
+    subprocess_run(['git', '-C', ckpt_dir, 'clone', ms_url, tmp_file_name],
                    env={'GIT_LFS_SKIP_SMUDGE': '1'})
-    tmp_dir = os.path.join(ckpt_dir, 'tmp')
+    tmp_dir = os.path.join(ckpt_dir, tmp_file_name)
     subprocess_run(['git', '-C', tmp_dir, 'lfs', 'pull'])
     logger.info('Git clone the repo successfully.')
     # mv .git
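The fixed `tmp` clone directory could collide with leftovers from an earlier push, so the patch switches to a unique name drawn from `tempfile`. Two details of that idiom, shown in a self-contained sketch (not part of the commit):

```python
import os
import tempfile

# TemporaryDirectory() creates a uniquely named directory and deletes it once
# the object is garbage-collected; keeping only `.name` therefore yields a
# fresh unique absolute path (e.g. /tmp/tmpab12cd34). On CPython the directory
# itself is already gone by the next statement, so git can clone into it.
name = tempfile.TemporaryDirectory().name
print(os.path.exists(name))  # False on CPython

# Because `name` is absolute, os.path.join drops the ckpt_dir component, so
# the clone now lands under the system temp dir instead of inside ckpt_dir
# as the old fixed 'tmp' subdirectory did.
print(os.path.join('/my/ckpt_dir', name))  # e.g. /tmp/tmpab12cd34
```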
