
Commit 8ba3cf7

Support llama3 gptq awq model (#758)
1 parent b5f1789 commit 8ba3cf7

File tree

9 files changed: +112 -8 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -441,7 +441,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Yuan2 | [Langchao Yuan series models](https://github.com/IEIT-Yuan) | Chinese<br>English | 2B-102B | instruct model |
 | XVerse | [XVerse series models](https://github.com/xverse-ai) | Chinese<br>English | 7B-65B | base model<br>chat model<br>long text model<br>MoE model |
 | LLaMA2 | [LLaMA2 series models](https://github.com/facebookresearch/llama) | English | 7B-70B<br>including quantized versions | base model<br>chat model |
-| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B | base model<br>chat model |
+| LLaMA3 | [LLaMA3 series models](https://github.com/meta-llama/llama3) | English | 8B-70B<br>including quantized versions | base model<br>chat model |
 | Mistral<br>Mixtral | [Mistral series models](https://github.com/mistralai/mistral-src) | English | 7B-22B | base model<br>instruct model<br>MoE model |
 | YI | [01AI's YI series models](https://github.com/01-ai) | Chinese<br>English | 6B-34B<br>including quantized | base model<br>chat model<br>long text model |
 | InternLM<br>InternLM2<br>InternLM2-Math | [Pujiang AI Lab InternLM series models](https://github.com/InternLM/InternLM) | Chinese<br>English | 1.8B-20B | base model<br>chat model<br>math model |

README_CN.md

Lines changed: 1 addition & 1 deletion
@@ -438,7 +438,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
 | Yuan2 | [浪潮源系列模型](https://github.com/IEIT-Yuan) | 中文<br>英文 | 2B-102B | instruct模型 |
 | XVerse | [元象系列模型](https://github.com/xverse-ai) | 中文<br>英文 | 7B-65B | base模型<br>chat模型<br>长文本模型<br>MoE模型 |
 | LLaMA2 | [LLaMA2系列模型](https://github.com/facebookresearch/llama) | 英文 | 7B-70B<br>包含量化版本 | base模型<br>chat模型 |
-| LLaMA3 | [LLaMA3系列模型](https://github.com/meta-llama/llama3) | 英文 | 8B-70B | base模型<br>chat模型 |
+| LLaMA3 | [LLaMA3系列模型](https://github.com/meta-llama/llama3) | 英文 | 8B-70B<br>包含量化版本 | base模型<br>chat模型 |
 | Mistral<br>Mixtral | [Mistral系列模型](https://github.com/mistralai/mistral-src) | 英文 | 7B-8x22B | base模型<br>instruct模型<br>MoE模型 |
 | YI | [01AI的YI系列模型](https://github.com/01-ai) | 中文<br>英文 | 6B-34B<br>包含量化版本 | base模型<br>chat模型<br>长文本模型 |
 | InternLM<br>InternLM2<br>InternLM2-Math | [浦江实验室书生浦语系列模型](https://github.com/InternLM/InternLM) | 中文<br>英文 | 1.8B-20B | base模型<br>chat模型<br>数学模型 |

docs/source/LLM/LLM量化文档.md

Lines changed: 11 additions & 1 deletion
@@ -14,7 +14,9 @@ GPU设备: A10, 3090, V100, A100均可.
 # 设置pip全局镜像 (加速下载)
 pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/
 # 安装ms-swift
-pip install 'ms-swift[llm]' -U
+git clone https://github.com/modelscope/swift.git
+cd swift
+pip install -e '.[llm]'
 
 # 使用awq量化:
 # autoawq和cuda版本有对应关系,请按照`https://github.com/casper-hansen/AutoAWQ`选择版本
@@ -209,6 +211,14 @@ curl http://localhost:8000/v1/chat/completions \
 假设你使用lora微调了qwen1half-4b-chat, 模型权重目录为: `output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx`.
 
 ```shell
+# 推送原始量化模型
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --model_type qwen1half-7b-chat \
+    --model_id_or_path qwen1half-7b-chat-gptq-int4 \
+    --push_to_hub true \
+    --hub_model_id qwen1half-7b-chat-gptq-int4 \
+    --hub_token '<your-sdk-token>'
+
 # 推送lora增量模型
 CUDA_VISIBLE_DEVICES=0 swift export \
     --ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \

docs/source/LLM/支持的模型和数据集.md

Lines changed: 6 additions & 0 deletions
@@ -91,8 +91,14 @@
 |llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
 |llama3-8b|[LLM-Research/Meta-Llama-3-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|
 |llama3-8b-instruct|[LLM-Research/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|
+|llama3-8b-instruct-int4|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-8b-instruct-int8|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-8b-instruct-awq|[huangjintao/Meta-Llama-3-8B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|autoawq|-|-|
 |llama3-70b|[LLM-Research/Meta-Llama-3-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)|
 |llama3-70b-instruct|[LLM-Research/Meta-Llama-3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)|
+|llama3-70b-instruct-int4|[huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-70b-instruct-int8|[huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-70b-instruct-awq|[huangjintao/Meta-Llama-3-70B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|autoawq|-|-|
 |atom-7b|[FlagAlpha/Atom-7B](https://modelscope.cn/models/FlagAlpha/Atom-7B/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[FlagAlpha/Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B)|
 |atom-7b-chat|[FlagAlpha/Atom-7B-Chat](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat/summary)|q_proj, k_proj, v_proj|atom|&#x2714;|&#x2714;||-|[FlagAlpha/Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)|
 |llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|

docs/source_en/LLM/LLM-quantization.md

Lines changed: 11 additions & 1 deletion
@@ -11,7 +11,9 @@ Swift supports using AWQ and GPTQ techniques to quantize models. These two quant
 GPU devices: A10, 3090, V100, A100 are all supported.
 ```bash
 # Install ms-swift
-pip install 'ms-swift[llm]' -U
+git clone https://github.com/modelscope/swift.git
+cd swift
+pip install -e '.[llm]'
 
 # Using AWQ quantization:
 # AutoAWQ and CUDA versions have a corresponding relationship, please select the version according to `https://github.com/casper-hansen/AutoAWQ`
@@ -120,6 +122,14 @@ curl http://localhost:8000/v1/chat/completions \
 Assume you fine-tuned qwen1half-4b-chat using LoRA, and the model weights directory is: `output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx`.
 
 ```shell
+# Push the original quantized model
+CUDA_VISIBLE_DEVICES=0 swift export \
+    --model_type qwen1half-7b-chat \
+    --model_id_or_path qwen1half-7b-chat-gptq-int4 \
+    --push_to_hub true \
+    --hub_model_id qwen1half-7b-chat-gptq-int4 \
+    --hub_token '<your-sdk-token>'
+
 # Push LoRA incremental model
 CUDA_VISIBLE_DEVICES=0 swift export \
     --ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \
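The added block pushes a stock quantized model to the hub without any fine-tuning step. The same push can be done from Python; a minimal sketch, assuming `export_main` and `ExportArguments` are exposed by `swift.llm` (the ids and token are placeholders mirroring the shell snippet):

```python
from swift.llm import ExportArguments, export_main

# Push a pre-quantized model to the ModelScope hub, no ckpt_dir required.
export_main(
    ExportArguments(
        model_type='qwen1half-7b-chat',
        model_id_or_path='qwen1half-7b-chat-gptq-int4',
        push_to_hub=True,
        hub_model_id='qwen1half-7b-chat-gptq-int4',
        hub_token='<your-sdk-token>'))
```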

docs/source_en/LLM/Supported-models-datasets.md

Lines changed: 6 additions & 0 deletions
@@ -91,8 +91,14 @@ The table below introduces all models supported by SWIFT:
 |llama2-7b-aqlm-2bit-1x16|[AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf](https://modelscope.cn/models/AI-ModelScope/Llama-2-7b-AQLM-2Bit-1x16-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2718;|transformers>=4.38, aqlm, torch>=2.2.0|-|[ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf](https://huggingface.co/ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf)|
 |llama3-8b|[LLM-Research/Meta-Llama-3-8B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|
 |llama3-8b-instruct|[LLM-Research/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|
+|llama3-8b-instruct-int4|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-8b-instruct-int8|[huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-8b-instruct-awq|[huangjintao/Meta-Llama-3-8B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-8B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|autoawq|-|-|
 |llama3-70b|[LLM-Research/Meta-Llama-3-70B](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)|
 |llama3-70b-instruct|[LLM-Research/Meta-Llama-3-70B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-70B-Instruct/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;||-|[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)|
+|llama3-70b-instruct-int4|[huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-70b-instruct-int8|[huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|auto_gptq|-|-|
+|llama3-70b-instruct-awq|[huangjintao/Meta-Llama-3-70B-Instruct-AWQ](https://modelscope.cn/models/huangjintao/Meta-Llama-3-70B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|llama3|&#x2714;|&#x2714;|autoawq|-|-|
 |atom-7b|[FlagAlpha/Atom-7B](https://modelscope.cn/models/FlagAlpha/Atom-7B/summary)|q_proj, k_proj, v_proj|default-generation-bos|&#x2714;|&#x2714;||-|[FlagAlpha/Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B)|
 |atom-7b-chat|[FlagAlpha/Atom-7B-Chat](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat/summary)|q_proj, k_proj, v_proj|atom|&#x2714;|&#x2714;||-|[FlagAlpha/Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat)|
 |llava1d6-mistral-7b-instruct|[AI-ModelScope/llava-v1.6-mistral-7b](https://modelscope.cn/models/AI-ModelScope/llava-v1.6-mistral-7b/summary)|q_proj, k_proj, v_proj|llava-mistral-instruct|&#x2714;|&#x2718;|transformers>=4.34|multi-modal, vision|[liuhaotian/llava-v1.6-mistral-7b](https://huggingface.co/liuhaotian/llava-v1.6-mistral-7b)|
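Each new row registers a `model_type` that resolves to a ready-made quantized checkpoint, so the standard entry points can use it directly. A minimal inference sketch, assuming `infer_main`/`InferArguments` from `swift.llm` and `auto_gptq` installed (per the requires column):

```python
from swift.llm import InferArguments, infer_main

# Interactive chat with the GPTQ-Int4 Llama3 checkpoint from the table above.
infer_main(InferArguments(model_type='llama3-8b-instruct-int4'))
```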

swift/llm/export.py

Lines changed: 5 additions & 2 deletions
@@ -151,8 +151,11 @@ def llm_export(args: ExportArguments) -> None:
         args.ckpt_dir = quant_path
 
     if args.push_to_hub:
-        assert args.ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
-        push_to_ms_hub(args.ckpt_dir, args.hub_model_id, args.hub_token,
+        ckpt_dir = args.ckpt_dir
+        if ckpt_dir is None:
+            ckpt_dir = args.model_id_or_path
+        assert ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
+        push_to_ms_hub(ckpt_dir, args.hub_model_id, args.hub_token,
                        args.hub_private_repo, args.commit_message)
 
 
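This is the change that lets `swift export --push_to_hub` work for stock quantized models: when no fine-tuned `ckpt_dir` exists, the push falls back to `model_id_or_path`. The resolution logic, isolated as a self-contained sketch (the helper name is hypothetical):

```python
def resolve_push_dir(ckpt_dir, model_id_or_path):
    """Prefer the fine-tuned checkpoint; fall back to the base model id."""
    if ckpt_dir is None:
        ckpt_dir = model_id_or_path
    assert ckpt_dir is not None, 'You need to specify `ckpt_dir`.'
    return ckpt_dir

# A checkpoint directory still takes precedence when present.
assert resolve_push_dir('output/checkpoint-100', None) == 'output/checkpoint-100'
# Without one, the model id itself is pushed (the new behavior).
assert resolve_push_dir(None, 'qwen1half-7b-chat-gptq-int4') == 'qwen1half-7b-chat-gptq-int4'
```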

swift/llm/utils/model.py

Lines changed: 67 additions & 0 deletions
@@ -128,8 +128,15 @@ class ModelType:
     # llama3
     llama3_8b = 'llama3-8b'
     llama3_8b_instruct = 'llama3-8b-instruct'
+    llama3_8b_instruct_int4 = 'llama3-8b-instruct-int4'
+    llama3_8b_instruct_int8 = 'llama3-8b-instruct-int8'
+    llama3_8b_instruct_awq = 'llama3-8b-instruct-awq'
     llama3_70b = 'llama3-70b'
     llama3_70b_instruct = 'llama3-70b-instruct'
+    llama3_70b_instruct_int4 = 'llama3-70b-instruct-int4'
+    llama3_70b_instruct_int8 = 'llama3-70b-instruct-int8'
+    llama3_70b_instruct_awq = 'llama3-70b-instruct-awq'
+
     # atom
     atom_7b = 'atom-7b'
     atom_7b_chat = 'atom-7b-chat'
@@ -2369,6 +2376,66 @@ def get_model_tokenizer_deepseek_vl(model_dir: str,
     return model, tokenizer
 
 
+@register_model(
+    ModelType.llama3_70b_instruct_awq,
+    'huangjintao/Meta-Llama-3-70B-Instruct-AWQ',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['autoawq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'is_awq': True},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_70b_instruct_int8,
+    'huangjintao/Meta-Llama-3-70b-Instruct-GPTQ-Int8',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 8},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_70b_instruct_int4,
+    'huangjintao/Meta-Llama-3-70B-Instruct-GPTQ-Int4',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 4},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_8b_instruct_awq,
+    'huangjintao/Meta-Llama-3-8B-Instruct-AWQ',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['autoawq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'is_awq': True},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_8b_instruct_int8,
+    'huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int8',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 8},
+    support_flash_attn=True,
+    support_vllm=True)
+@register_model(
+    ModelType.llama3_8b_instruct_int4,
+    'huangjintao/Meta-Llama-3-8B-Instruct-GPTQ-Int4',
+    LoRATM.llama2,
+    TemplateType.llama3,
+    requires=['auto_gptq'],
+    torch_dtype=torch.float16,
+    function_kwargs={'gptq_bits': 4},
+    support_flash_attn=True,
+    support_vllm=True)
 @register_model(
     ModelType.llama3_70b_instruct,
     'LLM-Research/Meta-Llama-3-70B-Instruct',
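Each `@register_model` entry ties a `ModelType` constant to its ModelScope repo, LoRA target modules, chat template, dtype, and quantization kwargs, so downstream loaders only need the model type. A loading sketch, assuming `get_model_tokenizer` is swift's public loader (the `device_map` kwarg is an illustrative choice):

```python
import torch
from swift.llm import ModelType, get_model_tokenizer

# Load the AWQ checkpoint registered above; `autoawq` must be installed,
# matching the requires=['autoawq'] declaration.
model, tokenizer = get_model_tokenizer(
    ModelType.llama3_8b_instruct_awq,
    torch.float16,
    model_kwargs={'device_map': 'auto'})
```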

swift/utils/hub.py

Lines changed: 4 additions & 2 deletions
@@ -1,6 +1,7 @@
 import os
 import shutil
 import subprocess
+import tempfile
 import time
 from typing import Optional
 
@@ -46,15 +47,16 @@ def push_to_ms_hub(ckpt_dir: str,
                    hub_private_repo: bool = False,
                    commit_message: str = 'update files'):
     logger.info(f'Starting push to hub. ckpt_dir: {ckpt_dir}.')
+    tmp_file_name = tempfile.TemporaryDirectory().name
     subprocess_run(['git', 'lfs', 'env'],
                    stdout=subprocess.PIPE)  # check git-lfs install
 
     hub_model_id = create_ms_repo(hub_model_id, hub_token, hub_private_repo)
     git_token = ModelScopeConfig.get_token()
     ms_url = f'https://oauth2:{git_token}@www.modelscope.cn/{hub_model_id}.git'
-    subprocess_run(['git', '-C', ckpt_dir, 'clone', ms_url, 'tmp'],
+    subprocess_run(['git', '-C', ckpt_dir, 'clone', ms_url, tmp_file_name],
                    env={'GIT_LFS_SKIP_SMUDGE': '1'})
-    tmp_dir = os.path.join(ckpt_dir, 'tmp')
+    tmp_dir = os.path.join(ckpt_dir, tmp_file_name)
     subprocess_run(['git', '-C', tmp_dir, 'lfs', 'pull'])
     logger.info('Git clone the repo successfully.')
     # mv .git
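The fixed `tmp` clone directory could collide with leftovers from an earlier push, so the patch switches to a unique name drawn from `tempfile`. Two details of that idiom, shown in a self-contained sketch (not part of the commit):

```python
import os
import tempfile

# TemporaryDirectory() creates a uniquely named directory and deletes it once
# the object is garbage-collected; keeping only `.name` therefore yields a
# fresh unique absolute path (e.g. /tmp/tmpab12cd34). On CPython the directory
# itself is already gone by the next statement, so git can clone into it.
name = tempfile.TemporaryDirectory().name
print(os.path.exists(name))  # False on CPython

# Because `name` is absolute, os.path.join drops the ckpt_dir component, so
# the clone now lands under the system temp dir instead of inside ckpt_dir
# as the old fixed 'tmp' subdirectory did.
print(os.path.join('/my/ckpt_dir', name))  # e.g. /tmp/tmpab12cd34
```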
