**docs/source_en/LLM/Command-line-parameters.md** (1 addition, 1 deletion)
Export parameters inherit from infer parameters, with the following added parameters (a combined usage sketch follows the list):
- `--merge_lora`: Default is `False`. This parameter is already defined in InferArguments; it is not a new parameter. Whether to merge the LoRA weights into the base model and save the full weights. The weights are saved in a directory at the same level as `ckpt_dir`, e.g. the `'/path/to/your/vx-xxx/checkpoint-xxx-merged'` directory.
- `--quant_bits`: Number of bits for quantization. Default is `0`, i.e. no quantization. If you set `--quant_method awq`, you can set this to `4` for 4-bit quantization. If you set `--quant_method gptq`, you can set this to `2`, `3`, `4`, or `8` for the corresponding bit width. When quantizing the original model, the weights are saved in the `f'{args.model_type}-{args.quant_method}-int{args.quant_bits}'` directory. When quantizing a fine-tuned model, the weights are saved in a directory at the same level as `ckpt_dir`, e.g. the `f'/path/to/your/vx-xxx/checkpoint-xxx-{args.quant_method}-int{args.quant_bits}'` directory.
- `--quant_method`: Quantization method. Default is `'awq'`. Options are `'awq'` and `'gptq'`.
- `--dataset`: This parameter is already defined in InferArguments; for export it specifies the quantization dataset. Default is `[]`. More details, including how to customize the quantization dataset, can be found in the [LLM Quantization Documentation](LLM-quantization.md).
- `--quant_n_samples`: Quantization parameter. Default is `256`. With `--quant_method awq`, if OOM occurs during quantization, you can moderately reduce `--quant_n_samples` and `--quant_seqlen`. `--quant_method gptq` generally does not run out of memory during quantization.
- `--quant_seqlen`: Quantization parameter. Default is `2048`.
- `--quant_device_map`: Default is `'cpu'`, to save memory. You can specify `'cuda:0'`, `'auto'`, `'cpu'`, etc., representing the device on which the model is loaded during quantization.
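For orientation, a minimal sketch of how these parameters combine in a single `swift export` call; the checkpoint path is a placeholder, and the flag values are assumptions based on the descriptions above:

```shell
# Sketch: merge LoRA weights, then AWQ-quantize the result to 4 bits.
# `--ckpt_dir` is a placeholder; point it at your own checkpoint.
CUDA_VISIBLE_DEVICES=0 swift export \
    --ckpt_dir 'output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx' \
    --merge_lora true \
    --quant_bits 4 \
    --quant_method awq \
    --quant_n_samples 256 \
    --quant_seqlen 2048 \
    --quant_device_map cpu
# The quantized weights are saved next to ckpt_dir,
# e.g. .../checkpoint-xxx-awq-int4
```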
```shell
# AWQ: Use a custom quantization dataset (do not use the `--custom_val_dataset_path` parameter)
# The same applies to GPTQ
```
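A hedged sketch of what a custom-dataset call might look like; `--custom_train_dataset_path` and the file name `my-quant-data.jsonl` are assumptions here (the comment above only rules out `--custom_val_dataset_path`), so check the [LLM Quantization Documentation](LLM-quantization.md) for the supported flags:

```shell
# Hypothetical: AWQ-quantize using a local jsonl file as the calibration set.
# `--custom_train_dataset_path` is assumed by analogy with the infer parameters.
CUDA_VISIBLE_DEVICES=0 swift export \
    --model_type qwen1half-7b-chat \
    --quant_bits 4 --quant_method awq \
    --custom_train_dataset_path my-quant-data.jsonl
```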
```shell
CUDA_VISIBLE_DEVICES=0 swift infer --model_type qwen1half-7b-chat
```
**Comparison of quantization effects**:

The comparison shows inference results from the AWQ-INT4 model, the GPTQ-INT4 model, and the original unquantized model. The quantized models maintain high-quality output while enabling faster inference.
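To reproduce such a comparison, a minimal sketch, assuming the quantized weights sit in a local directory following the naming pattern from the parameter list and that `--model_id_or_path` accepts a local path:

```shell
# Sketch: run inference against the AWQ-INT4 weights produced by `swift export`.
# `qwen1half-7b-chat-awq-int4` is the assumed output directory name.
CUDA_VISIBLE_DEVICES=0 swift infer \
    --model_type qwen1half-7b-chat \
    --model_id_or_path qwen1half-7b-chat-awq-int4
```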
## Fine-tuned Model
Assume you fine-tuned qwen1half-4b-chat using LoRA, and the model weights directory is: `output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx`.
Here we only introduce using the AWQ technique to quantize the fine-tuned model.
**Merge-LoRA & Quantization**
```shell
# Use `alpaca-zh alpaca-en sharegpt-gpt4-mini` as the quantization dataset
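# NOTE: the export command itself is truncated in this excerpt. A minimal
# sketch, assuming the flags documented in Command-line-parameters.md and
# the placeholder checkpoint path used earlier in this document:
CUDA_VISIBLE_DEVICES=0 swift export \
    --ckpt_dir 'output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx' \
    --merge_lora true --quant_bits 4 \
    --dataset alpaca-zh alpaca-en sharegpt-gpt4-mini --quant_method awq
```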