
Commit a95da87

new feature: save_infer_result_to_jsonl (#163)
1 parent fdf1962 commit a95da87
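
The feature named in the title, `save_infer_result_to_jsonl`, is not itself visible in the hunks rendered below, but the JSONL pattern it implies is simple: one JSON object per inference result, one object per line. A minimal sketch of that pattern, assuming hypothetical field names (`query`, `response`) and output path; this is illustrative, not the repo's actual implementation:

```python
import json

# Hypothetical result records; swift's real schema may differ.
results = [
    {"query": "What is 1 + 1?", "response": "2"},
    {"query": "Name the capital of France.", "response": "Paris"},
]

# JSONL: one JSON object per line. Appending keeps earlier runs intact,
# and readers can stream the file line by line.
with open("infer_result.jsonl", "a", encoding="utf-8") as f:
    for record in results:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
```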

File tree: 24 files changed, +301 −166 lines


README.md

Lines changed: 41 additions & 39 deletions
Large diffs are not rendered by default.

README_CN.md

Lines changed: 40 additions & 39 deletions
Large diffs are not rendered by default.

examples/pytorch/llm/README.md

Lines changed: 5 additions & 5 deletions
```diff
@@ -50,7 +50,7 @@
 - Multi-Modal: 🔥[coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
 - Custom Dataset
 - Supported Templates:
-  - Text Generation: default-generation, chatglm-generation
+  - Text Generation: default-generation, default-generation-bos, chatglm-generation
 - Chat: default, chatml(qwen), baichuan, chatglm2, chatglm3, llama, openbuddy, internlm, xverse, ziya, skywork, bluelm

@@ -111,7 +111,7 @@ infer_args = InferArguments(
     ckpt_dir=best_ckpt_dir,
     load_args_from_ckpt_dir=True,
     stream=True,
-    show_dataset_sample=5)
+    val_dataset_sample=5)
 infer_main(infer_args)
 torch.cuda.empty_cache()
 web_ui_main(infer_args)
@@ -208,12 +208,12 @@ bash scripts/qwen_7b_chat/lora_ddp_ds/infer.sh
 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh

-# sft(full+mp) and infer qwen-7b-chat, requires 2 * 75GB GPU memory.
+# sft(full+mp) and infer qwen-7b-chat, requires 2 * 55GB GPU memory.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/full_mp/sft.sh
 bash scripts/qwen_7b_chat/full_mp/infer.sh

-# sft(full+mp+ddp) and infer qwen-7b-chat, requires 4 * 75GB GPU memory.
+# sft(full+mp+ddp) and infer qwen-7b-chat, requires 4 * 55GB GPU memory.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
 bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
@@ -594,7 +594,7 @@ The template initialization function retrieves the complete chat template based
 - `--dataset`: Default value is `'blossom-math-zh'`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter only takes effect when `eval_human` is set to False.
 - `--dataset_seed`: Default value is `42`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter only takes effect when `eval_human` is set to False.
 - `--dataset_test_ratio`: Default value is `0.01`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`. This parameter only takes effect when `eval_human` is set to False.
-- `--show_dataset_sample`: Indicates the number of samples from the validation set to evaluate and display. Default value is `10`. This parameter only takes effect when `eval_human` is set to False.
+- `--val_dataset_sample`: Indicates the number of samples from the validation set to evaluate and display. Default value is `10`. This parameter only takes effect when `eval_human` is set to False.
 - `--system`: Default value is `'you are a helpful assistant!'`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
 - `--max_length`: Default value is `2048`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
 - `--check_dataset_strategy`: The default value is `'none'`. For specific parameter details, please refer to the `sft.sh Command Line Arguments`.
```
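
With the rename applied, the quick-start snippet that the hunk above patches would read roughly as follows. A sketch assuming the `swift.llm` import path and a placeholder checkpoint directory; only the argument names visible in the diff are confirmed:

```python
# Import path and checkpoint path are assumptions for illustration.
from swift.llm import InferArguments, infer_main

best_ckpt_dir = "output/qwen-7b-chat/vx-xxx/checkpoint-xxx"  # placeholder

infer_args = InferArguments(
    ckpt_dir=best_ckpt_dir,
    load_args_from_ckpt_dir=True,  # restore the arguments saved at training time
    stream=True,                   # stream tokens during generation
    val_dataset_sample=5)          # renamed from show_dataset_sample in this commit
infer_main(infer_args)
```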

examples/pytorch/llm/README_CN.md

Lines changed: 5 additions & 5 deletions
```diff
@@ -50,7 +50,7 @@
 - Multi-Modal: 🔥[coco-en](https://modelscope.cn/datasets/modelscope/coco_2014_caption/summary)
 - Custom datasets
 - Supported chat templates:
-  - Text generation: default-generation, chatglm-generation
+  - Text generation: default-generation, default-generation-bos, chatglm-generation
 - Chat: default, chatml(qwen), baichuan, chatglm2, chatglm3, llama, openbuddy, internlm, xverse, ziya, skywork, bluelm

 ## 🛠️ Prepare the Experimental Environment
@@ -110,7 +110,7 @@ infer_args = InferArguments(
     ckpt_dir=best_ckpt_dir,
     load_args_from_ckpt_dir=True,
     stream=True,
-    show_dataset_sample=5)
+    val_dataset_sample=5)
 infer_main(infer_args)
 torch.cuda.empty_cache()
 web_ui_main(infer_args)
@@ -207,12 +207,12 @@ bash scripts/qwen_7b_chat/lora_ddp_ds/infer.sh
 bash scripts/qwen_7b_chat/lora_mp_ddp/sft.sh
 bash scripts/qwen_7b_chat/lora_mp_ddp/infer.sh

-# Fine-tune (full+mp) and infer qwen-7b-chat, requires 2 * 75GB GPU memory.
+# Fine-tune (full+mp) and infer qwen-7b-chat, requires 2 * 55GB GPU memory.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/full_mp/sft.sh
 bash scripts/qwen_7b_chat/full_mp/infer.sh

-# Fine-tune (full+mp+ddp) and infer qwen-7b-chat, requires 4 * 75GB GPU memory.
+# Fine-tune (full+mp+ddp) and infer qwen-7b-chat, requires 4 * 55GB GPU memory.
 # Recommended experimental environment: A100
 bash scripts/qwen_7b_chat/full_mp_ddp/sft.sh
 bash scripts/qwen_7b_chat/full_mp_ddp/infer.sh
@@ -597,7 +597,7 @@ if __name__ == '__main__':
 - `--dataset`: Default value is `'blossom-math-zh'`. See `sft.sh Command Line Arguments` for details. This parameter only takes effect when `eval_human` is set to False.
 - `--dataset_seed`: Default value is `42`. See `sft.sh Command Line Arguments` for details. This parameter only takes effect when `eval_human` is set to False.
 - `--dataset_test_ratio`: Default value is `0.01`. See `sft.sh Command Line Arguments` for details. This parameter only takes effect when `eval_human` is set to False.
-- `--show_dataset_sample`: The number of validation-set samples to evaluate and display. Default value is `10`. This parameter only takes effect when `eval_human` is set to False.
+- `--val_dataset_sample`: The number of validation-set samples to evaluate and display. Default value is `10`. This parameter only takes effect when `eval_human` is set to False.
 - `--system`: Default value is `'you are a helpful assistant!'`. See `sft.sh Command Line Arguments` for details.
 - `--max_length`: Default value is `2048`. See `sft.sh Command Line Arguments` for details.
 - `--check_dataset_strategy`: Default value is `'none'`. See `sft.sh Command Line Arguments` for details.
```

examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,5 +1,5 @@
 # Experimental environment: 2 * A100
-# 2 * 30GB GPU memory
+# 2 * 28GB GPU memory
 nproc_per_node=2

 PYTHONPATH=../../.. \
@@ -25,7 +25,7 @@ torchrun \
     --lora_alpha 32 \
     --lora_dropout_p 0.05 \
     --lora_target_modules ALL \
-    --gradient_checkpointing false \
+    --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0.01 \
     --learning_rate 1e-4 \
```
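
Several scripts in this commit flip `--gradient_checkpointing` from `false` to `true`, which is what lowers the per-GPU memory figures in the header comments (here 30GB to 28GB). A minimal sketch of what such a flag typically enables on a Hugging Face model; the model id is illustrative, and the flag's exact wiring inside swift is not shown in this diff:

```python
import torch
from transformers import AutoModelForCausalLM

# Illustrative model; any causal LM that supports checkpointing behaves the same.
model = AutoModelForCausalLM.from_pretrained(
    "baichuan-inc/Baichuan2-7B-Chat",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True)

# Drop intermediate activations in the forward pass and recompute them
# during backward: slower steps in exchange for lower activation memory.
model.gradient_checkpointing_enable()
model.config.use_cache = False  # the generation KV-cache is incompatible with checkpointing
```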

examples/pytorch/llm/scripts/internlm_20b/lora_ddp/sft.sh

Lines changed: 5 additions & 4 deletions
```diff
@@ -1,4 +1,5 @@
-# Experimental environment: A100
+# Experimental environment: 2 * A100
+# 2 * 56GB GPU memory
 nproc_per_node=2

 PYTHONPATH=../../.. \
@@ -11,7 +12,7 @@ torchrun \
     --model_revision master \
     --sft_type lora \
     --tuner_backend swift \
-    --template_type default-generation \
+    --template_type default-generation-bos \
     --dtype AUTO \
     --output_dir output \
     --ddp_backend nccl \
@@ -23,8 +24,8 @@ torchrun \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0.05 \
-    --lora_target_modules q_proj k_proj v_proj \
-    --gradient_checkpointing false \
+    --lora_target_modules DEFAULT \
+    --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0.01 \
     --learning_rate 1e-4 \
```
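
This script also swaps `default-generation` for the new `default-generation-bos` template. Judging by the name alone, the variant prepends the tokenizer's BOS token before the prompt; the sketch below shows that construction and is illustrative, not swift's actual template code:

```python
from transformers import AutoTokenizer

# Model id matches this script's target; the tokenizer usage is illustrative.
tokenizer = AutoTokenizer.from_pretrained(
    "internlm/internlm-20b", trust_remote_code=True)

prompt = "Write a short product description for a waterproof jacket."
body_ids = tokenizer(prompt, add_special_tokens=False).input_ids

# default-generation-bos: BOS token first, then the raw prompt tokens.
input_ids = [tokenizer.bos_token_id] + body_ids
```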

examples/pytorch/llm/scripts/internlm_20b/qlora/sft.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -7,7 +7,7 @@ python llm_sft.py \
     --model_revision master \
     --sft_type lora \
     --tuner_backend swift \
-    --template_type default-generation \
+    --template_type default-generation-bos \
     --dtype AUTO \
     --output_dir output \
     --dataset advertise-gen-zh \
```

examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh

Lines changed: 2 additions & 4 deletions
```diff
@@ -1,7 +1,5 @@
 # Experimental environment: 2 * A100
-# 2 * 75GB GPU memory (use flash_attn)
-# You need to install flash_attn or set gradient_checkpointing to True,
-# otherwise it may result in an OOM (Out of Memory) error.
+# 2 * 55GB GPU memory (use flash_attn)
 PYTHONPATH=../../.. \
 CUDA_VISIBLE_DEVICES=0,1 \
 python llm_sft.py \
@@ -16,7 +14,7 @@ python llm_sft.py \
     --num_train_epochs 1 \
     --max_length 8192 \
     --check_dataset_strategy warning \
-    --gradient_checkpointing false \
+    --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0.01 \
     --learning_rate 2e-5 \
```

examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh

Lines changed: 2 additions & 4 deletions
```diff
@@ -1,7 +1,5 @@
 # Experimental environment: 4 * A100
-# 4 * 75GB GPU memory (use flash_attn)
-# You need to install flash_attn or set gradient_checkpointing to True,
-# otherwise it may result in an OOM (Out of Memory) error.
+# 4 * 55GB GPU memory (use flash_attn)
 nproc_per_node=2

 PYTHONPATH=../../.. \
@@ -21,7 +19,7 @@ torchrun \
     --num_train_epochs 1 \
     --max_length 8192 \
     --check_dataset_strategy warning \
-    --gradient_checkpointing false \
+    --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0.01 \
     --learning_rate 2e-5 \
```

examples/pytorch/llm/scripts/skywork_13b/qlora/sft.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -7,7 +7,7 @@ python llm_sft.py \
     --model_revision master \
     --sft_type lora \
     --tuner_backend swift \
-    --template_type default-generation \
+    --template_type default-generation-bos \
     --dtype AUTO \
     --output_dir output \
     --dataset advertise-gen-zh \
```
