Commit 4d9e27f

Merge commit '2626106a398a96cf4df6801247f8c4aa91820f64' into release/1.5
* commit '2626106a398a96cf4df6801247f8c4aa91820f64': update default_lr; fix do_sample in vllm (#336)
2 parents 234fefe + 2626106 commit 4d9e27f

File tree: 16 files changed (+23, −16 lines)

docs/source/LLM/命令行参数.md

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
  - `--num_train_epochs`: number of training epochs, default `1`. If `max_steps >= 0`, it overrides `num_train_epochs`.
  - `--max_steps`: maximum number of training steps, default `-1`. If `max_steps >= 0`, it overrides `num_train_epochs`.
  - `--optim`: default `'adamw_torch'`.
- - `--learning_rate`: default `None`, i.e. set to 1e-4 if `sft_type` is lora, and to 2e-5 if `sft_type` is full.
+ - `--learning_rate`: default `None`, i.e. set to 1e-4 if `sft_type` is lora, and to 1e-5 if `sft_type` is full.
  - `--weight_decay`: default `0.01`.
  - `--gradient_accumulation_steps`: gradient accumulation, default `None`, set to `math.ceil(16 / self.batch_size / world_size)`. `total_batch_size = batch_size * gradient_accumulation_steps * world_size`.
  - `--max_grad_norm`: gradient clipping, default `0.5`.
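The default-resolution behavior described above can be sketched as follows (a minimal illustration with a hypothetical helper name, not the actual swift source):

```python
def default_learning_rate(sft_type: str, learning_rate=None):
    """Resolve --learning_rate when it is left as None.

    Per the docs above: lora -> 1e-4, full -> 1e-5 (previously 2e-5).
    An explicitly passed value is kept as-is.
    """
    if learning_rate is not None:
        return learning_rate
    if sft_type == 'lora':
        return 1e-4
    if sft_type == 'full':
        return 1e-5
    raise ValueError(f'unknown sft_type: {sft_type}')
```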

examples/pytorch/llm/scripts/qwen_1_8b_chat/full/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ python llm_sft.py \
    --gradient_checkpointing true \
    --batch_size 1 \
    --weight_decay 0.01 \
-   --learning_rate 2e-5 \
+   --learning_rate 1e-5 \
    --gradient_accumulation_steps 16 \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \

examples/pytorch/llm/scripts/qwen_1_8b_chat/full_ddp/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ torchrun \
    --gradient_checkpointing true \
    --batch_size 1 \
    --weight_decay 0.01 \
-   --learning_rate 2e-5 \
+   --learning_rate 1e-5 \
    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \
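The `$(expr 16 / $nproc_per_node)` expression in the DDP scripts keeps the total batch size constant as the process count changes, matching the documented default `math.ceil(16 / batch_size / world_size)`. A quick illustrative sketch:

```python
import math

def grad_accum_steps(batch_size: int, world_size: int, target: int = 16) -> int:
    # Documented default: math.ceil(16 / batch_size / world_size)
    return math.ceil(target / batch_size / world_size)

# total_batch_size = batch_size * gradient_accumulation_steps * world_size
for world_size in (1, 2, 4):
    steps = grad_accum_steps(batch_size=1, world_size=world_size)
    total = 1 * steps * world_size
    print(world_size, steps, total)  # total stays at 16
```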

examples/pytorch/llm/scripts/qwen_7b_chat/full/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ swift sft \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 4096 \
-   --learning_rate 2e-5 \
+   --learning_rate 1e-5 \
    --use_flash_attn true \
    --save_only_model true \
    --dataset codefuse-evol-instruction-zh \

examples/pytorch/llm/scripts/qwen_7b_chat/full_freeze_ddp/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ swift sft \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 4096 \
-   --learning_rate 2e-5 \
+   --learning_rate 1e-5 \
    --use_flash_attn true \
    --save_only_model true \
    --dataset codefuse-evol-instruction-zh \

examples/pytorch/llm/scripts/qwen_7b_chat/full_mp/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ python llm_sft.py \
    --gradient_checkpointing true \
    --batch_size 1 \
    --weight_decay 0.01 \
-   --learning_rate 2e-5 \
+   --learning_rate 1e-5 \
    --gradient_accumulation_steps 16 \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \

examples/pytorch/llm/scripts/qwen_7b_chat/full_mp_ddp/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ torchrun \
    --gradient_checkpointing true \
    --batch_size 1 \
    --weight_decay 0.01 \
-   --learning_rate 2e-5 \
+   --learning_rate 1e-5 \
    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \

examples/pytorch/llm/scripts/qwen_audio_chat/full_mp/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ swift sft \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
-   --learning_rate 2e-5 \
+   --learning_rate 1e-5 \
    --use_flash_attn true \
    --save_only_model true \
    --dataset aishell1-mini-zh \

examples/pytorch/llm/scripts/qwen_audio_chat/full_mp_ddp/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ swift sft \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
-   --learning_rate 2e-5 \
+   --learning_rate 1e-5 \
    --use_flash_attn true \
    --save_only_model true \
    --dataset aishell1-mini-zh \

examples/pytorch/llm/scripts/qwen_vl_chat/full_mp/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ swift sft \
    --output_dir output \
    --num_train_epochs 1 \
    --max_length 2048 \
-   --learning_rate 2e-5 \
+   --learning_rate 1e-5 \
    --use_flash_attn true \
    --save_only_model true \
    --dataset coco-mini-en \
