Commit ff25ae0

set training_args._frozen=False (#70)
1 parent 45256a0 commit ff25ae0

File tree

4 files changed: +7 additions, −5 deletions

examples/pytorch/llm/README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -34,7 +34,7 @@
 5. supported templates: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation
 
 ## Prepare the Environment
-Experimental environment: A10, 3090, A100, ... (V100 does not support bf16, quantization)
+Experimental environment: V100, A10, 3090, A100, ... (V100 does not support bf16, quantization)
 ```bash
 # Installing miniconda
 wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
````

examples/pytorch/llm/README_CN.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -35,7 +35,7 @@
 5. 支持的对话模板: chatml(qwen), baichuan, chatglm2, llama, openbuddy-llama, default, default-generation
 
 ## 准备实验环境
-实验环境: A10, 3090, A100均可. (V100不支持bf16, 量化)
+实验环境: V100, A10, 3090, A100均可. (V100不支持bf16, 量化)
 ```bash
 # 安装miniconda
 wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
````

(The changed line translates to: "Experimental environment: V100, A10, 3090, A100 all work. (V100 does not support bf16, quantization)". The heading and comment translate to "Prepare the Experimental Environment" and "Installing miniconda".)
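Both README changes add V100 to the supported environments while noting that it does not support bf16. As an illustration, a hypothetical helper (not part of this repo) could select a mixed-precision dtype at runtime; `torch.cuda.is_bf16_supported()` is a real PyTorch API, and the sketch falls back to fp16 when torch or a bf16-capable GPU is unavailable:

```python
def pick_mixed_precision_dtype() -> str:
    """Return 'bf16' on GPUs that support it (e.g. A100), else 'fp16'.

    Hypothetical helper, for illustration only: a V100 lands in the
    'fp16' branch because its hardware predates bf16 support.
    """
    try:
        import torch  # optional dependency; fall back to fp16 if missing
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
            return 'bf16'
    except ImportError:
        pass
    return 'fp16'

print(pick_mixed_precision_dtype())
```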

examples/pytorch/llm/scripts/qwen_7b_chat/lora_mp_ddp/sft.sh

Lines changed: 2 additions & 2 deletions

```diff
@@ -19,8 +19,8 @@ torchrun \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0. \
-    --lora_target_modules c_attn c_proj \
-    --gradient_checkpointing true \
+    --lora_target_modules c_attn \
+    --gradient_checkpointing false \
     --batch_size 1 \
     --weight_decay 0. \
     --learning_rate 1e-4 \
```
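The script passes `--lora_target_modules` as space-separated values (previously `c_attn c_proj`, now just `c_attn`). The repo's own `SftArguments` dataclass handles the real parsing; purely as a sketch, the same flag shape can be modeled with stdlib `argparse` (hypothetical parser, for illustration only):

```python
import argparse

# Hypothetical stand-in for the repo's argument parsing: a
# space-separated flag value becomes a list of module names.
parser = argparse.ArgumentParser()
parser.add_argument('--lora_target_modules', nargs='+', default=['c_attn'])
parser.add_argument('--gradient_checkpointing',
                    type=lambda s: s.lower() == 'true', default=True)

# Flags as in this commit's version of sft.sh:
ns = parser.parse_args(
    ['--lora_target_modules', 'c_attn', '--gradient_checkpointing', 'false'])
print(ns.lora_target_modules, ns.gradient_checkpointing)
```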

examples/pytorch/llm/src/llm_sft.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -115,7 +115,7 @@ class SftArguments:
         default=None,
         metadata={
             'help':
-            "This parameter is used only when model_type.startswith('qwen-7b')"
+            "This parameter is used only when model_type.startswith('qwen')"
         })
 
     def __post_init__(self):
@@ -316,6 +316,8 @@ def llm_sft(args: SftArguments) -> None:
     model.config.use_cache = False
     model.enable_input_require_grads()
     if is_dist():
+        # Compatible with https://github.com/huggingface/transformers/pull/25903
+        training_args._frozen = False
         if args.gradient_checkpointing:
             training_args.ddp_find_unused_parameters = False
             training_args.ddp_broadcast_buffers = False
```
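The linked transformers PR (#25903) makes `TrainingArguments` reject attribute assignment after initialization unless `_frozen` is set back to `False`, which is why this commit flips `training_args._frozen = False` before mutating the DDP fields. A minimal pure-Python sketch of that freeze mechanism (a toy `FrozenArgs` class, not the real `TrainingArguments`):

```python
import dataclasses


@dataclasses.dataclass
class FrozenArgs:
    """Toy stand-in for transformers' post-init freezing of TrainingArguments."""
    ddp_find_unused_parameters: bool = True
    ddp_broadcast_buffers: bool = True

    def __post_init__(self):
        self._frozen = True  # from here on, assignment raises

    def __setattr__(self, name, value):
        # Allow writes during __init__/__post_init__ and to _frozen itself.
        if getattr(self, '_frozen', False) and name != '_frozen':
            raise dataclasses.FrozenInstanceError(
                f'cannot assign to field {name!r}')
        super().__setattr__(name, value)


args = FrozenArgs()
args._frozen = False  # what this commit does before mutating the args
args.ddp_find_unused_parameters = False
args.ddp_broadcast_buffers = False
```

Without the `args._frozen = False` line, the two assignments below it would raise, matching the failure this commit fixes for distributed training.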
