Add baichuan2 (#40)

Jintao-Huang · web-flow · commit a875f16cd908 · 2023-09-06T16:24:26.000+08:00
diff --git a/README.md b/README.md
@@ -32,7 +32,7 @@ Key features:
 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm)
 
 1. supported SFT methods: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full(full parameter fine-tuning)
-2. supported models: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b
+2. supported models: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b, baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat
 3. supported features: quantization, ddp, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, multi-round chat, ...
 4. supported datasets:
    1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en
diff --git a/README_CN.md b/README_CN.md
@@ -30,7 +30,7 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
 [code link](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm)
 
 1. 支持的SFT方法: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), 全参数微调
-2. 支持的模型: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b
+2. 支持的模型: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b, baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat
 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ...
 4. 支持的数据集:
    1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en
diff --git a/examples/pytorch/llm/README.md b/examples/pytorch/llm/README.md
@@ -16,7 +16,7 @@
 
 ## Features
 1. supported SFT methods: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), full(full parameter fine-tuning)
-2. supported models: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b
+2. supported models: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b, baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat
 3. supported features: quantization, ddp, model parallelism(device map), gradient checkpointing, gradient accumulation, pushing to modelscope hub, custom datasets, multimodal and agent SFT, multi-round chat, ...
 4. supported datasets:
    1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en
@@ -59,20 +59,24 @@ pip install .
 git clone https://github.com/modelscope/swift.git
 cd swift/examples/pytorch/llm
 
+# sft(lora) and infer qwen-7b, Requires 22GB VRAM.
+# If you want to push weights into modelscope hub during training, you need to set '--push_to_hub true'
+bash scripts/qwen_7b_chat/lora/sft.sh
+bash scripts/qwen_7b_chat/lora/infer.sh
+
+# sft(lora+ddp) and infer qwen-7b, Requires 4*22GB VRAM.
+bash scripts/qwen_7b_chat/lora_ddp/sft.sh
+bash scripts/qwen_7b_chat/lora_ddp/infer.sh
+
 # sft(qlora) and infer qwen-7b, Requires 16GB VRAM.
 # If you want to use quantization, you need to `pip install bitsandbytes -U`
-# If you want to push weights into modelscope hub during training, you need to set '--push_to_hub true'
 bash scripts/qwen_7b_chat/qlora/sft.sh
 bash scripts/qwen_7b_chat/qlora/infer.sh
 
 # sft(qlora+ddp) and infer qwen-7b, Requires 4*16GB VRAM.
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
-# sft(lora+ddp) and infer qwen-7b, Requires 4*22GB VRAM.
-bash scripts/qwen_7b_chat/lora_ddp/sft.sh
-bash scripts/qwen_7b_chat/lora_ddp/infer.sh
-
 # sft(full) and infer qwen-7b, Requires 95GB VRAM.
 bash scripts/qwen_7b_chat/full/sft.sh
 bash scripts/qwen_7b_chat/full/infer.sh
diff --git a/examples/pytorch/llm/README_CN.md b/examples/pytorch/llm/README_CN.md
@@ -17,7 +17,7 @@
 
 ## 特性
 1. 支持的SFT方法: [lora](https://arxiv.org/abs/2106.09685), [qlora](https://arxiv.org/abs/2305.14314), 全参数微调
-2. 支持的模型: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b
+2. 支持的模型: qwen-7b, [qwen-7b-chat](https://github.com/QwenLM/Qwen-7B), qwen-vl, [qwen-vl-chat](https://github.com/QwenLM/Qwen-VL), baichuan-7b, baichuan-13b, baichuan-13b-chat, chatglm2-6b, chatglm2-6b-32k, llama2-7b, llama2-7b-chat, llama2-13b, llama2-13b-chat, llama2-70b, llama2-70b-chat, openbuddy-llama2-13b, openbuddy-llama-65b, openbuddy-llama2-70b, polylm-13b, baichuan2-7b, baichuan2-7b-chat, baichuan2-13b, baichuan2-13b-chat
 3. 支持的特性: 模型量化, DDP, 模型并行(device_map), gradient checkpointing, 梯度累加, 支持推送ModelScope Hub, 自定义数据集, 多模态和Agent SFT, 多轮对话, ...
 4. 支持的数据集:
    1. NLP: alpaca-en(gpt4), alpaca-zh(gpt4), finance-en, multi-alpaca-all, code-en, instinwild-en, instinwild-zh, cot-en, cot-zh, firefly-all-zh, poetry-zh, instruct-en, gpt4all-en
@@ -61,20 +61,24 @@ pip install .
 git clone https://github.com/modelscope/swift.git
 cd swift/examples/pytorch/llm
 
+# 微调(lora)+推理 qwen-7b, 需要22GB显存.
+# 如果你想在训练时, 将权重push到modelscope hub中, 你需要设置`--push_to_hub true`
+bash scripts/qwen_7b_chat/lora/sft.sh
+bash scripts/qwen_7b_chat/lora/infer.sh
+
+# 微调(lora+ddp)+推理 qwen-7b, 需要4卡*22GB显存.
+bash scripts/qwen_7b_chat/lora_ddp/sft.sh
+bash scripts/qwen_7b_chat/lora_ddp/infer.sh
+
 # 微调(qlora)+推理 qwen-7b, 需要16GB显存.
 # 如果你想要使用量化, 你需要`pip install bitsandbytes -U`
-# 如果你想在训练时, 将权重push到modelscope hub中, 你需要设置`--push_to_hub true`
 bash scripts/qwen_7b_chat/qlora/sft.sh
 bash scripts/qwen_7b_chat/qlora/infer.sh
 
 # 微调(qlora+ddp)+推理 qwen-7b, 需要4卡*16GB显存.
 bash scripts/qwen_7b_chat/qlora_ddp/sft.sh
 bash scripts/qwen_7b_chat/qlora_ddp/infer.sh
 
-# 微调(lora+ddp)+推理 qwen-7b, 需要4卡*22GB显存.
-bash scripts/qwen_7b_chat/lora_ddp/sft.sh
-bash scripts/qwen_7b_chat/lora_ddp/infer.sh
-
 # 微调(full)+推理 qwen-7b, 需要95G显存.
 bash scripts/qwen_7b_chat/full/sft.sh
 bash scripts/qwen_7b_chat/full/infer.sh
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/infer.sh
@@ -0,0 +1,14 @@
+# 16G
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_infer.py \
+    --model_type baichuan2-7b-chat \
+    --sft_type lora \
+    --template_type baichuan \
+    --dtype bf16 \
+    --ckpt_dir "runs/baichuan2-7b-chat/vx_xxx/checkpoint-xxx" \
+    --eval_human true \
+    --max_new_tokens 1024 \
+    --temperature 0.9 \
+    --top_k 50 \
+    --top_p 0.9 \
+    --do_sample true \
diff --git a/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/baichuan2_7b_chat/lora_ddp/sft.sh
@@ -1,26 +1,36 @@
-# 4 * 17G
+# 4 * 22GB VRAM
 nproc_per_node=4
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 torchrun \
     --nproc_per_node=$nproc_per_node \
     --master_port 29500 \
     src/llm_sft.py \
-    --model_type baichuan-13b-chat \
+    --model_type baichuan2-7b-chat \
     --sft_type lora \
+    --template_type baichuan \
+    --dtype bf16 \
     --output_dir runs \
     --ddp_backend nccl \
     --dataset alpaca-en,alpaca-zh \
-    --dataset_sample -1 \
+    --dataset_sample 20000 \
     --num_train_epochs 1 \
     --max_length 1024 \
-    --quantization_bit 4 \
     --lora_rank 8 \
     --lora_alpha 32 \
-    --lora_dropout_p 0.1 \
+    --lora_dropout_p 0.05 \
+    --lora_target_modules W_pack o_proj \
+    --gradient_checkpointing true \
     --batch_size 1 \
+    --weight_decay 0. \
     --learning_rate 1e-4 \
     --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+    --max_grad_norm 0.5 \
+    --warmup_ratio 0.03 \
     --eval_steps 50 \
     --save_steps 50 \
     --save_total_limit 2 \
     --logging_steps 10 \
+    --push_to_hub false \
+    --hub_model_id baichuan2-7b-chat-lora \
+    --hub_private_repo true \
+    --hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/infer.sh
@@ -1,4 +1,4 @@
-# 10G
+# 16G
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
     --model_type qwen-7b \
@@ -7,8 +7,6 @@ python src/llm_infer.py \
     --dtype bf16 \
     --ckpt_dir "runs/qwen-7b/vx_xxx/checkpoint-xxx" \
     --eval_human true \
-    --quantization_bit 4 \
-    --bnb_4bit_comp_dtype bf16 \
     --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b/lora_ddp/sft.sh
@@ -1,4 +1,4 @@
-# 4 * 16GB VRAM
+# 4 * 22GB VRAM
 nproc_per_node=4
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 torchrun \
@@ -15,12 +15,10 @@ torchrun \
     --dataset_sample -1 \
     --num_train_epochs 1 \
     --max_length 1024 \
-    --quantization_bit 4 \
-    --bnb_4bit_comp_dtype bf16 \
-    --lora_rank 64 \
+    --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0.05 \
-    --lora_target_modules ALL \
+    --lora_target_modules c_attn c_proj \
     --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0. \
@@ -34,6 +32,6 @@ torchrun \
     --logging_steps 10 \
     --use_flash_attn false \
     --push_to_hub false \
-    --hub_model_id qwen-7b-qlora \
+    --hub_model_id qwen-7b-lora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/infer.sh
@@ -1,11 +1,12 @@
-# 12G
+# 16G
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
-    --model_type baichuan-13b-chat \
+    --model_type qwen-7b-chat \
     --sft_type lora \
-    --ckpt_dir "runs/baichuan-13b-chat/vx_xxx/checkpoint-xxx" \
+    --template_type chatml \
+    --dtype bf16 \
+    --ckpt_dir "runs/qwen-7b-chat/vx_xxx/checkpoint-xxx" \
     --eval_human true \
-    --quantization_bit 4 \
     --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora/sft.sh
@@ -0,0 +1,32 @@
+# 22GB VRAM
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_sft.py \
+    --model_type qwen-7b-chat \
+    --sft_type lora \
+    --template_type chatml \
+    --dtype bf16 \
+    --output_dir runs \
+    --dataset alpaca-en,alpaca-zh \
+    --dataset_sample -1 \
+    --num_train_epochs 1 \
+    --max_length 1024 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --lora_dropout_p 0.05 \
+    --lora_target_modules c_attn c_proj \
+    --gradient_checkpointing true \
+    --batch_size 1 \
+    --weight_decay 0. \
+    --learning_rate 1e-4 \
+    --gradient_accumulation_steps 16 \
+    --max_grad_norm 0.5 \
+    --warmup_ratio 0.03 \
+    --eval_steps 50 \
+    --save_steps 50 \
+    --save_total_limit 2 \
+    --logging_steps 10 \
+    --use_flash_attn false \
+    --push_to_hub false \
+    --hub_model_id qwen-7b-chat-lora \
+    --hub_private_repo true \
+    --hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_7b_chat/lora_ddp/sft.sh
@@ -18,6 +18,7 @@ torchrun \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0.05 \
+    --lora_target_modules c_attn c_proj \
     --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0. \
diff --git a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/infer.sh
@@ -1,4 +1,4 @@
-# 10G
+# 16G
 CUDA_VISIBLE_DEVICES=0 \
 python src/llm_infer.py \
     --model_type qwen-vl \
@@ -9,8 +9,6 @@ python src/llm_infer.py \
     --eval_human false \
     --dataset coco-en \
     --dataset_sample 20000 \
-    --quantization_bit 4 \
-    --bnb_4bit_comp_dtype bf16 \
     --max_new_tokens 1024 \
     --temperature 0.9 \
     --top_k 50 \
diff --git a/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_vl/lora_ddp/sft.sh
@@ -1,4 +1,4 @@
-# 4 * 16GB VRAM
+# 4 * 22GB VRAM
 nproc_per_node=4
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 torchrun \
@@ -15,12 +15,10 @@ torchrun \
     --dataset_sample 20000 \
     --num_train_epochs 1 \
     --max_length 1024 \
-    --quantization_bit 4 \
-    --bnb_4bit_comp_dtype bf16 \
-    --lora_rank 64 \
+    --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0.05 \
-    --lora_target_modules ALL \
+    --lora_target_modules c_attn attn.c_proj \
     --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0. \
@@ -34,6 +32,6 @@ torchrun \
     --logging_steps 10 \
     --use_flash_attn false \
     --push_to_hub false \
-    --hub_model_id qwen-vl-qlora \
+    --hub_model_id qwen-vl-lora \
     --hub_private_repo true \
     --hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora/infer.sh
@@ -0,0 +1,16 @@
+# 19G
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_infer.py \
+    --model_type qwen-vl-chat \
+    --sft_type lora \
+    --template_type chatml \
+    --dtype bf16 \
+    --ckpt_dir "runs/qwen-vl-chat/vx_xxx/checkpoint-xxx" \
+    --eval_human false \
+    --dataset coco-en \
+    --dataset_sample 20000 \
+    --max_new_tokens 1024 \
+    --temperature 0.9 \
+    --top_k 50 \
+    --top_p 0.9 \
+    --do_sample true \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora/sft.sh
@@ -0,0 +1,32 @@
+# 23GB VRAM
+CUDA_VISIBLE_DEVICES=0 \
+python src/llm_sft.py \
+    --model_type qwen-vl-chat \
+    --sft_type lora \
+    --template_type chatml \
+    --dtype bf16 \
+    --output_dir runs \
+    --dataset coco-en \
+    --dataset_sample 20000 \
+    --num_train_epochs 1 \
+    --max_length 1024 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --lora_dropout_p 0.05 \
+    --lora_target_modules c_attn attn.c_proj \
+    --gradient_checkpointing true \
+    --batch_size 1 \
+    --weight_decay 0. \
+    --learning_rate 1e-4 \
+    --gradient_accumulation_steps 16 \
+    --max_grad_norm 0.5 \
+    --warmup_ratio 0.03 \
+    --eval_steps 50 \
+    --save_steps 50 \
+    --save_total_limit 2 \
+    --logging_steps 10 \
+    --use_flash_attn false \
+    --push_to_hub false \
+    --hub_model_id qwen-vl-chat-lora \
+    --hub_private_repo true \
+    --hub_token 'your-sdk-token' \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/lora_ddp/sft.sh
@@ -18,6 +18,7 @@ torchrun \
     --lora_rank 8 \
     --lora_alpha 32 \
     --lora_dropout_p 0.05 \
+    --lora_target_modules c_attn attn.c_proj \
     --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0. \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora/sft.sh
@@ -15,7 +15,7 @@ python src/llm_sft.py \
     --lora_rank 64 \
     --lora_alpha 32 \
     --lora_dropout_p 0.05 \
-    --lora_target_modules ALL \
+    --lora_target_modules c_attn attn.c_proj \
     --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0. \
diff --git a/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh b/examples/pytorch/llm/scripts/qwen_vl_chat/qlora_ddp/sft.sh
@@ -20,7 +20,7 @@ torchrun \
     --lora_rank 64 \
     --lora_alpha 32 \
     --lora_dropout_p 0.05 \
-    --lora_target_modules ALL \
+    --lora_target_modules c_attn attn.c_proj \
     --gradient_checkpointing true \
     --batch_size 1 \
     --weight_decay 0. \
diff --git a/examples/pytorch/llm/src/utils/model.py b/examples/pytorch/llm/src/utils/model.py
diff --git a/examples/pytorch/llm/src/utils/utils.py b/examples/pytorch/llm/src/utils/utils.py