
Commit eb51261

Update TRL version in example
Signed-off-by: Asha Anoosheh <[email protected]>
1 parent: 7df1237

3 files changed: +29 -21 lines

examples/llm_distill/README.md

Lines changed: 22 additions & 10 deletions

````diff
@@ -154,35 +154,47 @@ Keep in mind the training loss of the distillation run is not directly comparabl
 ### Train teacher
 
 ```bash
-accelerate launch --multi_gpu --mixed_precision bf16 main.py \
+accelerate launch \
+    --multi_gpu \
+    --mixed_precision bf16 \
+    --fsdp_version 2 \
+    --fsdp_reshard_after_forward True \
+    --fsdp_auto_wrap_policy 'TRANSFORMER_BASED_WRAP' \
+    --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
+    \
+    main.py \
     --single_model \
     --teacher_name_or_path 'meta-llama/Llama-2-7b-hf' \
     --output_dir ./llama2-7b-sft \
     --logging_steps 5 \
     --max_steps 400 \
-    --max_seq_length 2048 \
+    --max_length 2048 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 4 \
-    --gradient_checkpointing True \
-    --fsdp 'full_shard auto_wrap' \
-    --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer
+    --gradient_checkpointing True
 ```
 
 ### Distill teacher into student
 
 ```bash
-accelerate launch --multi_gpu --mixed_precision bf16 main.py \
+accelerate launch \
+    --multi_gpu \
+    --mixed_precision bf16 \
+    --fsdp_version 2 \
+    --fsdp_reshard_after_forward True \
+    --fsdp_auto_wrap_policy 'TRANSFORMER_BASED_WRAP' \
+    --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
+    \
+    main.py \
     --teacher_name_or_path ./llama2-7b-sft \
     --student_name_or_path 'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T' \
     --output_dir ./llama2-distill \
     --logging_steps 5 \
     --max_steps 200 \
-    --max_seq_length 2048 \
+    --max_length 2048 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 4 \
-    --gradient_checkpointing False \
-    --fsdp 'full_shard auto_wrap' \
-    --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer
+    --gradient_checkpointing False
 ```
 
 > [!NOTE]
````
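The `--max_seq_length` to `--max_length` rename in these commands goes hand in hand with the field rename in the `TrainingArguments` dataclass of `main.py` below, since the CLI flags are parsed into that dataclass. A minimal sketch of that parsing path, assuming `transformers.HfArgumentParser` is wired up the way typical Hugging Face example scripts do it (the parser setup itself is not part of this diff, and the snippet assumes `transformers`, `torch`, and `accelerate` are installed):

```python
# Illustrative sketch only: shows how a renamed CLI flag maps onto a renamed
# dataclass field when parsed with transformers.HfArgumentParser. Field names
# mirror the diff; the parser wiring here is assumed, not copied from main.py.
from dataclasses import dataclass

import transformers


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    max_length: int = 1024  # renamed from max_seq_length in this commit


parser = transformers.HfArgumentParser(TrainingArguments)
(args,) = parser.parse_args_into_dataclasses(
    args=["--output_dir", "./llama2-7b-sft", "--max_length", "2048"]
)
print(args.max_length)  # -> 2048
```

With this kind of wiring an out-of-date flag name simply fails to parse, which is why the README commands and the dataclass field change in the same commit.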

examples/llm_distill/main.py

Lines changed: 6 additions & 10 deletions

```diff
@@ -48,25 +48,22 @@ class TrainingArguments(transformers.TrainingArguments):
     do_train: bool = True
     do_eval: bool = True
     save_strategy: str = "no"
-    max_seq_length: int = 1024
+    max_length: int = 1024
     optim: str = "adamw_torch"
     learning_rate: float = 1e-5
     lr_scheduler_type: str = "cosine"
     dataloader_drop_last: bool = True
     dataset_num_proc: int = 8
-    dataset_batch_size: int = 500
     bf16: bool = True
     tf32: bool = True
 
 
 def llama_text_format_func(sample):
-    texts = []
-    for p, q, r in zip(sample["system_prompt"], sample["question"], sample["response"]):
-        if not p:
-            texts.append(f"<s>[INST] {q}[/INST]\n{r}</s>")
-        else:
-            texts.append(f"<s>[INST] <<SYS>>{p}<</SYS>>\n{q}[/INST]\n{r}</s>")
-    return texts
+    p, q, r = sample["system_prompt"], sample["question"], sample["response"]
+    if not p:
+        return f"<s>[INST] {q}[/INST]\n{r}</s>"
+    else:
+        return f"<s>[INST] <<SYS>>{p}<</SYS>>\n{q}[/INST]\n{r}</s>"
 
 
 class KDSFTTrainer(SFTTrainer, KDTrainer):
@@ -130,7 +127,6 @@ def train():
     kd_config = {
         "teacher_model": teacher_model,
         "criterion": LMLogitsLoss(),
-        "expose_minimal_state_dict": False,  # FSDP forces us to disable this
     }
     model = mtd.convert(model, mode=[("kd_loss", kd_config)])
     logger.info("Models converted.")
```
Lines changed: 1 addition & 1 deletion

```diff
@@ -1,2 +1,2 @@
 pyarrow
-trl==0.13.0
+trl==0.23.0
```
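Since the example now relies on the newer TRL behavior above (`max_length` instead of `max_seq_length`, per-example formatting functions), a quick guard against an older installed TRL can save a confusing failure later. A small sketch, not part of the commit; the `0.23.0` bound simply mirrors the pin:

```python
# Hedged sketch: fail fast if the installed TRL predates the pinned version.
# Uses importlib.metadata plus packaging (already pulled in by transformers).
from importlib.metadata import version

from packaging.version import Version

installed = Version(version("trl"))
if installed < Version("0.23.0"):
    raise RuntimeError(
        f"Found trl=={installed}; this example expects the trl==0.23.0 pin above, "
        "which this commit adapts the training script and README commands to."
    )
```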
