
Commit 549d20f

Review suggestions and fix test
Signed-off-by: Asha Anoosheh <[email protected]>
1 parent 80c10a2 commit 549d20f

6 files changed: +51 −54 lines changed

examples/llm_distill/README.md (8 additions, 22 deletions)

````diff
@@ -154,47 +154,33 @@ Keep in mind the training loss of the distillation run is not directly comparabl
 ### Train teacher
 
 ```bash
-accelerate launch \
-    --multi_gpu \
-    --mixed_precision bf16 \
-    --fsdp_version 2 \
-    --fsdp_reshard_after_forward True \
-    --fsdp_auto_wrap_policy 'TRANSFORMER_BASED_WRAP' \
-    --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
-    \
+accelerate launch --config-file ./accelerate_config/fsdp2.yaml \
     main.py \
     --single_model \
     --teacher_name_or_path 'meta-llama/Llama-2-7b-hf' \
     --output_dir ./llama2-7b-sft \
-    --logging_steps 5 \
-    --max_steps 400 \
     --max_length 2048 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 4 \
-    --gradient_checkpointing True
+    --max_steps 400 \
+    --logging_steps 5
 ```
 
 ### Distill teacher into student
 
 ```bash
-accelerate launch \
-    --multi_gpu \
-    --mixed_precision bf16 \
-    --fsdp_version 2 \
-    --fsdp_reshard_after_forward True \
-    --fsdp_auto_wrap_policy 'TRANSFORMER_BASED_WRAP' \
-    --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
-    \
+accelerate launch --config-file ./accelerate_config/fsdp2.yaml \
+    --fsdp_cpu_ram_efficient_loading False \
+    --fsdp_activation_checkpointing False \
     main.py \
     --teacher_name_or_path ./llama2-7b-sft \
     --student_name_or_path 'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T' \
     --output_dir ./llama2-distill \
-    --logging_steps 5 \
-    --max_steps 200 \
     --max_length 2048 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 4 \
-    --gradient_checkpointing False
+    --max_steps 200 \
+    --logging_steps 5
 ```
 
 > [!NOTE]
````
examples/llm_distill/accelerate_config/fsdp2.yaml (new file, 25 additions)

```diff
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+fsdp_config:
+  fsdp_activation_checkpointing: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_offload_params: false
+  fsdp_reshard_after_forward: true
+  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  fsdp_version: 2
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: gpu
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
```
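Only two of these `fsdp_config` fields are changed at launch time by the distillation command and the updated test: `fsdp_cpu_ram_efficient_loading` and `fsdp_activation_checkpointing` are flipped to `False` on the CLI, while everything else comes straight from this file. Below is a minimal inspection sketch, not part of the commit, assuming PyYAML is installed and that the config lives at `examples/llm_distill/accelerate_config/fsdp2.yaml` (path inferred from the README changes above):

```python
# Hypothetical helper (not in this commit): print the FSDP settings from the new
# accelerate config and flag the two keys the distill command overrides on the CLI.
import yaml  # assumes PyYAML is available

CONFIG_PATH = "examples/llm_distill/accelerate_config/fsdp2.yaml"  # inferred location
CLI_OVERRIDES = {"fsdp_cpu_ram_efficient_loading", "fsdp_activation_checkpointing"}

with open(CONFIG_PATH) as f:
    cfg = yaml.safe_load(f)

for key, value in cfg["fsdp_config"].items():
    note = "  <- overridden to False on the distill/test command line" if key in CLI_OVERRIDES else ""
    print(f"{key}: {value}{note}")
```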

examples/llm_distill/main.py (3 additions, 8 deletions)

```diff
@@ -21,7 +21,6 @@
 import torch
 import torch.distributed
 import transformers
-from accelerate import PartialState
 from accelerate.logging import get_logger
 from transformers import AutoTokenizer
 from trl import SFTTrainer
@@ -108,21 +107,19 @@ def train():
     if model_args.single_model:
         logger.info("Loading single model only...")
         model = transformers.AutoModelForCausalLM.from_pretrained(
-            model_path, device_map=PartialState().process_index
+            model_path, dtype=torch.bfloat16 if training_args.bf16 else None
         )
         logger.info("Model loaded.")
     else:
         logger.info("Loading student model...")
         model = transformers.AutoModelForCausalLM.from_pretrained(
-            model_args.student_name_or_path,
-            device_map=PartialState().process_index,
+            model_args.student_name_or_path, dtype=torch.bfloat16 if training_args.bf16 else None
         )
         logger.info("Student loaded.")
         # Load checkpoint
         logger.info("Loading teacher model and converting to Distillation model...")
         teacher_model = transformers.AutoModelForCausalLM.from_pretrained(
-            model_args.teacher_name_or_path,
-            device_map=PartialState().process_index,
+            model_args.teacher_name_or_path, dtype=torch.bfloat16 if training_args.bf16 else None
         )
         kd_config = {
             "teacher_model": teacher_model,
@@ -134,8 +131,6 @@ def train():
     # Fix problematic settings that logger.info excessive warnings
     model.generation_config.temperature = None
     model.generation_config.top_p = None
-    if training_args.gradient_checkpointing:
-        training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}
 
     # Trainer
     trainer_cls = SFTTrainer if model_args.single_model else KDSFTTrainer
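The `main.py` change drops the per-process `device_map` placement (and the `PartialState` import it needed) in favor of a dtype hint, leaving device placement to the accelerate/FSDP2 launch config. A minimal sketch of the resulting load pattern, assuming a transformers release whose `from_pretrained` accepts the `dtype` keyword (as used in the diff above); the helper name and standalone `bf16` flag are illustrative only:

```python
# Illustrative sketch only (not part of the commit): pick the checkpoint dtype from a
# bf16 flag and let the launcher/FSDP handle device placement.
import torch
import transformers


def load_causal_lm(name_or_path: str, bf16: bool):
    # None keeps the library default; bfloat16 matches `mixed_precision: bf16` in the config.
    dtype = torch.bfloat16 if bf16 else None
    return transformers.AutoModelForCausalLM.from_pretrained(name_or_path, dtype=dtype)


# Example usage:
# student = load_causal_lm("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", bf16=True)
```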
```

examples/llm_distill/requirements.txt (1 addition, 1 deletion)

```diff
@@ -1,2 +1,2 @@
 pyarrow
-trl==0.23.0
+trl>=0.23.0
```

tests/examples/llm_distill/test_llm_distill.py (7 additions, 7 deletions)

```diff
@@ -21,18 +21,18 @@
 def test_llama_distill(tiny_llama_path, tmp_path):
     run_example_command(
         [
-            "accelerate", "launch", "--multi_gpu", "--mixed_precision", "bf16", "main.py",
+            "accelerate", "launch", "--config-file", "./accelerate_config/fsdp2.yaml",
+            "--fsdp_cpu_ram_efficient_loading", "False",
+            "--fsdp_activation_checkpointing", "False",
+            "main.py",
             "--teacher_name_or_path", tiny_llama_path,
             "--student_name_or_path", tiny_llama_path,
             "--output_dir", tmp_path,
-            "--logging_steps", "5",
-            "--max_steps", "10",
-            "--max_seq_length", "1024",
+            "--max_length", "1024",
             "--per_device_train_batch_size", "2",
             "--per_device_eval_batch_size", "8",
-            "--gradient_checkpointing", "True",
-            "--fsdp", "full_shard auto_wrap",
-            "--fsdp_transformer_layer_cls_to_wrap", "LlamaDecoderLayer",
+            "--max_steps", "10",
+            "--logging_steps", "5",
         ],
         "llm_distill",
     )
```

tests/unit/torch/opt/plugins/test_hf_patching.py (7 additions, 16 deletions)

```diff
@@ -25,15 +25,6 @@
 import modelopt.torch.opt as mto
 
 
-def _teacher_factory(model_name_or_path, teacher_model_type):
-    if teacher_model_type == "qwen3":
-        return get_tiny_qwen3()
-    else:
-        return AutoModelForCausalLM.from_pretrained(
-            model_name_or_path,
-        )
-
-
 @pytest.mark.parametrize(
     ("model_cls", "teacher_model_type"),
     [
@@ -46,12 +37,13 @@ def test_nested_model_save_restore(tmp_path, model_cls, teacher_model_type):
 
     model_ref = model_cls.from_pretrained(tiny_llama_dir)
 
+    if teacher_model_type == "qwen3":
+        teacher_model = get_tiny_qwen3()
+    else:
+        teacher_model = AutoModelForCausalLM.from_pretrained(tiny_llama_dir)
+
     kd_config = {
-        "teacher_model": (
-            _teacher_factory,
-            (tiny_llama_dir, teacher_model_type),
-            {},
-        ),
+        "teacher_model": teacher_model,
         "criterion": mtd.LogitsDistillationLoss(),
         "expose_minimal_state_dict": False,
     }
@@ -61,6 +53,5 @@ def test_nested_model_save_restore(tmp_path, model_cls, teacher_model_type):
     model_test = model_cls.from_pretrained(tiny_llama_dir / "modelopt_model")
 
     tf_output_tester(model, model_test)
-    # since distill model contains loss function, we compare state of model and teacher model manually
+    # since distill model contains loss function, we compare state of model manually
     assert mto.modelopt_state(model.model) == mto.modelopt_state(model_test.model)
-    assert mto.modelopt_state(model._teacher_model) == mto.modelopt_state(model_test._teacher_model)
```
