Commit 35f90d0

Revamp distillation HF example (#430)
Signed-off-by: Asha Anoosheh <[email protected]>
1 parent 46a9e49 commit 35f90d0

File tree: 4 files changed, +41 -102 lines changed

examples/llm_distill/README.md

Lines changed: 10 additions & 35 deletions
@@ -49,8 +49,8 @@ First obtain both a pretrained model to act as the teacher and a (usually smalle
 from transformers import AutoModelForCausalLM

 # Define student & teacher
-student_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-70B-Instruct")
+student_model = AutoModelForCausalLM.from_pretrained("student-model-id-or-path")
+teacher_model = AutoModelForCausalLM.from_pretrained("teacher-model-id-or-path")
 ```

 ### Set up the meta model
@@ -149,52 +149,27 @@ You can also look at the NeMo tutorial notebooks [here](https://github.com/NVIDI

 ## Knowledge Distillation (KD) for HuggingFace Models

-In this e2e example we finetune Llama-2 models on the [OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca)
-question-answer dataset as a minimal example to demonstrate a simple way of integrating Model Optimizer's KD feature.
+In this e2e example we finetune Llama-3.2 models on the [smol-smoltalk-Interaction-SFT](https://huggingface.co/datasets/ReactiveAI/smol-smoltalk-Interaction-SFT)
+dataset as a minimal example to demonstrate a simple way of integrating Model Optimizer's KD feature.

-First we do supervised finetuning (SFT) of a Llama-2-7b on OpenOrca dataset as the teacher, then distill it into
-a 1B-parameter model.
-
-Keep in mind the training loss of the distillation run is not directly comparable to the training loss of the teacher run.
+We replace normal supervised finetuning (SFT) of a Llama-3.2-1B base model by distilling information from Llama-3.2-3B-Instruct which has already been instruction-finetuned.

 > [!NOTE]
 > We can fit the following in memory using [FSDP](https://huggingface.co/docs/accelerate/en/usage_guides/fsdp) enabled on 8x RTX 6000 (total ~400GB VRAM)

-### Train teacher
-
-```bash
-accelerate launch --config-file ./accelerate_config/fsdp2.yaml \
-    main.py \
-    --single_model \
-    --teacher_name_or_path 'meta-llama/Llama-2-7b-hf' \
-    --output_dir ./llama2-7b-sft \
-    --max_length 2048 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 4 \
-    --max_steps 400 \
-    --logging_steps 5
-```
-
-### Distill teacher into student
-
 ```bash
 accelerate launch --config-file ./accelerate_config/fsdp2.yaml \
-    --fsdp_cpu_ram_efficient_loading False \
-    --fsdp_activation_checkpointing False \
     main.py \
-    --teacher_name_or_path ./llama2-7b-sft \
-    --student_name_or_path 'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T' \
-    --output_dir ./llama2-distill \
+    --teacher_name_or_path 'meta-llama/Llama-3.2-3B-Instruct' \
+    --student_name_or_path 'meta-llama/Llama-3.2-1B' \
+    --output_dir ./llama3.2-distill \
     --max_length 2048 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 4 \
+    --per_device_train_batch_size 4 \
+    --per_device_eval_batch_size 8 \
     --max_steps 200 \
     --logging_steps 5
 ```

-> [!NOTE]
-> If you receive a `RuntimeError: unable to open file <...> in read-only mode: No such file or directory` simply re-run the command a second time.
-
 ## Resources

 - 📅 [Roadmap](https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/146)
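To make the revised README flow concrete, here is a minimal sketch of what the single distillation command above sets up under the hood, mirroring the `main.py` changes later in this commit. The import location of `LMLogitsLoss` is an assumption based on ModelOpt's HuggingFace plugin layout; check it against the example's actual imports.

```python
# Minimal sketch of the KD setup driven by the README command above
# (not the exact example code; see main.py below for the authoritative version).
import modelopt.torch.distill as mtd
import modelopt.torch.opt as mto
import transformers

# Assumed import location for the logits-distillation criterion used in main.py.
from modelopt.torch.distill.plugins.huggingface import LMLogitsLoss

mto.enable_huggingface_checkpointing()  # persist ModelOpt state alongside HF checkpoints

# Same teacher/student pair as the README command; swap in your own models as needed.
student = transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
teacher = transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

# Wrap the student into a distillation "meta model" that carries the teacher and loss.
kd_config = {"teacher_model": teacher, "criterion": LMLogitsLoss()}
model = mtd.convert(student, mode=[("kd_loss", kd_config)])
```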

examples/llm_distill/accelerate_config/fsdp2.yaml

Lines changed: 2 additions & 2 deletions
@@ -4,9 +4,9 @@ distributed_type: FSDP
 downcast_bf16: 'no'
 enable_cpu_affinity: false
 fsdp_config:
-  fsdp_activation_checkpointing: true
+  fsdp_activation_checkpointing: false
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_cpu_ram_efficient_loading: true
+  fsdp_cpu_ram_efficient_loading: false
   fsdp_offload_params: false
   fsdp_reshard_after_forward: true
   fsdp_state_dict_type: SHARDED_STATE_DICT

examples/llm_distill/main.py

Lines changed: 29 additions & 63 deletions
@@ -37,7 +37,6 @@
 class ModelArguments:
     teacher_name_or_path: str | None = None
     student_name_or_path: str | None = None
-    single_model: bool = False


 @dataclass
@@ -55,41 +54,20 @@ class TrainingArguments(transformers.TrainingArguments):
     tf32: bool = True


-def llama_text_format_func(sample):
-    p, q, r = sample["system_prompt"], sample["question"], sample["response"]
-    if not p:
-        return f"<s>[INST] {q}[/INST]\n{r}</s>"
-    else:
-        return f"<s>[INST] <<SYS>>{p}<</SYS>>\n{q}[/INST]\n{r}</s>"
+def _format_smoltalk_chat_template(sample, tokenizer):
+    # smol-smoltalk-Interaction-SFT dataset has "query" and "answer" fields
+    # Convert them to messages format and use tokenizer's apply_chat_template
+    messages = [
+        {"role": "user", "content": sample["query"]},
+        {"role": "assistant", "content": sample["answer"]},
+    ]
+    return tokenizer.apply_chat_template(messages, tokenize=False)


 class KDSFTTrainer(SFTTrainer, KDTrainer):
     pass


-def _save_model_fsdp_compat(
-    self,
-    output_dir: str | None = None,
-    _internal_call: bool = False,
-    *args,
-    **kwargs,
-):
-    output_dir = output_dir or self.args.output_dir
-    model = self.accelerator.unwrap_model(self.model)
-    if not _internal_call and self.is_fsdp_enabled:
-        state_dict = self.accelerator.get_state_dict(self.model)
-        if self.accelerator.is_main_process:
-            model.save_pretrained(
-                output_dir,
-                is_main_process=self.accelerator.is_main_process,
-                save_function=self.accelerator.save,
-                state_dict=state_dict,
-            )
-            self.processing_class.save_pretrained(output_dir)
-    else:
-        super(SFTTrainer, self).save_model(output_dir, _internal_call, *args, **kwargs)
-
-
 def train():
     parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
     model_args, training_args = parser.parse_args_into_dataclasses()
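For reference, a quick sketch of how the new `_format_smoltalk_chat_template` helper behaves. The sample below is made up, and any tokenizer that ships a chat template can stand in for the Llama-3.2 one (which is gated on the Hub):

```python
from transformers import AutoTokenizer

# Any chat-template-equipped tokenizer works; the Llama-3.2 repos are gated on the Hub.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

# Hypothetical sample in the smol-smoltalk-Interaction-SFT "query"/"answer" format.
sample = {
    "query": "What is knowledge distillation?",
    "answer": "Training a smaller student model to match a larger teacher's outputs.",
}

messages = [
    {"role": "user", "content": sample["query"]},
    {"role": "assistant", "content": sample["answer"]},
]
# tokenize=False returns the fully formatted chat string instead of token ids,
# which is the kind of output SFTTrainer's formatting_func is expected to produce.
print(tokenizer.apply_chat_template(messages, tokenize=False))
```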
@@ -98,9 +76,6 @@ def train():
     # modelopt state will be saved automatically to "modelopt_state.pth"
     mto.enable_huggingface_checkpointing()

-    # HACK: Fix FSDP2-incompatible save_model() function for SFTTrainer
-    SFTTrainer.save_model = _save_model_fsdp_compat
-
     # Set total batch size across all ranks to equal 64
     total_batch_size = 64
     num_accum_steps = total_batch_size / (
@@ -117,8 +92,8 @@

     # Dataset
     logger.info("Loading dataset...")
-    dset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
-    dset_splits = dset.train_test_split(train_size=25600, test_size=1700, seed=420)
+    dset = datasets.load_dataset("ReactiveAI/smol-smoltalk-Interaction-SFT", split="train")
+    dset_splits = dset.train_test_split(train_size=12800, test_size=1280, seed=420)
     dset_train, dset_eval = dset_splits["train"], dset_splits["test"]
     logger.info("Dataset loaded.")

@@ -131,42 +106,34 @@
     logger.info("Tokenizer loaded.")

     # Model
-    if model_args.single_model:
-        logger.info("Loading single model only...")
-        model = transformers.AutoModelForCausalLM.from_pretrained(
-            model_path, dtype=torch.bfloat16 if training_args.bf16 else None
-        )
-        logger.info("Model loaded.")
-    else:
-        logger.info("Loading student model...")
-        model = transformers.AutoModelForCausalLM.from_pretrained(
-            model_args.student_name_or_path, dtype=torch.bfloat16 if training_args.bf16 else None
-        )
-        logger.info("Student loaded.")
-        # Load checkpoint
-        logger.info("Loading teacher model and converting to Distillation model...")
-        teacher_model = transformers.AutoModelForCausalLM.from_pretrained(
-            model_args.teacher_name_or_path, dtype=torch.bfloat16 if training_args.bf16 else None
-        )
-        kd_config = {
-            "teacher_model": teacher_model,
-            "criterion": LMLogitsLoss(),
-        }
-        model = mtd.convert(model, mode=[("kd_loss", kd_config)])
-        logger.info("Models converted.")
+    logger.info("Loading student model...")
+    model = transformers.AutoModelForCausalLM.from_pretrained(
+        model_args.student_name_or_path, dtype=torch.bfloat16 if training_args.bf16 else None
+    )
+    logger.info("Student loaded.")
+    # Load checkpoint
+    logger.info("Loading teacher model and converting to Distillation model...")
+    teacher_model = transformers.AutoModelForCausalLM.from_pretrained(
+        model_args.teacher_name_or_path, dtype=torch.bfloat16 if training_args.bf16 else None
+    )
+    kd_config = {
+        "teacher_model": teacher_model,
+        "criterion": LMLogitsLoss(),
+    }
+    model = mtd.convert(model, mode=[("kd_loss", kd_config)])
+    logger.info("Models converted.")

     # Fix problematic settings that logger.info excessive warnings
     model.generation_config.temperature = None
     model.generation_config.top_p = None

     # Trainer
-    trainer_cls = SFTTrainer if model_args.single_model else KDSFTTrainer
-    trainer = trainer_cls(
+    trainer = KDSFTTrainer(
         model,
         training_args,
         train_dataset=dset_train,
         eval_dataset=dset_eval,
-        formatting_func=llama_text_format_func,
+        formatting_func=lambda sample: _format_smoltalk_chat_template(sample, tokenizer),
         processing_class=tokenizer,
     )

@@ -186,8 +153,7 @@ def train():
     # Save checkpoint
     logger.info("Saving checkpoint...")
     trainer.save_state()
-    kwargs = {"export_student": True} if not model_args.single_model else {}
-    trainer.save_model(trainer.args.output_dir, **kwargs)
+    trainer.save_model(trainer.args.output_dir, export_student=True)
    logger.info("Checkpoint saved.")

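With the single-model path removed, the example now always exports the student at the end (`export_student=True`). A hedged sketch of loading that exported checkpoint back, assuming the output directory from the README command and that the export produces a standard HuggingFace checkpoint:

```python
import modelopt.torch.opt as mto
from transformers import AutoModelForCausalLM, AutoTokenizer

# Restores any ModelOpt state saved alongside the checkpoint, if present.
mto.enable_huggingface_checkpointing()

# Path assumed from the README command above (--output_dir ./llama3.2-distill).
output_dir = "./llama3.2-distill"
student = AutoModelForCausalLM.from_pretrained(output_dir)
# The trainer normally saves its processing_class (tokenizer) next to the model;
# fall back to the original student tokenizer if it is missing.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
```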

tests/examples/llm_distill/test_llm_distill.py

Lines changed: 0 additions & 2 deletions
@@ -22,8 +22,6 @@ def test_llama_distill(tiny_llama_path, tmp_path):
     run_example_command(
         [
             "accelerate", "launch", "--config-file", "./accelerate_config/fsdp2.yaml",
-            "--fsdp_cpu_ram_efficient_loading", "False",
-            "--fsdp_activation_checkpointing", "False",
             "main.py",
             "--teacher_name_or_path", tiny_llama_path,
             "--student_name_or_path", tiny_llama_path,
