
Commit be7c7c0: Update KD docs
Signed-off-by: Asha Anoosheh <[email protected]>
1 parent: cf07926

3 files changed: 21 additions, 34 deletions


docs/source/guides/4_distillation.rst

Lines changed: 9 additions & 13 deletions
@@ -16,9 +16,9 @@ a more powerful teacher model using :mod:`modelopt.torch.distill <modelopt.torch
    interaction between the two.
 #. **Distillation training**: Seamlessly use the meta-model in place of the original model and run
    the original script with only one additional line of code for loss calculation.
-#. **Checkpoint and re-load**: Save the model via :meth:`mto.save <modelopt.torch.opt.conversion.save>` and
-   restore via :meth:`mto.restore <modelopt.torch.opt.conversion.restore>`. See :ref:`saving and restoring <save-restore>`
-   to learn more.
+#. **Checkpoint and re-load**: Save the model via :meth:`mto.save <modelopt.torch.opt.conversion.save>`.
+   Note that restoring the model (via :meth:`mto.restore <modelopt.torch.opt.conversion.restore>`)
+   will not reinstantiate the distillation meta-model, in order to avoid unpickling issues.
 
 *To find out more about Distillation and related concepts, please refer to the below section*
 :ref:`Distillation Concepts <distillation-concepts>`.
@@ -44,7 +44,7 @@ Example usage:
 
     # Configure and convert for distillation
    distillation_config = {
-        # `teacher_model` is a model class or callable, or a tuple.
+        # `teacher_model` is a model, model class, callable, or a tuple.
         # If a tuple, it must be of the form (model_cls_or_callable,) or
         # (model_cls_or_callable, args) or (model_cls_or_callable, args, kwargs).
         "teacher_model": teacher_model,
@@ -53,15 +53,9 @@ Example usage:
     }
     distillation_model = mtd.convert(model, mode=[("kd_loss", distillation_config)])
 
-    # Export model in original class form
+    # Export model in original class, with only previously-present attributes
     model_exported = mtd.export(distillation_model)
 
-.. note::
-    The config requires a (non-lambda) Callable to return a teacher model in place of the model
-    itself. This is to avoid re-saving the teacher state dict upon saving the Distillation
-    meta model. Thus, the same callable must be available in the namespace when restoring via
-    the :meth:`mto.restore <modelopt.torch.opt.conversion.restore>` utility.
-
 .. tip::
     When training the student on a small corpus of ground truth data, consider using :class:`MFTLoss <modelopt.torch.distill.MFTLoss>` to perform Minifinetuning in lieu of the standard
     :class:`LogitsDistillationLoss <modelopt.torch.distill.losses.LogitsDistillationLoss>`. This will allow the student to learn from the teacher's distribution while adapting to the new data, improving specialization on the new data without overwriting the teacher's general knowledge.
@@ -170,10 +164,12 @@ outputs in the same order as well:
 The intermediate outputs for the losses are captured by the
 :class:`DistillationModel <modelopt.torch.distill.distillation_model.DistillationModel>` and then the loss(es) are
 invoked using :meth:`DistillationModel.compute_kd_loss() <modelopt.torch.distill.distillation_model.DistillationModel.compute_kd_loss>`.
-If present, the original student's non-distillation loss is passed in as an argument.
+If present, the original student's non-distillation loss can be passed in as an argument.
 
 Writing a custom loss function is often necessary, especially to handle outputs that need to be processed
-to obtain the logits and activations.
+to obtain the logits and activations. Additional arguments to the loss function can be passed in to
+:meth:`DistillationModel.compute_kd_loss() <modelopt.torch.distill.distillation_model.DistillationModel.compute_kd_loss>`
+as ``kwargs``.
 
 Loss Balancer
 ^^^^^^^^^^^^^
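
To tie the updated guide text together, here is a minimal sketch of the documented flow (convert, compute the combined loss, save, export). It uses toy modules rather than real networks, and the `student_loss` keyword name for `compute_kd_loss` is an assumption on our part; treat it as an illustration of the API described above, not code from this commit.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

import modelopt.torch.distill as mtd
import modelopt.torch.opt as mto

# Toy stand-ins for real student/teacher networks (illustrative only).
student = nn.Linear(16, 10)
teacher = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 10))

distillation_config = {
    "teacher_model": teacher,  # a model instance now works directly
    "criterion": mtd.LogitsDistillationLoss(),  # student/teacher logits divergence
    "loss_balancer": mtd.StaticLossBalancer(),  # combines student loss with KD loss
}
distillation_model = mtd.convert(student, mode=[("kd_loss", distillation_config)])

x, y = torch.randn(4, 16), torch.randint(0, 10, (4,))
logits = distillation_model(x)  # teacher forward is captured behind the scenes
ce_loss = F.cross_entropy(logits, y)
loss = distillation_model.compute_kd_loss(student_loss=ce_loss)  # kwarg name assumed
loss.backward()

mto.save(distillation_model, "kd_checkpoint.pth")  # save via mto.save, per the updated docs
exported = mtd.export(distillation_model)  # back to the original class form
```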

examples/llm_distill/README.md

Lines changed: 4 additions & 8 deletions
@@ -39,13 +39,9 @@ First obtain both a pretrained model to act as the teacher and a (usually smaller
 ```python
 from transformers import AutoModelForCausalLM
 
-# Define student
+# Define student & teacher
 student_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-
-# Define callable which returns teacher
-def teacher_factory():
-    teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-70B-Instruct")
-    return teacher_model
+teacher_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-70B-Instruct")
 ```
 
 ### Set up the meta model
@@ -58,15 +54,15 @@ Please see an example Distillation setup below. This example assumes the outputs
 import modelopt.torch.distill as mtd
 
 distillation_config = {
-    "teacher_model": teacher_factory,  # model initializer
+    "teacher_model": teacher_model,
     "criterion": mtd.LogitsDistillationLoss(),  # callable receiving student and teacher outputs, in order
     "loss_balancer": mtd.StaticLossBalancer(),  # combines multiple losses; omit if only one distillation loss used
 }
 
 distillation_model = mtd.convert(student_model, mode=[("kd_loss", distillation_config)])
 ```
 
-The `teacher_model` can be either a callable which returns an `nn.Module` or a tuple of `(model_cls, args, kwargs)`. The `criterion` is the distillation loss used between student and teacher tensors. The `loss_balancer` determines how the original and distillation losses are combined (if needed).
+The `teacher_model` can be either an `nn.Module`, a callable which returns an `nn.Module`, or a tuple of `(model_cls, args, kwargs)`. The `criterion` is the distillation loss used between student and teacher tensors. The `loss_balancer` determines how the original and distillation losses are combined (if needed).
 
 See [Distillation](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/4_distillation.html) for more info.
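For completeness, here is a hypothetical sketch of the tuple form of `teacher_model` mentioned above, useful when the teacher should be constructed lazily rather than instantiated up front. The `torch_dtype` kwarg is purely illustrative, and `student_model` is assumed to be defined as in the README snippet:

```python
import torch
from transformers import AutoModelForCausalLM

import modelopt.torch.distill as mtd

# Tuple form: (model_cls_or_callable, args, kwargs); the teacher is built inside convert().
distillation_config = {
    "teacher_model": (
        AutoModelForCausalLM.from_pretrained,    # callable returning an nn.Module
        ("meta-llama/Llama-3.1-70B-Instruct",),  # positional args
        {"torch_dtype": torch.bfloat16},         # keyword args (illustrative)
    ),
    "criterion": mtd.LogitsDistillationLoss(),
}
distillation_model = mtd.convert(student_model, mode=[("kd_loss", distillation_config)])
```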
examples/llm_distill/main.py

Lines changed: 8 additions & 13 deletions
@@ -73,13 +73,6 @@ class KDSFTTrainer(SFTTrainer, KDTrainer):
     pass
 
 
-def _teacher_factory(model_name_or_path):
-    return transformers.AutoModelForCausalLM.from_pretrained(
-        model_name_or_path,
-        device_map=PartialState().process_index,
-    )
-
-
 def train():
     parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
     model_args, training_args = parser.parse_args_into_dataclasses()
@@ -117,7 +110,9 @@ def train():
 
     if model_args.single_model:
         logger.info("Loading single model only...")
-        model = _teacher_factory(model_path)
+        model = transformers.AutoModelForCausalLM.from_pretrained(
+            model_path, device_map=PartialState().process_index
+        )
         logger.info("Model loaded.")
     else:
         logger.info("Loading student model...")
@@ -128,12 +123,12 @@ def train():
         logger.info("Student loaded.")
         # Load checkpoint
         logger.info("Loading teacher model and converting to Distillation model...")
+        teacher_model = transformers.AutoModelForCausalLM.from_pretrained(
+            model_args.teacher_name_or_path,
+            device_map=PartialState().process_index,
+        )
         kd_config = {
-            "teacher_model": (
-                _teacher_factory,
-                (model_args.teacher_name_or_path,),
-                {},
-            ),
+            "teacher_model": teacher_model,
             "criterion": LMLogitsLoss(),
             "expose_minimal_state_dict": False,  # FSDP forces us to disable this
         }
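
The `LMLogitsLoss` criterion in `main.py` is the kind of custom loss the guide refers to: Hugging Face model outputs have to be unpacked to reach the logits before a divergence can be computed. A rough, hypothetical sketch of such a criterion (not the actual `LMLogitsLoss` implementation) might look like this:

```python
# Illustrative only: a custom criterion that digs the logits out of Hugging Face
# CausalLM outputs before applying a temperature-scaled KL divergence.
import torch.nn as nn
import torch.nn.functional as F


class HFLogitsKLLoss(nn.Module):
    def __init__(self, temperature: float = 1.0):
        super().__init__()
        self.temperature = temperature

    def forward(self, student_outputs, teacher_outputs):
        # The criterion receives student and teacher outputs, in that order.
        s = student_outputs.logits / self.temperature
        t = teacher_outputs.logits / self.temperature
        kl = F.kl_div(
            F.log_softmax(s, dim=-1),
            F.softmax(t, dim=-1),
            reduction="batchmean",
        )
        # Standard temperature scaling to keep gradient magnitudes comparable.
        return kl * self.temperature**2
```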
