
Commit cf07926

Disable KD from saving any real state
Signed-off-by: Asha Anoosheh <[email protected]>
1 parent 682bf6d commit cf07926

5 files changed: +24 -45 lines changed


modelopt/torch/distill/config.py

Lines changed: 4 additions & 7 deletions
@@ -16,20 +16,18 @@
 """Configurations for distillation modes."""
 
 import warnings
-from collections.abc import Callable
 from typing import Any, Union
 
 import pydantic
-import torch.nn as nn
 from torch.nn.modules.loss import _Loss as Loss
 
 from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField
+from modelopt.torch.utils.network import ModelLike
 
 from .loss_balancers import DistillationLossBalancer
 
 __all__ = ["KDLossConfig"]
 
-TeacherModel = type[nn.Module] | tuple | Callable
 Criterion = Union[Loss, dict[tuple[str, str], Loss]]  # noqa: UP007
 
 
@@ -42,14 +40,13 @@ class KDLossConfig(ModeloptBaseConfig):
     # TODO: we should really think about a better to configure KDLossConfig
     model_config = pydantic.ConfigDict(extra="forbid", arbitrary_types_allowed=True)
 
-    teacher_model: TeacherModel | None = ModeloptField(
+    teacher_model: ModelLike | None = ModeloptField(
         default=None,
         title="Teacher model",
         description=(
-            "The class or callable or tuple to initialize the teacher model using"
+            "The module, class, callable, or tuple to initialize the teacher model using"
             " :meth:`init_model_from_model_like"
-            " <modelopt.torch.utils.network.init_model_from_model_like>`. This cannot already be an"
-            " instance of nn.Module."
+            " <modelopt.torch.utils.network.init_model_from_model_like>`."
         ),
     )
     criterion: Criterion | None = ModeloptField(
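
The practical effect of the ModelLike switch is that teacher_model may now be an already-instantiated nn.Module, not only a class, callable, or (class, args, kwargs) tuple. Below is a minimal sketch of such a config; it assumes the usual mtd.convert entry point and uses toy nn.Sequential stand-ins rather than real models, with the criterion layer names chosen purely for illustration.

import torch.nn as nn

import modelopt.torch.distill as mtd

# Toy stand-ins for real student/teacher networks (illustration only).
student = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 10))
teacher = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Linear(64, 10))

kd_config = {
    # With ModelLike, an instantiated nn.Module is accepted here, in addition
    # to a class, callable, or (class, args, kwargs) tuple as before.
    "teacher_model": teacher,
    # Criterion maps a pair of student/teacher submodule names to a loss module
    # (here the same name "2" on both sides, i.e. the final Linear layers).
    "criterion": {("2", "2"): nn.MSELoss()},
    "loss_balancer": None,
}

distillation_model = mtd.convert(student, mode=[("kd_loss", kd_config)])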

modelopt/torch/distill/mode.py

Lines changed: 12 additions & 14 deletions
@@ -83,7 +83,12 @@ def restore(self) -> RestoreEntrypoint:
     @property
     def update_for_new_mode(self) -> UpdateEntrypoint:
         """The mode's entrypoint for updating the models state for adding new mode."""
-        return _update_kd_state_before_new_mode
+        return _reset_kd_state_config
+
+    @property
+    def update_for_save(self) -> UpdateEntrypoint:
+        """The mode's entrypoint for updating the models state before saving."""
+        return _reset_kd_state_config
 
 
 @DistillModeRegistry.register_mode
@@ -171,16 +176,12 @@ def _convert_for_kd(model: nn.Module, config: KDLossConfig) -> ConvertReturnType
 
 def _restore_kd_model(model: nn.Module, config: KDLossConfig, metadata: MetadataDict) -> nn.Module:
     """Function for restoring a previously convert model to a distillation meta-model."""
-    # the metadata should be empty
-    assert not metadata, "No metadata expected!"
+    # NOTE: DistillationModel will purposely remain unrestored
+    return model
 
-    return _convert_for_kd(model, config)[0]
 
-
-def _update_kd_state_before_new_mode(
-    model: nn.Module, config: KDLossConfig, metadata: MetadataDict
-) -> None:
-    """Function for updating the model's state before new mode."""
+def _reset_kd_state_config(model: nn.Module, config: KDLossConfig, metadata: MetadataDict):
+    """Function for resetting the state's config."""
     config.teacher_model = nn.Module
     config.criterion = Loss()
     config.loss_balancer = None
@@ -216,8 +217,5 @@ def _export_student(model: nn.Module, config: ExportStudentConfig) -> ConvertRet
 def _restore_exported_student(
     model: nn.Module, config: ExportStudentConfig, metadata: MetadataDict
 ) -> nn.Module:
-    """Function for restoring a previously exported distillation meta-model."""
-    # no metadata is used by the mode
-    assert not metadata, "No metadata expected!"
-
-    return _export_student(model, config)[0]
+    # NOTE: DistillationModel was unrestored so this does nothing
+    return model
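
Taken together, the new entrypoints mean a KD checkpoint no longer round-trips the meta-model: update_for_save strips the teacher, criterion, and balancer down to placeholders before serialization, and _restore_kd_model hands the model back untouched on load. A hedged sketch of the resulting save/restore flow, reusing the hypothetical distillation_model from the previous snippet and the standard mto.save/mto.restore API:

import torch.nn as nn

import modelopt.torch.distill as mtd
import modelopt.torch.opt as mto

# Saving: the stored KDLossConfig now only carries placeholder values
# (teacher_model=nn.Module, criterion=Loss(), loss_balancer=None).
mto.save(distillation_model, "ckpt.pt")

# Restoring: the kd_loss mode is a no-op on load, so the result is the plain
# student module rather than a rebuilt DistillationModel.
fresh_student = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 10))
restored = mto.restore(fresh_student, "ckpt.pt")
assert not isinstance(restored, mtd.DistillationModel)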

modelopt/torch/distill/plugins/huggingface.py

Lines changed: 0 additions & 6 deletions
@@ -80,12 +80,6 @@ def save_model(
                 state_dict=state_dict,
             )
             self.processing_class.save_pretrained(output_dir)
-            if export_student:
-                modelopt_state["modelopt_state_dict"] = [
-                    state
-                    for state in modelopt_state["modelopt_state_dict"]
-                    if "kd_loss" not in state and "export_student" not in state
-                ]
             torch.save(modelopt_state, f"{output_dir}/modelopt_state.pth")
         else:
             model = model.export() if export_student else model
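
Because the KD entries now hold only placeholder config, the trainer no longer has to filter kd_loss/export_student out of modelopt_state_dict before writing the file; the blob saved above can be re-applied to a fresh student as-is. A rough sketch of the consuming side, assuming the standard mto.restore_from_modelopt_state helper and a toy stand-in for the student:

import torch
import torch.nn as nn

import modelopt.torch.opt as mto

# Toy stand-in for a freshly constructed student model (illustration only).
fresh_student = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 10))

# Load the state blob written by save_model() above.
modelopt_state = torch.load("modelopt_state.pth", weights_only=False)

# kd_loss / export_student entries are still present in the state, but with
# this commit they restore as no-ops, so no teacher model is reconstructed.
model = mto.restore_from_modelopt_state(fresh_student, modelopt_state)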

modelopt/torch/quantization/plugins/transformers_trainer.py

Lines changed: 0 additions & 6 deletions
@@ -174,12 +174,6 @@ def _save_modelopt_state_with_weights(self):
         torch.distributed.barrier()
 
         modelopt_state = mto.modelopt_state(self.model)
-        # TODO: remove this from ModelOpt HF Trainer flows
-        modelopt_state["modelopt_state_dict"] = [
-            state
-            for state in modelopt_state["modelopt_state_dict"]
-            if "kd_loss" not in state and "export_student" not in state
-        ]
         modelopt_state["modelopt_state_weights"] = get_quantizer_state_dict(self.model)
 
         if self.args.should_save:

tests/unit/torch/distill/test_distill.py

Lines changed: 8 additions & 12 deletions
@@ -147,19 +147,15 @@ def test_distillation_save_restore(distillation_model, tmp_path):
     new_student = tiny_mobilenet()
     distillation_model_new = mto.restore(new_student, tmp_path / "ckpt.pt")
 
-    assert isinstance(distillation_model_new, mtd.DistillationModel)
-    assert distillation_model_new.teacher_model is not None
-
-    input = get_input_tensor()
-
-    # disable dropout for deterministic results
-    distillation_model.eval()
-    distillation_model_new.eval()
-
-    out = distillation_model(input)
-    out_new = distillation_model_new(input)
+    # Ensure state config was reset
+    manager = mto.ModeloptStateManager(distillation_model_new)
+    cfg = manager._state[-1][1]["config"]
+    assert cfg["teacher_model"] == nn.Module
+    assert isinstance(next(iter(cfg["criterion"].values())), Loss)
+    assert cfg["loss_balancer"] is None
 
-    assert torch.allclose(out, out_new)
+    # Should not have restored anything
+    assert isinstance(distillation_model_new, type(new_student))
 
 
 def test_distillation_export(distillation_model, tmp_path):
