
Commit 5ffc16d

feat: remove checkpointer from Automodel class (#1147)
Signed-off-by: adil-a <adil.asif2000@hotmail.com>
1 parent 801f63e commit 5ffc16d

8 files changed: +147 -482 lines changed

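In practical terms, callers of the Automodel entry points no longer construct or pass a Checkpointer: apply_model_infrastructure now builds a dummy one internally from pretrained_model_name_or_path and cache_dir. A minimal usage sketch follows; the top-level import path and the model id are illustrative assumptions, not taken from this commit.

# Hedged usage sketch; import path and model id are assumed, not shown in this commit.
from nemo_automodel import NeMoAutoModelForCausalLM  # assumed public import path

# Previously a Checkpointer had to be created and passed as checkpointer=...;
# after this change the same call works without it. tp_size/cp_size are shown as
# keyword arguments, mirroring the kwargs used by the recipes touched in this commit.
model = NeMoAutoModelForCausalLM.from_pretrained(
    "org/model-id",  # hypothetical model id
    tp_size=1,
    cp_size=1,
)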

nemo_automodel/_transformers/auto_model.py

Lines changed: 41 additions & 37 deletions
@@ -46,6 +46,7 @@
 from nemo_automodel.components._peft.lora import apply_lora_to_linear_modules
 from nemo_automodel.components.checkpoint.checkpointing import (
     Checkpointer,
+    CheckpointingConfig,
     _maybe_adapt_state_dict_to_hf,
 )
 from nemo_automodel.components.distributed.ddp import DDPManager
@@ -524,7 +525,6 @@ def apply_model_infrastructure(
     is_hf_model,
     is_meta_device,
     device,
-    checkpointer,
     model_wrapper=None,
     tp_size=1,
     cp_size=1,
@@ -536,9 +536,9 @@ def apply_model_infrastructure(
     autopipeline=None,
     parallelize_fn=None,
     compile_config=None,
-    model_name_or_path=None,
     load_base_model=False,
     cache_dir=None,
+    pretrained_model_name_or_path="",
     **_kwargs,
 ):
     """Apply sharding, PEFT, quantization, and checkpoint loading to a model.
@@ -558,7 +558,6 @@ def apply_model_infrastructure(
         is_hf_model: Whether this is an HF model (vs custom implementation)
         is_meta_device: Whether model was initialized on meta device
         device: Target device for model
-        checkpointer: Checkpointer instance for weight loading
         model_wrapper: Model wrapper (FSDP2Manager, DDPManager, etc.). Default: None
         tp_size: Tensor parallelism size. Default: 1
         cp_size: Context parallelism size. Default: 1
@@ -570,7 +569,7 @@ def apply_model_infrastructure(
         autopipeline: AutoPipeline instance for pipeline parallelism. Default: None
         parallelize_fn: Function to apply parallelization (EP + FSDP2). Default: None
         compile_config: Compilation configuration. Default: None
-        model_name_or_path: Model name or path for checkpoint loading. Default: None
+        pretrained_model_name_or_path: Model name or path for checkpoint loading. Default: None
         load_base_model: Whether to load base model weights (True for from_pretrained). Default: False
         cache_dir: Cache directory for model weights. Default: None
         **_kwargs: Additional keyword arguments (ignored, allows passing extra kwargs)
@@ -580,6 +579,24 @@ def apply_model_infrastructure(
     """
     _verify_sdpa_support(model, is_hf_model, cp_size)

+    # Create a dummy checkpointer. We can pass in dummy values here since we are only loading the base weights.
+    ckpt_config = CheckpointingConfig(
+        enabled=True,
+        checkpoint_dir="",
+        model_save_format="safetensors",
+        model_cache_dir=cache_dir,
+        model_repo_id=pretrained_model_name_or_path,
+        save_consolidated=True,
+        is_peft=peft_config is not None,
+    )
+    checkpointer = Checkpointer(
+        ckpt_config,
+        0,
+        0,
+        0,
+        getattr(model_wrapper, "moe_mesh", None) if model_wrapper else None,
+    )
+
     # Handle checkpointer config updates if checkpointer is provided
     dequantize_base_checkpoint = False
     if checkpointer is not None:
@@ -599,11 +616,10 @@ def apply_model_infrastructure(
         model, tp_size, autopipeline, peft_config, quantization_config, fp8_config, qat_quantizer
     )

-    # hold a list copy of the model state dict keys before any parallelization
-    if checkpointer is not None:
-        checkpointer.config.model_state_dict_keys = list(
-            _maybe_adapt_state_dict_to_hf(model, model.state_dict(), quantization=dequantize_base_checkpoint).keys()
-        )
+    # hold a list copy of the model state dict keys before any parallelization. To be used during checkpoint saving in safetensors format.
+    pre_shard_hf_state_dict_keys = list(
+        _maybe_adapt_state_dict_to_hf(model, model.state_dict(), quantization=dequantize_base_checkpoint).keys()
+    )

     # Loss function check
     if not _supports_logits_to_keep(model) and not isinstance(loss_fn, MaskedCrossEntropy):
@@ -613,24 +629,26 @@ def apply_model_infrastructure(
     # Note: AutoPipeline takes care of applying PP + EP + FSDP. _shard_ep_fsdp will take care of applying EP + FSDP if no PP.
     if autopipeline is not None:
         model = _shard_pp(autopipeline, model, loss_fn, parallelize_fn)
+        for part in model.parts:
+            setattr(part, "_pre_shard_hf_state_dict_keys", pre_shard_hf_state_dict_keys)
     else:
         model = _shard_ep_fsdp(model, model_wrapper, parallelize_fn)
         if compile_config is not None:
             model = compile_model(model, compile_config)
+        if isinstance(model_wrapper, DDPManager):
+            setattr(model.module, "_pre_shard_hf_state_dict_keys", pre_shard_hf_state_dict_keys)
+        else:
+            setattr(model, "_pre_shard_hf_state_dict_keys", pre_shard_hf_state_dict_keys)

     # Load the checkpoint if needed and return
     # Weights need to be loaded for meta device models that were parallelized:
     # 1. When parallelize_fn was used (which will internally apply FSDP2/EP sharding)
     # 2. When FSDP2Manager.parallelize was used (but not MegatronFSDP which handles weights internally)
-    should_load_checkpoint = (
-        is_meta_device
-        and checkpointer is not None
-        and any(
-            [
-                parallelize_fn is not None and get_world_size_safe() > 1,
-                callable(getattr(model_wrapper, "parallelize", None)),
-            ]
-        )
+    should_load_checkpoint = is_meta_device and any(
+        [
+            parallelize_fn is not None and get_world_size_safe() > 1,
+            callable(getattr(model_wrapper, "parallelize", None)),
+        ]
     )
     if should_load_checkpoint:
         models_to_load = model.parts if hasattr(model, "parts") else [model]
@@ -640,7 +658,7 @@ def apply_model_infrastructure(
             mp,
             device,
             cache_dir,
-            model_name_or_path,
+            pretrained_model_name_or_path,
             lora_a_init,
             load_base_model=load_base_model,
         )
@@ -778,7 +796,6 @@ def from_pretrained(
         model_wrapper=None,
         autopipeline: AutoPipeline | None = None,
         parallelize_fn: Callable | None = None,
-        checkpointer: Optional[Checkpointer] = None,
         peft_config: Optional[dict] = None,
         fp8_config: Optional["FP8Config"] = None,
         qat_quantizer: Optional[Union["Int4WeightOnlyQATQuantizer", "Int8DynActInt4WeightQATQuantizer"]] = None,
@@ -824,9 +841,6 @@ def from_pretrained(
                 pipeline stages. Default: None.
             parallelize_fn (Callable | None, optional): Custom function to apply
                 parallelization (EP + FSDP2). Default: None.
-            checkpointer (Checkpointer, optional): Checkpointer instance for loading weights
-                and enabling save_pretrained() functionality. Required for weight loading
-                and checkpoint management.
             peft_config (dict | None, optional): PEFT/LoRA configuration dictionary.
                 If provided, LoRA adapters will be applied to the model. Default: None.
             fp8_config (FP8Config | None, optional): FP8 quantization configuration.
@@ -882,7 +896,6 @@ def _retry(**override):
                 fp8_config=fp8_config,
                 qat_quantizer=qat_quantizer,
                 loss_fn=loss_fn,
-                checkpointer=checkpointer,
                 compile_config=compile_config,
                 model_wrapper=model_wrapper,
                 **kwargs,
@@ -899,11 +912,10 @@ def _retry(**override):
        device = torch.cuda.current_device()

        # Neither of these parallelization methods support meta device initialization
-        # Also require checkpointer for meta device init, as we need it to load weights
        is_meta_device = (
            not isinstance(model_wrapper, (MegatronFSDPManager, DDPManager))
            and not force_hf
-            and checkpointer is not None
+            and get_world_size_safe() > 1
        )
        init_ctx = ContextManagers([no_init_weights(), init_empty_weights()]) if is_meta_device else nullcontext()

@@ -948,10 +960,10 @@ def _retry(**override):

        model = apply_model_infrastructure(
            model=model,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
            is_hf_model=is_hf_model,
            cp_size=cp_size,
            tp_size=tp_size,
-            checkpointer=checkpointer,
            peft_config=peft_config,
            quantization_config=quantization_config,
            fp8_config=fp8_config,
@@ -963,7 +975,6 @@ def _retry(**override):
            is_meta_device=is_meta_device,
            device=device,
            compile_config=compile_config,
-            model_name_or_path=pretrained_model_name_or_path,
            load_base_model=True,
            cache_dir=kwargs.get("cache_dir", TRANSFORMERS_CACHE),
        )
@@ -990,7 +1001,6 @@ def from_config(
        qat_quantizer: Optional[Union["Int4WeightOnlyQATQuantizer", "Int8DynActInt4WeightQATQuantizer"]] = None,
        loss_fn: Optional[Callable] = None,
        compile_config: Optional["CompileConfig"] = None,
-        checkpointer: Optional[Checkpointer] = None,
        **kwargs,
    ) -> PreTrainedModel:
        """
@@ -1051,9 +1061,6 @@ def from_config(
                it will be replaced with MaskedCrossEntropy. This is passed to AutoPipeline. Default: None.
            compile_config (CompileConfig | None, optional): Configuration for torch.compile.
                If provided, the model will be compiled for improved performance. Default: None.
-            checkpointer (Checkpointer, optional): Checkpointer instance for checkpoint
-                management and enabling save_pretrained() functionality. Required for
-                proper checkpoint handling.
            **kwargs:
                Additional keyword arguments. Notable ones include:
                - tp_size (int): Tensor parallelism size. Default: 1.
@@ -1096,7 +1103,6 @@ def _retry(**override):
                qat_quantizer=qat_quantizer,
                loss_fn=loss_fn,
                compile_config=compile_config,
-                checkpointer=checkpointer,
                **kwargs,
            )

@@ -1117,11 +1123,10 @@ def _retry(**override):
        device = torch.cuda.current_device()

        # Neither of these parallelization methods support meta device initialization
-        # Also require checkpointer for meta device init, as we need it to load weights
        is_meta_device = (
            not isinstance(model_wrapper, (MegatronFSDPManager, DDPManager))
            and not force_hf
-            and checkpointer is not None
+            and get_world_size_safe() > 1
        )
        init_ctx = ContextManagers([no_init_weights(), init_empty_weights()]) if is_meta_device else nullcontext()

@@ -1162,7 +1167,6 @@ def _retry(**override):
            is_hf_model=is_hf_model,
            cp_size=cp_size,
            tp_size=tp_size,
-            checkpointer=checkpointer,
            peft_config=peft_config,
            quantization_config=quantization_config,
            fp8_config=fp8_config,
@@ -1174,7 +1178,7 @@ def _retry(**override):
            is_meta_device=is_meta_device,
            device=device,
            compile_config=compile_config,
-            model_name_or_path=getattr(config, "name_or_path"),
+            pretrained_model_name_or_path=getattr(config, "name_or_path"),
            load_base_model=False,
            cache_dir=kwargs.get("cache_dir", TRANSFORMERS_CACHE),
        )
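
The pre-shard key bookkeeping added above can be illustrated with a small, self-contained toy using plain PyTorch (none of the repository's helpers are used, and the sharding/compile wrappers are assumed away): keys are captured from the unsharded module, attached as _pre_shard_hf_state_dict_keys, and later used to drop keys that should not appear in the consolidated index.

# Toy illustration of the _pre_shard_hf_state_dict_keys mechanism added above.
import torch.nn as nn

model = nn.Linear(4, 4)
pre_shard_hf_state_dict_keys = list(model.state_dict().keys())  # ["weight", "bias"]
setattr(model, "_pre_shard_hf_state_dict_keys", pre_shard_hf_state_dict_keys)

# Later, when building a consolidated index, anything not in the captured key list
# (e.g. a non-persistent buffer name) is scheduled for removal:
index_keys = {"weight", "bias", "rotary_emb.inv_freq"}  # hypothetical index contents
keys_to_remove = index_keys - set(model._pre_shard_hf_state_dict_keys)
print(sorted(keys_to_remove))  # ['rotary_emb.inv_freq']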

nemo_automodel/components/checkpoint/checkpointing.py

Lines changed: 8 additions & 2 deletions
@@ -87,7 +87,9 @@ class CheckpointingConfig:
     model_repo_id: str
     save_consolidated: bool
     is_peft: bool
-    model_state_dict_keys: list[str] = None  # copy of the model state dict keys before any parallelization
+    model_state_dict_keys: list[str] = (
+        None  # copy of the model state dict keys before any parallelization. Kept for BW compatibility.
+    )
     is_async: bool = False
     dequantize_base_checkpoint: bool | None = None
     original_model_root_dir: str | None = None
@@ -587,7 +589,11 @@ def _maybe_build_consolidated_index(
        # some HF models like Moonlight-16B have non-persistent buffers in the base checkpoint
        # however, HF initializes buffers with persistent=False, so we need to make sure these
        # buffer keys are not saved during checkpointing
-        keys_to_remove = list(set(fqn_to_file_index_mapping.keys()) - set(self.config.model_state_dict_keys))
+        # The `_pre_shard_hf_state_dict_keys` attribute is set in the `apply_model_infrastructure` in auto_model.py
+        pre_shard_hf_state_dict_keys = (
+            getattr(model, "_pre_shard_hf_state_dict_keys", None) or self.config.model_state_dict_keys
+        )
+        keys_to_remove = list(set(fqn_to_file_index_mapping.keys()) - set(pre_shard_hf_state_dict_keys))
        if model_state.is_tied_lm_head:
            keys_to_remove.append(model_state.lm_head_param_name)
        for key in keys_to_remove:
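
The backward-compatible fallback in the hunk above can be sketched in isolation (toy stand-ins for the config and model objects, not the repository's classes): the per-model attribute wins when present, otherwise the legacy config field is used.

# Toy sketch of the fallback order introduced above (stand-in objects, not the real classes).
class _Cfg:
    model_state_dict_keys = ["weight", "bias"]  # legacy copy kept for BW compatibility

class _ModelWithoutAttr:
    pass

class _ModelWithAttr:
    _pre_shard_hf_state_dict_keys = ["weight"]

cfg = _Cfg()
for m in (_ModelWithAttr(), _ModelWithoutAttr()):
    keys = getattr(m, "_pre_shard_hf_state_dict_keys", None) or cfg.model_state_dict_keys
    print(type(m).__name__, keys)
# _ModelWithAttr ['weight']              <- attribute set during apply_model_infrastructure wins
# _ModelWithoutAttr ['weight', 'bias']   <- falls back to the legacy config field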

nemo_automodel/recipes/llm/kd.py

Lines changed: 0 additions & 29 deletions
@@ -43,7 +43,6 @@
 import torch
 import wandb
 from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp
-from transformers.utils import TRANSFORMERS_CACHE

 from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
@@ -77,10 +76,6 @@ def _build_teacher_model(
     cp_size=1,
     parallelize_fn=None,
     device=None,
-    dp_rank=0,
-    tp_rank=0,
-    pp_rank=0,
-    moe_mesh=None,
 ):
     """Build and initialize the teacher model for knowledge distillation.

@@ -104,37 +99,17 @@ def _build_teacher_model(
     The `offload_teacher_model` config option is not supported with this approach.
     Device placement is handled internally by NeMoAutoModelForCausalLM infrastructure.
     """
-    from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig

     assert cfg_teacher is not None, "`teacher_model` section missing from YAML config"
     logger.info("Instantiating teacher model")

-    # Create a simple checkpointer for the teacher (just for weight loading)
-    teacher_checkpointer = Checkpointer(
-        CheckpointingConfig(
-            model_repo_id=cfg_teacher.get("pretrained_model_name_or_path"),
-            model_cache_dir=cfg_teacher.get("cache_dir", TRANSFORMERS_CACHE),
-            # Dummy values
-            is_peft=False,
-            enabled=False,
-            checkpoint_dir="",
-            model_save_format="safetensors",
-            save_consolidated=False,
-        ),
-        dp_rank=dp_rank,
-        tp_rank=tp_rank,
-        pp_rank=pp_rank,
-        moe_mesh=moe_mesh,
-    )
-
     # Build teacher model using the same infrastructure as student
     # but without PEFT/FP8/QAT (teacher should be frozen in full precision)
     with ScopedRNG(seed=seed, ranked=True):
         kwargs: Dict[str, Any] = {
             "tp_size": tp_size,
             "cp_size": cp_size,
             "has_packed_sequence": has_packed_sequence,
-            "checkpointer": teacher_checkpointer,
             "model_wrapper": model_wrapper,
             "parallelize_fn": parallelize_fn,
         }
@@ -196,10 +171,6 @@ def setup(self):  # noqa: C901 – same complexity as parent
            cp_size=self.cfg.get("distributed.cp_size", 1),
            parallelize_fn=getattr(self.cfg.get("parallelizer", None), "instantiate", None),
            device=teacher_device,
-            dp_rank=self._get_dp_rank(include_cp=True),
-            tp_rank=self._get_tp_rank(),
-            pp_rank=self._get_pp_rank(),
-            moe_mesh=self.moe_mesh,
        )
        logger.info("Teacher Model: " + str(self.teacher_model))
        # KD

nemo_automodel/recipes/llm/train_ft.py

Lines changed: 1 addition & 4 deletions
@@ -133,7 +133,6 @@ def build_model_and_optimizer(
     cfg_peft,
     model_wrapper,
     seed,
-    checkpointer: Checkpointer,
     has_packed_sequence=False,
     tp_size=1,
     cp_size=1,
@@ -174,7 +173,6 @@ def build_model_and_optimizer(
        "has_packed_sequence": has_packed_sequence,
        "autopipeline": autopipeline,
        "parallelize_fn": parallelize_fn,
-        "checkpointer": checkpointer,
        "peft_config": cfg_peft,
        "model_wrapper": model_wrapper,
        "loss_fn": loss_fn,
@@ -214,7 +212,7 @@ def build_model_and_optimizer(
        is_hf_model=False,
        is_meta_device=False,
        device=torch.cuda.current_device(),
-        model_name_or_path=None,
+        pretrained_model_name_or_path=None,
        load_base_model=False,
        cache_dir=TRANSFORMERS_CACHE,
        **kwargs,
@@ -923,7 +921,6 @@ def setup(self):
            autopipeline=autopipeline,
            loss_fn=self.loss_fn,
            parallelize_fn=parallelize_fn,
-            checkpointer=self.checkpointer,
        )

        if isinstance(model, AutoPipeline):

nemo_automodel/recipes/llm/train_seq_cls.py

Lines changed: 0 additions & 1 deletion
@@ -118,7 +118,6 @@ def setup(self):
            autopipeline=None,
            loss_fn=self.loss_fn,
            parallelize_fn=None,
-            checkpointer=self.checkpointer,
            unfreeze_modules=["classifier"] if self.peft_config is not None else None,
        )
