refactor

adil-a · adil-a · commit a3589a995daf · 2026-02-01T04:54:58.000Z
Signed-off-by: adil-a <adil.asif2000@hotmail.com>
diff --git a/nemo_automodel/_transformers/auto_model.py b/nemo_automodel/_transformers/auto_model.py
@@ -46,6 +46,7 @@
 from nemo_automodel.components._peft.lora import apply_lora_to_linear_modules
 from nemo_automodel.components.checkpoint.checkpointing import (
     Checkpointer,
+    CheckpointingConfig,
     _maybe_adapt_state_dict_to_hf,
 )
 from nemo_automodel.components.distributed.ddp import DDPManager
@@ -524,7 +525,6 @@ def apply_model_infrastructure(
     is_hf_model,
     is_meta_device,
     device,
-    checkpointer,
     model_wrapper=None,
     tp_size=1,
     cp_size=1,
@@ -539,6 +539,7 @@ def apply_model_infrastructure(
     model_name_or_path=None,
     load_base_model=False,
     cache_dir=None,
+    pretrained_model_name_or_path="",
     **_kwargs,
 ):
     """Apply sharding, PEFT, quantization, and checkpoint loading to a model.
@@ -558,7 +559,6 @@ def apply_model_infrastructure(
         is_hf_model: Whether this is an HF model (vs custom implementation)
         is_meta_device: Whether model was initialized on meta device
         device: Target device for model
-        checkpointer: Checkpointer instance for weight loading
         model_wrapper: Model wrapper (FSDP2Manager, DDPManager, etc.). Default: None
         tp_size: Tensor parallelism size. Default: 1
         cp_size: Context parallelism size. Default: 1
@@ -580,6 +580,24 @@ def apply_model_infrastructure(
     """
     _verify_sdpa_support(model, is_hf_model, cp_size)
 
+    # Create a dummy checkpointer. We can pass in dummy values here since we are only loading the base weights.
+    ckpt_config = CheckpointingConfig(
+        enabled=True,
+        checkpoint_dir="",
+        model_save_format="safetensors",
+        model_cache_dir=cache_dir,
+        model_repo_id=pretrained_model_name_or_path,
+        save_consolidated=True,
+        is_peft=peft_config is not None,
+    )
+    checkpointer = Checkpointer(
+        ckpt_config,
+        0,
+        0,
+        0,
+        getattr(model_wrapper, "moe_mesh", None) if model_wrapper else None,
+    )
+
     # Handle checkpointer config updates if checkpointer is provided
     dequantize_base_checkpoint = False
     if checkpointer is not None:
@@ -600,10 +618,9 @@ def apply_model_infrastructure(
         )
 
     # hold a list copy of the model state dict keys before any parallelization
-    if checkpointer is not None:
-        checkpointer.config.model_state_dict_keys = list(
-            _maybe_adapt_state_dict_to_hf(model, model.state_dict(), quantization=dequantize_base_checkpoint).keys()
-        )
+    checkpointer.config.model_state_dict_keys = list(
+        _maybe_adapt_state_dict_to_hf(model, model.state_dict(), quantization=dequantize_base_checkpoint).keys()
+    )
 
     # Loss function check
     if not _supports_logits_to_keep(model) and not isinstance(loss_fn, MaskedCrossEntropy):
@@ -622,15 +639,11 @@ def apply_model_infrastructure(
     # Weights need to be loaded for meta device models that were parallelized:
     # 1. When parallelize_fn was used (which will internally apply FSDP2/EP sharding)
     # 2. When FSDP2Manager.parallelize was used (but not MegatronFSDP which handles weights internally)
-    should_load_checkpoint = (
-        is_meta_device
-        and checkpointer is not None
-        and any(
-            [
-                parallelize_fn is not None and get_world_size_safe() > 1,
-                callable(getattr(model_wrapper, "parallelize", None)),
-            ]
-        )
+    should_load_checkpoint = is_meta_device and any(
+        [
+            parallelize_fn is not None and get_world_size_safe() > 1,
+            callable(getattr(model_wrapper, "parallelize", None)),
+        ]
     )
     if should_load_checkpoint:
         models_to_load = model.parts if hasattr(model, "parts") else [model]
@@ -778,7 +791,6 @@ def from_pretrained(
         model_wrapper=None,
         autopipeline: AutoPipeline | None = None,
         parallelize_fn: Callable | None = None,
-        checkpointer: Optional[Checkpointer] = None,
         peft_config: Optional[dict] = None,
         fp8_config: Optional["FP8Config"] = None,
         qat_quantizer: Optional[Union["Int4WeightOnlyQATQuantizer", "Int8DynActInt4WeightQATQuantizer"]] = None,
@@ -824,9 +836,6 @@ def from_pretrained(
                 pipeline stages. Default: None.
             parallelize_fn (Callable | None, optional): Custom function to apply
                 parallelization (EP + FSDP2). Default: None.
-            checkpointer (Checkpointer, optional): Checkpointer instance for loading weights
-                and enabling save_pretrained() functionality. Required for weight loading
-                and checkpoint management.
             peft_config (dict | None, optional): PEFT/LoRA configuration dictionary.
                 If provided, LoRA adapters will be applied to the model. Default: None.
             fp8_config (FP8Config | None, optional): FP8 quantization configuration.
@@ -882,7 +891,6 @@ def _retry(**override):
                 fp8_config=fp8_config,
                 qat_quantizer=qat_quantizer,
                 loss_fn=loss_fn,
-                checkpointer=checkpointer,
                 compile_config=compile_config,
                 model_wrapper=model_wrapper,
                 **kwargs,
@@ -899,12 +907,7 @@ def _retry(**override):
         device = torch.cuda.current_device()
 
         # Neither of these parallelization methods support meta device initialization
-        # Also require checkpointer for meta device init, as we need it to load weights
-        is_meta_device = (
-            not isinstance(model_wrapper, (MegatronFSDPManager, DDPManager))
-            and not force_hf
-            and checkpointer is not None
-        )
+        is_meta_device = not isinstance(model_wrapper, (MegatronFSDPManager, DDPManager)) and not force_hf
         init_ctx = ContextManagers([no_init_weights(), init_empty_weights()]) if is_meta_device else nullcontext()
 
         try:
@@ -948,10 +951,10 @@ def _retry(**override):
 
         model = apply_model_infrastructure(
             model=model,
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
             is_hf_model=is_hf_model,
             cp_size=cp_size,
             tp_size=tp_size,
-            checkpointer=checkpointer,
             peft_config=peft_config,
             quantization_config=quantization_config,
             fp8_config=fp8_config,
@@ -990,7 +993,6 @@ def from_config(
         qat_quantizer: Optional[Union["Int4WeightOnlyQATQuantizer", "Int8DynActInt4WeightQATQuantizer"]] = None,
         loss_fn: Optional[Callable] = None,
         compile_config: Optional["CompileConfig"] = None,
-        checkpointer: Optional[Checkpointer] = None,
         **kwargs,
     ) -> PreTrainedModel:
         """
@@ -1051,9 +1053,6 @@ def from_config(
                 it will be replaced with MaskedCrossEntropy. This is passed to AutoPipeline. Default: None.
             compile_config (CompileConfig | None, optional): Configuration for torch.compile.
                 If provided, the model will be compiled for improved performance. Default: None.
-            checkpointer (Checkpointer, optional): Checkpointer instance for checkpoint
-                management and enabling save_pretrained() functionality. Required for
-                proper checkpoint handling.
             **kwargs:
                 Additional keyword arguments. Notable ones include:
                 - tp_size (int): Tensor parallelism size. Default: 1.
@@ -1096,7 +1095,6 @@ def _retry(**override):
                 qat_quantizer=qat_quantizer,
                 loss_fn=loss_fn,
                 compile_config=compile_config,
-                checkpointer=checkpointer,
                 **kwargs,
             )
 
@@ -1117,12 +1115,7 @@ def _retry(**override):
         device = torch.cuda.current_device()
 
         # Neither of these parallelization methods support meta device initialization
-        # Also require checkpointer for meta device init, as we need it to load weights
-        is_meta_device = (
-            not isinstance(model_wrapper, (MegatronFSDPManager, DDPManager))
-            and not force_hf
-            and checkpointer is not None
-        )
+        is_meta_device = not isinstance(model_wrapper, (MegatronFSDPManager, DDPManager)) and not force_hf
         init_ctx = ContextManagers([no_init_weights(), init_empty_weights()]) if is_meta_device else nullcontext()
 
         try:
@@ -1162,7 +1155,6 @@ def _retry(**override):
             is_hf_model=is_hf_model,
             cp_size=cp_size,
             tp_size=tp_size,
-            checkpointer=checkpointer,
             peft_config=peft_config,
             quantization_config=quantization_config,
             fp8_config=fp8_config,
diff --git a/nemo_automodel/recipes/llm/kd.py b/nemo_automodel/recipes/llm/kd.py
@@ -43,7 +43,6 @@
 import torch
 import wandb
 from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp
-from transformers.utils import TRANSFORMERS_CACHE
 
 from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
@@ -77,10 +76,6 @@ def _build_teacher_model(
     cp_size=1,
     parallelize_fn=None,
     device=None,
-    dp_rank=0,
-    tp_rank=0,
-    pp_rank=0,
-    moe_mesh=None,
 ):
     """Build and initialize the teacher model for knowledge distillation.
 
@@ -104,37 +99,17 @@ def _build_teacher_model(
         The `offload_teacher_model` config option is not supported with this approach.
         Device placement is handled internally by NeMoAutoModelForCausalLM infrastructure.
     """
-    from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig
 
     assert cfg_teacher is not None, "`teacher_model` section missing from YAML config"
     logger.info("Instantiating teacher model")
 
-    # Create a simple checkpointer for the teacher (just for weight loading)
-    teacher_checkpointer = Checkpointer(
-        CheckpointingConfig(
-            model_repo_id=cfg_teacher.get("pretrained_model_name_or_path"),
-            model_cache_dir=cfg_teacher.get("cache_dir", TRANSFORMERS_CACHE),
-            # Dummy values
-            is_peft=False,
-            enabled=False,
-            checkpoint_dir="",
-            model_save_format="safetensors",
-            save_consolidated=False,
-        ),
-        dp_rank=dp_rank,
-        tp_rank=tp_rank,
-        pp_rank=pp_rank,
-        moe_mesh=moe_mesh,
-    )
-
     # Build teacher model using the same infrastructure as student
     # but without PEFT/FP8/QAT (teacher should be frozen in full precision)
     with ScopedRNG(seed=seed, ranked=True):
         kwargs: Dict[str, Any] = {
             "tp_size": tp_size,
             "cp_size": cp_size,
             "has_packed_sequence": has_packed_sequence,
-            "checkpointer": teacher_checkpointer,
             "model_wrapper": model_wrapper,
             "parallelize_fn": parallelize_fn,
         }
@@ -196,10 +171,6 @@ def setup(self):  # noqa: C901 – same complexity as parent
             cp_size=self.cfg.get("distributed.cp_size", 1),
             parallelize_fn=getattr(self.cfg.get("parallelizer", None), "instantiate", None),
             device=teacher_device,
-            dp_rank=self._get_dp_rank(include_cp=True),
-            tp_rank=self._get_tp_rank(),
-            pp_rank=self._get_pp_rank(),
-            moe_mesh=self.moe_mesh,
         )
         logger.info("Teacher Model: " + str(self.teacher_model))
         # KD
diff --git a/nemo_automodel/recipes/llm/train_ft.py b/nemo_automodel/recipes/llm/train_ft.py
@@ -133,7 +133,6 @@ def build_model_and_optimizer(
     cfg_peft,
     model_wrapper,
     seed,
-    checkpointer: Checkpointer,
     has_packed_sequence=False,
     tp_size=1,
     cp_size=1,
@@ -174,7 +173,6 @@ def build_model_and_optimizer(
             "has_packed_sequence": has_packed_sequence,
             "autopipeline": autopipeline,
             "parallelize_fn": parallelize_fn,
-            "checkpointer": checkpointer,
             "peft_config": cfg_peft,
             "model_wrapper": model_wrapper,
             "loss_fn": loss_fn,
@@ -923,7 +921,6 @@ def setup(self):
             autopipeline=autopipeline,
             loss_fn=self.loss_fn,
             parallelize_fn=parallelize_fn,
-            checkpointer=self.checkpointer,
         )
 
         if isinstance(model, AutoPipeline):
diff --git a/nemo_automodel/recipes/llm/train_seq_cls.py b/nemo_automodel/recipes/llm/train_seq_cls.py
@@ -118,7 +118,6 @@ def setup(self):
             autopipeline=None,
             loss_fn=self.loss_fn,
             parallelize_fn=None,
-            checkpointer=self.checkpointer,
             unfreeze_modules=["classifier"] if self.peft_config is not None else None,
         )
 
diff --git a/nemo_automodel/recipes/vlm/finetune.py b/nemo_automodel/recipes/vlm/finetune.py
@@ -123,7 +123,6 @@ def build_model_and_optimizer(
     cfg_peft,
     model_wrapper,
     seed,
-    checkpointer: Checkpointer,
     tp_size=1,
     cp_size=1,
     freeze_embeddings=True,
@@ -144,7 +143,6 @@ def build_model_and_optimizer(
             "tp_size": tp_size,
             "cp_size": cp_size,
             "parallelize_fn": parallelize_fn,
-            "checkpointer": checkpointer,
             "peft_config": cfg_peft,
             "model_wrapper": model_wrapper,
             "loss_fn": loss_fn,
@@ -653,7 +651,6 @@ def setup(self):
             cfg_compile=self.cfg.get("compile", None),
             loss_fn=self.loss_fn,
             parallelize_fn=parallelize_fn,
-            checkpointer=self.checkpointer,
             autopipeline=autopipeline,
         )
 
diff --git a/tests/unit_tests/recipes/test_finetune_vlm_helpers.py b/tests/unit_tests/recipes/test_finetune_vlm_helpers.py
diff --git a/tests/unit_tests/recipes/test_train_ft.py b/tests/unit_tests/recipes/test_train_ft.py

Original file line number	Diff line number	Diff line change
`@@ -118,7 +118,6 @@ def setup(self):`
`118`	`118`	`autopipeline=None,`
`119`	`119`	`loss_fn=self.loss_fn,`
`120`	`120`	`parallelize_fn=None,`
`121`		`- checkpointer=self.checkpointer,`
`122`	`121`	`unfreeze_modules=["classifier"] if self.peft_config is not None else None,`
`123`	`122`	`)`
`124`	`123`