Skip to content

Commit 3ee4832

Browse files
Fix params_dtype for distillation and GPT HF Exporter head_dim for pruning (#12792) (#13002)
* Fix GPT HF Exporter dtype and head_dim
* Fix params_dtype
* Apply isort and black reformatting
---------
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Signed-off-by: kevalmorabia97 <kevalmorabia97@users.noreply.github.com>
Co-authored-by: kevalmorabia97 <kevalmorabia97@users.noreply.github.com>
1 parent 2ab0aeb commit 3ee4832

File tree

7 files changed

+36
-7
lines changed

7 files changed

+36
-7
lines changed

nemo/collections/llm/gpt/model/gemma.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -284,6 +284,11 @@ def config(self) -> "GemmaConfig":
284284
hidden_size=source.hidden_size,
285285
intermediate_size=source.ffn_hidden_size,
286286
num_attention_heads=source.num_attention_heads,
287+
head_dim=(
288+
source.kv_channels
289+
if source.kv_channels is not None
290+
else source.hidden_size // source.num_attention_heads
291+
),
287292
max_position_embeddings=source.seq_length,
288293
initializer_range=source.init_method_std,
289294
rms_norm_eps=source.layernorm_epsilon,

nemo/collections/llm/gpt/model/gemma2.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -366,6 +366,11 @@ def config(self) -> "Gemma2Config":
366366
hidden_size=source.hidden_size,
367367
intermediate_size=source.ffn_hidden_size,
368368
num_attention_heads=source.num_attention_heads,
369+
head_dim=(
370+
source.kv_channels
371+
if source.kv_channels is not None
372+
else source.hidden_size // source.num_attention_heads
373+
),
369374
max_position_embeddings=source.seq_length,
370375
initializer_range=source.init_method_std,
371376
rms_norm_eps=source.layernorm_epsilon,

nemo/collections/llm/gpt/model/llama.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -704,6 +704,11 @@ def config(self) -> "HFLlamaConfig":
704704
hidden_size=source.hidden_size,
705705
intermediate_size=source.ffn_hidden_size,
706706
num_attention_heads=source.num_attention_heads,
707+
head_dim=(
708+
source.kv_channels
709+
if source.kv_channels is not None
710+
else source.hidden_size // source.num_attention_heads
711+
),
707712
max_position_embeddings=source.seq_length,
708713
initializer_range=source.init_method_std,
709714
rms_norm_eps=source.layernorm_epsilon,

nemo/collections/llm/gpt/model/qwen2.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -378,6 +378,11 @@ def config(self) -> "HFQwen2Config":
378378
hidden_size=source.hidden_size,
379379
intermediate_size=source.ffn_hidden_size,
380380
num_attention_heads=source.num_attention_heads,
381+
head_dim=(
382+
source.kv_channels
383+
if source.kv_channels is not None
384+
else source.hidden_size // source.num_attention_heads
385+
),
381386
max_position_embeddings=source.seq_length,
382387
initializer_range=source.init_method_std,
383388
rms_norm_eps=source.layernorm_epsilon,

nemo/collections/llm/modelopt/model_utils.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,9 @@ def setup_trainer_and_restore_model_with_modelopt_spec(
134134
num_nodes=num_nodes,
135135
accelerator="gpu",
136136
strategy=strategy,
137-
plugins=nl.MegatronMixedPrecision(precision="bf16", params_dtype=torch.bfloat16, autocast_enabled=True),
137+
plugins=nl.MegatronMixedPrecision(
138+
precision="bf16-mixed", params_dtype=torch.bfloat16, autocast_enabled=False, grad_reduce_in_fp32=True
139+
),
138140
**trainer_kwargs,
139141
)
140142

nemo/lightning/pytorch/strategies/megatron_strategy.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -902,11 +902,12 @@ def save_checkpoint(
902902
ckpt_io = self.checkpoint_io
903903
if isinstance(ckpt_io, _WrappingCheckpointIO):
904904
ckpt_io = ckpt_io.checkpoint_io
905-
mto.plugins.save_sharded_modelopt_state(
906-
[core_model],
907-
ckpt_to_weights_subdir(filepath, is_saving=True),
908-
sharded_strategy=ckpt_io.save_sharded_strategy,
909-
)
905+
with core_model.hide_teacher_model() if hasattr(core_model, "hide_teacher_model") else nullcontext():
906+
mto.plugins.save_sharded_modelopt_state(
907+
[core_model],
908+
ckpt_to_weights_subdir(filepath, is_saving=True),
909+
sharded_strategy=ckpt_io.save_sharded_strategy,
910+
)
910911
logging.info("Saved Model-Optimizer state into checkpoint.")
911912

912913
def should_restore_optimizer_states(self, selective_restore: bool = False) -> bool:

scripts/llm/gpt_distillation.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
import os
1616
from argparse import ArgumentParser
1717

18+
import torch
1819
from lightning.pytorch.loggers import TensorBoardLogger
1920
from megatron.core.dist_checkpointing.validation import StrictHandling
2021
from megatron.core.optimizer import OptimizerConfig
@@ -82,7 +83,12 @@ def get_args():
8283
limit_val_batches=args.limit_val_batches,
8384
strategy=strategy,
8485
accelerator="gpu",
85-
plugins=nl.MegatronMixedPrecision(precision=args.precision),
86+
plugins=nl.MegatronMixedPrecision(
87+
precision=args.precision,
88+
params_dtype=torch.bfloat16 if "bf16" in args.precision else torch.float32,
89+
autocast_enabled=False,
90+
grad_reduce_in_fp32=True,
91+
),
8692
)
8793

8894
# Set up dataset

0 commit comments

Comments (0)