
Commit a426896

Authored by vadam5, ashors1, root, and terrykong
feat: Megatron LoRA GRPO w/ Weight Merging (#1889)
Signed-off-by: Anna Shors <ashors@nvidia.com>
Signed-off-by: Virginia Wu <vadams@nvidia.com>
Signed-off-by: Virginia Wu <78445382+vadam5@users.noreply.github.com>
Signed-off-by: Terry Kong <terryk@nvidia.com>
Co-authored-by: Anna Shors <ashors@nvidia.com>
Co-authored-by: root <root@pool0-00689.cm.cluster>
Co-authored-by: Terry Kong <terryk@nvidia.com>
1 parent: d62702f · commit: a426896

16 files changed: 427 additions & 23 deletions

examples/configs/distillation_math_megatron.yaml

Lines changed: 13 additions & 1 deletion

@@ -62,7 +62,19 @@ policy: &POLICY_BASE
     moe_enable_deepep: false
     moe_token_dispatcher_type: "allgather"
     moe_shared_expert_overlap: false
-
+    peft:
+      enabled: false
+      target_modules: []
+      exclude_modules: []
+      dim: 8
+      alpha: 32
+      dropout: 0.0
+      dropout_position: "post"
+      lora_A_init_method: "xavier"
+      lora_B_init_method: "zero"
+      a2a_experimental: false
+      lora_dtype: null
+
   optimizer:
     optimizer: "adam"
     lr: 2.00001e-5
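
The peft block above is the full set of LoRA knobs this PR threads through the Megatron configs. As a minimal sketch of how dim, alpha, and the two init methods combine, assuming the standard LoRA formulation these keys mirror (shapes are illustrative, nothing here is taken from the repo):

import torch

# Standard LoRA update (an assumption about these knobs, not code from this
# PR): the frozen weight W gains a low-rank delta scaled by alpha / dim.
d_out, d_in, dim, alpha = 2048, 2048, 8, 32

W = torch.randn(d_out, d_in)           # frozen base weight
lora_A = torch.empty(dim, d_in)
torch.nn.init.xavier_normal_(lora_A)   # lora_A_init_method: "xavier"
lora_B = torch.zeros(d_out, dim)       # lora_B_init_method: "zero"

def forward(x: torch.Tensor) -> torch.Tensor:
    # y = x W^T + (alpha / dim) * x A^T B^T
    return x @ W.T + (alpha / dim) * (x @ lora_A.T) @ lora_B.T

x = torch.randn(4, d_in)
assert torch.allclose(forward(x), x @ W.T)  # zero-init B: a no-op at step 0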

examples/configs/grpo_math_1B.yaml

Lines changed: 13 additions & 0 deletions

@@ -147,6 +147,19 @@ policy:
     moe_token_dispatcher_type: "allgather"
     moe_shared_expert_overlap: false

+    peft:
+      enabled: false
+      target_modules: []
+      exclude_modules: []
+      dim: 8
+      alpha: 32
+      dropout: 0.0
+      dropout_position: "post"
+      lora_A_init_method: "xavier"
+      lora_B_init_method: "zero"
+      a2a_experimental: false
+      lora_dtype: null
+
   optimizer:
     optimizer: "adam"
     lr: 5.0e-6
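
One spelling subtlety in this block: YAML only recognizes null (or ~, or an empty value) as a true null, while a bare None parses as the string "None". A quick PyYAML sketch of the difference, which is why lora_dtype uses null here and in the sibling configs:

import yaml

# YAML has no bare `None`: it parses as the plain string "None", while
# `null` (or `~`, or an empty value) parses to Python's None.
print(yaml.safe_load("lora_dtype: None"))  # {'lora_dtype': 'None'}
print(yaml.safe_load("lora_dtype: null"))  # {'lora_dtype': None}
print(yaml.safe_load("lora_dtype:"))       # {'lora_dtype': None}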

examples/configs/grpo_math_1B_megatron.yaml

Lines changed: 13 additions & 0 deletions

@@ -100,6 +100,19 @@ policy:
     moe_shared_expert_overlap: false
     # gives ~20% training perf speedup with sequence packing
     apply_rope_fusion: True
+
+    peft:
+      enabled: false
+      target_modules: []
+      exclude_modules: []
+      dim: 8
+      alpha: 32
+      dropout: 0.0
+      dropout_position: "post"
+      lora_A_init_method: "xavier"
+      lora_B_init_method: "zero"
+      a2a_experimental: false
+      lora_dtype: null

   optimizer:
     optimizer: "adam"
Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+defaults: ../../grpo_math_1B.yaml
+grpo:
+  num_prompts_per_step: 2
+  num_generations_per_prompt: 8
+checkpointing:
+  checkpoint_dir: results/grpo-nanov3-30BA3B-2n8g-megatron-lora
+policy:
+  model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
+  tokenizer:
+    name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
+  train_global_batch_size: 16
+  train_micro_batch_size: 1
+  logprob_batch_size: 1
+  max_total_sequence_length: 2048
+  dtensor_cfg:
+    enabled: false
+  megatron_cfg:
+    enabled: true
+    peft:
+      enabled: true
+      dim: 128
+      alpha: 512
+      exclude_modules: ['*out_proj*'] # Exclude all out_proj modules. When NemotronHMamba2Mixer uses cuda_kernels_forward, out_proj LoRA has no gradient.
+  sequence_packing:
+    enabled: false
+  generation:
+    vllm_cfg:
+      tensor_parallel_size: 4
+      gpu_memory_utilization: 0.7
+logger:
+  wandb_enabled: true
+  tensorboard_enabled: true
+  wandb:
+    project: nemo-rl
+    name: grpo-nanov3-30BA3B-2n8g-megatron-lora
+cluster:
+  gpus_per_node: 8
+  num_nodes: 2
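
The exclude_modules entry is a wildcard pattern, per its inline comment. A sketch of glob-style filtering over module names, assuming fnmatch-like semantics (the exact matching rule Megatron's LoRA applies is an assumption, not shown in this diff):

from fnmatch import fnmatch

# Wildcard-based module filtering, assuming fnmatch-style semantics for
# exclude_modules; module names below are illustrative.
exclude_modules = ["*out_proj*"]

names = [
    "decoder.layers.0.mixer.in_proj",
    "decoder.layers.0.mixer.out_proj",  # skipped: no gradient under
    "decoder.layers.0.mlp.linear_fc1",  # cuda_kernels_forward
]
adapted = [n for n in names if not any(fnmatch(n, p) for p in exclude_modules)]
print(adapted)  # out_proj is excluded from adaptation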
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+defaults: ../../grpo_math_1B.yaml
+grpo:
+  val_at_start: true
+checkpointing:
+  enabled: false
+  checkpoint_dir: results/grpo-qwen3-8b-base-1n8g-megatron-lora
+policy:
+  model_name: Qwen/Qwen3-8B-Base
+  max_total_sequence_length: 2048
+  dtensor_cfg:
+    enabled: false
+  megatron_cfg:
+    enabled: true
+    peft:
+      enabled: true
+      dim: 128
+      alpha: 128
+    scheduler:
+      lr_warmup_iters: 50
+
+  sequence_packing:
+    enabled: false
+logger:
+  log_dir: logs/grpo-qwen3-8b-base-1n8g-megatron-lora
+  wandb_enabled: true
+  tensorboard_enabled: true
+  wandb:
+    project: nemo-rl
+    name: grpo-qwen3-8b-base-1n8g-megatron-lora
+cluster:
+  gpus_per_node: 8
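
Both new recipes start from defaults: ../../grpo_math_1B.yaml and override only a handful of keys. A sketch of the recursive overlay this implies; the real loader's merge semantics are an assumption here:

# Recipe keys recursively override the base config while untouched keys
# (optimizer, dropout, ...) fall through. Assumed semantics, for
# illustration only.
def deep_merge(base: dict, override: dict) -> dict:
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

base = {"policy": {"megatron_cfg": {"enabled": False, "peft": {"enabled": False, "dim": 8}}}}
recipe = {"policy": {"megatron_cfg": {"enabled": True, "peft": {"enabled": True, "dim": 128}}}}
print(deep_merge(base, recipe))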

examples/configs/sft_openmathinstruct2_megatron.yaml

Lines changed: 12 additions & 0 deletions

@@ -96,6 +96,18 @@ policy:
     moe_enable_deepep: false
     moe_token_dispatcher_type: "allgather"
     moe_shared_expert_overlap: false
+    peft:
+      enabled: false
+      target_modules: []
+      exclude_modules: []
+      dim: 8
+      alpha: 32
+      dropout: 0.0
+      dropout_position: "post"
+      lora_A_init_method: "xavier"
+      lora_B_init_method: "zero"
+      a2a_experimental: false
+      lora_dtype: null

   env_vars:
     PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
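
Worth noting why the defaults pair lora_A_init_method: "xavier" with lora_B_init_method: "zero": the product B @ A then starts at zero, so enabling the adapter leaves the base model's outputs untouched, while gradients through B are still nonzero. A small self-contained check:

import torch

# xavier A + zero B: the adapter output is zero at initialization, yet the
# gradient w.r.t. B involves A (which is not zero), so training can move.
dim, d_in, d_out = 8, 64, 64
A = torch.empty(dim, d_in)
torch.nn.init.xavier_normal_(A)
B = torch.zeros(d_out, dim, requires_grad=True)

x = torch.randn(2, d_in)
y = (x @ A.T) @ B.T
y.sum().backward()
assert torch.count_nonzero(y) == 0        # no-op forward at step 0
assert torch.count_nonzero(B.grad) > 0    # but learning can proceed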

examples/configs/vlm_grpo_3B_megatron.yaml

Lines changed: 12 additions & 0 deletions

@@ -158,6 +158,18 @@ policy:
     moe_enable_deepep: false
     moe_token_dispatcher_type: "allgather"
     moe_shared_expert_overlap: false
+    peft:
+      enabled: false
+      target_modules: []
+      exclude_modules: []
+      dim: 8
+      alpha: 32
+      dropout: 0.0
+      dropout_position: "post"
+      lora_A_init_method: "xavier"
+      lora_B_init_method: "zero"
+      a2a_experimental: false
+      lora_dtype: null
   optimizer:
     optimizer: adam
     lr: 2.0e-07

nemo_rl/models/megatron/setup.py

Lines changed: 68 additions & 12 deletions

@@ -684,10 +684,6 @@ def setup_model_and_optimizer(

     mixed_precision_wrapper = Float16Module
     if policy_cfg["megatron_cfg"]["freeze_moe_router"]:
-        if use_peft:
-            raise ValueError(
-                "Freezing the MOE router is not currently supported when using PEFT"
-            )

         def freeze_moe_router(megatron_model):
             if not isinstance(megatron_model, list):
@@ -708,6 +704,14 @@ def freeze_moe_router(megatron_model):

     if use_peft:
         peft_cfg = policy_cfg["megatron_cfg"].get("peft", {})
+        if "dim" not in peft_cfg or peft_cfg["dim"] is None:
+            raise ValueError(
+                "If megatron_cfg.peft.enabled is True, dim must be set in peft_cfg"
+            )
+        if "alpha" not in peft_cfg or peft_cfg["alpha"] is None:
+            raise ValueError(
+                "If megatron_cfg.peft.enabled is True, alpha must be set in peft_cfg"
+            )
         peft = LoRA(
             target_modules=peft_cfg["target_modules"],
             exclude_modules=peft_cfg["exclude_modules"],
@@ -722,6 +726,7 @@ def freeze_moe_router(megatron_model):
         )
     else:
         peft = None
+
     megatron_cfg.peft = peft

     if megatron_cfg.peft is not None:
@@ -872,22 +877,70 @@ def setup_reference_model_state(
     if config["megatron_cfg"].get("freeze_moe_router", False):
         ref_mixed_precision_wrapper = MoEFloat16Module

+    ref_pre_wrap_hooks = []
+    use_peft = config["megatron_cfg"].get("peft", {}).get("enabled", False)
+
+    if use_peft:
+        peft_cfg = config["megatron_cfg"].get("peft", {})
+        if "dim" not in peft_cfg or peft_cfg["dim"] is None:
+            raise ValueError(
+                "If megatron_cfg.peft.enabled is True, dim must be set in peft_cfg"
+            )
+        if "alpha" not in peft_cfg or peft_cfg["alpha"] is None:
+            raise ValueError(
+                "If megatron_cfg.peft.enabled is True, alpha must be set in peft_cfg"
+            )
+        peft = LoRA(
+            target_modules=peft_cfg["target_modules"],
+            exclude_modules=peft_cfg["exclude_modules"],
+            dim=peft_cfg["dim"],
+            alpha=peft_cfg["alpha"],
+            dropout=peft_cfg["dropout"],
+            dropout_position=peft_cfg["dropout_position"],
+            lora_A_init_method="zero",
+            lora_B_init_method="zero",
+            a2a_experimental=peft_cfg["a2a_experimental"],
+            lora_dtype=peft_cfg["lora_dtype"],
+        )
+    else:
+        peft = None
+
+    ref_megatron_cfg.peft = peft
+
+    if ref_megatron_cfg.peft is not None:
+        pre_peft_hook = _create_peft_pre_wrap_hook(ref_megatron_cfg, ref_state)
+        ref_megatron_cfg.model.register_pre_wrap_hook(pre_peft_hook)
+
+        def composed_peft_hook(model: list[MegatronModule]) -> list[MegatronModule]:
+            model = pre_peft_hook(model)
+            return model
+
+        ref_pre_wrap_hooks.extend([composed_peft_hook])
+
     reference_model = get_model(
         megatron_cfg.model,
         megatron_cfg.ddp,
         use_torch_fsdp2=megatron_cfg.dist.use_torch_fsdp2,
         overlap_param_gather_with_optimizer_step=megatron_cfg.optimizer.overlap_param_gather_with_optimizer_step,
-        pre_wrap_hook=megatron_cfg.rng.data_parallel_random_init,
+        data_parallel_random_init=megatron_cfg.rng.data_parallel_random_init,
+        pre_wrap_hook=ref_pre_wrap_hooks,
         mixed_precision_wrapper=ref_mixed_precision_wrapper,
         pg_collection=ProcessGroupCollection.use_mpu_process_groups(),
     )

+    should_load_checkpoint = (
+        ref_checkpoint_config.pretrained_checkpoint is not None
+        and checkpoint_exists(ref_checkpoint_config.pretrained_checkpoint)
+    )
+
+    if should_load_checkpoint and use_peft:
+        # `finetune` is normally set to True to avoid loading optimizer and RNG
+        # states; it is switched off here so those states are loaded from the checkpoint.
+        ref_megatron_cfg.checkpoint.finetune = False
+
     print("Loading the Reference Model")
-    reference_state_dict = {}

-    if ref_checkpoint_config.pretrained_checkpoint is not None and checkpoint_exists(
-        ref_checkpoint_config.pretrained_checkpoint
-    ):
+    if should_load_checkpoint:
         load_checkpoint(
             ref_state,
             reference_model,
@@ -896,9 +949,14 @@ def setup_reference_model_state(
             checkpointing_context=ref_ckpt_context,
             skip_load_to_model_and_opt=HAVE_FSDP2 and megatron_cfg.dist.use_torch_fsdp2,
         )
+    else:
+        print("Reference model not loaded")
+
+    reference_state_dict = {}
+
+    if should_load_checkpoint or use_peft:
         reference_model = reference_model[0]
         reference_model.eval()
-
         # Store reference state dict on CPU
         for name, item in reference_model.state_dict().items():
             if isinstance(item, torch.Tensor):
@@ -908,8 +966,6 @@ def setup_reference_model_state(
                 cpu_item = item
             reference_state_dict[name] = cpu_item
         print("Reference model loaded")
-    else:
-        print("Reference model not loaded")

     return reference_state_dict
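
Note that setup_reference_model_state builds the reference LoRA with both init methods forced to "zero", overriding the configured xavier/zero pair. With both factors zero the adapter delta is identically zero, so the PEFT-wrapped reference computes exactly the frozen base model while its state_dict keys still mirror the adapted policy's. A sketch with illustrative shapes:

import torch

# Both LoRA factors zero-initialized, as the reference model above forces:
# the delta (alpha / dim) * B @ A is identically zero, so the wrapped
# reference behaves exactly like the frozen base weight W.
dim, d_in, d_out, alpha = 8, 64, 64, 32
W = torch.randn(d_out, d_in)
A = torch.zeros(dim, d_in)
B = torch.zeros(d_out, dim)

x = torch.randn(2, d_in)
assert torch.allclose(x @ W.T + (alpha / dim) * (x @ A.T) @ B.T, x @ W.T)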

nemo_rl/models/policy/__init__.py

Lines changed: 19 additions & 0 deletions

@@ -108,6 +108,24 @@ class RewardModelConfig(TypedDict):
     reward_model_type: str


+class MegatronPeftConfigDisabled(TypedDict):
+    enabled: Literal[False]
+
+
+class MegatronPeftConfig(TypedDict):
+    enabled: Literal[True]
+    target_modules: list[str]
+    exclude_modules: list[str]
+    dim: int
+    alpha: int
+    dropout: float
+    dropout_position: Literal["pre", "post"]
+    lora_A_init_method: str
+    lora_B_init_method: str
+    a2a_experimental: bool
+    lora_dtype: str | None
+
+
 class MegatronOptimizerConfig(TypedDict):
     optimizer: str
     lr: float
@@ -193,6 +211,7 @@ class MegatronConfig(TypedDict):
     moe_token_dispatcher_type: str
     # Can be used only with 'alltoall' token dispatcher
     moe_shared_expert_overlap: bool
+    peft: NotRequired[MegatronPeftConfig | MegatronPeftConfigDisabled]
     optimizer: MegatronOptimizerConfig
     scheduler: MegatronSchedulerConfig
     distributed_data_parallel_config: MegatronDDPConfig
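
Because enabled is typed as Literal[True] / Literal[False], the union is discriminated and a type checker can narrow it at a branch. A sketch of a downstream consumer (the describe function and the abridged field lists are illustrative, not from this PR):

from typing import Literal, TypedDict

class MegatronPeftConfigDisabled(TypedDict):
    enabled: Literal[False]

class MegatronPeftConfig(TypedDict):  # abridged copy of the new TypedDict
    enabled: Literal[True]
    dim: int
    alpha: int

# The `enabled` literal acts as a tag: on the True branch a type checker
# narrows the union, so `dim` and `alpha` are only read from the enabled
# variant.
def describe(peft: MegatronPeftConfig | MegatronPeftConfigDisabled) -> str:
    if peft["enabled"]:
        return f"LoRA rank {peft['dim']}, alpha {peft['alpha']}"
    return "PEFT disabled"

print(describe({"enabled": True, "dim": 128, "alpha": 128}))
print(describe({"enabled": False}))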

nemo_rl/models/policy/workers/megatron_policy_worker.py

Lines changed: 7 additions & 9 deletions

@@ -541,10 +541,10 @@ def use_reference_model(self):
                )
                model_state_dict[name] = item

-            self.model.load_state_dict(self.reference_state_dict, strict=True)
-            # for name, item in self.reference_state_dict.items():
-            #     if isinstance(item, torch.Tensor):
-            #         self.model.state_dict()[name] = item.detach().to(device="cuda", non_blocking=True, copy=True)
+            # Swap the reference model weights into self.model in place
+            for k, v in self.model.state_dict().items():
+                if isinstance(v, torch.Tensor):
+                    v.copy_(self.reference_state_dict[k])

            if self.cfg["megatron_cfg"]["empty_unused_memory_level"] >= 1:
                gc.collect()
@@ -556,11 +556,9 @@ def use_reference_model(self):

        finally:
            # Restore original references and device placement
-            self.model.load_state_dict(model_state_dict, strict=True)
-            # for name, item in model_state_dict.items():
-            #     if isinstance(item, torch.Tensor):
-            #         item = item.detach().to(device="cuda", non_blocking=True, copy=True)
-            #         self.model.state_dict()[name] = item
+            for k, v in self.model.state_dict().items():
+                if isinstance(v, torch.Tensor):
+                    v.copy_(model_state_dict[k])

            if self.cfg["megatron_cfg"]["empty_unused_memory_level"] >= 1:
                gc.collect()
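
The swap uses in-place Tensor.copy_ rather than load_state_dict: copy_ writes through the existing storage, so anything already holding the parameter (optimizer state, DDP buckets) keeps seeing the same tensor object. A minimal sketch of the pattern, mirroring the swap-then-restore shape above:

import torch

# In-place swap: copy_ overwrites the existing storage, so live references
# to the parameter remain valid across the swap and restore.
model = torch.nn.Linear(4, 4)
snapshot = {k: v.detach().clone() for k, v in model.state_dict().items()}

before = model.weight                    # keep a live reference
with torch.no_grad():
    for k, v in model.state_dict().items():
        v.copy_(torch.zeros_like(v))     # swap in other weights...
    for k, v in model.state_dict().items():
        v.copy_(snapshot[k])             # ...then restore, as in `finally`

assert before is model.weight            # same tensor object throughout
assert torch.equal(model.weight, snapshot["weight"])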
