
Commit 1645e61

feat: nano v3 configs and FSDP fix (#964)
Signed-off-by: adil-a <[email protected]>
1 parent 1efd3e8 commit 1645e61

9 files changed: +244 −7 lines changed
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# To run this recipe, please use the following command:
# torchrun --nproc-per-node=8 recipes/llm_finetune/finetune.py --config recipes/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad_peft.yaml

step_scheduler:
  global_batch_size: 16
  local_batch_size: 1
  ckpt_every_steps: 1000
  val_every_steps: 1000  # will run every x number of gradient steps
  max_steps: 100

dist_env:
  backend: nccl
  timeout_minutes: 1

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16

# torch.compile configuration
compile:
  enabled: false
  mode: "default"  # Options: "default", "reduce-overhead", "max-autotune"
  fullgraph: false
  dynamic: true  # Set to false for better performance with fixed shapes
  backend: null  # Use default backend (inductor)

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: none
  dp_replicate_size: 1  # dp_shard_size = dp_size / dp_replicate_size and dp_shard_size < dp_size. For DDP usecase, use DDPManager
  tp_size: 1
  cp_size: 1
  sequence_parallel: false
  defer_fsdp_grad_sync: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: train

packed_sequence:
  packed_sequence_size: 0

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  shuffle: True

validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: validation
  limit_dataset_samples: 64

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  eps: 1e-8
  lr: 1.0e-5
  weight_decay: 0

lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6

# wandb:
#   project: <your_wandb_project>
#   entity: <your_wandb_entity>
#   name: <your_wandb_exp_name>
#   save_dir: <your_wandb_save_dir>
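
Note on the step_scheduler values above: with global_batch_size 16, local_batch_size 1, and the 8-rank torchrun launch from the header comment (tp_size and cp_size are 1, so all ranks are data-parallel), the usual batching convention implies 2 gradient-accumulation micro-batches per optimizer step, which is exactly the case the defer_fsdp_grad_sync flag introduced in this commit targets. A minimal sketch of that arithmetic (an illustration of the convention, not code from the recipe):

    # Sketch: micro-batches per optimizer step, assuming the usual convention
    # global = local * dp_world_size * grad_accum. Illustrative only.
    def grad_accum_steps(global_batch_size: int, local_batch_size: int, dp_world_size: int) -> int:
        assert global_batch_size % (local_batch_size * dp_world_size) == 0
        return global_batch_size // (local_batch_size * dp_world_size)

    print(grad_accum_steps(16, 1, 8))  # 2: only the second micro-batch needs a gradient sync when deferral is on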
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# To run this recipe, please use the following command:
# torchrun --nproc-per-node=8 recipes/llm_finetune/finetune.py --config recipes/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad_peft.yaml

step_scheduler:
  global_batch_size: 8
  local_batch_size: 1
  ckpt_every_steps: 1000
  val_every_steps: 1000  # will run every x number of gradient steps
  max_steps: 100

dist_env:
  backend: nccl
  timeout_minutes: 1

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16

# torch.compile configuration
compile:
  enabled: false
  mode: "default"  # Options: "default", "reduce-overhead", "max-autotune"
  fullgraph: false
  dynamic: true  # Set to false for better performance with fixed shapes
  backend: null  # Use default backend (inductor)

peft:
  _target_: nemo_automodel.components._peft.lora.PeftConfig
  match_all_linear: True
  dim: 8
  alpha: 32
  use_triton: True

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: none
  dp_replicate_size: 1  # dp_shard_size = dp_size / dp_replicate_size and dp_shard_size < dp_size. For DDP usecase, use DDPManager
  tp_size: 1
  cp_size: 1
  sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: train

packed_sequence:
  packed_sequence_size: 0

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  shuffle: True

validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: validation
  limit_dataset_samples: 64

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  eps: 1e-8
  lr: 1.0e-5
  weight_decay: 0

lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6

# wandb:
#   project: <your_wandb_project>
#   entity: <your_wandb_entity>
#   name: <your_wandb_exp_name>
#   save_dir: <your_wandb_save_dir>
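
The peft block in this second config enables LoRA with rank (dim) 8 and alpha 32, i.e. a scaling factor of alpha / dim = 4 on the low-rank update applied to every matched linear layer. A generic sketch of what those two numbers control (the standard LoRA formulation, not the nemo_automodel PeftConfig implementation):

    import torch.nn as nn

    class LoRALinear(nn.Module):
        """Standard LoRA adapter around a frozen linear layer (illustrative only)."""

        def __init__(self, base: nn.Linear, dim: int = 8, alpha: int = 32):
            super().__init__()
            self.base = base
            for p in self.base.parameters():
                p.requires_grad_(False)           # base weights stay frozen
            self.lora_a = nn.Linear(base.in_features, dim, bias=False)
            self.lora_b = nn.Linear(dim, base.out_features, bias=False)
            nn.init.zeros_(self.lora_b.weight)    # adapter starts as a no-op
            self.scaling = alpha / dim            # 32 / 8 = 4.0 with the config above

        def forward(self, x):
            return self.base(x) + self.scaling * self.lora_b(self.lora_a(x))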

nemo_automodel/components/distributed/fsdp2.py

Lines changed: 5 additions & 0 deletions
@@ -128,6 +128,11 @@ class FSDP2Manager:
         metadata={"help": "Enable activation checkpointing if True. Applies to linear layers."},
     )

+    defer_fsdp_grad_sync: Optional[bool] = field(
+        default=True,
+        metadata={"help": "Defer FSDP gradient sync to only the final micro-batch before the optimizer step if True."},
+    )
+
     def __post_init__(self):
         """
         Post-initialization hook that sets up the distributed environment.

nemo_automodel/components/distributed/utils.py

Lines changed: 8 additions & 2 deletions
@@ -213,13 +213,16 @@ def reduce_loss(
     return loss, denominator


-def get_sync_ctx(model, is_optim_step):
+def get_sync_ctx(model, is_optim_step, defer_fsdp_grad_sync: bool):
     """
     Get the synchronization context for the model.

     Args:
         model: The model to synchronize.
         is_optim_step: Whether the current step is an optimizer step.
+        defer_fsdp_grad_sync: Controls FSDP2 gradient synchronization during gradient accumulation.
+            - True: disable gradient sync on non-final micro-batches (saves comm, can increase peak memory).
+            - False: always sync gradients on every micro-batch (more comm, lower peak memory).

     Returns:
         A context manager that synchronizes the model.
@@ -229,7 +232,10 @@ def get_sync_ctx(model, is_optim_step):
     # all-reduce for every micro-batch and greatly improves throughput.
     sync_ctx = nullcontext()
     if isinstance(model, dist.fsdp._fully_shard._fully_shard.FSDPModule):
-        model.set_requires_gradient_sync(is_optim_step)
+        if defer_fsdp_grad_sync:
+            model.set_requires_gradient_sync(is_optim_step)
+        else:
+            model.set_requires_gradient_sync(True)
     elif isinstance(model, torch.nn.parallel.DistributedDataParallel) and not is_optim_step:
         sync_ctx = model.no_sync()
     return sync_ctx
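
Putting the new argument in context: the recipe diffs below wrap each micro-batch's forward/backward in the context returned by get_sync_ctx, passing True for is_optim_step only on the final micro-batch. A simplified sketch of that pattern (the import path matches the file above; the loop, the model_wrapper name, and the (logits, labels) loss signature are illustrative assumptions, not the recipe code):

    from contextlib import nullcontext

    from nemo_automodel.components.distributed.utils import get_sync_ctx

    def run_micro_batches(model, batches, loss_fn, model_wrapper=None, is_train=True):
        num_batches = len(batches)
        for idx, batch in enumerate(batches):
            sync_ctx = (
                get_sync_ctx(
                    model,
                    idx == num_batches - 1,  # True only on the micro-batch that precedes the optimizer step
                    defer_fsdp_grad_sync=getattr(model_wrapper, "defer_fsdp_grad_sync", True),
                )
                if is_train
                else nullcontext()
            )
            labels = batch.pop("labels")
            with sync_ctx:
                loss = loss_fn(model(**batch).logits, labels)  # assumes a (logits, labels) loss
                if is_train:
                    loss.backward()

With defer_fsdp_grad_sync=True (the default), FSDP2 skips gradient synchronization on every micro-batch except the last; with False, it synchronizes on every micro-batch, trading extra communication for lower peak memory, as the docstring above describes.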

nemo_automodel/recipes/llm/kd.py

Lines changed: 9 additions & 1 deletion
@@ -173,7 +173,15 @@ def _forward_backward_step(
         train_ctx, batch = make_cp_batch_and_ctx(self.device_mesh, batch, labels)

         model = self.model_parts[0]
-        sync_ctx = get_sync_ctx(model, idx == num_batches - 1) if is_train else nullcontext()
+        sync_ctx = (
+            get_sync_ctx(
+                model,
+                idx == num_batches - 1,
+                defer_fsdp_grad_sync=getattr(self.model_wrapper, "defer_fsdp_grad_sync", True),
+            )
+            if is_train
+            else nullcontext()
+        )
         with train_ctx(), sync_ctx:
             # No grad for teacher forward
             with (

nemo_automodel/recipes/llm/train_ft.py

Lines changed: 9 additions & 1 deletion
@@ -1221,7 +1221,15 @@ def _forward_backward_step(
             loss_buffer.append(local_loss.clone().detach())
         else:
             model = self.model_parts[0]
-            sync_ctx = get_sync_ctx(model, idx == num_batches - 1) if is_train else nullcontext()
+            sync_ctx = (
+                get_sync_ctx(
+                    model,
+                    idx == num_batches - 1,
+                    defer_fsdp_grad_sync=getattr(self.model_wrapper, "defer_fsdp_grad_sync", True),
+                )
+                if is_train
+                else nullcontext()
+            )
             with train_ctx(), sync_ctx:
                 if isinstance(self.loss_fn, FusedLinearCrossEntropy):
                     # use num_logits_to_keep to avoid full logits matrix in memory

nemo_automodel/recipes/vlm/finetune.py

Lines changed: 8 additions & 1 deletion
@@ -755,7 +755,14 @@ def _run_train_optim_step(self, batches, max_grad_norm: Optional[float] = None):
             labels = batch.pop("labels")

             train_ctx, batch = make_cp_batch_and_ctx(self.device_mesh, batch, labels)
-            with train_ctx(), get_sync_ctx(self.model, i == num_batches - 1):
+            with (
+                train_ctx(),
+                get_sync_ctx(
+                    self.model,
+                    i == num_batches - 1,
+                    defer_fsdp_grad_sync=getattr(self.model_wrapper, "defer_fsdp_grad_sync", True),
+                ),
+            ):
                 if isinstance(self.loss_fn, FusedLinearCrossEntropy):
                     # use num_logits_to_keep to avoid full logits matrix in memory
                     out = self.model(logits_to_keep=1, **batch)

tests/unit_tests/distributed/test_utils.py

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ def test_get_sync_ctx(monkeypatch, patch_dist):
     class Plain(torch.nn.Linear):
         pass

-    ctx = du.get_sync_ctx(Plain(2, 2), is_optim_step=False)
+    ctx = du.get_sync_ctx(Plain(2, 2), is_optim_step=False, defer_fsdp_grad_sync=False)
     # entering/exiting the context must be a no-op
     with ctx:
         pass
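
A possible companion check for the new parameter (a sketch, not part of this commit, and ignoring the monkeypatch/patch_dist fixtures the existing test relies on): for a module that is neither FSDP2-sharded nor DDP-wrapped, the returned context should remain a no-op for either value of the flag.

    import pytest
    import torch

    from nemo_automodel.components.distributed import utils as du

    @pytest.mark.parametrize("defer", [True, False])
    def test_get_sync_ctx_plain_module_ignores_flag(defer):
        # A plain nn.Linear takes neither the FSDP2 nor the DDP branch in get_sync_ctx,
        # so entering/exiting the returned context must be a no-op regardless of the flag.
        ctx = du.get_sync_ctx(torch.nn.Linear(2, 2), is_optim_step=False, defer_fsdp_grad_sync=defer)
        with ctx:
            pass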

tests/unit_tests/recipes/test_finetune_vlm_helpers.py

Lines changed: 2 additions & 1 deletion
@@ -228,6 +228,7 @@ def test_run_train_step_supports_tensor_outputs(monkeypatch):
     recipe.cfg = _Cfg(fp8=None)
     recipe.lr_scheduler = None
     recipe.timestamp = 0.0
+    recipe.model_wrapper = None

     recipe._dp_allreduce = lambda tensor, include_cp=False: tensor
     recipe._get_dp_group_size = lambda include_cp=True: 1
@@ -251,7 +252,7 @@ def fake_calculate_loss(*args, **kwargs):
     )
     monkeypatch.setattr(
         "nemo_automodel.recipes.vlm.finetune.get_sync_ctx",
-        lambda model, is_last: nullcontext(),
+        lambda model, is_last, defer_fsdp_grad_sync=True: nullcontext(),
     )

     calculate_mock = MagicMock(side_effect=fake_calculate_loss)
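
Because the recipes read the flag via getattr(self.model_wrapper, "defer_fsdp_grad_sync", True), setting recipe.model_wrapper = None in this test (or running with a wrapper that predates the new field) simply falls back to the old behaviour of deferring the sync to the final micro-batch. A tiny illustration of that fallback (illustrative snippet, not repository code):

    class _Wrapper:
        defer_fsdp_grad_sync = False

    for wrapper in (None, object(), _Wrapper()):
        print(getattr(wrapper, "defer_fsdp_grad_sync", True))
    # prints: True, True, False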
