
Commit 56a6225

feat: add support for nemotron-nas with custom plan. (#1180)
Signed-off-by: Jonas Yang <[email protected]>
1 parent 7aa7071 commit 56a6225

6 files changed (+303, -4 lines changed)

Lines changed: 180 additions & 0 deletions
# GRPO Algorithm Configuration
grpo:
  num_prompts_per_step: 128
  num_generations_per_prompt: 16
  max_rollout_turns: 1 # for multi-turn rollouts. Math environments have just 1 turn (answering the question)
  max_num_epochs: 1
  max_num_steps: 1000000
  normalize_rewards: true
  use_leave_one_out_baseline: true
  val_period: 10
  val_at_start: false
  overlong_filtering: false
  max_val_samples: 256
  val_batch_size: 256
  seed: 42
  async_grpo:
    enabled: false
    max_trajectory_age_steps: 1

loss_fn:
  reference_policy_kl_penalty: 0.01
  ratio_clip_min: 0.2
  ratio_clip_max: 0.2
  ratio_clip_c: null
  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false
  sequence_level_importance_ratios: false
  token_level_loss: true

checkpointing:
  enabled: true
  checkpoint_dir: "results/grpo"
  metric_name: "val_reward"
  higher_is_better: true
  keep_top_k: 3
  save_period: 10
  checkpoint_must_save_by: null
  model_save_format: "safetensors"
  save_consolidated: false

policy:
  model_name: "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5"
  tokenizer:
    name: "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5"
  max_total_sequence_length: 1024
  precision: "bfloat16"
  train_global_batch_size: 128
  train_micro_batch_size: 4
  logprob_batch_size: 4
  logprob_chunk_size: null

  dtensor_cfg:
    _v2: true
    activation_checkpointing: true
    context_parallel_size: 1
    cpu_offload: false
    enabled: true
    sequence_parallel: false
    tensor_parallel_size: 8
    custom_parallel_plan: examples.configs.recipes.llm.llama_nemotron_super_49b_custom_plan.custom_parallel_plan

  megatron_cfg:
    enabled: false

  # See docs/design-docs/sequence-packing-and-dynamic-batching.md
  # for more details on dynamic batching and sequence packing.
  dynamic_batching:
    enabled: True
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    sequence_length_round: 64

  sequence_packing:
    enabled: False
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64

  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
  max_grad_norm: 1.0

  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 3.0e-7
      weight_decay: 0.01
      betas: [0.9, 0.999]
      eps: 1e-8

  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        # The scheduler iteration is per GRPO step and is decoupled from the optimizer step (there may be >=1 optimizer steps per GRPO step)
        total_iters: 13
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones: [13]

  generation:
    backend: "vllm"
    max_new_tokens: ${policy.max_total_sequence_length}
    temperature: 1.0
    top_p: 1.0
    top_k: null
    stop_token_ids: null
    stop_strings: null
    vllm_cfg:
      async_engine: false
      precision: ${policy.precision}
      tensor_parallel_size: 4
      pipeline_parallel_size: 1
      expert_parallel_size: 1 # When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP
      gpu_memory_utilization: 0.6
      max_model_len: ${policy.max_total_sequence_length}
      # When enforce_eager is False, you can optionally set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy;
      # with that flag, vLLM uses its custom CUDA kernels instead of the Triton kernels generated by torch.compile.
      # For more details, see the convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
      enforce_eager: False
      use_deep_gemm: False
      num_last_layers_in_bf16: 0
      num_first_layers_in_bf16: 0
      vllm_kwargs: {}
    colocated:
      # true: generation shares training GPUs
      # false: uses dedicated generation resources
      enabled: true
      # only relevant when enabled is false
      resources:
        gpus_per_node: null # Number of GPUs dedicated to generation when the cluster has a single node (i.e., cluster.num_nodes == 1)
        num_nodes: null # Number of nodes dedicated to generation

data:
  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound; actual truncation occurs at vllm.max_model_len
  prompt_file: "examples/prompts/cot.txt"
  system_prompt_file: null
  shuffle: true

  dataset_name: "OpenMathInstruct-2"
  # You can use custom response datasets for training and validation. For example:
  # data:
  #   dataset_name: ResponseDataset
  #   train_data_path: <PathToTrainingDataset> # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
  #   val_data_path: <PathToValidationDataset>
  #   input_key: <QuestionKey>, default is "input"
  #   output_key: <AnswerKey>, default is "output"
  #   train_split: <TrainSplit>, default is None # used for HuggingFace datasets
  #   val_split: <ValSplit>, default is None # used for HuggingFace datasets
  # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/grpo.md#datasets for more details.

env:
  math:
    num_workers: 8

logger:
  log_dir: "logs" # Base directory for all logs
  num_val_samples_to_print: 0
  wandb_enabled: true # Make sure you run `wandb login [Your API key]` before running
  tensorboard_enabled: false
  mlflow_enabled: false
  monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
  wandb:
    project: "grpo-nemotron-super-49b"
    name: "grpo-${data.dataset_name}-nemotron-super-49b-tp${policy.dtensor_cfg.tensor_parallel_size}"
  tensorboard: {}
  mlflow:
    experiment_name: "sft-dev"
    run_name: "grpo-nemotron-super-49b"
  gpu_monitoring:
    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

cluster:
  gpus_per_node: 8
  num_nodes: 4
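
The dynamic_batching and sequence_packing token budgets above are computed with ${mul:...} interpolations. A minimal sketch of how that arithmetic resolves, assuming mul is a custom OmegaConf resolver registered by the repo (the standalone registration below is for illustration only):

from omegaconf import OmegaConf

# Assumed resolver: multiplies its two arguments. NeMo RL is presumed to
# register something equivalent before loading this config.
OmegaConf.register_new_resolver("mul", lambda a, b: a * b)

cfg = OmegaConf.create(
    """
    policy:
      max_total_sequence_length: 1024
      train_micro_batch_size: 4
      dynamic_batching:
        train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    """
)
# Accessing the key triggers resolution of the nested interpolations.
print(cfg.policy.dynamic_batching.train_mb_tokens)  # 4096 = 1024 * 4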
Lines changed: 49 additions & 0 deletions
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    ParallelStyle,
    PrepareModuleInput,
    PrepareModuleOutput,
    RowwiseParallel,
)
from torch.distributed.tensor.placement_types import Replicate, Shard

custom_parallel_plan: dict[str, ParallelStyle] = {
    "model.layers.*.self_attn": PrepareModuleInput(
        input_kwarg_layouts={"attention_mask": Replicate()},
        desired_input_kwarg_layouts={"attention_mask": Replicate()},
    ),
    "model.embed_tokens": RowwiseParallel(
        input_layouts=Replicate(), output_layouts=Replicate(), use_local_output=True
    ),
    "model.layers.*.self_attn.q_proj": ColwiseParallel(use_local_output=False),
    "model.layers.*.self_attn.k_proj": ColwiseParallel(use_local_output=False),
    "model.layers.*.self_attn.v_proj": ColwiseParallel(use_local_output=False),
    "model.layers.*.self_attn.o_proj": RowwiseParallel(
        output_layouts=Replicate(), use_local_output=True
    ),
    "model.layers.*.self_attn.rotary_emb": PrepareModuleOutput(
        output_layouts=(Replicate(), Replicate()),
        desired_output_layouts=(Replicate(), Replicate()),
        use_local_output=False,
    ),
    "model.layers.*.mlp.up_proj": ColwiseParallel(),
    "model.layers.*.mlp.gate_proj": ColwiseParallel(),
    "model.layers.*.mlp.down_proj": RowwiseParallel(
        output_layouts=Replicate(), use_local_output=True
    ),
    "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
}
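
This module is what policy.dtensor_cfg.custom_parallel_plan in the config above points to: a mapping from module FQN patterns to ParallelStyle objects. For context, a minimal sketch of how torch consumes such a plan via parallelize_module, using a toy model and mesh (names here are illustrative, not the repo's actual wiring; run under torchrun with 8 ranks):

import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# Hypothetical two-layer MLP standing in for a transformer block.
model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024)).cuda()

# 1-D tensor-parallel mesh over 8 GPUs, matching tensor_parallel_size: 8 above.
tp_mesh = init_device_mesh("cuda", (8,), mesh_dim_names=("tp",))

# FQN -> ParallelStyle, the same shape as custom_parallel_plan.
toy_plan = {
    "0": ColwiseParallel(),  # shard the first Linear along its output dim
    "2": RowwiseParallel(),  # shard the second Linear along its input dim
}
model = parallelize_module(model, tp_mesh, toy_plan)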

nemo_rl/models/policy/dtensor_policy_worker_v2.py

Lines changed: 29 additions & 3 deletions
@@ -39,7 +39,6 @@
 )
 from nemo_automodel.components.distributed.parallelizer import (
     fsdp2_strategy_parallelize,
-    unshard_fsdp2_model,
 )
 from nemo_automodel.components.distributed.tensor_utils import (
     get_cpu_state_dict,
@@ -181,6 +180,10 @@ def __init__(
             else None,
         )

+        self.allow_flash_attn_args = self.check_model_allow_flash_attn_args(
+            model_config
+        )
+
         self._is_reward_model = (
             "reward_model_cfg" in self.cfg and self.cfg["reward_model_cfg"]["enabled"]
         )
@@ -467,6 +470,17 @@ def init_collective(self, ip: str, port: int, world_size: int) -> None:
     def is_alive(self) -> bool:
         return True

+    def check_model_allow_flash_attn_args(self, model_config) -> bool:
+        # Some models don't support flash_attn_kwargs.
+        # Check for nemotron-nas.
+        if (
+            model_config.architectures[0] == "DeciLMForCausalLM"
+            and model_config.model_type == "nemotron-nas"
+        ):
+            return False
+
+        return True
+
     def reset_peak_memory_stats(self) -> None:
         torch.cuda.reset_peak_memory_stats()

@@ -686,6 +700,12 @@ def train(
                 if len(vlm_kwargs) > 0:
                     del model_args["flash_attn_kwargs"]

+                if (
+                    not self.allow_flash_attn_args
+                    and "flash_attn_kwargs" in model_args
+                ):
+                    del model_args["flash_attn_kwargs"]
+
                 outputs = self.model(**model_args)

                 # Get logprobs
@@ -879,7 +899,7 @@ def get_logprobs(
         all_log_probs = []
         self.model.eval()

-        with unshard_fsdp2_model(self.model), torch.no_grad():
+        with torch.no_grad():
             data.to("cuda")
             dummy_iterator = iter([])
             if self.cfg["dynamic_batching"]["enabled"]:
@@ -997,6 +1017,12 @@ def get_logprobs(
                 if len(vlm_kwargs) > 0:
                     del model_args["flash_attn_kwargs"]

+                if (
+                    not self.allow_flash_attn_args
+                    and "flash_attn_kwargs" in model_args
+                ):
+                    del model_args["flash_attn_kwargs"]
+
                 outputs = self.model(**model_args)

                 logits = outputs.logits
@@ -1158,7 +1184,7 @@ def score(self, data: BatchedDataDict) -> BatchedDataDict[ScoreOutputSpec]:
         )
         self.model.eval()
         print("Begin to batch datas")
-        with unshard_fsdp2_model(self.model), torch.no_grad():
+        with torch.no_grad():
             data.to("cuda")
             dummy_iterator = iter([])
             if self.cfg["dynamic_batching"]["enabled"]:
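
The new check_model_allow_flash_attn_args gate keys off two fields of the Hugging Face model config. A minimal standalone sketch of the same test, assuming the config is loadable with transformers.AutoConfig (trust_remote_code is assumed because nemotron-nas models ship custom modeling code):

from transformers import AutoConfig

# nemotron-nas checkpoints report architecture "DeciLMForCausalLM" and
# model_type "nemotron-nas"; their forward() does not accept
# flash_attn_kwargs, so the worker drops those kwargs before calling the model.
config = AutoConfig.from_pretrained(
    "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5", trust_remote_code=True
)
allow_flash_attn_args = not (
    config.architectures[0] == "DeciLMForCausalLM"
    and config.model_type == "nemotron-nas"
)
print(allow_flash_attn_args)  # expected: False for this model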
Lines changed: 41 additions & 0 deletions
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source $SCRIPT_DIR/common.env

# ===== BEGIN CONFIG =====
NUM_NODES=4
STEPS_PER_RUN=2 # 40min: step_time: [1341, 801]
MAX_STEPS=2
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=30
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd $PROJECT_ROOT
uv run examples/run_grpo_math.py \
    --config $CONFIG_PATH \
    grpo.max_num_steps=$MAX_STEPS \
    logger.log_dir=$LOG_DIR \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name=$EXP_NAME \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir=$CKPT_DIR \
    $@ \
    2>&1 | tee $RUN_LOG

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py $JSON_METRICS \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        'data["train/token_mult_prob_error"]["2"] < 1.1' \
        'mean(data["timing/train/policy_training"]) < 280' \
        'mean(data["ray/node.0.gpu.0.mem_gb"]) < 75'
fi
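
The jq gate above extracts the highest step logged for train/loss and only runs check_metrics.py once training has reached MAX_STEPS. A minimal Python sketch of the same gate, assuming the {metric: {step: value}} JSON layout produced by tests/json_dump_tb_logs.py (the file path here is hypothetical):

import json

MAX_STEPS = 2  # mirrors the script's config above

with open("json_metrics.json") as f:  # hypothetical path for $JSON_METRICS
    metrics = json.load(f)

# Highest step index recorded for train/loss (keys are stringified steps).
max_logged_step = max(int(step) for step in metrics["train/loss"])

if max_logged_step >= MAX_STEPS:
    errors = list(metrics["train/token_mult_prob_error"].values())
    assert sum(errors) / len(errors) < 1.1  # mean token mult-prob error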

tests/test_suites/nightly.txt

Lines changed: 3 additions & 0 deletions
@@ -40,6 +40,9 @@ tests/test_suites/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.sh
 # Non-colocated
 tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh

+# Nemotron Super 49B
+tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh
+
 #######
 # SFT #
 #######
