Merged
116 commits
751d995
feat: Support DAPO dynamic sampling and reward shaping
peri044 Jul 3, 2025
e5dd193
Merge branch 'main' into dapo
peri044 Jul 3, 2025
d3ec46f
-s
peri044 Aug 10, 2025
7b9226b
feat: Add DAPO implementation
peri044 Aug 10, 2025
d6cd81d
chore: revert DAPO changes in GRPO files
peri044 Aug 10, 2025
108e1e0
chore: move DAPO impl back to GRPO
peri044 Aug 12, 2025
f249641
chore: remove dapo scripts
peri044 Aug 12, 2025
17b519c
Merge pull request #1 from peri044/dapo_merge
peri044 Aug 12, 2025
0d7f9bf
fix: Fix config bug
peri044 Aug 12, 2025
15fb6dc
chore: add timer for dynamic sampling and log filtered and total rewards
peri044 Aug 13, 2025
edda8d4
chore: add dapo-math dataset, address review comments
peri044 Aug 15, 2025
53c004f
chore: add DAPO-MATH-17k config
peri044 Aug 15, 2025
65df6b6
chore: update dapo-math config
peri044 Aug 15, 2025
25c9d8b
chore: update config
peri044 Aug 15, 2025
e4431ac
chore: remove the dapo deepscalar config
peri044 Aug 18, 2025
d99e460
chore: add reward scaling
peri044 Aug 20, 2025
77d08ff
chore: change start_factor to 0.1
peri044 Aug 20, 2025
a33fad2
chore: update dapo config
peri044 Aug 21, 2025
f55c9f7
chore: update gpus_per_node
peri044 Aug 21, 2025
57b0544
chore: rebase with main
peri044 Aug 21, 2025
cff451f
chore: fix prompt_file issue and disable leave_one_out baseline
peri044 Aug 21, 2025
f2a9d67
chore: address review comments
peri044 Aug 26, 2025
4f5ce08
chore: delete approach 1 of DS
peri044 Aug 26, 2025
dddfb03
chore: use verl's math verifier
peri044 Aug 27, 2025
4000965
chore: update math env to use dapo_math_verifier
peri044 Aug 27, 2025
f5de167
chore: disable reward scaling when dapo_math_verifier is used
peri044 Aug 27, 2025
ac228d2
chore: update reward calculation
peri044 Aug 27, 2025
14204e4
Merge branch 'main' into dapo
peri044 Aug 28, 2025
6981357
chore: additional updates
peri044 Aug 29, 2025
cd1cbdf
chore: minor updates and add megatron config
peri044 Aug 29, 2025
3717c9d
chore: address review comments
peri044 Aug 29, 2025
88716fd
chore: remove dapo-math.yaml
peri044 Aug 29, 2025
d4fb90f
chore: address review comments and add reward shaping fix
peri044 Sep 17, 2025
b59d705
chore: rebase with main
peri044 Sep 17, 2025
fabe873
chore: restore to original state
peri044 Sep 17, 2025
8a4e12d
chore: update docstring
peri044 Oct 2, 2025
4b1f9a5
chore: rebase with main and resolve conflicts
peri044 Oct 2, 2025
2b885b8
chore: fix filtered reward metric
peri044 Oct 4, 2025
afec799
add unit tests
ashors1 Oct 7, 2025
ae6b96a
add recipe and short convergence test
ashors1 Oct 7, 2025
21cce62
clean up config
ashors1 Oct 7, 2025
c8bdae8
chore: minor updates to logger data
peri044 Oct 7, 2025
54ec885
chore: rebase
peri044 Oct 7, 2025
6d0c521
chore: updates
peri044 Oct 7, 2025
8abeabf
chore: update doc
peri044 Oct 7, 2025
06544f3
chore: update docs with intermediate results
peri044 Oct 7, 2025
32c636c
chore: address review comments
peri044 Oct 7, 2025
a73c22f
address comment, remove some redundant tests
ashors1 Oct 7, 2025
5d6a46f
Merge pull request #2 from peri044/ashors/dapo-tests
peri044 Oct 7, 2025
ed06fe0
fix configs
ashors1 Oct 8, 2025
72ebf0b
fix test name
ashors1 Oct 8, 2025
b4ee6f7
add dapo to mapping
ashors1 Oct 8, 2025
5e6a251
chore: Fix a test case
peri044 Oct 8, 2025
1d156db
Merge branch 'dapo' of https://github.com/peri044/RL into dapo
peri044 Oct 8, 2025
703932b
Merge branch 'main' into dapo
peri044 Oct 8, 2025
6b8c945
chore: use kwargs in math verifier to be consistent
peri044 Oct 8, 2025
564ef19
Merge branch 'dapo' of https://github.com/peri044/RL into dapo
peri044 Oct 8, 2025
2efd601
Merge pull request #3 from peri044/ashors/dapo-fix-unit-tests
peri044 Oct 8, 2025
0a09fac
Merge branch 'main' into dapo
peri044 Oct 8, 2025
7832e93
chore: update license for dapo math verifier
peri044 Oct 8, 2025
591e3e8
Merge branch 'dapo' of https://github.com/peri044/RL into dapo
peri044 Oct 8, 2025
22e78e7
dapo convergence test fixes
ashors1 Oct 9, 2025
fdc0b8c
Merge branch 'main' into dapo
peri044 Oct 9, 2025
1676108
Merge branch 'main' into dapo
peri044 Oct 9, 2025
65f33bf
make hf_overrides configurable
ashors1 Oct 9, 2025
a43177e
propagate hf overrides to vllm
ashors1 Oct 10, 2025
1008fda
default to None
ashors1 Oct 10, 2025
5256c81
minor fix
ashors1 Oct 10, 2025
1d8de6c
add copyright
ashors1 Oct 10, 2025
3967835
lint
ashors1 Oct 10, 2025
ef78bfb
Merge pull request #4 from peri044/ashors/dapo-fix-conv-test
ashors1 Oct 10, 2025
2668f4b
Merge branch 'main' into dapo
peri044 Oct 10, 2025
3371bbd
chore: fix linter check
peri044 Oct 10, 2025
d529897
Merge branch 'dapo' of https://github.com/peri044/RL into dapo
peri044 Oct 10, 2025
b7bc4d7
fix hf_overrides default
ashors1 Oct 10, 2025
44a3e3c
fix hf_overrides during model conversion
ashors1 Oct 10, 2025
1ab98c1
document configs, add missing defaults
ashors1 Oct 10, 2025
fe7c46e
address some of Terry's comments
ashors1 Oct 10, 2025
234a5fb
chore: address some review comments
peri044 Oct 11, 2025
329fe1c
Merge pull request #5 from peri044/ashors/dapo-comments
peri044 Oct 11, 2025
939189b
chore: rebase
peri044 Oct 11, 2025
8fe6c20
chore: fix conflicts in README.md
peri044 Oct 11, 2025
298aad3
chore: add dapo guide link
peri044 Oct 11, 2025
f39a3df
chore: fix incorrect algo name
peri044 Oct 11, 2025
a5f3eea
chore: address review comments
peri044 Oct 11, 2025
36b72e0
chore: update logging of % of prompts that are being discarded
peri044 Oct 11, 2025
5399de8
chore: update non_zero_std_fraction metric in wandb
peri044 Oct 11, 2025
3effac0
chore: update doc
peri044 Oct 11, 2025
bf3a635
chore: update doc
peri044 Oct 11, 2025
521949b
chore: fix eqn format
peri044 Oct 11, 2025
d5063b3
chore: update scale_rewards function
peri044 Oct 12, 2025
c1ffc71
update logging, update docs, rename dapo_batch_multiplier, fix some f…
ashors1 Oct 12, 2025
cd48dc9
fix checkpointing config
ashors1 Oct 12, 2025
e920e09
chore: update docstring and add test cases for reward scaling
peri044 Oct 13, 2025
df553f1
Merge branch 'dapo' of https://github.com/peri044/RL into dapo
peri044 Oct 13, 2025
9d769e9
Merge branch 'main' into dapo
peri044 Oct 13, 2025
8d34524
chore: directly access reward_scaling_cfg
peri044 Oct 13, 2025
21e5493
Merge branch 'main' into dapo
peri044 Oct 13, 2025
6107c62
Merge branch 'main' into dapo
peri044 Oct 14, 2025
00fae2e
chore: rebase
peri044 Oct 15, 2025
49f4ec4
tighter bounds for test, minor doc improvement
ashors1 Oct 15, 2025
2573f9d
Update docs/guides/dapo.md
peri044 Oct 15, 2025
84dc622
Update docs/guides/dapo.md
peri044 Oct 15, 2025
b2c29ec
Update docs/guides/dapo.md
peri044 Oct 15, 2025
d1eb2c4
chore: address review comments and rename max_num_gen_batches and num…
peri044 Oct 15, 2025
e1a3ed3
chore: Add dapo guide to index.md
peri044 Oct 15, 2025
1229358
fix: Fix config issue
peri044 Oct 16, 2025
02e0546
Merge branch 'main' into dapo
peri044 Oct 16, 2025
a3f0c3a
Merge branch 'main' into dapo
peri044 Oct 16, 2025
f297185
add missing key
ashors1 Oct 16, 2025
c9dcdb0
fix unit test
ashors1 Oct 16, 2025
302dd31
fix config issue
ashors1 Oct 16, 2025
f25b48b
fix distillation unit test
ashors1 Oct 16, 2025
769f11f
fix grpo unit test
ashors1 Oct 16, 2025
9bb2a9c
fix default hf_config_overrides
ashors1 Oct 17, 2025
02a8e1e
fix remaining hf_overrides default
ashors1 Oct 17, 2025
76 changes: 75 additions & 1 deletion nemo_rl/algorithms/grpo.py
@@ -13,7 +13,7 @@
# limitations under the License.
import os
from pathlib import Path
from typing import Any, Optional, Tuple, TypedDict, TypeVar, cast
from typing import Any, NotRequired, Optional, Tuple, TypedDict, TypeVar, cast

import numpy as np
import ray
@@ -27,6 +27,7 @@
ClippedPGLossDataDict,
ClippedPGLossFn,
)
from nemo_rl.algorithms.reward_functions import RewardConfig, process_rewards
from nemo_rl.algorithms.utils import calculate_baseline_and_std_per_prompt
from nemo_rl.data import DataConfig
from nemo_rl.data.datasets import AllTaskProcessedDataset, rl_collate_fn
@@ -83,6 +84,8 @@ class GRPOConfig(TypedDict):
val_at_start: bool
max_val_samples: int
checkpoint_dir: str
use_dynamic_sampling: NotRequired[bool]
max_num_gen_batches: NotRequired[int]


class GRPOSaveState(TypedDict):
@@ -106,6 +109,7 @@ class GRPOLoggerConfig(LoggerConfig):
class MasterConfig(TypedDict):
policy: PolicyConfig
loss_fn: ClippedPGLossConfig
reward_fn: RewardConfig
env: dict[str, Any]
data: DataConfig
grpo: GRPOConfig
@@ -518,6 +522,7 @@ def grpo_train(
logger.log_metrics(val_metrics, step, prefix="validation")
logger.log_metrics(validation_timings, step, prefix="timing/validation")

num_gen_batches = 0
# Run grpo training (single-turn)
batch: BatchedDataDict[DatumSpec]
for batch in dataloader:
@@ -555,6 +560,7 @@
else:
policy_generation.prepare_for_generation()

num_gen_batches += 1
with timer.time("generation"):
# Use async rollouts if vLLM async engine is enabled
if _should_use_async_rollouts(master_config):
@@ -601,8 +607,76 @@
"use_leave_one_out_baseline"
],
)

# Dynamic sampling (used in the DAPO algorithm)
# This block implements dynamic sampling by keeping only prompt groups with non-zero reward std.
# If the retained samples are fewer than train_global_batch_size * num_generations_per_prompt, continue sampling until max_num_gen_batches is reached.
if master_config["grpo"]["use_dynamic_sampling"]:
std_chunks_per_prompt = std.split(
master_config["grpo"]["num_generations_per_prompt"]
)
keep_prompt_indices = []
selected_std_chunks = []
for chunk_idx, chunk in enumerate(std_chunks_per_prompt):
chunk_length = chunk.shape[0]
if torch.nonzero(chunk).shape[0] == chunk_length:
chunk_prompt_indices = [
chunk_idx * chunk_length + idx
for idx in range(chunk_length)
]
keep_prompt_indices.extend(chunk_prompt_indices)
selected_std_chunks.append(chunk)
std = torch.cat(selected_std_chunks)

generation_sample_buffer_size = len(keep_prompt_indices)
train_prompts_buffer_size = (
master_config["policy"]["train_global_batch_size"]
* master_config["grpo"]["num_generations_per_prompt"]
)

# If the number of retained samples is smaller than the required threshold (train_prompts_buffer_size), keep generating by processing the next batch
if generation_sample_buffer_size < train_prompts_buffer_size:
max_num_gen_batches = master_config["grpo"].get(
"max_num_gen_batches", 0
)
if (
max_num_gen_batches <= 0
or num_gen_batches <= max_num_gen_batches
):
continue
else:
raise ValueError(
f"Dynamic sampling has reached the maximum allowable number of batches ({max_num_gen_batches}). Consider evaluating the complexity of your data or adjusting the num_prompts_per_step or num_generations_per_prompt parameters to enhance the diversity of the samples."
)
else:
# Select the inputs that have non-zero std
repeated_batch = repeated_batch.select_indices(
keep_prompt_indices
)

# Gather the corresponding rewards
rewards = rewards[keep_prompt_indices]

# Gather the corresponding baselines (means)
baseline = baseline[keep_prompt_indices]

# Slice the batch, rewards, baselines and std to ensure batch size is train_prompts_buffer_size
repeated_batch = repeated_batch.slice(
0, train_prompts_buffer_size
)
rewards = rewards[:train_prompts_buffer_size]
baseline = baseline[:train_prompts_buffer_size]
std = std[:train_prompts_buffer_size]

# Process rewards with custom reward function
if master_config["reward_fn"]["enabled"]:
rewards = process_rewards(
repeated_batch, rewards, master_config["reward_fn"]
)

advantages = (rewards - baseline).unsqueeze(-1)

# Normalize rewards
if master_config["grpo"]["normalize_rewards"]:
# don't sharpen the ones with no variation
zero_std_mask = std > 0
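To illustrate the dynamic sampling step in the diff above, here is a minimal, self-contained sketch of the std-based group filtering. The helper name `filter_nonzero_std_groups` is hypothetical (not part of the PR); as in the diff, it assumes per-sample stds are laid out in contiguous groups of `num_generations_per_prompt` entries.

```python
import torch

def filter_nonzero_std_groups(
    std: torch.Tensor, num_generations_per_prompt: int
) -> list[int]:
    # Split per-sample stds into per-prompt groups and keep only the groups
    # in which every entry is non-zero, i.e. the prompt's rewards vary.
    keep_indices = []
    for chunk_idx, chunk in enumerate(std.split(num_generations_per_prompt)):
        if torch.count_nonzero(chunk) == chunk.shape[0]:
            start = chunk_idx * num_generations_per_prompt
            keep_indices.extend(range(start, start + num_generations_per_prompt))
    return keep_indices

# Two generations per prompt: prompt 0 has zero std (all of its generations
# got the same reward), so its samples are dropped; prompts 1 and 2 are kept.
std = torch.tensor([0.0, 0.0, 0.5, 0.5, 0.2, 0.2])
print(filter_nonzero_std_groups(std, 2))  # -> [2, 3, 4, 5]
```

The retained indices are then used to sub-select the batch, rewards, and baselines before slicing everything down to `train_prompts_buffer_size`, mirroring the `select_indices`/`slice` calls in the diff.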
79 changes: 79 additions & 0 deletions nemo_rl/algorithms/reward_functions.py
@@ -0,0 +1,79 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TypedDict, TypeVar

import torch

from nemo_rl.distributed.batched_data_dict import BatchedDataDict

Tensor = TypeVar("Tensor", bound=torch.Tensor)


class RewardConfig(TypedDict):
"""Configuration for reward function processing.

This configuration enables custom reward shaping, currently supporting DAPO-style
penalties for responses that exceed the maximum response length threshold.
"""

enabled: bool
overlong_buffer_length: int
overlong_buffer_penalty: float
max_response_length: int


def process_rewards(
batch: BatchedDataDict, rewards: torch.Tensor, cfg: RewardConfig
) -> torch.Tensor:
"""Process rewards by applying penalties for responses exceeding max_response_length. Currently, this function only supports DAPO reward shaping as described in the DAPO paper: https://arxiv.org/pdf/2503.14476.

Nonetheless, it can be potentially extended to support any custom reward logic.
"""
if not cfg["enabled"]:
return rewards

# DAPO reward shaping requires overlong_buffer_length, overlong_buffer_penalty, and max_response_length to be set.
if (
cfg["overlong_buffer_length"] is None
or cfg["overlong_buffer_penalty"] is None
or cfg["max_response_length"] is None
):
raise ValueError(
"Reward function is enabled but only DAPO reward shaping is currently supported. Please ensure overlong_buffer_length, overlong_buffer_penalty, and max_response_length are properly configured."
)

# Get the overlong_buffer_length, overlong_buffer_penalty and max_response_length
overlong_buffer_length = cfg["overlong_buffer_length"]
overlong_buffer_penalty = cfg["overlong_buffer_penalty"]
max_response_length = cfg["max_response_length"]

# Calculate the expected response length
expected_response_length = max_response_length - overlong_buffer_length

assert len(batch["message_log"]) == len(rewards), (
"The number of messages in the batch must match the number of rewards"
)

updated_rewards = torch.zeros_like(rewards)
for i, message_log in enumerate(batch["message_log"]):
# Get the assistant response length (index 1 is the assistant response)
message_response_length = message_log[1]["token_ids"].shape[0]
# Calculate the exceed length and the corresponding reward penalty
exceed_length = message_response_length - expected_response_length
overlong_reward = min(
-exceed_length / overlong_buffer_length * overlong_buffer_penalty, 0
)
updated_rewards[i] = rewards[i] + overlong_reward

return updated_rewards
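The per-response penalty computed in the loop above can be isolated into a small sketch (the function name `overlong_penalty` is illustrative, not from the PR): once a response enters the final `overlong_buffer_length` tokens before `max_response_length`, the penalty grows linearly from 0 down to `-overlong_buffer_penalty`.

```python
def overlong_penalty(
    response_length: int,
    max_response_length: int,
    overlong_buffer_length: int,
    overlong_buffer_penalty: float,
) -> float:
    # Responses shorter than (max_response_length - overlong_buffer_length)
    # are not penalized; inside the buffer the penalty scales linearly with
    # how far the response has exceeded the expected length.
    expected_length = max_response_length - overlong_buffer_length
    exceed_length = response_length - expected_length
    return min(-exceed_length / overlong_buffer_length * overlong_buffer_penalty, 0.0)

# max_response_length=100, overlong_buffer_length=20, penalty scale=1.0
print(overlong_penalty(70, 100, 20, 1.0))   # within expected length -> 0.0
print(overlong_penalty(90, 100, 20, 1.0))   # 10 tokens into the buffer -> -0.5
print(overlong_penalty(100, 100, 20, 1.0))  # at the length cap -> -1.0
```

This penalty is the `overlong_reward` term that `process_rewards` adds to each raw reward before returning `updated_rewards`.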