@@ -97,9 +97,9 @@ def compute_advantage(
9797 """Compute GDPO advantages.
9898
9999 Args:
100- prompt_ids: Unused; for interface consistency.
100+ prompt_ids: Tensor identifying which prompt each sample belongs to (for per-prompt baselines).
101101 rewards: Unused; for interface consistency.
102- repeated_batch: Batch containing _input_ids_for_baseline and reward1, reward2, ... keys.
102+ repeated_batch: Batch containing reward1, reward2, ... keys.
103103 mask: Response token mask of shape [batch_size, seq_len], 1 for valid response tokens, 0 for padding.
104104 **kwargs: Additional arguments (unused).
105105
@@ -113,20 +113,17 @@ def compute_advantage(
113113 f"This batch has {len(reward_component_keys)} component(s). "
114114 "Switch to GRPO by setting grpo.adv_estimator.name to 'grpo' in your config."
115115 )
116- current_input_ids = repeated_batch["_input_ids_for_baseline"]
117- valid = torch.ones_like(
118- repeated_batch[reward_component_keys[0]]
119- )
116+ valid = torch.ones_like(repeated_batch[reward_component_keys[0]])
120117 leave_one_out = self.use_leave_one_out_baseline
121- assert current_input_ids.shape[0] == valid.shape[0], (
122- "_input_ids_for_baseline must match reward batch size after dynamic_sampling; "
123- f"got {current_input_ids.shape[0]} vs {valid.shape[0]}"
118+ assert prompt_ids.shape[0] == valid.shape[0], (
119+ "prompt_ids must match reward batch size; "
120+ f"got {prompt_ids.shape[0]} vs {valid.shape[0]}"
124121 )
125122 advantage_parts = []
126123 for key in reward_component_keys:
127124 r = repeated_batch[key]
128125 base, std_k = calculate_baseline_and_std_per_prompt(
129- current_input_ids,
126+ prompt_ids,
130127 r,
131128 valid,
132129 leave_one_out_baseline=leave_one_out,
0 commit comments