Revert reward-component key handling in async rollouts and mark gdpo.sh as a fast test

yuki-97 · yuki-97 · commit 539893e34cfc · 2026-03-04T08:45:26.000-08:00
Signed-off-by: Yuki Huang <yukih@nvidia.com>
diff --git a/nemo_rl/experience/rollouts.py b/nemo_rl/experience/rollouts.py
@@ -934,35 +934,25 @@ async def run_single_sample_with_error_handling(i, sample_state):
 
         # Reconstruct batch from sample results
         batch_size = len(final_sample_states)
-        final_batch_dict = {
-            "message_log": [state["message_log"] for state in final_sample_states],
-            "extra_env_info": [
-                state["extra_env_info"] for state in final_sample_states
-            ],
-            "task_name": [state["task_name"] for state in final_sample_states],
-            "total_reward": torch.stack(
-                [state["total_reward"] for state in final_sample_states]
-            ),
-            "idx": [
-                state.get("idx", i) for i, state in enumerate(final_sample_states)
-            ],
-            "truncated": torch.tensor(
-                [metrics["truncated"] for metrics in all_sample_metrics],
-                dtype=torch.bool,
-            ),
-        }
-
-        # Add any reward component keys (reward1, reward2, ...) from the first state
-        reward_keys = [
-            k for k in final_sample_states[0]
-            if k.startswith("reward") and k[6:].isdigit()
-        ]
-        reward_keys = sorted(reward_keys, key=lambda k: int(k[6:]))
-        for key in reward_keys:
-            final_batch_dict[key] = torch.stack(
-                [state[key] for state in final_sample_states]
-            )
-        final_batch = BatchedDataDict[DatumSpec](final_batch_dict)
+        final_batch = BatchedDataDict[DatumSpec](
+            {
+                "message_log": [state["message_log"] for state in final_sample_states],
+                "extra_env_info": [
+                    state["extra_env_info"] for state in final_sample_states
+                ],
+                "task_name": [state["task_name"] for state in final_sample_states],
+                "total_reward": torch.stack(
+                    [state["total_reward"] for state in final_sample_states]
+                ),
+                "idx": [
+                    state.get("idx", i) for i, state in enumerate(final_sample_states)
+                ],
+                "truncated": torch.tensor(
+                    [metrics["truncated"] for metrics in all_sample_metrics],
+                    dtype=torch.bool,
+                ),
+            }
+        )
 
         # Preserve additional fields from the original input_batch
         for key in input_batch.keys():
@@ -1237,42 +1227,28 @@ def run_async_nemo_gym_rollout(
     )
     input_ids = batched_flat["token_ids"]
 
-    final_batch_dict = {
-        "agent_ref": [r["agent_ref"] for r in results],
-        "message_log": [r["message_log"] for r in results],
-        # length is used downstream for mean_prompt_length
-        "length": torch.tensor(
-            [len(r["input_message_log"][0]["token_ids"]) for r in results]
-        ),
-        "loss_multiplier": input_batch["loss_multiplier"],
-        # Unnecessary parts of the DatumSpec unused by the GRPO algorithm
-        # extra_env_info: dict[str, Any]
-        # idx: int
-        # task_name: NotRequired[str]
-        # stop_strings: NotRequired[list[str]]  # Optional stop strings for generation
-        # Extra information not in the DatumSpec used by the GRPO algorithm
-        "total_reward": torch.tensor([r["full_result"]["reward"] for r in results]),
-        # Add truncated field to match other rollout paths (reusing hit_max_tokens logic)
-        "truncated": torch.tensor(
-            [m["hit_max_tokens"] for m in all_sample_metrics], dtype=torch.bool
-        ),
-    }
-
-    # Add any reward component keys (reward1, reward2, ...) from full_result
-    if results:
-        full_result = results[0].get("full_result", {})
-        reward_keys = sorted(
-            [
-                k for k in full_result
-                if isinstance(k, str) and k.startswith("reward") and k[6:].isdigit()
-            ],
-            key=lambda k: int(k[6:]),
-        )
-        for key in reward_keys:
-            final_batch_dict[key] = torch.tensor(
-                [r["full_result"][key] for r in results]
-            )
-    final_batch = BatchedDataDict[DatumSpec](final_batch_dict)
+    final_batch = BatchedDataDict[DatumSpec](
+        {
+            "agent_ref": [r["agent_ref"] for r in results],
+            "message_log": [r["message_log"] for r in results],
+            # length is used downstream for mean_prompt_length
+            "length": torch.tensor(
+                [len(r["input_message_log"][0]["token_ids"]) for r in results]
+            ),
+            "loss_multiplier": input_batch["loss_multiplier"],
+            # Unnecessary parts of the DatumSpec unused by the GRPO algorithm
+            # extra_env_info: dict[str, Any]
+            # idx: int
+            # task_name: NotRequired[str]
+            # stop_strings: NotRequired[list[str]]  # Optional stop strings for generation
+            # Extra information not in the DatumSpec used by the GRPO algorithm
+            "total_reward": torch.tensor([r["full_result"]["reward"] for r in results]),
+            # Add truncated field to match other rollout paths (reusing hit_max_tokens logic)
+            "truncated": torch.tensor(
+                [m["hit_max_tokens"] for m in all_sample_metrics], dtype=torch.bool
+            ),
+        }
+    )
 
     return AsyncNemoGymRolloutResult(
         input_ids=input_ids,
diff --git a/tests/functional/L1_Functional_Tests_GPU.sh b/tests/functional/L1_Functional_Tests_GPU.sh
@@ -45,7 +45,7 @@ run_test      uv run --no-sync bash ./tests/functional/dpo_automodel_lora.sh
 run_test      uv run --no-sync bash ./tests/functional/dpo_megatron.sh
 run_test      uv run --no-sync bash ./tests/functional/eval.sh
 run_test      uv run --no-sync bash ./tests/functional/eval_async.sh
-run_test      uv run --no-sync bash ./tests/functional/gdpo.sh
+run_test fast uv run --no-sync bash ./tests/functional/gdpo.sh
 run_test fast uv run --no-sync bash ./tests/functional/grpo.sh
 run_test fast uv run --no-sync bash ./tests/functional/grpo_async_gym.sh
 run_test      uv run --no-sync bash ./tests/functional/grpo_automodel_lora.sh