add unit test for gdpo estimator and multi-reward env

yuki-97 · yuki-97 · commit 198c991ea4ff · 2026-03-09T07:27:28.000-07:00
Signed-off-by: Yuki Huang <yukih@nvidia.com>
diff --git a/tests/unit/algorithms/test_grpo.py b/tests/unit/algorithms/test_grpo.py
@@ -20,6 +20,7 @@
 from torchdata.stateful_dataloader import StatefulDataLoader
 
 from nemo_rl.algorithms.advantage_estimator import (
+    GDPOAdvantageEstimator,
     GRPOAdvantageEstimator,
     ReinforcePlusPlusAdvantageEstimator,
 )
@@ -1810,6 +1811,51 @@ def test_grpo_advantage_estimator_small_nonzero_std():
 # ============================================================================
 
 
+def test_gdpo_advantage_estimator_multiple_rewards():
+    """Test GDPOAdvantageEstimator on two same-prompt samples with three rewards."""
+    estimator_config = {
+        "use_leave_one_out_baseline": False,
+        "normalize_rewards": True,
+    }
+    loss_config = {}
+    estimator = GDPOAdvantageEstimator(estimator_config, loss_config)
+
+    prompt_ids = torch.tensor([[0], [0]])
+    mask = torch.ones(2, 3)
+    repeated_batch = {
+        "reward1": torch.tensor([1.0, 1.0]),
+        "reward2": torch.tensor([1.0, -1.0]),
+        "reward3": torch.tensor([1.0, 0.0]),
+    }
+
+    result = estimator.compute_advantage(prompt_ids, None, mask, repeated_batch)
+    assert result.shape == (2, 3)
+    # 0.7071 is rounded to 4 decimals; default rtol=1e-5 leaves almost no slack.
+    assert torch.allclose(result[:, 0], torch.tensor([0.7071, -0.7071]), atol=1e-4)
+
+
+def test_gdpo_advantage_estimator_single_reward():
+    """Test GDPOAdvantageEstimator raises ValueError with a single reward."""
+    estimator_config = {
+        "use_leave_one_out_baseline": False,
+        "normalize_rewards": True,
+    }
+    loss_config = {}
+    estimator = GDPOAdvantageEstimator(estimator_config, loss_config)
+
+    prompt_ids = torch.tensor([[0], [0]])
+    mask = torch.ones(2, 3)
+    repeated_batch = {"reward1": torch.tensor([1.0, 3.0])}
+
+    with pytest.raises(ValueError):
+        estimator.compute_advantage(prompt_ids, None, mask, repeated_batch)
+
+
+# ============================================================================
+# Tests for ReinforcePlusPlusAdvantageEstimator class
+# ============================================================================
+
+
 def test_reinforce_plus_plus_global_normalization():
     """Test that ReinforcePlusPlusAdvantageEstimator applies global normalization.
 
diff --git a/tests/unit/environments/test_math_environment.py b/tests/unit/environments/test_math_environment.py
@@ -11,29 +11,34 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 import time
 
 import pytest
 import ray
 
-from nemo_rl.distributed.ray_actor_environment_registry import (
-    get_actor_python_env,
-)
-from nemo_rl.environments.math_environment import MathEnvironment
+from nemo_rl.environments.utils import create_env
+
+# ============================================================================
+# Environment fixtures
+# ============================================================================
 
 
 @pytest.fixture(scope="module")
 def math_env():
     """Create a MathEnvironment actor for testing."""
-    env = MathEnvironment.options(
-        runtime_env={
-            "py_executable": get_actor_python_env(
-                "nemo_rl.environments.math_environment.MathEnvironment"
-            ),
-            "env_vars": dict(os.environ),
-        }
-    ).remote({"num_workers": 2})
+    env = create_env("math", {"num_workers": 2})
+    yield env
+    # Clean up the actor and wait for it to be killed
+    env.shutdown.remote()
+    ray.kill(env)
+    # Give some time for cleanup
+    time.sleep(0.1)
+
+
+@pytest.fixture(scope="module")
+def math_multi_reward_env():
+    """Create a MathMultiRewardEnvironment actor for testing."""
+    env = create_env("math_multi_reward", {"num_workers": 2})
     yield env
     # Clean up the actor and wait for it to be killed
     env.shutdown.remote()
@@ -45,15 +50,7 @@ def math_env():
 @pytest.fixture(scope="module")
 def multichoice_env(request):
     """Create a MathEnvironment actor for testing."""
-    verifier_type = request.param
-    env = MathEnvironment.options(
-        runtime_env={
-            "py_executable": get_actor_python_env(
-                "nemo_rl.environments.math_environment.MathEnvironment"
-            ),
-            "env_vars": dict(os.environ),
-        }
-    ).remote({"num_workers": 2, "verifier_type": verifier_type})
+    env = create_env("math", {"num_workers": 2, "verifier_type": request.param})
     yield env
     # Clean up the actor and wait for it to be killed
     env.shutdown.remote()
@@ -62,6 +59,11 @@ def multichoice_env(request):
     time.sleep(0.1)
 
 
+# ============================================================================
+# Data fixtures
+# ============================================================================
+
+
 @pytest.fixture
 def basic_test_data():
     """Common test data for basic math problems."""
@@ -88,6 +90,41 @@ def basic_test_data():
     }
 
 
+@pytest.fixture
+def multi_reward_test_data():
+    """Three math samples: fully correct, wrong non-integer answer, unclosed <think>."""
+    return {
+        "message_log_batch": [
+            [
+                {"role": "user", "content": "What is 2 + 2?"},
+                {
+                    "role": "assistant",
+                    "content": "<think>2 + 2 = 4</think>\n<answer>4</answer>",
+                },
+            ],
+            [
+                {"role": "user", "content": "What is 3 * 4?"},
+                {
+                    "role": "assistant",
+                    "content": "<think>3 * 4 = 12</think>\n<answer>12.5</answer>",
+                },
+            ],
+            [
+                {"role": "user", "content": "What is 10 - 5?"},
+                {
+                    "role": "assistant",
+                    "content": "<think>10 - 5 = 5\n<answer>5</answer>",
+                },
+            ],
+        ],
+        "metadata": [
+            {"ground_truth": "4"},
+            {"ground_truth": "12"},
+            {"ground_truth": "5"},
+        ],
+    }
+
+
 @pytest.fixture
 def multichoice_test_data(request):
     """Common test data for basic multichoice problems."""
@@ -170,6 +207,11 @@ def multiple_assistant_test_data():
     }
 
 
+# ============================================================================
+# Environment tests
+# ============================================================================
+
+
 def test_math_env_step_basic(math_env, basic_test_data):
     """Test basic functionality of MathEnvironment step with simple messages."""
     result = ray.get(
@@ -204,6 +246,56 @@ def test_math_env_step_basic(math_env, basic_test_data):
     assert all(result.terminateds == 1.0), "All terminated flags should be 1.0"
 
 
+def test_multi_reward_env_step_basic(math_multi_reward_env, multi_reward_test_data):
+    """Test basic step returns per-sample correctness, int, and format rewards."""
+    result = ray.get(
+        math_multi_reward_env.step.remote(
+            multi_reward_test_data["message_log_batch"],
+            multi_reward_test_data["metadata"],
+        )
+    )
+
+    # Check observations (based on correctness reward, index 0)
+    assert len(result.observations) == 3, (
+        "Should return observations for all 3 messages"
+    )
+    assert all(obs["role"] == "environment" for obs in result.observations), (
+        "All observations should be from environment"
+    )
+
+    # Check observations for each data point
+    assert result.observations[0]["content"] == "Environment: correct"
+    assert result.observations[1]["content"] == "Environment: incorrect"
+    assert result.observations[2]["content"] == "Environment: correct"
+
+    # Check metadata
+    assert len(result.metadata) == 3, "Should return metadata for all 3 messages"
+    assert result.metadata == multi_reward_test_data["metadata"], (
+        "Metadata should be unchanged"
+    )
+
+    # Check rewards: shape (batch_size=3, number_of_rewards=3)
+    assert result.rewards.shape == (3, 3), "Rewards should be a tensor of shape (3, 3)"
+
+    # Check rewards for each data point
+    # First sample: correctness reward 1.0, int reward 1.0, format reward 1.0
+    assert (result.rewards[0] == 1.0).all(), "First reward should be 1.0"
+    # Second sample: correctness reward 0.0, int reward 0.0, format reward 1.0
+    assert result.rewards[1][0] == 0.0
+    assert result.rewards[1][1] == 0.0
+    assert result.rewards[1][2] == 1.0
+    # Third sample: correctness reward 1.0, int reward 1.0, format reward 0.0
+    assert result.rewards[2][0] == 1.0
+    assert result.rewards[2][1] == 1.0
+    assert result.rewards[2][2] == 0.0
+
+    # Check terminated flags
+    assert result.terminateds.shape == (3,), (
+        "Terminated flags should be a tensor of shape (3,)"
+    )
+    assert all(result.terminateds == 1.0), "All terminated flags should be 1.0"
+
+
 @pytest.mark.parametrize(
     "multichoice_env, multichoice_test_data",
     [