pytorch · vmoens · Mar 5, 2026
diff --git a/sota-implementations/grpo/grpo_utils.py b/sota-implementations/grpo/grpo_utils.py
@@ -649,8 +649,8 @@ def make_env(cfg: DictConfig, single_env: bool = False):
     # Setup environment
     max_steps = cfg.env.max_steps if cfg.env.reasoning else 1
     if cfg.env.dataset == "gsm8k":
-        # Reward scale is 0.0 to 100
-        reward_threshold = 20
+        # Reward scale is 0.0 to 1.0
+        reward_threshold = 0.1
         env = GSM8KEnv(
             repeats=cfg.env.repeats,
             tokenizer=train_tokenizer,
@@ -659,9 +659,9 @@ def make_env(cfg: DictConfig, single_env: bool = False):
             device=torch.device("cpu"),
             ray_backend=True,
         )
-    elif cfg.env.dataset == "ifeval":  # ifeval
-        # Reward scale is 0.0 to 2.2
-        reward_threshold = 1.0
+    elif cfg.env.dataset == "ifeval":
+        # Reward scale is 0.0 to ~1.15
+        reward_threshold = 0.5
         env = IFEvalEnv(
             repeats=cfg.env.repeats,
             tokenizer=train_tokenizer,

diff --git a/test/llm/test_llm_envs.py b/test/llm/test_llm_envs.py
@@ -266,10 +266,92 @@ def test_gsm8kenv(self, ray_backend, device, ray_backend_fixture):
         r["history"].full = history_full
         s = env.step(r)
         assert s.device == device
-        assert s["next", "reward"] >= 10
+        assert s["next", "reward"] > 0
         assert s["next", "done"].all()
 
 
+class TestGSM8KRewardParser:
+    """Exercise GSM8KRewardParser helpers directly, with no model or dataset."""
+
+    def test_extract_tags(self):
+        """Both tag bodies come back from a well-formed completion."""
+        from torchrl.envs.llm.reward.gsm8k import GSM8KRewardParser
+        completion = "<think>some reasoning</think> <answer>42</answer>"
+        cot, final = GSM8KRewardParser.extract_tags(completion)
+        assert cot == "some reasoning"
+        assert final == "42"
+
+    def test_extract_tags_malformed(self):
+        """Angle brackets and '&' inside the think block leave the answer intact."""
+        from torchrl.envs.llm.reward.gsm8k import GSM8KRewardParser
+        _, final = GSM8KRewardParser.extract_tags(
+            "<think>reasoning with <special> chars & stuff</think> <answer>5</answer>"
+        )
+        assert final == "5"
+
+    def test_extract_tags_missing(self):
+        """Text without any tags yields empty strings for both fields."""
+        from torchrl.envs.llm.reward.gsm8k import GSM8KRewardParser
+        cot, final = GSM8KRewardParser.extract_tags("no tags here at all")
+        assert cot == ""
+        assert final == ""
+
+    def test_normalize_answer(self):
+        """Commas, '$', '%', padding and a zero fraction are stripped; 3.14 is kept."""
+        from torchrl.envs.llm.reward.gsm8k import GSM8KRewardParser
+        normalize = GSM8KRewardParser.normalize_answer
+        assert normalize("1,234") == "1234"
+        assert normalize("$120") == "120"
+        assert normalize("120.0") == "120"
+        assert normalize("120.00") == "120"
+        assert normalize(" 42 ") == "42"
+        assert normalize("3.14") == "3.14"
+        assert normalize("100%") == "100"
+
+    def test_correct_answer_reward(self):
+        """A matching answer is marked successful and earns a reward of 1.0."""
+        from torchrl.envs.llm.reward.gsm8k import GSM8KRewardParser
+        scorer = GSM8KRewardParser()
+        outcome = scorer._single_correctness_reward("42", "42", "some reasoning")
+        assert outcome["success"]
+        assert outcome["reward"] == 1.0
+
+    def test_wrong_answer_with_format(self):
+        """A wrong answer is not a success but still earns 0.1 overall."""
+        from torchrl.envs.llm.reward.gsm8k import GSM8KRewardParser
+        scorer = GSM8KRewardParser()
+        outcome = scorer._single_correctness_reward("42", "99", "some reasoning")
+        assert not outcome["success"]
+        assert outcome["reward"] == 0.1
+        assert outcome["reward_answer"] == 1.0
+
+    def test_no_answer(self):
+        """Empty answer and empty reasoning give no success and zero rewards."""
+        from torchrl.envs.llm.reward.gsm8k import GSM8KRewardParser
+        scorer = GSM8KRewardParser()
+        outcome = scorer._single_correctness_reward("42", "", "")
+        assert not outcome["success"]
+        assert outcome["reward"] == 0.0
+        assert outcome["reward_answer"] == 0.0
+
+    def test_normalized_match(self):
+        """'1234' and '1,234' are scored as the same answer."""
+        from torchrl.envs.llm.reward.gsm8k import GSM8KRewardParser
+        scorer = GSM8KRewardParser()
+        outcome = scorer._single_correctness_reward("1234", "1,234", "thinking")
+        assert outcome["success"]
+        assert outcome["reward"] == 1.0
+
+    def test_custom_reward_values(self):
+        """format_reward and correct_reward given to the constructor set the rewards."""
+        from torchrl.envs.llm.reward.gsm8k import GSM8KRewardParser
+        scorer = GSM8KRewardParser(format_reward=0.5, correct_reward=2.0)
+        hit = scorer._single_correctness_reward("42", "42", "cot")
+        assert hit["reward"] == 2.0
+        miss = scorer._single_correctness_reward("42", "99", "cot")
+        assert miss["reward"] == 0.5
+
+
 @pytest.mark.skipif(not _has_ifeval, reason="requires IFEval libs")
 class TestIFEvalEnv:
     def test_ifeval(self):

diff --git a/torchrl/envs/llm/datasets/gsm8k.py b/torchrl/envs/llm/datasets/gsm8k.py
@@ -259,7 +259,6 @@ class GSM8KEnv(DatasetChatEnv):
                             is_shared=False),
                         reward: Tensor(shape=torch.Size([1, 1, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                         reward_answer: Tensor(shape=torch.Size([1, 1, 1]), device=cpu, dtype=torch.float32, is_shared=False),
-                        reward_contained: Tensor(shape=torch.Size([1, 1, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                         reward_right: Tensor(shape=torch.Size([1, 1, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                         reward_think: Tensor(shape=torch.Size([1, 1, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                         step_count: Tensor(shape=torch.Size([1, 1]), device=cpu, dtype=torch.int64, is_shared=False),
@@ -293,7 +292,7 @@ class GSM8KEnv(DatasetChatEnv):
             device=None,
             is_shared=False,
             stack_dim=0)
-        >>> assert s["next", "reward"] >= 10
+        >>> assert s["next", "reward"] > 0
         >>> assert s["next", "done"].all()
 
     """