
Commit 024d173

feat: implement cons@k evaluation (#640)
Signed-off-by: ruit <ruit@nvidia.com>
Signed-off-by: Rui Tian <ruit@cw-dfw-cs-001-login-02.cm.cluster>
1 parent 3395bd8 commit 024d173

File tree: 12 files changed, +399 -57 lines changed


docs/guides/eval.md

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ When you complete the evaluation, you will receive a summary similar to the following
 model_name='Qwen2.5-Math-1.5B-Instruct' dataset_name='aime2024'
 max_new_tokens=2048 temperature=0.0 top_p=1.0 top_k=-1

-metric='pass@k' pass_k_value=1 num_tests_per_prompt=1
+metric=pass@1 num_tests_per_prompt=1

 score=0.1000 (3.0/30)
 ============================================================
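
Note: with num_tests_per_prompt=1, the pass@1 number in this summary is simply the average per-prompt correctness. A minimal sketch of that arithmetic (illustrative only; the variable names are not from the repository):

# Illustrative only: how a summary line like "score=0.1000 (3.0/30)" arises.
# per_prompt_scores is a hypothetical list of 0/1 correctness values,
# one entry per prompt, when num_tests_per_prompt=1.
per_prompt_scores = [1.0, 1.0, 1.0] + [0.0] * 27  # 3 correct out of 30 prompts

score = sum(per_prompt_scores) / len(per_prompt_scores)
print(f"score={score:.4f} ({sum(per_prompt_scores)}/{len(per_prompt_scores)})")
# -> score=0.1000 (3.0/30)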

examples/configs/evals/eval.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 # Evaluation Configuration
 eval:
-  metric: "pass@k"
+  metric: "pass@k" # pass@k and cons@k are supported
   num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score
   seed: 42
-  pass_k_value: 1
+  k_value: 1
   save_path: null # Path to save evaluation results and configuration of the evaluation. Set to null to disable saving. Example: "results/eval_output" or "/path/to/evaluation_results"

 generation:
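
Note: for a cons@k run, the renamed k_value key pairs with the metric name and a larger num_tests_per_prompt so that several answers are sampled per prompt and majority-voted. A hypothetical sketch of such a config (only the key names come from this commit; the concrete values are illustrative):

# Hypothetical cons@k configuration sketch; values are illustrative.
eval:
  metric: "cons@k"          # vote over the sampled answers per prompt
  num_tests_per_prompt: 8   # sample several responses per prompt
  seed: 42
  k_value: 8                # the k used by cons@k
  save_path: null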

nemo_rl/environments/code_environment.py

Lines changed: 7 additions & 0 deletions

@@ -206,6 +206,7 @@ def step(
         self,
         message_log_batch: List[LLMMessageLogType],
         metadata_batch: List[CodeEnvMetadata],
+        return_extracted_answer: bool = False,
     ) -> EnvironmentReturn:
         """Process a batch of code execution steps."""
         message_batch = [ml[-1]["content"] for ml in message_log_batch]
@@ -240,12 +241,18 @@ def step(

         next_stop_strings = [["</code>"]] * len(message_log_batch)

+        assert return_extracted_answer == False, (
+            "return_extracted_answer is not supported in CodeEnvironment. Please set it to False."
+        )
+        extracted_answers = None
+
         return EnvironmentReturn(
             observations=observations,
             metadata=new_metadata_batch,
             next_stop_strings=next_stop_strings,
             rewards=rewards_tensor,
             terminateds=terminated_tensor,
+            answers=extracted_answers,
         )

     def shutdown(self):
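
Note: only verifier-backed environments can hand back an extracted answer; execution-style environments like this one (and the retriever further down) guard against the flag instead and always report answers=None. A minimal sketch of that guard pattern, with illustrative names not taken from the repository:

# Illustrative sketch of the guard used by environments that cannot extract
# a final answer: reject return_extracted_answer=True and report answers=None.
def step_without_answer_support(rewards, terminateds, observations, metadata,
                                return_extracted_answer: bool = False):
    assert not return_extracted_answer, (
        "answer extraction is only meaningful for verifier-backed environments"
    )
    return {
        "observations": observations,
        "metadata": metadata,
        "rewards": rewards,
        "terminateds": terminateds,
        "answers": None,  # cons@k cannot be computed from this environment
    }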

nemo_rl/environments/games/sliding_puzzle.py

Lines changed: 9 additions & 2 deletions

@@ -272,6 +272,7 @@ def process_turn(
         bool,
         Optional[list[str]],
         Optional[SlidingPuzzleMetadata],
+        Optional[list[str]],
     ]:
         """Processes a single turn for the sliding puzzle task."""
         game_state = metadata["game_state"]
@@ -297,6 +298,7 @@ def process_turn(
                 is_terminated,
                 None,
                 next_metadata,
+                None,
             )

         # Get last assistant message and parse action
@@ -328,13 +330,15 @@ def process_turn(

         if is_terminated:
             next_metadata = None  # Clear metadata on termination
-
+        # answers save the extracted answer, only assigned in the verify function
+        next_answers = None
         return (
             {"role": "environment", "content": next_observation_content + "\n"},
             turn_reward,
             is_terminated,
             next_stop_strings,
             next_metadata,
+            next_answers,
         )


@@ -365,13 +369,15 @@ def step(
         terminateds = []
         all_stop_strings = []
         all_next_metadata = []
+        all_answers = []

-        for obs, rew, term, stops, meta in results:
+        for obs, rew, term, stops, meta, answ in results:
             observations.append(obs)
             rewards.append(rew)
             terminateds.append(term)
             all_stop_strings.append(stops)
             all_next_metadata.append(meta)
+            all_answers.append(answ)

         rewards_tensor = torch.tensor(rewards, dtype=torch.float32)
         terminated_tensor = torch.tensor(terminateds, dtype=torch.bool)
@@ -382,6 +388,7 @@ def step(
             next_stop_strings=all_stop_strings,
             rewards=rewards_tensor,
             terminateds=terminated_tensor,
+            answers=all_answers,
         )

     def shutdown(self):

nemo_rl/environments/interfaces.py

Lines changed: 2 additions & 0 deletions

@@ -38,13 +38,15 @@ class EnvironmentReturn(NamedTuple, Generic[MetadataT]):
             similar. This field lets you control this per turn.
         rewards: the rewards for this turn.
         terminateds: whether the episode ended this turn.
+        answers: the answers for this turn.
     """

     observations: list[dict[str, str]]
     metadata: list[MetadataT]
     next_stop_strings: list[list[str] | None] | list[None]
     rewards: Tensor
     terminateds: Tensor
+    answers: list[str | None] | None


 class EnvironmentInterface(abc.ABC, Generic[MetadataT]):
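
Note: taken together, the tuple now carries an optional per-sample answer list next to the rewards. A minimal construction sketch assuming only the fields shown above (the concrete values are made up):

# Minimal sketch of building the extended EnvironmentReturn; values are made up.
import torch

from nemo_rl.environments.interfaces import EnvironmentReturn

ret = EnvironmentReturn(
    observations=[{"role": "environment", "content": "Environment: correct"}],
    metadata=[{"ground_truth": "42", "extracted_answer": "42"}],
    next_stop_strings=[None],
    rewards=torch.tensor([1.0]),
    terminateds=torch.tensor([True]),
    answers=["42"],  # None for environments that do not extract answers
)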

nemo_rl/environments/math_environment.py

Lines changed: 96 additions & 30 deletions

@@ -15,10 +15,11 @@
 import io
 import logging
 import re
-from typing import Any, Optional, TypedDict
+from typing import Any, Optional, TypedDict, Union

 import ray
 import torch
+from math_verify import grader
 from math_verify.errors import TimeoutException
 from math_verify.metric import math_metric
 from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig
@@ -69,53 +70,84 @@ def __init__(self) -> None:
         )

     def verify(
-        self, pred_responses: list[str], ground_truths: list[str]
-    ) -> list[float]:
+        self,
+        pred_responses: list[str],
+        ground_truths: list[str],
+        return_extracted_answer: bool = False,
+    ) -> Union[list[float], tuple[list[float], list[str | None]]]:
         """Verify the correctness of the predicted responses against the ground truth.

         Args:
             pred_responses: list[str]. The predicted responses from the LLM.
             ground_truths: list[str]. The ground truth responses.

         Returns:
-            list[float]. The rewards for each predicted response.
+            Union[list[float], tuple[list[float], list[str | None]]].
+            If return_extracted_answer is False, returns only the scores.
+            If return_extracted_answer is True, returns (scores, extracted_answers).
         """
         results = []
+        extracted_answers: list[str | None] = []
+
         for response, ground_truth in zip(pred_responses, ground_truths):
             try:
                 ground_truth_parsable = "\\boxed{" + ground_truth + "}"
                 with _mute_output():
-                    try:
-                        ret_score, _ = self.verify_func(
-                            [ground_truth_parsable], [response]
-                        )
-                    # It's possible to emit a TimeoutException and that wouldn't be caught since
-                    # it actually subclasses from BaseException and math-verify itself does not
-                    # to catch it.
-                    except (Exception, TimeoutException):
-                        ret_score = 0.0
+                    ret_score, extracted_answer = self.verify_func(
+                        [ground_truth_parsable], [response]
+                    )

                 results.append(float(ret_score))
-            except Exception:
+
+                if return_extracted_answer:
+                    # Make sure the extracted answer is not None and is a list of two elements
+                    assert extracted_answer is not None
+                    assert len(extracted_answer) == 2
+                    extracted_gold, extracted_prediction = extracted_answer
+                    # Get the extracted answer with the same logic as in the HFVerifyWorker
+                    for pred in extracted_prediction:
+                        if any(grader.verify(gold, pred) for gold in extracted_gold):
+                            extracted_answers.append(pred)
+                            break
+                    else:
+                        # If no match is found, means all answers are incorrect, just use the first prediction
+                        extracted_answers.append(extracted_prediction[0][0])
+
+            # It's possible to emit a TimeoutException and that wouldn't be caught since
+            # it actually subclasses from BaseException and math-verify itself does not
+            # to catch it.
+            except (Exception, TimeoutException):
                 results.append(0.0)
-        return results
+                extracted_answers.append(None)
+
+        if return_extracted_answer:
+            return results, extracted_answers
+        else:
+            return results


 @ray.remote  # pragma: no cover
 class MultilingualMultichoiceVerifyWorker:
     def verify(
-        self, pred_responses: list[str], ground_truths: list[str]
-    ) -> list[float]:
+        self,
+        pred_responses: list[str],
+        ground_truths: list[str],
+        return_extracted_answer: bool = False,
+    ) -> Union[list[float], tuple[list[float], list[str | None]]]:
         """Verify the correctness of the predicted responses against the ground truth.

         Args:
             pred_responses: list[str]. The predicted responses from the LLM.
             ground_truths: list[str]. The ground truth responses.

         Returns:
-            list[float]. The rewards for each predicted response.
+            Union[list[float], tuple[list[float], list[str | None]]].
+            If return_extracted_answer is False, returns only the scores.
+            If return_extracted_answer is True, returns (scores, extracted_answers).
         """
         results = []
+        extracted_answers: list[str | None] = []
+
         for response, ground_truth in zip(pred_responses, ground_truths):
             response = answer_parsing.normalize_response(response)
             extracted_answer = None
@@ -131,24 +163,36 @@ def verify(
                     break
             score = 1.0 if extracted_answer == ground_truth else 0.0
             results.append(score)
-        return results
+            extracted_answers.append(extracted_answer)
+
+        if return_extracted_answer:
+            return results, extracted_answers
+        else:
+            return results


 @ray.remote  # pragma: no cover
 class EnglishMultichoiceVerifyWorker:
     def verify(
-        self, pred_responses: list[str], ground_truths: list[str]
-    ) -> list[float]:
+        self,
+        pred_responses: list[str],
+        ground_truths: list[str],
+        return_extracted_answer: bool = False,
+    ) -> Union[list[float], tuple[list[float], list[str | None]]]:
         """Verify the correctness of the predicted responses against the ground truth.

         Args:
             pred_responses: list[str]. The predicted responses from the LLM.
             ground_truths: list[str]. The ground truth responses.

         Returns:
-            list[float]. The rewards for each predicted response.
+            Union[list[float], tuple[list[float], list[str | None]]].
+            If return_extracted_answer is False, returns only the scores.
+            If return_extracted_answer is True, returns (scores, extracted_answers).
         """
         results = []
+        extracted_answers: list[str | None] = []
+
         for response, ground_truth in zip(pred_responses, ground_truths):
             ground_truth = answer_parsing.normalize_response(ground_truth)
             response = answer_parsing.normalize_response(response)
@@ -160,11 +204,18 @@ def verify(
             )
             score = 1.0 if extracted_answer == ground_truth else 0.0
             results.append(score)
-        return results
+            if return_extracted_answer:
+                extracted_answers.append(extracted_answer)
+
+        if return_extracted_answer:
+            return results, extracted_answers
+        else:
+            return results


 class MathEnvironmentMetadata(TypedDict):
     ground_truth: str
+    extracted_answer: str | None


 @ray.remote(max_restarts=-1, max_task_retries=-1)  # pragma: no cover
@@ -198,12 +249,13 @@ def step(
         self,
         message_log_batch: list[LLMMessageLogType],
         metadata: list[MathEnvironmentMetadata],
+        return_extracted_answer: bool = False,
     ) -> EnvironmentReturn[MathEnvironmentMetadata]:
         """Runs a step in the math environment.

         Args:
             message_log: list[list[dict[str, str]]]. A batch of OpenAI-API-like message logs that represent interactions with the LLM.
-            metadata: list[MathEnvironmentMetadata]. The grader will use the 'ground_truth' key to evaluate correctness.
+            metadata: list[MathEnvironmentMetadata]. The grader will use the 'ground_truth' key to evaluate correctness. The extracted answer will be stored to calculate cons@k.

         Returns:
             EnvironmentReturn: A tuple containing:
@@ -231,18 +283,32 @@
         )
         chunked_ground_truths = chunk_list_to_workers(ground_truths, self.num_workers)

-        # # Process each chunk in parallel
+        # Process each chunk in parallel
         futures = [
-            self.workers[i].verify.remote(chunk, ground_truth_chunk)
+            self.workers[i].verify.remote(
+                chunk, ground_truth_chunk, return_extracted_answer
+            )
             for i, (chunk, ground_truth_chunk) in enumerate(
                 zip(chunked_assistant_response_batch, chunked_ground_truths)
             )
         ]

-        results = ray.get(futures)
+        worker_results = ray.get(futures)
+
+        # Flatten the results and extract both scores and answers
+        results = []
+        extracted_answers: list[str | None] | None = (
+            [] if return_extracted_answer else None
+        )
+
+        for worker_result in worker_results:
+            if return_extracted_answer:
+                worker_scores, worker_answers = worker_result
+                results.extend(worker_scores)
+                extracted_answers.extend(worker_answers)
+            else:
+                results.extend(worker_result)

-        # flatten the results
-        results = [item for sublist in results for item in sublist]
         observations = [
             {
                 "role": "environment",
@@ -256,7 +322,6 @@
         # create a tensor of rewards and done flags
         rewards = torch.tensor(results).cpu()
         done = torch.ones_like(rewards).cpu()
-
         next_stop_strings = [None] * len(message_log_batch)

         return EnvironmentReturn(
@@ -265,6 +330,7 @@
             next_stop_strings=next_stop_strings,
             rewards=rewards,
             terminateds=done,
+            answers=extracted_answers,
         )

     def global_post_process_and_metrics(
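
Note: these per-sample extracted answers are what cons@k votes over; the consensus computation itself lives in the evaluation code elsewhere in this commit and is not shown in this excerpt. A minimal majority-vote sketch of the idea, with illustrative names:

# Illustrative cons@k sketch (not the code added by this commit): take the
# k extracted answers sampled for one prompt, majority-vote on them, and
# score the prompt by whether the consensus answer is correct.
from collections import Counter


def cons_at_k(extracted_answers: list[str | None],
              answer_is_correct: list[float]) -> float:
    """extracted_answers[i] and answer_is_correct[i] describe the i-th sample."""
    votes = Counter(a for a in extracted_answers if a is not None)
    if not votes:
        return 0.0
    consensus, _ = votes.most_common(1)[0]
    # Score the consensus by the correctness of any sample that produced it.
    for ans, correct in zip(extracted_answers, answer_is_correct):
        if ans == consensus:
            return correct
    return 0.0


# Example: 4 samples per prompt, "42" extracted twice, "41" and "40" once each.
print(cons_at_k(["42", "41", "42", "40"], [1.0, 0.0, 1.0, 0.0]))  # -> 1.0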

nemo_rl/environments/tools/retriever.py

Lines changed: 7 additions & 0 deletions

@@ -162,6 +162,7 @@ def step(
         self,
         message_log_batch: List[LLMMessageLogType],
         metadata_batch: List[Dict[str, Any]],
+        return_extracted_answer: bool = False,
     ) -> EnvironmentReturn:
         """Process a batch of retrieval steps."""
         # Extract queries from the last message in each log
@@ -186,12 +187,18 @@ def step(
         terminated_tensor = torch.ones(batch_size, dtype=torch.bool)
         next_stop_strings = [["</retrieve>"]] * batch_size

+        assert return_extracted_answer == False, (
+            "return_extracted_answer is not supported in RAGEnvironment. Please set it to False."
+        )
+        extracted_answers = None
+
         return EnvironmentReturn(
             observations=results,
             metadata=metadata_batch,
             next_stop_strings=next_stop_strings,
             rewards=rewards_tensor,
             terminateds=terminated_tensor,
+            answers=extracted_answers,
         )

     def shutdown(self):
