Commit 04d495a

Add RewardBench 2 dataset and initial evaluation metrics implementation.
Update refs logic, simplify the metrics code for the Ties case, and merge ties into the preference scoring.
1 parent a39e60c commit 04d495a

File tree

13 files changed: +810 lines, -0 lines
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


IS_BENCHMARK_GROUP = True
METRICS_TYPE = "reward-bench-2"

SCORE_MODULE = "nemo_skills.dataset.reward-bench-2.score"

BENCHMARKS = {
    "reward-bench-2.preference": {
        "GENERATION_ARGS": "++prompt_config=judge/reward-bench/reward-bench-2.preference ++generation_key=generation ++eval_type=rewardbench",
    },
    # Ties-only split, already included in the .ratings evaluation
    # "reward-bench-2.ties": {
    #     "GENERATION_ARGS": "++prompt_config=judge/reward-bench/reward-bench-2.ties ++generation_key=judgement",
    # },
    "reward-bench-2.ratings": {
        "GENERATION_ARGS": "++prompt_config=judge/reward-bench/reward-bench-2.ratings ++generation_key=judgement",
    },
}
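The dictionary above only declares the group; score aggregation is delegated to the module named in SCORE_MODULE. A minimal sketch of how such a group config could be consumed, assuming the harness has already collected each sub-benchmark's metrics into a dict keyed by benchmark name (the helper below is hypothetical, not the nemo_skills pipeline code):

import importlib


def aggregate_group(group_module, per_benchmark_metrics: dict) -> dict:
    # Hypothetical helper: resolve the group's SCORE_MODULE and delegate to its compute_score().
    # per_benchmark_metrics maps names like "reward-bench-2.ratings" to their metric dicts.
    score_module = importlib.import_module(group_module.SCORE_MODULE)
    return score_module.compute_score(per_benchmark_metrics)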
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)

DATASET_GROUP = "multichoice"
METRICS_TYPE = "reward-bench-2.preference"
# GENERATION_ARGS = "++prompt_config=judge/reward-bench/reward-bench-2.preference"
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path

from datasets import load_dataset, concatenate_datasets
import numpy as np


if __name__ == "__main__":
    dataset = load_dataset("allenai/reward-bench-2", split="test")

    # select some samples from Ties
    # dataset = dataset.filter(lambda x: x["subset"] == "Ties")

    # select some samples from Ties and non-Ties subsets
    # dataset = concatenate_datasets([dataset.filter(lambda x: x["subset"] == "Ties").select(range(30)),
    #                                 dataset.filter(lambda x: x["subset"] != "Ties").select(range(10))])
    print(f"Prepared dataset with {len(dataset)} samples")

    # dump the data as test.jsonl; note that the shuffling logic is not ideal, but it matches the one in reward-bench-2
    np.random.seed(42)
    output_path = Path(__file__).parent / "preference" / "test.jsonl"
    ties_path = Path(__file__).parent / "ties" / "test.jsonl"
    ratings_path = Path(__file__).parent / "ratings" / "test.jsonl"
    with open(output_path, "w") as test, open(ties_path, "w") as ties, open(ratings_path, "w") as ratings:
        for sample in dataset:
            for answer in sample["chosen"]:
                prepared = {
                    "id": sample["id"],
                    "subset": sample["subset"],
                    "question": sample["prompt"],
                    "answer": answer,
                    "chosen": 1,
                    "num_correct": sample["num_correct"],
                    "num_incorrect": sample["num_incorrect"],
                }
                ratings.write(json.dumps(prepared) + "\n")
                if sample["subset"] == "Ties":
                    ties.write(json.dumps(prepared) + "\n")

            for answer in sample["rejected"]:
                prepared = {
                    "id": sample["id"],
                    "subset": sample["subset"],
                    "question": sample["prompt"],
                    "answer": answer,
                    "chosen": 0,
                    "num_correct": sample["num_correct"],
                    "num_incorrect": sample["num_incorrect"],
                }
                ratings.write(json.dumps(prepared) + "\n")
                if sample["subset"] == "Ties":
                    ties.write(json.dumps(prepared) + "\n")

            if sample["subset"] != "Ties":
                assert len(sample["chosen"]) == 1
                answer_a = sample["chosen"][0]
                answer_b, answer_c, answer_d = sample["rejected"][:3]

                # shuffle the answer positions; this uses the same logic as run_generative_v2.py
                chosen, shuffle_option = "[[A]]", np.random.randint(0, 4)
                # if shuffle_option == 1: answer_a, answer_b, chosen = answer_b, answer_a, "[[B]]"
                # elif shuffle_option == 2: answer_a, answer_c, chosen = answer_c, answer_a, "[[C]]"
                # elif shuffle_option == 3: answer_a, answer_d, chosen = answer_d, answer_a, "[[D]]"

                prepared = {
                    "id": sample["id"],
                    "subset": sample["subset"],
                    "question": sample["prompt"],
                    "answer_a": answer_a,
                    "answer_b": answer_b,
                    "answer_c": answer_c,
                    "answer_d": answer_d,
                    "expected_answer": chosen,
                }

                test.write(json.dumps(prepared) + "\n")
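Note that the three shuffle branches above are committed commented out, so shuffle_option is drawn but never used and expected_answer is always "[[A]]" in the preference split. For reference, a standalone sketch of what the enabled 4-way shuffle would look like; the helper name is illustrative and the swap logic simply mirrors the commented lines:

import numpy as np


def shuffle_choices(answer_a, answer_b, answer_c, answer_d):
    # Move the chosen answer (initially in position A) to a random position
    # and carry the expected label along with it.
    chosen, shuffle_option = "[[A]]", np.random.randint(0, 4)
    if shuffle_option == 1:
        answer_a, answer_b, chosen = answer_b, answer_a, "[[B]]"
    elif shuffle_option == 2:
        answer_a, answer_c, chosen = answer_c, answer_a, "[[C]]"
    elif shuffle_option == 3:
        answer_a, answer_d, chosen = answer_d, answer_a, "[[D]]"
    return answer_a, answer_b, answer_c, answer_d, chosen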
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)

DATASET_GROUP = "multichoice"
METRICS_TYPE = "reward-bench-2.ratings"
# GENERATION_ARGS = "++prompt_config=judge/reward-bench/reward-bench-2-ties"
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Scoring based on: https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview


# Overall Ties score formula
#   ref_accuracy = mean of `accurate` over all reference prompts.
#   tied_accuracy = mean of `accurate` over all tied prompts.
#   For prompts present in both ref and tied:
#       diff_corr_margin = different_correct_margin on tied
#       corr_incorrect_ties = correct_incorrect_margin on tied
#       corr_incorrect_ref = correct_incorrect_margin on ref
#
# Then compute:
#   correctness_preferred = mean(corr_incorrect_ties > diff_corr_margin)
#   correctness_preferred_hard = mean(min(corr_incorrect_ref, corr_incorrect_ties) > diff_corr_margin)
#   correctness_margin_score = mean(tanh(min(corr_incorrect_ref, corr_incorrect_ties) / diff_corr_margin - 1)),
#       with NaNs treated as 0 when diff_corr_margin is 0.
# Final weighted overall score:
#   overall = 0.30*tied_accuracy + 0.30*ref_accuracy + 0.20*correctness_preferred +
#             0.20*correctness_preferred_hard + 0.01*correctness_margin_score*tied_accuracy
#
# Only prompt IDs that appear in both ref and tied contribute to the margin-based terms.


def compute_score(metrics: dict):
    """Compute overall RewardBench v2 score from individual benchmark metrics."""
    print(metrics)

    overall_score = 0.0
    return {
        "overall_score": overall_score,
    }

    mmlu_pro = metrics["mmlu-pro"]["pass@1"]["symbolic_correct"]
    hle = metrics["hle"]["pass@1"]["judge_correct"]
    gpqa = metrics["gpqa"]["pass@1"]["symbolic_correct"]

    aime25 = metrics["aime24"]["pass@1[avg-of-10]"]["symbolic_correct"]

    scicode = metrics["scicode"]["pass@1[avg-of-3]"]["subtask_accuracy"]
    livecodebench = metrics["livecodebench"]["pass@1[avg-of-3]"]["accuracy"]

    ifbench = metrics["ifbench"]["pass@1[avg-of-5]"]["average_score"]

    aalcr = metrics["aalcr"]["pass@1[avg-of-3]"]["judge_correct"]

    math_score = aime25
    code_score = (scicode + livecodebench) / 2

    overall_score = (mmlu_pro + hle + gpqa + aime25 + scicode + livecodebench + ifbench + aalcr) / 8
    return {
        "overall_score": overall_score,
        "math_score": math_score,
        "code_score": code_score,
        "mmlu_pro": mmlu_pro,
        "hle": hle,
        "gpqa": gpqa,
        "aime25": aime25,
        "scicode": scicode,
        "livecodebench": livecodebench,
        "ifbench": ifbench,
        "aalcr": aalcr,
    }
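As committed, compute_score returns 0.0 from the early return, the code below it is unreachable and appears to be leftover from an intelligence-index score module, and the Ties formula documented in the header comment is not implemented yet. A minimal sketch of that formula, assuming per-prompt records keyed by prompt id with accurate, correct_incorrect_margin and (for tied prompts) different_correct_margin fields; the function and field layout are assumptions, not part of this commit:

import numpy as np


def ties_overall_score(ref: dict, tied: dict) -> float:
    # ref / tied: prompt_id -> {"accurate": ..., "correct_incorrect_margin": ...,
    #                           "different_correct_margin": ... (tied only)}
    ref_accuracy = float(np.mean([v["accurate"] for v in ref.values()]))
    tied_accuracy = float(np.mean([v["accurate"] for v in tied.values()]))

    # only prompt IDs present in both splits contribute to the margin-based terms
    common = sorted(set(ref) & set(tied))
    diff_corr = np.array([tied[i]["different_correct_margin"] for i in common], dtype=float)
    ci_ties = np.array([tied[i]["correct_incorrect_margin"] for i in common], dtype=float)
    ci_ref = np.array([ref[i]["correct_incorrect_margin"] for i in common], dtype=float)

    correctness_preferred = float(np.mean(ci_ties > diff_corr))
    correctness_preferred_hard = float(np.mean(np.minimum(ci_ref, ci_ties) > diff_corr))

    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = np.minimum(ci_ref, ci_ties) / diff_corr
    margin = np.nan_to_num(np.tanh(ratio - 1), nan=0.0)  # NaNs (0/0 margins) treated as 0
    correctness_margin_score = float(np.mean(margin))

    return (
        0.30 * tied_accuracy
        + 0.30 * ref_accuracy
        + 0.20 * correctness_preferred
        + 0.20 * correctness_preferred_hard
        + 0.01 * correctness_margin_score * tied_accuracy
    )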
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)

DATASET_GROUP = "multichoice"
METRICS_TYPE = "reward-bench-2.ties"
# GENERATION_ARGS = "++prompt_config=judge/reward-bench/reward-bench-2-ties"

nemo_skills/evaluation/evaluator/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -39,6 +39,8 @@
 from nemo_skills.evaluation.evaluator.mrcr import eval_mrcr
 from nemo_skills.evaluation.evaluator.ruler import eval_ruler
 from nemo_skills.evaluation.evaluator.scicode import eval_scicode
+from nemo_skills.evaluation.evaluator.rewardbench import eval_rewardbench
+

 EVALUATOR_MAP = {
     # Function-based evaluators (batch-only)
@@ -56,6 +58,7 @@
     "bigcodebench": eval_bigcodebench,
     "human_eval_infilling": eval_human_eval_infilling,
     "mmau-pro": eval_mmau_pro,
+    "rewardbench": eval_rewardbench,
 }

 # Evaluator class mapping, other evaluators can be added here as they're converted to classes
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import re

from tqdm import tqdm

from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig
from nemo_skills.evaluation.math_grader import extract_answer
from nemo_skills.utils import get_logger_name, nested_dataclass

LOG = logging.getLogger(get_logger_name(__file__))


def eval_rewardbench(cfg):
    eval_config = BaseEvaluatorConfig(**cfg)

    def extract_letter(text):
        # return the prediction only if the whole text is a boxed letter such as [[A]]
        extract_answer = re.search(r"^(\[\[\s*[A-Z]\s*\]\])$", text)
        if extract_answer:
            return extract_answer.group(1)
        return None

    jsonl_file = eval_config.input_file
    with open(jsonl_file, "rt", encoding="utf-8") as fin:
        data = [json.loads(line) for line in fin]
    with open(jsonl_file, "wt", encoding="utf-8") as fout:
        for sample in tqdm(data):
            # Per-sample values override config defaults for backward compatibility
            sample["predicted_answer"] = extract_letter(sample["generation"])
            sample["symbolic_correct"] = sample["predicted_answer"] == sample["expected_answer"]
            fout.write(json.dumps(sample) + "\n")
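Because the pattern is anchored with ^ and $, extract_letter only returns a prediction when the whole generation is a boxed letter; any surrounding text yields None, which then compares unequal to expected_answer and is scored as incorrect. A quick standalone illustration, re-declaring the helper for the example:

import re


def extract_letter(text):
    match = re.search(r"^(\[\[\s*[A-Z]\s*\]\])$", text)
    return match.group(1) if match else None


print(extract_letter("[[B]]"))                     # -> "[[B]]"
print(extract_letter("The best answer is [[B]]"))  # -> None (extra text around the box)
print(extract_letter("[[ C ]]"))                   # -> "[[ C ]]"

Note that a match with inner whitespace, such as "[[ C ]]", would not compare equal to an expected_answer of "[[C]]".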

nemo_skills/evaluation/metrics/map_metrics.py

Lines changed: 4 additions & 0 deletions
@@ -18,6 +18,7 @@
 from nemo_skills.dataset.utils import import_from_path
 from nemo_skills.evaluation.metrics.aalcr_metrics import AALCRMetrics
 from nemo_skills.evaluation.metrics.answer_judgement_metrics import AnswerJudgementMetrics
+from nemo_skills.evaluation.metrics.reward_bench_metrics import RewardBenchMetricsPreference, RewardBenchMetricsTies
 from nemo_skills.evaluation.metrics.arena_metrics import ArenaMetrics
 from nemo_skills.evaluation.metrics.bfcl_metrics import BFCLMetrics
 from nemo_skills.evaluation.metrics.code_metrics import (
@@ -66,6 +67,9 @@
     "mmau_pro_closed_form": MMAUProMetrics,
     "mmau_pro_open_ended": MMAUProMetrics,
     "mmau_pro_instruction_following": MMAUProMetrics,
+    "reward-bench-2.preference": RewardBenchMetricsPreference,
+    "reward-bench-2.ties": RewardBenchMetricsTies,
+    "reward-bench-2.ratings": RewardBenchMetricsTies,
 }
