Commit 04d495a

Add RewardBench 2 dataset and initial evaluation metrics implementation.
Update refs logic, simplify the metrics code for the Ties case, and merge ties into the preference scoring.
1 parent a39e60c commit 04d495a

File tree

13 files changed: +810 lines, -0 lines
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


IS_BENCHMARK_GROUP = True
METRICS_TYPE = "reward-bench-2"

SCORE_MODULE = "nemo_skills.dataset.reward-bench-2.score"

BENCHMARKS = {
    "reward-bench-2.preference": {
        "GENERATION_ARGS": "++prompt_config=judge/reward-bench/reward-bench-2.preference ++generation_key=generation ++eval_type=rewardbench",
    },
    # Ties-only split, already included in the .ratings evaluation
    # "reward-bench-2.ties": {
    #     "GENERATION_ARGS": "++prompt_config=judge/reward-bench/reward-bench-2.ties ++generation_key=judgement",
    # },
    "reward-bench-2.ratings": {
        "GENERATION_ARGS": "++prompt_config=judge/reward-bench/reward-bench-2.ratings ++generation_key=judgement",
    },
}
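The dictionary above only declares the group; score aggregation is delegated to the module named in SCORE_MODULE. A minimal sketch of how such a group config could be consumed, assuming the harness has already collected each sub-benchmark's metrics into a dict keyed by benchmark name (the helper below is hypothetical, not the nemo_skills pipeline code):

import importlib


def aggregate_group(group_module, per_benchmark_metrics: dict) -> dict:
    # Hypothetical helper: resolve the group's SCORE_MODULE and delegate to its compute_score().
    # per_benchmark_metrics maps names like "reward-bench-2.ratings" to their metric dicts.
    score_module = importlib.import_module(group_module.SCORE_MODULE)
    return score_module.compute_score(per_benchmark_metrics)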
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)

DATASET_GROUP = "multichoice"
METRICS_TYPE = "reward-bench-2.preference"
# GENERATION_ARGS = "++prompt_config=judge/reward-bench/reward-bench-2.preference"
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path

from datasets import load_dataset, concatenate_datasets
import numpy as np


if __name__ == "__main__":
    dataset = load_dataset("allenai/reward-bench-2", split="test")

    # select some samples from Ties
    # dataset = dataset.filter(lambda x: x["subset"] == "Ties")

    # select some samples from Ties and non-Ties subsets
    # dataset = concatenate_datasets([dataset.filter(lambda x: x["subset"] == "Ties").select(range(30)),
    #                                 dataset.filter(lambda x: x["subset"] != "Ties").select(range(10))])
    print(f"Prepared dataset with {len(dataset)} samples")

    # dump the data as test.jsonl; note that the shuffling logic is not ideal, but it matches the one in reward-bench-2
    np.random.seed(42)
    output_path = Path(__file__).parent / "preference" / "test.jsonl"
    ties_path = Path(__file__).parent / "ties" / "test.jsonl"
    ratings_path = Path(__file__).parent / "ratings" / "test.jsonl"
    with open(output_path, "w") as test, open(ties_path, "w") as ties, open(ratings_path, "w") as ratings:
        for sample in dataset:
            for answer in sample["chosen"]:
                prepared = {
                    "id": sample["id"],
                    "subset": sample["subset"],
                    "question": sample["prompt"],
                    "answer": answer,
                    "chosen": 1,
                    "num_correct": sample["num_correct"],
                    "num_incorrect": sample["num_incorrect"],
                }
                ratings.write(json.dumps(prepared) + "\n")
                if sample["subset"] == "Ties":
                    ties.write(json.dumps(prepared) + "\n")

            for answer in sample["rejected"]:
                prepared = {
                    "id": sample["id"],
                    "subset": sample["subset"],
                    "question": sample["prompt"],
                    "answer": answer,
                    "chosen": 0,
                    "num_correct": sample["num_correct"],
                    "num_incorrect": sample["num_incorrect"],
                }
                ratings.write(json.dumps(prepared) + "\n")
                if sample["subset"] == "Ties":
                    ties.write(json.dumps(prepared) + "\n")

            if sample["subset"] != "Ties":
                assert len(sample["chosen"]) == 1
                answer_a = sample["chosen"][0]
                answer_b, answer_c, answer_d = sample["rejected"][:3]

                # shuffle the answer positions; this uses the same logic as run_generative_v2.py
                chosen, shuffle_option = "[[A]]", np.random.randint(0, 4)
                # if shuffle_option == 1: answer_a, answer_b, chosen = answer_b, answer_a, "[[B]]"
                # elif shuffle_option == 2: answer_a, answer_c, chosen = answer_c, answer_a, "[[C]]"
                # elif shuffle_option == 3: answer_a, answer_d, chosen = answer_d, answer_a, "[[D]]"

                prepared = {
                    "id": sample["id"],
                    "subset": sample["subset"],
                    "question": sample["prompt"],
                    "answer_a": answer_a,
                    "answer_b": answer_b,
                    "answer_c": answer_c,
                    "answer_d": answer_d,
                    "expected_answer": chosen,
                }

                test.write(json.dumps(prepared) + "\n")
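Note that the three shuffle branches above are committed commented out, so shuffle_option is drawn but never used and expected_answer is always "[[A]]" in the preference split. For reference, a standalone sketch of what the enabled 4-way shuffle would look like; the helper name is illustrative and the swap logic simply mirrors the commented lines:

import numpy as np


def shuffle_choices(answer_a, answer_b, answer_c, answer_d):
    # Move the chosen answer (initially in position A) to a random position
    # and carry the expected label along with it.
    chosen, shuffle_option = "[[A]]", np.random.randint(0, 4)
    if shuffle_option == 1:
        answer_a, answer_b, chosen = answer_b, answer_a, "[[B]]"
    elif shuffle_option == 2:
        answer_a, answer_c, chosen = answer_c, answer_a, "[[C]]"
    elif shuffle_option == 3:
        answer_a, answer_d, chosen = answer_d, answer_a, "[[D]]"
    return answer_a, answer_b, answer_c, answer_d, chosen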
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)

DATASET_GROUP = "multichoice"
METRICS_TYPE = "reward-bench-2.ratings"
# GENERATION_ARGS = "++prompt_config=judge/reward-bench/reward-bench-2-ties"
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Scoring based on: https://artificialanalysis.ai/methodology/intelligence-benchmarking#intelligence-index-evaluation-suite-overview


# Overall Ties score formula
#   ref_accuracy = mean of `accurate` over all reference prompts.
#   tied_accuracy = mean of `accurate` over all tied prompts.
#   For prompts present in both ref and tied:
#       diff_corr_margin = different_correct_margin on tied
#       corr_incorrect_ties = correct_incorrect_margin on tied
#       corr_incorrect_ref = correct_incorrect_margin on ref
#
# Then compute:
#   correctness_preferred = mean(corr_incorrect_ties > diff_corr_margin)
#   correctness_preferred_hard = mean(min(corr_incorrect_ref, corr_incorrect_ties) > diff_corr_margin)
#   correctness_margin_score = mean(tanh(min(corr_incorrect_ref, corr_incorrect_ties) / diff_corr_margin - 1)),
#       with NaNs treated as 0 when diff_corr_margin is 0.
# Final weighted overall score:
#   overall = 0.30*tied_accuracy + 0.30*ref_accuracy + 0.20*correctness_preferred +
#             0.20*correctness_preferred_hard + 0.01*correctness_margin_score*tied_accuracy
#
# Only prompt IDs that appear in both ref and tied contribute to the margin-based terms.


def compute_score(metrics: dict):
    """Compute overall RewardBench v2 score from individual benchmark metrics."""
    print(metrics)

    overall_score = 0.0
    return {
        "overall_score": overall_score,
    }

    mmlu_pro = metrics["mmlu-pro"]["pass@1"]["symbolic_correct"]
    hle = metrics["hle"]["pass@1"]["judge_correct"]
    gpqa = metrics["gpqa"]["pass@1"]["symbolic_correct"]

    aime25 = metrics["aime24"]["pass@1[avg-of-10]"]["symbolic_correct"]

    scicode = metrics["scicode"]["pass@1[avg-of-3]"]["subtask_accuracy"]
    livecodebench = metrics["livecodebench"]["pass@1[avg-of-3]"]["accuracy"]

    ifbench = metrics["ifbench"]["pass@1[avg-of-5]"]["average_score"]

    aalcr = metrics["aalcr"]["pass@1[avg-of-3]"]["judge_correct"]

    math_score = aime25
    code_score = (scicode + livecodebench) / 2

    overall_score = (mmlu_pro + hle + gpqa + aime25 + scicode + livecodebench + ifbench + aalcr) / 8
    return {
        "overall_score": overall_score,
        "math_score": math_score,
        "code_score": code_score,
        "mmlu_pro": mmlu_pro,
        "hle": hle,
        "gpqa": gpqa,
        "aime25": aime25,
        "scicode": scicode,
        "livecodebench": livecodebench,
        "ifbench": ifbench,
        "aalcr": aalcr,
    }
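As committed, compute_score returns 0.0 from the early return, the code below it is unreachable and appears to be leftover from an intelligence-index score module, and the Ties formula documented in the header comment is not implemented yet. A minimal sketch of that formula, assuming per-prompt records keyed by prompt id with accurate, correct_incorrect_margin and (for tied prompts) different_correct_margin fields; the function and field layout are assumptions, not part of this commit:

import numpy as np


def ties_overall_score(ref: dict, tied: dict) -> float:
    # ref / tied: prompt_id -> {"accurate": ..., "correct_incorrect_margin": ...,
    #                           "different_correct_margin": ... (tied only)}
    ref_accuracy = float(np.mean([v["accurate"] for v in ref.values()]))
    tied_accuracy = float(np.mean([v["accurate"] for v in tied.values()]))

    # only prompt IDs present in both splits contribute to the margin-based terms
    common = sorted(set(ref) & set(tied))
    diff_corr = np.array([tied[i]["different_correct_margin"] for i in common], dtype=float)
    ci_ties = np.array([tied[i]["correct_incorrect_margin"] for i in common], dtype=float)
    ci_ref = np.array([ref[i]["correct_incorrect_margin"] for i in common], dtype=float)

    correctness_preferred = float(np.mean(ci_ties > diff_corr))
    correctness_preferred_hard = float(np.mean(np.minimum(ci_ref, ci_ties) > diff_corr))

    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = np.minimum(ci_ref, ci_ties) / diff_corr
    margin = np.nan_to_num(np.tanh(ratio - 1), nan=0.0)  # NaNs (0/0 margins) treated as 0
    correctness_margin_score = float(np.mean(margin))

    return (
        0.30 * tied_accuracy
        + 0.30 * ref_accuracy
        + 0.20 * correctness_preferred
        + 0.20 * correctness_preferred_hard
        + 0.01 * correctness_margin_score * tied_accuracy
    )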
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# settings that define how evaluation should be done by default (all can be changed from cmdline)

DATASET_GROUP = "multichoice"
METRICS_TYPE = "reward-bench-2.ties"
# GENERATION_ARGS = "++prompt_config=judge/reward-bench/reward-bench-2-ties"

nemo_skills/evaluation/evaluator/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -39,6 +39,8 @@
 from nemo_skills.evaluation.evaluator.mrcr import eval_mrcr
 from nemo_skills.evaluation.evaluator.ruler import eval_ruler
 from nemo_skills.evaluation.evaluator.scicode import eval_scicode
+from nemo_skills.evaluation.evaluator.rewardbench import eval_rewardbench
+

 EVALUATOR_MAP = {
     # Function-based evaluators (batch-only)
@@ -56,6 +58,7 @@
     "bigcodebench": eval_bigcodebench,
     "human_eval_infilling": eval_human_eval_infilling,
     "mmau-pro": eval_mmau_pro,
+    "rewardbench": eval_rewardbench,
 }

 # Evaluator class mapping, other evaluators can be added here as they're converted to classes
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import re

from tqdm import tqdm

from nemo_skills.evaluation.evaluator.base import BaseEvaluatorConfig
from nemo_skills.evaluation.math_grader import extract_answer
from nemo_skills.utils import get_logger_name, nested_dataclass

LOG = logging.getLogger(get_logger_name(__file__))


def eval_rewardbench(cfg):
    eval_config = BaseEvaluatorConfig(**cfg)

    def extract_letter(text):
        # return the prediction only if the whole text is a boxed letter such as [[A]]
        extract_answer = re.search(r"^(\[\[\s*[A-Z]\s*\]\])$", text)
        if extract_answer:
            return extract_answer.group(1)
        return None

    jsonl_file = eval_config.input_file
    with open(jsonl_file, "rt", encoding="utf-8") as fin:
        data = [json.loads(line) for line in fin]
    with open(jsonl_file, "wt", encoding="utf-8") as fout:
        for sample in tqdm(data):
            # Per-sample values override config defaults for backward compatibility
            sample["predicted_answer"] = extract_letter(sample["generation"])
            sample["symbolic_correct"] = sample["predicted_answer"] == sample["expected_answer"]
            fout.write(json.dumps(sample) + "\n")
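Because the pattern is anchored with ^ and $, extract_letter only returns a prediction when the whole generation is a boxed letter; any surrounding text yields None, which then compares unequal to expected_answer and is scored as incorrect. A quick standalone illustration, re-declaring the helper for the example:

import re


def extract_letter(text):
    match = re.search(r"^(\[\[\s*[A-Z]\s*\]\])$", text)
    return match.group(1) if match else None


print(extract_letter("[[B]]"))                     # -> "[[B]]"
print(extract_letter("The best answer is [[B]]"))  # -> None (extra text around the box)
print(extract_letter("[[ C ]]"))                   # -> "[[ C ]]"

Note that a match with inner whitespace, such as "[[ C ]]", would not compare equal to an expected_answer of "[[C]]".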

nemo_skills/evaluation/metrics/map_metrics.py

Lines changed: 4 additions & 0 deletions
@@ -18,6 +18,7 @@
 from nemo_skills.dataset.utils import import_from_path
 from nemo_skills.evaluation.metrics.aalcr_metrics import AALCRMetrics
 from nemo_skills.evaluation.metrics.answer_judgement_metrics import AnswerJudgementMetrics
+from nemo_skills.evaluation.metrics.reward_bench_metrics import RewardBenchMetricsPreference, RewardBenchMetricsTies
 from nemo_skills.evaluation.metrics.arena_metrics import ArenaMetrics
 from nemo_skills.evaluation.metrics.bfcl_metrics import BFCLMetrics
 from nemo_skills.evaluation.metrics.code_metrics import (
@@ -66,6 +67,9 @@
     "mmau_pro_closed_form": MMAUProMetrics,
     "mmau_pro_open_ended": MMAUProMetrics,
     "mmau_pro_instruction_following": MMAUProMetrics,
+    "reward-bench-2.preference": RewardBenchMetricsPreference,
+    "reward-bench-2.ties": RewardBenchMetricsTies,
+    "reward-bench-2.ratings": RewardBenchMetricsTies,
 }
