-from typing import Any, Callable, Dict, List
+# -*- coding: utf-8 -*-
+"""Accuracy Reward Function Class."""
+from typing import Callable, Optional
 
-from .base import RewardShapper
+from latex2sympy2_extended import NormalizationConfig
+from math_verify import LatexExtractionConfig, parse, verify
 
+from trinity.common.rewards.reward_fn import REWARD_FUNCTIONS, RewardFn
+from trinity.utils.log import get_logger
 
-class AccuracyRewardShapper(RewardShapper):
-    """Shapper for accuracy-based rewards"""
+logger = get_logger(__name__)
 
-    def __init__(
-        self,
-        answer_parser: Callable[[str], str],
-        correct_reward: float = 1.0,
-        incorrect_reward: float = 0.0,
-        kwargs: Dict[str, Any] = {},
-    ):
+
+@REWARD_FUNCTIONS.register_module("accuracy_reward")
+class AccuracyReward(RewardFn):
+    """A reward function that rewards correct answers.
+    Ref: https://github.com/huggingface/open-r1/blob/main/src/open_r1/rewards.py
+    """
+
+    def __init__(self, answer_parser: Optional[Callable[[str], str]] = None):
         self.answer_parser = answer_parser
-        self.correct_reward = correct_reward
-        self.incorrect_reward = incorrect_reward
-        self.response_key = kwargs.get("response", "response")
-        self.truth_key = kwargs.get("ground_truth", "ground_truth")
 
-    def shape(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        response = sample[self.response_key]
-        truth = sample[self.truth_key]
+    def __call__(  # type: ignore
+        self,
+        response: str,
+        prompt: Optional[str] = None,
+        truth: Optional[str] = None,
+    ) -> dict[str, float]:
+        if self.answer_parser:
+            answer_parsed = self.answer_parser(response)
+            truth_parsed = self.answer_parser(truth)  # type: ignore [arg-type]
 
-        parsed_response = self.answer_parser(response)
-        reward = self.correct_reward if parsed_response == truth else self.incorrect_reward
+        else:
+            truth_parsed = parse(
+                truth,
+                extraction_mode="first_match",
+                extraction_config=[LatexExtractionConfig()],
+            )
+            if len(truth_parsed) == 0:
+                truth_parsed = truth
 
-        sample["accuracy_reward"] = reward
-        return sample
+            answer_parsed = parse(
+                response,
+                extraction_config=[
+                    LatexExtractionConfig(
+                        normalization_config=NormalizationConfig(
+                            nits=False,
+                            malformed_operators=False,
+                            basic_latex=True,
+                            equations=True,
+                            boxed="all",
+                            units=True,
+                        ),
+                        # Ensures that boxed is tried first
+                        boxed_match_priority=0,
+                        try_extract_without_anchor=False,
+                    )
+                ],
+                extraction_mode="first_match",
+            )
 
-    def batch_shape(self, samples: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        return [self.shape(sample) for sample in samples]
+        # Reward 1 if the content is the same as the ground truth, 0 otherwise
+        try:
+            reward = float(verify(answer_parsed, truth_parsed))
+        except Exception as e:
+            logger.info(f"verify failed: {e}, answer: {answer_parsed}, gold: {truth_parsed}")
+            reward = 0.0
+        return {"accuracy": reward}