
Commit 3c8ac24

feat: add option to change regex to parse score (#237)
1 parent e71b3c4

File tree

2 files changed: 18 additions & 8 deletions

flexeval/core/metric/llm_score.py
tests/core/metric/test_llm_score.py

flexeval/core/metric/llm_score.py

Lines changed: 8 additions & 2 deletions
@@ -14,12 +14,14 @@
 from .utils import extract_text_from_outputs, validate_inputs
 
 
-def parse_score_from_evaluator_output(evaluator_output: str, valid_score_range: tuple[int, int] | None) -> int | None:
+def parse_score_from_evaluator_output(
+    evaluator_output: str, valid_score_range: tuple[int, int] | None, regex_to_parse_score: str = r"(\d+)"
+) -> int | None:
     """Extract the last integer value from the evaluator output.
 
     Return None if parsing fails.
     """
-    matched = re.findall(r"(\d+)", evaluator_output)
+    matched = re.findall(regex_to_parse_score, evaluator_output)
     if not matched:
         return None
 
@@ -182,6 +184,7 @@ class LLMScore(Metric):
         category_key: A key to create category-wise mean score.
             The category key is expected to be in extra_info.
         metric_prefix: A prefix to be added to the metric keys in the summary and instance details.
+        regex_to_parse_score: A regular expression to parse score.
 
     Examples:
         >>> from flexeval import LLMScore, OpenAIChatAPI, Jinja2PromptTemplate
@@ -216,6 +219,7 @@ def __init__(
         valid_score_range: tuple[int, int] | None = None,
         category_key: str | None = None,
         metric_prefix: str | None = None,
+        regex_to_parse_score: str = r"(\d+)",
     ) -> None:
         self.language_model = language_model
         self.prompt_template = prompt_template
@@ -224,6 +228,7 @@ def __init__(
         self.valid_score_range = valid_score_range
         self.category_key = category_key
         self.metric_prefix = f"{metric_prefix}-" if metric_prefix else ""
+        self.regex_to_parse_score = regex_to_parse_score
 
     def evaluate(
         self,
@@ -254,6 +259,7 @@ def evaluate(
             evaluator_score = parse_score_from_evaluator_output(
                 evaluator_output.text,
                 valid_score_range=self.valid_score_range,
+                regex_to_parse_score=self.regex_to_parse_score,
             )
             if evaluator_score is None:
                 logger.warning(f"Failed to parse score from evaluator output: {evaluator_output}")

tests/core/metric/test_llm_score.py

Lines changed: 10 additions & 6 deletions
@@ -34,20 +34,24 @@ def generate_chat_response(
 
 
 @pytest.mark.parametrize(
-    ("evaluator_output", "valid_score_range", "expected_score"),
+    ("evaluator_output", "valid_score_range", "regex_to_parse_score", "expected_score"),
     [
-        ("The final score is 65.", None, 65),
-        ("The final score is 65.", (0, 5), None),
-        ("Yes, this is a good one.", None, None),
-        ("The score is 5. No, it is 6.", None, 6),
+        ("The final score is 65.", None, r"(\d+)", 65),
+        ("The final score is 65.", (0, 5), r"(\d+)", None),
+        ("Yes, this is a good one.", None, r"(\d+)", None),
+        ("The score is 5. No, it is 6.", None, r"(\d+)", 6),
+        ("The score is [[5]]. No, it is 6.", None, r"\[\[(\d+)\]\]", 5),
     ],
 )
 def test_parse_score_from_evaluator_output(
     evaluator_output: str,
     valid_score_range: tuple[int, int] | None,
+    regex_to_parse_score: str,
     expected_score: int,
 ) -> None:
-    score = parse_score_from_evaluator_output(evaluator_output, valid_score_range)
+    score = parse_score_from_evaluator_output(
+        evaluator_output, valid_score_range, regex_to_parse_score=regex_to_parse_score
+    )
     assert score == expected_score
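
The new test case illustrates the intended use: pair a prompt that demands an unambiguous verdict format with a matching pattern. A usage sketch under stated assumptions: the prompt wording, the model name, the OpenAIChatAPI constructor argument, and the {{ lm_output }} template variable below are illustrative, not taken from this commit; only the LLMScore keyword arguments come from the diff.

from flexeval import Jinja2PromptTemplate, LLMScore, OpenAIChatAPI

# Illustrative judge prompt: wrapping the verdict in [[...]] keeps stray
# numbers in the rationale from being parsed as the score.
prompt_template = Jinja2PromptTemplate(
    "Rate the following response from 1 to 5 and give your verdict as [[score]].\n"
    "Response: {{ lm_output }}"
)

llm_score = LLMScore(
    language_model=OpenAIChatAPI(model="gpt-4o-mini"),  # assumed constructor arg
    prompt_template=prompt_template,
    valid_score_range=(1, 5),
    regex_to_parse_score=r"\[\[(\d+)\]\]",  # the option added in this commit
)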
