forked from microsoft/agent-lightning
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheval_utils.py
More file actions
69 lines (49 loc) · 1.91 KB
/
eval_utils.py
File metadata and controls
69 lines (49 loc) · 1.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Copyright (c) Microsoft. All rights reserved.
# Copied and adapted from https://github.com/prompteus/calc-x/blob/master/gadgets/metrics.py
import math
import re
import string
import sympy
from agentlightning.reward import reward
def normalize_option(option: str) -> str:
    r"""Strip all whitespace and parentheses from an answer-option label.

    Every whitespace character and every ``(`` / ``)`` is removed, wherever
    it occurs in the string; all other characters are kept in order.

    >>> normalize_option(" (A) \n")
    'A'
    """
    # A single character class covers the same characters as the
    # alternation of whitespace runs, "(" and ")".
    unwanted = r"[\s()]"
    return re.sub(unwanted, "", option)
def is_option_result(result: str) -> bool:
    r"""Tell whether *result* is a single-letter option label such as ``A)``.

    The string is first passed through ``normalize_option``; it counts as an
    option only if what remains is exactly one ASCII letter (either case).

    >>> is_option_result(" A) \n")
    True
    >>> is_option_result(" 23/7 ")
    False
    """
    label = normalize_option(result)
    # The length check keeps the substring test from accepting "" or
    # multi-letter runs such as "AB".
    return len(label) == 1 and label in string.ascii_letters
def float_eval(input_str: str) -> float:
    """Evaluate a textual math expression to a Python float.

    If the text contains the marker ``" = around "``, only the part before
    its first occurrence is evaluated; the approximation after it is
    discarded. The remaining text is parsed with ``sympy.parse_expr`` and
    numerically evaluated.

    Any exception raised by parsing or by the float conversion propagates
    to the caller.

    NOTE(review): SymPy documents that ``parse_expr`` relies on ``eval`` and
    should not be used on unsanitized input -- confirm that callers only pass
    trusted or sandboxed strings.
    """
    # partition() yields the whole string as the head when the marker is
    # absent, so no separate membership test is needed.
    exact_part, _, _ = input_str.partition(" = around ")
    parsed = sympy.parse_expr(exact_part, evaluate=True)
    return float(parsed.evalf())
def scalar_are_results_same(pred_result: str, true_result: str, rel_tol: float) -> bool:
    """Decide whether a predicted answer matches the reference answer.

    The comparison proceeds in three stages:

    1. Both values are converted to strings (``None`` becomes ``""``); if
       they are equal after stripping surrounding whitespace, they match.
    2. If the reference is a single-letter option (see ``is_option_result``),
       both sides are normalized with ``normalize_option`` and compared
       exactly (case-sensitively).
    3. Otherwise both sides are evaluated with ``float_eval`` and compared
       using ``math.isclose`` with the given relative tolerance.

    Args:
        pred_result: The predicted answer.
        true_result: The reference answer.
        rel_tol: Relative tolerance passed to ``math.isclose``.

    Returns:
        ``True`` if the answers are considered the same, ``False`` otherwise.
        Any exception raised while evaluating either side numerically is
        treated as a mismatch.
    """
    predicted = "" if pred_result is None else str(pred_result)  # type: ignore
    expected = "" if true_result is None else str(true_result)  # type: ignore

    # Stage 1: textual identity, ignoring surrounding whitespace.
    if predicted.strip() == expected.strip():
        return True

    # Stage 2: multiple-choice task -- compare the normalized option labels.
    if is_option_result(expected):
        return normalize_option(predicted) == normalize_option(expected)

    # Stage 3: numeric task -- compare evaluated values within tolerance.
    try:
        return math.isclose(float_eval(predicted), float_eval(expected), rel_tol=rel_tol)
    except Exception:
        # Best effort: an expression that cannot be parsed or converted to
        # float simply does not match.
        return False
async def evaluate(prediction: str, ground_truth: str) -> float:
    """Score a prediction against the ground truth as 1.0 (match) or 0.0.

    Matching is delegated to ``scalar_are_results_same`` with a relative
    tolerance of ``1e-2`` for numeric answers.
    """
    matched = scalar_are_results_same(prediction, ground_truth, 1e-2)
    return 1.0 if matched else 0.0
@reward
async def evaluate_v0_1(prediction: str, ground_truth: str) -> float:
    """Score a prediction against the ground truth as 1.0 (match) or 0.0.

    Performs the same computation as ``evaluate`` -- a call to
    ``scalar_are_results_same`` with a relative tolerance of ``1e-2`` -- but
    is wrapped with the ``reward`` decorator from ``agentlightning.reward``.
    """
    matched = scalar_are_results_same(prediction, ground_truth, 1e-2)
    return 1.0 if matched else 0.0