
Commit 6830fca

michaelharrisonmai and Michael Harrison authored
use llm judge for math-v as with vista and verse (#163)
Update Math-V to use an LLM judge, since the previous answer extraction was not reliable. Uses the same few-shot extraction/scoring templates as MathVerse and MathVista. Ran on several models already, with scores roughly matching published scores on the MathV leaderboard.

Co-authored-by: Michael Harrison <[email protected]>
1 parent 46a3f0d commit 6830fca

File tree

3 files changed: +131 -19 lines changed

eureka_ml_insights/prompt_templates/mathvision_templates/answer_extraction_prompt.jinja
eureka_ml_insights/prompt_templates/mathvision_templates/scoring_prompt.jinja
eureka_ml_insights/user_configs/mathvision.py
eureka_ml_insights/prompt_templates/mathvision_templates/answer_extraction_prompt.jinja

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response. Directly output the extracted answer with no explanation.
+
+1.
+Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)'
+Extracted Answer: (-2, 1)
+
+2.
+Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",'
+Extracted Answer: D
+
+3.
+Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)'
+Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)
+
+4.
+Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.'
+Extracted Answer: null
+
+5.
+Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.'
+Extracted answer: 22.3
+
+6.
+Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"'
+Extracted answer: f(x) = -x^2 - 2x + 1
+
+7.
+Model response: {{ response }}
+Extracted Answer:
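
The template ends with the Jinja placeholder {{ response }}, which the pipeline's first PromptProcessing round fills with each raw model output. A minimal sketch of that substitution, using jinja2 directly (the standalone rendering is an assumption for illustration; in the pipeline, the PromptProcessing component performs this step):

from jinja2 import Template

# Render the few-shot extraction prompt for one record. In the pipeline,
# "response" is the raw model output, renamed from "model_output" upstream.
TEMPLATE_PATH = "eureka_ml_insights/prompt_templates/mathvision_templates/answer_extraction_prompt.jinja"

with open(TEMPLATE_PATH) as f:
    template = Template(f.read())

prompt = template.render(response="Therefore, the distance d is approximately 22.3 meters.")
print(prompt)  # the few-shot examples above, ending with this response as item 7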
eureka_ml_insights/prompt_templates/mathvision_templates/scoring_prompt.jinja

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent.
+Please note that only when the [Model_answer] completely matches the [Standard Answer] means they are consistent. For multiple choice questions, consider that the model answer may contain the letter representing the answer value. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
+If they are consistent, Judgement is 1; if they are different, Judgement is 0.
+
+[Question]: Write the set of numbers represented on the number line in interval notation.
+[Standard Answer]: (-2,1]
+[Model_answer] : Extracted Answer: \\((-2, 1)\\)
+Judgement: 0
+
+[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
+[Standard Answer]: C
+[Model_answer] : B:2\u221a{{3}}
+Judgement: 0
+
+[Question]: Find the domain and range of the function f using interval notation.
+[Standard Answer]: domain: [-4, 0) and range: (-3, 1]
+[Model_answer] : Range: \\((-4, 1]\\)
+Judgement: 0
+
+[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
+[Standard Answer]: C
+[Model_answer] : null
+Judgement: 0
+
+[Question]: Given the graph of the line that intersects with x-axis at -3 and with y-axis at 4, determine its equation. A. y = \\frac{{4}}{{3}}x + 4 B. Cannot determine.\n
+[Standard Answer]: A
+[Model_answer] : y = \\frac{{4}}{{3}}x + 4
+Judgement: 1
+
+[Question]: {{original_question}}
+[Standard Answer]: {{answer}}
+[Model_answer] : {{extraction}}
+Judgement:
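
The scoring template takes three variables: original_question (the renamed prompt), answer (the gold answer), and extraction (the answer pulled out in round 1), and asks the judge for a bare 0/1 Judgement. A minimal sketch of rendering it, again assuming direct jinja2 use rather than the pipeline's PromptProcessing component:

from jinja2 import Template

TEMPLATE_PATH = "eureka_ml_insights/prompt_templates/mathvision_templates/scoring_prompt.jinja"

with open(TEMPLATE_PATH) as f:
    scoring_template = Template(f.read())

# Fill in the few-shot scoring prompt for one record; the judge's reply
# ("0" or "1") is later renamed to "score" and averaged by the aggregators.
prompt = scoring_template.render(
    original_question="As shown in the figure, circle O has a radius 1.0, ...",
    answer="C",
    extraction="C",
)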

eureka_ml_insights/user_configs/mathvision.py

Lines changed: 69 additions & 19 deletions
@@ -5,13 +5,12 @@
 from typing import Any
 
 from eureka_ml_insights.core import (
-    DataProcessing,
     EvalReporting,
     Inference,
     PromptProcessing
 )
 from eureka_ml_insights.data_utils import (
-    AddColumn,
+    ColumnRename,
     CopyColumn,
     DataReader,
     HFDataReader,
@@ -23,7 +22,6 @@
 
 from eureka_ml_insights.configs import(
     AggregatorConfig,
-    DataProcessingConfig,
     DataSetConfig,
     EvalReportingConfig,
     InferenceConfig,
@@ -32,10 +30,11 @@
     PromptProcessingConfig,
 )
 
-from eureka_ml_insights.data_utils.mathvision_utils import MathVisionOutputEvaluator
 from eureka_ml_insights.metrics.reports import AverageAggregator
 from eureka_ml_insights.configs import ExperimentConfig
 
+from eureka_ml_insights.configs.model_configs import OAI_GPT4_1106_PREVIEW_CONFIG as PERSONAL_GPT4O
+
 class MATHVISION_PIPELINE(ExperimentConfig):
     def configure_pipeline(
         self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
@@ -56,7 +55,6 @@ def configure_pipeline(
                         columns='options_string',
                         mapping=lambda x: "" if len(x)==0 else ("\n[Options]:\n" + '\n'.join([chr(ord('A') + i) + ". " + opt for i, opt in enumerate(x)]))
                     ),
-                    #SamplerTransform(sample_count=2, random_seed=1234),
                 ]
             ),
         },
@@ -82,32 +80,73 @@ def configure_pipeline(
             resume_from=resume_from,
         )
 
-        # post process the response to extract the answer
-        self.data_post_processing = DataProcessingConfig(
-            component_type=DataProcessing,
+        # Eval data pre processing component round 1 (answer extraction).
+        self.eval_data_pre_processing = PromptProcessingConfig(
+            component_type=PromptProcessing,
             data_reader_config=DataSetConfig(
                 DataReader,
                 {
                     "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
                     "format": ".jsonl",
-                    "transform": SequenceTransform(
-                        [
-                            AddColumn("score"),
-                            MathVisionOutputEvaluator(score_column_name="score"),
-                        ]
-                    ),
+                    "transform": SequenceTransform([
+                        ColumnRename(name_mapping={"model_output": "response"}),
+                        ColumnRename(name_mapping={"prompt": "original_question"})
+                    ]),
                 },
             ),
-            output_dir=os.path.join(self.log_dir, "data_post_processing_output"),
+            prompt_template_path=os.path.join(
+                os.path.dirname(__file__), "../prompt_templates/mathvision_templates/answer_extraction_prompt.jinja"
+            ),
+            output_dir=os.path.join(self.log_dir, "eval_data_pre_processing_output"),
+        )
+
+        # Eval Inference component round 1 (answer extraction).
+        self.eval_inference_comp = InferenceConfig(
+            component_type=Inference,
+            model_config=PERSONAL_GPT4O,
+            data_loader_config=DataSetConfig(
+                MMDataLoader,
+                {"path": os.path.join(self.eval_data_pre_processing.output_dir, "transformed_data.jsonl"), "load_images":False},
+            ),
+            output_dir=os.path.join(self.log_dir, "eval_inference_result"),
+        )
+
+        # Eval data pre processing component round 2 (LLM scoring).
+        self.eval_data_pre_processing_two = PromptProcessingConfig(
+            component_type=PromptProcessing,
+            data_reader_config=DataSetConfig(
+                DataReader,
+                {
+                    "path": os.path.join(self.eval_inference_comp.output_dir, "inference_result.jsonl"),
+                    "format": ".jsonl",
+                    "transform": SequenceTransform([ColumnRename(name_mapping={"model_output": "extraction"})]),
+                },
+            ),
+            prompt_template_path=os.path.join(
+                os.path.dirname(__file__), "../prompt_templates/mathvision_templates/scoring_prompt.jinja"
+            ),
+            output_dir=os.path.join(self.log_dir, "eval_data_pre_processing_output_two"),
+        )
+
+        # Eval Inference component round 2 (LLM scoring).
+        self.eval_inference_comp_two = InferenceConfig(
+            component_type=Inference,
+            model_config=PERSONAL_GPT4O,
+            data_loader_config=DataSetConfig(
+                MMDataLoader,
+                {"path": os.path.join(self.eval_data_pre_processing_two.output_dir, "transformed_data.jsonl"), "load_images":False},
+            ),
+            output_dir=os.path.join(self.log_dir, "eval_inference_result_two"),
         )
 
         self.evalreporting_comp = EvalReportingConfig(
             component_type=EvalReporting,
             data_reader_config=DataSetConfig(
                 DataReader,
                 {
-                    "path": os.path.join(self.data_post_processing.output_dir, "transformed_data.jsonl"),
+                    "path": os.path.join(self.eval_inference_comp_two.output_dir, "inference_result.jsonl"),
                     "format": ".jsonl",
+                    "transform": ColumnRename(name_mapping={"model_output": "score"}),
                 },
             ),
             aggregator_configs=[
@@ -122,8 +161,16 @@ def configure_pipeline(
                     AverageAggregator,
                     {
                         "column_names": ["score"],
-                        "filename_base": "MathVision_Score_By_Type",
-                        "group_by": ["level", "subject"],
+                        "filename_base": "MathVision_Score_By_Subect",
+                        "group_by": ["subject"],
+                    },
+                ),
+                AggregatorConfig(
+                    AverageAggregator,
+                    {
+                        "column_names": ["score"],
+                        "filename_base": "MathVision_Score_By_SubectLevel",
+                        "group_by": ["subject", "level"],
                     },
                 ),
             ],
@@ -134,7 +181,10 @@ def configure_pipeline(
             [
                 self.data_processing_comp,
                 self.inference_comp,
-                self.data_post_processing,
+                self.eval_data_pre_processing,
+                self.eval_inference_comp,
+                self.eval_data_pre_processing_two,
+                self.eval_inference_comp_two,
                 self.evalreporting_comp,
             ],
             self.log_dir,
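
Taken together, the four new components implement a two-round judge: round 1 extracts an answer from the raw response, round 2 scores it against the gold answer. A hypothetical, self-contained sketch of that flow (call_judge stands in for the judge-model call configured as PERSONAL_GPT4O; the helper names and direct template paths are illustrative, not the repo's API):

from jinja2 import Template

def render(path, **variables):
    # Fill a prompt template with the given variables.
    with open(path) as f:
        return Template(f.read()).render(**variables)

def judge_one(question, gold_answer, model_response, call_judge):
    # Round 1: extract the final answer from the raw model response.
    extraction = call_judge(render("answer_extraction_prompt.jinja", response=model_response))
    # Round 2: ask for a 0/1 consistency judgement against the gold answer.
    judgement = call_judge(render(
        "scoring_prompt.jinja",
        original_question=question,
        answer=gold_answer,
        extraction=extraction,
    ))
    # Downstream, these 0/1 scores are averaged overall, by subject, and by
    # subject+level via the AverageAggregator configs above.
    return float(judgement.strip())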

0 commit comments