Skip to content

Commit 25bc122

Browse files
committed
hand formatted last issue
1 parent 61f42c9 commit 25bc122

File tree

1 file changed

+95
-71
lines changed

1 file changed

+95
-71
lines changed

eval/chat_benchmarks/HMMT/matharena/grader.py

Lines changed: 95 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,46 @@
1313
from matharena.parser import parse_grading, WarningType
1414

1515

16-
1716
def similar(a, b):
1817
return SequenceMatcher(None, a, b).ratio() > 0.8 # Allow minor formatting differences
1918

19+
2020
def clean_string_to_json(text: str) -> str:
    """Strip LLM chrome from *text* so the remainder can be parsed as JSON.

    Removes any ``<think>...</think>`` reasoning spans, unwraps
    ```` ```json ... ``` ```` code fences (keeping their contents), and
    drops every remaining backtick.
    """
    without_thoughts = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    unfenced = re.sub(r"```json\n(.*?)\n```", r"\1", without_thoughts, flags=re.DOTALL)
    return unfenced.replace("`", "")
2525

26+
2627
def format_grading_scheme(scheme, problem_id):
    """Render a grading scheme as human-readable category text.

    Validates that *scheme* belongs to *problem_id* and that its per-category
    points sum to the scheme's declared total, raising ValueError otherwise.
    Returns one "Category / Available points / Description" paragraph per
    grading category.
    """
    if scheme["problem_idx"] != problem_id:
        raise ValueError(f"Incorrect schema given for problem {problem_id}")

    total_points = 0
    paragraphs = []
    for category in scheme["grading_scheme"]:
        total_points += category["points"]
        paragraphs.append(
            f"Category: {category['title']}\n"
            f"Available points: {category['points']}\n"
            f"Description: {category['desc']}\n\n"
        )

    # The per-category points must account for exactly the scheme's total.
    if total_points != scheme["points"]:
        raise ValueError(
            f"Total points in schema for problem {problem_id} totals {total_points}, but should be {scheme['points']}"
        )

    return "".join(paragraphs)
3944

40-
def run_grader(grader_config, solver_config_path, competition, skip_existing=False,
41-
output_folder="outputs", grading_folder="autogrades",
42-
competition_config_folder="competition_configs", autograding_config_path="configs/autograding/config.yaml"):
45+
46+
def run_grader(
47+
grader_config,
48+
solver_config_path,
49+
competition,
50+
skip_existing=False,
51+
output_folder="outputs",
52+
grading_folder="autogrades",
53+
competition_config_folder="competition_configs",
54+
autograding_config_path="configs/autograding/config.yaml",
55+
):
4356
model = grader_config["model"]
4457
n = grader_config["n"]
4558
api = grader_config["api"]
@@ -82,7 +95,7 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
8295
marking_schemas = {}
8396

8497
all_messages_per_problem = {i: [] for i in range(len(problems))}
85-
all_evals_per_problem_per_solution = {i : {} for i in range(len(problems))}
98+
all_evals_per_problem_per_solution = {i: {} for i in range(len(problems))}
8699

87100
for i, problem in enumerate(problems):
88101
problem_id = problem["problem_idx"]
@@ -92,37 +105,43 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
92105
raise ValueError(f"Could not find the solutions for {problem_id} in {output_dir}")
93106
else:
94107
data_file = json.load(open(output_file))
95-
problem['anon_id'] = data_file['anonymous_id']
108+
problem["anon_id"] = data_file["anonymous_id"]
96109
messages = data_file["messages"]
97110
all_evals_per_problem_per_solution[i] = {i: [] for i in range(n_evals)}
98-
messages = [
99-
messages_one for messages_one in messages if len(messages_one[-1]["content"]) > 0
100-
]
111+
messages = [messages_one for messages_one in messages if len(messages_one[-1]["content"]) > 0]
101112
all_messages_per_problem[i] = messages
102113

103114
marking_schema = format_grading_scheme(problem, problem_id)
104-
marking_schemas[i] = problem['grading_scheme']
115+
marking_schemas[i] = problem["grading_scheme"]
105116

106117
for j in range(n_evals):
107-
auto_grading_file = os.path.join(autograder_dir,f"{problem_id}/{problem['anon_id']}_{grader_config['model'].split('/')[-1]}-{j}.json")
108-
118+
auto_grading_file = os.path.join(
119+
autograder_dir, f"{problem_id}/{problem['anon_id']}_{grader_config['model'].split('/')[-1]}-{j}.json"
120+
)
121+
109122
if skip_existing and os.path.exists(auto_grading_file):
110123
data_file = json.load(open(auto_grading_file))
111-
messages = [messages_one['raw'] for messages_one in data_file]
124+
messages = [messages_one["raw"] for messages_one in data_file]
112125
all_evals_per_problem_per_solution[i][j] = messages
113126
if len(all_evals_per_problem_per_solution[i][j]) == n:
114-
calculate_grading_results(problem, autograder_dir,
115-
all_evals_per_problem_per_solution[i][j], marking_schemas[i],
116-
i, j, grader_model_name=grader_config['model'].split('/')[-1])
127+
calculate_grading_results(
128+
problem,
129+
autograder_dir,
130+
all_evals_per_problem_per_solution[i][j],
131+
marking_schemas[i],
132+
i,
133+
j,
134+
grader_model_name=grader_config["model"].split("/")[-1],
135+
)
117136
continue
118137
for _, message in enumerate(messages):
119138
problem_statement = problem["problem"]
120139
grading_prompt = prompt_template.format(
121-
problem_statement=problem_statement,
122-
marking_schema=marking_schema,
123-
correct_solution=problem['sample_solution'],
124-
example_grading=problem['sample_grading'],
125-
solution=message if skip_existing and os.path.exists(auto_grading_file) else message[-1]["content"]
140+
problem_statement=problem_statement,
141+
marking_schema=marking_schema,
142+
correct_solution=problem["sample_solution"],
143+
example_grading=problem["sample_grading"],
144+
solution=message if skip_existing and os.path.exists(auto_grading_file) else message[-1]["content"],
126145
)
127146
batch_idx_to_problem_idx[len(batch_prompts)] = (i, j)
128147
batch_prompts.append((grading_prompt, None))
@@ -131,11 +150,7 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
131150

132151
if len(batch_prompts) == 0:
133152
return
134-
api = APIQuery(
135-
model=model,
136-
api=api,
137-
**kwargs
138-
)
153+
api = APIQuery(model=model, api=api, **kwargs)
139154

140155
cot_solver = CoTSolver(
141156
querier=api,
@@ -144,25 +159,33 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
144159
for idx, messages, _ in cot_solver.solve(batch_prompts):
145160
problem_idx, grader_idx = batch_idx_to_problem_idx[idx]
146161
problem = problems[problem_idx]
147-
all_evals_per_problem_per_solution[problem_idx][grader_idx].append(messages[-1]['content'])
162+
all_evals_per_problem_per_solution[problem_idx][grader_idx].append(messages[-1]["content"])
148163
# check if the whole problem is finished
149164
if len(all_evals_per_problem_per_solution[problem_idx][grader_idx]) == n:
150-
calculate_grading_results(problem, autograder_dir,
151-
all_evals_per_problem_per_solution[problem_idx][grader_idx], marking_schemas[problem_idx],
152-
problem_idx, grader_idx, grader_model_name=grader_config['model'].split('/')[-1])
153-
154-
def calculate_grading_results(problem, output_dir, gradings_per_solution, marking_schema,
155-
problem_idx, grader_idx, grader_model_name):
165+
calculate_grading_results(
166+
problem,
167+
autograder_dir,
168+
all_evals_per_problem_per_solution[problem_idx][grader_idx],
169+
marking_schemas[problem_idx],
170+
problem_idx,
171+
grader_idx,
172+
grader_model_name=grader_config["model"].split("/")[-1],
173+
)
174+
175+
176+
def calculate_grading_results(
177+
problem, output_dir, gradings_per_solution, marking_schema, problem_idx, grader_idx, grader_model_name
178+
):
156179
problem_id = problem["problem_idx"]
157180
anon_id = problem["anon_id"]
158-
181+
159182
output_file = os.path.join(output_dir, f"{problem_id}/{anon_id}_{grader_model_name}-{grader_idx}.json")
160-
os.makedirs(f'{output_dir}/{problem_id}', exist_ok=True)
183+
os.makedirs(f"{output_dir}/{problem_id}", exist_ok=True)
161184

162185
outputs = [{} for _ in gradings_per_solution]
163186

164187
for i, message in enumerate(gradings_per_solution):
165-
outputs[i]['raw'] = message
188+
outputs[i]["raw"] = message
166189
warning = WarningType.NONE
167190
parsed_grading = {}
168191
try:
@@ -172,36 +195,40 @@ def calculate_grading_results(problem, output_dir, gradings_per_solution, markin
172195
parsed_grading = json5.loads(clean_string_to_json(message), strict=False)
173196
except Exception:
174197
parsed_grading = parse_grading(message)
175-
if not 'points' in parsed_grading:
176-
logger.error(f'Final points were not generated for grader {grader_idx} of {problem_idx}:\n {message}')
177-
warning = max(warning,WarningType.MAJOR)
178-
if not 'details' in parsed_grading:
179-
if not 'scheme' in parsed_grading:
180-
logger.error(f'Not scoring details found for grader {grader_idx} of {problem_idx}:\n {message}')
181-
warning = max(warning,WarningType.MAJOR)
198+
if not "points" in parsed_grading:
199+
logger.error(f"Final points were not generated for grader {grader_idx} of {problem_idx}:\n {message}")
200+
warning = max(warning, WarningType.MAJOR)
201+
if not "details" in parsed_grading:
202+
if not "scheme" in parsed_grading:
203+
logger.error(f"Not scoring details found for grader {grader_idx} of {problem_idx}:\n {message}")
204+
warning = max(warning, WarningType.MAJOR)
182205
else:
183-
parsed_grading['details'] = parsed_grading['scheme']
184-
elif len(parsed_grading['details']) != len(marking_schema):
185-
logger.error(f'Mismatch between marking schema lengths')
186-
warning = max(warning,WarningType.MAJOR)
206+
parsed_grading["details"] = parsed_grading["scheme"]
207+
elif len(parsed_grading["details"]) != len(marking_schema):
208+
logger.error(f"Mismatch between marking schema lengths")
209+
warning = max(warning, WarningType.MAJOR)
187210
else:
188-
if anon_id == 'ecddbb':
211+
if anon_id == "ecddbb":
189212
breakpoint()
190213
final_points = 0
191-
for (given, expected) in zip(parsed_grading["details"], marking_schema):
214+
for given, expected in zip(parsed_grading["details"], marking_schema):
192215
if not similar(given["title"], expected["title"]):
193216
logger.error(f"Title mismatch: '{given['title']}' vs '{expected['title']}'")
194217
warning = max(warning, WarningType.MAJOR)
195218
elif given["points"] > expected["points"]:
196-
logger.warning(f"Warning: Given points ({given['points']}) exceed max allowed ({expected['points']}) for category '{given['title']}'")
219+
logger.warning(
220+
f"Warning: Given points ({given['points']}) exceed max allowed ({expected['points']}) for category '{given['title']}'"
221+
)
197222
warning = max(warning, WarningType.MINOR)
198223
given["points"] = expected["points"]
199224
elif given["points"] < 0:
200-
logger.warning(f"Warning: Given points ({given['points']}) are negative for category '{given['title']}'")
225+
logger.warning(
226+
f"Warning: Given points ({given['points']}) are negative for category '{given['title']}'"
227+
)
201228
warning = max(warning, WarningType.MINOR)
202229
given["points"] = 0
203230

204-
given["title"] = expected["title"]
231+
given["title"] = expected["title"]
205232
final_points += given["points"]
206233
parsed_grading["points"] = final_points
207234

@@ -211,17 +238,14 @@ def calculate_grading_results(problem, output_dir, gradings_per_solution, markin
211238
parsed_grading = {
212239
"points": 0,
213240
"details": [
214-
{
215-
"title": item['title'],
216-
"points": 0,
217-
"desc": "The grading could not be parsed."
218-
} for item in marking_schema
219-
]
241+
{"title": item["title"], "points": 0, "desc": "The grading could not be parsed."}
242+
for item in marking_schema
243+
],
220244
}
221245

222-
outputs[i]['warning'] = warning.value
246+
outputs[i]["warning"] = warning.value
223247
for k in parsed_grading:
224248
outputs[i][k] = parsed_grading[k]
225-
249+
226250
with open(output_file, "w") as f:
227-
json.dump(outputs, f)
251+
json.dump(outputs, f)

0 commit comments

Comments (0)