1313from matharena .parser import parse_grading , WarningType
1414
1515
16-
def similar(a, b, threshold=0.8):
    """Return True when strings *a* and *b* are approximately equal.

    Uses ``difflib.SequenceMatcher``'s similarity ratio so that minor
    formatting differences (whitespace, punctuation) still count as a
    match.

    Args:
        a: First string to compare.
        b: Second string to compare.
        threshold: Minimum similarity ratio in (0.0, 1.0) that must be
            exceeded for the strings to count as similar. Defaults to
            0.8, preserving the original hard-coded behavior.

    Returns:
        bool: True if the similarity ratio is strictly greater than
        ``threshold``.
    """
    return SequenceMatcher(None, a, b).ratio() > threshold
1918
19+
def clean_string_to_json(text: str) -> str:
    """Strip model-output wrappers so *text* can be parsed as JSON.

    Three cleanups, in order: drop any ``<think>...</think>`` reasoning
    sections, unwrap ```` ```json ```` fenced code blocks down to their
    contents, and remove every remaining backtick.

    Args:
        text: Raw model output, possibly wrapped in reasoning tags
            and/or a markdown code fence.

    Returns:
        The cleaned text, hopefully valid JSON.
    """
    without_thoughts = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    unfenced = re.sub(r"```json\n(.*?)\n```", r"\1", without_thoughts, flags=re.DOTALL)
    return unfenced.replace("`", "")
2525
26+
def format_grading_scheme(scheme, problem_id):
    """Render a problem's grading scheme as human-readable text.

    Validates that *scheme* actually belongs to *problem_id* and that
    the per-category points add up to the scheme's declared total,
    then formats one "Category / Available points / Description"
    section per grading category.

    Args:
        scheme: Problem dict containing ``problem_idx``, ``points``,
            and a ``grading_scheme`` list of category dicts (each with
            ``title``, ``points``, ``desc``).
        problem_id: The problem index this scheme is expected to match.

    Returns:
        str: The formatted grading scheme, one blank-line-separated
        section per category.

    Raises:
        ValueError: If the scheme's problem index does not match
            *problem_id*, or the category points do not sum to
            ``scheme["points"]``.
    """
    if scheme["problem_idx"] != problem_id:
        raise ValueError(f"Incorrect schema given for problem {problem_id}")

    sections = []
    total_points = 0
    for category in scheme["grading_scheme"]:
        total_points += category["points"]
        sections.append(
            f"Category: {category['title']}\n"
            f"Available points: {category['points']}\n"
            f"Description: {category['desc']}\n\n"
        )

    if total_points != scheme["points"]:
        raise ValueError(
            f"Total points in schema for problem {problem_id} totals {total_points}, but should be {scheme['points']}"
        )

    return "".join(sections)
3944
40- def run_grader (grader_config , solver_config_path , competition , skip_existing = False ,
41- output_folder = "outputs" , grading_folder = "autogrades" ,
42- competition_config_folder = "competition_configs" , autograding_config_path = "configs/autograding/config.yaml" ):
45+
46+ def run_grader (
47+ grader_config ,
48+ solver_config_path ,
49+ competition ,
50+ skip_existing = False ,
51+ output_folder = "outputs" ,
52+ grading_folder = "autogrades" ,
53+ competition_config_folder = "competition_configs" ,
54+ autograding_config_path = "configs/autograding/config.yaml" ,
55+ ):
4356 model = grader_config ["model" ]
4457 n = grader_config ["n" ]
4558 api = grader_config ["api" ]
@@ -82,7 +95,7 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
8295 marking_schemas = {}
8396
8497 all_messages_per_problem = {i : [] for i in range (len (problems ))}
85- all_evals_per_problem_per_solution = {i : {} for i in range (len (problems ))}
98+ all_evals_per_problem_per_solution = {i : {} for i in range (len (problems ))}
8699
87100 for i , problem in enumerate (problems ):
88101 problem_id = problem ["problem_idx" ]
@@ -92,37 +105,43 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
92105 raise ValueError (f"Could not find the solutions for { problem_id } in { output_dir } " )
93106 else :
94107 data_file = json .load (open (output_file ))
95- problem [' anon_id' ] = data_file [' anonymous_id' ]
108+ problem [" anon_id" ] = data_file [" anonymous_id" ]
96109 messages = data_file ["messages" ]
97110 all_evals_per_problem_per_solution [i ] = {i : [] for i in range (n_evals )}
98- messages = [
99- messages_one for messages_one in messages if len (messages_one [- 1 ]["content" ]) > 0
100- ]
111+ messages = [messages_one for messages_one in messages if len (messages_one [- 1 ]["content" ]) > 0 ]
101112 all_messages_per_problem [i ] = messages
102113
103114 marking_schema = format_grading_scheme (problem , problem_id )
104- marking_schemas [i ] = problem [' grading_scheme' ]
115+ marking_schemas [i ] = problem [" grading_scheme" ]
105116
106117 for j in range (n_evals ):
107- auto_grading_file = os .path .join (autograder_dir ,f"{ problem_id } /{ problem ['anon_id' ]} _{ grader_config ['model' ].split ('/' )[- 1 ]} -{ j } .json" )
108-
118+ auto_grading_file = os .path .join (
119+ autograder_dir , f"{ problem_id } /{ problem ['anon_id' ]} _{ grader_config ['model' ].split ('/' )[- 1 ]} -{ j } .json"
120+ )
121+
109122 if skip_existing and os .path .exists (auto_grading_file ):
110123 data_file = json .load (open (auto_grading_file ))
111- messages = [messages_one [' raw' ] for messages_one in data_file ]
124+ messages = [messages_one [" raw" ] for messages_one in data_file ]
112125 all_evals_per_problem_per_solution [i ][j ] = messages
113126 if len (all_evals_per_problem_per_solution [i ][j ]) == n :
114- calculate_grading_results (problem , autograder_dir ,
115- all_evals_per_problem_per_solution [i ][j ], marking_schemas [i ],
116- i , j , grader_model_name = grader_config ['model' ].split ('/' )[- 1 ])
127+ calculate_grading_results (
128+ problem ,
129+ autograder_dir ,
130+ all_evals_per_problem_per_solution [i ][j ],
131+ marking_schemas [i ],
132+ i ,
133+ j ,
134+ grader_model_name = grader_config ["model" ].split ("/" )[- 1 ],
135+ )
117136 continue
118137 for _ , message in enumerate (messages ):
119138 problem_statement = problem ["problem" ]
120139 grading_prompt = prompt_template .format (
121- problem_statement = problem_statement ,
122- marking_schema = marking_schema ,
123- correct_solution = problem [' sample_solution' ],
124- example_grading = problem [' sample_grading' ],
125- solution = message if skip_existing and os .path .exists (auto_grading_file ) else message [- 1 ]["content" ]
140+ problem_statement = problem_statement ,
141+ marking_schema = marking_schema ,
142+ correct_solution = problem [" sample_solution" ],
143+ example_grading = problem [" sample_grading" ],
144+ solution = message if skip_existing and os .path .exists (auto_grading_file ) else message [- 1 ]["content" ],
126145 )
127146 batch_idx_to_problem_idx [len (batch_prompts )] = (i , j )
128147 batch_prompts .append ((grading_prompt , None ))
@@ -131,11 +150,7 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
131150
132151 if len (batch_prompts ) == 0 :
133152 return
134- api = APIQuery (
135- model = model ,
136- api = api ,
137- ** kwargs
138- )
153+ api = APIQuery (model = model , api = api , ** kwargs )
139154
140155 cot_solver = CoTSolver (
141156 querier = api ,
@@ -144,25 +159,33 @@ def run_grader(grader_config, solver_config_path, competition, skip_existing=Fal
144159 for idx , messages , _ in cot_solver .solve (batch_prompts ):
145160 problem_idx , grader_idx = batch_idx_to_problem_idx [idx ]
146161 problem = problems [problem_idx ]
147- all_evals_per_problem_per_solution [problem_idx ][grader_idx ].append (messages [- 1 ][' content' ])
162+ all_evals_per_problem_per_solution [problem_idx ][grader_idx ].append (messages [- 1 ][" content" ])
148163 # check if the whole problem is finished
149164 if len (all_evals_per_problem_per_solution [problem_idx ][grader_idx ]) == n :
150- calculate_grading_results (problem , autograder_dir ,
151- all_evals_per_problem_per_solution [problem_idx ][grader_idx ], marking_schemas [problem_idx ],
152- problem_idx , grader_idx , grader_model_name = grader_config ['model' ].split ('/' )[- 1 ])
153-
154- def calculate_grading_results (problem , output_dir , gradings_per_solution , marking_schema ,
155- problem_idx , grader_idx , grader_model_name ):
165+ calculate_grading_results (
166+ problem ,
167+ autograder_dir ,
168+ all_evals_per_problem_per_solution [problem_idx ][grader_idx ],
169+ marking_schemas [problem_idx ],
170+ problem_idx ,
171+ grader_idx ,
172+ grader_model_name = grader_config ["model" ].split ("/" )[- 1 ],
173+ )
174+
175+
176+ def calculate_grading_results (
177+ problem , output_dir , gradings_per_solution , marking_schema , problem_idx , grader_idx , grader_model_name
178+ ):
156179 problem_id = problem ["problem_idx" ]
157180 anon_id = problem ["anon_id" ]
158-
181+
159182 output_file = os .path .join (output_dir , f"{ problem_id } /{ anon_id } _{ grader_model_name } -{ grader_idx } .json" )
160- os .makedirs (f' { output_dir } /{ problem_id } ' , exist_ok = True )
183+ os .makedirs (f" { output_dir } /{ problem_id } " , exist_ok = True )
161184
162185 outputs = [{} for _ in gradings_per_solution ]
163186
164187 for i , message in enumerate (gradings_per_solution ):
165- outputs [i ][' raw' ] = message
188+ outputs [i ][" raw" ] = message
166189 warning = WarningType .NONE
167190 parsed_grading = {}
168191 try :
@@ -172,36 +195,40 @@ def calculate_grading_results(problem, output_dir, gradings_per_solution, markin
172195 parsed_grading = json5 .loads (clean_string_to_json (message ), strict = False )
173196 except Exception :
174197 parsed_grading = parse_grading (message )
175- if not ' points' in parsed_grading :
176- logger .error (f' Final points were not generated for grader { grader_idx } of { problem_idx } :\n { message } ' )
177- warning = max (warning ,WarningType .MAJOR )
178- if not ' details' in parsed_grading :
179- if not ' scheme' in parsed_grading :
180- logger .error (f' Not scoring details found for grader { grader_idx } of { problem_idx } :\n { message } ' )
181- warning = max (warning ,WarningType .MAJOR )
198+ if not " points" in parsed_grading :
199+ logger .error (f" Final points were not generated for grader { grader_idx } of { problem_idx } :\n { message } " )
200+ warning = max (warning , WarningType .MAJOR )
201+ if not " details" in parsed_grading :
202+ if not " scheme" in parsed_grading :
203+ logger .error (f" Not scoring details found for grader { grader_idx } of { problem_idx } :\n { message } " )
204+ warning = max (warning , WarningType .MAJOR )
182205 else :
183- parsed_grading [' details' ] = parsed_grading [' scheme' ]
184- elif len (parsed_grading [' details' ]) != len (marking_schema ):
185- logger .error (f' Mismatch between marking schema lengths' )
186- warning = max (warning ,WarningType .MAJOR )
206+ parsed_grading [" details" ] = parsed_grading [" scheme" ]
207+ elif len (parsed_grading [" details" ]) != len (marking_schema ):
208+ logger .error (f" Mismatch between marking schema lengths" )
209+ warning = max (warning , WarningType .MAJOR )
187210 else :
188- if anon_id == ' ecddbb' :
211+ if anon_id == " ecddbb" :
189212 breakpoint ()
190213 final_points = 0
191- for ( given , expected ) in zip (parsed_grading ["details" ], marking_schema ):
214+ for given , expected in zip (parsed_grading ["details" ], marking_schema ):
192215 if not similar (given ["title" ], expected ["title" ]):
193216 logger .error (f"Title mismatch: '{ given ['title' ]} ' vs '{ expected ['title' ]} '" )
194217 warning = max (warning , WarningType .MAJOR )
195218 elif given ["points" ] > expected ["points" ]:
196- logger .warning (f"Warning: Given points ({ given ['points' ]} ) exceed max allowed ({ expected ['points' ]} ) for category '{ given ['title' ]} '" )
219+ logger .warning (
220+ f"Warning: Given points ({ given ['points' ]} ) exceed max allowed ({ expected ['points' ]} ) for category '{ given ['title' ]} '"
221+ )
197222 warning = max (warning , WarningType .MINOR )
198223 given ["points" ] = expected ["points" ]
199224 elif given ["points" ] < 0 :
200- logger .warning (f"Warning: Given points ({ given ['points' ]} ) are negative for category '{ given ['title' ]} '" )
225+ logger .warning (
226+ f"Warning: Given points ({ given ['points' ]} ) are negative for category '{ given ['title' ]} '"
227+ )
201228 warning = max (warning , WarningType .MINOR )
202229 given ["points" ] = 0
203230
204- given ["title" ] = expected ["title" ]
231+ given ["title" ] = expected ["title" ]
205232 final_points += given ["points" ]
206233 parsed_grading ["points" ] = final_points
207234
@@ -211,17 +238,14 @@ def calculate_grading_results(problem, output_dir, gradings_per_solution, markin
211238 parsed_grading = {
212239 "points" : 0 ,
213240 "details" : [
214- {
215- "title" : item ['title' ],
216- "points" : 0 ,
217- "desc" : "The grading could not be parsed."
218- } for item in marking_schema
219- ]
241+ {"title" : item ["title" ], "points" : 0 , "desc" : "The grading could not be parsed." }
242+ for item in marking_schema
243+ ],
220244 }
221245
222- outputs [i ][' warning' ] = warning .value
246+ outputs [i ][" warning" ] = warning .value
223247 for k in parsed_grading :
224248 outputs [i ][k ] = parsed_grading [k ]
225-
249+
226250 with open (output_file , "w" ) as f :
227- json .dump (outputs , f )
251+ json .dump (outputs , f )
0 commit comments