22Grade Problem Tool for LLM Benchmark
33"""
44
5- import Test: DefaultTestSet, finish
5+ import Test: DefaultTestSet
66
77mutable struct GradeProblemTool <: ClaudeMCPTools.MCPTool
88 grade_fn:: Function
@@ -57,53 +57,63 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
5757 Test. pop_testset ()
5858 end
5959
60- # Format testset results
61- test_summary = Dict {String,Any} (
62- " description" => ts. description,
63- " passed" => ts. n_passed,
64- " failed" => count (r -> isa (r, Test. Fail), ts. results),
65- " errored" => count (r -> isa (r, Test. Error), ts. results),
66- " broken" => count (r -> isa (r, Test. Broken), ts. results),
67- " total" => ts. n_passed + length (ts. results)
68- )
60+ # Format the testset output as a string similar to how Test module would display it
61+ test_output = IOBuffer ()
62+
63+ # Write the summary line
64+ n_pass = ts. n_passed
65+ n_fail = count (r -> isa (r, Test. Fail), ts. results)
66+ n_error = count (r -> isa (r, Test. Error), ts. results)
67+ n_broken = count (r -> isa (r, Test. Broken), ts. results)
68+ n_total = n_pass + n_fail + n_error + n_broken
69+
70+ println (test_output, " Test Summary: | Pass Fail Error Broken Total" )
71+ println (test_output, " $(ts. description) | $(n_pass) $(n_fail) $(n_error) $(n_broken) $(n_total) " )
6972
70- # Collect details about failures
71- failures = []
72- for r in ts. results
73- if isa (r, Test. Fail)
74- push! (failures, Dict (
75- " type" => " fail" ,
76- " expression" => string (r. orig_expr),
77- " message" => r. data != = nothing ? string (r. data) : " "
78- ))
79- elseif isa (r, Test. Error)
80- push! (failures, Dict (
81- " type" => " error" ,
82- " expression" => string (r. orig_expr),
83- " message" => string (r. value)
84- ))
85- elseif isa (r, DefaultTestSet)
73+ # Add details about nested testsets and failures
74+ for result in ts. results
75+ if isa (result, DefaultTestSet)
8676 # Nested testset
87- nested_summary = Dict (
88- " description" => r. description,
89- " passed" => r. n_passed,
90- " failed" => count (x -> isa (x, Test. Fail), r. results),
91- " errored" => count (x -> isa (x, Test. Error), r. results)
92- )
93- push! (test_summary, " nested" => nested_summary)
77+ n_pass_nested = result. n_passed
78+ n_fail_nested = count (r -> isa (r, Test. Fail), result. results)
79+ n_error_nested = count (r -> isa (r, Test. Error), result. results)
80+ println (test_output, " $(result. description) | $(n_pass_nested) $(n_fail_nested) $(n_error_nested) " )
81+ elseif isa (result, Test. Fail)
82+ # Test failure details
83+ println (test_output, " \n Test Failed:" )
84+ println (test_output, " Expression: $(result. orig_expr) " )
85+ if result. data != = nothing
86+ println (test_output, " Evaluated: $(result. data) " )
87+ end
88+ elseif isa (result, Test. Error)
89+ # Test error details
90+ println (test_output, " \n Test Error:" )
91+ println (test_output, " Expression: $(result. orig_expr) " )
92+ println (test_output, " Exception: $(result. value) " )
9493 end
9594 end
9695
97- if ! isempty (failures)
98- test_summary[" failures" ] = failures
96+ test_output_str = String (take! (test_output))
97+
98+ # Check if any tests failed (including in nested testsets)
# Report whether `testset`, or any testset nested inside it (at any depth),
# holds at least one `Test.Fail` or `Test.Error` entry in its `results`.
# Other result kinds (e.g. `Test.Broken`) do not count as failures.
function has_failures(testset)
    return any(testset.results) do entry
        if entry isa DefaultTestSet
            # Nested testset: recurse, since its failures live in its own results.
            return has_failures(entry)
        end
        return entry isa Test.Fail || entry isa Test.Error
    end
end
100111
112+ has_test_failures = has_failures (ts)
113+
101114 # Debug: Print the result type
102115 @debug " Grade function returned: $(typeof (result)) "
103116
104- # Check if any tests failed
105- has_test_failures = test_summary[" failed" ] > 0 || test_summary[" errored" ] > 0
106-
107117 # The grade function should return a grading result
108118 # It could be a Dict with subscores, weights, and total score
109119 if isa (result, Dict)
@@ -143,8 +153,8 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
143153 end
144154 end
145155
146- # Add test results to the grading result
147- result[" test_results " ] = test_summary
156+ # Add test output to the grading result
157+ result[" test_output " ] = test_output_str
148158
149159 return Dict (
150160 " content" => [Dict (
@@ -162,7 +172,7 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
162172 " subscores" => Dict (" total" => final_score),
163173 " weights" => Dict (" total" => 1.0 ),
164174 " score" => final_score,
165- " test_results " => test_summary
175+ " test_output " => test_output_str
166176 )
167177
168178 return Dict (
@@ -181,7 +191,7 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
181191 " weights" => Dict (" completion" => 1.0 ),
182192 " score" => 0.0 ,
183193 " details" => string (result),
184- " test_results " => test_summary
194+ " test_output " => test_output_str
185195 )
186196
187197 return Dict (
@@ -202,24 +212,13 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
202212 # Also print to stderr for debugging
203213 @error " Grade problem failed" exception= (e, catch_backtrace ())
204214
205- # Create a test summary for the error case
206- error_test_summary = Dict {String,Any} (
207- " description" => isempty (problem_id) ? " grading" : " grading: $problem_id " ,
208- " passed" => 0 ,
209- " failed" => 0 ,
210- " errored" => 1 ,
211- " broken" => 0 ,
212- " total" => 1 ,
213- " error_message" => error_msg
214- )
215-
216215 # Return a failed grade with error
217216 grading_result = Dict (
218217 " subscores" => Dict (" completion" => 0.0 ),
219218 " weights" => Dict (" completion" => 1.0 ),
220219 " score" => 0.0 ,
221220 " error" => error_msg,
222- " test_results " => error_test_summary
221+ " test_output " => " Test execution failed: grading function threw an exception "
223222 )
224223
225224 return Dict (
0 commit comments