22Grade Problem Tool for LLM Benchmark
33"""
44
5- import Test: DefaultTestSet, finish
6-
75mutable struct GradeProblemTool <: ClaudeMCPTools.MCPTool
86 grade_fn:: Function
97 working_dir:: String
@@ -39,73 +37,13 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
3937 transcript = get (params, " transcript" , " " )
4038
4139 try
42- # Create a custom testset for grading
43- testset_name = isempty (problem_id) ? " grading" : " grading: $problem_id "
44- ts = DefaultTestSet (testset_name; verbose= false )
45-
46- # Variable to store the grading result
47- result = nothing
48-
49- # Save and disable TESTSET_PRINT_ENABLE to prevent duplicate output
50- old_print_enable = Test. TESTSET_PRINT_ENABLE[]
51- Test. TESTSET_PRINT_ENABLE[] = false
52-
53- # Push testset to capture test results (Test module won't print when inside a testset)
54- Test. push_testset (ts)
55- try
56- # Use LLMBENCH_WORKSPACE if set, otherwise use working_dir
57- workspace = get (ENV , " LLMBENCH_WORKSPACE" , tool. working_dir)
58-
59- # Call the grade function with all arguments
60- # Use invokelatest to handle world age issues when loading modules dynamically
61- # Always pass all three parameters - the function has a default value for problem_id
62- result = Base. invokelatest (tool. grade_fn, workspace, transcript, problem_id)
63- finally
64- Test. pop_testset ()
65- # Restore TESTSET_PRINT_ENABLE
66- Test. TESTSET_PRINT_ENABLE[] = old_print_enable
67- end
68-
69- # Capture the testset output using redirect_stdout
70- # Create a Pipe for capturing stdout
71- old_stdout = stdout
72- rd, wr = redirect_stdout ()
73-
74- try
75- Test. finish (ts)
76- catch e
77- # finish throws an error if tests fail, but we still want the output
78- end
79-
80- # Check if any tests failed and print errors (while still redirected)
81- # This ensures the errors are captured in the output
82- function has_failures (testset)
83- for r in testset. results
84- if isa (r, Test. Fail) || isa (r, Test. Error)
85- return true
86- elseif isa (r, DefaultTestSet)
87- if has_failures (r)
88- return true
89- end
90- end
91- end
92- return false
93- end
94-
95- if has_failures (ts)
96- Test. print_test_errors (ts)
97- end
98-
99- # Restore stdout and close the write end
100- redirect_stdout (old_stdout)
101- close (wr)
102-
103- # Read the captured output
104- test_output_str = read (rd, String)
105- close (rd)
40+ # Use LLMBENCH_WORKSPACE if set, otherwise use working_dir
41+ workspace = get (ENV , " LLMBENCH_WORKSPACE" , tool. working_dir)
10642
107- # Check if any tests failed (reuse the has_failures function defined above)
108- has_test_failures = has_failures (ts)
43+ # Call the grade function with all arguments
44+ # Use invokelatest to handle world age issues when loading modules dynamically
45+ # Always pass all three parameters - the function has a default value for problem_id
46+ result = Base. invokelatest (tool. grade_fn, workspace, transcript, problem_id)
10947
11048 # Debug: Print the result type
11149 @debug " Grade function returned: $(typeof (result)) "
@@ -140,21 +78,6 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
14078 result[" score" ] = total
14179 end
14280
143- # Override score to 0 if any tests failed
144- if has_test_failures
145- result[" score" ] = 0.0
146- # Also set all subscores to 0
147- for key in keys (result[" subscores" ])
148- result[" subscores" ][key] = 0.0
149- end
150- end
151-
152- # Add test output to metadata
153- if ! haskey (result, " metadata" )
154- result[" metadata" ] = Dict {String,Any} ()
155- end
156- result[" metadata" ][" test_output" ] = test_output_str
157-
15881 return Dict (
15982 " content" => [Dict (
16083 " type" => " text" ,
@@ -165,13 +88,11 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
16588
16689 elseif isa (result, Number)
16790 # Simple numeric score
168- # Override to 0 if any tests failed
169- final_score = has_test_failures ? 0.0 : Float64 (result)
91+ final_score = Float64 (result)
17092 grading_result = Dict (
17193 " subscores" => Dict (" total" => final_score),
17294 " weights" => Dict (" total" => 1.0 ),
173- " score" => final_score,
174- " metadata" => Dict (" test_output" => test_output_str)
95+ " score" => final_score
17596 )
17697
17798 return Dict (
@@ -184,13 +105,12 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
184105
185106 else
186107 # Convert to string and return as details
187- # Score is always 0 for non-numeric results or if tests failed
108+ # Score is 0 for non-numeric/non-dict results
188109 grading_result = Dict (
189110 " subscores" => Dict (" completion" => 0.0 ),
190111 " weights" => Dict (" completion" => 1.0 ),
191112 " score" => 0.0 ,
192- " details" => string (result),
193- " metadata" => Dict (" test_output" => test_output_str)
113+ " details" => string (result)
194114 )
195115
196116 return Dict (
@@ -216,8 +136,7 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
216136 " subscores" => Dict (" completion" => 0.0 ),
217137 " weights" => Dict (" completion" => 1.0 ),
218138 " score" => 0.0 ,
219- " error" => error_msg,
220- " metadata" => Dict (" test_output" => " Test execution failed: grading function threw an exception" )
139+ " error" => error_msg
221140 )
222141
223142 return Dict (
0 commit comments