Skip to content

Commit e8a938c

Browse files
committed
Remove testset handling from grade_problem tool
- Remove testset wrapping that was overriding scores when tests failed
- Simplify grading to just return what the grade function returns
- Test failures should be handled by the benchmark itself, not the server
- Fixes issue where test failures were incorrectly overriding valid scores
1 parent 51e2a93 commit e8a938c

File tree

1 file changed

+11
-92
lines changed

1 file changed

+11
-92
lines changed

src/tools/grade_problem.jl

Lines changed: 11 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
Grade Problem Tool for LLM Benchmark
33
"""
44

5-
import Test: DefaultTestSet, finish
6-
75
mutable struct GradeProblemTool <: ClaudeMCPTools.MCPTool
86
grade_fn::Function
97
working_dir::String
@@ -39,73 +37,13 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
3937
transcript = get(params, "transcript", "")
4038

4139
try
42-
# Create a custom testset for grading
43-
testset_name = isempty(problem_id) ? "grading" : "grading: $problem_id"
44-
ts = DefaultTestSet(testset_name; verbose=false)
45-
46-
# Variable to store the grading result
47-
result = nothing
48-
49-
# Save and disable TESTSET_PRINT_ENABLE to prevent duplicate output
50-
old_print_enable = Test.TESTSET_PRINT_ENABLE[]
51-
Test.TESTSET_PRINT_ENABLE[] = false
52-
53-
# Push testset to capture test results (Test module won't print when inside a testset)
54-
Test.push_testset(ts)
55-
try
56-
# Use LLMBENCH_WORKSPACE if set, otherwise use working_dir
57-
workspace = get(ENV, "LLMBENCH_WORKSPACE", tool.working_dir)
58-
59-
# Call the grade function with all arguments
60-
# Use invokelatest to handle world age issues when loading modules dynamically
61-
# Always pass all three parameters - the function has a default value for problem_id
62-
result = Base.invokelatest(tool.grade_fn, workspace, transcript, problem_id)
63-
finally
64-
Test.pop_testset()
65-
# Restore TESTSET_PRINT_ENABLE
66-
Test.TESTSET_PRINT_ENABLE[] = old_print_enable
67-
end
68-
69-
# Capture the testset output using redirect_stdout
70-
# Create a Pipe for capturing stdout
71-
old_stdout = stdout
72-
rd, wr = redirect_stdout()
73-
74-
try
75-
Test.finish(ts)
76-
catch e
77-
# finish throws an error if tests fail, but we still want the output
78-
end
79-
80-
# Check if any tests failed and print errors (while still redirected)
81-
# This ensures the errors are captured in the output
82-
function has_failures(testset)
83-
for r in testset.results
84-
if isa(r, Test.Fail) || isa(r, Test.Error)
85-
return true
86-
elseif isa(r, DefaultTestSet)
87-
if has_failures(r)
88-
return true
89-
end
90-
end
91-
end
92-
return false
93-
end
94-
95-
if has_failures(ts)
96-
Test.print_test_errors(ts)
97-
end
98-
99-
# Restore stdout and close the write end
100-
redirect_stdout(old_stdout)
101-
close(wr)
102-
103-
# Read the captured output
104-
test_output_str = read(rd, String)
105-
close(rd)
40+
# Use LLMBENCH_WORKSPACE if set, otherwise use working_dir
41+
workspace = get(ENV, "LLMBENCH_WORKSPACE", tool.working_dir)
10642

107-
# Check if any tests failed (reuse the has_failures function defined above)
108-
has_test_failures = has_failures(ts)
43+
# Call the grade function with all arguments
44+
# Use invokelatest to handle world age issues when loading modules dynamically
45+
# Always pass all three parameters - the function has a default value for problem_id
46+
result = Base.invokelatest(tool.grade_fn, workspace, transcript, problem_id)
10947

11048
# Debug: Print the result type
11149
@debug "Grade function returned: $(typeof(result))"
@@ -140,21 +78,6 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
14078
result["score"] = total
14179
end
14280

143-
# Override score to 0 if any tests failed
144-
if has_test_failures
145-
result["score"] = 0.0
146-
# Also set all subscores to 0
147-
for key in keys(result["subscores"])
148-
result["subscores"][key] = 0.0
149-
end
150-
end
151-
152-
# Add test output to metadata
153-
if !haskey(result, "metadata")
154-
result["metadata"] = Dict{String,Any}()
155-
end
156-
result["metadata"]["test_output"] = test_output_str
157-
15881
return Dict(
15982
"content" => [Dict(
16083
"type" => "text",
@@ -165,13 +88,11 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
16588

16689
elseif isa(result, Number)
16790
# Simple numeric score
168-
# Override to 0 if any tests failed
169-
final_score = has_test_failures ? 0.0 : Float64(result)
91+
final_score = Float64(result)
17092
grading_result = Dict(
17193
"subscores" => Dict("total" => final_score),
17294
"weights" => Dict("total" => 1.0),
173-
"score" => final_score,
174-
"metadata" => Dict("test_output" => test_output_str)
95+
"score" => final_score
17596
)
17697

17798
return Dict(
@@ -184,13 +105,12 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
184105

185106
else
186107
# Convert to string and return as details
187-
# Score is always 0 for non-numeric results or if tests failed
108+
# Score is 0 for non-numeric/non-dict results
188109
grading_result = Dict(
189110
"subscores" => Dict("completion" => 0.0),
190111
"weights" => Dict("completion" => 1.0),
191112
"score" => 0.0,
192-
"details" => string(result),
193-
"metadata" => Dict("test_output" => test_output_str)
113+
"details" => string(result)
194114
)
195115

196116
return Dict(
@@ -216,8 +136,7 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
216136
"subscores" => Dict("completion" => 0.0),
217137
"weights" => Dict("completion" => 1.0),
218138
"score" => 0.0,
219-
"error" => error_msg,
220-
"metadata" => Dict("test_output" => "Test execution failed: grading function threw an exception")
139+
"error" => error_msg
221140
)
222141

223142
return Dict(

0 commit comments

Comments (0)