Skip to content

Commit aafd494

Browse files
committed
Replace structured test result parsing with raw testset output
- Changed from structured test result parsing to capturing the test output as a raw string
- Manually format testset results to resemble the Test module's default output
- Include test summaries, nested testsets, and failure details in the output
- Fixed `has_test_failures` to recursively check nested testsets for failures
- The `test_output` field now contains this manually formatted testset output, approximating what would appear in stdout
- This preserves context from `@testset let` blocks and provides a more familiar output format
1 parent b66eaf8 commit aafd494

File tree

1 file changed

+53
-54
lines changed

1 file changed

+53
-54
lines changed

src/tools/grade_problem.jl

Lines changed: 53 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Grade Problem Tool for LLM Benchmark
33
"""
44

5-
import Test: DefaultTestSet, finish
5+
import Test: DefaultTestSet
66

77
mutable struct GradeProblemTool <: ClaudeMCPTools.MCPTool
88
grade_fn::Function
@@ -57,53 +57,63 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
5757
Test.pop_testset()
5858
end
5959

60-
# Format testset results
61-
test_summary = Dict{String,Any}(
62-
"description" => ts.description,
63-
"passed" => ts.n_passed,
64-
"failed" => count(r -> isa(r, Test.Fail), ts.results),
65-
"errored" => count(r -> isa(r, Test.Error), ts.results),
66-
"broken" => count(r -> isa(r, Test.Broken), ts.results),
67-
"total" => ts.n_passed + length(ts.results)
68-
)
60+
# Format the testset output as a string similar to how Test module would display it
61+
test_output = IOBuffer()
62+
63+
# Write the summary line
64+
n_pass = ts.n_passed
65+
n_fail = count(r -> isa(r, Test.Fail), ts.results)
66+
n_error = count(r -> isa(r, Test.Error), ts.results)
67+
n_broken = count(r -> isa(r, Test.Broken), ts.results)
68+
n_total = n_pass + n_fail + n_error + n_broken
69+
70+
println(test_output, "Test Summary: | Pass Fail Error Broken Total")
71+
println(test_output, "$(ts.description) | $(n_pass) $(n_fail) $(n_error) $(n_broken) $(n_total)")
6972

70-
# Collect details about failures
71-
failures = []
72-
for r in ts.results
73-
if isa(r, Test.Fail)
74-
push!(failures, Dict(
75-
"type" => "fail",
76-
"expression" => string(r.orig_expr),
77-
"message" => r.data !== nothing ? string(r.data) : ""
78-
))
79-
elseif isa(r, Test.Error)
80-
push!(failures, Dict(
81-
"type" => "error",
82-
"expression" => string(r.orig_expr),
83-
"message" => string(r.value)
84-
))
85-
elseif isa(r, DefaultTestSet)
73+
# Add details about nested testsets and failures
74+
for result in ts.results
75+
if isa(result, DefaultTestSet)
8676
# Nested testset
87-
nested_summary = Dict(
88-
"description" => r.description,
89-
"passed" => r.n_passed,
90-
"failed" => count(x -> isa(x, Test.Fail), r.results),
91-
"errored" => count(x -> isa(x, Test.Error), r.results)
92-
)
93-
push!(test_summary, "nested" => nested_summary)
77+
n_pass_nested = result.n_passed
78+
n_fail_nested = count(r -> isa(r, Test.Fail), result.results)
79+
n_error_nested = count(r -> isa(r, Test.Error), result.results)
80+
println(test_output, " $(result.description) | $(n_pass_nested) $(n_fail_nested) $(n_error_nested)")
81+
elseif isa(result, Test.Fail)
82+
# Test failure details
83+
println(test_output, "\nTest Failed:")
84+
println(test_output, " Expression: $(result.orig_expr)")
85+
if result.data !== nothing
86+
println(test_output, " Evaluated: $(result.data)")
87+
end
88+
elseif isa(result, Test.Error)
89+
# Test error details
90+
println(test_output, "\nTest Error:")
91+
println(test_output, " Expression: $(result.orig_expr)")
92+
println(test_output, " Exception: $(result.value)")
9493
end
9594
end
9695

97-
if !isempty(failures)
98-
test_summary["failures"] = failures
96+
test_output_str = String(take!(test_output))
97+
98+
# Check if any tests failed (including in nested testsets)
99+
function has_failures(testset)
100+
for r in testset.results
101+
if isa(r, Test.Fail) || isa(r, Test.Error)
102+
return true
103+
elseif isa(r, DefaultTestSet)
104+
if has_failures(r)
105+
return true
106+
end
107+
end
108+
end
109+
return false
99110
end
100111

112+
has_test_failures = has_failures(ts)
113+
101114
# Debug: Print the result type
102115
@debug "Grade function returned: $(typeof(result))"
103116

104-
# Check if any tests failed
105-
has_test_failures = test_summary["failed"] > 0 || test_summary["errored"] > 0
106-
107117
# The grade function should return a grading result
108118
# It could be a Dict with subscores, weights, and total score
109119
if isa(result, Dict)
@@ -143,8 +153,8 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
143153
end
144154
end
145155

146-
# Add test results to the grading result
147-
result["test_results"] = test_summary
156+
# Add test output to the grading result
157+
result["test_output"] = test_output_str
148158

149159
return Dict(
150160
"content" => [Dict(
@@ -162,7 +172,7 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
162172
"subscores" => Dict("total" => final_score),
163173
"weights" => Dict("total" => 1.0),
164174
"score" => final_score,
165-
"test_results" => test_summary
175+
"test_output" => test_output_str
166176
)
167177

168178
return Dict(
@@ -181,7 +191,7 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
181191
"weights" => Dict("completion" => 1.0),
182192
"score" => 0.0,
183193
"details" => string(result),
184-
"test_results" => test_summary
194+
"test_output" => test_output_str
185195
)
186196

187197
return Dict(
@@ -202,24 +212,13 @@ function ClaudeMCPTools.execute(tool::GradeProblemTool, params::Dict)
202212
# Also print to stderr for debugging
203213
@error "Grade problem failed" exception=(e, catch_backtrace())
204214

205-
# Create a test summary for the error case
206-
error_test_summary = Dict{String,Any}(
207-
"description" => isempty(problem_id) ? "grading" : "grading: $problem_id",
208-
"passed" => 0,
209-
"failed" => 0,
210-
"errored" => 1,
211-
"broken" => 0,
212-
"total" => 1,
213-
"error_message" => error_msg
214-
)
215-
216215
# Return a failed grade with error
217216
grading_result = Dict(
218217
"subscores" => Dict("completion" => 0.0),
219218
"weights" => Dict("completion" => 1.0),
220219
"score" => 0.0,
221220
"error" => error_msg,
222-
"test_results" => error_test_summary
221+
"test_output" => "Test execution failed: grading function threw an exception"
223222
)
224223

225224
return Dict(

0 commit comments

Comments
 (0)