Skip to content

Commit 889ddba

Browse files
committed
Fix LLMBenchSimple integration tests to use answer tags
- Update test transcripts to use <answer>...</answer> format
- Fix MCP grading tests to specify problem_id and test individually
- Remove extra end statement causing syntax error

The grade function expects answers wrapped in <answer> tags as per the LLMBenchSimple API. Tests now properly format answers and test grading of individual problems rather than expecting batch grading.
1 parent 35e591d commit 889ddba

File tree

1 file changed

+38
-8
lines changed

1 file changed

+38
-8
lines changed

test/test_llmbench_simple_integration.jl

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,24 @@ end # module
5252
@test !occursin("10 - 4", description)
5353

5454
# Test grading with correct answers
55-
result = Base.invokelatest(mod.grade, workdir, "8", "math1")
55+
result = Base.invokelatest(mod.grade, workdir, "<answer>8</answer>", "math1")
56+
if result["score"] != 1.0
57+
@info "Grade result for math1 with answer 8" result
58+
end
5659
@test result["score"] == 1.0
5760

58-
result = Base.invokelatest(mod.grade, workdir, "6", "math2")
61+
result = Base.invokelatest(mod.grade, workdir, "<answer>6</answer>", "math2")
62+
if result["score"] != 1.0
63+
@info "Grade result for math2 with answer 6" result
64+
end
5965
@test result["score"] == 1.0
6066

6167
# Test grading with incorrect answer
62-
result = Base.invokelatest(mod.grade, workdir, "7", "math1")
68+
result = Base.invokelatest(mod.grade, workdir, "<answer>7</answer>", "math1")
6369
@test result["score"] == 0.0
6470

6571
# Test grading with empty problem_id (should return error)
66-
result = Base.invokelatest(mod.grade, workdir, "8", "")
72+
result = Base.invokelatest(mod.grade, workdir, "<answer>8</answer>", "")
6773
@test result["score"] == 0.0
6874
@test occursin("problem_id is required", result["details"])
6975
end
@@ -102,25 +108,49 @@ end # module
102108
@test occursin("math1", response["result"]["content"][1]["text"])
103109
@test occursin("math2", response["result"]["content"][1]["text"])
104110

105-
# Test grading through MCP
111+
# Test grading through MCP for math1
106112
request = Dict(
107113
"jsonrpc" => "2.0",
108114
"id" => 2,
109115
"method" => "tools/call",
110116
"params" => Dict(
111117
"name" => "grade_problem",
112-
"arguments" => Dict("transcript" => "8") # Answer to first problem
118+
"arguments" => Dict(
119+
"transcript" => "<answer>8</answer>",
120+
"problem_id" => "math1"
121+
)
113122
)
114123
)
115124

116125
response = ClaudeMCPTools.handle_request(server, request)
117126
grade_result = JSON.parse(response["result"]["content"][1]["text"])
118127

119-
# Should have graded both problems
128+
# Should have graded math1 correctly
120129
@test haskey(grade_result, "subscores")
121130
@test grade_result["subscores"]["math1"] == 1.0 # Correct
131+
@test grade_result["score"] == 1.0
132+
133+
# Test grading through MCP for math2
134+
request = Dict(
135+
"jsonrpc" => "2.0",
136+
"id" => 3,
137+
"method" => "tools/call",
138+
"params" => Dict(
139+
"name" => "grade_problem",
140+
"arguments" => Dict(
141+
"transcript" => "<answer>5</answer>", # Wrong answer
142+
"problem_id" => "math2"
143+
)
144+
)
145+
)
146+
147+
response = ClaudeMCPTools.handle_request(server, request)
148+
grade_result = JSON.parse(response["result"]["content"][1]["text"])
149+
150+
# Should have graded math2 incorrectly
151+
@test haskey(grade_result, "subscores")
122152
@test grade_result["subscores"]["math2"] == 0.0 # Incorrect
153+
@test grade_result["score"] == 0.0
123154
end
124155
end
125-
end
126156
end

0 commit comments

Comments (0)