Skip to content

Commit ca05832

Browse files
Kenoclaude
andcommitted
Add auto mode support for LLMBenchMCPServer
- Support 'auto' as module specification to auto-detect from problem_id - Extract module name from problem_id prefix (ModuleName-problem_id) - Dynamically load modules using Base.require in current active project - Create wrapper functions for setup_problem and grade in auto mode - Update help text and examples to document auto mode usage - Add comprehensive tests for auto mode functionality 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent e5cf103 commit ca05832

File tree

2 files changed

+302
-48
lines changed

2 files changed

+302
-48
lines changed

src/server.jl

Lines changed: 145 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ function (@main)(args)
272272
273273
Arguments:
274274
ModuleName Name of the module containing setup_problem and grade functions
275+
Use "auto" to auto-detect module from problem_id prefix
275276
276277
Options:
277278
--workspace PATH Working directory (default: current directory)
@@ -294,6 +295,7 @@ function (@main)(args)
294295
295296
Examples:
296297
julia --project -m LLMBenchMCPServer MyBenchmark
298+
julia --project -m LLMBenchMCPServer auto # Auto-detect module from problem_id
297299
julia --project -m LLMBenchMCPServer MyBenchmark --socket
298300
julia --project -m LLMBenchMCPServer MyBenchmark --direct # Run without sandbox
299301
julia --project -m LLMBenchMCPServer MyBenchmark --bash-uid 1000
@@ -313,6 +315,7 @@ function (@main)(args)
313315
direct_mode = false # New flag for direct execution
314316
bash_uid = nothing # UID for bash session execution
315317
bash_env = Dict{String,String}() # Environment variables for bash
318+
auto_mode = (module_name == "auto") # Check if we're in auto-detect mode
316319

317320
i = 2
318321
while i <= length(args)
@@ -395,40 +398,156 @@ function (@main)(args)
395398
mkpath(working_dir)
396399
end
397400

398-
# Load the module
401+
# Handle module loading based on mode
399402
try
400-
# Try to load as a module first, then as a file
401403
mod = nothing
402-
mod_symbol = Symbol(module_name)
404+
setup_fn = nothing
405+
grade_fn = nothing
403406

404-
# First try to load as a registered package/module
405-
try
406-
mod = Base.require(Main, mod_symbol)
407-
catch
408-
# If that fails, try to load as a local file
409-
if endswith(module_name, ".jl")
410-
# Load file directly
411-
Base.include(Main, module_name)
412-
# Extract module name from file
413-
file_mod_name = basename(module_name)[1:end-3] # Remove .jl
414-
mod_symbol = Symbol(file_mod_name)
415-
if isdefined(Main, mod_symbol)
416-
mod = getfield(Main, mod_symbol)
407+
if auto_mode
408+
# In auto mode, create wrapper functions that dynamically load modules
409+
if verbose
410+
println("Auto mode enabled - modules will be loaded based on problem_id prefix")
411+
end
412+
413+
# Create a wrapper function for setup_problem that auto-detects the module
414+
function auto_setup_problem(workdir::String, problem_id::String="")
415+
if isempty(problem_id)
416+
return "Error: problem_id is required in auto mode. Format: ModuleName-problem_id"
417+
end
418+
419+
# Extract module name from problem_id
420+
parts = split(problem_id, "-", limit=2)
421+
if length(parts) < 2
422+
return "Error: Invalid problem_id format. Expected: ModuleName-problem_id, got: $problem_id"
423+
end
424+
425+
mod_name = parts[1]
426+
clean_problem_id = parts[2]
427+
428+
# Try to load the module
429+
try
430+
mod_symbol = Symbol(mod_name)
431+
target_mod = Base.require(Main, mod_symbol)
432+
433+
# Check if the module has setup_problem
434+
if !isdefined(target_mod, :setup_problem)
435+
return "Error: Module $mod_name does not export setup_problem function"
436+
end
437+
438+
# Call the module's setup_problem with the clean problem_id
439+
setup_fn = getfield(target_mod, :setup_problem)
440+
return Base.invokelatest(setup_fn, workdir, clean_problem_id)
441+
catch e
442+
io = IOBuffer()
443+
showerror(io, e, catch_backtrace())
444+
return "Error loading module $mod_name: " * String(take!(io))
445+
end
446+
end
447+
448+
# Create a wrapper function for grade that auto-detects the module
449+
function auto_grade(workdir::String, transcript::String, problem_id::String="")
450+
if isempty(problem_id)
451+
return Dict(
452+
"score" => 0.0,
453+
"metadata" => Dict("error" => "Error: problem_id is required in auto mode. Format: ModuleName-problem_id")
454+
)
455+
end
456+
457+
# Extract module name from problem_id
458+
parts = split(problem_id, "-", limit=2)
459+
if length(parts) < 2
460+
return Dict(
461+
"score" => 0.0,
462+
"metadata" => Dict("error" => "Error: Invalid problem_id format. Expected: ModuleName-problem_id, got: $problem_id")
463+
)
464+
end
465+
466+
mod_name = parts[1]
467+
clean_problem_id = parts[2]
468+
469+
# Try to load the module
470+
try
471+
mod_symbol = Symbol(mod_name)
472+
target_mod = Base.require(Main, mod_symbol)
473+
474+
# Check if the module has grade
475+
if !isdefined(target_mod, :grade)
476+
return Dict(
477+
"score" => 0.0,
478+
"metadata" => Dict("error" => "Error: Module $mod_name does not export grade function")
479+
)
480+
end
481+
482+
# Call the module's grade with the clean problem_id
483+
grade_fn = getfield(target_mod, :grade)
484+
return Base.invokelatest(grade_fn, workdir, transcript, clean_problem_id)
485+
catch e
486+
io = IOBuffer()
487+
showerror(io, e, catch_backtrace())
488+
return Dict(
489+
"score" => 0.0,
490+
"metadata" => Dict("error" => "Error loading module $mod_name: " * String(take!(io)))
491+
)
492+
end
493+
end
494+
495+
# Set the wrapper functions
496+
setup_fn = auto_setup_problem
497+
grade_fn = auto_grade
498+
499+
else
500+
# Normal mode - load the specified module
501+
mod_symbol = Symbol(module_name)
502+
503+
# First try to load as a registered package/module
504+
try
505+
mod = Base.require(Main, mod_symbol)
506+
catch
507+
# If that fails, try to load as a local file
508+
if endswith(module_name, ".jl")
509+
# Load file directly
510+
Base.include(Main, module_name)
511+
# Extract module name from file
512+
file_mod_name = basename(module_name)[1:end-3] # Remove .jl
513+
mod_symbol = Symbol(file_mod_name)
514+
if isdefined(Main, mod_symbol)
515+
mod = getfield(Main, mod_symbol)
516+
end
517+
elseif isfile(module_name * ".jl")
518+
# Try adding .jl extension
519+
Base.include(Main, module_name * ".jl")
520+
mod_symbol = Symbol(module_name)
521+
if isdefined(Main, mod_symbol)
522+
mod = getfield(Main, mod_symbol)
523+
end
524+
end
525+
end
526+
527+
if mod === nothing
528+
throw(ArgumentError("Could not load module $module_name"))
529+
end
530+
531+
# Extract functions from the loaded module
532+
if isdefined(mod, :setup_problem)
533+
setup_fn = getfield(mod, :setup_problem)
534+
if verbose
535+
println("Found setup_problem function in $module_name")
417536
end
418-
elseif isfile(module_name * ".jl")
419-
# Try adding .jl extension
420-
Base.include(Main, module_name * ".jl")
421-
mod_symbol = Symbol(module_name)
422-
if isdefined(Main, mod_symbol)
423-
mod = getfield(Main, mod_symbol)
537+
else
538+
println("Warning: No setup_problem function found in $module_name")
539+
end
540+
541+
if isdefined(mod, :grade)
542+
grade_fn = getfield(mod, :grade)
543+
if verbose
544+
println("Found grade function in $module_name")
424545
end
546+
else
547+
println("Warning: No grade function found in $module_name")
425548
end
426549
end
427550

428-
if mod === nothing
429-
throw(ArgumentError("Could not load module $module_name"))
430-
end
431-
432551
# Set environment variables for benchmark access
433552
# Set workspace directory
434553
ENV["LLMBENCH_WORKSPACE"] = working_dir
@@ -442,28 +561,6 @@ function (@main)(args)
442561
for (key, value) in bash_env
443562
ENV["LLMBENCH_BASH_ENV_$key"] = value
444563
end
445-
446-
# Extract functions
447-
setup_fn = nothing
448-
grade_fn = nothing
449-
450-
if isdefined(mod, :setup_problem)
451-
setup_fn = getfield(mod, :setup_problem)
452-
if verbose
453-
println("Found setup_problem function in $module_name")
454-
end
455-
else
456-
println("Warning: No setup_problem function found in $module_name")
457-
end
458-
459-
if isdefined(mod, :grade)
460-
grade_fn = getfield(mod, :grade)
461-
if verbose
462-
println("Found grade function in $module_name")
463-
end
464-
else
465-
println("Warning: No grade function found in $module_name")
466-
end
467564

468565
# Create and run the server
469566
server = LLMBenchServer(

test/test_auto_mode.jl

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
# Test auto mode functionality
2+
using Test
3+
using LLMBenchMCPServer
4+
using LLMBenchSimple
5+
using ClaudeMCPTools
6+
7+
# Create a test module
8+
module TestAutoModule
9+
using LLMBenchSimple
10+
11+
@bench "test_problem" promptval"What is 2+2?" == 4
12+
end
13+
14+
@testset "Auto Mode" begin
15+
@testset "Auto mode wrapper functions" begin
16+
# Create wrapper functions similar to what auto mode does
17+
function test_auto_setup_problem(workdir::String, problem_id::String="")
18+
if isempty(problem_id)
19+
return "Error: problem_id is required in auto mode. Format: ModuleName-problem_id"
20+
end
21+
22+
# Extract module name from problem_id
23+
parts = split(problem_id, "-", limit=2)
24+
if length(parts) < 2
25+
return "Error: Invalid problem_id format. Expected: ModuleName-problem_id, got: $problem_id"
26+
end
27+
28+
mod_name = parts[1]
29+
clean_problem_id = parts[2]
30+
31+
# Try to load the module
32+
try
33+
mod_symbol = Symbol(mod_name)
34+
target_mod = Base.require(Main, mod_symbol)
35+
36+
# Check if the module has setup_problem
37+
if !isdefined(target_mod, :setup_problem)
38+
return "Error: Module $mod_name does not export setup_problem function"
39+
end
40+
41+
# Call the module's setup_problem with the clean problem_id
42+
setup_fn = getfield(target_mod, :setup_problem)
43+
return Base.invokelatest(setup_fn, workdir, clean_problem_id)
44+
catch e
45+
io = IOBuffer()
46+
showerror(io, e, catch_backtrace())
47+
return "Error loading module $mod_name: " * String(take!(io))
48+
end
49+
end
50+
51+
function test_auto_grade(workdir::String, transcript::String, problem_id::String="")
52+
if isempty(problem_id)
53+
return Dict(
54+
"score" => 0.0,
55+
"metadata" => Dict("error" => "Error: problem_id is required in auto mode. Format: ModuleName-problem_id")
56+
)
57+
end
58+
59+
# Extract module name from problem_id
60+
parts = split(problem_id, "-", limit=2)
61+
if length(parts) < 2
62+
return Dict(
63+
"score" => 0.0,
64+
"metadata" => Dict("error" => "Error: Invalid problem_id format. Expected: ModuleName-problem_id, got: $problem_id")
65+
)
66+
end
67+
68+
mod_name = parts[1]
69+
clean_problem_id = parts[2]
70+
71+
# Try to load the module
72+
try
73+
mod_symbol = Symbol(mod_name)
74+
target_mod = Base.require(Main, mod_symbol)
75+
76+
# Check if the module has grade
77+
if !isdefined(target_mod, :grade)
78+
return Dict(
79+
"score" => 0.0,
80+
"metadata" => Dict("error" => "Error: Module $mod_name does not export grade function")
81+
)
82+
end
83+
84+
# Call the module's grade with the clean problem_id
85+
grade_fn = getfield(target_mod, :grade)
86+
return Base.invokelatest(grade_fn, workdir, transcript, clean_problem_id)
87+
catch e
88+
io = IOBuffer()
89+
showerror(io, e, catch_backtrace())
90+
return Dict(
91+
"score" => 0.0,
92+
"metadata" => Dict("error" => "Error loading module $mod_name: " * String(take!(io)))
93+
)
94+
end
95+
end
96+
97+
# Test auto_setup_problem
98+
mktempdir() do tmpdir
99+
# Test with proper format
100+
result = test_auto_setup_problem(tmpdir, "TestAutoModule-test_problem")
101+
@test occursin("What is 2+2?", result)
102+
@test occursin("<answer>", result)
103+
104+
# Test with missing problem_id
105+
result = test_auto_setup_problem(tmpdir, "")
106+
@test occursin("Error: problem_id is required", result)
107+
108+
# Test with invalid format
109+
result = test_auto_setup_problem(tmpdir, "invalid_format")
110+
@test occursin("Error: Invalid problem_id format", result)
111+
end
112+
113+
# Test auto_grade
114+
mktempdir() do tmpdir
115+
# Test with proper format and correct answer
116+
transcript = "<answer>4</answer>"
117+
result = test_auto_grade(tmpdir, transcript, "TestAutoModule-test_problem")
118+
@test result["score"] == 1.0
119+
120+
# Test with incorrect answer
121+
transcript = "<answer>5</answer>"
122+
result = test_auto_grade(tmpdir, transcript, "TestAutoModule-test_problem")
123+
@test result["score"] == 0.0
124+
125+
# Test with missing problem_id
126+
result = test_auto_grade(tmpdir, transcript, "")
127+
@test result["score"] == 0.0
128+
@test haskey(result, "metadata")
129+
@test occursin("Error: problem_id is required", result["metadata"]["error"])
130+
131+
# Test with invalid format
132+
result = test_auto_grade(tmpdir, transcript, "invalid_format")
133+
@test result["score"] == 0.0
134+
@test haskey(result, "metadata")
135+
@test occursin("Error: Invalid problem_id format", result["metadata"]["error"])
136+
137+
# Test with non-existent module
138+
result = test_auto_grade(tmpdir, transcript, "NonExistentModule-test")
139+
@test result["score"] == 0.0
140+
@test haskey(result, "metadata")
141+
@test occursin("Error loading module", result["metadata"]["error"])
142+
end
143+
end
144+
145+
@testset "Auto mode server creation" begin
146+
# Test that we can create a server in auto mode
147+
server = LLMBenchServer(
148+
name="auto-MCP",
149+
version="1.0.0",
150+
include_basic_tools=false
151+
)
152+
153+
# Should have no setup/grade tools initially since they are set via wrappers
154+
@test !haskey(server.tools, "setup_problem")
155+
@test !haskey(server.tools, "grade_problem")
156+
end
157+
end

0 commit comments

Comments
 (0)