Skip to content

Commit b95d203

Browse files
committed
Public release
1 parent f67b255 commit b95d203

28 files changed

+2980
-230
lines changed

.github/workflows/CI.yml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Continuous integration: builds and tests the package for every entry in the
# version/os/arch matrix on pushes to the listed branches, on tags, and on
# pull requests.
name: CI
on:
  push:
    branches:
      - main
      - master
      - release-*
    tags: '*'
  pull_request:
jobs:
  test:
    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let every matrix entry run to completion even if another one fails.
      fail-fast: false
      matrix:
        version:
          - 'pre'
          - 'nightly'
        os:
          - ubuntu-latest
          - macOS-latest
        arch:
          - x64
    steps:
      - uses: actions/checkout@v4
      - uses: julia-actions/setup-julia@v2
        with:
          version: ${{ matrix.version }}
          arch: ${{ matrix.arch }}
      - uses: julia-actions/cache@v2
      - name: Setup SSH for private repos
        # The deploy keys are handed to the script through the environment
        # instead of being interpolated into the script text, so the shell never
        # re-interprets characters inside the secret values.
        env:
          CLAUDE_MCP_TOOLS_DEPLOY_KEY: ${{ secrets.CLAUDE_MCP_TOOLS_DEPLOY_KEY }}
          LLMBENCH_SIMPLE_DEPLOY_KEY: ${{ secrets.LLMBENCH_SIMPLE_DEPLOY_KEY }}
        run: |
          # Add GitHub to known hosts
          mkdir -p ~/.ssh
          ssh-keyscan github.com >> ~/.ssh/known_hosts

          # Write deploy keys to files (one key per private repository)
          printf '%s\n' "$CLAUDE_MCP_TOOLS_DEPLOY_KEY" > ~/.ssh/claude_mcp_tools_key
          chmod 600 ~/.ssh/claude_mcp_tools_key

          printf '%s\n' "$LLMBENCH_SIMPLE_DEPLOY_KEY" > ~/.ssh/llmbench_simple_key
          chmod 600 ~/.ssh/llmbench_simple_key

          # Configure SSH to use different keys for different repos: each host
          # alias resolves to github.com but offers only its own deploy key.
          cat >> ~/.ssh/config << 'EOF'
          Host github-claudemcp
            HostName github.com
            User git
            IdentityFile ~/.ssh/claude_mcp_tools_key
            IdentitiesOnly yes

          Host github-llmbench
            HostName github.com
            User git
            IdentityFile ~/.ssh/llmbench_simple_key
            IdentitiesOnly yes
          EOF
          chmod 600 ~/.ssh/config

          # Configure git to use the appropriate hosts: rewrite each repository
          # URL so that it goes through the matching host alias defined above.
          git config --global url."git@github-claudemcp:JuliaComputing/ClaudeMCPTools.jl".insteadOf "git@github.com:JuliaComputing/ClaudeMCPTools.jl"
          # NOTE(review): Project.toml's [sources] entry fetches ClaudeMCPTools
          # from the JuliaBench organisation, which the rule above does not
          # match. Confirm which organisation hosts the repository and that the
          # deploy key is registered there.
          git config --global url."git@github-claudemcp:JuliaBench/ClaudeMCPTools.jl".insteadOf "git@github.com:JuliaBench/ClaudeMCPTools.jl"
          git config --global url."git@github-llmbench:JuliaComputing/LLMBenchSimple.jl".insteadOf "git@github.com:JuliaComputing/LLMBenchSimple.jl"
      - uses: julia-actions/julia-buildpkg@v1
        with:
          # Presumably needed so Pkg fetches through the git CLI and therefore
          # honours the SSH/git configuration written above - TODO confirm.
          git_cli: true
      - uses: julia-actions/julia-runtest@v1
        with:
          force_latest_compatible_version: false
      - uses: julia-actions/julia-processcoverage@v1
      - uses: codecov/codecov-action@v4
        with:
          files: lcov.info
          token: ${{ secrets.CODECOV_TOKEN }}
          # A failed coverage upload must not fail the CI run.
          fail_ci_if_error: false

.github/workflows/CompatHelper.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Installs CompatHelper and runs CompatHelper.main() on a daily schedule
# (midnight, per the cron expression) or on manual dispatch.
name: CompatHelper
on:
  schedule:
    - cron: '0 0 * * *'
  workflow_dispatch:
# The job is granted write access to repository contents and pull requests.
permissions:
  contents: write
  pull-requests: write
jobs:
  CompatHelper:
    runs-on: ubuntu-latest
    steps:
      - name: Pkg.add("CompatHelper")
        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
      - name: CompatHelper.main()
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # The DOCUMENTER_KEY secret is reused as CompatHelper's private key.
          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
        run: julia -e 'using CompatHelper; CompatHelper.main()'

.github/workflows/TagBot.yml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Runs the JuliaRegistries/TagBot action when an issue comment is created or
# when the workflow is dispatched manually.
name: TagBot
on:
  issue_comment:
    types:
      - created
  workflow_dispatch:
    inputs:
      lookback:
        default: 3
# Everything is read-only except repository contents.
permissions:
  actions: read
  checks: read
  contents: write
  deployments: read
  discussions: read
  issues: read
  packages: read
  pages: read
  pull-requests: read
  repository-projects: read
  security-events: read
  statuses: read
jobs:
  TagBot:
    # Skip comment events unless the actor is JuliaTagBot; manual dispatches
    # always run.
    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
    runs-on: ubuntu-latest
    steps:
      - uses: JuliaRegistries/TagBot@v1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          ssh: ${{ secrets.DOCUMENTER_KEY }}

CLAUDE.md

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Development Guidelines for LLMBenchMCPServer.jl
2+
3+
## Important Notes
4+
5+
### Unix Domain Sockets
6+
- **Always use `ispath()` not `isfile()` to check for Unix domain sockets**
7+
- Unix domain sockets are not regular files, so `isfile()` returns false
8+
- This applies to checking SSH_AUTH_SOCK, server sockets, and any socket cleanup
9+
10+
### Environment Variables
11+
- **ANTHROPIC_API_KEY is automatically forwarded to the sandbox**
12+
- If set in the host environment, it will be available in the sandboxed process
13+
- This allows tools inside the sandbox to use the Anthropic API
14+
15+
## Shipping Code
16+
17+
When asked to "ship it" or after making changes:
18+
19+
1. Stage all changes: `git add -A`
20+
2. Create a descriptive commit message
21+
3. Run tests locally and make sure they pass: `julia --project=. -e 'using Pkg; Pkg.test()'`
22+
4. Push to the repository: `git push origin master`
23+
5. **IMPORTANT**: Monitor the GitHub Actions CI run
24+
- Use: `gh run list --repo JuliaComputing/LLMBenchMCPServer.jl --branch master --limit 1` to find the run
25+
- Use: `gh run watch <run-id> --repo JuliaComputing/LLMBenchMCPServer.jl --exit-status` to monitor it (can take up to 10 minutes)
26+
- If it fails, investigate and fix before considering the task complete
27+
28+
## Testing
29+
30+
Always run tests before pushing:
31+
```bash
32+
julia --project=. -e 'using Pkg; Pkg.test()'
33+
```
34+
35+
## Package Structure
36+
37+
- `src/LLMBenchMCPServer.jl` - Main module file
38+
- `src/server.jl` - Server configuration and creation
39+
- `src/tools/` - LLM benchmark-specific tools
40+
- `setup_problem.jl` - Problem setup tool
41+
- `grade_problem.jl` - Grading tool
42+
43+
## Dependencies
44+
45+
This package depends on:
46+
- `ClaudeMCPTools.jl` - Core MCP server implementation
47+
- `LLMBenchSimple.jl` (optional, for testing) - Simple benchmark definitions
48+
49+
## Creating Benchmark Modules
50+
51+
Benchmark modules should export:
52+
- `setup_problem(workdir::String)` - Returns problem description
53+
- `grade(workdir::String, transcript::String)` - Returns grading result
54+
55+
## Running as MCP Server
56+
57+
```julia
58+
using LLMBenchMCPServer
59+
60+
# With custom functions
61+
server = LLMBenchServer(
62+
setup_fn=my_setup,
63+
grade_fn=my_grade
64+
)
65+
66+
# Or with a module
67+
LLMBenchMCPServer.main("MyBenchmarkModule")
68+
```

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2025 The JuliaBench contributors
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

Manifest.toml

Lines changed: 0 additions & 78 deletions
This file was deleted.

Project.toml

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,81 @@ authors = ["JuliaComputing"]
55

66
# Hard dependencies, keyed by package name with the package UUID as the value.
[deps]
ClaudeMCPTools = "b9bb1685-6a70-41d7-9793-2f9fb633d966"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LLMBenchSimple = "a7b3c4d5-e6f7-8901-2345-678901234567"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Scratch = "6c6a2e73-6563-6170-7368-637461726353"
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
17+
18+
# Optional dependencies; the SandboxExt extension is triggered by Sandbox and
# BinaryBuilder2 from this list.
[weakdeps]
BinaryBuilder2 = "12aac903-9f7c-5d81-afc2-d9565ea332af"
BinaryBuilderAuditor = "53524979-5234-6e31-4220-6120654b694c"
BinaryBuilderGitUtils = "654d7472-7548-7361-4832-74694762694c"
BinaryBuilderPlatformExtensions = "213f2928-4d72-6f46-7461-4c705f634367"
BinaryBuilderProducts = "21737265-7a69-6e4f-4974-6375646f7270"
BinaryBuilderSources = "316c416d-4527-6863-7465-466137743047"
BinaryBuilderToolchains = "33566f4c-336c-3150-6d30-4374274e6143"
JLLGenerator = "73536572-7074-4e69-5270-206c4c6a2061"
KeywordArgumentExtraction = "45533465-3150-2073-4772-41774b207255"
MultiHashParsing = "73786568-6863-756d-6873-6168796e616d"
OutputCollectors = "6c11c7d4-943b-4e2b-80de-f2cfc2930a8c"
Sandbox = "9307e30f-c43e-9ca7-d17c-c2dc59df670d"
ScratchSpaceGarbageCollector = "73336863-5434-7263-5370-556e61336c43"
TreeArchival = "216c6a2e-6c61-7669-6863-726165657274"
33+
34+
# SandboxExt is declared with both Sandbox and BinaryBuilder2 as its triggers.
[extensions]
SandboxExt = ["Sandbox", "BinaryBuilder2"]
936

1037
# Where to fetch packages from, as a git URL plus revision (and a subdirectory
# for packages that live inside the BinaryBuilder2.jl monorepo).
[sources]
# Direct dependencies
# The first two are fetched over SSH; CI supplies deploy keys for them.
ClaudeMCPTools = {rev = "master", url = "git@github.com:JuliaBench/ClaudeMCPTools.jl.git"}
LLMBenchSimple = {rev = "master", url = "git@github.com:JuliaComputing/LLMBenchSimple.jl.git"}
Scratch = {rev = "master", url = "https://github.com/JuliaPackaging/Scratch.jl"}

# Weak dependencies for sandbox functionality (BB2 ecosystem)
BinaryBuilder2 = {rev = "kf/consolidated", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
BinaryBuilderAuditor = {rev = "kf/consolidated", subdir = "BinaryBuilderAuditor.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
BinaryBuilderGitUtils = {rev = "kf/consolidated", subdir = "BinaryBuilderGitUtils.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
BinaryBuilderPlatformExtensions = {rev = "kf/consolidated", subdir = "BinaryBuilderPlatformExtensions.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
BinaryBuilderProducts = {rev = "kf/consolidated", subdir = "BinaryBuilderProducts.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
BinaryBuilderSources = {rev = "kf/consolidated", subdir = "BinaryBuilderSources.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
BinaryBuilderToolchains = {rev = "kf/consolidated", subdir = "BinaryBuilderToolchains.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
JLLGenerator = {rev = "kf/consolidated", subdir = "JLLGenerator.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
KeywordArgumentExtraction = {rev = "kf/consolidated", subdir = "KeywordArgumentExtraction.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
MultiHashParsing = {rev = "kf/consolidated", subdir = "MultiHashParsing.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
OutputCollectors = {rev = "master", url = "https://github.com/JuliaPackaging/OutputCollectors.jl"}
Sandbox = {rev = "master", url = "https://github.com/JuliaPackaging/Sandbox.jl"}
ScratchSpaceGarbageCollector = {rev = "kf/consolidated", subdir = "ScratchSpaceGarbageCollector.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
TreeArchival = {rev = "kf/consolidated", subdir = "TreeArchival.jl", url = "https://github.com/JuliaPackaging/BinaryBuilder2.jl"}
58+
59+
# Version bounds for the dependencies that declare one.
[compat]
ClaudeMCPTools = "0.1.0"
Pkg = "1.11.0"
Scratch = "1.3.0"
TOML = "1.0.3"
Test = "1.11.0"
1265

1366
[extras]
# Apart from Test, these entries repeat the weak dependencies; they are kept
# for backwards compatibility with [sources] validation.
BinaryBuilder2 = "12aac903-9f7c-5d81-afc2-d9565ea332af"
BinaryBuilderAuditor = "53524979-5234-6e31-4220-6120654b694c"
BinaryBuilderGitUtils = "654d7472-7548-7361-4832-74694762694c"
BinaryBuilderPlatformExtensions = "213f2928-4d72-6f46-7461-4c705f634367"
BinaryBuilderProducts = "21737265-7a69-6e4f-4974-6375646f7270"
BinaryBuilderSources = "316c416d-4527-6863-7465-466137743047"
BinaryBuilderToolchains = "33566f4c-336c-3150-6d30-4374274e6143"
JLLGenerator = "73536572-7074-4e69-5270-206c4c6a2061"
KeywordArgumentExtraction = "45533465-3150-2073-4772-41774b207255"
MultiHashParsing = "73786568-6863-756d-6873-6168796e616d"
OutputCollectors = "6c11c7d4-943b-4e2b-80de-f2cfc2930a8c"
Sandbox = "9307e30f-c43e-9ca7-d17c-c2dc59df670d"
ScratchSpaceGarbageCollector = "73336863-5434-7263-5370-556e61336c43"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TreeArchival = "216c6a2e-6c61-7669-6863-726165657274"
1583

1684
# The test target pulls in only the Test package.
[targets]
test = ["Test"]

0 commit comments

Comments
 (0)