Skip to content

Commit 81377bb

Browse files
authored
[GRO-189] adding evals to mcp server, ci evals (#107)
* adding evals to mcp server, auto publish to npm, ci evals * rm publish, fix required deps * switch to npm for ci * change to npm update lockfile * upgrade to node 22 * prettier fix * write prettier on commit staged * remove pnpm * lower passing threshold * remove release * update ci and evals with main, update eval runner * lockfile * pnpm ci fix, fix evals unnecessary changes * sessionid * pnpm in ci * remove pnpm version * add chalk as a dev dep * pnpm lockfile for chalk
1 parent 340dd58 commit 81377bb

File tree

10 files changed

+4483
-1691
lines changed

10 files changed

+4483
-1691
lines changed

.github/workflows/ci.yml

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,46 @@
11
name: CI
22

33
on:
4-
push:
5-
branches: [main]
64
pull_request:
75
branches: [main]
6+
push:
7+
branches: [main]
88

99
jobs:
10-
lint:
10+
test:
11+
name: Test and Lint
1112
runs-on: ubuntu-latest
13+
1214
steps:
13-
- uses: actions/checkout@v4
15+
- name: Checkout code
16+
uses: actions/checkout@v4
17+
1418
- name: Install pnpm
1519
uses: pnpm/action-setup@v4
16-
- name: Use Node.js 22
20+
21+
- name: Setup Node.js
1722
uses: actions/setup-node@v4
1823
with:
1924
node-version: "22"
2025
cache: "pnpm"
26+
2127
- name: Install dependencies
2228
run: pnpm install --frozen-lockfile
23-
- run: pnpm run build
24-
- name: Run ESLint
25-
run: pnpm run lint
26-
- name: Ensure no changes
27-
run: git diff --exit-code
29+
30+
- name: Run linting
31+
run: pnpm lint
32+
33+
- name: Check formatting
34+
run: pnpm format
35+
36+
- name: Build project
37+
run: pnpm build
38+
39+
- name: Run evaluation tests
40+
env:
41+
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
42+
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
43+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
44+
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
45+
run: |
46+
pnpm evals

.github/workflows/release.yml

Lines changed: 0 additions & 44 deletions
This file was deleted.

.husky/pre-commit

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
#!/bin/sh
2+
pnpm install
3+
pnpm build
24
pnpm pre-commit

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ git clone https://github.com/browserbase/mcp-server-browserbase.git
105105
cd mcp-server-browserbase
106106

107107
# Install the dependencies and build the project
108-
pnpm install && pnpm build
108+
npm install && npm run build
109109
```
110110

111111
Then in your MCP Config JSON run the server. To run locally we can use STDIO or self-host SHTTP.

evals/mcp-eval-basic.config.json

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
{
2+
"passThreshold": 0.7,
3+
"server": {
4+
"transport": "stdio",
5+
"command": "node",
6+
"args": ["./cli.js"],
7+
"env": {
8+
"BROWSERBASE_API_KEY": "${BROWSERBASE_API_KEY}",
9+
"BROWSERBASE_PROJECT_ID": "${BROWSERBASE_PROJECT_ID}",
10+
"GEMINI_API_KEY": "${GEMINI_API_KEY}"
11+
}
12+
},
13+
"timeout": 180000,
14+
"llmJudge": false,
15+
"workflows": [
16+
{
17+
"name": "basic-navigation-test",
18+
"description": "Test basic browser navigation functionality",
19+
"steps": [
20+
{
21+
"user": "Create a browser session, navigate to https://example.com, and close the session",
22+
"expectedState": "closed"
23+
}
24+
],
25+
"expectTools": [
26+
"browserbase_session_create",
27+
"browserbase_stagehand_navigate",
28+
"browserbase_session_close"
29+
]
30+
},
31+
{
32+
"name": "search-and-extract-test",
33+
"description": "Test navigation, search interaction, and data extraction",
34+
"steps": [
35+
{
36+
"user": "Create a browser session, navigate to https://example.com, extract the page title, and close the session",
37+
"expectedState": "Example Domain"
38+
}
39+
],
40+
"expectTools": [
41+
"browserbase_session_create",
42+
"browserbase_stagehand_navigate",
43+
"browserbase_stagehand_extract",
44+
"browserbase_session_close"
45+
]
46+
},
47+
{
48+
"name": "observe-and-interact-test",
49+
"description": "Test element observation and interaction capabilities",
50+
"steps": [
51+
{
52+
"user": "Create a browser session, navigate to https://example.com, observe the page elements, and close the session",
53+
"expectedState": "closed"
54+
}
55+
],
56+
"expectTools": [
57+
"browserbase_session_create",
58+
"browserbase_stagehand_navigate",
59+
"browserbase_stagehand_observe",
60+
"browserbase_session_close"
61+
]
62+
},
63+
{
64+
"name": "screenshot-test",
65+
"description": "Test screenshot functionality",
66+
"steps": [
67+
{
68+
"user": "Create a browser session, navigate to https://example.com, take a screenshot, and close the session",
69+
"expectedState": "closed"
70+
}
71+
],
72+
"expectTools": [
73+
"browserbase_session_create",
74+
"browserbase_stagehand_navigate",
75+
"browserbase_screenshot",
76+
"browserbase_session_close"
77+
]
78+
},
79+
{
80+
"name": "multi-session-test",
81+
"description": "Test multi-session browser management",
82+
"steps": [
83+
{
84+
"user": "Create a multi-session browser named 'test-session', list all sessions, navigate to https://example.com in that session, and close the session",
85+
"expectedState": "closed"
86+
}
87+
],
88+
"expectTools": [
89+
"multi_browserbase_stagehand_session_create",
90+
"multi_browserbase_stagehand_session_list",
91+
"multi_browserbase_stagehand_navigate_session",
92+
"multi_browserbase_stagehand_session_close"
93+
]
94+
},
95+
{
96+
"name": "form-interaction-test",
97+
"description": "Test form filling and submission capabilities",
98+
"steps": [
99+
{
100+
"user": "Create a browser session, navigate to https://httpbin.org/forms/post, fill in the customer name field with 'TestUser', and close the session",
101+
"expectedState": "closed"
102+
}
103+
],
104+
"expectTools": [
105+
"browserbase_session_create",
106+
"browserbase_stagehand_navigate",
107+
"browserbase_stagehand_act",
108+
"browserbase_session_close"
109+
]
110+
},
111+
{
112+
"name": "error-handling-test",
113+
"description": "Test error handling for invalid operations",
114+
"steps": [
115+
{
116+
"user": "Create a browser session and try to navigate to an invalid URL like 'invalid-url-test'",
117+
"expectedState": "error"
118+
}
119+
],
120+
"expectTools": [
121+
"browserbase_session_create",
122+
"browserbase_stagehand_navigate"
123+
]
124+
},
125+
{
126+
"name": "url-retrieval-test",
127+
"description": "Test URL retrieval functionality",
128+
"steps": [
129+
{
130+
"user": "Create a browser session, navigate to https://example.com, get the current URL to verify navigation, and close the session",
131+
"expectedState": "https://example.com"
132+
}
133+
],
134+
"expectTools": [
135+
"browserbase_session_create",
136+
"browserbase_stagehand_navigate",
137+
"browserbase_stagehand_get_url",
138+
"browserbase_session_close"
139+
]
140+
}
141+
]
142+
}

evals/mcp-eval-minimal.config.json

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
{
2+
"passThreshold": 0.7,
3+
"server": {
4+
"transport": "stdio",
5+
"command": "node",
6+
"args": ["./cli.js"],
7+
"env": {
8+
"BROWSERBASE_API_KEY": "${BROWSERBASE_API_KEY}",
9+
"BROWSERBASE_PROJECT_ID": "${BROWSERBASE_PROJECT_ID}",
10+
"GEMINI_API_KEY": "${GEMINI_API_KEY}"
11+
}
12+
},
13+
"timeout": 60000,
14+
"llmJudge": false,
15+
"workflows": [
16+
{
17+
"name": "smoke-test-navigation",
18+
"description": "Quick test to verify basic navigation works",
19+
"steps": [
20+
{
21+
"user": "Open a browser and go to example.org",
22+
"expectedState": "session created"
23+
},
24+
{
25+
"user": "Close the browser",
26+
"expectedState": "closed successfully via Stagehand"
27+
}
28+
],
29+
"expectTools": [
30+
"browserbase_session_create",
31+
"browserbase_stagehand_navigate",
32+
"browserbase_session_close"
33+
]
34+
},
35+
{
36+
"name": "smoke-test-extraction",
37+
"description": "Quick test to verify data extraction works",
38+
"steps": [
39+
{
40+
"user": "Navigate to example.org and extract the page title",
41+
"expectedState": "Example Domain"
42+
},
43+
{
44+
"user": "Close the session",
45+
"expectedState": "closed successfully via Stagehand"
46+
}
47+
],
48+
"expectTools": [
49+
"browserbase_session_create",
50+
"browserbase_stagehand_navigate",
51+
"browserbase_stagehand_extract",
52+
"browserbase_session_close"
53+
]
54+
},
55+
{
56+
"name": "smoke-test-multi-session",
57+
"description": "Quick test to verify multi-session functionality",
58+
"steps": [
59+
{
60+
"user": "Create a browser session named 'test-session'",
61+
"expectedState": "session created"
62+
},
63+
{
64+
"user": "List active sessions",
65+
"expectedState": "test-session"
66+
},
67+
{
68+
"user": "Close the test session",
69+
"expectedState": "closed session"
70+
}
71+
],
72+
"expectTools": [
73+
"multi_browserbase_stagehand_session_create",
74+
"multi_browserbase_stagehand_session_list",
75+
"multi_browserbase_stagehand_session_close"
76+
]
77+
},
78+
{
79+
"name": "smoke-test-url-tools",
80+
"description": "Quick test to verify URL retrieval tools work",
81+
"steps": [
82+
{
83+
"user": "Create a browser session, navigate to example.org, get the current URL, and close the session",
84+
"expectedState": "example.org"
85+
}
86+
],
87+
"expectTools": [
88+
"browserbase_session_create",
89+
"browserbase_stagehand_navigate",
90+
"browserbase_stagehand_get_url",
91+
"browserbase_session_close"
92+
]
93+
}
94+
]
95+
}

0 commit comments

Comments
 (0)