Skip to content

Commit 98af43f

Browse files
committed
improve MCP tool descriptions and add evals
Enhanced tool descriptions with WHEN TO USE guidance, examples, and common paths. Added realistic evaluation scenarios.
1 parent 80a1034 commit 98af43f

File tree

6 files changed

+907
-19
lines changed

6 files changed

+907
-19
lines changed

.env.example

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,7 @@ NUXT_OAUTH_GITHUB_CLIENT_SECRET=
2626
# Session encryption password (generate a secure random string)
2727
# You can use: openssl rand -base64 32
2828
NUXT_SESSION_PASSWORD=
29+
30+
# MCP Evaluation (Optional - for running evalite tests)
31+
OPENAI_API_KEY=
32+
MCP_URL=

evalite.config.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import { defineConfig } from 'evalite/config'
2+
3+
// Shared Evalite runner settings for the MCP evaluation suite.
const evaliteConfig = defineConfig({
  // Cache AI SDK outputs
  cache: true,
  // Limit concurrent API calls
  maxConcurrency: 5,
  // 60s for MCP calls
  testTimeout: 60000,
})

export default evaliteConfig

mcp.eval.ts

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
import { experimental_createMCPClient as createMCPClient } from '@ai-sdk/mcp'
2+
import { openai } from '@ai-sdk/openai'
3+
import { generateText, stepCountIs } from 'ai'
4+
import { evalite } from 'evalite'
5+
import { wrapAISDKModel } from 'evalite/ai-sdk'
6+
import { toolCallAccuracy } from 'evalite/scorers'
7+
8+
/**
9+
* MCP Evaluation Tests
10+
*
11+
* Note: The MCP server has prompts (find_documentation_for_topic, deployment_guide, migration_help)
12+
* that would improve these scenarios, but @ai-sdk/mcp doesn't support converting prompts to tools yet.
13+
*
14+
* TODO: Once @ai-sdk/mcp supports prompt-to-tool conversion or prompt usage in generateText,
15+
* uncomment the tests below that require search/topic-based navigation.
16+
*
17+
* Related: https://ai-sdk.dev/docs/reference/ai-sdk-core/create-mcp-client
18+
*/
19+
20+
/**
 * Resolves the MCP endpoint used by the evals.
 *
 * `.env.example` ships `MCP_URL=` with no value, so a copied `.env` yields an
 * empty string rather than `undefined`. A plain `??` would keep that empty
 * string, so blank or whitespace-only values are treated as "not set" here.
 *
 * @param raw - Raw value of the `MCP_URL` environment variable, if any.
 * @returns The trimmed URL, or the local dev server endpoint when unset/blank.
 */
function resolveMcpUrl(raw: string | undefined): string {
  const trimmed = raw?.trim()
  return trimmed ? trimmed : 'http://localhost:3000/mcp'
}

const MCP_URL = resolveMcpUrl(process.env.MCP_URL)
21+
// OpenAI model used by every eval in this file, wrapped with Evalite's AI SDK wrapper.
const baseModel = openai('gpt-5.1-codex-mini')
const model = wrapAISDKModel(baseModel)
22+
23+
// Documentation tools: each case pairs a user prompt with the MCP tool call(s)
// the model is expected to emit.
evalite('Evaluate Nuxt MCP Documentation Tools', {
  data: async () => {
    return [
      // TODO: Uncomment when find_documentation_for_topic prompt becomes usable as a tool
      // {
      //   input: 'I keep getting hydration mismatch errors in my Nuxt app. Find the documentation that explains this.',
      //   expected: [{ toolName: 'find_documentation_for_topic', input: { topic: 'hydration errors' } }],
      // },
      // {
      //   input: 'What are the different rendering modes available in Nuxt 4 and which one should I use for SEO?',
      //   expected: [{ toolName: 'find_documentation_for_topic', input: { topic: 'rendering modes SSR SEO' } }],
      // },
      // {
      //   input: 'How do I migrate my composables from Nuxt 3 to Nuxt 4?',
      //   expected: [{ toolName: 'migration_help', input: { fromVersion: '3.x', toVersion: '4.x' } }],
      // },
      {
        input: 'Show me the introduction page for Nuxt 4',
        expected: [
          { toolName: 'get_documentation_page', input: { path: '/docs/4.x/getting-started/introduction' } },
        ],
      },
    ]
  },
  // Opens a fresh MCP client per case, exposes the server's tools to the model
  // for one generateText call, and returns whatever tool calls came back.
  task: async (input) => {
    const client = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const tools = await client.tools()
      const { toolCalls } = await generateText({ model, prompt: input, tools })
      return toolCalls ?? []
    }
    finally {
      // Always release the MCP connection, even if the model call throws.
      await client.close()
    }
  },
  // Scores the emitted tool calls against the expected ones.
  scorers: [
    async (scored) => toolCallAccuracy({ actualCalls: scored.output, expectedCalls: scored.expected }),
  ],
})
55+
56+
// Blog tools: prompts about announcements should list posts; a request for a
// specific post should fetch it by path.
evalite('Evaluate Nuxt MCP Blog Tools', {
  data: async () => {
    return [
      {
        input: 'What are the latest performance improvements announced for Nuxt?',
        expected: [{ toolName: 'list_blog_posts' }],
      },
      {
        input: 'Show me announcements about major version releases',
        expected: [{ toolName: 'list_blog_posts' }],
      },
      {
        input: 'Has there been any blog post about server components or islands architecture?',
        expected: [{ toolName: 'list_blog_posts' }],
      },
      {
        input: 'Get the blog post about Nuxt 4',
        expected: [{ toolName: 'get_blog_post', input: { path: '/blog/v4' } }],
      },
    ]
  },
  // Opens a fresh MCP client per case, exposes the server's tools to the model
  // for one generateText call, and returns whatever tool calls came back.
  task: async (input) => {
    const client = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const tools = await client.tools()
      const { toolCalls } = await generateText({ model, prompt: input, tools })
      return toolCalls ?? []
    }
    finally {
      // Always release the MCP connection, even if the model call throws.
      await client.close()
    }
  },
  // Scores the emitted tool calls against the expected ones.
  scorers: [
    async (scored) => toolCallAccuracy({ actualCalls: scored.output, expectedCalls: scored.expected }),
  ],
})
75+
76+
// Deploy tools: every active case expects the model to list deployment providers.
evalite('Evaluate Nuxt MCP Deploy Tools', {
  data: async () => {
    return [
      {
        input: 'I need a deployment platform that supports edge functions and has a generous free tier. What are my options?',
        expected: [{ toolName: 'list_deploy_providers' }],
      },
      {
        input: 'What deployment providers support Docker containerization?',
        expected: [{ toolName: 'list_deploy_providers' }],
      },
      {
        input: 'I want to self-host my Nuxt app with automatic SSL. Show me how to deploy to a VPS.',
        expected: [{ toolName: 'list_deploy_providers' }],
      },
      // TODO: Uncomment when deployment_guide prompt becomes usable as a tool
      // { input: 'How do I deploy to Vercel?', expected: [{ toolName: 'deployment_guide', input: { provider: 'Vercel' } }] },
    ]
  },
  // Opens a fresh MCP client per case, exposes the server's tools to the model
  // for one generateText call, and returns whatever tool calls came back.
  task: async (input) => {
    const client = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const tools = await client.tools()
      const { toolCalls } = await generateText({ model, prompt: input, tools })
      return toolCalls ?? []
    }
    finally {
      // Always release the MCP connection, even if the model call throws.
      await client.close()
    }
  },
  // Scores the emitted tool calls against the expected ones.
  scorers: [
    async (scored) => toolCallAccuracy({ actualCalls: scored.output, expectedCalls: scored.expected }),
  ],
})
96+
97+
// Module tools: category searches should call list_modules; requests about a
// specific module should (also) call get_module with its slug.
evalite('Evaluate Nuxt MCP Module Tools', {
  data: async () => [
    { input: 'I need to add authentication with social login providers to my app. Find me a suitable module.', expected: [{ toolName: 'list_modules', input: { category: 'authentication' } }] },
    { input: 'What modules are available for image optimization and lazy loading?', expected: [{ toolName: 'list_modules', input: { category: 'media' } }] },
    { input: 'Show me popular UI component libraries for Nuxt 4', expected: [{ toolName: 'list_modules', input: { category: 'ui' } }] },
    { input: 'I want to add i18n support for multiple languages. What module should I use and does it support Nuxt 4?', expected: [{ toolName: 'list_modules' }, { toolName: 'get_module', input: { slug: '@nuxtjs/i18n' } }] },
    { input: 'Get details about @nuxt/ui module', expected: [{ toolName: 'get_module', input: { slug: '@nuxt/ui' } }] },
  ],
  // Opens a fresh MCP client per case and lets the model run up to 3 tool-use
  // steps, returning the tool calls made across all of them.
  task: async (input) => {
    const mcpClient = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const result = await generateText({
        model,
        prompt: input,
        tools: await mcpClient.tools(),
        // `maxSteps` is no longer a generateText option in AI SDK 5+/6 beta;
        // multi-step tool use is enabled through a stop condition instead.
        stopWhen: stepCountIs(3),
      })
      // `result.toolCalls` only reflects the final step, which would drop
      // earlier calls (e.g. list_modules before get_module). Collect every step.
      return result.steps.flatMap(step => step.toolCalls)
    }
    finally {
      // Always release the MCP connection, even if the model call throws.
      await mcpClient.close()
    }
  },
  // Scores the emitted tool calls against the expected ones.
  scorers: [async ({ output, expected }) => toolCallAccuracy({ actualCalls: output, expectedCalls: expected })],
})
117+
118+
// Cross-tool workflows: each prompt is expected to trigger two different MCP
// tools, so the task must allow several steps and report calls from all of them.
evalite('Evaluate Nuxt MCP Cross-Tool Workflows', {
  data: async () => [
    { input: 'I want to build an e-commerce site with Nuxt 4. What modules do I need and where should I deploy it?', expected: [{ toolName: 'list_modules' }, { toolName: 'list_deploy_providers' }] },
    { input: 'Show me the latest features in Nuxt 4 and link to the relevant documentation', expected: [{ toolName: 'list_blog_posts' }, { toolName: 'get_documentation_page', input: { path: '/docs/4.x/getting-started/introduction' } }] },
  ],
  // Opens a fresh MCP client per case and lets the model run up to 5 tool-use
  // steps, returning the tool calls made across all of them.
  task: async (input) => {
    const mcpClient = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const result = await generateText({
        model,
        prompt: input,
        tools: await mcpClient.tools(),
        // `maxSteps` is no longer a generateText option in AI SDK 5+/6 beta;
        // multi-step tool use is enabled through a stop condition instead.
        stopWhen: stepCountIs(5),
      })
      // `result.toolCalls` only reflects the final step; gather the calls from
      // every step so sequential tool use is visible to the scorer.
      return result.steps.flatMap(step => step.toolCalls)
    }
    finally {
      // Always release the MCP connection, even if the model call throws.
      await mcpClient.close()
    }
  },
  // Scores the emitted tool calls against the expected ones.
  scorers: [async ({ output, expected }) => toolCallAccuracy({ actualCalls: output, expectedCalls: expected })],
})

package.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
"lint": "eslint . --cache",
1111
"typecheck": "nuxt typecheck",
1212
"test": "pnpm lint && pnpm typecheck",
13+
"eval": "evalite",
14+
"eval:ui": "evalite --ui",
1315
"db:generate": "nuxt hub database generate",
1416
"db:migrate": "nuxt hub database migrate"
1517
},
@@ -54,15 +56,19 @@
5456
"valibot": "1.1.0"
5557
},
5658
"devDependencies": {
59+
"@ai-sdk/mcp": "^0.0.8",
60+
"@ai-sdk/openai": "3.0.0-beta.60",
5761
"@iconify-json/vscode-icons": "^1.2.33",
5862
"@nuxt/eslint": "^1.9.0",
5963
"@nuxt/test-utils": "^3.20.1",
6064
"@nuxtjs/turnstile": "^1.1.1",
6165
"@testing-library/vue": "^8.1.0",
6266
"@types/youtube": "^0.1.2",
67+
"ai": "6.0.0-beta.99",
6368
"capture-website": "^5.0.1",
6469
"drizzle-kit": "^0.31.5",
6570
"eslint": "^9.38.0",
71+
"evalite": "1.0.0-beta.13",
6672
"nuxt-content-twoslash": "0.1.2",
6773
"shiki": "^3.14.0",
6874
"twoslash": "^0.3.4",

0 commit comments

Comments
 (0)