|
| 1 | +import { experimental_createMCPClient as createMCPClient } from '@ai-sdk/mcp' |
| 2 | +import { openai } from '@ai-sdk/openai' |
| 3 | +import { generateText } from 'ai' |
| 4 | +import { evalite } from 'evalite' |
| 5 | +import { wrapAISDKModel } from 'evalite/ai-sdk' |
| 6 | +import { toolCallAccuracy } from 'evalite/scorers' |
| 7 | + |
| 8 | +/** |
| 9 | + * MCP Evaluation Tests |
| 10 | + * |
| 11 | + * Note: The MCP server has prompts (find_documentation_for_topic, deployment_guide, migration_help) |
| 12 | + * that would improve these scenarios, but @ai-sdk/mcp doesn't support converting prompts to tools yet. |
| 13 | + * |
| 14 | + * TODO: Once @ai-sdk/mcp supports prompt-to-tool conversion or prompt usage in generateText, |
| 15 | + * uncomment the tests below that require search/topic-based navigation. |
| 16 | + * |
| 17 | + * Related: https://ai-sdk.dev/docs/reference/ai-sdk-core/create-mcp-client |
| 18 | + */ |
| 19 | + |
// Endpoint of the MCP server under evaluation. Set the MCP_URL environment
// variable to override the local default.
const { MCP_URL = 'http://localhost:3000/mcp' } = process.env
// Model shared by every eval in this file, passed through evalite's AI SDK wrapper.
const model = wrapAISDKModel(openai('gpt-5.1-codex-mini'))
| 22 | + |
/**
 * Eval: documentation tools.
 *
 * For each case the task connects a fresh MCP client to MCP_URL, lets the model
 * answer the prompt with the server's tools available, and returns the tool calls
 * reported on the result. The scorer hands those calls and the expected calls to
 * toolCallAccuracy.
 */
evalite('Evaluate Nuxt MCP Documentation Tools', {
  data: async () => {
    // TODO: Uncomment when find_documentation_for_topic prompt becomes usable as a tool
    // {
    //   input: 'I keep getting hydration mismatch errors in my Nuxt app. Find the documentation that explains this.',
    //   expected: [{ toolName: 'find_documentation_for_topic', input: { topic: 'hydration errors' } }],
    // },
    // {
    //   input: 'What are the different rendering modes available in Nuxt 4 and which one should I use for SEO?',
    //   expected: [{ toolName: 'find_documentation_for_topic', input: { topic: 'rendering modes SSR SEO' } }],
    // },
    // {
    //   input: 'How do I migrate my composables from Nuxt 3 to Nuxt 4?',
    //   expected: [{ toolName: 'migration_help', input: { fromVersion: '3.x', toVersion: '4.x' } }],
    // },
    const introductionPage = {
      input: 'Show me the introduction page for Nuxt 4',
      expected: [{ toolName: 'get_documentation_page', input: { path: '/docs/4.x/getting-started/introduction' } }],
    }
    return [introductionPage]
  },
  task: async (input) => {
    const client = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const tools = await client.tools()
      const { toolCalls } = await generateText({ model, prompt: input, tools })
      return toolCalls ?? []
    }
    finally {
      // Close the client whether or not generation succeeded.
      await client.close()
    }
  },
  scorers: [
    async ({ output, expected }) => {
      return toolCallAccuracy({ actualCalls: output, expectedCalls: expected })
    },
  ],
})
| 55 | + |
/**
 * Eval: blog tools.
 *
 * Each case pairs a user prompt with the tool call(s) expected for it. The task
 * opens an MCP client against MCP_URL, runs the prompt with the server's tools,
 * and returns the tool calls reported on the result; the scorer passes them to
 * toolCallAccuracy together with the expected calls.
 */
evalite('Evaluate Nuxt MCP Blog Tools', {
  data: async () => [
    {
      input: 'What are the latest performance improvements announced for Nuxt?',
      expected: [{ toolName: 'list_blog_posts' }],
    },
    {
      input: 'Show me announcements about major version releases',
      expected: [{ toolName: 'list_blog_posts' }],
    },
    {
      input: 'Has there been any blog post about server components or islands architecture?',
      expected: [{ toolName: 'list_blog_posts' }],
    },
    {
      input: 'Get the blog post about Nuxt 4',
      expected: [{ toolName: 'get_blog_post', input: { path: '/blog/v4' } }],
    },
  ],
  task: async (input) => {
    const client = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const tools = await client.tools()
      const { toolCalls } = await generateText({ model, prompt: input, tools })
      return toolCalls ?? []
    }
    finally {
      // Close the client whether or not generation succeeded.
      await client.close()
    }
  },
  scorers: [
    async ({ output, expected }) => {
      return toolCallAccuracy({ actualCalls: output, expectedCalls: expected })
    },
  ],
})
| 75 | + |
/**
 * Eval: deploy tools.
 *
 * Each case pairs a user prompt with the tool call(s) expected for it. The task
 * opens an MCP client against MCP_URL, runs the prompt with the server's tools,
 * and returns the tool calls reported on the result; the scorer passes them to
 * toolCallAccuracy together with the expected calls.
 */
evalite('Evaluate Nuxt MCP Deploy Tools', {
  data: async () => [
    {
      input: 'I need a deployment platform that supports edge functions and has a generous free tier. What are my options?',
      expected: [{ toolName: 'list_deploy_providers' }],
    },
    {
      input: 'What deployment providers support Docker containerization?',
      expected: [{ toolName: 'list_deploy_providers' }],
    },
    {
      input: 'I want to self-host my Nuxt app with automatic SSL. Show me how to deploy to a VPS.',
      expected: [{ toolName: 'list_deploy_providers' }],
    },
    // TODO: Uncomment when deployment_guide prompt becomes usable as a tool
    // { input: 'How do I deploy to Vercel?', expected: [{ toolName: 'deployment_guide', input: { provider: 'Vercel' } }] },
  ],
  task: async (input) => {
    const client = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const tools = await client.tools()
      const { toolCalls } = await generateText({ model, prompt: input, tools })
      return toolCalls ?? []
    }
    finally {
      // Close the client whether or not generation succeeded.
      await client.close()
    }
  },
  scorers: [
    async ({ output, expected }) => {
      return toolCallAccuracy({ actualCalls: output, expectedCalls: expected })
    },
  ],
})
| 96 | + |
/**
 * Eval: module tools.
 *
 * Each case pairs a user prompt with the tool call(s) expected for it; one case
 * expects a two-call sequence (list_modules followed by get_module), so the task
 * runs the model for up to 3 steps and returns the tool calls from every step.
 * The scorer passes those calls and the expected calls to toolCallAccuracy.
 */
evalite('Evaluate Nuxt MCP Module Tools', {
  data: async () => [
    { input: 'I need to add authentication with social login providers to my app. Find me a suitable module.', expected: [{ toolName: 'list_modules', input: { category: 'authentication' } }] },
    { input: 'What modules are available for image optimization and lazy loading?', expected: [{ toolName: 'list_modules', input: { category: 'media' } }] },
    { input: 'Show me popular UI component libraries for Nuxt 4', expected: [{ toolName: 'list_modules', input: { category: 'ui' } }] },
    { input: 'I want to add i18n support for multiple languages. What module should I use and does it support Nuxt 4?', expected: [{ toolName: 'list_modules' }, { toolName: 'get_module', input: { slug: '@nuxtjs/i18n' } }] },
    { input: 'Get details about @nuxt/ui module', expected: [{ toolName: 'get_module', input: { slug: '@nuxt/ui' } }] },
  ],
  task: async (input) => {
    const mcpClient = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const result = await generateText({
        model,
        prompt: input,
        tools: await mcpClient.tools(),
        // `maxSteps` is not a generateText option in the AI SDK versions that ship
        // @ai-sdk/mcp; the step limit is expressed as a stop condition instead.
        stopWhen: ({ steps }) => steps.length >= 3,
      })
      // `result.toolCalls` only holds the calls of the final step, so gather the
      // calls of every step to score multi-call sequences.
      return result.steps.flatMap((step) => step.toolCalls ?? [])
    }
    finally {
      // Close the client whether or not generation succeeded.
      await mcpClient.close()
    }
  },
  scorers: [async ({ output, expected }) => toolCallAccuracy({ actualCalls: output, expectedCalls: expected })],
})
| 117 | + |
/**
 * Eval: cross-tool workflows.
 *
 * Every case expects calls to two different tools, so the task runs the model for
 * up to 5 steps and returns the tool calls from every step. The scorer passes
 * those calls and the expected calls to toolCallAccuracy.
 */
evalite('Evaluate Nuxt MCP Cross-Tool Workflows', {
  data: async () => [
    { input: 'I want to build an e-commerce site with Nuxt 4. What modules do I need and where should I deploy it?', expected: [{ toolName: 'list_modules' }, { toolName: 'list_deploy_providers' }] },
    { input: 'Show me the latest features in Nuxt 4 and link to the relevant documentation', expected: [{ toolName: 'list_blog_posts' }, { toolName: 'get_documentation_page', input: { path: '/docs/4.x/getting-started/introduction' } }] },
  ],
  task: async (input) => {
    const mcpClient = await createMCPClient({ transport: { type: 'http', url: MCP_URL } })
    try {
      const result = await generateText({
        model,
        prompt: input,
        tools: await mcpClient.tools(),
        // `maxSteps` is not a generateText option in the AI SDK versions that ship
        // @ai-sdk/mcp; the step limit is expressed as a stop condition instead.
        stopWhen: ({ steps }) => steps.length >= 5,
      })
      // `result.toolCalls` only holds the calls of the final step, so gather the
      // calls of every step to score multi-tool workflows.
      return result.steps.flatMap((step) => step.toolCalls ?? [])
    }
    finally {
      // Close the client whether or not generation succeeded.
      await mcpClient.close()
    }
  },
  scorers: [async ({ output, expected }) => toolCallAccuracy({ actualCalls: output, expectedCalls: expected })],
})
0 commit comments