Skip to content

Commit 79eaf7f

Browse files
Add API compatibility test (#114)
* add compatibility test * update readme * update test suite * fix example config * fix typo * bump version
1 parent c77966f commit 79eaf7f

File tree

11 files changed

+2686
-1
lines changed

11 files changed

+2686
-1
lines changed

compatibility-test/.gitignore

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# Logs
2+
logs
3+
*.log
4+
npm-debug.log*
5+
yarn-debug.log*
6+
yarn-error.log*
7+
lerna-debug.log*
8+
9+
# Diagnostic reports (https://nodejs.org/api/report.html)
10+
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
11+
12+
# Runtime data
13+
pids
14+
*.pid
15+
*.seed
16+
*.pid.lock
17+
18+
# Directory for instrumented libs generated by jscoverage/JSCover
19+
lib-cov
20+
21+
# Coverage directory used by tools like istanbul
22+
coverage
23+
*.lcov
24+
25+
# nyc test coverage
26+
.nyc_output
27+
28+
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
29+
.grunt
30+
31+
# Bower dependency directory (https://bower.io/)
32+
bower_components
33+
34+
# node-waf configuration
35+
.lock-wscript
36+
37+
# Compiled binary addons (https://nodejs.org/api/addons.html)
38+
build/Release
39+
40+
# Dependency directories
41+
node_modules/
42+
jspm_packages/
43+
44+
# Snowpack dependency directory (https://snowpack.dev/)
45+
web_modules/
46+
47+
# TypeScript cache
48+
*.tsbuildinfo
49+
50+
# Optional npm cache directory
51+
.npm
52+
53+
# Optional eslint cache
54+
.eslintcache
55+
56+
# Optional stylelint cache
57+
.stylelintcache
58+
59+
# Optional REPL history
60+
.node_repl_history
61+
62+
# Output of 'npm pack'
63+
*.tgz
64+
65+
# Yarn Integrity file
66+
.yarn-integrity
67+
68+
# dotenv environment variable files
69+
.env
70+
.env.*
71+
!.env.example
72+
73+
# parcel-bundler cache (https://parceljs.org/)
74+
.cache
75+
.parcel-cache
76+
77+
# Next.js build output
78+
.next
79+
out
80+
81+
# Nuxt.js build / generate output
82+
.nuxt
83+
dist
84+
85+
# Gatsby files
86+
.cache/
87+
# Uncomment the public line below if your project uses Gatsby and not Next.js
88+
# https://nextjs.org/blog/next-9-1#public-directory-support
89+
# public
90+
91+
# vuepress build output
92+
.vuepress/dist
93+
94+
# vuepress v2.x temp and cache directory
95+
.temp
96+
.cache
97+
98+
# Sveltekit cache directory
99+
.svelte-kit/
100+
101+
# vitepress build output
102+
**/.vitepress/dist
103+
104+
# vitepress cache directory
105+
**/.vitepress/cache
106+
107+
# Docusaurus cache and generated files
108+
.docusaurus
109+
110+
# Serverless directories
111+
.serverless/
112+
113+
# FuseBox cache
114+
.fusebox/
115+
116+
# DynamoDB Local files
117+
.dynamodb/
118+
119+
# Firebase cache directory
120+
.firebase/
121+
122+
# TernJS port file
123+
.tern-port
124+
125+
# Stores VSCode versions used for testing VSCode extensions
126+
.vscode-test
127+
128+
# yarn v3
129+
.pnp.*
130+
.yarn/*
131+
!.yarn/patches
132+
!.yarn/plugins
133+
!.yarn/releases
134+
!.yarn/sdks
135+
!.yarn/versions
136+
137+
# Vite log files
138+
vite.config.js.timestamp-*
139+
vite.config.ts.timestamp-*
140+
141+
rollout_*.jsonl
142+
analysis_*.json

compatibility-test/README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# API Compatibility Test
2+
3+
This script uses the Agents SDK in TypeScript and the underlying OpenAI client to verify both the shape of the API calls and whether the API performs tool calling.
4+
5+
## What it tests
6+
7+
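1. Whether the API's responses have the expected shape (for the Responses API and the Chat Completions API).
2. Whether the model calls the expected tool with arguments that pass schema validation.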
8+
9+
## How to run
10+
11+
0. Run `npm install` in this directory.
12+
1. Update `providers.ts` to create an entry for the API to test. Change `vllm` to the provider name of your choice. Use `chat` for Chat Completions tests and `responses` for Responses API tests.
13+
2. Run an initial quick test to make sure things work. This will only run one test:
14+
15+
```
16+
npm start -- --provider <name> -n 1 -k 1
17+
```
18+
19+
3. Run the full test (runs each test 5 times to test consistency):
20+
21+
```
22+
npm start -- --provider <name> -k 5
23+
```
24+
25+
## Considerations
26+
27+
1. The tests will fail if the API shape does not match the expected behavior.
28+
2. Events in the chat API are currently not tested.
29+
3. If schema validation succeeds but the tool-call arguments differ from the expected ones, the test still passes. That is because such a mismatch is more likely a prompt-engineering or validator issue than an API issue: the API still produced a tool call that matches the schema.

compatibility-test/analysis.ts

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
/**
 * Aggregates raw run results into summary statistics.
 *
 * Runs are grouped into tasks keyed by `${test_case}::${result.apiType}`.
 * Entries without a `result`, or whose `result.apiType` is not a string, are
 * ignored entirely.
 *
 * @param caseResults Raw run records. Fields read here: `run_id`, `test_case`,
 *   `success`, `result.apiType`, `result.validResponse` and
 *   `result.toolCallingDetails`.
 * @param tries Number of attempts per task; pass rates are computed for
 *   k = 1..tries.
 * @returns
 *   - `totalTasks`: number of distinct task keys seen.
 *   - `passAtKByK[k-1]`: fraction of tasks with at least one successful attempt
 *     among attempt indices 0..k-1.
 *   - `passHatKByK[k-1]`: fraction of tasks where every attempt index 0..k-1
 *     succeeded (a missing attempt counts as a failure).
 *   - `passAtK` / `passHatK`: the k = tries entries of the arrays above
 *     (0 when absent).
 *   - `wrongInputToolCalls`: runs flagged as having called a tool with the
 *     right schema but not the right arguments.
 *   - `totalByApiType` / `invalidByApiType`: run counts per API type, and how
 *     many of those had `result.validResponse !== true`.
 */
export function analyze(caseResults: any[], tries: number) {
  // Group results by unique task: test_case + apiType
  type TaskKey = string;
  const taskKeyFor = (r: any): TaskKey =>
    `${r.test_case}::${r.result?.apiType}`;

  // The attempt suffix of a run_id must be plain decimal digits. Checking this
  // before converting matters because `Number("")` is 0: a malformed id such
  // as "3_" would otherwise be recorded as attempt 0 and could overwrite the
  // genuine attempt-0 outcome.
  const attemptSuffixPattern = /^\d+$/;

  const successesByTask: Map<TaskKey, Map<number, boolean>> = new Map();

  // Count wrong-input tool calls (schema correct but incorrect arguments)
  let wrongInputToolCalls = 0;

  // Count invalid response shapes per API type
  const totalByApiType: Record<string, number> = {};
  const invalidByApiType: Record<string, number> = {};

  for (const r of caseResults) {
    if (!r?.result || typeof r.result.apiType !== "string") continue;

    // Parse the attempt index from a run_id of the form `${i}_${k}`.
    let attemptIndex: number | undefined;
    if (typeof r.run_id === "string") {
      const suffix: string | undefined = r.run_id.split("_")[1];
      if (suffix !== undefined && attemptSuffixPattern.test(suffix)) {
        const k = Number(suffix);
        // Guards against absurdly long digit strings that overflow to Infinity.
        if (Number.isFinite(k)) attemptIndex = k;
      }
    }

    // Register the task even when the attempt index is unusable, so it still
    // counts towards totalTasks (and therefore as a non-passing task).
    const key = taskKeyFor(r);
    let attempts = successesByTask.get(key);
    if (attempts === undefined) {
      attempts = new Map<number, boolean>();
      successesByTask.set(key, attempts);
    }
    if (attemptIndex !== undefined) {
      attempts.set(attemptIndex, Boolean(r.success));
    }

    const d = r.result.toolCallingDetails ?? {};
    const calledToolAtLeastOnce = Boolean(d.calledToolAtLeastOnce);
    const calledToolWithRightSchema = Boolean(d.calledToolWithRightSchema);
    const calledToolWithRightArguments = Boolean(
      d.calledToolWithRightArguments
    );
    if (
      calledToolAtLeastOnce &&
      calledToolWithRightSchema &&
      !calledToolWithRightArguments
    ) {
      wrongInputToolCalls++;
    }

    // Track invalid/total per apiType for response shape
    const apiType = r.result.apiType as string;
    totalByApiType[apiType] = (totalByApiType[apiType] ?? 0) + 1;
    const isValidResponse = r.result.validResponse === true;
    if (!isValidResponse) {
      invalidByApiType[apiType] = (invalidByApiType[apiType] ?? 0) + 1;
    }
  }

  const totalTasks = successesByTask.size;

  // Compute pass@k and pass^k for k = 1..tries
  const passAtKByK: number[] = [];
  const passHatKByK: number[] = [];

  for (let k = 1; k <= tries; k++) {
    let tasksSuccessfulK = 0; // any success in first k attempts
    let tasksAllSuccessfulK = 0; // all success in first k attempts

    for (const attemptsMap of successesByTask.values()) {
      let anySuccess = false;
      let allSuccess = true;
      for (let i = 0; i < k; i++) {
        // A missing attempt is treated exactly like a failed one.
        const v = attemptsMap.get(i) === true;
        anySuccess = anySuccess || v;
        if (!v) allSuccess = false;
      }
      if (anySuccess) tasksSuccessfulK++;
      if (allSuccess) tasksAllSuccessfulK++;
    }

    // With no tasks at all, report 0 rather than dividing by zero.
    passAtKByK.push(totalTasks > 0 ? tasksSuccessfulK / totalTasks : 0);
    passHatKByK.push(totalTasks > 0 ? tasksAllSuccessfulK / totalTasks : 0);
  }

  // Convenience: final k=tries values
  const passAtK = passAtKByK[tries - 1] ?? 0;
  const passHatK = passHatKByK[tries - 1] ?? 0;

  return {
    totalTasks,
    passAtKByK,
    passHatKByK,
    passAtK,
    passHatK,
    wrongInputToolCalls,
    // Stats for invalid response shapes per API
    invalidByApiType,
    totalByApiType,
  };
}
100+
101+
/**
 * Writes a human-readable summary of an analysis to the console.
 *
 * @param stats Aggregated statistics as returned by `analyze`.
 * @param caseResults Raw run records; only their count is printed.
 * @param provider Provider name shown in the summary.
 * @param selectedLines Input case lines; only their count is printed.
 * @param tries Number of attempts per task; determines how many per-k rates
 *   are listed.
 * @param skipped Count printed as "Invalid cases.jsonl lines".
 * @param analysisFile Path printed on the final summary line.
 */
export function printAnalysis(
  stats: ReturnType<typeof analyze>,
  caseResults: any[],
  provider: string,
  selectedLines: string[],
  tries: number,
  skipped: number,
  analysisFile: string
) {
  // One "<k>=<rate>" cell with the rate fixed to three decimals; a missing
  // entry is rendered as 0.
  const cellFor = (rates: number[], idx: number): string => {
    const rate = rates[idx] ?? 0;
    return `${idx + 1}=${rate.toFixed(3)}`;
  };

  // Comma-separated cells for k = 1..tries.
  const renderPerK = (rates: number[]): string =>
    Array.from({ length: tries }, (_unused, idx) => cellFor(rates, idx)).join(
      ", "
    );

  // Response-shape lines, in print order. Each is emitted only when at least
  // one run of that API type was recorded.
  const shapeReports = [
    {
      apiType: "responses",
      render: (bad: number, tot: number): string =>
        ` Invalid Responses API responses: ${bad} (out of ${tot})`,
    },
    {
      apiType: "chat",
      render: (bad: number, tot: number): string =>
        ` Invalid Chat Completions API responses: ${bad} (out of ${tot})`,
    },
  ];

  console.log("Summary:");
  console.log(` Provider: ${provider}`);
  console.log(` Total input cases: ${selectedLines.length}`);
  console.log(` Tries: ${tries}`);
  console.log(` Total tasks: ${stats.totalTasks}`);
  console.log(` Total runs: ${caseResults.length}`);

  for (const { apiType, render } of shapeReports) {
    const recorded = stats.totalByApiType[apiType] ?? 0;
    if (!(recorded > 0)) continue;
    const invalid = stats.invalidByApiType[apiType] ?? 0;
    console.log(render(invalid, recorded));
  }

  console.log(` pass@k (k=1..${tries}): ${renderPerK(stats.passAtKByK)}`);
  console.log(` pass^k (k=1..${tries}): ${renderPerK(stats.passHatKByK)}`);
  console.log(` pass@k (k=${tries}): ${stats.passAtK.toFixed(3)}`);
  console.log(` pass^k (k=${tries}): ${stats.passHatK.toFixed(3)}`);
  console.log(` Wrong-input tool calls: ${stats.wrongInputToolCalls}`);
  console.log(` Invalid cases.jsonl lines: ${skipped}`);
  console.log(` Analysis written to ${analysisFile}`);
}

compatibility-test/cases.jsonl

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{"tool_name":"get_system_health","input":"Hey, quick check: is everything up and running?","expected_arguments":"{}"}
2+
{"tool_name":"get_system_health","input":"Status report please.","expected_arguments":"{}"}
3+
{"tool_name":"get_system_health","input":"Can you confirm the LLM health before we start?","expected_arguments":"{}"}
4+
{"tool_name":"get_system_health","input":"Need a health snapshot.","expected_arguments":"{}"}
5+
{"tool_name":"get_system_health","input":"Hi, what's the current system health?","expected_arguments":"{}"}
6+
{"tool_name":"markdown_to_html","input":"Convert this markdown to HTML:\n\n# Title\n\nSome *italic* text.","expected_arguments":"{\"markdown\":\"# Title\\n\\nSome *italic* text.\"}"}
7+
{"tool_name":"markdown_to_html","input":"Hey, could you turn `## Docs` into HTML?","expected_arguments":"{\"markdown\":\"## Docs\"}"}
8+
{"tool_name":"markdown_to_html","input":"Please render the following markdown:\n\n- item 1\n- item 2","expected_arguments":"{\"markdown\":\"- item 1\\n- item 2\"}"}
9+
{"tool_name":"markdown_to_html","input":"I have `**bold**` markdown; give me HTML.","expected_arguments":"{\"markdown\":\"**bold**\"}"}
10+
{"tool_name":"markdown_to_html","input":"Markdown to HTML: > quote","expected_arguments":"{\"markdown\":\"> quote\"}"}
11+
{"tool_name":"detect_language","input":"Hey, what language is this: 'Buenos días, ¿cómo estás?'","expected_arguments":"{\"text\":\"Buenos días, ¿cómo estás?\"}"}
12+
{"tool_name":"detect_language","input":"Identify the language: \"Guten Morgen\"","expected_arguments":"{\"text\":\"Guten Morgen\"}"}
13+
{"tool_name":"detect_language","input":"Language detection needed: こんにちは、お元気ですか?","expected_arguments":"{\"text\":\"こんにちは、お元気ですか?\"}"}
14+
{"tool_name":"detect_language","input":"Detect language for: 'Привет, как дела?'","expected_arguments":"{\"text\":\"Привет, как дела?\"}"}
15+
{"tool_name":"detect_language","input":"What language is 'Bonjour tout le monde'?","expected_arguments":"{\"text\":\"Bonjour tout le monde\"}"}
16+
{"tool_name":"generate_chart","input":"Plot a simple line chart for these points: (1,2),(2,4),(3,9).","expected_arguments":"{\"data\":[[1,2],[2,4],[3,9]],\"chart_type\":\"line\"}"}
17+
{"tool_name":"generate_chart","input":"Hey, can I get a bar chart of my sales: 10, 20, 30 across Q1–Q3?","expected_arguments":"{\"data\":[[1,10],[2,20],[3,30]],\"chart_type\":\"bar\",\"title\":\"Quarterly Sales\"}"}
18+
{"tool_name":"generate_chart","input":"Make a scatter chart titled 'Experiment' with x label Time and y label Value for data [ [0,1], [1,1.5], [2,2.2] ].","expected_arguments":"{\"data\":[[0,1],[1,1.5],[2,2.2]],\"chart_type\":\"scatter\",\"title\":\"Experiment\",\"x_label\":\"Time\",\"y_label\":\"Value\"}"}
19+
{"tool_name":"generate_chart","input":"Create a line chart of temperatures 70,72,68,65 over 4 days, label x as 'Day'.","expected_arguments":"{\"data\":[[1,70],[2,72],[3,68],[4,65]],\"chart_type\":\"line\",\"x_label\":\"Day\"}"}
20+
{"tool_name":"generate_chart","input":"Visualize visits per day with a bar chart; numbers: 100,150,120.","expected_arguments":"{\"data\":[[1,100],[2,150],[3,120]],\"chart_type\":\"bar\",\"title\":\"Daily Visits\",\"y_label\":\"Visitors\"}"}
21+
{"tool_name":"query_database","input":"Give me the ids and emails from users table, limit 5.","expected_arguments":"{\"table\":\"users\",\"columns\":[\"id\",\"email\"],\"limit\":5}"}
22+
{"tool_name":"query_database","input":"Hey, fetch order_id and amount from orders where status is 'shipped'.","expected_arguments":"{\"table\":\"orders\",\"columns\":[\"order_id\",\"amount\"],\"filters\":\"status = 'shipped'\"}"}
23+
{"tool_name":"query_database","input":"Retrieve name and price from products ordered by price descending, top 10 please.","expected_arguments":"{\"table\":\"products\",\"columns\":[\"name\",\"price\"],\"limit\":10,\"order_by\":\"price DESC\"}"}
24+
{"tool_name":"query_database","input":"I need the first 3 log entries from audit_log table.","expected_arguments":"{\"table\":\"audit_log\",\"columns\":[\"id\",\"timestamp\",\"action\"],\"limit\":3}"}
25+
{"tool_name":"query_database","input":"Query the customers table for name, city where city = 'Berlin'.","expected_arguments":"{\"table\":\"customers\",\"columns\":[\"name\",\"city\"],\"filters\":\"city = 'Berlin'\"}"}
26+
{"tool_name":"get_weather","input":"What's the weather in San Francisco right now?","expected_arguments":"{\"location\":\"San Francisco\"}"}
27+
{"tool_name":"get_weather","input":"Weather for Tokyo, please.","expected_arguments":"{\"location\":\"Tokyo\"}"}
28+
{"tool_name":"get_weather","input":"Get me the current weather for 10001.","expected_arguments":"{\"location\":\"10001\"}"}
29+
{"tool_name":"get_weather","input":"How's the weather in Paris today?","expected_arguments":"{\"location\":\"Paris\"}"}
30+
{"tool_name":"get_weather","input":"Check the weather for Sydney.","expected_arguments":"{\"location\":\"Sydney\"}"}

0 commit comments

Comments
 (0)