openai
diff --git a/‎compatibility-test/.gitignore
Lines changed: 142 additions & 0 deletions b/‎compatibility-test/.gitignore
Lines changed: 142 additions & 0 deletions
diff --git a/‎compatibility-test/README.md
Lines changed: 29 additions & 0 deletions b/‎compatibility-test/README.md
Lines changed: 29 additions & 0 deletions
diff --git a/‎compatibility-test/analysis.ts
Lines changed: 142 additions & 0 deletions b/‎compatibility-test/analysis.ts
Lines changed: 142 additions & 0 deletions
diff --git a/‎compatibility-test/cases.jsonl
Lines changed: 30 additions & 0 deletions b/‎compatibility-test/cases.jsonl
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,142 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+lerna-debug.log*
+
+# Diagnostic reports (https://nodejs.org/api/report.html)
+report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
+
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+*.lcov
+
+# nyc test coverage
+.nyc_output
+
+# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+bower_components
+
+# node-waf configuration
+.lock-wscript
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directories
+node_modules/
+jspm_packages/
+
+# Snowpack dependency directory (https://snowpack.dev/)
+web_modules/
+
+# TypeScript cache
+*.tsbuildinfo
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional stylelint cache
+.stylelintcache
+
+# Optional REPL history
+.node_repl_history
+
+# Output of 'npm pack'
+*.tgz
+
+# Yarn Integrity file
+.yarn-integrity
+
+# dotenv environment variable files
+.env
+.env.*
+!.env.example
+
+# parcel-bundler cache (https://parceljs.org/)
+.cache
+.parcel-cache
+
+# Next.js build output
+.next
+out
+
+# Nuxt.js build / generate output
+.nuxt
+dist
+
+# Gatsby files
+.cache/
+# Uncomment the `public` line below if your project uses Gatsby and not Next.js
+# https://nextjs.org/blog/next-9-1#public-directory-support
+# public
+
+# vuepress build output
+.vuepress/dist
+
+# vuepress v2.x temp and cache directory
+.temp
+.cache
+
+# Sveltekit cache directory
+.svelte-kit/
+
+# vitepress build output
+**/.vitepress/dist
+
+# vitepress cache directory
+**/.vitepress/cache
+
+# Docusaurus cache and generated files
+.docusaurus
+
+# Serverless directories
+.serverless/
+
+# FuseBox cache
+.fusebox/
+
+# DynamoDB Local files
+.dynamodb/
+
+# Firebase cache directory
+.firebase/
+
+# TernJS port file
+.tern-port
+
+# Stores VSCode versions used for testing VSCode extensions
+.vscode-test
+
+# yarn v3
+.pnp.*
+.yarn/*
+!.yarn/patches
+!.yarn/plugins
+!.yarn/releases
+!.yarn/sdks
+!.yarn/versions
+
+# Vite log files
+vite.config.js.timestamp-*
+vite.config.ts.timestamp-*
+
+rollout_*.jsonl
+analysis_*.json
@@ -0,0 +1,29 @@
+# API Compatibility Test
+
+This script uses the Agents SDK in TypeScript and the underlying OpenAI client to verify both the shape of the API responses and whether the API performs tool calling.
+
+## What it tests
+
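+1. Whether the API's responses have the expected shape, and whether the model calls the expected tool with schema-valid arguments.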
+
+## How to run
+
+0. Run `npm install` in this directory.
+1. Update `providers.ts` to create an entry for the API to test. Change `vllm` to the provider name of your choice. Use `chat` for Chat Completions tests and `responses` for Responses API tests.
+2. Run an initial quick test to make sure things work. This will only run one test:
+
+```
+npm start -- --provider <name> -n 1 -k 1
+```
+
+3. Run the full test (runs each test 5 times to check consistency):
+
+```
+npm start -- --provider <name> -k 5
+```
+
+## Considerations
+
+1. The tests will fail if the API shape does not match the expected behavior.
+2. Events in the chat API are currently not tested.
+3. If schema validation succeeds but the tool-call arguments are wrong, the test still passes. That is more likely a prompt engineering issue or a validator issue than an API issue, since the call still matched the expected schema.
@@ -0,0 +1,142 @@
+/**
+ * Extracts the attempt index `k` from a run id shaped like `${i}_${k}`.
+ *
+ * Only a non-empty run of decimal digits is accepted. `Number("")` is `0`, so
+ * a malformed id such as `"3_"` would otherwise be recorded as attempt 0 and
+ * could overwrite the result of the real first attempt.
+ *
+ * @param runId - The `run_id` value of a case result (any type).
+ * @returns The attempt index, or `undefined` when `runId` is not a string or
+ *   its second `_`-separated segment is not a plain non-negative integer.
+ */
+function parseAttemptIndex(runId: unknown): number | undefined {
+  if (typeof runId !== "string") return undefined;
+  const segment = runId.split("_")[1];
+  if (segment === undefined || !/^\d+$/.test(segment)) return undefined;
+  const index = Number(segment);
+  return Number.isSafeInteger(index) ? index : undefined;
+}
+
+/**
+ * Aggregates per-run case results into summary statistics.
+ *
+ * Runs are grouped into tasks keyed by `test_case` + `result.apiType`; entries
+ * without a `result` or without a string `result.apiType` are ignored. For
+ * every k in 1..tries the function reports:
+ *   - pass@k: fraction of tasks with at least one success among attempts 0..k-1
+ *   - pass^k: fraction of tasks where every attempt 0..k-1 succeeded
+ * An attempt that was never recorded counts as a failure.
+ *
+ * @param caseResults - Run records; the fields read are `run_id`, `test_case`,
+ *   `success`, and `result.{apiType, validResponse, toolCallingDetails}`.
+ * @param tries - Number of attempts per task (the largest k to report).
+ * @returns Task count, per-k and final pass@k / pass^k values, the number of
+ *   tool calls that matched the schema but had wrong arguments, and
+ *   total/invalid response counts keyed by API type.
+ */
+export function analyze(caseResults: any[], tries: number) {
+  // A task is identified by its test case together with the API type.
+  type TaskKey = string;
+  const taskKeyFor = (r: any): TaskKey =>
+    `${r.test_case}::${r.result?.apiType}`;
+
+  const successesByTask = new Map<TaskKey, Map<number, boolean>>();
+
+  // Tool calls whose schema was right but whose arguments were not.
+  let wrongInputToolCalls = 0;
+
+  // Counted in Maps so that API type names colliding with Object.prototype
+  // members (e.g. "constructor") cannot read an inherited value.
+  const totalCounts = new Map<string, number>();
+  const invalidCounts = new Map<string, number>();
+
+  for (const r of caseResults) {
+    if (!r?.result || typeof r.result.apiType !== "string") continue;
+    const apiType: string = r.result.apiType;
+
+    // Register the task even when the attempt index is unusable, so that it
+    // still counts towards totalTasks (as a task with no successful attempt).
+    const key = taskKeyFor(r);
+    let attempts = successesByTask.get(key);
+    if (attempts === undefined) {
+      attempts = new Map();
+      successesByTask.set(key, attempts);
+    }
+    const attemptIndex = parseAttemptIndex(r.run_id);
+    if (attemptIndex !== undefined) {
+      attempts.set(attemptIndex, Boolean(r.success));
+    }
+
+    const details = r.result.toolCallingDetails ?? {};
+    if (
+      details.calledToolAtLeastOnce &&
+      details.calledToolWithRightSchema &&
+      !details.calledToolWithRightArguments
+    ) {
+      wrongInputToolCalls++;
+    }
+
+    // Response-shape bookkeeping: anything but an explicit `true` is invalid.
+    totalCounts.set(apiType, (totalCounts.get(apiType) ?? 0) + 1);
+    if (r.result.validResponse !== true) {
+      invalidCounts.set(apiType, (invalidCounts.get(apiType) ?? 0) + 1);
+    }
+  }
+
+  const totalTasks = successesByTask.size;
+  const attemptMaps = [...successesByTask.values()];
+
+  // Compute pass@k and pass^k for k = 1..tries
+  const passAtKByK: number[] = [];
+  const passHatKByK: number[] = [];
+
+  for (let k = 1; k <= tries; k++) {
+    let tasksWithAnySuccess = 0;
+    let tasksWithAllSuccess = 0;
+
+    for (const attempts of attemptMaps) {
+      let anySuccess = false;
+      let allSuccess = true;
+      for (let i = 0; i < k; i++) {
+        if (attempts.get(i) === true) anySuccess = true;
+        else allSuccess = false;
+      }
+      if (anySuccess) tasksWithAnySuccess++;
+      if (allSuccess) tasksWithAllSuccess++;
+    }
+
+    // With no tasks at all, report 0 rather than NaN.
+    passAtKByK.push(totalTasks > 0 ? tasksWithAnySuccess / totalTasks : 0);
+    passHatKByK.push(totalTasks > 0 ? tasksWithAllSuccess / totalTasks : 0);
+  }
+
+  // Convenience: final k=tries values
+  const passAtK = passAtKByK[tries - 1] ?? 0;
+  const passHatK = passHatKByK[tries - 1] ?? 0;
+
+  const totalByApiType: Record<string, number> =
+    Object.fromEntries(totalCounts);
+  const invalidByApiType: Record<string, number> =
+    Object.fromEntries(invalidCounts);
+
+  return {
+    totalTasks,
+    passAtKByK,
+    passHatKByK,
+    passAtK,
+    passHatK,
+    wrongInputToolCalls,
+    // Invalid response shapes per API type
+    invalidByApiType,
+    totalByApiType,
+  };
+}
+
+/**
+ * Prints a human-readable summary of an analysis run to the console.
+ *
+ * @param stats - The statistics object returned by `analyze`.
+ * @param caseResults - All run records; only the count is printed.
+ * @param provider - Name of the provider that was tested.
+ * @param selectedLines - The input case lines that were selected; only the
+ *   count is printed.
+ * @param tries - Number of attempts per task; controls how many per-k values
+ *   are listed.
+ * @param skipped - Number of invalid `cases.jsonl` lines.
+ * @param analysisFile - Path of the analysis file, echoed in the last line.
+ */
+export function printAnalysis(
+  stats: ReturnType<typeof analyze>,
+  caseResults: any[],
+  provider: string,
+  selectedLines: string[],
+  tries: number,
+  skipped: number,
+  analysisFile: string
+) {
+  // Renders "1=0.123, 2=0.456, ..." for k = 1..tries; missing values print as 0.
+  const perK = (values: number[]): string => {
+    const parts: string[] = [];
+    for (let k = 1; k <= tries; k++) {
+      parts.push(`${k}=${(values[k - 1] ?? 0).toFixed(3)}`);
+    }
+    return parts.join(", ");
+  };
+
+  const lines: string[] = [
+    "Summary:",
+    `  Provider: ${provider}`,
+    `  Total input cases: ${selectedLines.length}`,
+    `  Tries: ${tries}`,
+    `  Total tasks: ${stats.totalTasks}`,
+    `  Total runs: ${caseResults.length}`,
+  ];
+
+  // Response-shape stats are shown only for API types that actually ran.
+  const shapeReports: Array<[apiType: string, label: string]> = [
+    ["responses", "Responses"],
+    ["chat", "Chat Completions"],
+  ];
+  for (const [apiType, label] of shapeReports) {
+    const total = stats.totalByApiType[apiType] ?? 0;
+    if (total <= 0) continue;
+    const invalid = stats.invalidByApiType[apiType] ?? 0;
+    lines.push(`  Invalid ${label} API responses: ${invalid} (out of ${total})`);
+  }
+
+  lines.push(
+    `  pass@k (k=1..${tries}): ${perK(stats.passAtKByK)}`,
+    `  pass^k (k=1..${tries}): ${perK(stats.passHatKByK)}`,
+    `  pass@k (k=${tries}): ${stats.passAtK.toFixed(3)}`,
+    `  pass^k (k=${tries}): ${stats.passHatK.toFixed(3)}`,
+    `  Wrong-input tool calls: ${stats.wrongInputToolCalls}`,
+    `  Invalid cases.jsonl lines: ${skipped}`,
+    `  Analysis written to ${analysisFile}`
+  );
+
+  // One console.log call per line, in order.
+  for (const line of lines) {
+    console.log(line);
+  }
+}
@@ -0,0 +1,30 @@
+{"tool_name":"get_system_health","input":"Hey, quick check: is everything up and running?","expected_arguments":"{}"}
+{"tool_name":"get_system_health","input":"Status report please.","expected_arguments":"{}"}
+{"tool_name":"get_system_health","input":"Can you confirm the LLM health before we start?","expected_arguments":"{}"}
+{"tool_name":"get_system_health","input":"Need a health snapshot.","expected_arguments":"{}"}
+{"tool_name":"get_system_health","input":"Hi, what's the current system health?","expected_arguments":"{}"}
+{"tool_name":"markdown_to_html","input":"Convert this markdown to HTML:\n\n# Title\n\nSome *italic* text.","expected_arguments":"{\"markdown\":\"# Title\\n\\nSome *italic* text.\"}"}
+{"tool_name":"markdown_to_html","input":"Hey, could you turn `## Docs` into HTML?","expected_arguments":"{\"markdown\":\"## Docs\"}"}
+{"tool_name":"markdown_to_html","input":"Please render the following markdown:\n\n- item 1\n- item 2","expected_arguments":"{\"markdown\":\"- item 1\\n- item 2\"}"}
+{"tool_name":"markdown_to_html","input":"I have `**bold**` markdown; give me HTML.","expected_arguments":"{\"markdown\":\"**bold**\"}"}
+{"tool_name":"markdown_to_html","input":"Markdown to HTML: > quote","expected_arguments":"{\"markdown\":\"> quote\"}"}
+{"tool_name":"detect_language","input":"Hey, what language is this: 'Buenos días, ¿cómo estás?'","expected_arguments":"{\"text\":\"Buenos días, ¿cómo estás?\"}"}
+{"tool_name":"detect_language","input":"Identify the language: \"Guten Morgen\"","expected_arguments":"{\"text\":\"Guten Morgen\"}"}
+{"tool_name":"detect_language","input":"Language detection needed: こんにちは、お元気ですか？","expected_arguments":"{\"text\":\"こんにちは、お元気ですか？\"}"}
+{"tool_name":"detect_language","input":"Detect language for: 'Привет, как дела?'","expected_arguments":"{\"text\":\"Привет, как дела?\"}"}
+{"tool_name":"detect_language","input":"What language is 'Bonjour tout le monde'?","expected_arguments":"{\"text\":\"Bonjour tout le monde\"}"}
+{"tool_name":"generate_chart","input":"Plot a simple line chart for these points: (1,2),(2,4),(3,9).","expected_arguments":"{\"data\":[[1,2],[2,4],[3,9]],\"chart_type\":\"line\"}"}
+{"tool_name":"generate_chart","input":"Hey, can I get a bar chart of my sales: 10, 20, 30 across Q1–Q3?","expected_arguments":"{\"data\":[[1,10],[2,20],[3,30]],\"chart_type\":\"bar\",\"title\":\"Quarterly Sales\"}"}
+{"tool_name":"generate_chart","input":"Make a scatter chart titled 'Experiment' with x label Time and y label Value for data [ [0,1], [1,1.5], [2,2.2] ].","expected_arguments":"{\"data\":[[0,1],[1,1.5],[2,2.2]],\"chart_type\":\"scatter\",\"title\":\"Experiment\",\"x_label\":\"Time\",\"y_label\":\"Value\"}"}
+{"tool_name":"generate_chart","input":"Create a line chart of temperatures 70,72,68,65 over 4 days, label x as 'Day'.","expected_arguments":"{\"data\":[[1,70],[2,72],[3,68],[4,65]],\"chart_type\":\"line\",\"x_label\":\"Day\"}"}
+{"tool_name":"generate_chart","input":"Visualize visits per day with a bar chart; numbers: 100,150,120.","expected_arguments":"{\"data\":[[1,100],[2,150],[3,120]],\"chart_type\":\"bar\",\"title\":\"Daily Visits\",\"y_label\":\"Visitors\"}"}
+{"tool_name":"query_database","input":"Give me the ids and emails from users table, limit 5.","expected_arguments":"{\"table\":\"users\",\"columns\":[\"id\",\"email\"],\"limit\":5}"}
+{"tool_name":"query_database","input":"Hey, fetch order_id and amount from orders where status is 'shipped'.","expected_arguments":"{\"table\":\"orders\",\"columns\":[\"order_id\",\"amount\"],\"filters\":\"status = 'shipped'\"}"}
+{"tool_name":"query_database","input":"Retrieve name and price from products ordered by price descending, top 10 please.","expected_arguments":"{\"table\":\"products\",\"columns\":[\"name\",\"price\"],\"limit\":10,\"order_by\":\"price DESC\"}"}
+{"tool_name":"query_database","input":"I need the first 3 log entries from audit_log table.","expected_arguments":"{\"table\":\"audit_log\",\"columns\":[\"id\",\"timestamp\",\"action\"],\"limit\":3}"}
+{"tool_name":"query_database","input":"Query the customers table for name, city where city = 'Berlin'.","expected_arguments":"{\"table\":\"customers\",\"columns\":[\"name\",\"city\"],\"filters\":\"city = 'Berlin'\"}"}
+{"tool_name":"get_weather","input":"What's the weather in San Francisco right now?","expected_arguments":"{\"location\":\"San Francisco\"}"}
+{"tool_name":"get_weather","input":"Weather for Tokyo, please.","expected_arguments":"{\"location\":\"Tokyo\"}"}
+{"tool_name":"get_weather","input":"Get me the current weather for 10001.","expected_arguments":"{\"location\":\"10001\"}"}
+{"tool_name":"get_weather","input":"How's the weather in Paris today?","expected_arguments":"{\"location\":\"Paris\"}"}
+{"tool_name":"get_weather","input":"Check the weather for Sydney.","expected_arguments":"{\"location\":\"Sydney\"}"}