diff --git a/sdk/ai/ai-agents/CHANGELOG.md b/sdk/ai/ai-agents/CHANGELOG.md index 0ed8a001ab19..0f2b3d36ebaf 100644 --- a/sdk/ai/ai-agents/CHANGELOG.md +++ b/sdk/ai/ai-agents/CHANGELOG.md @@ -1,5 +1,11 @@ # Release History +## 1.2.0-beta.3 (2025-10-02) + +### Features Added + +- Add `ToolUtility.createComputerUseTool` to support the Computer Use tool in agents + ## 1.2.0-beta.2 (2025-09-26) ### Features Added diff --git a/sdk/ai/ai-agents/data/cua_screenshot.jpg b/sdk/ai/ai-agents/data/cua_screenshot.jpg new file mode 100644 index 000000000000..12f21cb9dec8 Binary files /dev/null and b/sdk/ai/ai-agents/data/cua_screenshot.jpg differ diff --git a/sdk/ai/ai-agents/data/cua_screenshot_next.jpg b/sdk/ai/ai-agents/data/cua_screenshot_next.jpg new file mode 100644 index 000000000000..0d9633596fce Binary files /dev/null and b/sdk/ai/ai-agents/data/cua_screenshot_next.jpg differ diff --git a/sdk/ai/ai-agents/package.json b/sdk/ai/ai-agents/package.json index e314478927f1..a148a5d70d37 100644 --- a/sdk/ai/ai-agents/package.json +++ b/sdk/ai/ai-agents/package.json @@ -1,6 +1,6 @@ { "name": "@azure/ai-agents", - "version": "1.2.0-beta.2", + "version": "1.2.0-beta.3", "description": "Azure AI Agents client library.", "engines": { "node": ">=20.0.0" } diff --git a/sdk/ai/ai-agents/review/ai-agents-node.api.md b/sdk/ai/ai-agents/review/ai-agents-node.api.md index d2799b9b82b4..f1a27a1de12f 100644 --- a/sdk/ai/ai-agents/review/ai-agents-node.api.md +++ b/sdk/ai/ai-agents/review/ai-agents-node.api.md @@ -107,7 +107,7 @@ export interface AgentsNamedToolChoice { } // @public -export type AgentsNamedToolChoiceType = "function" | "code_interpreter" | "file_search" | "bing_grounding" | "fabric_dataagent" | "sharepoint_grounding" | "azure_ai_search" | "bing_custom_search" | "connected_agent" | "deep_research" | "mcp"; +export type AgentsNamedToolChoiceType = "function" | "code_interpreter" | "file_search" | "bing_grounding" | "fabric_dataagent" | "sharepoint_grounding" | "azure_ai_search" | "bing_custom_search" | "connected_agent" | "deep_research" | "mcp" | "computer_use_preview"; // @public export interface AgentsResponseFormat { @@ -276,6 +276,14 @@ export interface BrowserAutomationToolParameters { connection: BrowserAutomationToolConnectionParameters; } +// @public +export interface ClickAction extends ComputerUseAction { + button: MouseButton; + type: "click"; + x: number; + y: number; +} + // @public export interface CodeInterpreterToolDefinition extends ToolDefinition { type: "code_interpreter"; @@ -287,6 +295,44 @@ export interface CodeInterpreterToolResource { fileIds?: string[]; } +// @public +export interface ComputerScreenshot { + fileId?: string; + imageUrl?: string; + type: "computer_screenshot"; +} + +// @public +export interface ComputerToolOutput extends StructuredToolOutput { + acknowledgedSafetyChecks?: SafetyCheck[]; + output: ComputerScreenshot; + type: "computer_call_output"; +} + +// @public +export interface ComputerUseAction { + type: string; +} + +// @public +export type ComputerUseActionUnion = ClickAction | DoubleClickAction | DragAction | KeyPressAction | MoveAction | ScreenshotAction | ScrollAction | TypeAction | WaitAction | ComputerUseAction; + +// @public +export type ComputerUseEnvironment = "windows" | "mac" | "linux" | "browser"; + +// @public +export interface ComputerUseToolDefinition extends ToolDefinition { + computerUsePreview: ComputerUseToolParameters; + type: "computer_use_preview"; +} + +// @public +export interface ComputerUseToolParameters { + displayHeight: number; + 
displayWidth: number; + environment: ComputerUseEnvironment; +} + // @public export interface ConnectedAgentDetails { description: string; @@ -314,6 +360,12 @@ export type ContinuablePage<TElement, TPage = TElement[]> = TPage & { continuationToken?: string; }; +// @public +export interface CoordinatePoint { + x: number; + y: number; +} + // @public export interface CreateAgentOptionalParams extends OperationOptions { description?: string | null; @@ -372,6 +424,19 @@ export enum DoneEvent { Done = "done" } +// @public +export interface DoubleClickAction extends ComputerUseAction { + type: "double_click"; + x: number; + y: number; +} + +// @public +export interface DragAction extends ComputerUseAction { + path: CoordinatePoint[]; + type: "drag"; +} + // @public export enum ErrorEvent { Error = "error" } @@ -522,6 +587,12 @@ export function isOutputOfType<T>(output: RequiredAction | RequiredToolCall | ToolDefinitionUnion, type: string): output is T; +// @public +export interface KeyPressAction extends ComputerUseAction { + keys: string[]; + type: "keypress"; +} + // @public export enum KnownVersions { V1 = "v1", @@ -890,6 +961,16 @@ export interface MicrosoftFabricToolDefinition extends ToolDefinition { type: "fabric_dataagent"; } +// @public +export type MouseButton = "left" | "right" | "wheel" | "back" | "forward"; + +// @public +export interface MoveAction extends ComputerUseAction { + type: "move"; + x: number; + y: number; +} + // @public export interface OpenApiAnonymousAuthDetails extends OpenApiAuthDetails { type: "anonymous"; @@ -985,6 +1066,18 @@ export interface RequiredAction { // @public export type RequiredActionUnion = SubmitToolOutputsAction | SubmitToolApprovalAction | RequiredAction; +// @public +export interface RequiredComputerUseToolCall extends RequiredToolCall { + computerUsePreview: RequiredComputerUseToolCallDetails; + type: "computer_use_preview"; +} + +// @public +export interface RequiredComputerUseToolCallDetails { + action: ComputerUseActionUnion; + pendingSafetyChecks: SafetyCheck[]; +} + // @public export interface RequiredFunctionToolCall extends RequiredToolCall { function: RequiredFunctionToolCallDetails; @@ -1012,7 +1105,7 @@ export interface RequiredToolCall { } // @public -export type RequiredToolCallUnion = RequiredFunctionToolCall | RequiredMcpToolCall | RequiredToolCall; +export type RequiredToolCallUnion = RequiredFunctionToolCall | RequiredMcpToolCall | RequiredComputerUseToolCall | RequiredToolCall; // @public export type ResponseFormat = "text" | "json_object"; @@ -1091,7 +1184,7 @@ export interface RunsOperations { createThreadAndRun: (assistantId: string, options?: CreateThreadAndRunOptionalParams) => AgentRunResponse; get: (threadId: string, runId: string, options?: RunsGetRunOptionalParams) => Promise<ThreadRun>; list: (threadId: string, options?: RunsListRunsOptionalParams) => PagedAsyncIterableIterator<ThreadRun>; - submitToolOutputs: (threadId: string, runId: string, toolOutputs: ToolOutput[], options?: RunsSubmitToolOutputsToRunOptionalParams) => AgentRunResponse; + submitToolOutputs: (threadId: string, runId: string, toolOutputs: StructuredToolOutputUnion[], options?: RunsSubmitToolOutputsToRunOptionalParams) => AgentRunResponse; update: (threadId: string, runId: string, options?: RunsUpdateRunOptionalParams) => Promise<ThreadRun>; } @@ -1099,7 +1192,7 @@ export interface RunsSubmitToolOutputsToRunOptionalParams extends OperationOptions { stream?: boolean | null; toolApprovals?: ToolApproval[]; - toolOutputs?: ToolOutput[]; + toolOutputs?: StructuredToolOutputUnion[]; } // 
@public @@ -1205,6 +1298,20 @@ export interface RunStepCompletionUsage { totalTokens: number; } +// @public +export interface RunStepComputerUseToolCall extends RunStepToolCall { + computerUsePreview: RunStepComputerUseToolCallDetails; + type: "computer_use_preview"; +} + +// @public +export interface RunStepComputerUseToolCallDetails { + acknowledgedSafetyChecks?: SafetyCheck[]; + action: ComputerUseActionUnion; + output: ComputerScreenshot; + pendingSafetyChecks: SafetyCheck[]; +} + // @public export interface RunStepConnectedAgent { agentId?: string; @@ -1301,6 +1408,20 @@ export interface RunStepDeltaCodeInterpreterToolCall extends RunStepDeltaToolCal type: "code_interpreter"; } +// @public +export interface RunStepDeltaComputerUseDetails { + acknowledgedSafetyChecks?: SafetyCheck[]; + action?: ComputerUseActionUnion; + output?: ComputerScreenshot; + pendingSafetyChecks?: SafetyCheck[]; +} + +// @public +export interface RunStepDeltaComputerUseToolCall extends RunStepDeltaToolCall { + computerUsePreview?: RunStepDeltaComputerUseDetails; + type: "computer_use_preview"; +} + // @public export interface RunStepDeltaConnectedAgentToolCall extends RunStepDeltaToolCall { connectedAgent: RunStepConnectedAgent; @@ -1408,7 +1529,7 @@ export interface RunStepDeltaToolCallObject extends RunStepDeltaDetail { } // @public -export type RunStepDeltaToolCallUnion = RunStepDeltaMcpToolCall | RunStepDeltaOpenAPIToolCall | RunStepDeltaConnectedAgentToolCall | RunStepDeltaFunctionToolCall | RunStepDeltaFileSearchToolCall | RunStepDeltaCodeInterpreterToolCall | RunStepDeltaBingGroundingToolCall | RunStepDeltaCustomBingGroundingToolCall | RunStepDeltaAzureFunctionToolCall | RunStepDeltaDeepResearchToolCall | RunStepDeltaAzureAISearchToolCall | RunStepDeltaMicrosoftFabricToolCall | RunStepDeltaSharepointToolCall | RunStepDeltaToolCall; +export type RunStepDeltaToolCallUnion = RunStepDeltaMcpToolCall | RunStepDeltaOpenAPIToolCall | RunStepDeltaConnectedAgentToolCall | RunStepDeltaFunctionToolCall | RunStepDeltaFileSearchToolCall | RunStepDeltaCodeInterpreterToolCall | RunStepDeltaBingGroundingToolCall | RunStepDeltaCustomBingGroundingToolCall | RunStepDeltaAzureFunctionToolCall | RunStepDeltaDeepResearchToolCall | RunStepDeltaAzureAISearchToolCall | RunStepDeltaComputerUseToolCall | RunStepDeltaMicrosoftFabricToolCall | RunStepDeltaSharepointToolCall | RunStepDeltaToolCall; // @public export interface RunStepDetails { @@ -1553,7 +1674,7 @@ export interface RunStepToolCallDetails extends RunStepDetails { } // @public -export type RunStepToolCallUnion = RunStepCodeInterpreterToolCall | RunStepFileSearchToolCall | RunStepBingGroundingToolCall | RunStepAzureAISearchToolCall | RunStepBrowserAutomationToolCall | RunStepMcpToolCall | RunStepSharepointToolCall | RunStepMicrosoftFabricToolCall | RunStepBingCustomSearchToolCall | RunStepAzureFunctionToolCall | RunStepFunctionToolCall | RunStepOpenAPIToolCall | RunStepDeepResearchToolCall | RunStepConnectedAgentToolCall | RunStepToolCall; +export type RunStepToolCallUnion = RunStepCodeInterpreterToolCall | RunStepFileSearchToolCall | RunStepBingGroundingToolCall | RunStepAzureAISearchToolCall | RunStepBrowserAutomationToolCall | RunStepMcpToolCall | RunStepComputerUseToolCall | RunStepSharepointToolCall | RunStepMicrosoftFabricToolCall | RunStepBingCustomSearchToolCall | RunStepAzureFunctionToolCall | RunStepFunctionToolCall | RunStepOpenAPIToolCall | RunStepDeepResearchToolCall | RunStepConnectedAgentToolCall | RunStepToolCall; // @public export type RunStepType = 
"message_creation" | "tool_calls" | "activities"; @@ -1577,6 +1698,27 @@ export interface RunsUpdateRunOptionalParams extends OperationOptions { metadata?: Record | null; } +// @public +export interface SafetyCheck { + code?: string; + id: string; + message?: string; +} + +// @public +export interface ScreenshotAction extends ComputerUseAction { + type: "screenshot"; +} + +// @public +export interface ScrollAction extends ComputerUseAction { + scrollX: number; + scrollY: number; + type: "scroll"; + x: number; + y: number; +} + // @public export interface SharepointGroundingToolParameters { connectionList?: ToolConnection[]; @@ -1588,6 +1730,15 @@ export interface SharepointToolDefinition extends ToolDefinition { type: "sharepoint_grounding"; } +// @public +export interface StructuredToolOutput { + toolCallId?: string; + type: string; +} + +// @public +export type StructuredToolOutputUnion = ToolOutput | ComputerToolOutput | StructuredToolOutput; + // @public export interface SubmitToolApprovalAction extends RequiredAction { submitToolApproval: SubmitToolApprovalDetails; @@ -1736,12 +1887,12 @@ export interface ToolDefinition { } // @public -export type ToolDefinitionUnion = CodeInterpreterToolDefinition | FileSearchToolDefinition | FunctionToolDefinition | BingGroundingToolDefinition | MicrosoftFabricToolDefinition | SharepointToolDefinition | AzureAISearchToolDefinition | OpenApiToolDefinition | BingCustomSearchToolDefinition | ConnectedAgentToolDefinition | DeepResearchToolDefinition | MCPToolDefinition | AzureFunctionToolDefinition | BrowserAutomationToolDefinition | ToolDefinition; +export type ToolDefinitionUnion = CodeInterpreterToolDefinition | FileSearchToolDefinition | FunctionToolDefinition | BingGroundingToolDefinition | MicrosoftFabricToolDefinition | SharepointToolDefinition | AzureAISearchToolDefinition | OpenApiToolDefinition | BingCustomSearchToolDefinition | ConnectedAgentToolDefinition | DeepResearchToolDefinition | MCPToolDefinition | ComputerUseToolDefinition | AzureFunctionToolDefinition | BrowserAutomationToolDefinition | ToolDefinition; // @public -export interface ToolOutput { +export interface ToolOutput extends StructuredToolOutput { output?: string; - toolCallId?: string; + type: "function_call_output"; } // @public @@ -1806,6 +1957,10 @@ export class ToolUtility { definition: CodeInterpreterToolDefinition; resources: ToolResources; }; + // (undocumented) + static createComputerUseTool(displayWidth: number, displayHeight: number, env: ComputerUseEnvironment): { + definition: ComputerUseToolDefinition; + }; static createConnectedAgentTool(id: string, name: string, description: string): { definition: ConnectedAgentToolDefinition; }; @@ -1838,6 +1993,12 @@ export interface TruncationObject { // @public export type TruncationStrategy = "auto" | "last_messages"; +// @public +export interface TypeAction extends ComputerUseAction { + text: string; + type: "type"; +} + // @public export interface UpdateAgentOptionalParams extends OperationOptions { description?: string | null; @@ -2126,6 +2287,11 @@ export interface VectorStoreStaticChunkingStrategyResponse extends VectorStoreCh // @public export type VectorStoreStatus = "expired" | "in_progress" | "completed"; +// @public +export interface WaitAction extends ComputerUseAction { + type: "wait"; +} + // (No @packageDocumentation comment for this package) ``` diff --git a/sdk/ai/ai-agents/samples-dev/agentsComputerUse.ts b/sdk/ai/ai-agents/samples-dev/agentsComputerUse.ts new file mode 100644 index 
000000000000..24c1d501da8b --- /dev/null +++ b/sdk/ai/ai-agents/samples-dev/agentsComputerUse.ts @@ -0,0 +1,230 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +/** + * This sample demonstrates how to use agent operations with the Computer Use tool (preview). + * The sample uses fake screenshots to demonstrate how output actions work; a real implementation + * would map each output action type to the corresponding API call in the user's preferred + * managed environment framework (e.g. Playwright or Docker). + * + * NOTE: Usage of the computer-use-preview model currently requires approval. Please see + * https://learn.microsoft.com/azure/ai-foundry/openai/how-to/computer-use for more information. + * + * @summary demonstrates how to use agent operations with the Computer Use tool. + */ + +import type { + ComputerUseEnvironment, + MessageInputContentBlock, + MessageInputTextBlock, + MessageInputImageUrlBlock, + MessageImageUrlParam, + RunStepToolCallDetails, + RunStepComputerUseToolCall, + RequiredComputerUseToolCall, + SubmitToolOutputsAction, + TypeAction, + ScreenshotAction, + ComputerScreenshot, + StructuredToolOutputUnion, +} from "@azure/ai-agents"; +import { AgentsClient, isOutputOfType, ToolUtility } from "@azure/ai-agents"; +import { DefaultAzureCredential } from "@azure/identity"; +import { readFileSync, existsSync } from "fs"; +import { join, dirname } from "path"; +import { fileURLToPath } from "url"; +import "dotenv/config"; + +const projectEndpoint = process.env["PROJECT_ENDPOINT"] || ""; +const modelDeploymentName = process.env["COMPUTER_USE_MODEL"] || "computer-use-preview"; +const environment = (process.env["COMPUTER_USE_ENVIRONMENT"] as ComputerUseEnvironment) || "windows"; + +/** + * Convert an image file to a Base64-encoded string. + * + * @param imagePath - The path to the image file (e.g. 'image_file.png') + * @returns A Base64-encoded string representing the image. + * @throws Error if the provided file path does not exist or there's an error reading the file. + */ +function imageToBase64(imagePath: string): string { + if (!existsSync(imagePath)) { + throw new Error(`File not found at: ${imagePath}`); + } + + try { + const fileData = readFileSync(imagePath); + return fileData.toString("base64"); + } catch (error) { + throw new Error(`Error reading file '${imagePath}': ${error}`); + } +} + +export async function main(): Promise<void> { + // Get the directory of the current script + let currentDir: string; + try { + currentDir = dirname(fileURLToPath(import.meta.url)); + } catch { + // Fallback for environments where import.meta.url is not available + currentDir = typeof __dirname !== "undefined" ? __dirname : process.cwd(); + } + + const assetFilePath = join(currentDir, "../data/cua_screenshot.jpg"); + const actionResultFilePath = join(currentDir, "../data/cua_screenshot_next.jpg"); + + // Create an Azure AI Agents Client + const client = new AgentsClient(projectEndpoint, new DefaultAzureCredential()); + + // Initialize Computer Use tool with a browser-sized viewport + const computerUse = ToolUtility.createComputerUseTool(1026, 769, environment); + + // Create a new Agent that has the Computer Use tool attached. + const agent = await client.createAgent(modelDeploymentName, { + name: "my-agent-computer-use", + instructions: ` + You are a computer automation assistant. + Use the computer_use_preview tool to interact with the screen when needed. 
+ `, + tools: [computerUse.definition], + }); + + console.log(`Created agent, ID: ${agent.id}`); + + // Create thread for communication + const thread = await client.threads.create(); + console.log(`Created thread, ID: ${thread.id}`); + + const inputMessage = + "I can see a web browser with bing.com open and the cursor in the search box. " + + "Type 'movies near me' without pressing Enter or any other key. Only type 'movies near me'."; + + const imageBase64 = imageToBase64(assetFilePath); + const imgUrl = `data:image/jpeg;base64,${imageBase64}`; + const urlParam: MessageImageUrlParam = { url: imgUrl, detail: "high" }; + console.log("urlParam ", urlParam); + + const textBlock: MessageInputTextBlock = { type: "text", text: inputMessage }; + const imageBlock: MessageInputImageUrlBlock = { type: "image_url", imageUrl: urlParam }; + const contentBlocks: MessageInputContentBlock[] = [ + textBlock, + imageBlock, + ]; + + // Create message to thread + const message = await client.messages.create(thread.id, "user", contentBlocks); + console.log(`Created message, ID: ${message.id}`); + + const run = await client.runs.create(thread.id, agent.id); + console.log(`Created run, ID: ${run.id}`); + + // Create a fake screenshot showing the text typed in + const resultImageBase64 = imageToBase64(actionResultFilePath); + const resultImgUrl = `data:image/jpeg;base64,${resultImageBase64}`; + const computerScreenshot: ComputerScreenshot = { + type: "computer_screenshot", + imageUrl: resultImgUrl + }; + + while (run.status === "queued" || run.status === "in_progress" || run.status === "requires_action") { + await new Promise(resolve => setTimeout(resolve, 1000)); + const updatedRun = await client.runs.get(thread.id, run.id); + + if (updatedRun.status === "requires_action" && updatedRun.requiredAction) { + if (isOutputOfType<SubmitToolOutputsAction>(updatedRun.requiredAction, "submit_tool_outputs")) { + console.log("Run requires action:"); + const toolCalls = updatedRun.requiredAction.submitToolOutputs.toolCalls; + + if (!toolCalls || toolCalls.length === 0) { + console.log("No tool calls provided - cancelling run"); + await client.runs.cancel(thread.id, run.id); + break; + } + + const toolOutputs: StructuredToolOutputUnion[] = []; + for (const toolCall of toolCalls) { + if (isOutputOfType<RequiredComputerUseToolCall>(toolCall, "computer_use_preview")) { + console.log(toolCall); + try { + const action = toolCall.computerUsePreview.action; + console.log(`Executing computer use action: ${action.type}`); + + if (isOutputOfType<TypeAction>(action, "type")) { + console.log(` Text to type: ${action.text}`); + // (add hook to input text in managed environment API here) + + toolOutputs.push({ + type: "computer_call_output", + toolCallId: toolCall.id, + output: computerScreenshot, + }); + } else if (isOutputOfType<ScreenshotAction>(action, "screenshot")) { + console.log(" Screenshot requested"); + // (add hook to take screenshot in managed environment API here) + + toolOutputs.push({ + type: "computer_call_output", + toolCallId: toolCall.id, + output: computerScreenshot, + }); + } + } catch (error) { + console.log(`Error executing tool_call ${toolCall.id}: ${error}`); + } + } + } + + console.log(`Tool outputs: ${JSON.stringify(toolOutputs, null, 2)}`); + if (toolOutputs.length > 0) { + await client.runs.submitToolOutputs(thread.id, run.id, toolOutputs); + } + } + } + + // Update run status for the loop condition + const currentRun = await client.runs.get(thread.id, run.id); + Object.assign(run, currentRun); + console.log(`Current run status: ${run.status}`); + } + + console.log(`Run completed with 
status: ${run.status}`); + if (run.status === "failed") { + console.log(`Run failed: ${JSON.stringify(run.lastError)}`); + } + + // Fetch run steps to get the details of the agent run + const runStepsIterator = client.runSteps.list(thread.id, run.id); + console.log("\nRun Steps:"); + + for await (const step of runStepsIterator) { + console.log(`Step ${step.id} status: ${step.status}`); + console.log(step); + + if (isOutputOfType<RunStepToolCallDetails>(step.stepDetails, "tool_calls")) { + console.log(" Tool calls:"); + const runStepToolCalls = step.stepDetails.toolCalls; + + for (const call of runStepToolCalls) { + console.log(` Tool call ID: ${call.id}`); + console.log(` Tool call type: ${call.type}`); + + if (isOutputOfType<RunStepComputerUseToolCall>(call, "computer_use_preview")) { + const details = call.computerUsePreview; + console.log(` Computer use action type: ${details.action.type}`); + } + + console.log(); // extra newline between tool calls + } + } + + console.log(); // extra newline between run steps + } + + // Optional: Delete the agent once the run is finished. + await client.deleteAgent(agent.id); + console.log("Deleted agent"); +} + +main().catch((err) => { + console.error("The sample encountered an error:", err); + process.exit(1); +}); diff --git a/sdk/ai/ai-agents/samples/v1-beta/javascript/README.md b/sdk/ai/ai-agents/samples/v1-beta/javascript/README.md index 51994300d8ba..e0b46e180a36 100644 --- a/sdk/ai/ai-agents/samples/v1-beta/javascript/README.md +++ b/sdk/ai/ai-agents/samples/v1-beta/javascript/README.md @@ -20,6 +20,7 @@ These sample programs show how to use the JavaScript client libraries for Azure | [agentsBingGrounding.js][agentsbinggrounding] | demonstrates how to use agent operations with the Grounding with Bing Search tool. | | [agentsBingGroundingWithStreaming.js][agentsbinggroundingwithstreaming] | demonstrates how to use agent operations with the Grounding with Bing Search tool using streaming. | | [agentsBrowserAutomation.js][agentsbrowserautomation] | demonstrates how to use agent operations with the Browser Automation tool. | +| [agentsComputerUse.js][agentscomputeruse] | demonstrates how to use agent operations with the Computer Use tool. | | [agentsConnectedAgents.js][agentsconnectedagents] | This sample demonstrates how to use Agent operations with the Connected Agent tool from the Azure Agents service. | | [agentsImageInputWithBase64.js][agentsimageinputwithbase64] | This sample demonstrates how to use basic agent operations with image input (base64 encoded) for the Azure Agents service. | | [agentsImageInputWithFile.js][agentsimageinputwithfile] | This sample demonstrates how to use basic agent operations using image file input for the Azure Agents service. 
| @@ -92,6 +93,7 @@ Take a look at our [API Documentation][apiref] for more information about the AP [agentsbinggrounding]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsBingGrounding.js [agentsbinggroundingwithstreaming]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsBingGroundingWithStreaming.js [agentsbrowserautomation]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsBrowserAutomation.js +[agentscomputeruse]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsComputerUse.js [agentsconnectedagents]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsConnectedAgents.js [agentsimageinputwithbase64]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsImageInputWithBase64.js [agentsimageinputwithfile]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsImageInputWithFile.js diff --git a/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsComputerUse.js b/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsComputerUse.js new file mode 100644 index 000000000000..fb1f6b294da4 --- /dev/null +++ b/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsComputerUse.js @@ -0,0 +1,211 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +/** + * This sample demonstrates how to use agent operations with the Computer Use tool (preview). + * The sample uses fake screenshots to demonstrate how output actions work; a real implementation + * would map each output action type to the corresponding API call in the user's preferred + * managed environment framework (e.g. Playwright or Docker). + * + * NOTE: Usage of the computer-use-preview model currently requires approval. Please see + * https://learn.microsoft.com/azure/ai-foundry/openai/how-to/computer-use for more information. + * + * @summary demonstrates how to use agent operations with the Computer Use tool. + */ + +const { AgentsClient, isOutputOfType, ToolUtility } = require("@azure/ai-agents"); +const { DefaultAzureCredential } = require("@azure/identity"); +const { readFileSync, existsSync } = require("fs"); +const { join, dirname } = require("path"); +const { fileURLToPath } = require("url"); +require("dotenv/config"); + +const projectEndpoint = process.env["PROJECT_ENDPOINT"] || ""; +const modelDeploymentName = process.env["COMPUTER_USE_MODEL"] || "computer-use-preview"; +const environment = process.env["COMPUTER_USE_ENVIRONMENT"] || "windows"; + +/** + * Convert an image file to a Base64-encoded string. + * + * @param imagePath - The path to the image file (e.g. 'image_file.png') + * @returns A Base64-encoded string representing the image. + * @throws Error if the provided file path does not exist or there's an error reading the file. 
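+ * + * @example + * // Mirrors how main() below builds the data URL for the input screenshot: + * const imageBase64 = imageToBase64(join(__dirname, "../data/cua_screenshot.jpg")); + * const imgUrl = `data:image/jpeg;base64,${imageBase64}`;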
+ */ +function imageToBase64(imagePath) { + if (!existsSync(imagePath)) { + throw new Error(`File not found at: ${imagePath}`); + } + + try { + const fileData = readFileSync(imagePath); + return fileData.toString("base64"); + } catch (error) { + throw new Error(`Error reading file '${imagePath}': ${error}`); + } +} + +async function main() { + // Get the directory of the current script + const currentDir = __dirname; + const assetFilePath = join(currentDir, "../data/cua_screenshot.jpg"); + const actionResultFilePath = join(currentDir, "../data/cua_screenshot_next.jpg"); + + // Create an Azure AI Agents Client + const client = new AgentsClient(projectEndpoint, new DefaultAzureCredential()); + + // Initialize Computer Use tool with a browser-sized viewport + const computerUse = ToolUtility.createComputerUseTool(1026, 769, environment); + + // Create a new Agent that has the Computer Use tool attached. + const agent = await client.createAgent(modelDeploymentName, { + name: "my-agent-computer-use", + instructions: ` + You are a computer automation assistant. + Use the computer_use_preview tool to interact with the screen when needed. + `, + tools: [computerUse.definition], + }); + + console.log(`Created agent, ID: ${agent.id}`); + + // Create thread for communication + const thread = await client.threads.create(); + console.log(`Created thread, ID: ${thread.id}`); + + const inputMessage = + "I can see a web browser with bing.com open and the cursor in the search box. " + + "Type 'movies near me' without pressing Enter or any other key. Only type 'movies near me'."; + + const imageBase64 = imageToBase64(assetFilePath); + const imgUrl = `data:image/jpeg;base64,${imageBase64}`; + const urlParam = { url: imgUrl, detail: "high" }; + console.log("urlParam ", urlParam); + + const textBlock = { type: "text", text: inputMessage }; + const imageBlock = { type: "image_url", imageUrl: urlParam }; + const contentBlocks = [textBlock, imageBlock]; + + // Create message to thread + const message = await client.messages.create(thread.id, "user", contentBlocks); + console.log(`Created message, ID: ${message.id}`); + + const run = await client.runs.create(thread.id, agent.id); + console.log(`Created run, ID: ${run.id}`); + + // Create a fake screenshot showing the text typed in + const resultImageBase64 = imageToBase64(actionResultFilePath); + const resultImgUrl = `data:image/jpeg;base64,${resultImageBase64}`; + const computerScreenshot = { + type: "computer_screenshot", + imageUrl: resultImgUrl, + }; + + while ( + run.status === "queued" || + run.status === "in_progress" || + run.status === "requires_action" + ) { + await new Promise((resolve) => setTimeout(resolve, 1000)); + const updatedRun = await client.runs.get(thread.id, run.id); + + if (updatedRun.status === "requires_action" && updatedRun.requiredAction) { + if (isOutputOfType(updatedRun.requiredAction, "submit_tool_outputs")) { + console.log("Run requires action:"); + const toolCalls = updatedRun.requiredAction.submitToolOutputs.toolCalls; + + if (!toolCalls || toolCalls.length === 0) { + console.log("No tool calls provided - cancelling run"); + await client.runs.cancel(thread.id, run.id); + break; + } + + const toolOutputs = []; + for (const toolCall of toolCalls) { + if (isOutputOfType(toolCall, "computer_use_preview")) { + console.log(toolCall); + try { + const action = toolCall.computerUsePreview.action; + console.log(`Executing computer use action: ${action.type}`); + + if (isOutputOfType(action, "type")) { + console.log(` Text to type: 
${action.text}`); + // (add hook to input text in managed environment API here) + + toolOutputs.push({ + type: "computer_call_output", + toolCallId: toolCall.id, + output: computerScreenshot, + }); + } else if (isOutputOfType(action, "screenshot")) { + console.log(" Screenshot requested"); + // (add hook to take screenshot in managed environment API here) + + toolOutputs.push({ + type: "computer_call_output", + toolCallId: toolCall.id, + output: computerScreenshot, + }); + } + } catch (error) { + console.log(`Error executing tool_call ${toolCall.id}: ${error}`); + } + } + } + + console.log(`Tool outputs: ${JSON.stringify(toolOutputs, null, 2)}`); + if (toolOutputs.length > 0) { + await client.runs.submitToolOutputs(thread.id, run.id, toolOutputs); + } + } + } + + // Update run status for the loop condition + const currentRun = await client.runs.get(thread.id, run.id); + Object.assign(run, currentRun); + console.log(`Current run status: ${run.status}`); + } + + console.log(`Run completed with status: ${run.status}`); + if (run.status === "failed") { + console.log(`Run failed: ${JSON.stringify(run.lastError)}`); + } + + // Fetch run steps to get the details of the agent run + const runStepsIterator = client.runSteps.list(thread.id, run.id); + console.log("\nRun Steps:"); + + for await (const step of runStepsIterator) { + console.log(`Step ${step.id} status: ${step.status}`); + console.log(step); + + if (isOutputOfType(step.stepDetails, "tool_calls")) { + console.log(" Tool calls:"); + const runStepToolCalls = step.stepDetails.toolCalls; + + for (const call of runStepToolCalls) { + console.log(` Tool call ID: ${call.id}`); + console.log(` Tool call type: ${call.type}`); + + if (isOutputOfType(call, "computer_use_preview")) { + const details = call.computerUsePreview; + console.log(` Computer use action type: ${details.action.type}`); + } + + console.log(); // extra newline between tool calls + } + } + + console.log(); // extra newline between run steps + } + + // Optional: Delete the agent once the run is finished. + await client.deleteAgent(agent.id); + console.log("Deleted agent"); +} + +main().catch((err) => { + console.error("The sample encountered an error:", err); + process.exit(1); +}); + +module.exports = { main }; diff --git a/sdk/ai/ai-agents/samples/v1-beta/javascript/data/cua_screenshot.jpg b/sdk/ai/ai-agents/samples/v1-beta/javascript/data/cua_screenshot.jpg new file mode 100644 index 000000000000..12f21cb9dec8 Binary files /dev/null and b/sdk/ai/ai-agents/samples/v1-beta/javascript/data/cua_screenshot.jpg differ diff --git a/sdk/ai/ai-agents/samples/v1-beta/javascript/data/cua_screenshot_next.jpg b/sdk/ai/ai-agents/samples/v1-beta/javascript/data/cua_screenshot_next.jpg new file mode 100644 index 000000000000..0d9633596fce Binary files /dev/null and b/sdk/ai/ai-agents/samples/v1-beta/javascript/data/cua_screenshot_next.jpg differ diff --git a/sdk/ai/ai-agents/samples/v1-beta/typescript/README.md b/sdk/ai/ai-agents/samples/v1-beta/typescript/README.md index 3e2ff2e31af4..c1c6595976bf 100644 --- a/sdk/ai/ai-agents/samples/v1-beta/typescript/README.md +++ b/sdk/ai/ai-agents/samples/v1-beta/typescript/README.md @@ -20,6 +20,7 @@ These sample programs show how to use the TypeScript client libraries for Azure | [agentsBingGrounding.ts][agentsbinggrounding] | demonstrates how to use agent operations with the Grounding with Bing Search tool. 
| | [agentsBingGroundingWithStreaming.ts][agentsbinggroundingwithstreaming] | demonstrates how to use agent operations with the Grounding with Bing Search tool using streaming. | | [agentsBrowserAutomation.ts][agentsbrowserautomation] | demonstrates how to use agent operations with the Browser Automation tool. | +| [agentsComputerUse.ts][agentscomputeruse] | demonstrates how to use agent operations with the Computer Use tool. | | [agentsConnectedAgents.ts][agentsconnectedagents] | This sample demonstrates how to use Agent operations with the Connected Agent tool from the Azure Agents service. | | [agentsImageInputWithBase64.ts][agentsimageinputwithbase64] | This sample demonstrates how to use basic agent operations with image input (base64 encoded) for the Azure Agents service. | | [agentsImageInputWithFile.ts][agentsimageinputwithfile] | This sample demonstrates how to use basic agent operations using image file input for the Azure Agents service. | @@ -104,6 +105,7 @@ Take a look at our [API Documentation][apiref] for more information about the AP [agentsbinggrounding]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsBingGrounding.ts [agentsbinggroundingwithstreaming]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsBingGroundingWithStreaming.ts [agentsbrowserautomation]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsBrowserAutomation.ts +[agentscomputeruse]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsComputerUse.ts [agentsconnectedagents]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsConnectedAgents.ts [agentsimageinputwithbase64]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsImageInputWithBase64.ts [agentsimageinputwithfile]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsImageInputWithFile.ts diff --git a/sdk/ai/ai-agents/samples/v1-beta/typescript/data/cua_screenshot.jpg b/sdk/ai/ai-agents/samples/v1-beta/typescript/data/cua_screenshot.jpg new file mode 100644 index 000000000000..12f21cb9dec8 Binary files /dev/null and b/sdk/ai/ai-agents/samples/v1-beta/typescript/data/cua_screenshot.jpg differ diff --git a/sdk/ai/ai-agents/samples/v1-beta/typescript/data/cua_screenshot_next.jpg b/sdk/ai/ai-agents/samples/v1-beta/typescript/data/cua_screenshot_next.jpg new file mode 100644 index 000000000000..0d9633596fce Binary files /dev/null and b/sdk/ai/ai-agents/samples/v1-beta/typescript/data/cua_screenshot_next.jpg differ diff --git a/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsComputerUse.ts b/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsComputerUse.ts new file mode 100644 index 000000000000..1ba10d035be3 --- /dev/null +++ b/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsComputerUse.ts @@ -0,0 +1,230 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +/** + * This sample demonstrates how to use agent operations with the Computer Use tool (preview). + * The sample uses fake screenshots to demonstrate how output actions work; a real implementation + * would map each output action type to the corresponding API call in the user's preferred + * managed environment framework (e.g. 
Playwright or Docker). + * + * NOTE: Usage of the computer-use-preview model currently requires approval. Please see + * https://learn.microsoft.com/azure/ai-foundry/openai/how-to/computer-use for more information. + * + * @summary demonstrates how to use agent operations with the Computer Use tool. + */ + +import type { + ComputerUseEnvironment, + MessageInputContentBlock, + MessageInputTextBlock, + MessageInputImageUrlBlock, + MessageImageUrlParam, + RunStepToolCallDetails, + RunStepComputerUseToolCall, + RequiredComputerUseToolCall, + SubmitToolOutputsAction, + TypeAction, + ScreenshotAction, + ComputerScreenshot, + StructuredToolOutputUnion, +} from "@azure/ai-agents"; +import { AgentsClient, isOutputOfType, ToolUtility } from "@azure/ai-agents"; +import { DefaultAzureCredential } from "@azure/identity"; +import { readFileSync, existsSync } from "fs"; +import { join, dirname } from "path"; +import { fileURLToPath } from "url"; +import "dotenv/config"; + +const projectEndpoint = process.env["PROJECT_ENDPOINT"] || ""; +const modelDeploymentName = process.env["COMPUTER_USE_MODEL"] || "computer-use-preview"; +const environment = (process.env["COMPUTER_USE_ENVIRONMENT"] as ComputerUseEnvironment) || "windows"; + +/** + * Convert an image file to a Base64-encoded string. + * + * @param imagePath - The path to the image file (e.g. 'image_file.png') + * @returns A Base64-encoded string representing the image. + * @throws Error if the provided file path does not exist or there's an error reading the file. + */ +function imageToBase64(imagePath: string): string { + if (!existsSync(imagePath)) { + throw new Error(`File not found at: ${imagePath}`); + } + + try { + const fileData = readFileSync(imagePath); + return fileData.toString("base64"); + } catch (error) { + throw new Error(`Error reading file '${imagePath}': ${error}`); + } +} + +export async function main(): Promise<void> { + // Get the directory of the current script + let currentDir: string; + try { + currentDir = dirname(fileURLToPath(import.meta.url)); + } catch { + // Fallback for environments where import.meta.url is not available + currentDir = process.cwd(); + } + + const assetFilePath = join(currentDir, "../data/cua_screenshot.jpg"); + const actionResultFilePath = join(currentDir, "../data/cua_screenshot_next.jpg"); + + // Create an Azure AI Agents Client + const client = new AgentsClient(projectEndpoint, new DefaultAzureCredential()); + + // Initialize Computer Use tool with a browser-sized viewport + const computerUse = ToolUtility.createComputerUseTool(1026, 769, environment); + + // Create a new Agent that has the Computer Use tool attached. + const agent = await client.createAgent(modelDeploymentName, { + name: "my-agent-computer-use", + instructions: ` + You are a computer automation assistant. + Use the computer_use_preview tool to interact with the screen when needed. + `, + tools: [computerUse.definition], + }); + + console.log(`Created agent, ID: ${agent.id}`); + + // Create thread for communication + const thread = await client.threads.create(); + console.log(`Created thread, ID: ${thread.id}`); + + const inputMessage = + "I can see a web browser with bing.com open and the cursor in the search box. " + + "Type 'movies near me' without pressing Enter or any other key. 
Only type 'movies near me'."; + + const imageBase64 = imageToBase64(assetFilePath); + const imgUrl = `data:image/jpeg;base64,${imageBase64}`; + const urlParam: MessageImageUrlParam = { url: imgUrl, detail: "high" }; + console.log("urlParam ", urlParam); + + const textBlock: MessageInputTextBlock = { type: "text", text: inputMessage }; + const imageBlock: MessageInputImageUrlBlock = { type: "image_url", imageUrl: urlParam }; + const contentBlocks: MessageInputContentBlock[] = [ + textBlock, + imageBlock, + ]; + + // Create message to thread + const message = await client.messages.create(thread.id, "user", contentBlocks); + console.log(`Created message, ID: ${message.id}`); + + const run = await client.runs.create(thread.id, agent.id); + console.log(`Created run, ID: ${run.id}`); + + // Create a fake screenshot showing the text typed in + const resultImageBase64 = imageToBase64(actionResultFilePath); + const resultImgUrl = `data:image/jpeg;base64,${resultImageBase64}`; + const computerScreenshot: ComputerScreenshot = { + type: "computer_screenshot", + imageUrl: resultImgUrl + }; + + while (run.status === "queued" || run.status === "in_progress" || run.status === "requires_action") { + await new Promise(resolve => setTimeout(resolve, 1000)); + const updatedRun = await client.runs.get(thread.id, run.id); + + if (updatedRun.status === "requires_action" && updatedRun.requiredAction) { + if (isOutputOfType(updatedRun.requiredAction, "submit_tool_outputs")) { + console.log("Run requires action:"); + const toolCalls = updatedRun.requiredAction.submitToolOutputs.toolCalls; + + if (!toolCalls || toolCalls.length === 0) { + console.log("No tool calls provided - cancelling run"); + await client.runs.cancel(thread.id, run.id); + break; + } + + const toolOutputs: StructuredToolOutputUnion[] = []; + for (const toolCall of toolCalls) { + if (isOutputOfType(toolCall, "computer_use_preview")) { + console.log(toolCall); + try { + const action = toolCall.computerUsePreview.action; + console.log(`Executing computer use action: ${action.type}`); + + if (isOutputOfType(action, "type")) { + console.log(` Text to type: ${action.text}`); + // (add hook to input text in managed environment API here) + + toolOutputs.push({ + type: "computer_call_output", + toolCallId: toolCall.id, + output: computerScreenshot, + }); + } else if (isOutputOfType(action, "screenshot")) { + console.log(" Screenshot requested"); + // (add hook to take screenshot in managed environment API here) + + toolOutputs.push({ + type: "computer_call_output", + toolCallId: toolCall.id, + output: computerScreenshot, + }); + } + } catch (error) { + console.log(`Error executing tool_call ${toolCall.id}: ${error}`); + } + } + } + + console.log(`Tool outputs: ${JSON.stringify(toolOutputs, null, 2)}`); + if (toolOutputs.length > 0) { + await client.runs.submitToolOutputs(thread.id, run.id, toolOutputs); + } + } + } + + // Update run status for the loop condition + const currentRun = await client.runs.get(thread.id, run.id); + Object.assign(run, currentRun); + console.log(`Current run status: ${run.status}`); + } + + console.log(`Run completed with status: ${run.status}`); + if (run.status === "failed") { + console.log(`Run failed: ${JSON.stringify(run.lastError)}`); + } + + // Fetch run steps to get the details of the agent run + const runStepsIterator = client.runSteps.list(thread.id, run.id); + console.log("\nRun Steps:"); + + for await (const step of runStepsIterator) { + console.log(`Step ${step.id} status: ${step.status}`); + console.log(step); + + if 
(isOutputOfType<RunStepToolCallDetails>(step.stepDetails, "tool_calls")) { + console.log(" Tool calls:"); + const runStepToolCalls = step.stepDetails.toolCalls; + + for (const call of runStepToolCalls) { + console.log(` Tool call ID: ${call.id}`); + console.log(` Tool call type: ${call.type}`); + + if (isOutputOfType<RunStepComputerUseToolCall>(call, "computer_use_preview")) { + const details = call.computerUsePreview; + console.log(` Computer use action type: ${details.action.type}`); + } + + console.log(); // extra newline between tool calls + } + } + + console.log(); // extra newline between run steps + } + + // Optional: Delete the agent once the run is finished. + await client.deleteAgent(agent.id); + console.log("Deleted agent"); +} + +main().catch((err) => { + console.error("The sample encountered an error:", err); + process.exit(1); +}); diff --git a/sdk/ai/ai-agents/src/api/agentsContext.ts b/sdk/ai/ai-agents/src/api/agentsContext.ts index 140305936983..282567833a2b 100644 --- a/sdk/ai/ai-agents/src/api/agentsContext.ts +++ b/sdk/ai/ai-agents/src/api/agentsContext.ts @@ -26,7 +26,7 @@ export function createAgents( ): AgentsContext { const endpointUrl = options.endpoint ?? String(endpointParam); const prefixFromOptions = options?.userAgentOptions?.userAgentPrefix; - const userAgentInfo = `azsdk-js-ai-agents/1.2.0-beta.2`; + const userAgentInfo = `azsdk-js-ai-agents/1.2.0-beta.3`; const userAgentPrefix = prefixFromOptions ? `${prefixFromOptions} azsdk-js-api ${userAgentInfo}` : `azsdk-js-api ${userAgentInfo}`; diff --git a/sdk/ai/ai-agents/src/api/runs/operations.ts b/sdk/ai/ai-agents/src/api/runs/operations.ts index f2bc731e11f2..ab1af408e892 100644 --- a/sdk/ai/ai-agents/src/api/runs/operations.ts +++ b/sdk/ai/ai-agents/src/api/runs/operations.ts @@ -13,7 +13,7 @@ import { agentsToolChoiceOptionSerializer, threadRunDeserializer, _agentsPagedResultThreadRunDeserializer, - toolOutputArraySerializer, + structuredToolOutputUnionArraySerializer, toolApprovalArraySerializer, } from "../../models/models.js"; import type { @@ -108,7 +108,7 @@ export function _submitToolOutputsToRunSend( }, body: { tool_outputs: options?.toolOutputs?.length - ? toolOutputArraySerializer(options?.toolOutputs) + ? structuredToolOutputUnionArraySerializer(options?.toolOutputs) : undefined, tool_approvals: options?.toolApprovals?.length ? toolApprovalArraySerializer(options?.toolApprovals) diff --git a/sdk/ai/ai-agents/src/api/runs/options.ts b/sdk/ai/ai-agents/src/api/runs/options.ts index dfcfb8925245..84070177b207 100644 --- a/sdk/ai/ai-agents/src/api/runs/options.ts +++ b/sdk/ai/ai-agents/src/api/runs/options.ts @@ -8,7 +8,7 @@ import type { ThreadMessageOptions, TruncationObject, AgentsToolChoiceOption, - ToolOutput, + StructuredToolOutputUnion, ToolApproval, ListSortOrder, RunAdditionalFieldList, @@ -22,7 +22,7 @@ export interface RunsCancelRunOptionalParams extends OperationOptions {} /** Optional parameters. */ export interface RunsSubmitToolOutputsToRunOptionalParams extends OperationOptions { /** A list of tools for which the outputs are being submitted */ - toolOutputs?: ToolOutput[]; + toolOutputs?: StructuredToolOutputUnion[]; /** A list of tool approvals allowing data to be sent to tools. */ toolApprovals?: ToolApproval[]; /** If true, returns a stream of events that happen during the Run as SSE, terminating at `[DONE]`. 
*/ diff --git a/sdk/ai/ai-agents/src/classic/runs/index.ts b/sdk/ai/ai-agents/src/classic/runs/index.ts index 5b4c60ca9a00..e3d0b90a6930 100644 --- a/sdk/ai/ai-agents/src/classic/runs/index.ts +++ b/sdk/ai/ai-agents/src/classic/runs/index.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import { AgentsContext } from "../../api/agentsContext.js"; -import { ThreadRun, ToolOutput } from "../../models/models.js"; +import { ThreadRun, StructuredToolOutputUnion } from "../../models/models.js"; import { RunsCancelRunOptionalParams, RunsSubmitToolOutputsToRunOptionalParams, @@ -38,7 +38,7 @@ export interface RunsOperations { submitToolOutputs: ( threadId: string, runId: string, - toolOutputs: ToolOutput[], + toolOutputs: StructuredToolOutputUnion[], options?: RunsSubmitToolOutputsToRunOptionalParams, ) => AgentRunResponse; /** Modifies an existing thread run. */ @@ -80,7 +80,7 @@ function _getRuns(context: AgentsContext) { submitToolOutputs: ( threadId: string, runId: string, - toolOutputs: ToolOutput[], + toolOutputs: StructuredToolOutputUnion[], options?: RunsSubmitToolOutputsToRunOptionalParams, ) => submitToolOutputsToRun(context, threadId, runId, { diff --git a/sdk/ai/ai-agents/src/constants.ts b/sdk/ai/ai-agents/src/constants.ts index a60ede045955..cadca3854166 100644 --- a/sdk/ai/ai-agents/src/constants.ts +++ b/sdk/ai/ai-agents/src/constants.ts @@ -4,7 +4,7 @@ /** * Current version of the `@azure/ai-agents` package. */ -export const SDK_VERSION = `1.2.0-beta.2`; +export const SDK_VERSION = `1.2.0-beta.3`; /** * The package name of the `@azure/ai-agents` package. diff --git a/sdk/ai/ai-agents/src/index.ts b/sdk/ai/ai-agents/src/index.ts index 15e6467e03d2..f8a037883919 100644 --- a/sdk/ai/ai-agents/src/index.ts +++ b/sdk/ai/ai-agents/src/index.ts @@ -49,6 +49,9 @@ export { DeepResearchDetails, DeepResearchBingGroundingConnection, MCPToolDefinition, + ComputerUseToolDefinition, + ComputerUseToolParameters, + ComputerUseEnvironment, AzureFunctionToolDefinition, AzureFunctionDefinition, AzureFunctionBinding, @@ -113,6 +116,22 @@ export { RequiredFunctionToolCall, RequiredFunctionToolCallDetails, RequiredMcpToolCall, + RequiredComputerUseToolCall, + RequiredComputerUseToolCallDetails, + ComputerUseAction, + ComputerUseActionUnion, + ClickAction, + MouseButton, + DoubleClickAction, + DragAction, + CoordinatePoint, + KeyPressAction, + MoveAction, + ScreenshotAction, + ScrollAction, + TypeAction, + WaitAction, + SafetyCheck, SubmitToolApprovalAction, SubmitToolApprovalDetails, RunError, @@ -140,7 +159,11 @@ export { MessageImageFileContent, MessageImageFileDetails, MessageDeletionStatus, + StructuredToolOutput, + StructuredToolOutputUnion, ToolOutput, + ComputerToolOutput, + ComputerScreenshot, ToolApproval, RunStep, RunStepType, @@ -169,6 +192,8 @@ export { BrowserAutomationToolCallDetails, BrowserAutomationToolCallStep, RunStepMcpToolCall, + RunStepComputerUseToolCall, + RunStepComputerUseToolCallDetails, RunStepSharepointToolCall, RunStepMicrosoftFabricToolCall, RunStepBingCustomSearchToolCall, @@ -261,6 +286,8 @@ export { RunStepDeltaAzureFunctionToolCall, RunStepDeltaDeepResearchToolCall, RunStepDeltaAzureAISearchToolCall, + RunStepDeltaComputerUseToolCall, + RunStepDeltaComputerUseDetails, RunStepDeltaMicrosoftFabricToolCall, RunStepDeltaSharepointToolCall, RunStepDeltaMCPObject, diff --git a/sdk/ai/ai-agents/src/models/index.ts b/sdk/ai/ai-agents/src/models/index.ts index 65e699b701db..f79fc84ff8ff 100644 --- a/sdk/ai/ai-agents/src/models/index.ts +++ 
b/sdk/ai/ai-agents/src/models/index.ts @@ -39,6 +39,9 @@ export { DeepResearchDetails, DeepResearchBingGroundingConnection, MCPToolDefinition, + ComputerUseToolDefinition, + ComputerUseToolParameters, + ComputerUseEnvironment, AzureFunctionToolDefinition, AzureFunctionDefinition, AzureFunctionBinding, @@ -103,6 +106,22 @@ export { RequiredFunctionToolCall, RequiredFunctionToolCallDetails, RequiredMcpToolCall, + RequiredComputerUseToolCall, + RequiredComputerUseToolCallDetails, + ComputerUseAction, + ComputerUseActionUnion, + ClickAction, + MouseButton, + DoubleClickAction, + DragAction, + CoordinatePoint, + KeyPressAction, + MoveAction, + ScreenshotAction, + ScrollAction, + TypeAction, + WaitAction, + SafetyCheck, SubmitToolApprovalAction, SubmitToolApprovalDetails, RunError, @@ -130,7 +149,11 @@ export { MessageImageFileContent, MessageImageFileDetails, MessageDeletionStatus, + StructuredToolOutput, + StructuredToolOutputUnion, ToolOutput, + ComputerToolOutput, + ComputerScreenshot, ToolApproval, RunStep, RunStepType, @@ -159,6 +182,8 @@ export { BrowserAutomationToolCallDetails, BrowserAutomationToolCallStep, RunStepMcpToolCall, + RunStepComputerUseToolCall, + RunStepComputerUseToolCallDetails, RunStepSharepointToolCall, RunStepMicrosoftFabricToolCall, RunStepBingCustomSearchToolCall, @@ -251,6 +276,8 @@ export { RunStepDeltaAzureFunctionToolCall, RunStepDeltaDeepResearchToolCall, RunStepDeltaAzureAISearchToolCall, + RunStepDeltaComputerUseToolCall, + RunStepDeltaComputerUseDetails, RunStepDeltaMicrosoftFabricToolCall, RunStepDeltaSharepointToolCall, RunStepDeltaMCPObject, diff --git a/sdk/ai/ai-agents/src/models/models.ts b/sdk/ai/ai-agents/src/models/models.ts index 24e817a48f70..1eec4d6700aa 100644 --- a/sdk/ai/ai-agents/src/models/models.ts +++ b/sdk/ai/ai-agents/src/models/models.ts @@ -9,7 +9,7 @@ import type { FileContents } from "../static-helpers/multipartHelpers.js"; /** An abstract representation of an input tool definition that an agent can use. */ export interface ToolDefinition { /** The object type. 
*/ - /** The discriminator possible values: code_interpreter, file_search, function, bing_grounding, fabric_dataagent, sharepoint_grounding, azure_ai_search, openapi, bing_custom_search, connected_agent, deep_research, mcp, azure_function, browser_automation */ + /** The discriminator possible values: code_interpreter, file_search, function, bing_grounding, fabric_dataagent, sharepoint_grounding, azure_ai_search, openapi, bing_custom_search, connected_agent, deep_research, mcp, computer_use_preview, azure_function, browser_automation */ type: string; } @@ -37,6 +37,7 @@ export type ToolDefinitionUnion = | ConnectedAgentToolDefinition | DeepResearchToolDefinition | MCPToolDefinition + | ComputerUseToolDefinition | AzureFunctionToolDefinition | BrowserAutomationToolDefinition | ToolDefinition; @@ -79,6 +80,9 @@ export function toolDefinitionUnionSerializer(item: ToolDefinitionUnion): any { case "mcp": return mcpToolDefinitionSerializer(item as MCPToolDefinition); + case "computer_use_preview": + return computerUseToolDefinitionSerializer(item as ComputerUseToolDefinition); + case "azure_function": return azureFunctionToolDefinitionSerializer(item as AzureFunctionToolDefinition); @@ -128,6 +132,9 @@ export function toolDefinitionUnionDeserializer(item: any): ToolDefinitionUnion case "mcp": return mcpToolDefinitionDeserializer(item as MCPToolDefinition); + case "computer_use_preview": + return computerUseToolDefinitionDeserializer(item as ComputerUseToolDefinition); + case "azure_function": return azureFunctionToolDefinitionDeserializer(item as AzureFunctionToolDefinition); @@ -363,13 +370,30 @@ export function bingGroundingSearchConfigurationArrayDeserializer( export interface BingGroundingSearchConfiguration { /** Connection id for grounding with bing search */ connectionId: string; - /** The market where the results come from. */ + /** The market where the results come from. Typically, market is the country where the user is making the request from. However, it could be a different country if the user is not located in a country where Bing delivers results. The market must be in the form: `<language code>-<country/region code>` where `<language code>` is an ISO 639-1 language code (neutral culture) and `<country/region code>` is an ISO 3166 country/region (specific culture) code. For example, `en-US`. The string is case insensitive. For a list of possible market values, see [Market codes](https://learn.microsoft.com/bing/search-apis/bing-web-search/reference/market-codes). If known, you are encouraged to always specify the market. Specifying the market helps Bing route the request and return an appropriate and optimal response. If you specify a market that is not listed in Market codes, Bing uses a best fit market code based on an internal mapping that is subject to change. */ market?: string; - /** The language to use for user interface strings when calling Bing API. */ + /** + * The language to use for user interface strings. You may specify the language using either a 2-letter or 4-letter code. Using 4-letter codes is preferred. + * For a list of supported language codes, see [Bing supported languages](https://learn.microsoft.com/bing/search-apis/bing-web-search/reference/market-codes#bing-supported-language-codes). + * Bing loads the localized strings if this parameter contains a valid 2-letter neutral culture code (for example `fr`) or a valid 4-letter specific culture code (`fr-ca`). For example, for `fr-ca`, Bing loads the `fr` neutral culture code strings. 
+ * If the parameter is not valid (for example, `zh`) or Bing doesn’t support the language (for example, `af`, `af-na`), Bing defaults to `en` (English). + * To specify the 2-letter code, set this parameter to an ISO 639-1 language code. + * To specify the 4-letter code, use the form `<language code>-<country/region code>` where `<language code>` is an ISO 639-1 language code (neutral culture) and `<country/region code>` is an ISO 3166 country/region (specific culture) code. For example, use `en-US` for United States English. + * Although optional, you should always specify the language. Typically, you set this parameter to the same language specified by the market value unless the user wants the user interface strings displayed in a different language. + */ setLang?: string; - /** The number of search results to return in the bing api response */ + /** + * The number of search results to return in the response. The default is 5 and the maximum value is 50. The actual number delivered may be less than requested. + * - It is possible for multiple pages to include some overlap in results. + * - This parameter affects only web page results. It's possible that the AI model might not use all search results returned by Bing. + */ count?: number; - /** Filter search results by a specific time range. Accepted values: https://learn.microsoft.com/bing/search-apis/bing-web-search/reference/query-parameters */ + /** + * Filter search results by the following case-insensitive age values: + * - Day: Return webpages that Bing discovered within the last 24 hours. + * - Week: Return webpages that Bing discovered within the last 7 days. + * - Month: Return webpages that Bing discovered within the last 30 days. To get articles discovered by Bing during a specific timeframe, specify a date range in the form: `YYYY-MM-DD..YYYY-MM-DD`. For example, `freshness=2019-02-01..2019-05-30`. To limit the results to a single date, set this parameter to a specific date. For example, `freshness=2019-02-04`. + */ freshness?: string; } @@ -863,13 +887,30 @@ export interface BingCustomSearchConfiguration { connectionId: string; /** Name of the custom configuration instance given to config. */ instanceName: string; - /** The market where the results come from. */ + /** The market where the results come from. Typically, market is the country where the user is making the request from. However, it could be a different country if the user is not located in a country where Bing delivers results. The market must be in the form: `<language code>-<country/region code>` where `<language code>` is an ISO 639-1 language code (neutral culture) and `<country/region code>` is an ISO 3166 country/region (specific culture) code. For example, `en-US`. The string is case insensitive. For a list of possible market values, see [Market codes](https://learn.microsoft.com/bing/search-apis/bing-web-search/reference/market-codes). If known, you are encouraged to always specify the market. Specifying the market helps Bing route the request and return an appropriate and optimal response. If you specify a market that is not listed in Market codes, Bing uses a best fit market code based on an internal mapping that is subject to change. */ market?: string; - /** The language to use for user interface strings when calling Bing API. */ + /** + * The language to use for user interface strings. You may specify the language using either a 2-letter or 4-letter code. Using 4-letter codes is preferred. + * For a list of supported language codes, see [Bing supported languages](https://learn.microsoft.com/bing/search-apis/bing-web-search/reference/market-codes#bing-supported-language-codes). 
@@ -863,13 +887,30 @@ export interface BingCustomSearchConfiguration {
   connectionId: string;
   /** Name of the custom configuration instance given to config. */
   instanceName: string;
-  /** The market where the results come from. */
+  /** The market where the results come from. Typically, market is the country where the user is making the request from. However, it could be a different country if the user is not located in a country where Bing delivers results. The market must be in the form `<language>-<country/region>`, where `<language>` is an ISO 639-1 language code (neutral culture) and `<country/region>` is an ISO 3166 country/region (specific culture) code. For example, `en-US`. The string is case insensitive. For a list of possible market values, see [Market codes](https://learn.microsoft.com/bing/search-apis/bing-web-search/reference/market-codes). If known, you are encouraged to always specify the market. Specifying the market helps Bing route the request and return an appropriate and optimal response. If you specify a market that is not listed in Market codes, Bing uses a best-fit market code based on an internal mapping that is subject to change. */
   market?: string;
-  /** The language to use for user interface strings when calling Bing API. */
+  /**
+   * The language to use for user interface strings. You may specify the language using either a 2-letter or 4-letter code. Using 4-letter codes is preferred.
+   * For a list of supported language codes, see [Bing supported languages](https://learn.microsoft.com/bing/search-apis/bing-web-search/reference/market-codes#bing-supported-language-codes).
+   * Bing loads the localized strings if this parameter contains a valid 2-letter neutral culture code (for example `fr`) or a valid 4-letter specific culture code (for example `fr-ca`). For `fr-ca`, Bing loads the `fr` neutral culture code strings.
+   * If the parameter is not valid (for example, `zh`) or Bing doesn't support the language (for example, `af`, `af-na`), Bing defaults to `en` (English).
+   * To specify the 2-letter code, set this parameter to an ISO 639-1 language code.
+   * To specify the 4-letter code, use the form `<language>-<country/region>`, where `<language>` is an ISO 639-1 language code (neutral culture) and `<country/region>` is an ISO 3166 country/region (specific culture) code. For example, use `en-US` for United States English.
+   * Although optional, you should always specify the language. Typically, you set this parameter to the same language specified by the market value unless the user wants the user interface strings displayed in a different language.
+   */
   setLang?: string;
-  /** The number of search results to return in the bing api response */
+  /**
+   * The number of search results to return in the response. The default is 5 and the maximum value is 50. The actual number delivered may be less than requested.
+   * - It is possible for multiple pages to include some overlap in results.
+   * - This parameter affects only web page results. It's possible that the AI model might not use all search results returned by Bing.
+   */
   count?: number;
-  /** Filter search results by a specific time range. Accepted values: https://learn.microsoft.com/bing/search-apis/bing-web-search/reference/query-parameters */
+  /**
+   * Filter search results by the following case-insensitive age values:
+   * - Day: Return webpages that Bing discovered within the last 24 hours.
+   * - Week: Return webpages that Bing discovered within the last 7 days.
+   * - Month: Return webpages that Bing discovered within the last 30 days.
+   * To get articles discovered by Bing during a specific timeframe, specify a date range in the form `YYYY-MM-DD..YYYY-MM-DD`. For example, `freshness=2019-02-01..2019-05-30`. To limit the results to a single date, set this parameter to a specific date. For example, `freshness=2019-02-04`.
+   */
   freshness?: string;
 }
@@ -1067,6 +1108,69 @@ export function mcpToolDefinitionDeserializer(item: any): MCPToolDefinition {
   };
 }
 
+/** The input definition information for a Computer Use tool as used to configure an agent. */
+export interface ComputerUseToolDefinition extends ToolDefinition {
+  /** The object type, which is always 'computer_use_preview'. */
+  type: "computer_use_preview";
+  /** The computer use tool parameters. */
+  computerUsePreview: ComputerUseToolParameters;
+}
+
+export function computerUseToolDefinitionSerializer(
+  item: ComputerUseToolDefinition,
+): any {
+  return {
+    type: item["type"],
+    computer_use_preview: computerUseToolParametersSerializer(
+      item["computerUsePreview"],
+    ),
+  };
+}
+
+export function computerUseToolDefinitionDeserializer(
+  item: any,
+): ComputerUseToolDefinition {
+  return {
+    type: item["type"],
+    computerUsePreview: computerUseToolParametersDeserializer(
+      item["computer_use_preview"],
+    ),
+  };
+}
+
+/** The computer use tool parameters. */
+export interface ComputerUseToolParameters {
+  /** The display width for the computer use tool. */
+  displayWidth: number;
+  /** The display height for the computer use tool. */
+  displayHeight: number;
+  /** The environment for the computer use tool.
*/ + environment: ComputerUseEnvironment; +} + +export function computerUseToolParametersSerializer( + item: ComputerUseToolParameters, +): any { + return { + display_width: item["displayWidth"], + display_height: item["displayHeight"], + environment: item["environment"], + }; +} + +export function computerUseToolParametersDeserializer( + item: any, +): ComputerUseToolParameters { + return { + displayWidth: item["display_width"], + displayHeight: item["display_height"], + environment: item["environment"], + }; +} + +/** The environment types supported by the computer use tool. */ +export type ComputerUseEnvironment = "windows" | "mac" | "linux" | "browser"; + /** The input definition information for a azure function tool as used to configure an agent. */ export interface AzureFunctionToolDefinition extends ToolDefinition { /** The object type, which is always 'azure_function'. */ @@ -2214,7 +2318,8 @@ export type AgentsNamedToolChoiceType = | "bing_custom_search" | "connected_agent" | "deep_research" - | "mcp"; + | "mcp" + | "computer_use_preview"; /** The function name that will be used, if using the `function` tool */ export interface FunctionName { @@ -2436,7 +2541,7 @@ export function requiredToolCallUnionArrayDeserializer( /** An abstract representation of a tool invocation needed by the model to continue a run. */ export interface RequiredToolCall { /** The object type for the required tool call. */ - /** The discriminator possible values: function, mcp */ + /** The discriminator possible values: function, mcp, computer_use_preview */ type: string; /** The ID of the tool call. This ID must be referenced when submitting tool outputs. */ id: string; @@ -2453,6 +2558,7 @@ export function requiredToolCallDeserializer(item: any): RequiredToolCall { export type RequiredToolCallUnion = | RequiredFunctionToolCall | RequiredMcpToolCall + | RequiredComputerUseToolCall | RequiredToolCall; export function requiredToolCallUnionDeserializer(item: any): RequiredToolCallUnion { @@ -2463,6 +2569,9 @@ export function requiredToolCallUnionDeserializer(item: any): RequiredToolCallUn case "mcp": return requiredMcpToolCallDeserializer(item as RequiredMcpToolCall); + case "computer_use_preview": + return requiredComputerUseToolCallDeserializer(item as RequiredComputerUseToolCall); + default: return requiredToolCallDeserializer(item); } @@ -2523,6 +2632,321 @@ export function requiredMcpToolCallDeserializer(item: any): RequiredMcpToolCall }; } +/** A representation of a requested call to a Computer Use tool, needed by the model to continue evaluation of a run. */ +export interface RequiredComputerUseToolCall extends RequiredToolCall { + /** The object type of the required tool call. Always 'computer_use_preview' for Computer Use tools. */ + type: "computer_use_preview"; + /** Detailed information about the computer use action to be executed. */ + computerUsePreview: RequiredComputerUseToolCallDetails; +} + +export function requiredComputerUseToolCallDeserializer( + item: any, +): RequiredComputerUseToolCall { + return { + type: item["type"], + id: item["id"], + computerUsePreview: requiredComputerUseToolCallDetailsDeserializer( + item["computer_use_preview"], + ), + }; +} + +/** The detailed information for a computer use tool invocation. */ +export interface RequiredComputerUseToolCallDetails { + /** The action to be performed by the computer use tool. */ + action: ComputerUseActionUnion; + /** Safety checks that are pending acknowledgment by the developer. 
*/ + pendingSafetyChecks: SafetyCheck[]; +} + +export function requiredComputerUseToolCallDetailsDeserializer( + item: any, +): RequiredComputerUseToolCallDetails { + return { + action: computerUseActionUnionDeserializer(item["action"]), + pendingSafetyChecks: safetyCheckArrayDeserializer( + item["pending_safety_checks"], + ), + }; +} + +/** An abstract representation of a computer use action. */ +export interface ComputerUseAction { + /** The type of computer use action. */ + /** The discriminator possible values: click, double_click, drag, keypress, move, screenshot, scroll, type, wait */ + type: string; +} + +export function computerUseActionDeserializer(item: any): ComputerUseAction { + return { + type: item["type"], + }; +} + +/** Alias for ComputerUseActionUnion */ +export type ComputerUseActionUnion = + | ClickAction + | DoubleClickAction + | DragAction + | KeyPressAction + | MoveAction + | ScreenshotAction + | ScrollAction + | TypeAction + | WaitAction + | ComputerUseAction; + +export function computerUseActionUnionDeserializer( + item: any, +): ComputerUseActionUnion { + switch (item.type) { + case "click": + return clickActionDeserializer(item as ClickAction); + + case "double_click": + return doubleClickActionDeserializer(item as DoubleClickAction); + + case "drag": + return dragActionDeserializer(item as DragAction); + + case "keypress": + return keyPressActionDeserializer(item as KeyPressAction); + + case "move": + return moveActionDeserializer(item as MoveAction); + + case "screenshot": + return screenshotActionDeserializer(item as ScreenshotAction); + + case "scroll": + return scrollActionDeserializer(item as ScrollAction); + + case "type": + return typeActionDeserializer(item as TypeAction); + + case "wait": + return waitActionDeserializer(item as WaitAction); + + default: + return computerUseActionDeserializer(item); + } +} + +/** A click action. */ +export interface ClickAction extends ComputerUseAction { + /** Specifies the event type. For a click action, this property is always set to click. */ + type: "click"; + /** The x-coordinate where the click occurred. */ + x: number; + /** The y-coordinate where the click occurred. */ + y: number; + /** Indicates which mouse button was pressed during the click. */ + button: MouseButton; +} + +export function clickActionDeserializer(item: any): ClickAction { + return { + type: item["type"], + x: item["x"], + y: item["y"], + button: item["button"], + }; +} + +/** The mouse button types supported by click actions. */ +export type MouseButton = "left" | "right" | "wheel" | "back" | "forward"; + +/** A double click action. */ +export interface DoubleClickAction extends ComputerUseAction { + /** Specifies the event type. For a double click action, this property is always set to double_click. */ + type: "double_click"; + /** The x-coordinate where the double click occurred. */ + x: number; + /** The y-coordinate where the double click occurred. */ + y: number; +} + +export function doubleClickActionDeserializer(item: any): DoubleClickAction { + return { + type: item["type"], + x: item["x"], + y: item["y"], + }; +} + +/** A drag action. */ +export interface DragAction extends ComputerUseAction { + /** Specifies the event type. For a drag action, this property is always set to drag. */ + type: "drag"; + /** An array of coordinates representing the path of the drag action. 
*/
+  path: CoordinatePoint[];
+}
+
+export function dragActionDeserializer(item: any): DragAction {
+  return {
+    type: item["type"],
+    path: coordinatePointArrayDeserializer(item["path"]),
+  };
+}
+
+export function coordinatePointArrayDeserializer(
+  result: Array<CoordinatePoint>,
+): any[] {
+  return result.map((item) => {
+    return coordinatePointDeserializer(item);
+  });
+}
+
+/** A coordinate point with x and y values. */
+export interface CoordinatePoint {
+  /** The x-coordinate. */
+  x: number;
+  /** The y-coordinate. */
+  y: number;
+}
+
+export function coordinatePointDeserializer(item: any): CoordinatePoint {
+  return {
+    x: item["x"],
+    y: item["y"],
+  };
+}
+
+/** A collection of keypresses the model would like to perform. */
+export interface KeyPressAction extends ComputerUseAction {
+  /** Specifies the event type. For a keypress action, this property is always set to keypress. */
+  type: "keypress";
+  /** The combination of keys the model is requesting to be pressed. This is an array of strings, each representing a key. */
+  keys: string[];
+}
+
+export function keyPressActionDeserializer(item: any): KeyPressAction {
+  return {
+    type: item["type"],
+    keys: item["keys"].map((p: any) => {
+      return p;
+    }),
+  };
+}
+
+/** A mouse move action. */
+export interface MoveAction extends ComputerUseAction {
+  /** Specifies the event type. For a move action, this property is always set to move. */
+  type: "move";
+  /** The x-coordinate to move to. */
+  x: number;
+  /** The y-coordinate to move to. */
+  y: number;
+}
+
+export function moveActionDeserializer(item: any): MoveAction {
+  return {
+    type: item["type"],
+    x: item["x"],
+    y: item["y"],
+  };
+}
+
+/** A screenshot action. */
+export interface ScreenshotAction extends ComputerUseAction {
+  /** Specifies the event type. For a screenshot action, this property is always set to screenshot. */
+  type: "screenshot";
+}
+
+export function screenshotActionDeserializer(item: any): ScreenshotAction {
+  return {
+    type: item["type"],
+  };
+}
+
+/** A scroll action. */
+export interface ScrollAction extends ComputerUseAction {
+  /** Specifies the event type. For a scroll action, this property is always set to scroll. */
+  type: "scroll";
+  /** The x-coordinate where the scroll occurred. */
+  x: number;
+  /** The y-coordinate where the scroll occurred. */
+  y: number;
+  /** The horizontal scroll distance. */
+  scrollX: number;
+  /** The vertical scroll distance. */
+  scrollY: number;
+}
+
+export function scrollActionDeserializer(item: any): ScrollAction {
+  return {
+    type: item["type"],
+    x: item["x"],
+    y: item["y"],
+    scrollX: item["scroll_x"],
+    scrollY: item["scroll_y"],
+  };
+}
+
+/** An action to type in text. */
+export interface TypeAction extends ComputerUseAction {
+  /** Specifies the event type. For a type action, this property is always set to type. */
+  type: "type";
+  /** The text to type. */
+  text: string;
+}
+
+export function typeActionDeserializer(item: any): TypeAction {
+  return {
+    type: item["type"],
+    text: item["text"],
+  };
+}
+
+/** A wait action. */
+export interface WaitAction extends ComputerUseAction {
+  /** Specifies the event type. For a wait action, this property is always set to wait. */
+  type: "wait";
+}
+
+export function waitActionDeserializer(item: any): WaitAction {
+  return {
+    type: item["type"],
+  };
+}
+
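Each action shape above carries one `type` discriminator. Because `ComputerUseActionUnion` also includes the base `ComputerUseAction` (whose `type` is plain `string`), TypeScript cannot narrow the union automatically, so a handler needs explicit casts per case. A minimal sketch of a dispatcher; the logging bodies are stand-ins for real browser or OS automation:

```ts
import type {
  ClickAction,
  ComputerUseActionUnion,
  ScrollAction,
  TypeAction,
} from "@azure/ai-agents";

// Sketch only: real handlers would drive a browser or OS automation layer.
async function executeAction(action: ComputerUseActionUnion): Promise<void> {
  switch (action.type) {
    case "click": {
      // Explicit cast: the base ComputerUseAction in the union blocks narrowing.
      const { x, y, button } = action as ClickAction;
      console.log(`click ${button} at (${x}, ${y})`);
      break;
    }
    case "scroll": {
      const { x, y, scrollX, scrollY } = action as ScrollAction;
      console.log(`scroll by (${scrollX}, ${scrollY}) from (${x}, ${y})`);
      break;
    }
    case "type": {
      const { text } = action as TypeAction;
      console.log(`type: ${text}`);
      break;
    }
    case "wait":
      await new Promise((resolve) => setTimeout(resolve, 1000));
      break;
    default:
      console.log(`unhandled action type: ${action.type}`);
  }
}
```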
+export function safetyCheckArraySerializer(result: Array<SafetyCheck>): any[] {
+  return result.map((item) => {
+    return safetyCheckSerializer(item);
+  });
+}
+
+export function safetyCheckArrayDeserializer(
+  result: Array<SafetyCheck>,
+): any[] {
+  return result.map((item) => {
+    return safetyCheckDeserializer(item);
+  });
+}
+
+/** A safety check that must be acknowledged by the developer before the computer use tool can proceed. */
+export interface SafetyCheck {
+  /** The ID of the safety check. */
+  id: string;
+  /** The type of the safety check. */
+  code?: string;
+  /** Details about the safety check. */
+  message?: string;
+}
+
+export function safetyCheckSerializer(item: SafetyCheck): any {
+  return { id: item["id"], code: item["code"], message: item["message"] };
+}
+
+export function safetyCheckDeserializer(item: any): SafetyCheck {
+  return {
+    id: item["id"],
+    code: item["code"],
+    message: item["message"],
+  };
+}
+
 /** The details for required tool call approval that must be submitted for an agent thread run to continue. */
 export interface SubmitToolApprovalAction extends RequiredAction {
   /** The object type, which is always 'submit_tool_approval'. */
@@ -3085,15 +3509,102 @@ export function threadRunArrayDeserializer(result: Array<ThreadRun>): any[] {
 }
 
 /** The data provided during a tool outputs submission to resolve pending tool calls and allow the model to continue. */
-export interface ToolOutput {
+export interface StructuredToolOutput {
+  /** The object type for the tool output. Defaults to `function_call_output` if not provided. */
+  /** The discriminator possible values: function_call_output, computer_call_output */
+  type: string;
   /** The ID of the tool call being resolved, as provided in the tool calls of a required action from a run. */
   toolCallId?: string;
-  /** The output from the tool to be submitted. */
+}
+
+export function structuredToolOutputSerializer(
+  item: StructuredToolOutput,
+): any {
+  return { type: item["type"], tool_call_id: item["toolCallId"] };
+}
+
+/** Alias for StructuredToolOutputUnion */
+export type StructuredToolOutputUnion =
+  | ToolOutput
+  | ComputerToolOutput
+  | StructuredToolOutput;
+
+export function structuredToolOutputUnionSerializer(
+  item: StructuredToolOutputUnion,
+): any {
+  switch (item.type) {
+    case "function_call_output":
+      return toolOutputSerializer(item as ToolOutput);
+
+    case "computer_call_output":
+      return computerToolOutputSerializer(item as ComputerToolOutput);
+
+    default:
+      return structuredToolOutputSerializer(item);
+  }
+}
+
+/** The output from a function tool to be submitted. */
+export interface ToolOutput extends StructuredToolOutput {
+  /** The object type, which is always 'function_call_output'. */
+  type: "function_call_output";
+  /** The output from the function tool to be submitted. */
   output?: string;
 }
 
 export function toolOutputSerializer(item: ToolOutput): any {
-  return { tool_call_id: item["toolCallId"], output: item["output"] };
+  return {
+    type: item["type"],
+    tool_call_id: item["toolCallId"],
+    output: item["output"],
+  };
+}
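With the discriminator added, a plain function result is now a `ToolOutput` with an explicit `type`. A small sketch; the tool call ID is a placeholder taken from a run's required action:

```ts
import type { ToolOutput } from "@azure/ai-agents";

const functionResult: ToolOutput = {
  type: "function_call_output",
  toolCallId: "<required-tool-call-id>", // from the run's required action
  output: JSON.stringify({ temperature: 21, unit: "C" }),
};
```

Passed through `structuredToolOutputUnionSerializer`, this routes to `toolOutputSerializer` above and produces the `{ type, tool_call_id, output }` wire shape.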
+/** The output from a computer use tool to be submitted. */
+export interface ComputerToolOutput extends StructuredToolOutput {
+  /** The object type, which is always 'computer_call_output'. */
+  type: "computer_call_output";
+  /** The output from the computer use tool. */
+  output: ComputerScreenshot;
+  /** Safety checks that have been acknowledged by the developer. */
+  acknowledgedSafetyChecks?: SafetyCheck[];
+}
+
+export function computerToolOutputSerializer(item: ComputerToolOutput): any {
+  return {
+    type: item["type"],
+    tool_call_id: item["toolCallId"],
+    output: computerScreenshotSerializer(item["output"]),
+    acknowledged_safety_checks: !item["acknowledgedSafetyChecks"]
+      ? item["acknowledgedSafetyChecks"]
+      : safetyCheckArraySerializer(item["acknowledgedSafetyChecks"]),
+  };
+}
+
+/** The output from a computer use tool representing a screenshot. */
+export interface ComputerScreenshot {
+  /** Specifies the event type. For a computer screenshot, this property is always set to computer_screenshot. */
+  type: "computer_screenshot";
+  /** The identifier of an uploaded file that contains the screenshot. */
+  fileId?: string;
+  /** The URL of the screenshot image. */
+  imageUrl?: string;
+}
+
+export function computerScreenshotSerializer(item: ComputerScreenshot): any {
+  return {
+    type: item["type"],
+    file_id: item["fileId"],
+    image_url: item["imageUrl"],
+  };
+}
+
+export function computerScreenshotDeserializer(item: any): ComputerScreenshot {
+  return {
+    type: item["type"],
+    fileId: item["file_id"],
+    imageUrl: item["image_url"],
+  };
+}
 
 /** The data provided during a tool outputs submission to resolve pending tool calls and allow the model to continue. */
@@ -3114,9 +3625,11 @@ export function toolApprovalSerializer(item: ToolApproval): any {
   };
 }
 
-export function toolOutputArraySerializer(result: Array<ToolOutput>): any[] {
+export function structuredToolOutputUnionArraySerializer(
+  result: Array<StructuredToolOutputUnion>,
+): any[] {
   return result.map((item) => {
-    return toolOutputSerializer(item);
+    return structuredToolOutputUnionSerializer(item);
   });
 }
 
@@ -3285,7 +3798,7 @@ export function runStepToolCallUnionArrayDeserializer(result: Array
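End to end, a computer use turn deserializes into a `RequiredComputerUseToolCall`, the developer performs the requested action (see the dispatcher sketch earlier), and a screenshot goes back as a `ComputerToolOutput`. A hedged sketch of that reply path; `executeAction` and `captureScreenshotUrl` are hypothetical environment-specific helpers, not part of this package:

```ts
import type {
  ComputerToolOutput,
  ComputerUseActionUnion,
  RequiredComputerUseToolCall,
} from "@azure/ai-agents";

// Hypothetical helpers: action execution and screen capture are
// environment-specific and not provided by the SDK.
declare function executeAction(action: ComputerUseActionUnion): Promise<void>;
declare function captureScreenshotUrl(): Promise<string>;

async function replyToComputerUseCall(
  call: RequiredComputerUseToolCall,
): Promise<ComputerToolOutput> {
  const details = call.computerUsePreview;
  await executeAction(details.action);
  return {
    type: "computer_call_output",
    toolCallId: call.id,
    output: {
      type: "computer_screenshot",
      imageUrl: await captureScreenshotUrl(), // or fileId for an uploaded screenshot
    },
    // Acknowledging safety checks is a deliberate developer decision; echo
    // back only the checks you have actually reviewed.
    acknowledgedSafetyChecks: details.pendingSafetyChecks,
  };
}
```

The resulting value can be passed to the runs operation group's `submitToolOutputs(threadId, runId, [output])`, which this change widens to accept `StructuredToolOutputUnion[]`; the array is serialized through `structuredToolOutputUnionArraySerializer` above.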