Image, file output types for functions (openai#1898)

rm-openai · web-flow · commit 680554cc92ef · 2025-10-16T13:11:02.000-04:00
Function tools can now return image and file outputs. A tool may return any of the three new types (`ToolOutputText`, `ToolOutputImage`, `ToolOutputFileContent`), a list of those types, or the equivalent TypedDict version. When you return one of these, we convert it to the correct tool call output type. Resolves openai#1850
diff --git a/docs/tools.md b/docs/tools.md
@@ -173,6 +173,14 @@ for tool in agent.tools:
     }
     ```
 
+### Returning images or files from function tools
+
+In addition to returning text outputs, you can return one or more images or files as the output of a function tool. To do so, you can return any of:
+
+-   Images: [`ToolOutputImage`][agents.tool.ToolOutputImage] (or the TypedDict version, [`ToolOutputImageDict`][agents.tool.ToolOutputImageDict])
+-   Files: [`ToolOutputFileContent`][agents.tool.ToolOutputFileContent] (or the TypedDict version, [`ToolOutputFileContentDict`][agents.tool.ToolOutputFileContentDict])
+-   Text: either a string or any object that can be converted with `str()`, or [`ToolOutputText`][agents.tool.ToolOutputText] (or the TypedDict version, [`ToolOutputTextDict`][agents.tool.ToolOutputTextDict])
+
 ### Custom function tools
 
 Sometimes, you don't want to use a Python function as a tool. You can directly create a [`FunctionTool`][agents.tool.FunctionTool] if you prefer. You'll need to provide:
@@ -288,9 +296,9 @@ async def run_my_agent() -> str:
 
 In certain cases, you might want to modify the output of the tool-agents before returning it to the central agent. This may be useful if you want to:
 
-- Extract a specific piece of information (e.g., a JSON payload) from the sub-agent's chat history.
-- Convert or reformat the agent’s final answer (e.g., transform Markdown into plain text or CSV).
-- Validate the output or provide a fallback value when the agent’s response is missing or malformed.
+-   Extract a specific piece of information (e.g., a JSON payload) from the sub-agent's chat history.
+-   Convert or reformat the agent’s final answer (e.g., transform Markdown into plain text or CSV).
+-   Validate the output or provide a fallback value when the agent’s response is missing or malformed.
 
 You can do this by supplying the `custom_output_extractor` argument to the `as_tool` method:
 
@@ -370,16 +378,16 @@ asyncio.run(main())
 
 The `is_enabled` parameter accepts:
 
-- **Boolean values**: `True` (always enabled) or `False` (always disabled)
-- **Callable functions**: Functions that take `(context, agent)` and return a boolean
-- **Async functions**: Async functions for complex conditional logic
+-   **Boolean values**: `True` (always enabled) or `False` (always disabled)
+-   **Callable functions**: Functions that take `(context, agent)` and return a boolean
+-   **Async functions**: Async functions for complex conditional logic
 
 Disabled tools are completely hidden from the LLM at runtime, making this useful for:
 
-- Feature gating based on user permissions
-- Environment-specific tool availability (dev vs prod)
-- A/B testing different tool configurations
-- Dynamic tool filtering based on runtime state
+-   Feature gating based on user permissions
+-   Environment-specific tool availability (dev vs prod)
+-   A/B testing different tool configurations
+-   Dynamic tool filtering based on runtime state
 
 ## Handling errors in function tools
 
diff --git a/examples/basic/dynamic_system_prompt.py b/examples/basic/dynamic_system_prompt.py
@@ -28,6 +28,7 @@ def custom_instructions(
     instructions=custom_instructions,
 )
 
+
 async def main():
     context = CustomContext(style=random.choice(["haiku", "pirate", "robot"]))
     print(f"Using style: {context.style}\n")
diff --git a/examples/basic/image_tool_output.py b/examples/basic/image_tool_output.py
@@ -0,0 +1,43 @@
+import asyncio
+
+from agents import Agent, Runner, ToolOutputImage, ToolOutputImageDict, function_tool
+
+return_typed_dict = True
+
+
+@function_tool
+def fetch_random_image() -> ToolOutputImage | ToolOutputImageDict:
+    """Fetch a random image."""
+
+    print("Image tool called")
+    if return_typed_dict:
+        return {
+            "type": "image",
+            "image_url": "https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg",
+            "detail": "auto",
+        }
+
+    return ToolOutputImage(
+        image_url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg",
+        detail="auto",
+    )
+
+
+async def main():
+    agent = Agent(
+        name="Assistant",
+        instructions="You are a helpful assistant.",
+        tools=[fetch_random_image],
+    )
+
+    result = await Runner.run(
+        agent,
+        input="Fetch an image using the random_image tool, then describe it",
+    )
+    print(result.final_output)
+    """The image shows the iconic Golden Gate Bridge, a large suspension bridge painted in a
+    bright reddish-orange color..."""
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/examples/basic/tools.py b/examples/basic/tools.py
@@ -18,6 +18,7 @@ def get_weather(city: Annotated[str, "The city to get the weather for"]) -> Weat
     print("[debug] get_weather called")
     return Weather(city=city, temperature_range="14-20C", conditions="Sunny with wind.")
 
+
 agent = Agent(
     name="Hello world",
     instructions="You are a helpful agent.",
diff --git a/src/agents/__init__.py b/src/agents/__init__.py
@@ -81,6 +81,12 @@
     MCPToolApprovalFunctionResult,
     MCPToolApprovalRequest,
     Tool,
+    ToolOutputFileContent,
+    ToolOutputFileContentDict,
+    ToolOutputImage,
+    ToolOutputImageDict,
+    ToolOutputText,
+    ToolOutputTextDict,
     WebSearchTool,
     default_tool_error_function,
     function_tool,
@@ -273,6 +279,12 @@ def enable_verbose_stdout_logging():
     "MCPToolApprovalFunction",
     "MCPToolApprovalRequest",
     "MCPToolApprovalFunctionResult",
+    "ToolOutputText",
+    "ToolOutputTextDict",
+    "ToolOutputImage",
+    "ToolOutputImageDict",
+    "ToolOutputFileContent",
+    "ToolOutputFileContentDict",
     "function_tool",
     "Usage",
     "add_trace_processor",
diff --git a/src/agents/_run_impl.py b/src/agents/_run_impl.py
@@ -832,7 +832,7 @@ async def run_single_tool(
                 output=result,
                 run_item=ToolCallOutputItem(
                     output=result,
-                    raw_item=ItemHelpers.tool_call_output_item(tool_run.tool_call, str(result)),
+                    raw_item=ItemHelpers.tool_call_output_item(tool_run.tool_call, result),
                     agent=agent,
                 ),
             )
diff --git a/src/agents/extensions/memory/__init__.py b/src/agents/extensions/memory/__init__.py
@@ -58,8 +58,6 @@ def __getattr__(name: str) -> Any:
 
             return AdvancedSQLiteSession
         except ModuleNotFoundError as e:
-            raise ImportError(
-                f"Failed to import AdvancedSQLiteSession: {e}"
-            ) from e
+            raise ImportError(f"Failed to import AdvancedSQLiteSession: {e}") from e
 
     raise AttributeError(f"module {__name__} has no attribute {name}")
diff --git a/src/agents/items.py b/src/agents/items.py
@@ -21,6 +21,12 @@
 from openai.types.responses.response_code_interpreter_tool_call import (
     ResponseCodeInterpreterToolCall,
 )
+from openai.types.responses.response_function_call_output_item_list_param import (
+    ResponseFunctionCallOutputItemListParam,
+    ResponseFunctionCallOutputItemParam,
+)
+from openai.types.responses.response_input_file_content_param import ResponseInputFileContentParam
+from openai.types.responses.response_input_image_content_param import ResponseInputImageContentParam
 from openai.types.responses.response_input_item_param import (
     ComputerCallOutput,
     FunctionCallOutput,
@@ -36,9 +42,17 @@
 )
 from openai.types.responses.response_reasoning_item import ResponseReasoningItem
 from pydantic import BaseModel
-from typing_extensions import TypeAlias
+from typing_extensions import TypeAlias, assert_never
 
 from .exceptions import AgentsException, ModelBehaviorError
+from .logger import logger
+from .tool import (
+    ToolOutputFileContent,
+    ToolOutputImage,
+    ToolOutputText,
+    ValidToolOutputPydanticModels,
+    ValidToolOutputPydanticModelsTypeAdapter,
+)
 from .usage import Usage
 
 if TYPE_CHECKING:
@@ -298,11 +312,93 @@ def text_message_output(cls, message: MessageOutputItem) -> str:
 
     @classmethod
     def tool_call_output_item(
-        cls, tool_call: ResponseFunctionToolCall, output: str
+        cls, tool_call: ResponseFunctionToolCall, output: Any
     ) -> FunctionCallOutput:
-        """Creates a tool call output item from a tool call and its output."""
+        """Creates a tool call output item from a tool call and its output.
+
+        Accepts either plain values (stringified) or structured text/image/file
+        outputs, which are converted to input_text/input_image/input_file items.
+        Structured outputs may be Pydantic models or dicts, or a list/tuple of them.
+        """
+
+        converted_output = cls._convert_tool_output(output)
+
         return {
             "call_id": tool_call.call_id,
-            "output": output,
+            "output": converted_output,
             "type": "function_call_output",
         }
+
+    @classmethod
+    def _convert_tool_output(cls, output: Any) -> str | ResponseFunctionCallOutputItemListParam:
+        """Converts a tool return value into an output acceptable by the Responses API."""
+
+        # If the output is either a single or list of the known structured output types, convert to
+        # ResponseFunctionCallOutputItemListParam. Else, just stringify.
+        if isinstance(output, (list, tuple)):
+            maybe_converted_output_list = [
+                cls._maybe_get_output_as_structured_function_output(item) for item in output
+            ]
+            if all(maybe_converted_output_list):
+                return [
+                    cls._convert_single_tool_output_pydantic_model(item)
+                    for item in maybe_converted_output_list
+                    if item is not None
+                ]
+            else:
+                return str(output)
+        else:
+            maybe_converted_output = cls._maybe_get_output_as_structured_function_output(output)
+            if maybe_converted_output:
+                return [cls._convert_single_tool_output_pydantic_model(maybe_converted_output)]
+            else:
+                return str(output)
+
+    @classmethod
+    def _maybe_get_output_as_structured_function_output(
+        cls, output: Any
+    ) -> ValidToolOutputPydanticModels | None:
+        if isinstance(output, (ToolOutputText, ToolOutputImage, ToolOutputFileContent)):
+            return output
+        elif isinstance(output, dict):
+            try:
+                return ValidToolOutputPydanticModelsTypeAdapter.validate_python(output)
+            except pydantic.ValidationError:
+                logger.debug("dict was not a valid tool output pydantic model")
+                return None
+
+        return None
+
+    @classmethod
+    def _convert_single_tool_output_pydantic_model(
+        cls, output: ValidToolOutputPydanticModels
+    ) -> ResponseFunctionCallOutputItemParam:
+        if isinstance(output, ToolOutputText):
+            return {"type": "input_text", "text": output.text}
+        elif isinstance(output, ToolOutputImage):
+            # Forward all provided optional fields so the Responses API receives
+            # the correct identifiers and settings for the image resource.
+            result: ResponseInputImageContentParam = {"type": "input_image"}
+            if output.image_url is not None:
+                result["image_url"] = output.image_url
+            if output.file_id is not None:
+                result["file_id"] = output.file_id
+            if output.detail is not None:
+                result["detail"] = output.detail
+            return result
+        elif isinstance(output, ToolOutputFileContent):
+            # Forward all provided optional fields so the Responses API receives
+            # the correct identifiers and metadata for the file resource.
+            result_file: ResponseInputFileContentParam = {"type": "input_file"}
+            if output.file_data is not None:
+                result_file["file_data"] = output.file_data
+            if output.file_url is not None:
+                result_file["file_url"] = output.file_url
+            if output.file_id is not None:
+                result_file["file_id"] = output.file_id
+            if output.filename is not None:
+                result_file["filename"] = output.filename
+            return result_file
+        else:
+            assert_never(output)
+            raise ValueError(f"Unexpected tool output type: {output}")
diff --git a/src/agents/tool.py b/src/agents/tool.py
@@ -15,14 +15,13 @@
 from openai.types.responses.tool_param import CodeInterpreter, ImageGeneration, Mcp
 from openai.types.responses.web_search_tool import Filters as WebSearchToolFilters
 from openai.types.responses.web_search_tool_param import UserLocation
-from pydantic import ValidationError
+from pydantic import BaseModel, TypeAdapter, ValidationError
 from typing_extensions import Concatenate, NotRequired, ParamSpec, TypedDict
 
 from . import _debug
 from .computer import AsyncComputer, Computer
 from .exceptions import ModelBehaviorError
 from .function_schema import DocstringStyle, function_schema
-from .items import RunItem
 from .logger import logger
 from .run_context import RunContextWrapper
 from .strict_schema import ensure_strict_json_schema
@@ -34,6 +33,8 @@
 
 if TYPE_CHECKING:
     from .agent import Agent, AgentBase
+    from .items import RunItem
+
 
 ToolParams = ParamSpec("ToolParams")
 
@@ -48,6 +49,72 @@
 ]
 
 
+class ToolOutputText(BaseModel):
+    """Represents a tool output that should be sent to the model as text."""
+
+    type: Literal["text"] = "text"
+    text: str
+
+
+class ToolOutputTextDict(TypedDict, total=False):
+    """TypedDict variant for text tool outputs."""
+
+    type: Literal["text"]
+    text: str
+
+
+class ToolOutputImage(BaseModel):
+    """Represents a tool output that should be sent to the model as an image.
+
+    You can provide either an `image_url` (URL or data URL) or a `file_id` for previously uploaded
+    content. The optional `detail` can control vision detail.
+    """
+
+    type: Literal["image"] = "image"
+    image_url: str | None = None
+    file_id: str | None = None
+    detail: Literal["low", "high", "auto"] | None = None
+
+
+class ToolOutputImageDict(TypedDict, total=False):
+    """TypedDict variant for image tool outputs."""
+
+    type: Literal["image"]
+    image_url: NotRequired[str]
+    file_id: NotRequired[str]
+    detail: NotRequired[Literal["low", "high", "auto"]]
+
+
+class ToolOutputFileContent(BaseModel):
+    """Represents a tool output that should be sent to the model as a file.
+
+    Provide one of `file_data` (base64), `file_url`, or `file_id`. You may also
+    provide an optional `filename` when using `file_data` to hint file name.
+    """
+
+    type: Literal["file"] = "file"
+    file_data: str | None = None
+    file_url: str | None = None
+    file_id: str | None = None
+    filename: str | None = None
+
+
+class ToolOutputFileContentDict(TypedDict, total=False):
+    """TypedDict variant for file content tool outputs."""
+
+    type: Literal["file"]
+    file_data: NotRequired[str]
+    file_url: NotRequired[str]
+    file_id: NotRequired[str]
+    filename: NotRequired[str]
+
+
+ValidToolOutputPydanticModels = Union[ToolOutputText, ToolOutputImage, ToolOutputFileContent]
+ValidToolOutputPydanticModelsTypeAdapter: TypeAdapter[ValidToolOutputPydanticModels] = TypeAdapter(
+    ValidToolOutputPydanticModels
+)
+
+
 @dataclass
 class FunctionToolResult:
     tool: FunctionTool
@@ -81,7 +148,9 @@ class FunctionTool:
     1. The tool run context.
     2. The arguments from the LLM, as a JSON string.
 
-    You must return a string representation of the tool output, or something we can call `str()` on.
+    You must return one of the structured tool output types (e.g. ToolOutputText, ToolOutputImage,
+    ToolOutputFileContent), a list of them, a string representation of the tool output,
+    or something we can call `str()` on.
     In case of errors, you can either raise an Exception (which will cause the run to fail) or
     return a string error message (which will be sent back to the LLM).
     """
diff --git a/tests/mcp/test_message_handler.py b/tests/mcp/test_message_handler.py
diff --git a/tests/test_tool_output_conversion.py b/tests/test_tool_output_conversion.py
diff --git a/tests/voice/test_workflow.py b/tests/voice/test_workflow.py

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,7 @@ def custom_instructions(`
`28`	`28`	`instructions=custom_instructions,`
`29`	`29`	`)`
`30`	`30`
	`31`	`+`
`31`	`32`	`async def main():`
`32`	`33`	`context = CustomContext(style=random.choice(["haiku", "pirate", "robot"]))`
`33`	`34`	`print(f"Using style: {context.style}\n")`
Original file line number	Diff line number	Diff line change
`@@ -832,7 +832,7 @@ async def run_single_tool(`
`832`	`832`	`output=result,`
`833`	`833`	`run_item=ToolCallOutputItem(`
`834`	`834`	`output=result,`
`835`		`- raw_item=ItemHelpers.tool_call_output_item(tool_run.tool_call, str(result)),`
	`835`	`+ raw_item=ItemHelpers.tool_call_output_item(tool_run.tool_call, result),`
`836`	`836`	`agent=agent,`
`837`	`837`	`),`
`838`	`838`	`)`