Python: Add file handling support to BinaryContent for OpenAI Responses API (#12258)

ltwlf · moonbox3 · web-flow · commit 0e7556e112f1 · 2025-06-09T21:15:39.000Z
## Summary

Enhances `BinaryContent` to support file handling for OpenAI Responses
API, enabling file uploads through the responses agent while maintaining
a provider-agnostic design.

  ## Changes

  ### BinaryContent Enhancements
- **Add `can_read` property**: Indicates whether content has readable
data available
- **Add `from_file()` class method**: Creates BinaryContent instances
from file paths with automatic base64 encoding
- **Fix Unicode handling**: Prevents decode errors when processing
binary files (PDFs, images, etc.)

  ### OpenAI Responses Agent Integration
- **Add BinaryContent support**: Pattern matching case for file handling
in `responses_agent_thread_actions.py`
- **Correct OpenAI API format**: Uses proper `filename` and `file_data`
structure with data URI format
- **UUID-based filenames**: Generates appropriate filenames with
mime-type extensions
- **Provider-specific mapping**: File format conversion happens only in
OpenAI agent code

  ### Testing
- **Test coverage**: New tests for `can_read`, `from_file()`, and binary
data handling
- **Unicode error prevention**: Specific test for binary PDF-like
content
- **Base64 encoding verification**: Ensures proper data format for API
compatibility

  ## Design Principles

- **Provider-agnostic**: BinaryContent remains completely generic with
no OpenAI-specific dependencies
- **Clean separation**: OpenAI format mapping isolated to OpenAI agent
files
- **No FileContent class**: Enhances existing BinaryContent instead of
introducing new types
- **Follows existing patterns**: Similar approach to ImageContent and
TextContent handling

  ## Usage Example

  ```python
  response = await self.agent.get_response(
      messages=ChatMessageContent(
          content="Analyse PDF",
          role=AuthorRole("user"),
          items=[BinaryContent.from_file(file_path="c:/test.pdf")],
      )
  )
  ```

Co-authored-by: Evan Mattson <35585003+moonbox3@users.noreply.github.com>
diff --git a/python/samples/concepts/README.md b/python/samples/concepts/README.md
@@ -95,6 +95,7 @@
 - [OpenAI Responses Agent Declarative File Search](./agents/openai_responses/openai_responses_agent_declarative_file_search.py)
 - [OpenAI Responses Agent Declarative Function Calling From File](./agents/openai_responses/openai_responses_agent_declarative_function_calling_from_file.py)
 - [OpenAI Responses Agent Declarative Web Search](./agents/openai_responses/openai_responses_agent_declarative_web_search.py)
+- [OpenAI Responses Binary Content Upload](./agents/openai_responses/responses_agent_binary_content_upload.py)
 - [OpenAI Responses Message Callback Streaming](./agents/openai_responses/responses_agent_message_callback_streaming.py)
 - [OpenAI Responses Message Callback](./agents/openai_responses/responses_agent_message_callback.py)
 - [OpenAI Responses File Search Streaming](./agents/openai_responses/responses_agent_file_search_streaming.py)
diff --git a/python/samples/concepts/agents/openai_responses/responses_agent_binary_content_upload.py b/python/samples/concepts/agents/openai_responses/responses_agent_binary_content_upload.py
@@ -0,0 +1,188 @@
+# Copyright (c) Microsoft. All rights reserved.
+import asyncio
+import os
+import tempfile
+
+from semantic_kernel.agents import OpenAIResponsesAgent
+from semantic_kernel.connectors.ai.open_ai import OpenAISettings
+from semantic_kernel.contents.binary_content import BinaryContent
+from semantic_kernel.contents.chat_message_content import ChatMessageContent
+from semantic_kernel.contents.text_content import TextContent
+from semantic_kernel.contents.utils.author_role import AuthorRole
+
+"""
+The following sample demonstrates how to upload PDF and text files using BinaryContent
+with an OpenAI Responses Agent. This shows how to create BinaryContent objects from files
+and compose multi-modal messages that combine text and binary content.
+
+The sample demonstrates:
+1. Creating BinaryContent from a PDF file
+2. Creating BinaryContent from a text file
+3. Composing multi-modal messages with mixed content types (text + binary)
+4. Sending complex messages directly to the agent via the messages parameter
+5. Having the agent process and respond to questions about the uploaded files
+
+This approach differs from simple string-based interactions by showing how to combine
+multiple content types within a single message, which is useful for rich media interactions.
+
+Note: This sample uses the existing employees.pdf file from the resources directory.
+"""
+
+# Sample follow-up questions to demonstrate continued conversation
+USER_INPUTS = [
+    "What specific types of files did I just upload?",
+    "Can you tell me about the content in the PDF file?",
+    "What does the text file contain?",
+    "Can you provide a summary of both documents?",
+]
+
+
+def create_sample_text_content() -> str:
+    """Create sample text content for demonstration purposes.
+
+    Returns:
+        str: A sample company policy document in text format.
+    """
+    return """Company Policy Document - Remote Work Guidelines
+
+This document outlines our company's remote work policies and procedures.
+
+Remote Work Eligibility:
+- Full-time employees with at least 6 months tenure
+- Managers approval required
+- Home office setup must meet security requirements
+
+Work Schedule:
+- Core hours: 10 AM - 3 PM local time
+- Flexible start/end times outside core hours
+- Maximum 3 remote days per week for hybrid roles
+
+Communication Requirements:
+- Daily check-ins with team lead
+- Weekly video conference participation
+- Response time: within 4 hours during business hours
+
+Equipment and Security:
+- Company-provided laptop and VPN access
+- Secure Wi-Fi connection required
+- No public Wi-Fi for work activities
+
+For questions about remote work policies, contact HR at hr@company.com
+"""
+
+
+async def main():
+    # 1. Initialize the OpenAI client
+    client = OpenAIResponsesAgent.create_client()
+
+    # 2. Prepare file paths and create sample content
+    pdf_file_path = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))),
+        "resources",
+        "file_search",
+        "employees.pdf",
+    )
+
+    # Create a temporary text file for demonstration purposes
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as text_file:
+        text_content = create_sample_text_content()
+        text_file.write(text_content)
+        text_file_path = text_file.name
+
+    try:
+        # 3. Create BinaryContent objects from files using different methods
+        print("Creating BinaryContent from files...")
+
+        # Method 1: Create BinaryContent from an existing PDF file
+        pdf_binary_content = BinaryContent.from_file(file_path=pdf_file_path, mime_type="application/pdf")
+        print(f"Created PDF BinaryContent: {pdf_binary_content.mime_type}, can_read: {pdf_binary_content.can_read}")
+
+        # Method 2: Create BinaryContent from the temporary text file
+        text_binary_content = BinaryContent.from_file(file_path=text_file_path, mime_type="text/plain")
+        print(f"Created text BinaryContent: {text_binary_content.mime_type}, can_read: {text_binary_content.can_read}")
+
+        # Method 3: Create BinaryContent directly from in-memory data
+        # This approach allows creating BinaryContent without file I/O operations
+        alternative_text_content = BinaryContent(
+            data=text_content.encode("utf-8"), mime_type="text/plain", data_format="base64"
+        )
+        print(f"Alternative text BinaryContent: {alternative_text_content.mime_type}")
+
+        # 4. Initialize the OpenAI Responses Agent with file analysis capabilities
+        # Configure the AI model for responses
+        settings = OpenAISettings()
+        responses_model = settings.responses_model_id or "gpt-4o"
+
+        agent = OpenAIResponsesAgent(
+            ai_model_id=responses_model,
+            client=client,
+            instructions=(
+                "You are a helpful assistant that can analyze uploaded files. "
+                "When users upload files, examine their content and provide helpful insights. "
+                "You can identify file types, summarize content, and answer questions about the files."
+            ),
+            name="FileAnalyzer",
+        )
+
+        # 5. Demonstrate multi-modal message composition
+        # This showcases combining text and binary content in a single message
+
+        # Compose a message containing both text instructions and file attachments
+        # This pattern is ideal for scenarios requiring rich, mixed-content interactions
+        initial_message = ChatMessageContent(
+            role=AuthorRole.USER,
+            items=[
+                TextContent(text="I'm uploading a PDF document and a text file for you to analyze."),
+                pdf_binary_content,
+                text_binary_content,
+            ],
+        )
+
+        # 6. Conduct a conversation with the agent about the uploaded files
+        thread = None
+
+        # Send the initial multi-modal message containing file uploads
+        print("\n# User: 'I'm uploading a PDF document and a text file for you to analyze.'")
+        first_chunk = True
+        async for response in agent.invoke_stream(messages=initial_message, thread=thread):
+            thread = response.thread
+            if first_chunk:
+                print(f"# {response.name}: ", end="", flush=True)
+                first_chunk = False
+            print(response.content, end="", flush=True)
+        print()  # New line after response
+
+        # Continue the conversation with text-based follow-up questions
+        for user_input in USER_INPUTS:
+            print(f"\n# User: '{user_input}'")
+
+            # Process follow-up questions using standard text input
+            first_chunk = True
+            async for response in agent.invoke_stream(messages=user_input, thread=thread):
+                thread = response.thread
+                if first_chunk:
+                    print(f"# {response.name}: ", end="", flush=True)
+                    first_chunk = False
+                print(response.content, end="", flush=True)
+            print()  # New line after response
+
+    finally:
+        # 7. Clean up temporary resources
+        if os.path.exists(text_file_path):
+            os.unlink(text_file_path)
+
+    print("\n" + "=" * 60)
+    print("Sample completed!")
+    print("\nKey points about BinaryContent:")
+    print("1. Use BinaryContent.from_file() to create from existing files")
+    print("2. Use BinaryContent(data=...) to create from bytes/string data")
+    print("3. Specify appropriate mime_type for proper handling")
+    print("4. BinaryContent can be included in chat messages alongside text")
+    print("5. The OpenAI Responses API will process supported file types")
+    print("\nSupported file types include:")
+    print("- PDF documents (application/pdf)")
+    print("- Text files (text/plain)")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/python/semantic_kernel/agents/open_ai/responses_agent_thread_actions.py b/python/semantic_kernel/agents/open_ai/responses_agent_thread_actions.py
@@ -2,6 +2,7 @@
 
 import asyncio
 import logging
+import uuid
 from collections.abc import AsyncIterable, Sequence
 from functools import reduce
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, cast
@@ -31,6 +32,7 @@
 from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior
 from semantic_kernel.connectors.ai.open_ai.exceptions.content_filter_ai_exception import ContentFilterAIException
 from semantic_kernel.contents.annotation_content import AnnotationContent
+from semantic_kernel.contents.binary_content import BinaryContent
 from semantic_kernel.contents.chat_history import ChatHistory
 from semantic_kernel.contents.chat_message_content import CMC_ITEM_TYPES, ChatMessageContent
 from semantic_kernel.contents.function_call_content import FunctionCallContent
@@ -721,6 +723,51 @@ def _prepare_chat_history_for_request(
                             "call_id": content.call_id,
                         }
                         response_inputs.append(rfrc_dict)
+                    case BinaryContent() if content.can_read:
+                        # Generate filename with appropriate extension based on mime type
+                        extension = ""
+                        if content.mime_type == "application/pdf":
+                            extension = ".pdf"
+                        elif content.mime_type.startswith("text/"):
+                            extension = ".txt"
+                        elif content.mime_type.startswith("image/"):
+                            # For image content, warn that ImageContent class should be used instead
+                            logger.warning(
+                                f"Using BinaryContent for image type '{content.mime_type}'. "
+                                "Use ImageContent for handling of images."
+                            )
+                            extension = f".{content.mime_type.split('/')[-1]}"
+                        elif content.mime_type.startswith("audio/"):
+                            # For audio content, warn that AudioContent class should be used instead
+                            logger.warning(
+                                f"Use BinaryContent for audio type '{content.mime_type}'. "
+                                "Use AudioContent for handling of audio."
+                            )
+                            extension = f".{content.mime_type.split('/')[-1]}"
+                        else:
+                            # For other binary types, use generic extension based on MIME type
+                            # or fallback to .bin for application/octet-stream
+                            mime_subtype = (
+                                content.mime_type.split("/")[-1]
+                                if "/" in content.mime_type
+                                else "application/octet-stream"
+                            )
+                            extension = f".{mime_subtype}"
+                            logger.warning(
+                                f"Using binary content with mime type '{content.mime_type}' "
+                                f"which may not be supported by the OpenAI Responses API"
+                            )
+
+                        filename = f"{uuid.uuid4()}{extension}"
+
+                        # Format according to OpenAI Responses API specification
+                        file_data_uri = f"data:{content.mime_type};base64,{content.data_string}"
+                        contents.append({
+                            "type": "input_file",
+                            "filename": filename,
+                            "file_data": file_data_uri,
+                        })
+                        response_inputs.append({"role": original_role, "content": contents})
 
         return response_inputs
 
diff --git a/python/semantic_kernel/contents/binary_content.py b/python/semantic_kernel/contents/binary_content.py
@@ -165,6 +165,53 @@ def mime_type(self, value: str):
         if self._data_uri:
             self._data_uri.mime_type = value
 
+    @property
+    def can_read(self) -> bool:
+        """Get whether the content can be read.
+
+        Returns True if the content has data available for reading.
+        """
+        return self._data_uri is not None
+
+    @classmethod
+    def from_file(
+        cls: type[_T],
+        file_path: str | Path,
+        mime_type: str | None = None,
+    ) -> _T:
+        """Create BinaryContent from a file.
+
+        Args:
+            file_path: Path to the file to read
+            mime_type: MIME type of the file content
+
+        Returns:
+            BinaryContent instance with file data
+
+        Raises:
+            FileNotFoundError: If the file doesn't exist
+            ContentInitializationError: If the path is not a file
+        """
+        from semantic_kernel.exceptions.content_exceptions import ContentInitializationError
+
+        path = Path(file_path)
+
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        if not path.is_file():
+            raise ContentInitializationError(f"Path is not a file: {file_path}")
+
+        # Read file as binary data to handle all file types properly
+        data = path.read_bytes()
+
+        return cls(
+            data=data,
+            mime_type=mime_type,
+            uri=str(path),
+            data_format="base64",
+        )
+
     def __str__(self) -> str:
         """Return the string representation of the content."""
         return self.data_uri if self._data_uri else str(self.uri)
diff --git a/python/tests/unit/contents/test_binary_content.py b/python/tests/unit/contents/test_binary_content.py