Skip to content

Commit bf03ed6

Browse files
Support OpenAI image detail on ImageUrl and BinaryContent via vendor_metadata (#2987)
Co-authored-by: Douwe Maan <[email protected]>
1 parent 763e7bc commit bf03ed6

File tree

4 files changed

+79
-5
lines changed

4 files changed

+79
-5
lines changed

pydantic_ai_slim/pydantic_ai/messages.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -126,6 +126,7 @@ class FileUrl(ABC):
126126
127127
Supported by:
128128
- `GoogleModel`: `VideoUrl.vendor_metadata` is used as `video_metadata`: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
129+
- `OpenAIChatModel`, `OpenAIResponsesModel`: `ImageUrl.vendor_metadata['detail']` is used as the `detail` setting for images
129130
"""
130131

131132
_media_type: Annotated[str | None, pydantic.Field(alias='media_type', default=None, exclude=True)] = field(
@@ -471,6 +472,7 @@ class BinaryContent:
471472
472473
Supported by:
473474
- `GoogleModel`: `BinaryContent.vendor_metadata` is used as `video_metadata`: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
475+
- `OpenAIChatModel`, `OpenAIResponsesModel`: `BinaryContent.vendor_metadata['detail']` is used as the `detail` setting for images
474476
"""
475477

476478
kind: Literal['binary'] = 'binary'

pydantic_ai_slim/pydantic_ai/models/openai.py

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -757,12 +757,16 @@ async def _map_user_prompt(part: UserPromptPart) -> chat.ChatCompletionUserMessa
757757
if isinstance(item, str):
758758
content.append(ChatCompletionContentPartTextParam(text=item, type='text'))
759759
elif isinstance(item, ImageUrl):
760-
image_url = ImageURL(url=item.url)
760+
image_url: ImageURL = {'url': item.url}
761+
if metadata := item.vendor_metadata:
762+
image_url['detail'] = metadata.get('detail', 'auto')
761763
content.append(ChatCompletionContentPartImageParam(image_url=image_url, type='image_url'))
762764
elif isinstance(item, BinaryContent):
763765
base64_encoded = base64.b64encode(item.data).decode('utf-8')
764766
if item.is_image:
765-
image_url = ImageURL(url=f'data:{item.media_type};base64,{base64_encoded}')
767+
image_url: ImageURL = {'url': f'data:{item.media_type};base64,{base64_encoded}'}
768+
if metadata := item.vendor_metadata:
769+
image_url['detail'] = metadata.get('detail', 'auto')
766770
content.append(ChatCompletionContentPartImageParam(image_url=image_url, type='image_url'))
767771
elif item.is_audio:
768772
assert item.format in ('wav', 'mp3')
@@ -1387,11 +1391,17 @@ async def _map_user_prompt(part: UserPromptPart) -> responses.EasyInputMessagePa
13871391
elif isinstance(item, BinaryContent):
13881392
base64_encoded = base64.b64encode(item.data).decode('utf-8')
13891393
if item.is_image:
1394+
detail: Literal['auto', 'low', 'high'] = 'auto'
1395+
if metadata := item.vendor_metadata:
1396+
detail = cast(
1397+
Literal['auto', 'low', 'high'],
1398+
metadata.get('detail', 'auto'),
1399+
)
13901400
content.append(
13911401
responses.ResponseInputImageParam(
13921402
image_url=f'data:{item.media_type};base64,{base64_encoded}',
13931403
type='input_image',
1394-
detail='auto',
1404+
detail=detail,
13951405
)
13961406
)
13971407
elif item.is_document:
@@ -1410,8 +1420,15 @@ async def _map_user_prompt(part: UserPromptPart) -> responses.EasyInputMessagePa
14101420
else: # pragma: no cover
14111421
raise RuntimeError(f'Unsupported binary content type: {item.media_type}')
14121422
elif isinstance(item, ImageUrl):
1423+
detail: Literal['auto', 'low', 'high'] = 'auto'
1424+
if metadata := item.vendor_metadata:
1425+
detail = cast(Literal['auto', 'low', 'high'], metadata.get('detail', 'auto'))
14131426
content.append(
1414-
responses.ResponseInputImageParam(image_url=item.url, type='input_image', detail='auto')
1427+
responses.ResponseInputImageParam(
1428+
image_url=item.url,
1429+
type='input_image',
1430+
detail=detail,
1431+
)
14151432
)
14161433
elif isinstance(item, AudioUrl): # pragma: no cover
14171434
downloaded_item = await download_item(item, data_format='base64_uri', type_format='extension')

tests/models/test_openai.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -172,6 +172,27 @@ async def test_request_simple_usage(allow_model_requests: None):
172172
)
173173

174174

175+
async def test_openai_chat_image_detail_vendor_metadata(allow_model_requests: None):
176+
c = completion_message(
177+
ChatCompletionMessage(content='done', role='assistant'),
178+
)
179+
mock_client = MockOpenAI.create_mock(c)
180+
model = OpenAIChatModel('gpt-4o', provider=OpenAIProvider(openai_client=mock_client))
181+
agent = Agent(model)
182+
183+
image_url = ImageUrl('https://example.com/image.png', vendor_metadata={'detail': 'high'})
184+
binary_image = BinaryContent(b'\x89PNG', media_type='image/png', vendor_metadata={'detail': 'high'})
185+
186+
await agent.run(['Describe these inputs.', image_url, binary_image])
187+
188+
request_kwargs = get_mock_chat_completion_kwargs(mock_client)
189+
image_parts = [
190+
item['image_url'] for item in request_kwargs[0]['messages'][0]['content'] if item['type'] == 'image_url'
191+
]
192+
assert image_parts
193+
assert all(part['detail'] == 'high' for part in image_parts)
194+
195+
175196
async def test_request_structured_response(allow_model_requests: None):
176197
c = completion_message(
177198
ChatCompletionMessage(

tests/models/test_openai_responses.py

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@
3939
from pydantic_ai.usage import RequestUsage, RunUsage
4040

4141
from ..conftest import IsDatetime, IsStr, TestEnv, try_import
42-
from .mock_openai import MockOpenAIResponses, response_message
42+
from .mock_openai import MockOpenAIResponses, get_mock_responses_kwargs, response_message
4343

4444
with try_import() as imports_successful:
4545
from openai.types.responses.response_output_message import Content, ResponseOutputMessage, ResponseOutputText
@@ -78,6 +78,40 @@ async def test_openai_responses_model_simple_response(allow_model_requests: None
7878
assert result.output == snapshot('The capital of France is Paris.')
7979

8080

81+
async def test_openai_responses_image_detail_vendor_metadata(allow_model_requests: None):
82+
c = response_message(
83+
[
84+
ResponseOutputMessage(
85+
id='output-1',
86+
content=cast(list[Content], [ResponseOutputText(text='done', type='output_text', annotations=[])]),
87+
role='assistant',
88+
status='completed',
89+
type='message',
90+
)
91+
]
92+
)
93+
mock_client = MockOpenAIResponses.create_mock(c)
94+
model = OpenAIResponsesModel('gpt-4o', provider=OpenAIProvider(openai_client=mock_client))
95+
agent = Agent(model=model)
96+
97+
image_url = ImageUrl('https://example.com/image.png', vendor_metadata={'detail': 'high'})
98+
binary_image = BinaryContent(b'\x89PNG', media_type='image/png', vendor_metadata={'detail': 'high'})
99+
100+
result = await agent.run(['Describe these inputs.', image_url, binary_image])
101+
assert result.output == 'done'
102+
103+
response_kwargs = get_mock_responses_kwargs(mock_client)
104+
image_parts = [
105+
item
106+
for message in response_kwargs[0]['input']
107+
if message.get('role') == 'user'
108+
for item in message['content']
109+
if item['type'] == 'input_image'
110+
]
111+
assert image_parts
112+
assert all(part['detail'] == 'high' for part in image_parts)
113+
114+
81115
async def test_openai_responses_model_simple_response_with_tool_call(allow_model_requests: None, openai_api_key: str):
82116
model = OpenAIResponsesModel('gpt-4o', provider=OpenAIProvider(api_key=openai_api_key))
83117

0 commit comments

Comments
 (0)