Skip to content

Commit 3267e3a

Browse files
authored
Add DocumentUrl and support document via BinaryContent (#987)
1 parent f96849c commit 3267e3a

24 files changed

+2840
-44
lines changed

docs/input.md

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
# Image and Audio Input
1+
# Image, Audio & Document Input
22

3-
Some LLMs are now capable of understanding both audio and image content.
3+
Some LLMs are now capable of understanding audio, image and document content.
44

55
## Image Input
66

@@ -51,3 +51,54 @@ print(result.data)
5151
Some models do not support audio input. Please check the model's documentation to confirm whether it supports audio input.
5252

5353
You can provide audio input using either [`AudioUrl`][pydantic_ai.AudioUrl] or [`BinaryContent`][pydantic_ai.BinaryContent]. The process is analogous to the examples above.
54+
55+
## Document Input
56+
57+
!!! info
58+
Some models do not support document input. Please check the model's documentation to confirm whether it supports document input.
59+
60+
!!! warning
61+
When using Gemini models, the document content will always be sent as binary data, regardless of whether you use `DocumentUrl` or `BinaryContent`. This is due to differences in how Vertex AI and Google AI handle document inputs.
62+
63+
For more details, see [this discussion](https://discuss.ai.google.dev/t/i-am-using-google-generative-ai-model-gemini-1-5-pro-for-image-analysis-but-getting-error/34866/4).
64+
65+
If you are unsatisfied with this behavior, please let us know by opening an issue on
66+
[GitHub](https://github.com/pydantic/pydantic-ai/issues).
67+
68+
You can provide document input using either [`DocumentUrl`][pydantic_ai.DocumentUrl] or [`BinaryContent`][pydantic_ai.BinaryContent]. The process is similar to the examples above.
69+
70+
If you have a direct URL for the document, you can use [`DocumentUrl`][pydantic_ai.DocumentUrl]:
71+
72+
```py {title="main.py" test="skip" lint="skip"}
73+
from pydantic_ai import Agent, DocumentUrl
74+
75+
agent = Agent(model='anthropic:claude-3-sonnet')
76+
result = agent.run_sync(
77+
[
78+
'What is the main content of this document?',
79+
DocumentUrl(url='https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf'),
80+
]
81+
)
82+
print(result.data)
83+
#> This document is the technical report introducing Gemini 1.5, Google's latest large language model...
84+
```
85+
86+
The supported document formats vary by model.
87+
88+
You can also use [`BinaryContent`][pydantic_ai.BinaryContent] to pass document data directly:
89+
90+
```py {title="main.py" test="skip" lint="skip"}
91+
from pathlib import Path
92+
from pydantic_ai import Agent, BinaryContent
93+
94+
pdf_path = Path('document.pdf')
95+
agent = Agent(model='anthropic:claude-3-sonnet')
96+
result = agent.run_sync(
97+
[
98+
'What is the main content of this document?',
99+
BinaryContent(data=pdf_path.read_bytes(), media_type='application/pdf'),
100+
]
101+
)
102+
print(result.data)
103+
#> The document discusses...
104+
```

pydantic_ai_slim/pydantic_ai/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
UsageLimitExceeded,
1111
UserError,
1212
)
13-
from .messages import AudioUrl, BinaryContent, ImageUrl
13+
from .messages import AudioUrl, BinaryContent, DocumentUrl, ImageUrl
1414
from .tools import RunContext, Tool
1515

1616
__all__ = (
@@ -33,6 +33,7 @@
3333
# messages
3434
'ImageUrl',
3535
'AudioUrl',
36+
'DocumentUrl',
3637
'BinaryContent',
3738
# tools
3839
'Tool',

pydantic_ai_slim/pydantic_ai/messages.py

Lines changed: 112 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from collections.abc import Sequence
55
from dataclasses import dataclass, field, replace
66
from datetime import datetime
7+
from mimetypes import guess_type
78
from typing import Annotated, Any, Literal, Union, cast, overload
89

910
import pydantic
@@ -83,9 +84,57 @@ def media_type(self) -> ImageMediaType:
8384
else:
8485
raise ValueError(f'Unknown image file extension: {self.url}')
8586

87+
@property
88+
def format(self) -> ImageFormat:
89+
"""The file format of the image.
90+
91+
The choice of supported formats was based on the Bedrock Converse API. Other APIs don't require the use of a format.
92+
"""
93+
return _image_format(self.media_type)
94+
95+
96+
@dataclass
class DocumentUrl:
    """A document referenced by its URL."""

    url: str
    """The URL of the document."""

    kind: Literal['document-url'] = 'document-url'
    """Type identifier, this is available on all parts as a discriminator."""

    @property
    def media_type(self) -> str:
        """Return the media type of the document, guessed from the file extension in the URL.

        Raises:
            RuntimeError: If no media type can be guessed from the URL.
        """
        type_, _ = guess_type(self.url)
        if type_ is None:
            # Depending on the Python version, `guess_type` may inspect the whole string, in which
            # case a query string or fragment (e.g. a signed URL) hides the extension.
            # Retry on the path component only; URLs that already resolved are unaffected.
            from urllib.parse import urlsplit

            type_, _ = guess_type(urlsplit(self.url).path)
        if type_ is None:
            raise RuntimeError(f'Unknown document file extension: {self.url}')
        return type_

    @property
    def format(self) -> DocumentFormat:
        """The file format of the document.

        The choice of supported formats was based on the Bedrock Converse API. Other APIs don't require the use
        of a format.

        Raises:
            RuntimeError: If no media type can be guessed from the URL.
            ValueError: If the guessed media type is not a known document media type.
        """
        return _document_format(self.media_type)
121+
86122

87123
AudioMediaType: TypeAlias = Literal['audio/wav', 'audio/mpeg']
88124
ImageMediaType: TypeAlias = Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']
125+
DocumentMediaType: TypeAlias = Literal[
126+
'application/pdf',
127+
'text/plain',
128+
'text/csv',
129+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
130+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
131+
'text/html',
132+
'text/markdown',
133+
'application/vnd.ms-excel',
134+
]
135+
AudioFormat: TypeAlias = Literal['wav', 'mp3']
136+
ImageFormat: TypeAlias = Literal['jpeg', 'png', 'gif', 'webp']
137+
DocumentFormat: TypeAlias = Literal['csv', 'doc', 'docx', 'html', 'md', 'pdf', 'txt', 'xls', 'xlsx']
89138

90139

91140
@dataclass
@@ -95,7 +144,7 @@ class BinaryContent:
95144
data: bytes
96145
"""The binary data."""
97146

98-
media_type: AudioMediaType | ImageMediaType | str
147+
media_type: AudioMediaType | ImageMediaType | DocumentMediaType | str
99148
"""The media type of the binary data."""
100149

101150
kind: Literal['binary'] = 'binary'
@@ -112,17 +161,69 @@ def is_image(self) -> bool:
112161
return self.media_type.startswith('image/')
113162

114163
@property
115-
def audio_format(self) -> Literal['mp3', 'wav']:
116-
"""Return the audio format given the media type."""
117-
if self.media_type == 'audio/mpeg':
118-
return 'mp3'
119-
elif self.media_type == 'audio/wav':
120-
return 'wav'
121-
else:
122-
raise ValueError(f'Unknown audio media type: {self.media_type}')
164+
def is_document(self) -> bool:
    """Tell whether `media_type` is one of the known document media types."""
    document_media_types = frozenset(
        (
            'application/pdf',
            'text/plain',
            'text/csv',
            'text/html',
            'text/markdown',
            'application/vnd.ms-excel',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        )
    )
    return self.media_type in document_media_types
123176

124-
125-
UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | BinaryContent'
177+
@property
178+
def format(self) -> str:
    """Return the short file format name matching the media type of this binary content.

    Raises:
        ValueError: If the media type is not a recognised audio, image or document type.
    """
    if not self.is_audio:
        if self.is_image:
            return _image_format(self.media_type)
        if self.is_document:
            return _document_format(self.media_type)
    elif self.media_type == 'audio/mpeg':
        return 'mp3'
    elif self.media_type == 'audio/wav':
        return 'wav'
    # Reached for audio types other than the two above and for anything that is not audio, image or document.
    raise ValueError(f'Unknown media type: {self.media_type}')
190+
191+
192+
UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | DocumentUrl | BinaryContent'
193+
194+
195+
# Media type -> short document format name. Every value is a member of `DocumentFormat`.
_DOCUMENT_FORMATS: dict[str, DocumentFormat] = {
    'application/pdf': 'pdf',
    'text/plain': 'txt',
    'text/csv': 'csv',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
    'text/html': 'html',
    'text/markdown': 'md',
    'application/vnd.ms-excel': 'xls',
    # `DocumentFormat` lists 'doc'; this is the media type for legacy Word files, so `.doc` URLs resolve too.
    'application/msword': 'doc',
}


def _document_format(media_type: str) -> DocumentFormat:
    """Map a document media type to its short format name.

    Args:
        media_type: The media type to translate, e.g. `'application/pdf'`.

    Returns:
        The matching format name, e.g. `'pdf'`.

    Raises:
        ValueError: If the media type is not a known document media type.
    """
    try:
        return _DOCUMENT_FORMATS[media_type]
    except KeyError:
        raise ValueError(f'Unknown document media type: {media_type}') from None
214+
215+
216+
def _image_format(media_type: str) -> ImageFormat:
    """Map an image media type to its short format name.

    Raises:
        ValueError: If the media type is not one of the known image media types.
    """
    known_formats: tuple[tuple[str, ImageFormat], ...] = (
        ('image/jpeg', 'jpeg'),
        ('image/png', 'png'),
        ('image/gif', 'gif'),
        ('image/webp', 'webp'),
    )
    for known_media_type, image_format in known_formats:
        if media_type == known_media_type:
            return image_format
    raise ValueError(f'Unknown image media type: {media_type}')
126227

127228

128229
@dataclass

pydantic_ai_slim/pydantic_ai/models/anthropic.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99
from json import JSONDecodeError, loads as json_loads
1010
from typing import Any, Literal, Union, cast, overload
1111

12+
from anthropic.types import DocumentBlockParam
1213
from httpx import AsyncClient as AsyncHTTPClient
1314
from typing_extensions import assert_never
1415

1516
from .. import ModelHTTPError, UnexpectedModelBehavior, _utils, usage
1617
from .._utils import guard_tool_call_id as _guard_tool_call_id
1718
from ..messages import (
1819
BinaryContent,
20+
DocumentUrl,
1921
ImageUrl,
2022
ModelMessage,
2123
ModelRequest,
@@ -42,11 +44,13 @@
4244
try:
4345
from anthropic import NOT_GIVEN, APIStatusError, AsyncAnthropic, AsyncStream
4446
from anthropic.types import (
47+
Base64PDFSourceParam,
4548
ContentBlock,
4649
ImageBlockParam,
4750
Message as AnthropicMessage,
4851
MessageParam,
4952
MetadataParam,
53+
PlainTextSourceParam,
5054
RawContentBlockDeltaEvent,
5155
RawContentBlockStartEvent,
5256
RawContentBlockStopEvent,
@@ -288,7 +292,9 @@ async def _map_message(self, messages: list[ModelMessage]) -> tuple[str, list[Me
288292
anthropic_messages: list[MessageParam] = []
289293
for m in messages:
290294
if isinstance(m, ModelRequest):
291-
user_content_params: list[ToolResultBlockParam | TextBlockParam | ImageBlockParam] = []
295+
user_content_params: list[
296+
ToolResultBlockParam | TextBlockParam | ImageBlockParam | DocumentBlockParam
297+
] = []
292298
for request_part in m.parts:
293299
if isinstance(request_part, SystemPromptPart):
294300
system_prompt += request_part.content
@@ -334,7 +340,9 @@ async def _map_message(self, messages: list[ModelMessage]) -> tuple[str, list[Me
334340
return system_prompt, anthropic_messages
335341

336342
@staticmethod
337-
async def _map_user_prompt(part: UserPromptPart) -> AsyncGenerator[ImageBlockParam | TextBlockParam]:
343+
async def _map_user_prompt(
344+
part: UserPromptPart,
345+
) -> AsyncGenerator[ImageBlockParam | TextBlockParam | DocumentBlockParam]:
338346
if isinstance(part.content, str):
339347
yield TextBlockParam(text=part.content, type='text')
340348
else:
@@ -379,6 +387,25 @@ async def _map_user_prompt(part: UserPromptPart) -> AsyncGenerator[ImageBlockPar
379387
)
380388
else: # pragma: no cover
381389
raise RuntimeError(f'Unsupported image type: {mime_type}')
390+
elif isinstance(item, DocumentUrl):
391+
response = await cached_async_http_client().get(item.url)
392+
response.raise_for_status()
393+
if item.media_type == 'application/pdf':
394+
yield DocumentBlockParam(
395+
source=Base64PDFSourceParam(
396+
data=io.BytesIO(response.content),
397+
media_type=item.media_type,
398+
type='base64',
399+
),
400+
type='document',
401+
)
402+
elif item.media_type == 'text/plain':
403+
yield DocumentBlockParam(
404+
source=PlainTextSourceParam(data=response.text, media_type=item.media_type, type='text'),
405+
type='document',
406+
)
407+
else: # pragma: no cover
408+
raise RuntimeError(f'Unsupported media type: {item.media_type}')
382409
else:
383410
raise RuntimeError(f'Unsupported content type: {type(item)}')
384411

0 commit comments

Comments
 (0)