Skip to content

Commit c7cb28f

Browse files
committed
Support image/video/document resolution with Gemini 3
1 parent 1fe583c commit c7cb28f

File tree

10 files changed

+1219
-47
lines changed

10 files changed

+1219
-47
lines changed

pydantic_ai_slim/pydantic_ai/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
AgentStreamEvent,
3636
AudioFormat,
3737
AudioMediaType,
38+
AudioOptions,
3839
AudioUrl,
3940
BaseToolCallPart,
4041
BaseToolReturnPart,
@@ -45,7 +46,9 @@
4546
CachePoint,
4647
DocumentFormat,
4748
DocumentMediaType,
49+
DocumentOptions,
4850
DocumentUrl,
51+
FileOptions,
4952
FilePart,
5053
FileUrl,
5154
FinalResultEvent,
@@ -55,6 +58,7 @@
5558
HandleResponseEvent,
5659
ImageFormat,
5760
ImageMediaType,
61+
ImageOptions,
5862
ImageUrl,
5963
ModelMessage,
6064
ModelMessagesTypeAdapter,
@@ -82,6 +86,7 @@
8286
UserPromptPart,
8387
VideoFormat,
8488
VideoMediaType,
89+
VideoOptions,
8590
VideoUrl,
8691
)
8792
from .output import NativeOutput, PromptedOutput, StructuredDict, TextOutput, ToolOutput
@@ -136,6 +141,7 @@
136141
'AgentStreamEvent',
137142
'AudioFormat',
138143
'AudioMediaType',
144+
'AudioOptions',
139145
'AudioUrl',
140146
'BaseToolCallPart',
141147
'BaseToolReturnPart',
@@ -145,7 +151,9 @@
145151
'CachePoint',
146152
'DocumentFormat',
147153
'DocumentMediaType',
154+
'DocumentOptions',
148155
'DocumentUrl',
156+
'FileOptions',
149157
'FileUrl',
150158
'FilePart',
151159
'FinalResultEvent',
@@ -155,6 +163,7 @@
155163
'HandleResponseEvent',
156164
'ImageFormat',
157165
'ImageMediaType',
166+
'ImageOptions',
158167
'ImageUrl',
159168
'BinaryImage',
160169
'ModelMessage',
@@ -183,6 +192,7 @@
183192
'UserPromptPart',
184193
'VideoFormat',
185194
'VideoMediaType',
195+
'VideoOptions',
186196
'VideoUrl',
187197
# profiles
188198
'ModelProfile',

pydantic_ai_slim/pydantic_ai/messages.py

Lines changed: 164 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import pydantic_core
1414
from genai_prices import calc_price, types as genai_types
1515
from opentelemetry._events import Event # pyright: ignore[reportPrivateImportUsage]
16-
from typing_extensions import deprecated
16+
from typing_extensions import TypedDict, deprecated
1717

1818
from . import _otel_messages, _utils
1919
from ._utils import generate_tool_call_id as _generate_tool_call_id, now_utc as _now_utc
@@ -106,6 +106,77 @@ def _multi_modal_content_identifier(identifier: str | bytes) -> str:
106106
return hashlib.sha1(identifier).hexdigest()[:6]
107107

108108

109+
class FileOptions(TypedDict, total=False):
110+
"""Options for how the provider should process the file."""
111+
112+
pass
113+
114+
115+
class VideoOptions(TypedDict, total=False):
116+
"""Options for how the provider should process the video."""
117+
118+
detail: Literal['high', 'medium', 'low']
119+
"""The detail level of the video.
120+
121+
Supported by:
122+
123+
- Google: Maps to `media_resolution`: https://ai.google.dev/gemini-api/docs/gemini-3?thinking=high#media_resolution
124+
"""
125+
126+
fps: float
127+
"""The frame rate of the video sent to the model. Defaults to 1.0 if not specified. Must be greater than 0.0 and at most 24.0.
128+
129+
Supported by:
130+
131+
- Google: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
132+
"""
133+
134+
start_offset: str
135+
"""The start offset of the video sent to the model, given as a duration string in seconds (for example `'40s'`).
136+
137+
Supported by:
138+
139+
- Google: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
140+
"""
141+
142+
end_offset: str
143+
"""The end offset of the video sent to the model, given as a duration string in seconds (for example `'80s'`).
144+
145+
Supported by:
146+
147+
- Google: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
148+
"""
149+
150+
151+
class ImageOptions(TypedDict, total=False):
152+
"""Options for how the provider should process the image."""
153+
154+
detail: Literal['high', 'medium', 'low']
155+
"""The detail level of the image.
156+
157+
Supported by:
158+
159+
- OpenAI: Supports only `high` and `low`: https://platform.openai.com/docs/guides/images-vision?api-mode=responses#specify-image-input-detail-level
160+
- Google: Maps to `media_resolution`: https://ai.google.dev/gemini-api/docs/gemini-3?thinking=high#media_resolution
161+
"""
162+
163+
164+
class DocumentOptions(TypedDict, total=False):
165+
"""Options for how the provider should process the document."""
166+
167+
detail: Literal['high', 'medium', 'low']
168+
"""The detail level of the document.
169+
170+
Supported by:
171+
172+
- Google: Maps to `media_resolution`: https://ai.google.dev/gemini-api/docs/gemini-3?thinking=high#media_resolution
173+
"""
174+
175+
176+
class AudioOptions(TypedDict, total=False):
177+
"""Options for how the provider should process the audio."""
178+
179+
109180
@dataclass(init=False, repr=False)
110181
class FileUrl(ABC):
111182
"""Abstract base class for any URL-based file."""
@@ -122,13 +193,9 @@ class FileUrl(ABC):
122193
* If False, the URL is sent directly to the model and no download is performed.
123194
"""
124195

125-
vendor_metadata: dict[str, Any] | None = None
126-
"""Vendor-specific metadata for the file.
127-
128-
Supported by:
129-
- `GoogleModel`: `VideoUrl.vendor_metadata` is used as `video_metadata`: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
130-
- `OpenAIChatModel`, `OpenAIResponsesModel`: `ImageUrl.vendor_metadata['detail']` is used as `detail` setting for images
131-
"""
196+
# TODO (v2): Rename to `options`?
197+
vendor_metadata: FileOptions | None = None
198+
"""Options for how the provider should process the file."""
132199

133200
_media_type: Annotated[str | None, pydantic.Field(alias='media_type', default=None, exclude=True)] = field(
134201
compare=False, default=None
@@ -145,7 +212,7 @@ def __init__(
145212
media_type: str | None = None,
146213
identifier: str | None = None,
147214
force_download: bool = False,
148-
vendor_metadata: dict[str, Any] | None = None,
215+
vendor_metadata: FileOptions | None = None,
149216
) -> None:
150217
self.url = url
151218
self._media_type = media_type
@@ -209,7 +276,7 @@ def __init__(
209276
media_type: str | None = None,
210277
identifier: str | None = None,
211278
force_download: bool = False,
212-
vendor_metadata: dict[str, Any] | None = None,
279+
vendor_metadata: VideoOptions | None = None,
213280
kind: Literal['video-url'] = 'video-url',
214281
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
215282
_media_type: str | None = None,
@@ -285,7 +352,7 @@ def __init__(
285352
media_type: str | None = None,
286353
identifier: str | None = None,
287354
force_download: bool = False,
288-
vendor_metadata: dict[str, Any] | None = None,
355+
vendor_metadata: AudioOptions | None = None,
289356
kind: Literal['audio-url'] = 'audio-url',
290357
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
291358
_media_type: str | None = None,
@@ -348,7 +415,7 @@ def __init__(
348415
media_type: str | None = None,
349416
identifier: str | None = None,
350417
force_download: bool = False,
351-
vendor_metadata: dict[str, Any] | None = None,
418+
vendor_metadata: ImageOptions | None = None,
352419
kind: Literal['image-url'] = 'image-url',
353420
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
354421
_media_type: str | None = None,
@@ -406,7 +473,7 @@ def __init__(
406473
media_type: str | None = None,
407474
identifier: str | None = None,
408475
force_download: bool = False,
409-
vendor_metadata: dict[str, Any] | None = None,
476+
vendor_metadata: DocumentOptions | None = None,
410477
kind: Literal['document-url'] = 'document-url',
411478
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
412479
_media_type: str | None = None,
@@ -476,12 +543,8 @@ class BinaryContent:
476543
media_type: AudioMediaType | ImageMediaType | DocumentMediaType | str
477544
"""The media type of the binary data."""
478545

479-
vendor_metadata: dict[str, Any] | None = None
480-
"""Vendor-specific metadata for the file.
481-
482-
Supported by:
483-
- `GoogleModel`: `BinaryContent.vendor_metadata` is used as `video_metadata`: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
484-
- `OpenAIChatModel`, `OpenAIResponsesModel`: `BinaryContent.vendor_metadata['detail']` is used as `detail` setting for images
546+
vendor_metadata: FileOptions | None = None
547+
"""Options for how the provider should process the file.
485548
"""
486549

487550
_identifier: Annotated[str | None, pydantic.Field(alias='identifier', default=None, exclude=True)] = field(
@@ -491,13 +554,91 @@ class BinaryContent:
491554
kind: Literal['binary'] = 'binary'
492555
"""Type identifier, this is available on all parts as a discriminator."""
493556

557+
@overload
558+
def __init__(
559+
self,
560+
data: bytes,
561+
*,
562+
media_type: ImageMediaType,
563+
identifier: str | None = None,
564+
vendor_metadata: ImageOptions | None = None,
565+
kind: Literal['binary'] = 'binary',
566+
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
567+
_identifier: str | None = None,
568+
) -> None: ...
569+
570+
@overload
571+
def __init__(
572+
self,
573+
data: bytes,
574+
*,
575+
media_type: VideoMediaType,
576+
identifier: str | None = None,
577+
vendor_metadata: VideoOptions | None = None,
578+
kind: Literal['binary'] = 'binary',
579+
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
580+
_identifier: str | None = None,
581+
) -> None: ...
582+
583+
@overload
584+
def __init__(
585+
self,
586+
data: bytes,
587+
*,
588+
media_type: DocumentMediaType | str,
589+
identifier: str | None = None,
590+
vendor_metadata: DocumentOptions | None = None,
591+
kind: Literal['binary'] = 'binary',
592+
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
593+
_identifier: str | None = None,
594+
) -> None: ...
595+
596+
@overload
597+
def __init__(
598+
self,
599+
data: bytes,
600+
*,
601+
media_type: AudioMediaType,
602+
identifier: str | None = None,
603+
vendor_metadata: AudioOptions | None = None,
604+
kind: Literal['binary'] = 'binary',
605+
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
606+
_identifier: str | None = None,
607+
) -> None: ...
608+
609+
@overload
610+
def __init__(
611+
self,
612+
data: bytes,
613+
*,
614+
media_type: str,
615+
identifier: str | None = None,
616+
vendor_metadata: FileOptions | None = None,
617+
kind: Literal['binary'] = 'binary',
618+
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
619+
_identifier: str | None = None,
620+
) -> None: ...
621+
622+
@overload
623+
def __init__(
624+
self,
625+
data: bytes,
626+
*,
627+
media_type: AudioMediaType | str,
628+
identifier: str | None = None,
629+
vendor_metadata: AudioOptions | None = None,
630+
kind: Literal['binary'] = 'binary',
631+
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
632+
_identifier: str | None = None,
633+
) -> None: ...
634+
494635
def __init__(
495636
self,
496637
data: bytes,
497638
*,
498-
media_type: AudioMediaType | ImageMediaType | DocumentMediaType | str,
639+
media_type: AudioMediaType | ImageMediaType | VideoMediaType | DocumentMediaType | str,
499640
identifier: str | None = None,
500-
vendor_metadata: dict[str, Any] | None = None,
641+
vendor_metadata: FileOptions | None = None,
501642
kind: Literal['binary'] = 'binary',
502643
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
503644
_identifier: str | None = None,
@@ -516,7 +657,7 @@ def narrow_type(bc: BinaryContent) -> BinaryContent | BinaryImage:
516657
data=bc.data,
517658
media_type=bc.media_type,
518659
identifier=bc.identifier,
519-
vendor_metadata=bc.vendor_metadata,
660+
vendor_metadata=cast(ImageOptions, bc.vendor_metadata),
520661
)
521662
else:
522663
return bc
@@ -599,7 +740,7 @@ def __init__(
599740
*,
600741
media_type: str,
601742
identifier: str | None = None,
602-
vendor_metadata: dict[str, Any] | None = None,
743+
vendor_metadata: ImageOptions | None = None,
603744
# Required for inline-snapshot which expects all dataclass `__init__` methods to take all field names as kwargs.
604745
kind: Literal['binary'] = 'binary',
605746
_identifier: str | None = None,

0 commit comments

Comments
 (0)