Skip to content

Commit db13fd0

Browse files
authored
Explicitly request image response modality from Google API when model supports it (#3172)
1 parent 1e0e99c commit db13fd0

10 files changed

+64
-59
lines changed

docs/builtin-tools.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ The [`ImageGenerationTool`][pydantic_ai.builtin_tools.ImageGenerationTool] enabl
199199
| Provider | Supported | Notes |
200200
|----------|-----------|-------|
201201
| OpenAI Responses | ✅ | Full feature support. Only supported by models newer than `gpt-4o`. Metadata about the generated image, like the [`revised_prompt`](https://platform.openai.com/docs/guides/tools-image-generation#revised-prompt) sent to the underlying image model, is available on the [`BuiltinToolReturnPart`][pydantic_ai.messages.BuiltinToolReturnPart] that's available via [`ModelResponse.builtin_tool_calls`][pydantic_ai.messages.ModelResponse.builtin_tool_calls]. |
202-
| Google | ✅ | No parameter support. Only supported by [image generation models](https://ai.google.dev/gemini-api/docs/image-generation) like `gemini-2.5-flash-image-preview`. These models do not support [structured output](output.md) or [function tools](tools.md). These models will always generate images, even if this built-in tool is not explicitly specified. |
202+
| Google | ✅ | No parameter support. Only supported by [image generation models](https://ai.google.dev/gemini-api/docs/image-generation) like `gemini-2.5-flash-image`. These models do not support [structured output](output.md) or [function tools](tools.md). These models will always generate images, even if this built-in tool is not explicitly specified. |
203203
| Anthropic | ❌ | |
204204
| Groq | ❌ | |
205205
| Bedrock | ❌ | |
@@ -232,7 +232,7 @@ Image generation with Google [image generation models](https://ai.google.dev/gem
232232
```py {title="image_generation_google.py"}
233233
from pydantic_ai import Agent, BinaryImage
234234

235-
agent = Agent('google-gla:gemini-2.5-flash-image-preview')
235+
agent = Agent('google-gla:gemini-2.5-flash-image')
236236

237237
result = agent.run_sync('Tell me a two-sentence story about an axolotl with an illustration.')
238238
print(result.output)

pydantic_ai_slim/pydantic_ai/models/google.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
GroundingMetadata,
7474
HttpOptionsDict,
7575
MediaResolution,
76+
Modality,
7677
Part,
7778
PartDict,
7879
SafetySettingDict,
@@ -415,6 +416,10 @@ async def _build_content_and_config(
415416
tool_config = self._get_tool_config(model_request_parameters, tools)
416417
system_instruction, contents = await self._map_messages(messages)
417418

419+
modalities = [Modality.TEXT.value]
420+
if self.profile.supports_image_output:
421+
modalities.append(Modality.IMAGE.value)
422+
418423
http_options: HttpOptionsDict = {
419424
'headers': {'Content-Type': 'application/json', 'User-Agent': get_user_agent()}
420425
}
@@ -443,6 +448,7 @@ async def _build_content_and_config(
443448
tool_config=tool_config,
444449
response_mime_type=response_mime_type,
445450
response_schema=response_schema,
451+
response_modalities=modalities,
446452
)
447453
return contents, config
448454

tests/models/cassettes/test_google/test_google_image_and_text_output.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ interactions:
2020
- text: Tell me a two-sentence story about an axolotl with an illustration.
2121
role: user
2222
generationConfig: {}
23-
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image-preview:generateContent
23+
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent
2424
response:
2525
headers:
2626
alt-svc:
@@ -49,7 +49,7 @@ interactions:
4949
role: model
5050
finishReason: STOP
5151
index: 0
52-
modelVersion: gemini-2.5-flash-image-preview
52+
modelVersion: gemini-2.5-flash-image
5353
responseId: ZyjgaJODFaudz7IP2OK9-As
5454
usageMetadata:
5555
candidatesTokenCount: 1336

tests/models/cassettes/test_google/test_google_image_generation.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ interactions:
2020
- text: Generate an image of an axolotl.
2121
role: user
2222
generationConfig: {}
23-
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image-preview:generateContent
23+
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent
2424
response:
2525
headers:
2626
alt-svc:
@@ -48,7 +48,7 @@ interactions:
4848
role: model
4949
finishReason: STOP
5050
index: 0
51-
modelVersion: gemini-2.5-flash-image-preview
51+
modelVersion: gemini-2.5-flash-image
5252
responseId: eMXNaJf4FrDmqtsP2JGRsAQ
5353
usageMetadata:
5454
candidatesTokenCount: 1304
@@ -93,7 +93,7 @@ interactions:
9393
- text: Now give it a sombrero.
9494
role: user
9595
generationConfig: {}
96-
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image-preview:generateContent
96+
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent
9797
response:
9898
headers:
9999
alt-svc:
@@ -121,7 +121,7 @@ interactions:
121121
role: model
122122
finishReason: STOP
123123
index: 0
124-
modelVersion: gemini-2.5-flash-image-preview
124+
modelVersion: gemini-2.5-flash-image
125125
responseId: gMXNaOfYN4iUmtkPpO_k4AQ
126126
usageMetadata:
127127
candidatesTokenCount: 1304

tests/models/cassettes/test_google/test_google_image_generation_stream.yaml

Lines changed: 10 additions & 10 deletions
Large diffs are not rendered by default.

tests/models/cassettes/test_google/test_google_image_generation_with_text.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ interactions:
2020
- text: Generate an illustrated two-sentence story about an axolotl.
2121
role: user
2222
generationConfig: {}
23-
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image-preview:generateContent
23+
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent
2424
response:
2525
headers:
2626
alt-svc:
@@ -49,7 +49,7 @@ interactions:
4949
role: model
5050
finishReason: STOP
5151
index: 0
52-
modelVersion: gemini-2.5-flash-image-preview
52+
modelVersion: gemini-2.5-flash-image
5353
responseId: haHNaLDWN5n3qtsPzZTmkAc
5454
usageMetadata:
5555
candidatesTokenCount: 1335

tests/models/cassettes/test_google/test_google_image_or_text_output.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ interactions:
2020
- text: Tell me a two-sentence story about an axolotl, no image please.
2121
role: user
2222
generationConfig: {}
23-
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image-preview:generateContent
23+
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent
2424
response:
2525
headers:
2626
alt-svc:
@@ -46,7 +46,7 @@ interactions:
4646
role: model
4747
finishReason: STOP
4848
index: 0
49-
modelVersion: gemini-2.5-flash-image-preview
49+
modelVersion: gemini-2.5-flash-image
5050
responseId: tofdaKSRBMSgz7IPx5eRyAU
5151
usageMetadata:
5252
candidatesTokenCount: 48
@@ -79,7 +79,7 @@ interactions:
7979
- text: Generate an image of an axolotl.
8080
role: user
8181
generationConfig: {}
82-
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image-preview:generateContent
82+
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent
8383
response:
8484
headers:
8585
alt-svc:
@@ -107,7 +107,7 @@ interactions:
107107
role: model
108108
finishReason: STOP
109109
index: 0
110-
modelVersion: gemini-2.5-flash-image-preview
110+
modelVersion: gemini-2.5-flash-image
111111
responseId: vIfdaKSMD96ez7IPpr73qAM
112112
usageMetadata:
113113
candidatesTokenCount: 1304

tests/models/cassettes/test_google/test_google_multiple_images.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ interactions:
2020
- text: Generate two separate images of axolotls.
2121
role: user
2222
generationConfig: {}
23-
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image-preview:generateContent
23+
uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-image:generateContent
2424
response:
2525
headers:
2626
alt-svc:
@@ -50,7 +50,7 @@ interactions:
5050
role: model
5151
finishReason: STOP
5252
index: 0
53-
modelVersion: gemini-2.5-flash-image-preview
53+
modelVersion: gemini-2.5-flash-image
5454
responseId: R4DdaO-WON7Qz7IPhNrg0QU
5555
usageMetadata:
5656
candidatesTokenCount: 1303

tests/models/cassettes/test_google/test_google_vertexai_image_generation.yaml

Lines changed: 16 additions & 11 deletions
Large diffs are not rendered by default.

tests/models/test_google.py

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2590,7 +2590,7 @@ class CityLocation(BaseModel):
25902590

25912591

25922592
async def test_google_image_generation(allow_model_requests: None, google_provider: GoogleProvider):
2593-
m = GoogleModel('gemini-2.5-flash-image-preview', provider=google_provider)
2593+
m = GoogleModel('gemini-2.5-flash-image', provider=google_provider)
25942594
agent = Agent(m, output_type=BinaryImage)
25952595

25962596
result = await agent.run('Generate an image of an axolotl.')
@@ -2629,7 +2629,7 @@ async def test_google_image_generation(allow_model_requests: None, google_provid
26292629
output_tokens=1304,
26302630
details={'text_prompt_tokens': 10, 'image_candidates_tokens': 1290},
26312631
),
2632-
model_name='gemini-2.5-flash-image-preview',
2632+
model_name='gemini-2.5-flash-image',
26332633
timestamp=IsDatetime(),
26342634
provider_name='google-gla',
26352635
provider_details={'finish_reason': 'STOP'},
@@ -2673,7 +2673,7 @@ async def test_google_image_generation(allow_model_requests: None, google_provid
26732673
output_tokens=1304,
26742674
details={'text_prompt_tokens': 32, 'image_prompt_tokens': 1290, 'image_candidates_tokens': 1290},
26752675
),
2676-
model_name='gemini-2.5-flash-image-preview',
2676+
model_name='gemini-2.5-flash-image',
26772677
timestamp=IsDatetime(),
26782678
provider_name='google-gla',
26792679
provider_details={'finish_reason': 'STOP'},
@@ -2685,7 +2685,7 @@ async def test_google_image_generation(allow_model_requests: None, google_provid
26852685

26862686

26872687
async def test_google_image_generation_stream(allow_model_requests: None, google_provider: GoogleProvider):
2688-
m = GoogleModel('gemini-2.5-flash-image-preview', provider=google_provider)
2688+
m = GoogleModel('gemini-2.5-flash-image', provider=google_provider)
26892689
agent = Agent(m, output_type=BinaryImage)
26902690

26912691
async with agent.run_stream('Generate an image of an axolotl') as result:
@@ -2739,7 +2739,7 @@ async def test_google_image_generation_stream(allow_model_requests: None, google
27392739
output_tokens=1295,
27402740
details={'text_prompt_tokens': 10, 'image_candidates_tokens': 1290},
27412741
),
2742-
model_name='gemini-2.5-flash-image-preview',
2742+
model_name='gemini-2.5-flash-image',
27432743
timestamp=IsDatetime(),
27442744
provider_name='google-gla',
27452745
provider_details={'finish_reason': 'STOP'},
@@ -2768,7 +2768,7 @@ async def test_google_image_generation_stream(allow_model_requests: None, google
27682768

27692769

27702770
async def test_google_image_generation_with_text(allow_model_requests: None, google_provider: GoogleProvider):
2771-
m = GoogleModel('gemini-2.5-flash-image-preview', provider=google_provider)
2771+
m = GoogleModel('gemini-2.5-flash-image', provider=google_provider)
27722772
agent = Agent(m)
27732773

27742774
result = await agent.run('Generate an illustrated two-sentence story about an axolotl.')
@@ -2805,7 +2805,7 @@ async def test_google_image_generation_with_text(allow_model_requests: None, goo
28052805
output_tokens=1335,
28062806
details={'text_prompt_tokens': 14, 'image_candidates_tokens': 1290},
28072807
),
2808-
model_name='gemini-2.5-flash-image-preview',
2808+
model_name='gemini-2.5-flash-image',
28092809
timestamp=IsDatetime(),
28102810
provider_name='google-gla',
28112811
provider_details={'finish_reason': 'STOP'},
@@ -2817,8 +2817,8 @@ async def test_google_image_generation_with_text(allow_model_requests: None, goo
28172817

28182818

28192819
async def test_google_image_or_text_output(allow_model_requests: None, google_provider: GoogleProvider):
2820-
m = GoogleModel('gemini-2.5-flash-image-preview', provider=google_provider)
2821-
# ImageGenerationTool is listed here to indicate just that it doesn't cause any issues, even though it's not necessary with an image-preview model.
2820+
m = GoogleModel('gemini-2.5-flash-image', provider=google_provider)
2821+
# ImageGenerationTool is listed here just to indicate that it doesn't cause any issues, even though it's not necessary with an image model.
28222822
agent = Agent(m, output_type=str | BinaryImage, builtin_tools=[ImageGenerationTool()])
28232823

28242824
result = await agent.run('Tell me a two-sentence story about an axolotl, no image please.')
@@ -2837,7 +2837,7 @@ async def test_google_image_or_text_output(allow_model_requests: None, google_pr
28372837

28382838

28392839
async def test_google_image_and_text_output(allow_model_requests: None, google_provider: GoogleProvider):
2840-
m = GoogleModel('gemini-2.5-flash-image-preview', provider=google_provider)
2840+
m = GoogleModel('gemini-2.5-flash-image', provider=google_provider)
28412841
agent = Agent(m)
28422842

28432843
result = await agent.run('Tell me a two-sentence story about an axolotl with an illustration.')
@@ -2860,7 +2860,7 @@ class Animal(BaseModel):
28602860
species: str
28612861
name: str
28622862

2863-
model = GoogleModel('gemini-2.5-flash-image-preview', provider=google_provider)
2863+
model = GoogleModel('gemini-2.5-flash-image', provider=google_provider)
28642864
agent = Agent(model=model, output_type=Animal)
28652865

28662866
with pytest.raises(UserError, match='Tool output is not supported by this model.'):
@@ -2872,7 +2872,7 @@ class Animal(BaseModel):
28722872
species: str
28732873
name: str
28742874

2875-
model = GoogleModel('gemini-2.5-flash-image-preview', provider=google_provider)
2875+
model = GoogleModel('gemini-2.5-flash-image', provider=google_provider)
28762876
agent = Agent(model=model, output_type=NativeOutput(Animal))
28772877

28782878
with pytest.raises(UserError, match='Native structured output is not supported by this model.'):
@@ -2886,15 +2886,15 @@ class Animal(BaseModel):
28862886
species: str
28872887
name: str
28882888

2889-
model = GoogleModel('gemini-2.5-flash-image-preview', provider=google_provider)
2889+
model = GoogleModel('gemini-2.5-flash-image', provider=google_provider)
28902890
agent = Agent(model=model, output_type=PromptedOutput(Animal))
28912891

28922892
with pytest.raises(UserError, match='JSON output is not supported by this model.'):
28932893
await agent.run('Generate an image of an axolotl.')
28942894

28952895

28962896
async def test_google_image_generation_with_tools(allow_model_requests: None, google_provider: GoogleProvider):
2897-
model = GoogleModel('gemini-2.5-flash-image-preview', provider=google_provider)
2897+
model = GoogleModel('gemini-2.5-flash-image', provider=google_provider)
28982898
agent = Agent(model=model, output_type=BinaryImage)
28992899

29002900
@agent.tool_plain
@@ -2917,15 +2917,9 @@ async def test_google_image_generation_tool(allow_model_requests: None, google_p
29172917

29182918

29192919
async def test_google_vertexai_image_generation(allow_model_requests: None, vertex_provider: GoogleProvider):
2920-
model = GoogleModel('gemini-2.5-flash-image-preview', provider=vertex_provider)
2920+
model = GoogleModel('gemini-2.5-flash-image', provider=vertex_provider)
29212921

29222922
agent = Agent(model, output_type=BinaryImage)
29232923

29242924
result = await agent.run('Generate an image of an axolotl.')
2925-
assert result.output == snapshot(
2926-
BinaryImage(
2927-
data=IsBytes(),
2928-
media_type='image/png',
2929-
identifier='f3edd8',
2930-
)
2931-
)
2925+
assert result.output == snapshot(BinaryImage(data=IsBytes(), media_type='image/png', identifier='b037a4'))

0 commit comments

Comments
 (0)