
Commit 17f34ba

openai[minor]: add image generation to responses api (#31424)

Authored by eyurtsev and ccurme

Does not support partial images during generation at the moment. Before doing that I'd like to figure out how to specify the aggregation logic without requiring changes in core.

Co-authored-by: Chester Curme <[email protected]>

1 parent 9a78246 · commit 17f34ba
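
For context, a minimal sketch of what this commit enables, pieced together from the integration tests below (the model name and tool options are illustrative, not requirements):

```python
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4.1", use_responses_api=True)
tool = {"type": "image_generation", "quality": "low", "output_format": "jpeg"}

ai_message = llm.bind_tools([tool]).invoke("Draw a random short word in green font.")

# The generated image surfaces as a tool output on the message;
# "result" holds base64-encoded image data.
image_call = ai_message.additional_kwargs["tool_outputs"][0]
b64_image = image_call["result"]
```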

File tree: 6 files changed (+1595, -1438 lines)


libs/partners/openai/langchain_openai/chat_models/base.py

Lines changed: 35 additions & 7 deletions
@@ -118,6 +118,15 @@
 
 _FUNCTION_CALL_IDS_MAP_KEY = "__openai_function_call_ids__"
 
+WellKnownTools = (
+    "file_search",
+    "web_search_preview",
+    "computer_use_preview",
+    "code_interpreter",
+    "mcp",
+    "image_generation",
+)
+
 
 def _convert_dict_to_message(_dict: Mapping[str, Any]) -> BaseMessage:
     """Convert a dictionary to a LangChain message.
@@ -1487,13 +1496,7 @@ def bind_tools(
                     "type": "function",
                     "function": {"name": tool_choice},
                 }
-            elif tool_choice in (
-                "file_search",
-                "web_search_preview",
-                "computer_use_preview",
-                "code_interpreter",
-                "mcp",
-            ):
+            elif tool_choice in WellKnownTools:
                 tool_choice = {"type": tool_choice}
             # 'any' is not natively supported by OpenAI API.
             # We support 'any' since other models use this instead of 'required'.
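
Since tool_choice strings in WellKnownTools are expanded to {"type": tool_choice}, a caller can now force the built-in image tool by name. A hedged sketch, reusing the llm object from the usage example above:

```python
# Force the built-in image generation tool; bind_tools expands the bare
# string "image_generation" into {"type": "image_generation"}.
llm_forced = llm.bind_tools(
    [{"type": "image_generation", "quality": "low"}],
    tool_choice="image_generation",
)
```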
@@ -3050,6 +3053,13 @@ def _construct_responses_api_payload(
                 new_tools.append({"type": "function", **tool["function"]})
             else:
                 new_tools.append(tool)
+
+                if tool["type"] == "image_generation" and "partial_images" in tool:
+                    raise NotImplementedError(
+                        "Partial image generation is not yet supported "
+                        "via the LangChain ChatOpenAI client. Please "
+                        "drop the 'partial_images' key from the image_generation tool."
+                    )
         payload["tools"] = new_tools
     if tool_choice := payload.pop("tool_choice", None):
         # chat api: {"type": "function", "function": {"name": "..."}}
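
With this guard, requesting partial images fails fast at payload construction. A sketch of what now raises (tool options illustrative; the parameter value is an assumption):

```python
# "partial_images" is rejected until aggregation semantics are settled in core.
try:
    llm.bind_tools(
        [{"type": "image_generation", "partial_images": 2}]
    ).invoke("Draw a cat.")
except NotImplementedError:
    pass  # drop the 'partial_images' key from the image_generation tool
```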
@@ -3139,6 +3149,7 @@ def _pop_summary_index_from_reasoning(reasoning: dict) -> dict:
 
 
 def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
+    """Construct the input for the OpenAI Responses API."""
     input_ = []
     for lc_msg in messages:
         msg = _convert_message_to_dict(lc_msg)
@@ -3191,6 +3202,7 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
             computer_calls = []
             code_interpreter_calls = []
             mcp_calls = []
+            image_generation_calls = []
             tool_outputs = lc_msg.additional_kwargs.get("tool_outputs", [])
             for tool_output in tool_outputs:
                 if tool_output.get("type") == "computer_call":
@@ -3199,10 +3211,22 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
                     code_interpreter_calls.append(tool_output)
                 elif tool_output.get("type") == "mcp_call":
                     mcp_calls.append(tool_output)
+                elif tool_output.get("type") == "image_generation_call":
+                    image_generation_calls.append(tool_output)
                 else:
                     pass
             input_.extend(code_interpreter_calls)
             input_.extend(mcp_calls)
+
+            # A previous image generation call can be referenced by ID
+
+            input_.extend(
+                [
+                    {"type": "image_generation_call", "id": image_generation_call["id"]}
+                    for image_generation_call in image_generation_calls
+                ]
+            )
+
             msg["content"] = msg.get("content") or []
             if lc_msg.additional_kwargs.get("refusal"):
                 if isinstance(msg["content"], str):
@@ -3489,6 +3513,7 @@ def _convert_responses_chunk_to_generation_chunk(
         "mcp_call",
         "mcp_list_tools",
         "mcp_approval_request",
+        "image_generation_call",
     ):
         additional_kwargs["tool_outputs"] = [
             chunk.item.model_dump(exclude_none=True, mode="json")
@@ -3516,6 +3541,9 @@ def _convert_responses_chunk_to_generation_chunk(
                 {"index": chunk.summary_index, "type": "summary_text", "text": ""}
             ]
         }
+    elif chunk.type == "response.image_generation_call.partial_image":
+        # Partial images are not supported yet.
+        pass
     elif chunk.type == "response.reasoning_summary_text.delta":
         additional_kwargs["reasoning"] = {
             "summary": [

libs/partners/openai/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ authors = []
 license = { text = "MIT" }
 requires-python = ">=3.9"
 dependencies = [
-    "langchain-core<1.0.0,>=0.3.61",
+    "langchain-core<1.0.0,>=0.3.63",
     "openai<2.0.0,>=1.68.2",
     "tiktoken<1,>=0.7",
 ]
2 binary files changed (contents not shown)

libs/partners/openai/tests/integration_tests/chat_models/test_responses_api.py

Lines changed: 128 additions & 0 deletions
@@ -12,6 +12,7 @@
     BaseMessage,
     BaseMessageChunk,
     HumanMessage,
+    MessageLikeRepresentation,
 )
 from pydantic import BaseModel
 from typing_extensions import TypedDict
@@ -452,3 +453,130 @@ def test_mcp_builtin() -> None:
     _ = llm_with_tools.invoke(
         [approval_message], previous_response_id=response.response_metadata["id"]
     )
+
+
+@pytest.mark.vcr()
+def test_image_generation_streaming() -> None:
+    """Test image generation streaming."""
+    llm = ChatOpenAI(model="gpt-4.1", use_responses_api=True)
+    tool = {
+        "type": "image_generation",
+        # For testing purposes let's keep the quality low, so the test runs faster.
+        "quality": "low",
+        "output_format": "jpeg",
+        "output_compression": 100,
+        "size": "1024x1024",
+    }
+
+    # Example tool output for an image
+    # {
+    #     "background": "opaque",
+    #     "id": "ig_683716a8ddf0819888572b20621c7ae4029ec8c11f8dacf8",
+    #     "output_format": "png",
+    #     "quality": "high",
+    #     "revised_prompt": "A fluffy, fuzzy cat sitting calmly, with soft fur, bright "
+    #     "eyes, and a cute, friendly expression. The background is "
+    #     "simple and light to emphasize the cat's texture and "
+    #     "fluffiness.",
+    #     "size": "1024x1024",
+    #     "status": "completed",
+    #     "type": "image_generation_call",
+    #     "result": # base64 encode image data
+    # }
+
+    expected_keys = {
+        "id",
+        "background",
+        "output_format",
+        "quality",
+        "result",
+        "revised_prompt",
+        "size",
+        "status",
+        "type",
+    }
+
+    full: Optional[BaseMessageChunk] = None
+    for chunk in llm.stream("Draw a random short word in green font.", tools=[tool]):
+        assert isinstance(chunk, AIMessageChunk)
+        full = chunk if full is None else full + chunk
+    complete_ai_message = cast(AIMessageChunk, full)
+    # At the moment, the streaming API does not pick up annotations fully.
+    # So the following check is commented out.
+    # _check_response(complete_ai_message)
+    tool_output = complete_ai_message.additional_kwargs["tool_outputs"][0]
+    assert set(tool_output.keys()).issubset(expected_keys)
+
+
+@pytest.mark.vcr()
+def test_image_generation_multi_turn() -> None:
+    """Test multi-turn editing of image generation by passing in history."""
+    # Test multi-turn
+    llm = ChatOpenAI(model="gpt-4.1", use_responses_api=True)
+    # Test invocation
+    tool = {
+        "type": "image_generation",
+        # For testing purposes let's keep the quality low, so the test runs faster.
+        "quality": "low",
+        "output_format": "jpeg",
+        "output_compression": 100,
+        "size": "1024x1024",
+    }
+    llm_with_tools = llm.bind_tools([tool])
+
+    chat_history: list[MessageLikeRepresentation] = [
+        {"role": "user", "content": "Draw a random short word in green font."}
+    ]
+    ai_message = llm_with_tools.invoke(chat_history)
+    _check_response(ai_message)
+    tool_output = ai_message.additional_kwargs["tool_outputs"][0]
+
+    # Example tool output for an image
+    # {
+    #     "background": "opaque",
+    #     "id": "ig_683716a8ddf0819888572b20621c7ae4029ec8c11f8dacf8",
+    #     "output_format": "png",
+    #     "quality": "high",
+    #     "revised_prompt": "A fluffy, fuzzy cat sitting calmly, with soft fur, bright "
+    #     "eyes, and a cute, friendly expression. The background is "
+    #     "simple and light to emphasize the cat's texture and "
+    #     "fluffiness.",
+    #     "size": "1024x1024",
+    #     "status": "completed",
+    #     "type": "image_generation_call",
+    #     "result": # base64 encode image data
+    # }
+
+    expected_keys = {
+        "id",
+        "background",
+        "output_format",
+        "quality",
+        "result",
+        "revised_prompt",
+        "size",
+        "status",
+        "type",
+    }
+
+    assert set(tool_output.keys()).issubset(expected_keys)
+
+    chat_history.extend(
+        [
+            # AI message with tool output
+            ai_message,
+            # New request
+            {
+                "role": "user",
+                "content": (
+                    "Now, change the font to blue. Keep the word and everything else "
+                    "the same."
+                ),
+            },
+        ]
+    )
+
+    ai_message2 = llm_with_tools.invoke(chat_history)
+    _check_response(ai_message2)
+    tool_output2 = ai_message2.additional_kwargs["tool_outputs"][0]
+    assert set(tool_output2.keys()).issubset(expected_keys)
