diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index aaf8a3ae9d2d..bf80856c1bbf 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1283,6 +1283,7 @@ def _get_full_multimodal_text_prompt(
     "text": lambda part: _TextParser(part).get("text", None),
     "thinking": lambda part: _ThinkParser(part).get("thinking", None),
     "input_text": lambda part: _TextParser(part).get("text", None),
+    "output_text": lambda part: _TextParser(part).get("text", None),
     "input_image": lambda part: _ResponsesInputImageParser(part).get("image_url", None),
     "image_url": lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds": lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
@@ -1463,7 +1464,7 @@ def _parse_chat_message_content_part(
         )
         return None
 
-    if part_type in ("text", "input_text", "refusal", "thinking"):
+    if part_type in ("text", "input_text", "output_text", "refusal", "thinking"):
         str_content = cast(str, content)
         if wrap_dicts:
             return {"type": "text", "text": str_content}
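For context, a minimal sketch (an illustration, not part of the patch) of the kind of content part this change lets the chat parsers accept: a Responses-style "output_text" part is now extracted the same way as "text" and "input_text" instead of being treated as an unknown part type. The surrounding message shape below is an assumption; only the {"type": "output_text", "text": ...} part matters here.

# Hypothetical assistant message containing an "output_text" content part.
message = {
    "role": "assistant",
    "content": [
        {"type": "output_text", "text": "The capital of France is Paris."},
    ],
}

# With this patch, the part's "text" field is pulled out just like a plain
# "text" part when the message content is parsed.
assert message["content"][0]["text"] == "The capital of France is Paris."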