@@ -29,35 +29,76 @@ class GeminiMessage(TypedDict, total=False):
2929 text : str
3030
3131
32- def _extract_text_from_parts (parts : List [Any ]) -> str :
32+ def _format_parts_as_content_blocks (parts : List [Any ]) -> List [ FormattedContentItem ] :
3333 """
34- Extract and concatenate text from a parts array.
34+ Format Gemini parts array into structured content blocks.
35+
36+ Preserves structure for multimodal content (text + images) instead of
37+ concatenating everything into a string.
3538
3639 Args:
37- parts: List of parts that may contain text content
40+ parts: List of parts that may contain text, inline_data, etc.
3841
3942 Returns:
40- Concatenated text from all parts
43+ List of formatted content blocks
4144 """
42-
43- content_parts = []
45+ content_blocks : List [FormattedContentItem ] = []
4446
4547 for part in parts :
48+ # Handle dict with text field
4649 if isinstance (part , dict ) and "text" in part :
47- content_parts .append (part ["text" ])
50+ content_blocks .append ({ "type" : "text" , "text" : part ["text" ]} )
4851
52+ # Handle string parts
4953 elif isinstance (part , str ):
50- content_parts .append (part )
54+ content_blocks .append ({"type" : "text" , "text" : part })
55+
56+ # Handle dict with inline_data (images, documents, etc.)
57+ elif isinstance (part , dict ) and "inline_data" in part :
58+ inline_data = part ["inline_data" ]
59+ mime_type = inline_data .get ("mime_type" , "" )
60+ content_type = "image" if mime_type .startswith ("image/" ) else "document"
61+
62+ content_blocks .append (
63+ {
64+ "type" : content_type ,
65+ "inline_data" : inline_data ,
66+ }
67+ )
5168
69+ # Handle object with text attribute
5270 elif hasattr (part , "text" ):
53- # Get the text attribute value
5471 text_value = getattr (part , "text" , "" )
55- content_parts .append (text_value if text_value else str (part ))
56-
57- else :
58- content_parts .append (str (part ))
72+ if text_value :
73+ content_blocks .append ({"type" : "text" , "text" : text_value })
74+
75+ # Handle object with inline_data attribute
76+ elif hasattr (part , "inline_data" ):
77+ inline_data = part .inline_data
78+ # Convert to dict if needed
79+ if hasattr (inline_data , "mime_type" ) and hasattr (inline_data , "data" ):
80+ # Determine type based on mime_type
81+ mime_type = inline_data .mime_type
82+ content_type = "image" if mime_type .startswith ("image/" ) else "document"
83+
84+ content_blocks .append (
85+ {
86+ "type" : content_type ,
87+ "inline_data" : {
88+ "mime_type" : mime_type ,
89+ "data" : inline_data .data ,
90+ },
91+ }
92+ )
93+ else :
94+ content_blocks .append (
95+ {
96+ "type" : "image" ,
97+ "inline_data" : inline_data ,
98+ }
99+ )
59100
60- return "" . join ( content_parts )
101+ return content_blocks
61102
62103
63104def _format_dict_message (item : Dict [str , Any ]) -> FormattedMessage :
@@ -73,16 +114,17 @@ def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
73114
74115 # Handle dict format with parts array (Gemini-specific format)
75116 if "parts" in item and isinstance (item ["parts" ], list ):
76- content = _extract_text_from_parts (item ["parts" ])
77- return {"role" : item .get ("role" , "user" ), "content" : content }
117+ content_blocks = _format_parts_as_content_blocks (item ["parts" ])
118+ return {"role" : item .get ("role" , "user" ), "content" : content_blocks }
78119
79120 # Handle dict with content field
80121 if "content" in item :
81122 content = item ["content" ]
82123
83124 if isinstance (content , list ):
84- # If content is a list, extract text from it
85- content = _extract_text_from_parts (content )
125+ # If content is a list, format it as content blocks
126+ content_blocks = _format_parts_as_content_blocks (content )
127+ return {"role" : item .get ("role" , "user" ), "content" : content_blocks }
86128
87129 elif not isinstance (content , str ):
88130 content = str (content )
@@ -110,14 +152,14 @@ def _format_object_message(item: Any) -> FormattedMessage:
110152
111153 # Handle object with parts attribute
112154 if hasattr (item , "parts" ) and hasattr (item .parts , "__iter__" ):
113- content = _extract_text_from_parts ( item .parts )
155+ content_blocks = _format_parts_as_content_blocks ( list ( item .parts ) )
114156 role = getattr (item , "role" , "user" ) if hasattr (item , "role" ) else "user"
115157
116158 # Ensure role is a string
117159 if not isinstance (role , str ):
118160 role = "user"
119161
120- return {"role" : role , "content" : content }
162+ return {"role" : role , "content" : content_blocks }
121163
122164 # Handle object with text attribute
123165 if hasattr (item , "text" ):
@@ -140,7 +182,8 @@ def _format_object_message(item: Any) -> FormattedMessage:
140182 content = item .content
141183
142184 if isinstance (content , list ):
143- content = _extract_text_from_parts (content )
185+ content_blocks = _format_parts_as_content_blocks (content )
186+ return {"role" : role , "content" : content_blocks }
144187
145188 elif not isinstance (content , str ):
146189 content = str (content )
@@ -193,6 +236,29 @@ def format_gemini_response(response: Any) -> List[FormattedMessage]:
193236 }
194237 )
195238
239+ elif hasattr (part , "inline_data" ) and part .inline_data :
240+ # Handle audio/media inline data
241+ import base64
242+
243+ inline_data = part .inline_data
244+ mime_type = getattr (inline_data , "mime_type" , "audio/pcm" )
245+ raw_data = getattr (inline_data , "data" , b"" )
246+
247+ # Encode binary data as base64 string for JSON serialization
248+ if isinstance (raw_data , bytes ):
249+ data = base64 .b64encode (raw_data ).decode ("utf-8" )
250+ else :
251+ # Already a string (base64)
252+ data = raw_data
253+
254+ content .append (
255+ {
256+ "type" : "audio" ,
257+ "mime_type" : mime_type ,
258+ "data" : data ,
259+ }
260+ )
261+
196262 if content :
197263 output .append (
198264 {
0 commit comments