@@ -29,35 +29,69 @@ class GeminiMessage(TypedDict, total=False):
2929 text : str
3030
3131
32- def _extract_text_from_parts (parts : List [Any ]) -> str :
32+ def _format_parts_as_content_blocks (parts : List [Any ]) -> List [ FormattedContentItem ] :
3333 """
34- Extract and concatenate text from a parts array.
34+ Format Gemini parts array into structured content blocks.
35+
36+ Preserves structure for multimodal content (text + images) instead of
37+ concatenating everything into a string.
3538
3639 Args:
37- parts: List of parts that may contain text content
40+ parts: List of parts that may contain text, inline_data, etc.
3841
3942 Returns:
40- Concatenated text from all parts
43+ List of formatted content blocks
4144 """
42-
43- content_parts = []
45+ content_blocks : List [FormattedContentItem ] = []
4446
4547 for part in parts :
48+ # Handle dict with text field
4649 if isinstance (part , dict ) and "text" in part :
47- content_parts .append (part ["text" ])
50+ content_blocks .append ({ "type" : "text" , "text" : part ["text" ]} )
4851
52+ # Handle string parts
4953 elif isinstance (part , str ):
50- content_parts .append (part )
54+ content_blocks .append ({ "type" : "text" , "text" : part } )
5155
56+ # Handle dict with inline_data (images)
57+ elif isinstance (part , dict ) and "inline_data" in part :
58+ inline_data = part ["inline_data" ]
59+ content_blocks .append (
60+ {
61+ "type" : "image" ,
62+ "inline_data" : inline_data ,
63+ }
64+ )
65+
66+ # Handle object with text attribute
5267 elif hasattr (part , "text" ):
53- # Get the text attribute value
5468 text_value = getattr (part , "text" , "" )
55- content_parts .append (text_value if text_value else str (part ))
56-
57- else :
58- content_parts .append (str (part ))
69+ if text_value :
70+ content_blocks .append ({"type" : "text" , "text" : text_value })
71+
72+ # Handle object with inline_data attribute
73+ elif hasattr (part , "inline_data" ):
74+ inline_data = part .inline_data
75+ # Convert to dict if needed
76+ if hasattr (inline_data , "mime_type" ) and hasattr (inline_data , "data" ):
77+ content_blocks .append (
78+ {
79+ "type" : "image" ,
80+ "inline_data" : {
81+ "mime_type" : inline_data .mime_type ,
82+ "data" : inline_data .data ,
83+ },
84+ }
85+ )
86+ else :
87+ content_blocks .append (
88+ {
89+ "type" : "image" ,
90+ "inline_data" : inline_data ,
91+ }
92+ )
5993
60- return "" . join ( content_parts )
94+ return content_blocks
6195
6296
6397def _format_dict_message (item : Dict [str , Any ]) -> FormattedMessage :
@@ -73,16 +107,17 @@ def _format_dict_message(item: Dict[str, Any]) -> FormattedMessage:
73107
74108 # Handle dict format with parts array (Gemini-specific format)
75109 if "parts" in item and isinstance (item ["parts" ], list ):
76- content = _extract_text_from_parts (item ["parts" ])
77- return {"role" : item .get ("role" , "user" ), "content" : content }
110+ content_blocks = _format_parts_as_content_blocks (item ["parts" ])
111+ return {"role" : item .get ("role" , "user" ), "content" : content_blocks }
78112
79113 # Handle dict with content field
80114 if "content" in item :
81115 content = item ["content" ]
82116
83117 if isinstance (content , list ):
84- # If content is a list, extract text from it
85- content = _extract_text_from_parts (content )
118+ # If content is a list, format it as content blocks
119+ content_blocks = _format_parts_as_content_blocks (content )
120+ return {"role" : item .get ("role" , "user" ), "content" : content_blocks }
86121
87122 elif not isinstance (content , str ):
88123 content = str (content )
@@ -110,14 +145,14 @@ def _format_object_message(item: Any) -> FormattedMessage:
110145
111146 # Handle object with parts attribute
112147 if hasattr (item , "parts" ) and hasattr (item .parts , "__iter__" ):
113- content = _extract_text_from_parts ( item .parts )
148+ content_blocks = _format_parts_as_content_blocks ( list ( item .parts ) )
114149 role = getattr (item , "role" , "user" ) if hasattr (item , "role" ) else "user"
115150
116151 # Ensure role is a string
117152 if not isinstance (role , str ):
118153 role = "user"
119154
120- return {"role" : role , "content" : content }
155+ return {"role" : role , "content" : content_blocks }
121156
122157 # Handle object with text attribute
123158 if hasattr (item , "text" ):
@@ -193,6 +228,29 @@ def format_gemini_response(response: Any) -> List[FormattedMessage]:
193228 }
194229 )
195230
231+ elif hasattr (part , "inline_data" ) and part .inline_data :
232+ # Handle audio/media inline data
233+ import base64
234+
235+ inline_data = part .inline_data
236+ mime_type = getattr (inline_data , "mime_type" , "audio/pcm" )
237+ raw_data = getattr (inline_data , "data" , b"" )
238+
239+ # Encode binary data as base64 string for JSON serialization
240+ if isinstance (raw_data , bytes ):
241+ data = base64 .b64encode (raw_data ).decode ("utf-8" )
242+ else :
243+ # Already a string (base64)
244+ data = raw_data
245+
246+ content .append (
247+ {
248+ "type" : "audio" ,
249+ "mime_type" : mime_type ,
250+ "data" : data ,
251+ }
252+ )
253+
196254 if content :
197255 output .append (
198256 {
0 commit comments