@@ -159,11 +159,35 @@ def encode_image(image_path: str) -> str:
159159
160160# === DATA PROCESSING ===
161161def _normalize_prompt_field (prompt : Any ) -> str :
162- """Normalize prompt field to string."""
162+ """Normalize prompt field to string.
163+
164+ Supports multiple input formats:
165+ - String: returned as-is
166+ - Simple list: first element converted to string
167+ - Object with 'messages' key: JSON serialized (for chat-like formats)
168+ - Other objects: JSON serialized
169+ """
163170 if isinstance (prompt , str ):
164171 return prompt
165172 elif isinstance (prompt , list ) and prompt :
173+ # Handle simple list format like ["prompt text"]
166174 return str (prompt [0 ])
175+ elif isinstance (prompt , dict ):
176+ # Handle complex object formats
177+ try :
178+ # Special handling for chat-like formats with messages
179+ if "messages" in prompt :
180+ # This handles formats like {"messages": [{"role": "user", "content": "..."}]}
181+ return json .dumps (prompt , ensure_ascii = False , separators = ("," , ":" ))
182+ else :
183+ # Handle other dictionary formats
184+ return json .dumps (prompt , ensure_ascii = False , separators = ("," , ":" ))
185+ except (TypeError , ValueError ) as e :
186+ # Fallback to string representation if JSON serialization fails
187+ logger .warning (
188+ f"Failed to serialize prompt object to JSON: { e } , using string representation"
189+ )
190+ return str (prompt )
167191 else :
168192 return ""
169193
@@ -200,9 +224,18 @@ def _parse_jsonl_line(
200224 prompt_id = json_obj .get ("id" , line_num )
201225
202226 # Extract and normalize prompt
203- prompt = _normalize_prompt_field (json_obj .get ("prompt" ))
227+ raw_prompt = json_obj .get ("prompt" )
228+ prompt = _normalize_prompt_field (raw_prompt )
204229 if not prompt :
205- effective_logger .warning (f"Empty prompt in line { line_num } : { line } " )
230+ # For debugging, show the type and structure of the raw prompt
231+ prompt_info = f"type: { type (raw_prompt ).__name__ } "
232+ if isinstance (raw_prompt , dict ) and "messages" in raw_prompt :
233+ prompt_info += f", has { len (raw_prompt ['messages' ])} messages"
234+ elif isinstance (raw_prompt , list ):
235+ prompt_info += f", list length: { len (raw_prompt )} "
236+ effective_logger .warning (
237+ f"Empty prompt in line { line_num } ({ prompt_info } ): { line } ..."
238+ )
206239 return None
207240
208241 # Handle images
0 commit comments