[serve] allow array content inputs for LLMs (#39829)

gante · web-flow · commit 8d19231bca94 · 2025-08-13T11:26:19.000+01:00
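Fix LLM message parsing when `content` is a list of typed parts (it was previously indexed with ["text"] as if it were a dict); add tests.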
diff --git a/src/transformers/commands/serving.py b/src/transformers/commands/serving.py
@@ -829,13 +829,22 @@ def get_processor_inputs_from_inbound_messages(messages, modality: Modality):
             parsed_message = {"role": message["role"], "content": []}
 
             if modality == Modality.LLM:
-                # If we're working with LLMs, then "content" is a single string.
-                content = message["content"] if isinstance(message["content"], str) else message["content"]["text"]
-                parsed_message["content"] = content
+                # Input: `content` is a string or a list of dicts with a "type" key (only "text" parts are kept).
+                # Output: `content` is a string.
+                if isinstance(message["content"], str):
+                    parsed_content = message["content"]
+                elif isinstance(message["content"], list):
+                    parsed_content = []
+                    for content in message["content"]:
+                        if content["type"] == "text":
+                            parsed_content.append(content["text"])
+                    parsed_content = " ".join(parsed_content)
+                parsed_message["content"] = parsed_content
 
             elif modality == Modality.VLM:
-                # If we're working with VLMs, then "content" is a dictionary, containing a "type" key indicating
-                # which other key will be present and the type of the value of said key.
+                # Input: `content` is a string or a list of dictionaries with a "type" key (possible types: "text",
+                # "image_url").
+                # Output: `content` is a list of dictionaries with a "type" key.
                 if isinstance(message["content"], str):
                     parsed_message["content"].append({"type": "text", "text": message["content"]})
                 else:
diff --git a/tests/commands/test_serving.py b/tests/commands/test_serving.py
@@ -282,6 +282,37 @@ def test_processor_inputs_from_inbound_messages_llm(self):
         outputs = ServeCommand.get_processor_inputs_from_inbound_messages(messages, modality)
         self.assertListEqual(expected_outputs, outputs)
 
+        messages_with_type = [
+            {"role": "user", "content": [{"type": "text", "text": "How are you doing?"}]},
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "text", "text": "I'm doing great, thank you for asking! How can I assist you today?"}
+                ],
+            },
+            {"role": "user", "content": [{"type": "text", "text": "Can you help me write tests?"}]},
+        ]
+        outputs = ServeCommand.get_processor_inputs_from_inbound_messages(messages_with_type, modality)
+        self.assertListEqual(expected_outputs, outputs)
+
+        messages_multiple_text = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "How are you doing?"},
+                    {"type": "text", "text": "I'm doing great, thank you for asking! How can I assist you today?"},
+                ],
+            },
+        ]
+        expected_outputs_multiple_text = [
+            {
+                "role": "user",
+                "content": "How are you doing? I'm doing great, thank you for asking! How can I assist you today?",
+            },
+        ]
+        outputs = ServeCommand.get_processor_inputs_from_inbound_messages(messages_multiple_text, modality)
+        self.assertListEqual(expected_outputs_multiple_text, outputs)
+
     def test_processor_inputs_from_inbound_messages_vlm_text_only(self):
         modality = Modality.VLM
         messages = [