
Commit 19a00eb

[Model] Use merge_by_field_config for MM models (Llava family) (#26280)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent 391612e commit 19a00eb

File tree

9 files changed (+99, -173 lines)
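Context for the diffs below: with merge_by_field_config = True, vLLM's multimodal input pipeline hands the model each keyword field (such as pixel_values) already merged across the batch, so per-model code can drop its flatten_bn calls and manual isinstance checks. A minimal sketch of what that merging amounts to, assuming each request contributes an image tensor of shape [num_images, C, H, W]; the helper name flatten_bn_sketch and the shapes are illustrative, not vLLM's actual implementation:

# Illustrative only: what "merging by field" means for a batched multimodal
# kwarg such as `pixel_values` (not vLLM's real code).
import torch


def flatten_bn_sketch(per_request: list[torch.Tensor]) -> torch.Tensor:
    # Concatenate per-request tensors of shape [num_images_i, C, H, W]
    # into one [total_images, C, H, W] tensor.
    return torch.cat(per_request, dim=0)


# A batch of two requests: the first carries 2 images, the second 1.
batch = [torch.randn(2, 3, 336, 336), torch.randn(1, 3, 336, 336)]
merged = flatten_bn_sketch(batch)
print(merged.shape)  # torch.Size([3, 3, 336, 336])

With the flag set, the model only ever sees the merged tensor, which is why the Llava parsers in this commit shrink to simple pass-throughs.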


examples/offline_inference/vision_language_multi_image.py

Lines changed: 53 additions & 53 deletions
@@ -371,13 +371,14 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
 
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=131072,
-        tensor_parallel_size=8,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -389,29 +390,32 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
                 *placeholders,
                 {"type": "text", "text": question},
             ],
-        }
+        },
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
 
+    image_data = [fetch_image(url) for url in image_urls]
+
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=image_data,
     )
 
 
-def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
-    # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
-    # it will generate poor response for multi-image inputs!
-    model_name = "llava-hf/llava-1.5-7b-hf"
+def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
+
     engine_args = EngineArgs(
         model=model_name,
-        max_num_seqs=16,
+        trust_remote_code=True,
+        max_model_len=32768,
+        max_num_seqs=5,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -423,28 +427,32 @@ def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
                 *placeholders,
                 {"type": "text", "text": question},
             ],
-        }
+        },
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
 
+    image_data = [fetch_image(url) for url in image_urls]
+
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=[fetch_image(url) for url in image_urls],
+        image_data=image_data,
     )
 
 
-def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
+def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
+
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=8192,
-        max_num_seqs=16,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=4,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -459,7 +467,7 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
         }
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
@@ -472,12 +480,13 @@ def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
+def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
     engine_args = EngineArgs(
         model=model_name,
-        max_model_len=16384,
-        max_num_seqs=16,
+        max_model_len=131072,
+        tensor_parallel_size=8,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -505,14 +514,13 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa
     )
 
 
-def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
-
+def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
+    # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
+    # it will generate poor response for multi-image inputs!
+    model_name = "llava-hf/llava-1.5-7b-hf"
     engine_args = EngineArgs(
         model=model_name,
-        trust_remote_code=True,
-        max_model_len=8192,
-        max_num_seqs=5,
+        max_num_seqs=16,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -524,32 +532,28 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
                 *placeholders,
                 {"type": "text", "text": question},
             ],
-        },
+        }
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
 
-    image_data = [fetch_image(url) for url in image_urls]
-
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=image_data,
+        image_data=[fetch_image(url) for url in image_urls],
     )
 
 
-def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
-
+def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
     engine_args = EngineArgs(
         model=model_name,
-        trust_remote_code=True,
-        max_model_len=32768,
-        max_num_seqs=5,
+        max_model_len=8192,
+        max_num_seqs=16,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -561,32 +565,28 @@ def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
                 *placeholders,
                 {"type": "text", "text": question},
             ],
-        },
+        }
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
 
-    image_data = [fetch_image(url) for url in image_urls]
-
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
-        image_data=image_data,
+        image_data=[fetch_image(url) for url in image_urls],
     )
 
 
-def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
-    model_name = "moonshotai/Kimi-VL-A3B-Instruct"
-
+def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
     engine_args = EngineArgs(
         model=model_name,
-        trust_remote_code=True,
-        max_model_len=4096,
-        max_num_seqs=4,
+        max_model_len=16384,
+        max_num_seqs=16,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
@@ -601,7 +601,7 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
         }
     ]
 
-    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name)
 
     prompt = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
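The reordered load_* helpers above are consumed by driver code elsewhere in this example script. The sketch below paraphrases that usage; the URLs, sampling settings, and the choice of load_llava are placeholders rather than part of this commit:

# Rough usage sketch (paraphrased driver code, not part of this diff).
from dataclasses import asdict

from vllm import LLM, SamplingParams

QUESTION = "What is shown in each image?"
IMAGE_URLS = [
    "https://example.com/image-1.jpg",  # placeholder URLs
    "https://example.com/image-2.jpg",
]

req_data = load_llava(QUESTION, IMAGE_URLS)

llm = LLM(**asdict(req_data.engine_args))
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)

outputs = llm.generate(
    {
        "prompt": req_data.prompt,
        "multi_modal_data": {"image": req_data.image_data},
    },
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)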

vllm/model_executor/models/llava.py

Lines changed: 5 additions & 15 deletions
@@ -57,7 +57,6 @@
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
-    flatten_bn,
     init_vllm_registered_model,
     maybe_prefix,
 )
@@ -507,6 +506,8 @@ def init_vision_tower_for_llava(
     dummy_inputs=LlavaDummyInputsBuilder,
 )
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
+
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
         "gate_up_proj": ["gate_proj", "up_proj"],
@@ -592,37 +593,26 @@ def _parse_and_validate_image_input(
             return None
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError(
-                    f"Incorrect type of pixel values. Got type: {type(pixel_values)}"
-                )
-
             if self.config.vision_config.model_type == "pixtral":
                 return PixtralHFImagePixelInputs(
                     type="pixel_values_pixtral",
-                    pixel_values=flatten_bn(pixel_values),
+                    pixel_values=pixel_values,
                 )
 
             expected_h = expected_w = self.config.vision_config.image_size
             return LlavaImagePixelInputs(
                 type="pixel_values",
-                pixel_values=flatten_bn(pixel_values, concat=True),
+                pixel_values=pixel_values,
                 resolve_bindings={"h": expected_h, "w": expected_w},
             )
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError(
-                    "Incorrect type of image embeddings. "
-                    f"Got type: {type(image_embeds)}"
-                )
-
             if self.config.vision_config.model_type == "pixtral":
                 raise ValueError("Pixtral-HF does not support image_embeds.")
 
             return LlavaImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds, concat=True),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
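The llava.py change is the whole pattern in miniature: add the class-level flag, then delete the manual flattening and type checks from _parse_and_validate_image_input. A schematic sketch of that opt-in pattern, using an invented ToyMultiModalModel rather than a real vLLM class:

# Schematic opt-in pattern (illustrative class, not real vLLM model code).
from torch import nn


class ToyMultiModalModel(nn.Module):
    # With this flag, vLLM delivers per-field kwargs already merged across the
    # batch, e.g. pixel_values as one [total_images, 3, h, w] tensor instead of
    # a list of per-request batches.
    merge_by_field_config = True

    def _parse_and_validate_image_input(self, **kwargs):
        pixel_values = kwargs.pop("pixel_values", None)
        if pixel_values is None:
            return None
        # No flatten_bn() or isinstance() checks: pass the merged tensor along.
        return {"type": "pixel_values", "pixel_values": pixel_values}

Shape validation stays with the input schema (the resolve_bindings arguments visible in the diff) rather than with hand-written checks in each model.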

vllm/model_executor/models/llava_next.py

Lines changed: 5 additions & 19 deletions
@@ -34,7 +34,6 @@
 from .utils import (
     AutoWeightsLoader,
     WeightsMapper,
-    flatten_bn,
     init_vllm_registered_model,
     maybe_prefix,
 )
@@ -222,6 +221,8 @@ def _get_mm_fields_config(
     dummy_inputs=LlavaDummyInputsBuilder,
 )
 class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
+    merge_by_field_config = True
+
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
             # mapping for new names in checkpoint saved after transformers v4.52
@@ -302,36 +303,21 @@ def _parse_and_validate_image_input(
             return None
 
         if pixel_values is not None:
-            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError(
-                    f"Incorrect type of pixel values. Got type: {type(pixel_values)}"
-                )
-
-            if not isinstance(image_sizes, (torch.Tensor, list)):
-                raise ValueError(
-                    f"Incorrect type of image sizes. Got type: {type(image_sizes)}"
-                )
-
             expected_h = expected_w = self.config.vision_config.image_size
             return LlavaNextImagePixelInputs(
                 type="pixel_values",
-                pixel_values=flatten_bn(pixel_values),
-                image_sizes=flatten_bn(image_sizes, concat=True),
+                pixel_values=pixel_values,
+                image_sizes=image_sizes,
                 resolve_bindings={
                     "h": expected_h,
                     "w": expected_w,
                 },
             )
 
         if image_embeds is not None:
-            if not isinstance(image_embeds, torch.Tensor):
-                raise ValueError(
-                    f"Incorrect type of image embeds. Got type: {type(image_embeds)}"
-                )
-
             return LlavaNextImageEmbeddingInputs(
                 type="image_embeds",
-                data=flatten_bn(image_embeds),
+                data=image_embeds,
             )
 
         raise AssertionError("This line should be unreachable.")
