
Commit f5722a5

DarkLight1337, mgoin, and ywang96 authored
[V1] Scatter and gather placeholders in the model runner (#15712)
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: mgoin <[email protected]>
Signed-off-by: Roger Wang <[email protected]>
Co-authored-by: mgoin <[email protected]>
Co-authored-by: Roger Wang <[email protected]>
1 parent 651cf0f commit f5722a5


42 files changed: +497, -943 lines
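
The commit moves the scatter/gather of multimodal placeholder embeddings into the V1 model runner. As a rough, hypothetical sketch of the scatter step only (not the actual vLLM code), the idea is to overwrite the embeddings at placeholder positions with the encoder outputs:

import torch

def scatter_mm_embeddings(inputs_embeds: torch.Tensor,
                          mm_embeds: torch.Tensor,
                          is_embed: torch.Tensor) -> torch.Tensor:
    # is_embed: boolean mask over the flattened token positions, True where a
    # placeholder token should be replaced by a multimodal feature vector.
    out = inputs_embeds.clone()
    out[is_embed] = mm_embeds
    return out

# Toy usage: 6 token slots, 2 of which are image placeholders.
text_embeds = torch.zeros(6, 4)
image_feats = torch.ones(2, 4)
mask = torch.tensor([False, True, True, False, False, False])
merged = scatter_mm_embeddings(text_embeds, image_feats, mask)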

docs/source/contributing/model/multimodal.md

Lines changed: 8 additions & 8 deletions
@@ -860,8 +860,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
 )
 ```

-To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`
-with different `full` and `feature` attributes:
+To assign the vision embeddings to only the image tokens, instead of a string
+you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:

 ```python
 hf_config = self.info.get_hf_config()
@@ -879,9 +879,9 @@ def get_replacement_fuyu(item_idx: int):
     image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                     [_NEWLINE_TOKEN_ID]) * nrows

-    return PromptUpdateDetails(
-        full=image_tokens + [bos_token_id],
-        features=image_tokens,
+    return PromptUpdateDetails.select_token_id(
+        image_tokens + [bos_token_id],
+        embed_token_id=_IMAGE_TOKEN_ID,
     )
 ```

@@ -914,9 +914,9 @@ def _get_prompt_updates(
     image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                     [_NEWLINE_TOKEN_ID]) * nrows

-    return PromptUpdateDetails(
-        full=image_tokens + [bos_token_id],
-        features=image_tokens,
+    return PromptUpdateDetails.select_token_id(
+        image_tokens + [bos_token_id],
+        embed_token_id=_IMAGE_TOKEN_ID,
     )
 ```

     return [
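
The new `PromptUpdateDetails.select_token_id` call keeps the full replacement token sequence but records which positions carry image features, identified by a single token ID. A minimal toy sketch of that idea (the `ToyPromptUpdate` class and `select_token_id` function here are illustrative stand-ins, not vLLM's implementation):

from dataclasses import dataclass

@dataclass
class ToyPromptUpdate:
    full: list[int]       # tokens inserted into the prompt
    is_embed: list[bool]  # which positions receive vision embeddings

def select_token_id(full: list[int], embed_token_id: int) -> ToyPromptUpdate:
    # Mark every occurrence of the image token as an embedding slot.
    return ToyPromptUpdate(full=full,
                           is_embed=[t == embed_token_id for t in full])

# Image tokens plus a trailing BOS token: only the image tokens are embed slots.
IMAGE_TOKEN_ID, BOS_TOKEN_ID = 32000, 1
update = select_token_id([IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, BOS_TOKEN_ID],
                         embed_token_id=IMAGE_TOKEN_ID)
assert update.is_embed == [True, True, False]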

docs/source/models/supported_models.md

Lines changed: 0 additions & 3 deletions
@@ -989,9 +989,6 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.

 :::{important}
-To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
-`pip install git+https://github.com/huggingface/transformers`.
-
 Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
 You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
 :::
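
The retained note still points at `--mm-processor-kwargs` for enabling pan-and-scan on V0. With the offline `LLM` API the same processor option can be passed as a keyword argument; a brief sketch, with the Gemma3 checkpoint name chosen only for illustration:

from vllm import LLM

# Pan-and-scan pre-processing is documented as V0-only at this point.
llm = LLM(
    model="google/gemma-3-4b-it",  # illustrative checkpoint
    mm_processor_kwargs={"do_pan_and_scan": True},
)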

examples/offline_inference/audio_language.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        max_num_seqs=5,
+        max_num_seqs=2,
         limit_mm_per_prompt={"audio": audio_count},
     )


tests/models/decoder_only/audio_language/test_ultravox.py

Lines changed: 4 additions & 1 deletion
@@ -55,7 +55,10 @@ def server(request, audio_assets):
         for key, value in request.param.items()
     ]

-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME,
+                            args,
+                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
+                                      "30"}) as remote_server:
         yield remote_server


tests/models/decoder_only/vision_language/test_models.py

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@
         "cherry_blossom": "<image>What is the season?", # noqa: E501
     }),
     multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
-    max_model_len=8192,
+    max_model_len=4096,
     max_num_seqs=2,
     auto_cls=AutoModelForImageTextToText,
     vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}

tests/models/decoder_only/vision_language/test_pixtral.py

Lines changed: 10 additions & 16 deletions
@@ -176,6 +176,8 @@ def test_chat(
             model,
             dtype=dtype,
             tokenizer_mode="mistral",
+            load_format="mistral",
+            config_format="mistral",
             max_model_len=max_model_len,
             limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
     ) as vllm_model:
@@ -198,22 +200,14 @@ def test_chat(


 @large_gpu_test(min_gb=48)
-@pytest.mark.parametrize(
-    "prompt,expected_ranges",
-    [(_create_engine_inputs_hf(IMG_URLS[:1]), [{
-        "offset": 11,
-        "length": 494
-    }]),
-     (_create_engine_inputs_hf(IMG_URLS[1:4]), [{
-         "offset": 11,
-         "length": 266
-     }, {
-         "offset": 277,
-         "length": 1056
-     }, {
-         "offset": 1333,
-         "length": 418
-     }])])
+@pytest.mark.parametrize("prompt,expected_ranges",
+                         [(_create_engine_inputs_hf(IMG_URLS[:1]),
+                           [PlaceholderRange(offset=11, length=494)]),
+                          (_create_engine_inputs_hf(IMG_URLS[1:4]), [
+                              PlaceholderRange(offset=11, length=266),
+                              PlaceholderRange(offset=277, length=1056),
+                              PlaceholderRange(offset=1333, length=418)
+                          ])])
 def test_multi_modal_placeholders(vllm_runner, prompt,
                                   expected_ranges: list[PlaceholderRange],
                                   monkeypatch) -> None:
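
The pixtral test (and the llava tests below) now use `PlaceholderRange` objects with attribute access instead of raw `offset`/`length` dicts. A toy stand-in showing how such ranges map to absolute token positions (not vLLM's own helper):

from dataclasses import dataclass

@dataclass
class Range:  # illustrative stand-in for PlaceholderRange
    offset: int
    length: int

def placeholder_positions(ranges: list[Range]) -> list[int]:
    # Expand each (offset, length) pair into the token indices it covers.
    return [r.offset + i for r in ranges for i in range(r.length)]

ranges = [Range(offset=11, length=3), Range(offset=20, length=2)]
assert placeholder_positions(ranges) == [11, 12, 13, 20, 21]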

tests/models/multimodal/processing/test_llava_next.py

Lines changed: 2 additions & 2 deletions
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
             first_placeholder = image_placeholders[0]

             # NOTE: There is a BOS token
-            assert first_placeholder["offset"] == 1
-            assert first_placeholder["length"] == (
+            assert first_placeholder.offset == 1
+            assert first_placeholder.length == (
                 len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs

         except Exception as exc:

tests/models/multimodal/processing/test_llava_onevision.py

Lines changed: 2 additions & 2 deletions
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(

             first_placeholder = image_placeholders[0]

-            assert first_placeholder["offset"] == 0
-            assert first_placeholder["length"] == len(
+            assert first_placeholder.offset == 0
+            assert first_placeholder.length == len(
                 processed_inputs["prompt_token_ids"]) // num_imgs
         except Exception as exc:
             failed_size_excs.append((image_size, exc))

tests/models/registry.py

Lines changed: 3 additions & 1 deletion
@@ -277,7 +277,9 @@ def check_available_online(
                                     trust_remote_code=True,
                                     hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
-                                      extras={"2b": "h2oai/h2ovl-mississippi-2b"}), # noqa: E501
+                                      extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501
+                                      max_transformers_version="4.48", # noqa: E501
+                                      transformers_version_reason="HF model is not compatible."), # noqa: E501
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
                                          extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501
                                          trust_remote_code=True),

tests/multimodal/test_processing.py

Lines changed: 9 additions & 0 deletions
@@ -785,6 +785,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=6,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
         ],
         "pattern_4": [
@@ -793,6 +794,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=3,
                 tokens=[32000],
+                is_embed=None,
             ),
         ],
     }
@@ -807,12 +809,14 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=1,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
             PlaceholderFeaturesInfo(
                 modality="pattern_1",
                 item_idx=1,
                 start_idx=5,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
         ],
         "pattern_3": [
@@ -821,6 +825,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=7,
                 tokens=[1550, 918, 1550],
+                is_embed=None,
             ),
         ],
         # No match for pattern_4 as it has lower priority than pattern_1
@@ -835,12 +840,14 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=1,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
             PlaceholderFeaturesInfo(
                 modality="pattern_1",
                 item_idx=1,
                 start_idx=3,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
         ],
         "pattern_4": [
@@ -849,6 +856,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=5,
                 tokens=[32000],
+                is_embed=None,
             ),
         ],
         "pattern_3": [
@@ -857,6 +865,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=6,
                 tokens=[1550, 918, 1550],
+                is_embed=None,
             ),
         ],
     }
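
Every `PlaceholderFeaturesInfo` in these expected results now carries an `is_embed` field, set to `None` when no sub-selection of the placeholder tokens is needed. A hypothetical helper illustrating one way such a value could be interpreted (not the actual vLLM logic):

import torch

def embed_mask(tokens: list[int], is_embed: torch.Tensor | None) -> torch.Tensor:
    # Hypothetical interpretation: None means every placeholder token is an
    # embedding slot; otherwise the provided boolean mask selects a subset.
    if is_embed is None:
        return torch.ones(len(tokens), dtype=torch.bool)
    return is_embed

assert embed_mask([32000, 32000], None).tolist() == [True, True]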
