
Commit f5722a5

DarkLight1337, mgoin, and ywang96 authored
[V1] Scatter and gather placeholders in the model runner (#15712)
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: mgoin <[email protected]>
Signed-off-by: Roger Wang <[email protected]>
Co-authored-by: mgoin <[email protected]>
Co-authored-by: Roger Wang <[email protected]>
1 parent 651cf0f commit f5722a5


42 files changed: +497, -943 lines
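
The commit moves the scatter/gather of multimodal placeholder embeddings into the V1 model runner. As a rough, hypothetical sketch of the scatter step only (not the actual vLLM code), the idea is to overwrite the embeddings at placeholder positions with the encoder outputs:

import torch

def scatter_mm_embeddings(inputs_embeds: torch.Tensor,
                          mm_embeds: torch.Tensor,
                          is_embed: torch.Tensor) -> torch.Tensor:
    # is_embed: boolean mask over the flattened token positions, True where a
    # placeholder token should be replaced by a multimodal feature vector.
    out = inputs_embeds.clone()
    out[is_embed] = mm_embeds
    return out

# Toy usage: 6 token slots, 2 of which are image placeholders.
text_embeds = torch.zeros(6, 4)
image_feats = torch.ones(2, 4)
mask = torch.tensor([False, True, True, False, False, False])
merged = scatter_mm_embeddings(text_embeds, image_feats, mask)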

docs/source/contributing/model/multimodal.md

Lines changed: 8 additions & 8 deletions
@@ -860,8 +860,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
 )
 ```

-To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`
-with different `full` and `feature` attributes:
+To assign the vision embeddings to only the image tokens, instead of a string
+you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:

 ```python
 hf_config = self.info.get_hf_config()
@@ -879,9 +879,9 @@ def get_replacement_fuyu(item_idx: int):
     image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                     [_NEWLINE_TOKEN_ID]) * nrows

-    return PromptUpdateDetails(
-        full=image_tokens + [bos_token_id],
-        features=image_tokens,
+    return PromptUpdateDetails.select_token_id(
+        image_tokens + [bos_token_id],
+        embed_token_id=_IMAGE_TOKEN_ID,
     )
 ```

@@ -914,9 +914,9 @@ def _get_prompt_updates(
     image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                     [_NEWLINE_TOKEN_ID]) * nrows

-    return PromptUpdateDetails(
-        full=image_tokens + [bos_token_id],
-        features=image_tokens,
+    return PromptUpdateDetails.select_token_id(
+        image_tokens + [bos_token_id],
+        embed_token_id=_IMAGE_TOKEN_ID,
     )
 ```

     return [
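
The new `PromptUpdateDetails.select_token_id` call keeps the full replacement token sequence but records which positions carry image features, identified by a single token ID. A minimal toy sketch of that idea (the `ToyPromptUpdate` class and `select_token_id` function here are illustrative stand-ins, not vLLM's implementation):

from dataclasses import dataclass

@dataclass
class ToyPromptUpdate:
    full: list[int]       # tokens inserted into the prompt
    is_embed: list[bool]  # which positions receive vision embeddings

def select_token_id(full: list[int], embed_token_id: int) -> ToyPromptUpdate:
    # Mark every occurrence of the image token as an embedding slot.
    return ToyPromptUpdate(full=full,
                           is_embed=[t == embed_token_id for t in full])

# Image tokens plus a trailing BOS token: only the image tokens are embed slots.
IMAGE_TOKEN_ID, BOS_TOKEN_ID = 32000, 1
update = select_token_id([IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, BOS_TOKEN_ID],
                         embed_token_id=IMAGE_TOKEN_ID)
assert update.is_embed == [True, True, False]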

docs/source/models/supported_models.md

Lines changed: 0 additions & 3 deletions
@@ -989,9 +989,6 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.

 :::{important}
-To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
-`pip install git+https://github.com/huggingface/transformers`.
-
 Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
 You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
 :::
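
The retained note still points at `--mm-processor-kwargs` for enabling pan-and-scan on V0. With the offline `LLM` API the same processor option can be passed as a keyword argument; a brief sketch, with the Gemma3 checkpoint name chosen only for illustration:

from vllm import LLM

# Pan-and-scan pre-processing is documented as V0-only at this point.
llm = LLM(
    model="google/gemma-3-4b-it",  # illustrative checkpoint
    mm_processor_kwargs={"do_pan_and_scan": True},
)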

examples/offline_inference/audio_language.py

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        max_num_seqs=5,
+        max_num_seqs=2,
         limit_mm_per_prompt={"audio": audio_count},
     )


tests/models/decoder_only/audio_language/test_ultravox.py

Lines changed: 4 additions & 1 deletion
@@ -55,7 +55,10 @@ def server(request, audio_assets):
         for key, value in request.param.items()
     ]

-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME,
+                            args,
+                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
+                                      "30"}) as remote_server:
         yield remote_server


tests/models/decoder_only/vision_language/test_models.py

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@
         "cherry_blossom": "<image>What is the season?", # noqa: E501
     }),
     multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
-    max_model_len=8192,
+    max_model_len=4096,
     max_num_seqs=2,
     auto_cls=AutoModelForImageTextToText,
     vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}

tests/models/decoder_only/vision_language/test_pixtral.py

Lines changed: 10 additions & 16 deletions
@@ -176,6 +176,8 @@ def test_chat(
             model,
             dtype=dtype,
             tokenizer_mode="mistral",
+            load_format="mistral",
+            config_format="mistral",
             max_model_len=max_model_len,
             limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
     ) as vllm_model:
@@ -198,22 +200,14 @@ def test_chat(


 @large_gpu_test(min_gb=48)
-@pytest.mark.parametrize(
-    "prompt,expected_ranges",
-    [(_create_engine_inputs_hf(IMG_URLS[:1]), [{
-        "offset": 11,
-        "length": 494
-    }]),
-     (_create_engine_inputs_hf(IMG_URLS[1:4]), [{
-         "offset": 11,
-         "length": 266
-     }, {
-         "offset": 277,
-         "length": 1056
-     }, {
-         "offset": 1333,
-         "length": 418
-     }])])
+@pytest.mark.parametrize("prompt,expected_ranges",
+                         [(_create_engine_inputs_hf(IMG_URLS[:1]),
+                           [PlaceholderRange(offset=11, length=494)]),
+                          (_create_engine_inputs_hf(IMG_URLS[1:4]), [
+                              PlaceholderRange(offset=11, length=266),
+                              PlaceholderRange(offset=277, length=1056),
+                              PlaceholderRange(offset=1333, length=418)
+                          ])])
 def test_multi_modal_placeholders(vllm_runner, prompt,
                                   expected_ranges: list[PlaceholderRange],
                                   monkeypatch) -> None:
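
The pixtral test (and the llava tests below) now use `PlaceholderRange` objects with attribute access instead of raw `offset`/`length` dicts. A toy stand-in showing how such ranges map to absolute token positions (not vLLM's own helper):

from dataclasses import dataclass

@dataclass
class Range:  # illustrative stand-in for PlaceholderRange
    offset: int
    length: int

def placeholder_positions(ranges: list[Range]) -> list[int]:
    # Expand each (offset, length) pair into the token indices it covers.
    return [r.offset + i for r in ranges for i in range(r.length)]

ranges = [Range(offset=11, length=3), Range(offset=20, length=2)]
assert placeholder_positions(ranges) == [11, 12, 13, 20, 21]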

tests/models/multimodal/processing/test_llava_next.py

Lines changed: 2 additions & 2 deletions
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
             first_placeholder = image_placeholders[0]

             # NOTE: There is a BOS token
-            assert first_placeholder["offset"] == 1
-            assert first_placeholder["length"] == (
+            assert first_placeholder.offset == 1
+            assert first_placeholder.length == (
                 len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs

         except Exception as exc:

tests/models/multimodal/processing/test_llava_onevision.py

Lines changed: 2 additions & 2 deletions
@@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(

             first_placeholder = image_placeholders[0]

-            assert first_placeholder["offset"] == 0
-            assert first_placeholder["length"] == len(
+            assert first_placeholder.offset == 0
+            assert first_placeholder.length == len(
                 processed_inputs["prompt_token_ids"]) // num_imgs
         except Exception as exc:
             failed_size_excs.append((image_size, exc))

tests/models/registry.py

Lines changed: 3 additions & 1 deletion
@@ -277,7 +277,9 @@ def check_available_online(
                                     trust_remote_code=True,
                                     hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
-                                      extras={"2b": "h2oai/h2ovl-mississippi-2b"}), # noqa: E501
+                                      extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501
+                                      max_transformers_version="4.48", # noqa: E501
+                                      transformers_version_reason="HF model is not compatible."), # noqa: E501
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
                                          extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501
                                          trust_remote_code=True),

tests/multimodal/test_processing.py

Lines changed: 9 additions & 0 deletions
@@ -785,6 +785,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=6,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
         ],
         "pattern_4": [
@@ -793,6 +794,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=3,
                 tokens=[32000],
+                is_embed=None,
             ),
         ],
     }
@@ -807,12 +809,14 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=1,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
             PlaceholderFeaturesInfo(
                 modality="pattern_1",
                 item_idx=1,
                 start_idx=5,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
         ],
         "pattern_3": [
@@ -821,6 +825,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=7,
                 tokens=[1550, 918, 1550],
+                is_embed=None,
             ),
         ],
         # No match for pattern_4 as it has lower priority than pattern_1
@@ -835,12 +840,14 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=1,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
             PlaceholderFeaturesInfo(
                 modality="pattern_1",
                 item_idx=1,
                 start_idx=3,
                 tokens=[32000, 32000],
+                is_embed=None,
             ),
         ],
         "pattern_4": [
@@ -849,6 +856,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=5,
                 tokens=[32000],
+                is_embed=None,
             ),
         ],
         "pattern_3": [
@@ -857,6 +865,7 @@ def test_find_update_tokens(
                 item_idx=0,
                 start_idx=6,
                 tokens=[1550, 918, 1550],
+                is_embed=None,
             ),
         ],
     }
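
Every `PlaceholderFeaturesInfo` in these expected results now carries an `is_embed` field, set to `None` when no sub-selection of the placeholder tokens is needed. A hypothetical helper illustrating one way such a value could be interpreted (not the actual vLLM logic):

import torch

def embed_mask(tokens: list[int], is_embed: torch.Tensor | None) -> torch.Tensor:
    # Hypothetical interpretation: None means every placeholder token is an
    # embedding slot; otherwise the provided boolean mask selects a subset.
    if is_embed is None:
        return torch.ones(len(tokens), dtype=torch.bool)
    return is_embed

assert embed_mask([32000, 32000], None).tolist() == [True, True]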
