Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions examples/offline_inference/vision_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -1140,14 +1140,10 @@ def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
elif modality == "video":
placeholder = "<video>"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [
[{"role": "user", "content": f"{placeholder}\n{question}"}]
prompts = [
f"<|im_start|>user\n\n{placeholder}\n{question}<|im_end|>\n<|im_start|>assistant\n"
for question in questions
]
prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)

return ModelRequestData(
engine_args=engine_args,
Expand Down
7 changes: 1 addition & 6 deletions examples/offline_inference/vision_language_multi_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,12 +713,7 @@
placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
)
messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
prompt = f"<|im_start|>user\n\n{placeholders}\n{question}<|im_end|>\n<|im_start|>assistant\n"

Check failure on line 716 in examples/offline_inference/vision_language_multi_image.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

examples/offline_inference/vision_language_multi_image.py:716:89: E501 Line too long (97 > 88)

return ModelRequestData(
engine_args=engine_args,
Expand Down
2 changes: 2 additions & 0 deletions vllm/model_executor/models/ovis.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,8 @@ def get_replacement_ovis(item_idx: int):
dummy_inputs=OvisDummyInputsBuilder,
)
class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True

@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
Expand Down
22 changes: 13 additions & 9 deletions vllm/model_executor/models/ovis2_5.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,9 @@ def _call_hf_processor(
self.visual_indicators_to_visual_tokens(indicator)
for indicator in visual_indicators
]
processed_outputs["video_indicator_tokens"] = indicator_tokens
processed_outputs["video_indicator_tokens"] = torch.tensor(
[indicator_tokens]
)
if "images" in mm_data:
visual_indicators = [
hf_processor.construct_visual_indicators((1, 1, 1), False)
Expand All @@ -391,7 +393,7 @@ def _call_hf_processor(
for indicator in visual_indicators
]

processed_outputs["indicator_tokens"] = indicator_tokens
processed_outputs["indicator_tokens"] = torch.tensor([indicator_tokens])
return processed_outputs

def _apply_hf_processor_tokens_only(
Expand Down Expand Up @@ -441,6 +443,8 @@ def get_replacement_ovis(item_idx, modality: str):
dummy_inputs=Ovis2_5DummyInputsBuilder,
)
class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
Expand Down Expand Up @@ -491,13 +495,13 @@ def _parse_and_validate_image_input(

return OvisImagePatchInputs(
type="image_patches",
flat_data=flatten_bn(flatten_bn(pixel_values), concat=True),
flat_data=pixel_values,
patches_per_image=[
x.shape[0] // (self.config.vit_config.hidden_stride**2)
for x in flatten_bn(pixel_values)
],
indicator_tokens=flatten_bn(flatten_bn(indicator_tokens), concat=True),
grids=flatten_bn(flatten_bn(grids), concat=True),
indicator_tokens=indicator_tokens,
grids=grids,
)

raise AssertionError("This line should be unreachable.")
Expand Down Expand Up @@ -525,13 +529,13 @@ def _parse_and_validate_video_input(

return OvisVideoPatchInputs(
type="video_patches",
flat_data=flatten_bn(flatten_bn(pixel_values), concat=True),
flat_data=flatten_bn(pixel_values, concat=True),
patches_per_image=[
x.shape[0] // (self.config.vit_config.hidden_stride**2)
for x in flatten_bn(pixel_values)
for x in pixel_values
],
indicator_tokens=flatten_bn(flatten_bn(indicator_tokens), concat=True),
grids=flatten_bn(flatten_bn(grids), concat=True),
indicator_tokens=flatten_bn(indicator_tokens, concat=True),
grids=flatten_bn(grids, concat=True),
)

raise AssertionError("This line should be unreachable.")
Expand Down
Loading