diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index f2d44cc1a057..f4383adab63a 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -548,8 +548,11 @@ def _process_image_input( image_patches_flat.to(target_dtype), grid_thws, self.vision_buckets) + visual_embeds = visual_embeds.contiguous().clone() + grid_thws = grid_thws.contiguous().clone() visual_tokens = self.visual_tokenizer(visual_embeds, grid_thws) visual_embeds = self.vte(visual_tokens) # 1:1 numeric eq. + indicator_tokens = indicator_tokens.contiguous().clone() indicator_embeds = self.vte(indicator_tokens) padded_patches_per_image = [ grid[1] * grid[2] // (self.config.vit_config.hidden_stride**2)