
Commit 19db319

Merge branch 'yeonsily/integ_internVL' into ovis-padding
2 parents 2fabbb1 + df93a5f commit 19db319

File tree

15 files changed: +807 -175 lines changed

.cd/README.md

Lines changed: 17 additions & 8 deletions

@@ -28,21 +28,30 @@ Supports a wide range of validated models including LLaMa, Mistral, and Qwen fam

 ## How to Use

+### 0. Clone the Repository
+
+Before proceeding with any of the steps below, make sure to clone the vLLM fork repository and navigate to the `.cd` directory. This ensures you have all necessary files and scripts for running the server or benchmarks.
+
+```bash
+git clone https://github.com/HabanaAI/vllm-fork.git
+cd vllm-fork/.cd/
+```
+
 ### 1. Run the server using Docker Compose

 The recommended and easiest way to start the vLLM server is with Docker Compose. At a minimum, set the following environment variables:

 - MODEL - Select a model from the table above.
 - HF_TOKEN - Your Hugging Face token (generate one at <https://huggingface.co>).
-- DOCKER_IMAGE - The vLLM Docker image URL from Gaudi or local repository.
+- DOCKER_IMAGE - The vLLM Docker image URL from Gaudi or local repository. When using the Gaudi repository, please select Docker images with the vllm-installer* prefix in the file name.

 **Example usage:**

 ```bash
 cd vllm-fork/.cd/
 MODEL="Qwen/Qwen2.5-14B-Instruct" \
 HF_TOKEN="<your huggingface token>" \
-DOCKER_IMAGE="<docker image url>" \
+DOCKER_IMAGE="vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/vllm-installer-2.7.1:latest" \
 docker compose up
 ```

@@ -54,7 +63,7 @@ Supports a wide range of validated models including LLaMa, Mistral, and Qwen fam
 cd vllm-fork/.cd/
 MODEL="Qwen/Qwen2.5-14B-Instruct" \
 HF_TOKEN="<your huggingface token>" \
-DOCKER_IMAGE="<docker image url>" \
+DOCKER_IMAGE="vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/vllm-installer-2.7.1:latest" \
 docker compose --profile benchmark up
 ```

@@ -81,7 +90,7 @@ Supports a wide range of validated models including LLaMa, Mistral, and Qwen fam
 cd vllm-fork/.cd/
 MODEL="Qwen/Qwen2.5-14B-Instruct" \
 HF_TOKEN="<your huggingface token>" \
-DOCKER_IMAGE="<docker image url>" \
+DOCKER_IMAGE="vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/vllm-installer-2.7.1:latest" \
 TENSOR_PARALLEL_SIZE=1 \
 MAX_MODEL_LEN=2048 \
 docker compose up
@@ -102,7 +111,7 @@ Supports a wide range of validated models including LLaMa, Mistral, and Qwen fam
 cd vllm-fork/.cd/
 MODEL="Qwen/Qwen2.5-14B-Instruct" \
 HF_TOKEN="<your huggingface token>" \
-DOCKER_IMAGE="<docker image url>" \
+DOCKER_IMAGE="vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/vllm-installer-2.7.1:latest" \
 INPUT_TOK=128 \
 OUTPUT_TOK=128 \
 CON_REQ=16 \
@@ -122,7 +131,7 @@ Supports a wide range of validated models including LLaMa, Mistral, and Qwen fam
 cd vllm-fork/.cd/
 MODEL="Qwen/Qwen2.5-14B-Instruct" \
 HF_TOKEN="<your huggingface token>" \
-DOCKER_IMAGE="<docker image url>" \
+DOCKER_IMAGE="vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/vllm-installer-2.7.1:latest" \
 TENSOR_PARALLEL_SIZE=1 \
 MAX_MODEL_LEN=2048 \
 INPUT_TOK=128 \
@@ -147,7 +156,7 @@ Supports a wide range of validated models including LLaMa, Mistral, and Qwen fam

 ```bash
 HF_TOKEN=<your huggingface token> \
-DOCKER_IMAGE="<docker image url>" \
+DOCKER_IMAGE="vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/vllm-installer-2.7.1:latest" \
 VLLM_SERVER_CONFIG_FILE=server/server_scenarios_text.yaml \
 VLLM_SERVER_CONFIG_NAME=llama31_8b_instruct \
 VLLM_BENCHMARK_CONFIG_FILE=benchmark/benchmark_scenarios_text.yaml \
@@ -178,7 +187,7 @@ Supports a wide range of validated models including LLaMa, Mistral, and Qwen fam
 -p 8000:8000 \
 -e HF_HOME='mnt/hf_cache'
 --name vllm-server \
-<docker image name>
+vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/vllm-installer-2.7.1:latest
 ```

 This method gives you full flexibility over Docker runtime options.
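
Once the server is up (via Docker Compose or plain `docker run` as above), it can be exercised over vLLM's OpenAI-compatible HTTP API. A minimal sketch, assuming the default port mapping of 8000 shown above and that the served model name matches the `MODEL` value used at launch:

```python
# Minimal sketch: query a vLLM server started as described in the README above.
# Assumes the OpenAI-compatible endpoint is reachable on localhost:8000 and that
# the served model name matches the MODEL value passed to docker compose.
import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "Qwen/Qwen2.5-14B-Instruct",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64,
    },
    timeout=60,
)
print(response.json()["choices"][0]["message"]["content"])
```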

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 model_name: "/mnt/weka/data/pytorch/Qwen/Qwen2.5-VL-7B-Instruct/"
 dtype: "bfloat16"
-max_model_len: 32768
+max_model_len: 35840
 max_num_seqs: 32
 num_prompts: 4

requirements/common.txt

Lines changed: 1 addition & 0 deletions

@@ -48,3 +48,4 @@ opentelemetry-sdk>=1.26.0 # vllm.tracing
 opentelemetry-api>=1.26.0 # vllm.tracing
 opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing
 opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing
+modelscope # required to support VLLM_USE_MODELSCOPE env
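
The new `modelscope` dependency backs vLLM's `VLLM_USE_MODELSCOPE` environment variable, which redirects model downloads from the Hugging Face Hub to ModelScope. A minimal sketch of the typical usage, with an illustrative model ID:

```python
# Minimal sketch: download the model from ModelScope instead of the HF Hub.
# The model ID is illustrative; it must exist on ModelScope for this to work.
import os

os.environ["VLLM_USE_MODELSCOPE"] = "True"  # must be set before the model is loaded

from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```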

requirements/hpu.txt

Lines changed: 2 additions & 2 deletions

@@ -3,11 +3,11 @@

 # Dependencies for HPU code
 accelerate
-ray
+ray<2.49.0
 triton==3.1.0
 setuptools>=77.0.3
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@b7ce4ba
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@048015b

 # Dependencies for HPU vllm docker image
 datasets

vllm/entrypoints/chat_utils.py

Lines changed: 8 additions & 2 deletions

@@ -405,8 +405,14 @@ def _resolve_chat_template_content_format(
     jinja_text = (hf_chat_template if isinstance(hf_chat_template, str)
                   else load_chat_template(chat_template, is_literal=True))

-    detected_format = ("string" if jinja_text is None else
-                       _detect_content_format(jinja_text, default="string"))
+    # The InternVL template has mixed content access patterns that fail with automatic detection.
+    # Set string format for proper operation if InternVL is used.
+    model_type = getattr(model_config.hf_config, 'model_type', '')
+    if model_type == 'internvl_chat' or 'internvl' in model_config.model.lower():
+        detected_format = "string"
+    else:
+        detected_format = ("string" if jinja_text is None else
+                           _detect_content_format(jinja_text, default="string"))

     return detected_format
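
For context, the value returned here decides how incoming chat messages are normalized before templating: with the "string" format each message's `content` stays a single string, while the "openai" format keeps `content` as a list of typed parts. An illustrative sketch of the two message shapes the detector chooses between (values are made up):

```python
# Illustrative only: the two chat message shapes behind the content formats.

# "string" content format - the message content is one plain string.
string_style_message = {
    "role": "user",
    "content": "Describe this image.",
}

# "openai" content format - the content is a list of typed parts, which is
# what templates that iterate over content items expect.
openai_style_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
    ],
}
```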

vllm/model_executor/layers/sampler.py

Lines changed: 15 additions & 2 deletions

@@ -197,6 +197,10 @@ def __init__(self):
         # speculative decoding and when prompt embeddings are specified.
         self.include_gpu_probs_tensor = False
         self.should_modify_greedy_probs_inplace = False
+        # Add HPU cache class variables
+        self._prompt_tokens_hpu_cache: Optional[torch.Tensor] = None
+        self._output_tokens_hpu_cache: Optional[torch.Tensor] = None
+        self._cached_seq_ids: Optional[set] = None

     def _init_sampling_tensors(
         self,
@@ -216,8 +220,10 @@ def _init_sampling_tensors(

         # Initialize new sampling tensors
         (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p,
-         top_k_scalar, top_p_scalar) = SamplingTensors.from_sampling_metadata(
-             sampling_metadata, vocab_size, logits.device, logits.dtype)
+         top_k_scalar, top_p_scalar, current_seq_ids) = \
+            SamplingTensors.from_sampling_metadata(
+                sampling_metadata, vocab_size, logits.device, logits.dtype, \
+                self._prompt_tokens_hpu_cache, self._output_tokens_hpu_cache, self._cached_seq_ids)

         self._sampling_tensors = sampling_tensors
         self._do_penalties = do_penalties
@@ -227,6 +233,13 @@ def _init_sampling_tensors(
         self._top_p_scalar = top_p_scalar

         self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5)
+        # Check if batch composition changed - if so, invalidate prompt cache
+
+        # After tensors are created, update cache
+        if self._cached_seq_ids != current_seq_ids:
+            self._prompt_tokens_hpu_cache = None
+            self._output_tokens_hpu_cache = None
+            self._cached_seq_ids = current_seq_ids

     def forward(
         self,
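
The change above keys the cached HPU prompt/output token tensors to the set of sequence IDs in the batch: when the batch composition changes, both caches are dropped so stale tensors are never reused. A simplified, standalone sketch of that invalidation pattern; the class and method names here are illustrative, not the actual SamplingTensors API:

```python
from typing import Callable, Optional, Set

import torch


class SeqIdKeyedCache:
    """Illustrative cache invalidated when the batch's sequence ids change."""

    def __init__(self) -> None:
        self._cached_seq_ids: Optional[Set[int]] = None
        self._cached_tokens: Optional[torch.Tensor] = None

    def get_or_build(self, seq_ids: Set[int],
                     build: Callable[[], torch.Tensor]) -> torch.Tensor:
        # A different set of sequences means the cached tensor no longer
        # matches the batch layout, so drop it and rebuild.
        if self._cached_seq_ids != seq_ids or self._cached_tokens is None:
            self._cached_tokens = build()
            self._cached_seq_ids = set(seq_ids)
        return self._cached_tokens


cache = SeqIdKeyedCache()
first = cache.get_or_build({1, 2}, lambda: torch.zeros(2, 8, dtype=torch.long))
second = cache.get_or_build({1, 2}, lambda: torch.ones(2, 8, dtype=torch.long))
assert first is second  # same batch composition -> cache hit, builder not called
```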

vllm/model_executor/models/gemma3_mm.py

Lines changed: 16 additions & 18 deletions

@@ -569,11 +569,6 @@ def _process_image_input(
         pixel_values = image_input["pixel_values"]
         num_patches = image_input["num_patches"]

-        image_features = self._image_pixels_to_features(
-            self.vision_tower,
-            pixel_values,
-        )
-
         if is_hpu:
             batch_breakdown = greedy_plan(pixel_values.shape[0], \
                         self.vision_buckets.multimodal_buckets)
@@ -582,22 +577,25 @@ def _process_image_input(

             for i in batch_breakdown:
                 end_idx = start_idx + i
-                batch_sliced_image_features = \
-                    image_features[start_idx:end_idx, ...]
-                if is_lazy:
-                    image_embeds_multibatches += \
-                        [self.multi_modal_projector(
-                            batch_sliced_image_features,
-                            bypass_hpu_graphs=i
-                            not in self.graphed_multimodal_buckets
-                            and len(self.graphed_multimodal_buckets) > 0)]
-                else:
-                    image_embeds_multibatches += \
-                        [self.multi_modal_projector( \
-                            batch_sliced_image_features)]
+                indices = torch.arange(start_idx,
+                                       end_idx).to(pixel_values.device)
+                batch_sliced_pixel_values = torch.index_select(pixel_values,
+                                                               dim=0,
+                                                               index=indices)
+
+                image_features = self._image_pixels_to_features(
+                    self.vision_tower,
+                    batch_sliced_pixel_values,
+                )
+                image_embeds = self.multi_modal_projector(image_features)
+                image_embeds_multibatches += [image_embeds.clone()]
                 start_idx = end_idx
             image_embeds = torch.cat(image_embeds_multibatches, dim=0)
         else:
+            image_features = self._image_pixels_to_features(
+                self.vision_tower,
+                pixel_values,
+            )
             image_embeds = self.multi_modal_projector(image_features)
         return [
             e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())
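
The reworked HPU path above slices the raw `pixel_values` into bucket-sized chunks before the vision tower runs, encodes and projects each chunk separately, and concatenates the per-chunk embeddings, instead of encoding the whole batch first and slicing the features afterwards. A simplified, standalone sketch of that chunked-encoding pattern; the encoder and chunk plan below are stand-ins for the model's `vision_tower` and `greedy_plan`, not the real implementations:

```python
from typing import List

import torch


def encode_in_chunks(pixel_values: torch.Tensor,
                     chunk_plan: List[int],
                     encoder: torch.nn.Module) -> torch.Tensor:
    """Encode a batch in bucket-sized slices and concatenate the results."""
    outputs = []
    start_idx = 0
    for chunk in chunk_plan:
        end_idx = start_idx + chunk
        indices = torch.arange(start_idx, end_idx, device=pixel_values.device)
        # Slice the raw inputs first, then run the encoder on the smaller,
        # bucket-shaped batch.
        sliced = torch.index_select(pixel_values, dim=0, index=indices)
        outputs.append(encoder(sliced).clone())
        start_idx = end_idx
    return torch.cat(outputs, dim=0)


# Usage with a toy "encoder": 7 images split into buckets of 4 and 3.
images = torch.randn(7, 3 * 224 * 224)
toy_encoder = torch.nn.Linear(3 * 224 * 224, 16)
embeds = encode_in_chunks(images, chunk_plan=[4, 3], encoder=toy_encoder)
assert embeds.shape == (7, 16)
```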
