
Commit b728eae

Disable HPU Graphs
1 parent dbfa656 commit b728eae


2 files changed: +4 -26 lines changed


vllm_gaudi/lora/punica_wrapper/punica_hpu.py

Lines changed: 1 addition & 24 deletions
@@ -23,33 +23,10 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int,
                  device: Union[torch.device, str], **kwargs):
         # Increasing max_num_batched_tokens by 3x to handle increase in
         # tensor size due to padding.
+        # TODO: Need to check if this override is still required
         PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens,
                                    max_batches, device)
 
-    def _update_base_metadata(
-        self,
-        mapping: "LoRAMapping",
-        lora_index_to_id: list[Optional[int]],
-        max_loras: int,
-        vocab_size: int,
-        extra_vocab_size: int,
-    ):
-        (
-            base_indices,
-            sampler_indices,
-            sampler_indices_padded,
-            embeddings_indices,
-            indices_len,
-        ) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size,
-                            extra_vocab_size, self.device)
-        self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
-        self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
-        self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
-            sampler_indices_padded)
-        self._embeddings_indices[:embeddings_indices.
-                                 shape[0], :embeddings_indices.shape[1]].copy_(
-                                     embeddings_indices)
-        self.indices_len[:] = indices_len
 
     def add_lora_embedding(self,
                            y: torch.Tensor,
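With the override deleted, the HPU punica wrapper now inherits _update_base_metadata from PunicaWrapperBase. A minimal sketch of that fallback is below; it is not the repository's code, PunicaWrapperHPU is an assumed name for the subclass defined in punica_hpu.py, and the return value is a placeholder.

class PunicaWrapperBase:
    def _update_base_metadata(self, mapping, *rest):
        # Shared base-class implementation that the HPU wrapper now relies on.
        return "base-class path"

class PunicaWrapperHPU(PunicaWrapperBase):  # hypothetical name for the HPU subclass
    # The HPU-specific override shown in the deleted lines above is gone, so
    # lookups fall through to PunicaWrapperBase._update_base_metadata.
    pass

print(PunicaWrapperHPU()._update_base_metadata(None))  # prints "base-class path"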

vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 3 additions & 2 deletions
@@ -442,9 +442,10 @@ def generate_proposals(self, *args, **kwargs):
 
 
 def _maybe_wrap_in_hpu_graph(*args, **kwargs):
-    return htorch.hpu.wrap_in_hpu_graph(
+    '''return htorch.hpu.wrap_in_hpu_graph(
         HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True
-    ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs)
+    ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs)'''
+    return HpuModelAdapter(*args, **kwargs)
 
 
 def subtuple(obj: object,
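The net effect of this hunk is that the model adapter is no longer captured in an HPU Graph, even when the Habana bridge runs in lazy mode. A minimal before/after sketch is below; it is not the repository's code and assumes habana_frameworks.torch is importable as htorch, with the calls taken from the diff above.

def wrap_before(adapter_factory, *args, **kwargs):
    # Old behavior: wrap the adapter in an HPU Graph when lazy mode is active,
    # otherwise return the plain adapter.
    import habana_frameworks.torch as htorch  # assumes the Habana bridge is installed
    if htorch.utils.internal.is_lazy():
        return htorch.hpu.wrap_in_hpu_graph(adapter_factory(*args, **kwargs),
                                            disable_tensor_cache=True)
    return adapter_factory(*args, **kwargs)

def wrap_after(adapter_factory, *args, **kwargs):
    # New behavior after this commit: HPU Graphs are disabled and the plain
    # adapter is returned unconditionally.
    return adapter_factory(*args, **kwargs)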
