
Commit 89e6254

Remove inference_mode() from platforms.hpu (#1691)
`inference_mode()` causes recompilations with `torch.compile`, and we don't need it here: `inference_mode` is already applied to the relevant functions in the model runner. The platform-level call was introduced by Rebase 0.9.0.1 (#1507); before that, no such call existed.
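The pattern the message describes can be sketched as follows: instead of a platform-wide `inference_mode()` hook, the no-grad context is applied per-function as a decorator. This is a minimal illustration, not vLLM's actual model runner; `ModelRunner` and `execute_model` here are hypothetical stand-ins.

```python
import torch


class ModelRunner:
    # Hypothetical stand-in for the HPU model runner: the inference-mode
    # context is scoped to the specific method that runs the model, rather
    # than being returned from a platform-level inference_mode() hook.
    @torch.inference_mode()
    def execute_model(self, x: torch.Tensor) -> torch.Tensor:
        # Autograd tracking is disabled inside this call; tensors created
        # here are inference tensors.
        return x * 2


runner = ModelRunner()
out = runner.execute_model(torch.ones(3))
print(out.requires_grad)  # False
```

Scoping the context to particular functions keeps the rest of the code path free of a global mode switch, which is what interacts badly with `torch.compile` recompilation.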
Parent: 646db5e · Commit: 89e6254

File tree

1 file changed: +0 −4 lines


vllm/platforms/hpu.py

Lines changed: 0 additions & 4 deletions
@@ -68,10 +68,6 @@ def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
     def get_device_name(cls, device_id: int = 0) -> str:
         return cls.device_name
 
-    @classmethod
-    def inference_mode(cls):
-        return torch.no_grad()
-
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
