
Commit ac028fa

add opaque_attention_op interface in platform
Signed-off-by: Kunshang Ji <[email protected]>
1 parent ca400b8

6 files changed: +25 −2 lines changed
vllm/attention/layer.py

Lines changed: 1 addition & 2 deletions
@@ -190,8 +190,7 @@ def __init__(
         # torch.compile works by registering the attention as one giant
         # opaque custom op. For other platforms, we directly call them
         # and let torch.compile handle them.
-        self.use_direct_call = not current_platform.is_cuda_alike(
-        ) and not current_platform.is_cpu() and not current_platform.is_xpu()
+        self.use_direct_call = not current_platform.opaque_attention_op()
 
         self.use_output = self.attn_backend.accept_output_buffer
         compilation_config = get_current_vllm_config().compilation_config

vllm/platforms/cpu.py

Lines changed: 4 additions & 0 deletions
@@ -335,3 +335,7 @@ def default_v1(cls, model_config) -> bool:
         return (cls.supports_v1(model_config)
                 and arch in (CpuArchEnum.X86, CpuArchEnum.POWERPC,
                              CpuArchEnum.ARM, CpuArchEnum.S390X))
+
+    @classmethod
+    def opaque_attention_op(cls) -> bool:
+        return True

vllm/platforms/cuda.py

Lines changed: 4 additions & 0 deletions
@@ -442,6 +442,10 @@ def supports_v1(cls, model_config: "ModelConfig") -> bool:
     def use_custom_allreduce(cls) -> bool:
         return True
 
+    @classmethod
+    def opaque_attention_op(cls) -> bool:
+        return True
+
     @classmethod
     def get_static_graph_wrapper_cls(cls) -> str:
         return "vllm.compilation.cuda_graph.CUDAGraphWrapper"

vllm/platforms/interface.py

Lines changed: 8 additions & 0 deletions
@@ -509,6 +509,14 @@ def use_custom_allreduce(cls) -> bool:
         """
         return False
 
+    @classmethod
+    def opaque_attention_op(cls) -> bool:
+        """
+        Returns True if we register attention as one giant opaque custom op
+        on the current platform
+        """
+        return False
+
     @classmethod
     def validate_request(
         cls,
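
As a usage note, the new hook gives out-of-tree platform plugins a single switch for opting in to the opaque-custom-op path. A minimal sketch, assuming the usual pattern of subclassing Platform; MyAcceleratorPlatform is a hypothetical name and not part of this commit.

    from vllm.platforms.interface import Platform


    class MyAcceleratorPlatform(Platform):  # hypothetical out-of-tree platform

        @classmethod
        def opaque_attention_op(cls) -> bool:
            # Opt in: the attention layer sets
            # use_direct_call = not opaque_attention_op(), so returning True
            # makes it register attention as one opaque custom op for
            # torch.compile instead of calling the backend directly.
            return True

Platforms that do not override the hook keep the base-class default of False and therefore the direct-call behavior.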

vllm/platforms/rocm.py

Lines changed: 4 additions & 0 deletions
@@ -411,6 +411,10 @@ def use_custom_allreduce(cls) -> bool:
         supported_archs = ['gfx94', 'gfx95']
         return any(gfx in gcn_arch for gfx in supported_archs)
 
+    @classmethod
+    def opaque_attention_op(cls) -> bool:
+        return True
+
     @classmethod
     def get_cu_count(cls, device_id: int = 0) -> int:
         return torch.cuda.get_device_properties(

vllm/platforms/xpu.py

Lines changed: 4 additions & 0 deletions
@@ -181,3 +181,7 @@ def get_global_graph_pool(self) -> Any:
         Currently xpu does NOT support Graph model.
         """
         raise NotImplementedError("XPU does not support Graph model.")
+
+    @classmethod
+    def opaque_attention_op(cls) -> bool:
+        return True
