
Commit 007dd90

[gpt-oss] Enable gpt-oss on ampere (#22714)
Signed-off-by: Yongye Zhu <[email protected]>
1 parent b8a9d0e commit 007dd90

10 files changed (+26, -17 lines changed)


tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py

Lines changed: 3 additions & 2 deletions
@@ -25,5 +25,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         compilation_config.custom_ops = ["all"]
 
     def get_attn_backend_cls(self, backend_name, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1, use_mla):
-        return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501
+                             kv_cache_dtype, block_size, use_v1, use_mla,
+                             has_sink):
+        return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501

vllm/attention/layer.py

Lines changed: 3 additions & 1 deletion
@@ -138,6 +138,7 @@ def __init__(
         self.head_size = head_size
         self.num_kv_heads = num_kv_heads
         self.sliding_window = sliding_window
+        self.has_sink = extra_impl_args.get("sinks") is not None
 
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
@@ -165,7 +166,8 @@ def __init__(
                 kv_cache_dtype,
                 block_size,
                 is_attention_free,
-                use_mla=use_mla)
+                use_mla=use_mla,
+                has_sink=self.has_sink)
         else:
             self.attn_backend = attn_backend
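
The flag is derived once per layer from the extra implementation kwargs and then forwarded into backend selection. A minimal standalone sketch of that idea (illustration only, not the vLLM Attention class; the kwargs dict and sink tensor here are hypothetical):

import torch

def derive_has_sink(extra_impl_args: dict) -> bool:
    # A layer "has sinks" when the caller supplied a `sinks` parameter,
    # e.g. gpt-oss-style per-head attention-sink logits.
    return extra_impl_args.get("sinks") is not None

# Hypothetical usage: layers built with sink parameters request a
# sink-aware backend; layers without them keep the default path.
assert derive_has_sink({"sinks": torch.zeros(64)}) is True
assert derive_has_sink({}) is False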

vllm/attention/selector.py

Lines changed: 4 additions & 1 deletion
@@ -144,6 +144,7 @@ def get_attn_backend(
     block_size: int,
     is_attention_free: bool = False,
     use_mla: bool = False,
+    has_sink: bool = False,
 ) -> type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
     # Accessing envs.* behind an @lru_cache decorator can cause the wrong
@@ -158,6 +159,7 @@ def get_attn_backend(
         is_attention_free=is_attention_free,
         use_v1=envs.VLLM_USE_V1,
         use_mla=use_mla,
+        has_sink=has_sink,
     )
 
 
@@ -170,6 +172,7 @@ def _cached_get_attn_backend(
     is_attention_free: bool,
     use_v1: bool = False,
     use_mla: bool = False,
+    has_sink: bool = False,
 ) -> type[AttentionBackend]:
     # If there are no attention layers (e.g. we are running Mamba),
     # use the placeholder NO_ATTENTION
@@ -201,7 +204,7 @@ def _cached_get_attn_backend(
     # get device-specific attn_backend
     attention_cls = current_platform.get_attn_backend_cls(
         selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
-        use_mla)
+        use_mla, has_sink)
     if not attention_cls:
         raise ValueError(
            f"Invalid attention backend for {current_platform.device_name}")

vllm/model_executor/layers/quantization/mxfp4.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ def from_config(cls, config):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        return 90
+        return 80
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
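
Lowering `get_min_capability` from 90 to 80 is what admits Ampere GPUs to the mxfp4 path: vLLM expresses compute capability as major * 10 + minor, so SM 8.0 (e.g. A100) scores 80 and now clears the threshold. A rough sketch of that comparison (illustrative helper, not the actual vLLM gating code):

def meets_min_capability(major: int, minor: int, min_capability: int) -> bool:
    # Compute capability encoded as major * 10 + minor (A100 -> 80, H100 -> 90).
    return major * 10 + minor >= min_capability

assert meets_min_capability(8, 0, 80)      # Ampere passes the new threshold
assert not meets_min_capability(8, 0, 90)  # ...but was rejected at the old one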

vllm/platforms/cpu.py

Lines changed: 2 additions & 2 deletions
@@ -91,8 +91,8 @@ def get_device_name(cls, device_id: int = 0) -> str:
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool,
-                             use_mla: bool) -> str:
+                             block_size: int, use_v1: bool, use_mla: bool,
+                             has_sink: bool) -> str:
         if selected_backend and selected_backend != _Backend.TORCH_SDPA:
             logger.info("Cannot use %s backend on CPU.", selected_backend)
         if use_mla:

vllm/platforms/cuda.py

Lines changed: 5 additions & 2 deletions
@@ -222,8 +222,8 @@ def get_vit_attn_backend(cls, support_fa: bool = False) -> _Backend:
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1,
-                             use_mla) -> str:
+                             kv_cache_dtype, block_size, use_v1, use_mla,
+                             has_sink) -> str:
         if use_mla:
             # TODO(lucas): refactor to be more concise
             # we should probably consider factoring out V1 here
@@ -321,6 +321,9 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
 
         # FlashAttention is the default for SM 8.0+ GPUs
         if cls.has_device_capability(80):
+            if has_sink:
+                logger.info_once("Using Triton backend on V1 engine.")
+                return TRITON_ATTN_VLLM_V1
             if is_default_backend_supported := is_attn_backend_supported(
                     FLASH_ATTN_V1, head_size, dtype,
                     allow_import_error=False):
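
The cuda.py change is the core of the Ampere enablement: on SM 8.0+ devices, a layer that carries sinks is routed to the Triton attention backend before the usual FlashAttention probe runs. A condensed sketch of that branch order (standalone illustration with placeholder backend names, not the full selector):

def pick_cuda_backend(device_capability: int, has_sink: bool,
                      flash_attn_supported: bool) -> str:
    # Mirrors the branch order added in this commit for SM 8.0+ GPUs.
    if device_capability >= 80:
        if has_sink:
            # Sink-aware layers (e.g. gpt-oss) go straight to Triton.
            return "TRITON_ATTN_VLLM_V1"
        if flash_attn_supported:
            return "FLASH_ATTN_V1"
    # Placeholder for the remaining fallback chain on older or unsupported setups.
    return "FALLBACK_BACKEND"

print(pick_cuda_backend(80, has_sink=True, flash_attn_supported=True))   # Triton
print(pick_cuda_backend(90, has_sink=False, flash_attn_supported=True))  # FlashAttention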

vllm/platforms/interface.py

Lines changed: 2 additions & 2 deletions
@@ -196,8 +196,8 @@ def get_vit_attn_backend(cls, support_fa: bool = False) -> _Backend:
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool,
-                             use_mla: bool) -> str:
+                             block_size: int, use_v1: bool, use_mla: bool,
+                             has_sink: bool) -> str:
         """Get the attention backend class of a device."""
         return ""

vllm/platforms/rocm.py

Lines changed: 2 additions & 2 deletions
@@ -188,8 +188,8 @@ def get_vit_attn_backend(cls, support_fa: bool = False) -> _Backend:
 
     @classmethod
     def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
-                             kv_cache_dtype, block_size, use_v1,
-                             use_mla) -> str:
+                             kv_cache_dtype, block_size, use_v1, use_mla,
+                             has_sink) -> str:
         if use_mla:
             from vllm.attention.backends.rocm_aiter_mla import (
                 is_aiter_mla_enabled)

vllm/platforms/tpu.py

Lines changed: 2 additions & 2 deletions
@@ -46,8 +46,8 @@ class TpuPlatform(Platform):
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool,
-                             use_mla: bool) -> str:
+                             block_size: int, use_v1: bool, use_mla: bool,
+                             has_sink) -> str:
         if (selected_backend != _Backend.PALLAS
                 and selected_backend != _Backend.PALLAS_VLLM_V1):
             logger.info("Cannot use %s backend on TPU.", selected_backend)

vllm/platforms/xpu.py

Lines changed: 2 additions & 2 deletions
@@ -35,8 +35,8 @@ class XPUPlatform(Platform):
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool,
-                             use_mla: bool) -> str:
+                             block_size: int, use_v1: bool, use_mla: bool,
+                             has_sink: bool) -> str:
         if selected_backend is not None and selected_backend != _Backend.IPEX:
             logger.info("Cannot use %s backend on XPU.", selected_backend)
         use_v1 = envs.VLLM_USE_V1
