Commit 000ccec

[Bugfix gpt-oss] Fix float32 convert for flashinfer sink support (#23016)
Signed-off-by: mgoin <[email protected]>
1 parent 68373d3 commit 000ccec

2 files changed: +9 −3 lines changed

vllm/attention/layer.py

Lines changed: 9 additions & 0 deletions

@@ -308,6 +308,15 @@ def process_weights_after_loading(self, act_dtype: torch.dtype):
         if hasattr(self.impl, "process_weights_after_loading"):
             self.impl.process_weights_after_loading(act_dtype)
 
+        # FlashInfer requires attention sinks to be float32
+        if (self.backend == _Backend.FLASHINFER_VLLM_V1
+                and hasattr(self.impl, 'sinks')):
+            from vllm.v1.attention.backends.flashinfer import FlashInferImpl
+            assert isinstance(self.impl, FlashInferImpl)
+            if (self.impl.sinks is not None
+                    and self.impl.sinks.dtype != torch.float32):
+                self.impl.sinks = self.impl.sinks.to(torch.float32)
+
     def get_attn_backend(self) -> type[AttentionBackend]:
         return self.attn_backend

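The hook added above can be illustrated outside vLLM. Below is a minimal standalone sketch (toy class and function names, not vLLM's actual API) of the pattern this commit introduces: attention sinks that arrive in a lower-precision dtype are cast to float32 only after weight loading, which is what FlashInfer requires.

import torch


class ToyImpl:
    # Stand-in for FlashInferImpl: it only needs to hold an optional sinks tensor.
    def __init__(self, sinks: torch.Tensor | None = None):
        self.sinks = sinks


def process_weights_after_loading(impl: ToyImpl) -> None:
    # Mirrors the added hook: FlashInfer requires sinks in float32, so convert
    # them once, after all checkpoint weights have been loaded.
    if impl.sinks is not None and impl.sinks.dtype != torch.float32:
        impl.sinks = impl.sinks.to(torch.float32)


impl = ToyImpl(sinks=torch.zeros(64, dtype=torch.bfloat16))
process_weights_after_loading(impl)
assert impl.sinks.dtype == torch.float32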
vllm/v1/attention/backends/flashinfer.py

Lines changed: 0 additions & 3 deletions

@@ -642,9 +642,6 @@ def __init__(
                     f"heads in the layer. Expected {num_heads}, but got "
                     f"{sinks.shape[0]}."
                 )
-            # Cast sinks to float32 if needed (FlashInfer requirement)
-            if sinks.dtype != torch.float32:
-                sinks = sinks.to(torch.float32)
         self.sinks = sinks
 
     def forward(

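Taken together, the two hunks move the float32 cast from FlashInferImpl.__init__ into Attention.process_weights_after_loading. A plausible reading (the commit message does not spell it out) is that the constructor runs before checkpoint weights are loaded, so a cast performed there may not apply to the sink values that are ultimately loaded, while the post-load hook is guaranteed to see the final tensor.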
Comments (0)