Merge pull request #61 from MTG/selective_flash_attention

palonso · web-flow · commit 018694adac3c · 2025-07-15T18:21:09.000+02:00
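Select the scaled-dot-product attention backend automatically: prefer flash attention when available, then fall back to the efficient, cuDNN, and math kernels in that priority order (this replaces the `use_flash_attention` constructor option)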
diff --git a/src/nets/common_former.py b/src/nets/common_former.py
@@ -20,6 +20,13 @@ def __init__(self, d_in, d_out, num_heads, dropout=0.0, qkv_bias=False):
         self.proj = nn.Linear(d_in, d_out)
         self.dropout = dropout
 
+        self.sdp_backends = [
+            SDPBackend.FLASH_ATTENTION,
+            SDPBackend.EFFICIENT_ATTENTION,
+            SDPBackend.CUDNN_ATTENTION,
+            SDPBackend.MATH,
+        ]
+
     def forward(self, x):
         batch_size, num_tokens, embed_dim = x.shape
 
@@ -37,7 +44,7 @@ def forward(self, x):
 
         use_dropout = 0.0 if not self.training else self.dropout
 
-        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+        with sdpa_kernel(self.sdp_backends, set_priority=True):
             context_vec = nn.functional.scaled_dot_product_attention(
                 queries,
                 keys,
diff --git a/src/nets/conformer.py b/src/nets/conformer.py
@@ -19,7 +19,6 @@ def __init__(
         qkv_bias=False,
         use_rope=False,
         max_len=10000,
-        use_flash_attention=True,
     ):
         super().__init__()
 
@@ -31,7 +30,6 @@ def __init__(
         self.d_in = d_in
         self.use_rope = use_rope
         self.rope_dim = self.head_dim
-        self.use_flash_attention = use_flash_attention
 
         self.qkv = nn.Linear(d_in, 3 * d_out, bias=qkv_bias)
         self.proj = nn.Linear(d_in, d_out)
@@ -45,6 +43,13 @@ def __init__(
                 embed_dim=d_out, max_len=max_len
             )
 
+        self.sdp_backends = [
+            SDPBackend.FLASH_ATTENTION,
+            SDPBackend.EFFICIENT_ATTENTION,
+            SDPBackend.CUDNN_ATTENTION,
+            SDPBackend.MATH,
+        ]
+
     def forward(self, x):
         batch_size, num_tokens, embed_dim = x.shape
 
@@ -74,17 +79,7 @@ def forward(self, x):
 
         use_dropout = 0.0 if not self.training else self.dropout
 
-        if self.use_flash_attention:
-            with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
-                context_vec = nn.functional.scaled_dot_product_attention(
-                    queries,
-                    keys,
-                    values,
-                    attn_mask=None,
-                    dropout_p=use_dropout,
-                    is_causal=True,
-                )
-        else:
+        with sdpa_kernel(self.sdp_backends, set_priority=True):
             context_vec = nn.functional.scaled_dot_product_attention(
                 queries,
                 keys,
@@ -290,15 +285,13 @@ def __init__(
         alpha=0.1,
         beta=0.1,
         use_rope=False,
-        use_flash_attention=True,
     ):
         super(ConformerBlock, self).__init__()
         self.feed_forward_residual_factor = feed_forward_residual_factor
         self.use_deepnorm = use_deepnorm
         self.alpha = alpha
         self.beta = beta
         self.use_rope = use_rope
-        self.use_flash_attention = use_flash_attention
 
         self.ff1 = FeedForwardBlock(embed_dim, feed_forward_expansion_factor, dropout)
         self.attention = MHAPyTorchScaledDotProduct(
@@ -307,7 +300,6 @@ def __init__(
             num_heads=num_heads,
             dropout=dropout,
             use_rope=use_rope,
-            use_flash_attention=self.use_flash_attention,
         )
         self.conv_block = ConvBlock(embed_dim, conv_kernel_size, dropout)
         self.ff2 = FeedForwardBlock(embed_dim, feed_forward_expansion_factor, dropout)
@@ -399,7 +391,6 @@ def __init__(
         use_rope: bool,
         num_patches: int,
         patch_size: Tuple[int, int] | None = None,
-        use_flash_attention: bool = True,
     ):
         super(Conformer, self).__init__()
         self.embed_dim = embed_dim
@@ -414,7 +405,6 @@ def __init__(
         self.use_deepnorm = use_deepnorm
         self.use_rope = use_rope
         self.num_patches = num_patches
-        self.use_flash_attention = use_flash_attention
 
         self.input_dropout = nn.Dropout(input_dropout)
 
@@ -437,7 +427,6 @@ def __init__(
                     alpha=self.alpha_deepnorm,
                     beta=self.beta_deepnorm,
                     use_rope=self.use_rope,
-                    use_flash_attention=self.use_flash_attention,
                 )
                 for _ in range(depth)
             ]