@@ -7,15 +7,11 @@
 from transformers import PreTrainedModel
 from transformers.activations import gelu_new
 from transformers.models.bert import modeling_bert
-from transformers.models.bert.modeling_bert import BertEncoder, BertOnlyMLMHead, BertPooler
+from transformers.models.bert.modeling_bert import BertEncoder, BertOnlyMLMHead, BertPooler, BertSelfAttention
 from transformers.pytorch_utils import Conv1D
-from transformers.utils import is_flash_attn_2_available, logging
+from transformers.utils import logging
 from transformers.utils.import_utils import _is_package_available

-if is_flash_attn_2_available():
-    from flash_attn import flash_attn_func, flash_attn_varlen_func
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-
 if _is_package_available("xformers"):
     from xformers.ops import fmha
     import xformers.ops as xops
@@ -38,29 +34,12 @@ def create_block_diagonal_mask(seqlens: torch.LongTensor) -> torch.Tensor:
     return mask  # shape: (total_len, total_len)


-class BertSelfFlashAttention(nn.Module):
-    def __init__(self, config, position_embedding_type=None):
-        super().__init__()
-        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
-            raise ValueError(
-                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
-                f"heads ({config.num_attention_heads})"
-            )
-        if not _is_package_available("xformers"):
-            raise RuntimeError("xformers is not installed for BertSelfFlashAttention")
-        LOG.info("BertSelfFlashAttention is successfully initialized")
-        self.num_attention_heads = config.num_attention_heads
-        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        self.all_head_size = self.num_attention_heads * self.attention_head_size
-        self.split_size = config.hidden_size
-        self.embed_dim = config.hidden_size
-        self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
-        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
-        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+class BertSelfFlashAttention(BertSelfAttention):

     def split_heads(self, x: torch.Tensor) -> torch.Tensor:
         new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
-        return x.view(new_x_shape)
+        x = x.view(new_x_shape)
+        return x

     def forward(
         self,
@@ -72,20 +51,20 @@ def forward(
         past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         output_attentions: Optional[bool] = False,
     ) -> Tuple[torch.Tensor]:
-        query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
-        dtype = query.dtype
-        query_layer = self.split_heads(query).to(torch.bfloat16)
-        key_layer = self.split_heads(key).to(torch.bfloat16)
-        value_layer = self.split_heads(value).to(torch.bfloat16)
+
+        query_layer = self.split_heads(self.query(hidden_states)).to(torch.bfloat16)
+        key_layer = self.split_heads(self.key(hidden_states)).to(torch.bfloat16)
+        value_layer = self.split_heads(self.value(hidden_states)).to(torch.bfloat16)
         attn_dropout = self.dropout.p if self.training else 0.0

+        dtype = hidden_states.dtype
         attn_output = xops.memory_efficient_attention(
             query_layer, key_layer, value_layer, attn_bias=attention_mask, p=attn_dropout
         )

-        attn_output = attn_output.to(dtype)
-        attn_output = self.c_proj(attn_output)
-        attn_output = self.dropout(attn_output)
+        new_context_layer_shape = attn_output.size()[:-2] + (self.all_head_size,)
+        attn_output = attn_output.view(new_context_layer_shape).to(dtype)
+
         # The BertLayer expects a tuple
         return (attn_output,)

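For reference, a minimal sketch of how the patched attention could be wired into a stock Hugging Face BERT model. The import path modeling_bert_flash and the checkpoint name are assumptions for illustration, not part of this commit; the only facts taken from the diff are that BertSelfFlashAttention now inherits its __init__ (and therefore its query/key/value projections) from BertSelfAttention, so pretrained weights load without remapping.

from transformers import BertModel

from modeling_bert_flash import BertSelfFlashAttention  # hypothetical module path for the file in this diff

model = BertModel.from_pretrained("bert-base-uncased")

for layer in model.encoder.layer:
    patched = BertSelfFlashAttention(model.config)
    # Same parameter names as BertSelfAttention (query/key/value), so the
    # stock module's state dict copies over directly.
    patched.load_state_dict(layer.attention.self.state_dict())
    layer.attention.self = patched

Note that the new forward hands attention_mask straight to xops.memory_efficient_attention as attn_bias, so callers would need to pass None or an xformers-compatible bias in the query dtype (for packed sequences, something like the block-diagonal mask built by create_block_diagonal_mask or fmha.BlockDiagonalMask.from_seqlens) rather than the usual additive extended mask.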