2 files changed: 4 additions, 4 deletions

First changed file:

@@ -69,7 +69,7 @@ def forward(
         return (attn_output,)
 
 
-modeling_bert.BERT_SELF_ATTENTION_CLASSES.update({"flash_attention_2": BertSelfFlashAttention})
+modeling_bert.BERT_SELF_ATTENTION_CLASSES.update({"xformers": BertSelfFlashAttention})
 
 
 class PositionalEncodingLayer(nn.Module):
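
The only change in this hunk is the registry key: the custom BertSelfFlashAttention class is now exposed as "xformers" rather than shadowing the stock "flash_attention_2" entry. Below is a minimal sketch of how that key is typically resolved, assuming a transformers build in which BertAttention picks its self-attention class out of BERT_SELF_ATTENTION_CLASSES using config._attn_implementation as the key:

    from transformers import BertConfig
    from transformers.models.bert import modeling_bert

    # Sketch, not the repo's code: after the patch above, the custom class is
    # reachable under the "xformers" key next to the stock entries ("eager", "sdpa", ...).
    config = BertConfig()
    config._attn_implementation = "xformers"  # custom key, set directly on the config

    attn_cls = modeling_bert.BERT_SELF_ATTENTION_CLASSES[config._attn_implementation]
    self_attn = attn_cls(config)  # assumes BertSelfFlashAttention accepts the stock (config, ...) signature
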
@@ -231,7 +231,7 @@ class CehrBertPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["BertLayer"]
     _supports_sdpa = True
-    _supports_flash_attn_2 = True
+    # _supports_flash_attn_2 = True
 
     def _init_weights(self, module):
         """Initialize the weights."""
@@ -286,7 +286,7 @@ def forward(
         # [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
         # The flash attention requires the original attention_mask
-        if not getattr(self.config, "_attn_implementation", "eager") == "flash_attention_2":
+        if not getattr(self.config, "_attn_implementation", "eager") == "xformers":
             if seq_lens is not None:
                 attention_mask = create_block_diagonal_mask(seq_lens)
             else:
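
When the "xformers" implementation is not selected, the forward pass builds a block-diagonal mask from seq_lens so that tokens in packed sequences cannot attend across sequence boundaries. create_block_diagonal_mask is defined elsewhere in the repo; the sketch below is only a hypothetical illustration of what such a helper typically computes, not the project's implementation:

    import torch

    def create_block_diagonal_mask_sketch(seq_lens):
        """Hypothetical illustration (not the repo's helper): boolean block-diagonal
        mask for sequences packed back to back; position i may attend to position j
        only if both fall inside the same segment."""
        seq_lens = torch.as_tensor(seq_lens)
        # segment id of every packed position, e.g. seq_lens=[3, 2] -> [0, 0, 0, 1, 1]
        segment_ids = torch.repeat_interleave(torch.arange(len(seq_lens)), seq_lens)
        # True where query and key positions share a segment
        return segment_ids[:, None] == segment_ids[None, :]

    mask = create_block_diagonal_mask_sketch([3, 2])
    # mask.shape == (5, 5); the top-left 3x3 and bottom-right 2x2 blocks are True
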
Second changed file:

@@ -132,7 +132,7 @@ def load_and_create_model(
         **model_args.as_dict(),
     )
     model = CehrBertForPreTraining(model_config)
-    if model_args.attn_implementation == "flash_attention_2":
+    if model_args.attn_implementation == "xformers":
         model.gradient_checkpointing_enable()
         model.enable_input_require_grads()
         model.use_memory_efficient_attention = True  # or model.use_flash_attention_2 = True
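
When "xformers" is selected, the training path enables gradient checkpointing, input gradients, and a memory-efficient-attention flag on the model. For context, here is a hedged sketch of the xformers kernel such an implementation typically wraps; the tensor shapes, the packed-sequence variant, and the CUDA/fp16 setup are assumptions, not this repo's code:

    import torch
    import xformers.ops as xops

    # Requires a CUDA device and the xformers package. Shapes follow xformers'
    # convention of [batch, seq_len, num_heads, head_dim].
    batch, seq_len, num_heads, head_dim = 2, 128, 12, 64
    q = torch.randn(batch, seq_len, num_heads, head_dim, device="cuda", dtype=torch.float16)
    k = torch.randn_like(q)
    v = torch.randn_like(q)

    # Dense attention over full sequences.
    out = xops.memory_efficient_attention(q, k, v)

    # Packed variant: xformers' block-diagonal bias plays the role of the
    # create_block_diagonal_mask helper above (sequences concatenated, batch dim 1).
    q_packed = torch.randn(1, 3 + 2, num_heads, head_dim, device="cuda", dtype=torch.float16)
    k_packed = torch.randn_like(q_packed)
    v_packed = torch.randn_like(q_packed)
    bias = xops.fmha.BlockDiagonalMask.from_seqlens([3, 2])
    out_packed = xops.memory_efficient_attention(q_packed, k_packed, v_packed, attn_bias=bias)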