Commit 8c6cc4d

updated the logic for constructing attention_mask
1 parent: cc2b08c


src/cehrbert/models/hf_models/hf_cehrbert.py

Lines changed: 6 additions & 12 deletions
@@ -211,7 +211,7 @@ def forward(
         # Combine values with the concept embeddings
         x = self.concept_value_transformation_layer(x, concept_values, concept_value_masks)
         age_embeddings = self.age_embedding_layer(ages)
-        time_embeddings = self.age_embedding_layer(dates)
+        time_embeddings = self.time_embedding_layer(dates)
         positional_embeddings = self.positional_embedding_layer(visit_concept_orders)
         x = self.linear_proj(torch.cat([x, time_embeddings, age_embeddings, positional_embeddings], dim=-1))
         x = gelu_new(x)
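For context, a minimal standalone sketch of the embedding-combination step this hunk fixes: four embedding streams are concatenated and projected back down before the GELU. Layer names mirror the diff, but the vocabulary sizes, dimensions, and the plain gelu stand-in for gelu_new are illustrative assumptions, not CEHR-BERT's actual configuration.

import torch
import torch.nn as nn

n_embd, seq_len = 128, 16                               # illustrative sizes
age_embedding_layer = nn.Embedding(120, n_embd)         # hypothetical vocab sizes
time_embedding_layer = nn.Embedding(4096, n_embd)
positional_embedding_layer = nn.Embedding(512, n_embd)
linear_proj = nn.Linear(4 * n_embd, n_embd)

x = torch.randn(1, seq_len, n_embd)                     # concept embeddings
ages = torch.randint(0, 120, (1, seq_len))
dates = torch.randint(0, 4096, (1, seq_len))
visit_concept_orders = torch.randint(0, 512, (1, seq_len))

# The fix above: dates go through time_embedding_layer rather than
# age_embedding_layer, so age and time get separate lookup tables.
age_embeddings = age_embedding_layer(ages)
time_embeddings = time_embedding_layer(dates)
positional_embeddings = positional_embedding_layer(visit_concept_orders)
x = linear_proj(torch.cat([x, time_embeddings, age_embeddings, positional_embeddings], dim=-1))
x = nn.functional.gelu(x)                               # stand-in for gelu_new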
@@ -298,21 +298,15 @@ def forward(
         # [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
         # The flash attention requires the original attention_mask
-        if not getattr(self.config, "_attn_implementation", "eager") == "xformers":
-            if seq_lens is not None:
-                attention_mask = create_block_diagonal_mask(seq_lens)
-            else:
-                attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+        if seq_lens is None:
+            attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         else:
-            if seq_lens is None:
-                raise RuntimeError(
-                    f"seq_lens cannot be None when {getattr(self.config, '_attn_implementation', 'eager')} is used"
-                )
             if not _is_package_available("xformers"):
+                raise RuntimeError("xformers must be installed when seq_lens is provided")
+            if input_ids.shape[0] > 1:
                 raise RuntimeError(
-                    f"xformers must be installed when {getattr(self.config, '_attn_implementation', 'eager')} is used"
+                    "seq_lens is provided, which indicates sample packing, hence the batch_size must be one."
                 )
-
             seq_lens_list = seq_lens.flatten().to(torch.int).cpu().numpy().tolist()
             attention_mask = fmha.attn_bias.BlockDiagonalMask.from_seqlens(seq_lens_list, device=seq_lens.device)
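A small sketch (illustrative lengths, not code from this repository) of the mask the new xformers branch builds: BlockDiagonalMask.from_seqlens yields an attention bias that only lets tokens attend within their own packed sequence, which is why sample packing requires batch_size == 1 and a flat list of per-sample lengths.

import torch
from xformers.ops import fmha

seq_lens_list = [2, 3, 1]                 # three samples packed into one row
attn_bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(seq_lens_list)

# materialize() renders the bias densely: 0 inside each sample's block,
# -inf everywhere else, i.e. a block-diagonal attention pattern.
total = sum(seq_lens_list)
print(attn_bias.materialize((total, total)))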

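For contrast, the non-packed branch defers to Hugging Face's get_extended_attention_mask, whose documented effect on a 2D padding mask is roughly the following plain-torch transformation into a broadcastable additive bias:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])           # 1 = real token, 0 = padding
extended = attention_mask[:, None, None, :].float()     # [batch, 1, 1, seq]
extended = (1.0 - extended) * torch.finfo(torch.float32).min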