
Commit d097224

[feat] support qwen3 in shardformer
Parent: 97f4bee

File tree

9 files changed: +1829, -42 lines


.github/workflows/build_on_pr.yml

Lines changed: 4 additions & 0 deletions
@@ -138,6 +138,10 @@ jobs:
           cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
         fi

+      - name: Install flash-attention
+        run: |
+          pip install flash-attn==2.7.4.post1 --no-build-isolation
+
      - name: Install Colossal-AI
        run: |
          BUILD_EXT=1 pip install -v -e .
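The new step pins flash-attn 2.7.4.post1 and installs it with `--no-build-isolation` so the prebuilt wheel is used against the CI's existing torch/CUDA toolchain rather than compiling from source. A minimal sketch of how code in the repository could gate optional flash-attention paths on that install (the flag name `HAS_FLASH_ATTN` is a hypothetical example, not a ColossalAI API):

```python
# Hypothetical guard, not part of this commit: detect whether the
# flash-attn wheel installed in CI is actually importable before
# enabling flash-attention code paths.
try:
    import flash_attn  # provided by `pip install flash-attn==2.7.4.post1`

    HAS_FLASH_ATTN = True
except ImportError:
    HAS_FLASH_ATTN = False
```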

colossalai/shardformer/modeling/qwen2.py

Lines changed: 12 additions & 21 deletions
@@ -4,31 +4,23 @@
 import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
     SequenceClassifierOutputWithPast,
 )
-
-try:
-    from transformers.modeling_attn_mask_utils import (
-        _prepare_4d_causal_attention_mask,
-        _prepare_4d_causal_attention_mask_for_sdpa,
-    )
-    from transformers.models.qwen2.modeling_qwen2 import (
-        Qwen2Attention,
-        Qwen2ForCausalLM,
-        Qwen2ForSequenceClassification,
-        Qwen2Model,
-        apply_rotary_pos_emb,
-        repeat_kv,
-    )
-except ImportError:
-    Qwen2Model = "Qwen2Model"
-    Qwen2ForCausalLM = "Qwen2ForCausalLM"
-    Qwen2Attention = "Qwen2Attention"
-    Qwen2ForSequenceClassification = "Qwen2ForSequenceClassification"
-
+from transformers.models.qwen2.modeling_qwen2 import (
+    Qwen2Attention,
+    Qwen2ForCausalLM,
+    Qwen2ForSequenceClassification,
+    Qwen2Model,
+    apply_rotary_pos_emb,
+    repeat_kv,
+)
 from transformers.utils import logging

 from colossalai.pipeline.stage_manager import PipelineStageManager
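This hunk drops the try/except fallback that bound the Qwen2 class names to placeholder strings on older transformers releases; the imports of `Qwen2Attention`, `apply_rotary_pos_emb`, and `repeat_kv` are now unconditional, so a transformers version that ships Qwen2 is assumed. For reference, `repeat_kv` in upstream transformers expands grouped-query key/value heads roughly as follows (paraphrased from the library; exact details are version-dependent):

```python
import torch


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat each key/value head n_rep times for grouped-query attention,
    going from (batch, num_kv_heads, seq_len, head_dim) to
    (batch, num_kv_heads * n_rep, seq_len, head_dim)."""
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
```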
@@ -434,7 +426,6 @@ def qwen2_for_sequence_classification_forward(
         logits = self.score(hidden_states)

         if self.config.pad_token_id is None and batch_size != 1:
-            print(self.config.pad_token_id)
             raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
         if self.config.pad_token_id is None:
             sequence_lengths = -1
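This hunk removes a leftover debug print before the ValueError. For context on the branch it sits in: when `pad_token_id` is set, the upstream Qwen2 sequence-classification head pools the logit at each sequence's last non-padding position. A sketch of that selection, paraphrased from the upstream pattern (the helper name is hypothetical and exact handling is version-dependent):

```python
import torch


def last_token_indices(input_ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
    # Index of the first pad token minus one; sequences with no padding
    # yield argmax 0, and the modulo wraps -1 to the final position.
    lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
    return lengths % input_ids.shape[-1]


# usage sketch:
# pooled = logits[torch.arange(logits.shape[0]), last_token_indices(input_ids, pad_id)]
```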
