@@ -4,31 +4,23 @@
 import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
     SequenceClassifierOutputWithPast,
 )
-
-try:
-    from transformers.modeling_attn_mask_utils import (
-        _prepare_4d_causal_attention_mask,
-        _prepare_4d_causal_attention_mask_for_sdpa,
-    )
-    from transformers.models.qwen2.modeling_qwen2 import (
-        Qwen2Attention,
-        Qwen2ForCausalLM,
-        Qwen2ForSequenceClassification,
-        Qwen2Model,
-        apply_rotary_pos_emb,
-        repeat_kv,
-    )
-except ImportError:
-    Qwen2Model = "Qwen2Model"
-    Qwen2ForCausalLM = "Qwen2ForCausalLM"
-    Qwen2Attention = "Qwen2Attention"
-    Qwen2ForSequenceClassification = "Qwen2ForSequenceClassification"
-
+from transformers.models.qwen2.modeling_qwen2 import (
+    Qwen2Attention,
+    Qwen2ForCausalLM,
+    Qwen2ForSequenceClassification,
+    Qwen2Model,
+    apply_rotary_pos_emb,
+    repeat_kv,
+)
 from transformers.utils import logging

 from colossalai.pipeline.stage_manager import PipelineStageManager
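The two mask helpers now imported unconditionally are the utilities that expand a 2D padding mask into the 4D additive causal mask consumed by eager and SDPA attention. A minimal standalone sketch of what they return, assuming a transformers release that ships both helpers (shapes and values below are illustrative only, not code from this PR):

import torch
from transformers.modeling_attn_mask_utils import (
    _prepare_4d_causal_attention_mask,
    _prepare_4d_causal_attention_mask_for_sdpa,
)

batch_size, seq_length, past_key_values_length = 2, 5, 0
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)  # 1 = keep, 0 = pad
inputs_embeds = torch.randn(batch_size, seq_length, 16)

# Eager path: returns a (batch, 1, q_len, kv_len) float mask with a large
# negative value at disallowed positions.
eager_mask = _prepare_4d_causal_attention_mask(
    attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
)
print(eager_mask.shape)  # torch.Size([2, 1, 5, 5])

# SDPA path: may return None when no explicit mask is needed, letting SDPA
# handle causality via is_causal=True.
sdpa_mask = _prepare_4d_causal_attention_mask_for_sdpa(
    attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
)
print(sdpa_mask)  # None for this all-ones mask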
@@ -434,7 +426,6 @@ def qwen2_for_sequence_classification_forward(
     logits = self.score(hidden_states)

     if self.config.pad_token_id is None and batch_size != 1:
-        print(self.config.pad_token_id)
         raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
     if self.config.pad_token_id is None:
         sequence_lengths = -1
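The surrounding logic pools logits from the last real (non-pad) token of each sequence; sequence_lengths = -1 is the fallback when no pad token is configured. A minimal sketch of how those lengths are typically recovered when a pad token does exist, following the standard Hugging Face sequence-classification pattern (values are illustrative, not from this diff):

import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 8, 3, 0, 0],   # padded row
                          [7, 2, 9, 4, 6]])  # unpadded row
# Index of the first pad token minus one = last non-pad position.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
# Rows with no padding wrap from -1 to the final position.
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)  # tensor([2, 4])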