
Commit de47d22

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 74f038f commit de47d22

File tree

1 file changed: 1 addition, 9 deletions

colossalai/shardformer/modeling/qwen2.py

Lines changed: 1 addition & 9 deletions
@@ -9,7 +9,6 @@
     CausalLMOutputWithPast,
     SequenceClassifierOutputWithPast,
 )
-from transformers.cache_utils import DynamicCache
 
 try:
     from transformers.modeling_attn_mask_utils import (
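The removed `from transformers.cache_utils import DynamicCache` line is an import that is no longer referenced anywhere in the module, which is exactly the kind of issue a pre-commit auto-fixer flags. As a rough, hypothetical illustration (not the actual hook used here, which is more thorough), an AST pass can report import bindings that never appear as names in the rest of the file:

```python
# Hypothetical sketch of an unused-import check; it parses a module and reports
# import bindings that are never referenced as names anywhere else in the file.
import ast


def unused_imports(source: str) -> list[str]:
    tree = ast.parse(source)
    imported: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            for alias in node.names:
                # `import a.b` binds `a`; `from x import y as z` binds `z`.
                imported.add(alias.asname or alias.name.split(".")[0])
    used = {node.id for node in ast.walk(tree) if isinstance(node, ast.Name)}
    return sorted(imported - used)


sample = "from transformers.cache_utils import DynamicCache\nvalue = 1\n"
print(unused_imports(sample))  # ['DynamicCache']
```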
@@ -210,8 +209,7 @@ def qwen2_model_forward(
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
+            past_key_values[idx] if past_key_values is not None else None
 
             if idx - start_idx < num_ckpt_layers:
                 layer_outputs = self._gradient_checkpointing_func(
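The single added line above is the only non-whitespace edit in the commit: the auto-fixer (commonly autoflake with --remove-unused-variables; which hook produced it here is an assumption) drops the unused `past_key_value` binding but conservatively keeps the right-hand side as a bare expression. Since plain indexing has no side effects, the surviving line is effectively a no-op, as this self-contained sketch shows:

```python
# Hypothetical stand-ins so the sketch runs on its own; in the real code the
# list is the per-layer KV cache iterated inside qwen2_model_forward.
past_key_values = [("k0", "v0"), ("k1", "v1")]
idx = 1

# Before the auto-fix: assigned but never read again, so the binding is unused.
past_key_value = past_key_values[idx] if past_key_values is not None else None

# After the auto-fix: the binding is gone, but the expression is kept in case
# evaluating it had side effects.  Indexing a list does not, so this statement
# does nothing and could be deleted outright in a follow-up.
past_key_values[idx] if past_key_values is not None else None
```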
@@ -523,7 +521,6 @@ def forward(
         key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
 
-
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             if self.layer_idx is None:
@@ -649,7 +646,6 @@ def forward(
         else:
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
 
-
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
@@ -669,11 +665,9 @@ def forward(
         else:
             position_ids = position_ids.view(-1, seq_length).long()
 
-
         # embed positions
         hidden_states = inputs_embeds
 
-
         if shard_config.enable_flash_attention:
             # in this case, attention_mask is a dict rather than a tensor
             mask_shape = (batch_size, 1, seq_length, seq_length_with_past)
@@ -693,7 +687,6 @@ def forward(
                 sliding_window=self.config.sliding_window,
             )
 
-
         if (self.gradient_checkpointing or sp_mode in ["ring", "all_to_all"]) and self.training:
             if use_cache:
                 logger.warning_once(
@@ -746,7 +739,6 @@ def forward(
 
             hidden_states = layer_outputs[0]
 
-
            if output_attentions:
                all_self_attns += (layer_outputs[1],)
 