import random
from typing import Optional, Tuple

+import pytest
import torch
from torch.nn import functional as F
-import pytest
-
-from litgpt.config import Config
-from litgpt.model import (
-    apply_rope,
-    CausalSelfAttention,
-    GPT,
-    build_rope_cache,
-)
-from litgpt.kvcache import KVCache
-from litgpt.utils import batched_index_select

from litgpt.attention import (
+    DefaultKeysAndValues,
+    MultiHeadSelfAttention,
    build_mask_cache,
    build_mask_slice,
-    DefaultKeysAndValues,
    do_softcapping,
-    MultiHeadSelfAttention,
    scaled_dot_product_attention,
)
+from litgpt.config import Config
+from litgpt.kvcache import KVCache
+from litgpt.model import (
+    GPT,
+    CausalSelfAttention,
+    apply_rope,
+    build_rope_cache,
+)
+from litgpt.utils import batched_index_select


@pytest.mark.parametrize(
@@ -126,7 +125,8 @@ def test_build_mask_slice(
    for bs in range(batch_size):
        for nq in range(n_query_groups):
            token_positions[bs, nq, :] = torch.randperm(
-                seq_len, device=device,
+                seq_len,
+                device=device,
            )[:cache_length]
    mask = build_mask_slice(
        input_pos=input_pos,
@@ -137,15 +137,16 @@ def test_build_mask_slice(
        sliding_window_size=sliding_window_size,
    )
    mask_cmp = batched_index_select(
-        full_mask[input_pos : (input_pos + num), :],
+        full_mask[input_pos : (input_pos + num), :],
        dim=1,
        idx=token_positions,
    )
    torch.testing.assert_close(mask, mask_cmp)


@pytest.mark.parametrize(
-    "dtype", [torch.float32, torch.float16, torch.bfloat16],
+    "dtype",
+    [torch.float32, torch.float16, torch.bfloat16],
)
def test_mask_sliding_window(dtype):
"""
@@ -329,9 +330,9 @@ def scaled_dot_product_attention(
        # with softcapping we cannot use SDPA
        if self.config.attention_logit_softcapping is not None:
            scores = q @ k.mT * scale
-            #self.debug_intermediates["scores1"] = scores
+            # self.debug_intermediates["scores1"] = scores
            scores = do_softcapping(scores, self.config.attention_logit_softcapping)
-            #self.debug_intermediates["scores2"] = scores
+            # self.debug_intermediates["scores2"] = scores
            if mask is None:
                mask = torch.ones(q.size(2), q.size(2), dtype=q.dtype, device=q.device).triu(diagonal=1)
                mask.masked_fill_(mask.bool(), torch.finfo(q.dtype).min)
@@ -347,7 +348,8 @@ def scaled_dot_product_attention(


def rope_cache_OLD(
-    config: Config, device: Optional[torch.device] = None,
+    config: Config,
+    device: Optional[torch.device] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    if config.rope_adjustments is None:
        extra_config = None
@@ -368,9 +370,7 @@ def rope_cache_OLD(
        extra_config = {name: config.rope_adjustments[name] for name in adjusted_params_required}
    else:
        # Some but not all parameters are specified; raise an error
-        missing_params = [
-            param for param, present in zip(adjusted_params_required, params_present) if not present
-        ]
+        missing_params = [param for param, present in zip(adjusted_params_required, params_present) if not present]
        raise ValueError(
            f"The following adjusted RoPE parameters are missing in rope_adjustments: {', '.join(missing_params)}. "
            "All adjusted RoPE parameters must be specified together."
@@ -387,12 +387,13 @@ def rope_cache_OLD(
    )


-
@pytest.mark.parametrize(
-    "model_name", ["gemma-2-27b", "gemma-3-27b-it"],
+    "model_name",
+    ["gemma-2-27b", "gemma-3-27b-it"],
)
@pytest.mark.parametrize(
-    "dtype", [torch.float32, torch.float16, torch.bfloat16],
+    "dtype",
+    [torch.float32, torch.float16, torch.bfloat16],
)
def test_multi_head_attention_for_gemma(model_name, dtype):
"""
@@ -414,7 +415,7 @@ def test_multi_head_attention_for_gemma(model_name, dtype):
        n_embd=32,
        intermediate_size=86,
        rotary_percentage=1.0,
-        rope_indices=[0, 1] if is_gemma_3 else None,
+        rope_indices=[0, 1] if is_gemma_3 else None,
    )

    # Obtain RoPE parameters and compare
@@ -433,10 +434,12 @@ def test_multi_head_attention_for_gemma(model_name, dtype):
    for rep in range(num_repeats):
        block_idx = rep % 2
        attn_new = CausalSelfAttention(
-            config, block_idx=block_idx,
+            config,
+            block_idx=block_idx,
        ).to(dtype=dtype)
        attn_old = CausalSelfAttention_OLD(
-            config, block_idx=block_idx,
+            config,
+            block_idx=block_idx,
        ).to(dtype=dtype)
        # Ensure they have the same weights
        attn_old.load_state_dict(attn_new.state_dict())
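
Note on the softcapping branch in the scaled_dot_product_attention hunk above: a minimal, self-contained sketch of tanh-based attention-logit softcapping, assuming the usual cap * tanh(scores / cap) form. The softcap_logits helper below is illustrative only; it is not litgpt's do_softcapping.

import torch

def softcap_logits(scores: torch.Tensor, cap: float) -> torch.Tensor:
    # Squash raw attention logits into (-cap, cap) while staying
    # approximately linear near zero.
    return cap * torch.tanh(scores / cap)

# Toy shapes: (batch, heads, query_len, head_dim)
q = torch.randn(1, 4, 8, 16)
k = torch.randn(1, 4, 8, 16)
scale = 16 ** -0.5
scores = q @ k.mT * scale             # raw attention logits, as in the diff above
capped = softcap_logits(scores, cap=50.0)
assert capped.abs().max() < 50.0      # logits are now bounded by the cap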