@@ -323,7 +323,10 @@ def build_mask_cache(
     """
     # Usual causal mask:
     mask = torch.ones(
-        max_seq_length, max_seq_length, device=device, dtype=dtype,
+        max_seq_length,
+        max_seq_length,
+        device=device,
+        dtype=dtype,
     ).triu(diagonal=1)
     if sliding_window_size is not None:
         mask += torch.ones_like(mask).tril(diagonal=-sliding_window_size)
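For reference, a minimal standalone sketch of what this construction produces; the concrete values of max_seq_length and sliding_window_size are illustrative assumptions, not taken from the PR:

import torch

# Illustrative values only.
max_seq_length, sliding_window_size = 6, 3

# Ones strictly above the diagonal mark future positions (the usual causal mask).
mask = torch.ones(max_seq_length, max_seq_length).triu(diagonal=1)
# With a sliding window, also mark positions more than
# `sliding_window_size` steps in the past.
mask += torch.ones_like(mask).tril(diagonal=-sliding_window_size)
print(mask)
# Row i is zero exactly at columns j with i - sliding_window_size < j <= i,
# i.e. each query attends to itself and the previous sliding_window_size - 1 tokens.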
@@ -363,15 +366,23 @@ def build_mask_slice(
     tp_dtype = token_positions.dtype
     token_positions = token_positions.unsqueeze(2).to(device=device)
     kwargs = dict(device=device, dtype=tp_dtype)
-    bool_mask = torch.arange(
-        input_pos, input_pos + num, **kwargs,
-    ).view(1, 1, -1, 1) < token_positions
-    if sliding_window_size is not None:
-        extra_mask = torch.arange(
-            input_pos - sliding_window_size,
-            input_pos + num - sliding_window_size,
+    bool_mask = (
+        torch.arange(
+            input_pos,
+            input_pos + num,
             **kwargs,
-        ).view(1, 1, -1, 1) >= token_positions
+        ).view(1, 1, -1, 1)
+        < token_positions
+    )
+    if sliding_window_size is not None:
+        extra_mask = (
+            torch.arange(
+                input_pos - sliding_window_size,
+                input_pos + num - sliding_window_size,
+                **kwargs,
+            ).view(1, 1, -1, 1)
+            >= token_positions
+        )
         bool_mask += extra_mask
     mask = torch.zeros(bool_mask.shape, dtype=dtype, device=device)
     mask.masked_fill_(bool_mask, torch.finfo(dtype).min)
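As a sanity check on the slice semantics: a query at absolute position q masks a key at token position t when q < t (future), and additionally when q - sliding_window_size >= t (outside the window). A minimal self-contained sketch; the batch/head shapes and the values of input_pos, num, and sliding_window_size are illustrative assumptions, not taken from the PR:

import torch

# Illustrative values only.
input_pos, num, sliding_window_size = 4, 2, 3
dtype, device = torch.float32, "cpu"
# Token positions of the cached keys for one batch element, one head: (1, 1, 6).
token_positions = torch.arange(6).view(1, 1, -1)

token_positions = token_positions.unsqueeze(2)  # -> (1, 1, 1, 6)
# Absolute positions of the `num` queries in this slice: (1, 1, num, 1).
q_pos = torch.arange(input_pos, input_pos + num).view(1, 1, -1, 1)
bool_mask = q_pos < token_positions  # mask keys in the future
# Also mask keys that fell out of the sliding window (bool += acts as logical or).
bool_mask += (q_pos - sliding_window_size) >= token_positions
mask = torch.zeros(bool_mask.shape, dtype=dtype, device=device)
mask.masked_fill_(bool_mask, torch.finfo(dtype).min)
# Query at position 4 attends to keys {2, 3, 4}; query at 5 attends to {3, 4, 5}.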