Commit 5fb7ba9

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent: b5a63e4 · commit: 5fb7ba9

8 files changed: +24, -20 lines


litgpt/api.py

Lines changed: 9 additions & 3 deletions
@@ -378,7 +378,9 @@ def distribute(
             else:
                 kv_cache_size = fixed_kv_cache_size
             model.set_kv_cache(
-                batch_size=1, max_seq_length=kv_cache_size, device=fabric.device,
+                batch_size=1,
+                max_seq_length=kv_cache_size,
+                device=fabric.device,
             )
             self.kv_cache_initialized = True
             self.fixed_kv_cache_size = fixed_kv_cache_size
@@ -507,7 +509,9 @@ def generate(
             else:
                 device = self.preprocessor.device
             self.model.set_kv_cache(
-                batch_size=1, max_seq_length=max_returned_tokens, device=device,
+                batch_size=1,
+                max_seq_length=max_returned_tokens,
+                device=device,
             )
             self.kv_cache_initialized = True

@@ -516,7 +520,9 @@ def generate(
                 tmp_device = self.model.mha.mask_cache.device
                 self.model.clear_kv_cache()
                 self.model.set_kv_cache(
-                    batch_size=1, max_seq_length=max_returned_tokens, device=tmp_device,
+                    batch_size=1,
+                    max_seq_length=max_returned_tokens,
+                    device=tmp_device,
                 )
             else:
                 for block in self.model.transformer.h:
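For context, the calls re-wrapped in this file are litgpt's KV-cache setup and teardown around generation. Below is a minimal usage sketch of that pattern, not taken from this commit: the checkpoint name and sequence length are made up for illustration, while the llm.model attribute, set_kv_cache(batch_size=..., max_seq_length=..., device=...) and clear_kv_cache() are the calls visible in the hunks above and in tests/test_batch.py.

# Sketch only: hypothetical checkpoint and sizes; the KV-cache calls mirror the diff.
import torch
from litgpt import LLM

llm = LLM.load("EleutherAI/pythia-160m")  # hypothetical model choice
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Allocate a fixed-size KV cache for single-sequence generation.
llm.model.set_kv_cache(
    batch_size=1,
    max_seq_length=256,  # stand-in for kv_cache_size / max_returned_tokens
    device=device,
)

# ... generate with the cached keys/values ...

# Free the buffers when a different cache size is needed.
llm.model.clear_kv_cache()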

litgpt/attention.py

Lines changed: 1 addition & 3 deletions
@@ -156,9 +156,7 @@ def __call__(
         nh_k = self.config.n_query_groups
         q_per_kv = nh_q // nh_k
         if q_per_kv > 1:
-            mask = mask.unsqueeze(2).expand(
-                -1, -1, q_per_kv, -1, -1
-            ).reshape(B, nh_q, T, -1)
+            mask = mask.unsqueeze(2).expand(-1, -1, q_per_kv, -1, -1).reshape(B, nh_q, T, -1)

         # Efficient attention using Flash Attention CUDA kernels.
         # NOTE: efficient implementation is disabled if `mask` is not None or softcapping is enabled.
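The one-liner produced by this hunk is the grouped-query-attention mask broadcast: a mask stored once per key/value group is repeated for each query head that shares that group. A standalone sketch with made-up toy sizes follows; only the expand/reshape expression and the shape names B, nh_q, T, q_per_kv are taken from the diff.

# Toy shapes for illustration; the expand/reshape line is the one from the diff.
import torch

B, T = 2, 5              # batch size, number of query positions
nh_q, nh_k = 8, 2        # query heads vs. key/value groups
q_per_kv = nh_q // nh_k  # query heads sharing each KV group

# Mask laid out per KV group: (B, nh_k, T, S), here with S == T key positions.
mask = torch.ones(B, nh_k, T, T, dtype=torch.bool)

if q_per_kv > 1:
    # Insert a per-group axis, repeat it q_per_kv times, then fold (nh_k, q_per_kv)
    # into the head axis so the mask matches the (B, nh_q, T, S) query layout.
    mask = mask.unsqueeze(2).expand(-1, -1, q_per_kv, -1, -1).reshape(B, nh_q, T, -1)

print(mask.shape)  # torch.Size([2, 8, 5, 5])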

litgpt/config.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 from copy import deepcopy
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, Type, Union, List
+from typing import Any, Callable, List, Literal, Optional, Type, Union

 import torch
 import yaml

litgpt/generate/base.py

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ def generate_fn(

     prompt_size = prompt.size(0)
     if prompt_size == 0:
-        raise ValueError(f"prompt must not be empty")
+        raise ValueError("prompt must not be empty")
     sample_kwargs = dict(
         temperature=temperature,
         top_k=top_k,

litgpt/model.py

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 Based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT and
 https://github.com/EleutherAI/gpt-neox/tree/main/megatron/model.
 """
+
 from functools import partial
 from typing import Any, List, Optional, Tuple, Union

tests/generate/test_main.py

Lines changed: 3 additions & 4 deletions
@@ -174,7 +174,8 @@ def test_main(fake_checkpoint_dir, monkeypatch, tensor_like):
     )
     assert (
         generate_mock.mock_calls
-        == [call(ANY, tensor_like, len_return_value, **sample_kwargs, eos_id=tokenizer_mock.return_value.eos_id)] * num_samples
+        == [call(ANY, tensor_like, len_return_value, **sample_kwargs, eos_id=tokenizer_mock.return_value.eos_id)]
+        * num_samples
     )
     expected_output = "foo bar baz\n" * num_samples
     # Allow for the config to be printed before the expected repeated strings.
@@ -209,9 +210,7 @@ def test_sample(temperature):
     )
     # Note: Both `sample` and `batched_sample` create only 1 sample, not 3.
     # It is like passing `logits[:, 1-:, :]`
-    token = batched_sample(
-        logits, kwargs=dict(temperature=temperature, top_p=0.8)
-    )
+    token = batched_sample(logits, kwargs=dict(temperature=temperature, top_p=0.8))

     assert token.shape == (2, 1)
     # sample is batch size 1 only for now - this should be [0, 1] once batched generation is supported

tests/test_batch.py

Lines changed: 6 additions & 2 deletions
@@ -32,7 +32,9 @@ def create_llm(tmp_path, batch_size, max_seq_length, device) -> tuple[LLM, GPT]:
     )
     model: GPT = llm.model
     model.set_kv_cache(
-        batch_size=batch_size, max_seq_length=max_seq_length, device=device,
+        batch_size=batch_size,
+        max_seq_length=max_seq_length,
+        device=device,
     )

     return llm, model
@@ -89,7 +91,9 @@ def test_batched_equivalence(tmp_path):
     # Switch to batched generation
     model.clear_kv_cache()
     model.set_kv_cache(
-        batch_size=batch_size, max_seq_length=max_seq_length, device=device,
+        batch_size=batch_size,
+        max_seq_length=max_seq_length,
+        device=device,
     )

     toks_1: torch.Tensor = batched_next_token(

tests/test_chat.py

Lines changed: 2 additions & 6 deletions
@@ -47,12 +47,8 @@ def test_generate(monkeypatch, generated, stop_tokens, expected):
     model.config.block_size = 100
     model.max_seq_length = 100
     # Mock methods called during generation
-    monkeypatch.setattr(
-        model, "kv_cache_max_prefill_length", lambda: 80
-    )
-    monkeypatch.setattr(
-        model, "kv_cache_max_tokens_forward", lambda: 20
-    )
+    monkeypatch.setattr(model, "kv_cache_max_prefill_length", lambda: 80)
+    monkeypatch.setattr(model, "kv_cache_max_tokens_forward", lambda: 20)
     it = iter(generated)

     def multinomial(*_, **__):
