
Commit 5699ea1

[sharktank] Fix attention dtype (#1243)
Fixes a bug from the refactor in #1098 that removed the ability to specify different dtypes for the cache vs. attention.

Signed-off-by: Ian Wood <[email protected]>
1 parent 244a5e9 commit 5699ea1
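For context, a minimal sketch of the split introduced by this commit: the page table is now allocated in cache_dtype while the attention math runs in attn_dtype. The parameter names come from this diff; the import path and the concrete values below are illustrative assumptions, not taken from the commit.

import torch
from sharktank.layers.paged_attention import PagedAttention

# Sketch only: example values, assumed import path.
cache = PagedAttention(
    transformer_block_count=26,
    attn_head_count=8,
    attn_head_dim=128,
    block_seq_stride=16,
    cache_partition_count=2,  # one partition each for K and V
    cache_dtype=torch.float16,  # dtype the KV page table is stored in
    attn_dtype=torch.float32,   # dtype q/k/v are cast to for the attention computation
    device=None,
    shard_count=1,
)

# Pages are allocated in cache_dtype; reads are converted to attn_dtype before attention.
cache_state = cache.allocate(128)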

10 files changed: +34 −25 lines
sharktank/sharktank/export_layer/export_kv_cache.py

Lines changed: 2 additions & 1 deletion
@@ -65,7 +65,8 @@ def main():
         attn_head_count=attn_head_count,
         attn_head_dim=attn_head_dim,
         shard_count=args.sharding,
-        dtype=torch.float32,
+        cache_dtype=torch.float32,
+        attn_dtype=torch.float32,
         device=None,
     )

sharktank/sharktank/layers/paged_attention.py

Lines changed: 11 additions & 9 deletions
@@ -60,7 +60,8 @@ def __init__(
         attn_head_dim: int,
         cache_partition_count: int = 2,
         block_seq_stride: int = 16,
-        dtype: torch.dtype = torch.float32,
+        cache_dtype: torch.dtype = torch.float32,
+        attn_dtype: torch.dtype = torch.float32,
         device: Optional[torch.device] = None,
         shard_count: int = 1,
     ):
@@ -85,7 +86,8 @@ def __init__(
         ]
         self.page_slab_flat_dim = math.prod(self.sub_page_dims)
         self.device = device
-        self.dtype = dtype
+        self.cache_dtype = cache_dtype
+        self.attn_dtype = attn_dtype

     def unflatten_page_table(
         self, state: list[Union[torch.Tensor, SplitPrimitiveTensor]]
@@ -146,7 +148,7 @@ def allocate(
         shards = [
             torch.empty(
                 [page_count, self.page_slab_flat_dim],
-                dtype=self.dtype,
+                dtype=self.cache_dtype,
                 device=self.device,
             )
             for _ in range(self.shard_count)
@@ -356,18 +358,18 @@ def repeat_kv(x: torch.Tensor) -> torch.Tensor:

         # Fake quant is already dequantized when stored in the cache.
         if cache_quantizer and not fake_quant:
-            k = cache_quantizer.dequantize_raw_tensor(k, self.dtype, name="xk_deq")
-            v = cache_quantizer.dequantize_raw_tensor(v, self.dtype, name="xv_deq")
+            k = cache_quantizer.dequantize_raw_tensor(k, self.attn_dtype, name="xk_deq")
+            v = cache_quantizer.dequantize_raw_tensor(v, self.attn_dtype, name="xv_deq")

         q = q.transpose(1, 2)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)

-        q = ops.to(q, dtype=self.dtype)
-        k = ops.to(k, dtype=self.dtype)
-        v = ops.to(v, dtype=self.dtype)
+        q = ops.to(q, dtype=self.attn_dtype)
+        k = ops.to(k, dtype=self.attn_dtype)
+        v = ops.to(v, dtype=self.attn_dtype)
         if mask is not None:
-            mask = ops.to(mask, dtype=self.dtype)
+            mask = ops.to(mask, dtype=self.attn_dtype)

         # Decomposed
         if attention_kernel == "decomposed":

sharktank/sharktank/layers/paged_llama_attention_block.py

Lines changed: 2 additions & 3 deletions
@@ -31,7 +31,6 @@ def __init__(
         head_dim: int,
         head_count_kv: int,
         rms_epsilon: float,
-        attention_dtype: Optional[torch.dtype] = None,
         attention_kernel: str = "torch",
         attention_scale: Optional[float] = None,
         softcap: Optional[float] = None,
@@ -44,15 +43,15 @@ def __init__(
             attn_head_count=head_count_kv,
             attn_head_dim=head_dim,
             block_seq_stride=cache.block_seq_stride,
-            dtype=cache.dtype,
+            cache_dtype=cache.cache_dtype,
+            attn_dtype=cache.attn_dtype,
             device=cache.device,
             shard_count=cache.shard_count,
         )
         self.block_index = block_index
         self.head_count = head_count
         self.head_dim = head_dim
         self.head_count_kv = head_count_kv
-        self.attention_dtype = attention_dtype
         self.attention_kernel = attention_kernel
         self.attention_scale = attention_scale
         self.softcap = softcap

sharktank/sharktank/models/llm/llm.py

Lines changed: 0 additions & 1 deletion
@@ -258,7 +258,6 @@ def __init__(
                 head_dim=config.hp.attn_head_dim,
                 head_count_kv=config.hp.attention_head_count_kv,
                 rms_epsilon=config.hp.attention_layer_norm_rms_epsilon,
-                attention_dtype=config.attention_dtype,
                 attention_kernel=attention_kernel,
                 fake_quant=fake_quant,
                 softcap=config.hp.attention_softcap,

sharktank/sharktank/utils/create_cache.py

Lines changed: 2 additions & 1 deletion
@@ -20,6 +20,7 @@ def create_paged_kv_cache(config: LlamaModelConfig) -> PagedAttention:
         cache_partition_count=2, # One for each of K/V.
         block_seq_stride=config.block_seq_stride,
         device=config.device,
-        dtype=dtype,
+        cache_dtype=dtype,
+        attn_dtype=config.attention_dtype,
         shard_count=config.tensor_parallelism_size,
     )

sharktank/tests/layers/kv_cache_test.py

Lines changed: 4 additions & 2 deletions
@@ -33,7 +33,8 @@ def test_paged(dtype: torch.dtype):
         transformer_block_count=transformer_block_count,
         attn_head_count=attn_head_count,
         attn_head_dim=attn_head_dim,
-        dtype=dtype,
+        cache_dtype=dtype,
+        attn_dtype=dtype,
         device=None,
     )

@@ -142,7 +143,8 @@ def test_sharded_paged():
         attn_head_count=attn_head_count,
         attn_head_dim=attn_head_dim,
         shard_count=shard_count,
-        dtype=torch.float32,
+        cache_dtype=torch.float32,
+        attn_dtype=torch.float32,
         device=None,
     )

sharktank/tests/layers/paged_llama_attention_block_test.py

Lines changed: 2 additions & 1 deletion
@@ -62,7 +62,8 @@ def testExportNondecomposed(self):
             attn_head_dim=self.attention_head_dim,
             cache_partition_count=self.cache_partition_count,
             block_seq_stride=self.block_seq_stride,
-            dtype=dtype,
+            cache_dtype=dtype,
+            attn_dtype=dtype,
         )

         cache_state = cache.allocate(self.page_count)

sharktank/tests/layers/sharded_paged_kv_cache_test.py

Lines changed: 4 additions & 2 deletions
@@ -38,7 +38,8 @@ def setUp(self):
             block_seq_stride=self.block_seq_stride,
             attn_head_dim=self.attn_head_dim,
             cache_partition_count=self.cache_partition_count,
-            dtype=self.dtype,
+            cache_dtype=self.dtype,
+            attn_dtype=self.dtype,
         )
         self.sharded_cache = PagedAttention(
             shard_count=self.shard_count,
@@ -47,7 +48,8 @@ def setUp(self):
             block_seq_stride=self.block_seq_stride,
             attn_head_dim=self.attn_head_dim,
             cache_partition_count=self.cache_partition_count,
-            dtype=self.dtype,
+            cache_dtype=self.dtype,
+            attn_dtype=self.dtype,
         )

     def make_unsharded_and_sharded_equal_cache_states(

sharktank/tests/layers/sharded_paged_llama_attention_block.py

Lines changed: 5 additions & 4 deletions
@@ -64,16 +64,17 @@ def make_paged_kv_cache(shard_count: int) -> PagedAttention:
                 attn_head_dim=self.attention_head_dim,
                 cache_partition_count=self.cache_partition_count,
                 block_seq_stride=self.block_seq_stride,
-                dtype=dtype,
+                cache_dtype=dtype,
+                attn_dtype=dtype,
                 shard_count=shard_count,
             )

         cache = make_paged_kv_cache(shard_count=1)
         sharded_cache = make_paged_kv_cache(shard_count=self.shard_count)

-        def make_unsharded_and_sharded_equal_cache_states() -> tuple[
-            list[torch.Tensor], list[SplitPrimitiveTensor]
-        ]:
+        def make_unsharded_and_sharded_equal_cache_states() -> (
+            tuple[list[torch.Tensor], list[SplitPrimitiveTensor]]
+        ):
             cache_state = cache.allocate(self.page_count)
             cache_state[0] = make_rand_torch(cache_state[0].shape, dtype=dtype)
             sharded_cache_state = sharded_cache.shard_state(deepcopy(cache_state))

sharktank/tests/models/llama/attention_test.py

Lines changed: 2 additions & 1 deletion
@@ -70,7 +70,8 @@ def test(self):
             cache_partition_count=2, # One for each of K/V.
             block_seq_stride=block_seq_stride,
             device="cpu",
-            dtype=torch.float32,
+            cache_dtype=torch.float32,
+            attn_dtype=torch.float32,
         )
         attention_block = AttentionFFNBlock(
             theta=attention_block_theta,
