
Commit 9d8b44f

IanWood1 authored and IanNod committed
[sharktank] Fix attention dtype (nod-ai#1243)
Fixes a bug in the refactor (nod-ai#1098) that removed the ability to specify different dtypes for the cache vs. attention.

Signed-off-by: Ian Wood <[email protected]>
1 parent 4c5d73a commit 9d8b44f

File tree

2 files changed (+1, -4 lines)


sharktank/sharktank/layers/paged_llama_attention_block.py

Lines changed: 1 addition & 3 deletions
@@ -31,7 +31,6 @@ def __init__(
         head_dim: int,
         head_count_kv: int,
         rms_epsilon: float,
-        attention_dtype: Optional[torch.dtype] = None,
         attention_kernel: str = "torch",
         attention_scale: Optional[float] = None,
         softcap: Optional[float] = None,
@@ -45,15 +44,14 @@ def __init__(
             attn_head_dim=head_dim,
             block_seq_stride=cache.block_seq_stride,
             cache_dtype=cache.cache_dtype,
-            attn_dtype=attention_dtype,
+            attn_dtype=cache.attn_dtype,
             device=cache.device,
             shard_count=cache.shard_count,
         )
         self.block_index = block_index
         self.head_count = head_count
         self.head_dim = head_dim
         self.head_count_kv = head_count_kv
-        self.attention_dtype = attention_dtype
         self.attention_kernel = attention_kernel
         self.attention_scale = attention_scale
         self.softcap = softcap

sharktank/sharktank/models/llm/llm.py

Lines changed: 0 additions & 1 deletion
@@ -258,7 +258,6 @@ def __init__(
             head_dim=config.hp.attn_head_dim,
             head_count_kv=config.hp.attention_head_count_kv,
             rms_epsilon=config.hp.attention_layer_norm_rms_epsilon,
-            attention_dtype=config.attention_dtype,
             attention_kernel=attention_kernel,
             fake_quant=fake_quant,
             softcap=config.hp.attention_softcap,
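
For context, a minimal sketch of the dtype flow after this change, using hypothetical stand-in classes (DummyCache and DummyAttentionBlock are not real sharktank types): the attention block now reads attn_dtype from the cache object rather than accepting its own attention_dtype argument, so cache and attention dtypes are configured in one place while still being allowed to differ from each other.

# Minimal sketch with hypothetical stand-in classes; field names mirror the diff above.
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class DummyCache:
    # Stand-in for the paged KV cache configuration.
    cache_dtype: torch.dtype
    attn_dtype: torch.dtype
    block_seq_stride: int = 16
    device: Optional[torch.device] = None
    shard_count: int = 1


class DummyAttentionBlock:
    def __init__(self, cache: DummyCache):
        # After the fix, the attention dtype comes from the cache object,
        # mirroring `attn_dtype=cache.attn_dtype` in the diff above.
        self.attn_dtype = cache.attn_dtype
        self.cache_dtype = cache.cache_dtype


cache = DummyCache(cache_dtype=torch.float16, attn_dtype=torch.float32)
block = DummyAttentionBlock(cache)
assert block.cache_dtype is torch.float16  # KV cache stored in fp16
assert block.attn_dtype is torch.float32   # attention math runs in fp32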
