Commit d8f0be6

Fix dtype inference for quantized models
`self.output.weight` would be int8 if `output` is a quantized linear layer. In that case, infer the dtype from `scales` (int8 quantization) or `scales_and_zeros` (int4 quantization) instead.
1 parent f697317 commit d8f0be6

File tree

1 file changed: +6 −1 lines changed

model.py

Lines changed: 6 additions & 1 deletion
@@ -107,7 +107,12 @@ def setup_caches(self, max_batch_size, max_seq_length):
         max_seq_length = find_multiple(max_seq_length, 8)
         self.max_seq_length = max_seq_length
         self.max_batch_size = max_batch_size
-        dtype=self.output.weight.dtype
+        dtype = self.output.weight.dtype
+        # For quantized layers, dtype is encoded in scales
+        if hasattr(self.output, "scales"):
+            dtype = self.output.scales.dtype
+        elif hasattr(self.output, "scales_and_zeros"):
+            dtype = self.output.scales_and_zeros.dtype
         for b in self.layers:
             b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype)
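The check matters because weight-only quantization replaces the floating-point weight with an int8 (or packed int4) tensor, while the scales keep the original floating-point dtype that the KV cache should use. Below is a minimal, self-contained sketch of that situation; the Int8WeightOnlyLinear class and infer_cache_dtype helper are illustrative assumptions, not the repository's actual classes, though the helper mirrors the logic added in this commit.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class Int8WeightOnlyLinear(nn.Module):
        """Toy int8 weight-only quantized linear layer (illustrative, not the repo's class)."""

        def __init__(self, in_features, out_features, dtype=torch.bfloat16):
            super().__init__()
            # The quantized weight is stored as int8; per-output-channel scales
            # keep the original floating-point dtype.
            self.register_buffer("weight", torch.zeros(out_features, in_features, dtype=torch.int8))
            self.register_buffer("scales", torch.ones(out_features, dtype=dtype))

        def forward(self, x):
            # Dequantize on the fly: matmul against the int8 weight, then rescale.
            return F.linear(x, self.weight.to(x.dtype)) * self.scales

    def infer_cache_dtype(output_layer):
        """Mirrors the commit's logic: prefer the scales' dtype when the layer is quantized."""
        dtype = output_layer.weight.dtype
        if hasattr(output_layer, "scales"):              # int8 weight-only quantization
            dtype = output_layer.scales.dtype
        elif hasattr(output_layer, "scales_and_zeros"):  # int4 (groupwise) quantization
            dtype = output_layer.scales_and_zeros.dtype
        return dtype

    layer = Int8WeightOnlyLinear(4096, 32000, dtype=torch.bfloat16)
    print(layer.weight.dtype)        # torch.int8 -- wrong dtype for the KV cache
    print(infer_cache_dtype(layer))  # torch.bfloat16 -- recovered from the scales

For int4 groupwise quantization the same idea applies, except the floating-point metadata lives in a scales_and_zeros tensor, which is why the commit checks for that attribute as a fallback.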
