add support to llama quantizer

metascroy · metascroy · commit 5360d702b3ad · 2024-10-01T15:45:19.000-07:00
diff --git a/examples/models/llama2/source_transformation/quantize.py b/examples/models/llama2/source_transformation/quantize.py
@@ -519,14 +519,17 @@ def __init__(
         self.group_size = group_size
         self.bitwidth = bitwidth
         self.packed = packed
-        if (bitwidth != 4) and packed:
-            raise RuntimeError("pack only works with bitsize 4")
+        if (bitwidth not in [2, 4]) and packed:
+            raise RuntimeError("pack only works with bitsize 2, 4")
 
     @torch.no_grad()
     def create_quantized_state_dict(self, packed=False) -> Dict:
         cur_state_dict = self.mod.state_dict()
 
-        if self.bitwidth == 4:
+        if self.bitwidth == 2:
+            range_min = -2
+            range_max = 1
+        elif self.bitwidth == 4:
             range_min = -8
             range_max = 7
         elif self.bitwidth == 8:
@@ -555,17 +558,30 @@ def create_quantized_state_dict(self, packed=False) -> Dict:
                 )
 
                 if packed:
-                    if weight.shape[-1] % 2 != 0:
-                        raise RuntimeError("automatic padding not implemented yet")
-
-                    weight_range_shifted = weight.add(8).view(torch.uint8)
-                    weight_view = weight_range_shifted.view(
-                        weight.shape[0], weight.shape[1] // 2, 2
-                    )
-                    weight_even = weight_view[:, :, 0] * 16  # left shift 4
-                    weight_odd = weight_view[:, :, 1]
-                    weight_packed = weight_even + weight_odd
-                    weight = weight_packed
+                    if self.bitwidth == 2:
+                        if weight.shape[-1] % 4 != 0:
+                            raise RuntimeError("automatic padding not implemented yet")
+                        weight_range_shifted = weight.add(2).view(torch.uint8)
+                        weight_view = weight_range_shifted.view(
+                            weight.shape[0], weight.shape[1] // 4, 4
+                        )
+                        weight_0 = weight_view[:, :, 0]
+                        weight_1 = weight_view[:, :, 1] << 2
+                        weight_2 = weight_view[:, :, 2] << 4
+                        weight_3 = weight_view[:, :, 3] << 6
+                        weight_packed = weight_0 + weight_1 + weight_2 + weight_3
+                        weight = weight_packed
+                    elif self.bitwidth == 4:
+                        if weight.shape[-1] % 2 != 0:
+                            raise RuntimeError("automatic padding not implemented yet")
+                        weight_range_shifted = weight.add(8).view(torch.uint8)
+                        weight_view = weight_range_shifted.view(
+                            weight.shape[0], weight.shape[1] // 2, 2
+                        )
+                        weight_even = weight_view[:, :, 0] * 16  # left shift 4
+                        weight_odd = weight_view[:, :, 1]
+                        weight_packed = weight_even + weight_odd
+                        weight = weight_packed
 
                 weight = weight.to(device=self.device)
                 scales = scales.to(device=self.device)
@@ -658,7 +674,7 @@ def get_quant_embedding_transform(args):
         model,
         bitwidth=bitwidth,
         group_size=group_size,
-        packed=(bitwidth == 4),
+        packed=(bitwidth in [2, 4]),
     ).quantized_model()
 
 
diff --git a/exir/passes/_quant_patterns_and_replacements.py b/exir/passes/_quant_patterns_and_replacements.py
@@ -172,6 +172,138 @@ def embedding_byte_dtype_out_meta(
     )
 
 
+quantized_decomposed_lib.define(
+    "embedding_2bit(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",
+)
+
+quantized_decomposed_lib.define(
+    "embedding_2bit.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor",
+)
+
+quantized_decomposed_lib.define(
+    "embedding_2bit.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)",
+)
+
+quantized_decomposed_lib.define(
+    "embedding_2bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
+)
+
+
+@impl(quantized_decomposed_lib, "embedding_2bit", "CompositeExplicitAutograd")
+def embedding_2bit(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: Optional[torch.Tensor],
+    weight_quant_min: int,
+    weight_quant_max: int,
+    indices: torch.Tensor,
+) -> torch.Tensor:
+    embedding_weight_checks(weight, weight_scales, weight_zero_points)
+    group_size = (4 * weight.size(1)) // (
+        weight_scales.size(1) if weight_scales.dim() == 2 else 1
+    )
+    weight_0 = (weight & 3)
+    weight_1 = (weight & 12) >> 2
+    weight_2 = (weight & 48) >> 4
+    weight_3 = (weight & 192) >> 6
+    weight_unpacked = torch.stack((weight_0, weight_1, weight_2, weight_3), dim=-1)
+    weight = weight_unpacked.view(weight.shape[0], -1)
+    weight = weight.view(torch.int8).add(-2)
+
+    weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default(
+        weight,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        weight.dtype,
+        group_size,
+        weight_scales.dtype,
+    )
+    return torch.ops.aten.embedding.default(weight, indices)
+
+
+@register_fake("quantized_decomposed::embedding_2bit.out")
+def embedding_2bit_out_meta(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: Optional[torch.Tensor],
+    weight_quant_min: int,
+    weight_quant_max: int,
+    indices: torch.Tensor,
+    out: torch.Tensor,
+) -> torch.Tensor:
+    return embedding_2bit(
+        weight,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        indices,
+    )
+
+
+@impl(quantized_decomposed_lib, "embedding_2bit.dtype", "CompositeExplicitAutograd")
+def embedding_2bit_dtype(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: Optional[torch.Tensor],
+    weight_quant_min: int,
+    weight_quant_max: int,
+    indices: torch.Tensor,
+    dtype: Optional[torch.dtype],
+) -> torch.Tensor:
+    embedding_weight_checks(weight, weight_scales, weight_zero_points)
+    group_size = (4 * weight.size(1)) // (
+        weight_scales.size(1) if weight_scales.dim() == 2 else 1
+    )
+    weight_0 = (weight & 3)
+    weight_1 = (weight & 12) >> 2
+    weight_2 = (weight & 48) >> 4
+    weight_3 = (weight & 192) >> 6
+    weight_unpacked = torch.stack((weight_0, weight_1, weight_2, weight_3), dim=-1)
+    weight = weight_unpacked.view(weight.shape[0], -1)
+    weight = weight.view(torch.int8).add(-2)
+
+    weight = torch.ops.quantized_decomposed.dequantize_per_channel_group.default(
+        weight,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        weight.dtype,
+        group_size,
+        dtype,
+    )
+    return torch.ops.aten.embedding.default(weight, indices)
+
+
+@register_fake("quantized_decomposed::embedding_2bit.dtype_out")
+def embedding_2bit_dtype_out_meta(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: Optional[torch.Tensor],
+    weight_quant_min: int,
+    weight_quant_max: int,
+    indices: torch.Tensor,
+    dtype: Optional[torch.dtype],
+    out: torch.Tensor,
+) -> torch.Tensor:
+    return embedding_2bit_dtype(
+        weight,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        indices,
+        dtype,
+    )
+
+
 quantized_decomposed_lib.define(
     "embedding_4bit(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",