kushrast
diff --git a/‎examples/models/llama2/source_transformation/quantize.py‎
Lines changed: 61 additions & 22 deletions b/‎examples/models/llama2/source_transformation/quantize.py‎
Lines changed: 61 additions & 22 deletions
diff --git a/‎exir/passes/_quant_patterns_and_replacements.py‎
Lines changed: 132 additions & 0 deletions b/‎exir/passes/_quant_patterns_and_replacements.py‎
Lines changed: 132 additions & 0 deletions
@@ -494,6 +494,7 @@ def replace_embedding_weight_only_grouped_int8_per_channel(
                     group_size=group_size,
                     dtype=child.weight.dtype,
                     packed=packed,
+                    bitwidth=bitwidth,
                 ),
             )
         else:
@@ -519,14 +520,17 @@ def __init__(
         self.group_size = group_size
         self.bitwidth = bitwidth
         self.packed = packed
-        if (bitwidth != 4) and packed:
-            raise RuntimeError("pack only works with bitsize 4")
+        if (bitwidth not in [2, 4]) and packed:
+            raise RuntimeError("pack only works with bitsize 2, 4")
 
     @torch.no_grad()
     def create_quantized_state_dict(self, packed=False) -> Dict:
         cur_state_dict = self.mod.state_dict()
 
-        if self.bitwidth == 4:
+        if self.bitwidth == 2:
+            range_min = -2
+            range_max = 1
+        elif self.bitwidth == 4:
             range_min = -8
             range_max = 7
         elif self.bitwidth == 8:
@@ -555,17 +559,30 @@ def create_quantized_state_dict(self, packed=False) -> Dict:
                 )
 
                 if packed:
-                    if weight.shape[-1] % 2 != 0:
-                        raise RuntimeError("automatic padding not implemented yet")
-
-                    weight_range_shifted = weight.add(8).view(torch.uint8)
-                    weight_view = weight_range_shifted.view(
-                        weight.shape[0], weight.shape[1] // 2, 2
-                    )
-                    weight_even = weight_view[:, :, 0] * 16  # left shift 4
-                    weight_odd = weight_view[:, :, 1]
-                    weight_packed = weight_even + weight_odd
-                    weight = weight_packed
+                    if self.bitwidth == 2:
+                        if weight.shape[-1] % 4 != 0:
+                            raise RuntimeError("automatic padding not implemented yet")
+                        weight_range_shifted = weight.add(2).view(torch.uint8)
+                        weight_view = weight_range_shifted.view(
+                            weight.shape[0], weight.shape[1] // 4, 4
+                        )
+                        weight_0 = weight_view[:, :, 0]
+                        weight_1 = weight_view[:, :, 1] << 2
+                        weight_2 = weight_view[:, :, 2] << 4
+                        weight_3 = weight_view[:, :, 3] << 6
+                        weight_packed = weight_0 + weight_1 + weight_2 + weight_3
+                        weight = weight_packed
+                    elif self.bitwidth == 4:
+                        if weight.shape[-1] % 2 != 0:
+                            raise RuntimeError("automatic padding not implemented yet")
+                        weight_range_shifted = weight.add(8).view(torch.uint8)
+                        weight_view = weight_range_shifted.view(
+                            weight.shape[0], weight.shape[1] // 2, 2
+                        )
+                        weight_even = weight_view[:, :, 0] * 16  # left shift 4
+                        weight_odd = weight_view[:, :, 1]
+                        weight_packed = weight_even + weight_odd
+                        weight = weight_packed
 
                 weight = weight.to(device=self.device)
                 scales = scales.to(device=self.device)
@@ -598,13 +615,15 @@ def __init__(
         group_size: Optional[int] = None,
         dtype=torch.half,
         packed=False,
+        bitwidth: int = 8,
     ) -> None:
         super().__init__()
         if group_size is None or group_size == 0:
             group_size = embedding_dim
         self.group_size = group_size
         self.dtype = dtype
         self.packed = packed
+        self.bitwidth = bitwidth
         if not packed:
             self.register_buffer(
                 "weight",
@@ -613,12 +632,25 @@ def __init__(
                 ),
             )
         else:  # packed
-            self.register_buffer(
-                "weight",
-                torch.empty(
-                    (vocab_size, embedding_dim // 2), dtype=torch.uint8, device=device
-                ),
-            )
+            if bitwidth == 2:
+                self.register_buffer(
+                    "weight",
+                    torch.empty(
+                        (vocab_size, embedding_dim // 4),
+                        dtype=torch.uint8,
+                        device=device,
+                    ),
+                )
+            elif bitwidth == 4:
+                self.register_buffer(
+                    "weight",
+                    torch.empty(
+                        (vocab_size, embedding_dim // 2),
+                        dtype=torch.uint8,
+                        device=device,
+                    ),
+                )
+
         groups_per_row = (embedding_dim + group_size - 1) // group_size
         if groups_per_row > 1:
             self.register_buffer(
@@ -638,7 +670,14 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor:
             return torch.ops.quantized_decomposed.embedding_byte.dtype(
                 self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
             )
-        else:  # 4bit packed
+        else:  # packed
+            if self.bitwidth == 2:
+                return torch.ops.quantized_decomposed.embedding_2bit.dtype(
+                    self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
+                )
+
+            # Remaining case (always return to make pyre happy)
+            assert self.bitwidth == 4
             return torch.ops.quantized_decomposed.embedding_4bit.dtype(
                 self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
             )
@@ -658,7 +697,7 @@ def get_quant_embedding_transform(args):
         model,
         bitwidth=bitwidth,
         group_size=group_size,
-        packed=(bitwidth == 4),
+        packed=(bitwidth in [2, 4]),
     ).quantized_model()
 
 
 
@@ -172,6 +172,138 @@ def embedding_byte_dtype_out_meta(
     )
 
 
+# Schemas for the 2-bit embedding op. All four overloads share the same six
+# leading arguments; they differ only in the optional `dtype` keyword and in
+# whether the result is written into a caller-provided `out` tensor.
+quantized_decomposed_lib.define(
+    "embedding_2bit("
+    "Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "int weight_quant_min, int weight_quant_max, Tensor indices"
+    ") -> Tensor",
+)
+
+quantized_decomposed_lib.define(
+    "embedding_2bit.dtype("
+    "Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "int weight_quant_min, int weight_quant_max, Tensor indices"
+    ", *, ScalarType? dtype=None"
+    ") -> Tensor",
+)
+
+quantized_decomposed_lib.define(
+    "embedding_2bit.out("
+    "Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "int weight_quant_min, int weight_quant_max, Tensor indices"
+    ", *, Tensor(a!) out"
+    ") -> Tensor(a!)",
+)
+
+quantized_decomposed_lib.define(
+    "embedding_2bit.dtype_out("
+    "Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "int weight_quant_min, int weight_quant_max, Tensor indices"
+    ", *, ScalarType? dtype=None, Tensor(a!) out"
+    ") -> Tensor(a!)",
+)
+
+
+@impl(quantized_decomposed_lib, "embedding_2bit", "CompositeExplicitAutograd")
+def embedding_2bit(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: Optional[torch.Tensor],
+    weight_quant_min: int,
+    weight_quant_max: int,
+    indices: torch.Tensor,
+) -> torch.Tensor:
+    """Embedding lookup over a weight table packed four 2-bit values per byte.
+
+    Every byte of ``weight`` is split into four 2-bit fields (lowest bits
+    first), the fields are shifted down by 2, the unpacked table is run
+    through ``dequantize_per_channel_group`` and the rows selected by
+    ``indices`` are returned. ``weight_scales.dtype`` is forwarded as the
+    final argument of the dequantize op (presumably its output dtype).
+    """
+    embedding_weight_checks(weight, weight_scales, weight_zero_points)
+
+    # A 2-D scales tensor has one column per group; otherwise one group per row.
+    num_groups = weight_scales.size(1) if weight_scales.dim() == 2 else 1
+    # Each packed byte holds four values, so a row has 4 * weight.size(1) of them.
+    group_size = (4 * weight.size(1)) // num_groups
+
+    # Isolate each 2-bit field and move it down to bits 0-1.
+    bits_0_1 = weight & 3
+    bits_2_3 = (weight & 12) >> 2
+    bits_4_5 = (weight & 48) >> 4
+    bits_6_7 = (weight & 192) >> 6
+
+    # Stacking on a new last axis and flattening keeps the four fields of a
+    # byte adjacent, in low-to-high bit order.
+    unpacked = torch.stack((bits_0_1, bits_2_3, bits_4_5, bits_6_7), dim=-1)
+    unpacked = unpacked.view(weight.shape[0], -1)
+    # Reinterpret as int8 and subtract 2 so the stored codes 0..3 become -2..1.
+    quantized = unpacked.view(torch.int8).add(-2)
+
+    dequantized = torch.ops.quantized_decomposed.dequantize_per_channel_group.default(
+        quantized,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        quantized.dtype,
+        group_size,
+        weight_scales.dtype,
+    )
+    return torch.ops.aten.embedding.default(dequantized, indices)
+
+
+@register_fake("quantized_decomposed::embedding_2bit.out")
+def embedding_2bit_out_meta(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: Optional[torch.Tensor],
+    weight_quant_min: int,
+    weight_quant_max: int,
+    indices: torch.Tensor,
+    out: torch.Tensor,
+) -> torch.Tensor:
+    """Fake implementation registered for the ``embedding_2bit.out`` overload.
+
+    Delegates to ``embedding_2bit`` with the same arguments and returns its
+    result; ``out`` is accepted to match the schema but is not read or written.
+    """
+    result = embedding_2bit(
+        weight,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        indices,
+    )
+    return result
+
+
+@impl(quantized_decomposed_lib, "embedding_2bit.dtype", "CompositeExplicitAutograd")
+def embedding_2bit_dtype(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: Optional[torch.Tensor],
+    weight_quant_min: int,
+    weight_quant_max: int,
+    indices: torch.Tensor,
+    dtype: Optional[torch.dtype],
+) -> torch.Tensor:
+    """Embedding lookup over a 2-bit packed weight table with a caller-chosen dtype.
+
+    Every byte of ``weight`` is split into four 2-bit fields (lowest bits
+    first), the fields are shifted down by 2, the unpacked table is run
+    through ``dequantize_per_channel_group`` and the rows selected by
+    ``indices`` are returned. ``dtype`` is forwarded unchanged as the final
+    argument of the dequantize op (presumably its output dtype).
+    """
+    embedding_weight_checks(weight, weight_scales, weight_zero_points)
+
+    # A 2-D scales tensor has one column per group; otherwise one group per row.
+    num_groups = weight_scales.size(1) if weight_scales.dim() == 2 else 1
+    # Each packed byte holds four values, so a row has 4 * weight.size(1) of them.
+    group_size = (4 * weight.size(1)) // num_groups
+
+    # Isolate each 2-bit field and move it down to bits 0-1.
+    bits_0_1 = weight & 3
+    bits_2_3 = (weight & 12) >> 2
+    bits_4_5 = (weight & 48) >> 4
+    bits_6_7 = (weight & 192) >> 6
+
+    # Stacking on a new last axis and flattening keeps the four fields of a
+    # byte adjacent, in low-to-high bit order.
+    unpacked = torch.stack((bits_0_1, bits_2_3, bits_4_5, bits_6_7), dim=-1)
+    unpacked = unpacked.view(weight.shape[0], -1)
+    # Reinterpret as int8 and subtract 2 so the stored codes 0..3 become -2..1.
+    quantized = unpacked.view(torch.int8).add(-2)
+
+    dequantized = torch.ops.quantized_decomposed.dequantize_per_channel_group.default(
+        quantized,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        quantized.dtype,
+        group_size,
+        dtype,
+    )
+    return torch.ops.aten.embedding.default(dequantized, indices)
+
+
+@register_fake("quantized_decomposed::embedding_2bit.dtype_out")
+def embedding_2bit_dtype_out_meta(
+    weight: torch.Tensor,
+    weight_scales: torch.Tensor,
+    weight_zero_points: Optional[torch.Tensor],
+    weight_quant_min: int,
+    weight_quant_max: int,
+    indices: torch.Tensor,
+    dtype: Optional[torch.dtype],
+    out: torch.Tensor,
+) -> torch.Tensor:
+    """Fake implementation registered for the ``embedding_2bit.dtype_out`` overload.
+
+    Delegates to ``embedding_2bit_dtype`` with the same arguments and returns
+    its result; ``out`` is accepted to match the schema but is not read or
+    written.
+    """
+    result = embedding_2bit_dtype(
+        weight,
+        weight_scales,
+        weight_zero_points,
+        weight_quant_min,
+        weight_quant_max,
+        indices,
+        dtype,
+    )
+    return result
+
+
 quantized_decomposed_lib.define(
     "embedding_4bit(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",