2222    "get_quant_patterns_and_replacements" ,
2323]
2424
25+ 
26+ from  torch  import  Tensor 
27+ from  torch .library  import  custom_op 
28+ @custom_op ("quant_fusion::_pack_embedding_weight" , mutates_args = ()) 
29+ def  _pack_embedding_weight (weight : Tensor , bitwidth : int ) ->  Tensor :
30+     num_embeddings , embedding_dim  =  weight .shape 
31+ 
32+     if  bitwidth  ==  2 :
33+         assert  embedding_dim  %  4  ==  0 , "embedding_dim must be divisible by 4" 
34+         weight_range_shifted  =  weight .add (2 ).view (torch .uint8 )
35+         weight_view  =  weight_range_shifted .view (
36+             num_embeddings , embedding_dim  //  4 , 4 
37+         )
38+         weight_0  =  weight_view [:, :, 0 ]
39+         weight_1  =  weight_view [:, :, 1 ] <<  2 
40+         weight_2  =  weight_view [:, :, 2 ] <<  4 
41+         weight_3  =  weight_view [:, :, 3 ] <<  6 
42+         packed_weight  =  weight_0  +  weight_1  +  weight_2  +  weight_3 
43+         return  packed_weight 
44+     elif  bitwidth  ==  4 :
45+         assert  embedding_dim  %  2  ==  0 , "embedding_dim must be divisible by 2" 
46+         weight_range_shifted  =  weight .add (8 ).view (torch .uint8 )
47+         weight_view  =  weight_range_shifted .view (
48+             weight .shape [0 ], weight .shape [1 ] //  2 , 2 
49+         )
50+         weight_even  =  weight_view [:, :, 0 ] *  16   # left shift 4 
51+         weight_odd  =  weight_view [:, :, 1 ]
52+         packed_weight  =  weight_even  +  weight_odd 
53+         return  packed_weight 
54+     elif  bitwidth  ==  8 :
55+         return  weight 
56+     
57+     raise  RuntimeError (f"Unsupported bitwidth { bitwidth }  " )
58+ 
59+ 
# Use register_fake to add a ``FakeTensor`` kernel for the operator
@_pack_embedding_weight.register_fake
def _(weight, bitwidth):
    """Shape/dtype-only kernel used during tracing/export.

    Mirrors the real op: 2-/4-bit packing folds ``values_per_byte`` values
    into each uint8 byte; the 8-bit path is a no-op in the real kernel, so
    the input dtype is preserved there.
    """
    # Parameter renamed from ``bit_width`` to ``bitwidth`` to match the real
    # op's schema, so keyword calls resolve identically on both kernels.
    assert bitwidth in [2, 4, 8]
    num_embeddings, embedding_dim = weight.shape
    values_per_byte = 8 // bitwidth
    assert embedding_dim % values_per_byte == 0
    # The real kernel returns ``weight`` unchanged for bitwidth 8, so only
    # the 2-/4-bit paths produce uint8.
    dtype = weight.dtype if bitwidth == 8 else torch.uint8
    return torch.empty(
        num_embeddings,
        embedding_dim // values_per_byte,
        dtype=dtype,
        device=weight.device,
    )
68+ 
69+ 
2570# TODO: extending an existing library that is defined in OSS might be a bit 
2671# confusing, we can investigate if it is possible to define a new library 
2772
@@ -70,7 +115,7 @@ def embedding_weight_checks(weight, weight_scales, weight_zero_points):
70115        weight_zero_points  is  None  or  weight_zero_points .dtype  ==  weight_scales .dtype 
71116    ), "Expecting weight_zero_points to be None or have same dtype as weight_scales" 
72117    assert  (
73-         weight_zero_points  is  None  or  weight_zero_points .dim () ==   1 
118+         weight_zero_points  is  None  or  weight_zero_points .dim () in  [ 1 ,  2 ] 
74119    ), f"Expecting weight_zero_points tensor to be None or have dim()==1, but found { weight_zero_points .dim ()}  " 
75120    assert  weight_zero_points  is  None  or  weight_zero_points .size (0 ) ==  weight .size (
76121        0 
@@ -233,6 +278,19 @@ def embedding_2bit(
233278    )
234279    return  torch .ops .aten .embedding .default (weight , indices )
235280
@register_fake("quantized_decomposed::embedding_2bit")
def _(
    weight: torch.Tensor,
    weight_scales: torch.Tensor,
    weight_zero_points: Optional[torch.Tensor],
    weight_quant_min: int,
    weight_quant_max: int,
    indices: torch.Tensor,
):
    """FakeTensor kernel: output shape of a 2-bit-packed embedding lookup.

    ``weight`` holds four 2-bit values per byte, so the unpacked embedding
    dim is four times the packed one. Only shape/dtype/device matter for a
    fake, so avoid instantiating a real ``nn.Embedding`` (which allocated and
    randomly initialized the whole table on every trace).
    """
    embedding_dim = weight.shape[1] * 4
    # NOTE(review): dtype follows the previous nn.Embedding-based fake
    # (default float dtype); confirm against the real kernel's output dtype.
    return torch.empty((*indices.shape, embedding_dim), device=weight.device)
236294
237295@register_fake ("quantized_decomposed::embedding_2bit.out" ) 
238296def  embedding_2bit_out_meta (
@@ -253,7 +311,6 @@ def embedding_2bit_out_meta(
253311        indices ,
254312    )
255313
256- 
257314@impl (quantized_decomposed_lib , "embedding_2bit.dtype" , "CompositeExplicitAutograd" ) 
258315def  embedding_2bit_dtype (
259316    weight : torch .Tensor ,
@@ -295,6 +352,20 @@ def embedding_2bit_dtype(
295352    )
296353    return  torch .ops .aten .embedding .default (weight , indices )
297354
@register_fake("quantized_decomposed::embedding_2bit.dtype")
def _(
    weight: torch.Tensor,
    weight_scales: torch.Tensor,
    weight_zero_points: Optional[torch.Tensor],
    weight_quant_min: int,
    weight_quant_max: int,
    indices: torch.Tensor,
    dtype: Optional[torch.dtype],
) -> torch.Tensor:
    """FakeTensor kernel for the dtype variant of the 2-bit embedding op.

    Same shape computation as the non-dtype variant (packed dim * 4), with
    the result cast via ``.to(dtype)`` exactly as before. Avoids allocating
    a real ``nn.Embedding`` table, which a fake kernel never needs.
    """
    embedding_dim = weight.shape[1] * 4
    out = torch.empty((*indices.shape, embedding_dim), device=weight.device)
    # Preserve the original fake's cast semantics, including dtype=None.
    return out.to(dtype)
298369
299370@register_fake ("quantized_decomposed::embedding_2bit.dtype_out" ) 
300371def  embedding_2bit_dtype_out_meta (
@@ -377,6 +448,19 @@ def embedding_4bit(
377448    )
378449    return  torch .ops .aten .embedding .default (weight , indices )
379450
@register_fake("quantized_decomposed::embedding_4bit")
def _(
    weight: torch.Tensor,
    weight_scales: torch.Tensor,
    weight_zero_points: Optional[torch.Tensor],
    weight_quant_min: int,
    weight_quant_max: int,
    indices: torch.Tensor,
):
    """FakeTensor kernel: output shape of a 4-bit-packed embedding lookup.

    ``weight`` holds two 4-bit values per byte, so the unpacked embedding
    dim is twice the packed one. Only shape/dtype/device matter for a fake,
    so avoid instantiating a real ``nn.Embedding`` (which allocated and
    randomly initialized the whole table on every trace).
    """
    embedding_dim = weight.shape[1] * 2
    # NOTE(review): dtype follows the previous nn.Embedding-based fake
    # (default float dtype); confirm against the real kernel's output dtype.
    return torch.empty((*indices.shape, embedding_dim), device=weight.device)
380464
381465@register_fake ("quantized_decomposed::embedding_4bit.out" ) 
382466def  embedding_4bit_out_meta (
@@ -437,6 +521,20 @@ def embedding_4bit_dtype(
437521    )
438522    return  torch .ops .aten .embedding .default (weight , indices )
439523
@register_fake("quantized_decomposed::embedding_4bit.dtype")
def _(
    weight: torch.Tensor,
    weight_scales: torch.Tensor,
    weight_zero_points: Optional[torch.Tensor],
    weight_quant_min: int,
    weight_quant_max: int,
    indices: torch.Tensor,
    dtype: Optional[torch.dtype],
) -> torch.Tensor:
    """FakeTensor kernel for the dtype variant of the 4-bit embedding op.

    Same shape computation as the non-dtype variant (packed dim * 2), with
    the result cast via ``.to(dtype)`` exactly as before. Avoids allocating
    a real ``nn.Embedding`` table, which a fake kernel never needs.
    """
    embedding_dim = weight.shape[1] * 2
    out = torch.empty((*indices.shape, embedding_dim), device=weight.device)
    # Preserve the original fake's cast semantics, including dtype=None.
    return out.to(dtype)
440538
441539@register_fake ("quantized_decomposed::embedding_4bit.dtype_out" ) 
442540def  embedding_4bit_dtype_out_meta (
@@ -872,6 +970,76 @@ def replacement(x, dim, start, end, x_scale, x_zero_point, x_qmin, x_qmax):
872970        )
873971    ]
874972
def _get_embedding_ops_patterns_and_replacements_torchao() -> List[Tuple[Callable, Callable, List[Callable]]]:
    """Return (pattern, replacement, match_filters) triples that fuse a
    torchao ``dequantize_affine`` followed by ``aten.embedding`` into the
    corresponding ``quantized_decomposed.embedding_*`` op.

    Covers byte (8-bit), 2-bit, and 4-bit weights, each with and without an
    explicit output dtype. The quant_min/quant_max constants baked into each
    pattern (-128/127, -2/1, -8/7) select which bitwidth it matches; the
    2-/4-bit replacements additionally pack the int8 table via
    ``quant_fusion._pack_embedding_weight``. All callables are traced and
    lowered to edge ops before being handed to the subgraph rewriter, so the
    bodies below must stay structurally equivalent to the graphs they match.
    """

    def embedding_byte_pattern(indices, int_data, group_size, scale, zero_point):
        # Groupwise dequant (block size [1, group_size]) then table lookup.
        dq = torch.ops.torchao.dequantize_affine.default(
            int_data, [1, group_size], scale, zero_point, torch.int8, -128, 127
        )
        return torch.ops.aten.embedding.default(dq, indices)

    def embedding_byte_replacement(indices, int_data, group_size, scale, zero_point):
        # embedding_byte expects zero points in the same dtype as the scales.
        zero_point_dtype_cast = torch.ops.aten.to.dtype(zero_point, scale.dtype)
        return torch.ops.quantized_decomposed.embedding_byte.default(
            int_data,
            scale,
            zero_point_dtype_cast,
            -128,
            127,
            indices,
        )

    def embedding_byte_dtype_pattern(indices, int_data, group_size, scale, zero_point, output_dtype):
        # Same as above but dequantize_affine carries an explicit output dtype
        # ('INT' is the zero-point domain argument).
        dq = torch.ops.torchao.dequantize_affine.default(
            int_data, [1, group_size], scale, zero_point, torch.int8, -128, 127, 'INT', output_dtype
        )
        return torch.ops.aten.embedding.default(dq, indices)

    def embedding_byte_dtype_replacement(indices, int_data, group_size, scale, zero_point, output_dtype):
        zero_point_dtype_cast = torch.ops.aten.to.dtype(zero_point, scale.dtype)
        return torch.ops.quantized_decomposed.embedding_byte.dtype(
            int_data,
            scale,
            zero_point_dtype_cast,
            -128,
            127,
            indices,
            dtype=output_dtype
        )

    def embedding_2bit_pattern(indices, int_data, group_size, scale, zero_point):
        # 2-bit range is [-2, 1].
        dq = torch.ops.torchao.dequantize_affine.default(
            int_data, [1, group_size], scale, zero_point, torch.int8, -2, 1
        )
        return torch.ops.aten.embedding.default(dq, indices)

    def embedding_2bit_replacement(indices, int_data, group_size, scale, zero_point):
        # Pack four 2-bit values per byte before handing off to the fused op.
        packed_int_data = torch.ops.quant_fusion._pack_embedding_weight.default(int_data, 2)
        zero_point_dtype_cast = torch.ops.aten.to.dtype(zero_point, scale.dtype)
        return torch.ops.quantized_decomposed.embedding_2bit.default(packed_int_data, scale, zero_point_dtype_cast, -2, 1, indices)

    def embedding_2bit_dtype_pattern(indices, int_data, group_size, scale, zero_point, output_dtype):
        dq = torch.ops.torchao.dequantize_affine.default(
            int_data, [1, group_size], scale, zero_point, torch.int8, -2, 1, 'INT', output_dtype
        )
        return torch.ops.aten.embedding.default(dq, indices)

    def embedding_2bit_dtype_replacement(indices, int_data, group_size, scale, zero_point, output_dtype):
        packed_int_data = torch.ops.quant_fusion._pack_embedding_weight.default(int_data, 2)
        zero_point_dtype_cast = torch.ops.aten.to.dtype(zero_point, scale.dtype)
        return torch.ops.quantized_decomposed.embedding_2bit.dtype(packed_int_data, scale, zero_point_dtype_cast, -2, 1, indices, dtype=output_dtype)

    def embedding_4bit_pattern(indices, int_data, group_size, scale, zero_point):
        # 4-bit range is [-8, 7].
        dq = torch.ops.torchao.dequantize_affine.default(
            int_data, [1, group_size], scale, zero_point, torch.int8, -8, 7
        )
        return torch.ops.aten.embedding.default(dq, indices)

    def embedding_4bit_replacement(indices, int_data, group_size, scale, zero_point):
        # Pack two 4-bit values per byte before handing off to the fused op.
        packed_int_data = torch.ops.quant_fusion._pack_embedding_weight.default(int_data, 4)
        zero_point_dtype_cast = torch.ops.aten.to.dtype(zero_point, scale.dtype)
        return torch.ops.quantized_decomposed.embedding_4bit.default(packed_int_data, scale, zero_point_dtype_cast, -8, 7, indices)

    def embedding_4bit_dtype_pattern(indices, int_data, group_size, scale, zero_point, output_dtype):
        dq = torch.ops.torchao.dequantize_affine.default(
            int_data, [1, group_size], scale, zero_point, torch.int8, -8, 7, 'INT', output_dtype
        )
        return torch.ops.aten.embedding.default(dq, indices)

    def embedding_4bit_dtype_replacement(indices, int_data, group_size, scale, zero_point, output_dtype):
        packed_int_data = torch.ops.quant_fusion._pack_embedding_weight.default(int_data, 4)
        zero_point_dtype_cast = torch.ops.aten.to.dtype(zero_point, scale.dtype)
        return torch.ops.quantized_decomposed.embedding_4bit.dtype(packed_int_data, scale, zero_point_dtype_cast, -8, 7, indices, dtype=output_dtype)

    # Each entry: (traced pattern graph, traced replacement graph, match filters).
    # No extra match filters are needed; the constants in each pattern already
    # disambiguate the bitwidth.
    return [
        (_trace_and_lower_to_edge_ops(embedding_byte_pattern), _trace_and_lower_to_edge_ops(embedding_byte_replacement), []),
        (_trace_and_lower_to_edge_ops(embedding_byte_dtype_pattern), _trace_and_lower_to_edge_ops(embedding_byte_dtype_replacement), []),
        (_trace_and_lower_to_edge_ops(embedding_2bit_pattern), _trace_and_lower_to_edge_ops(embedding_2bit_replacement), []),
        (_trace_and_lower_to_edge_ops(embedding_2bit_dtype_pattern), _trace_and_lower_to_edge_ops(embedding_2bit_dtype_replacement), []),
        (_trace_and_lower_to_edge_ops(embedding_4bit_pattern), _trace_and_lower_to_edge_ops(embedding_4bit_replacement), []),
        (_trace_and_lower_to_edge_ops(embedding_4bit_dtype_pattern), _trace_and_lower_to_edge_ops(embedding_4bit_dtype_replacement), []),
    ]
1042+ 
8751043
8761044def  _get_embedding_ops_patterns_and_replacements () ->  (
8771045    List [Tuple [Callable , Callable , List [Callable ]]]
@@ -1167,5 +1335,6 @@ def get_quant_patterns_and_replacements() -> (
11671335            * _get_slice_patterns_and_replacements (),
11681336            # *_get_fixed_qparams_ops_patterns_and_replacements(), 
11691337            * _get_embedding_ops_patterns_and_replacements (),
1338+             * _get_embedding_ops_patterns_and_replacements_torchao (),
11701339        ]
11711340    )
0 commit comments