
Commit 02a203e

Add support for strongly typed op_quantized_matmul, generalize dispatch strategy
Differential Revision: D80132832
Pull Request resolved: #13375
1 parent: ec67249

9 files changed: +539 −40 lines

backends/cadence/aot/functions.yaml

Lines changed: 10 additions & 0 deletions
```diff
@@ -234,6 +234,16 @@
     - arg_meta: null
       kernel_name: impl::reference::quantized_matmul_out

+- func: cadence::quantized_matmul_asym8sxasym8s_asym8s.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_matmul_asym8sxasym8s_asym8s_out
+
+- func: cadence::quantized_matmul_asym8uxasym8u_asym8u.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_matmul_asym8uxasym8u_asym8u_out
+
 - func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
```

backends/cadence/aot/functions_hifi.yaml

Lines changed: 10 additions & 0 deletions
```diff
@@ -354,6 +354,16 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_matmul_out

+- func: cadence::quantized_matmul_asym8sxasym8s_asym8s.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_matmul_asym8sxasym8s_asym8s_out
+
+- func: cadence::quantized_matmul_asym8uxasym8u_asym8u.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_matmul_asym8uxasym8u_asym8u_out
+
 - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
```
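The kernel-name suffix encodes the operand and output types: reading `asym8sxasym8s_asym8s` left to right gives X type × Y type → output type, where `asym8s` is asymmetric signed 8-bit (int8) and `asym8u` is asymmetric unsigned 8-bit (uint8). A small illustration of the decomposition; the helper below is hypothetical and exists only to make the naming pattern explicit:

```python
# Hypothetical helper, for illustration only: how the typed kernel names in
# the two YAML files above decompose. "asym8s" = asymmetric signed 8-bit,
# "asym8u" = asymmetric unsigned 8-bit; the suffix reads "<X>x<Y>_<out>".
def typed_kernel_name(namespace: str, base: str, x_t: str, y_t: str, out_t: str) -> str:
    return f"{namespace}::{base}_{x_t}x{y_t}_{out_t}_out"

print(typed_kernel_name("impl::reference", "quantized_matmul", "asym8s", "asym8s", "asym8s"))
# impl::reference::quantized_matmul_asym8sxasym8s_asym8s_out
print(typed_kernel_name("cadence::impl::HiFi", "quantized_matmul", "asym8u", "asym8u", "asym8u"))
# cadence::impl::HiFi::quantized_matmul_asym8uxasym8u_asym8u_out
```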

backends/cadence/aot/ops_registrations.py

Lines changed: 98 additions & 0 deletions
```diff
@@ -103,6 +103,18 @@
 lib.define(
     "quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "quantized_matmul_asym8sxasym8s_asym8s(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False) -> (Tensor Z)"
+)
+lib.define(
+    "quantized_matmul_asym8sxasym8s_asym8s.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False, *, Tensor(a!) out) -> Tensor(a!)"
+)
+lib.define(
+    "quantized_matmul_asym8uxasym8u_asym8u(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False) -> (Tensor Z)"
+)
+lib.define(
+    "quantized_matmul_asym8uxasym8u_asym8u.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False, *, Tensor(a!) out) -> Tensor(a!)"
+)

 lib.define(
     "convolution(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, "

@@ -700,6 +712,92 @@ def quantized_matmul_meta(
     return X.new_empty(out_size, dtype=X.dtype)


+@register_fake("cadence::quantized_matmul_asym8sxasym8s_asym8s")
+def quantized_matmul_asym8sxasym8s_asym8s_meta(
+    X: torch.Tensor,
+    X_zero_point: int,
+    Y: torch.Tensor,
+    Y_zero_point: int,
+    bias: Optional[torch.Tensor],
+    out_multiplier: int,
+    out_shift: int,
+    out_zero_point: int,
+    transposed: bool = False,
+) -> torch.Tensor:
+    X_size = list(X.size())
+    Y_size = list(Y.size())
+
+    # Get the batch dimensions for both tensors
+    X_batch_dims = X_size[:-2]
+    Y_batch_dims = Y_size[:-2]
+
+    # If they don't match, check that they're compatible
+    if X_batch_dims != Y_batch_dims:
+        assert prod(X_batch_dims) == prod(
+            Y_batch_dims
+        ), f"Batch dimensions of X and Y do not match: {X_batch_dims} vs {Y_batch_dims}"
+
+    # Get the matmul output size
+    if transposed:
+        assert X_size[-1] == Y_size[-1], "matrices cannot be multiplied"
+        mat_size = [X_size[-2], Y_size[-2]]
+    else:
+        assert X_size[-1] == Y_size[-2], "matrices cannot be multiplied"
+        mat_size = [X_size[-2], Y_size[-1]]
+
+    # Combine the larger batch dimensions with the matmul output size
+    out_size = (
+        X_batch_dims + mat_size
+        if len(X_batch_dims) > len(Y_batch_dims)
+        else Y_batch_dims + mat_size
+    )
+
+    return X.new_empty(out_size, dtype=X.dtype)
+
+
+@register_fake("cadence::quantized_matmul_asym8uxasym8u_asym8u")
+def quantized_matmul_asym8uxasym8u_asym8u_meta(
+    X: torch.Tensor,
+    X_zero_point: int,
+    Y: torch.Tensor,
+    Y_zero_point: int,
+    bias: Optional[torch.Tensor],
+    out_multiplier: int,
+    out_shift: int,
+    out_zero_point: int,
+    transposed: bool = False,
+) -> torch.Tensor:
+    X_size = list(X.size())
+    Y_size = list(Y.size())
+
+    # Get the batch dimensions for both tensors
+    X_batch_dims = X_size[:-2]
+    Y_batch_dims = Y_size[:-2]
+
+    # If they don't match, check that they're compatible
+    if X_batch_dims != Y_batch_dims:
+        assert prod(X_batch_dims) == prod(
+            Y_batch_dims
+        ), f"Batch dimensions of X and Y do not match: {X_batch_dims} vs {Y_batch_dims}"
+
+    # Get the matmul output size
+    if transposed:
+        assert X_size[-1] == Y_size[-1], "matrices cannot be multiplied"
+        mat_size = [X_size[-2], Y_size[-2]]
+    else:
+        assert X_size[-1] == Y_size[-2], "matrices cannot be multiplied"
+        mat_size = [X_size[-2], Y_size[-1]]
+
+    # Combine the larger batch dimensions with the matmul output size
+    out_size = (
+        X_batch_dims + mat_size
+        if len(X_batch_dims) > len(Y_batch_dims)
+        else Y_batch_dims + mat_size
+    )
+
+    return X.new_empty(out_size, dtype=X.dtype)
+
+
 @register_fake("cadence::im2row")
 def im2row_meta(
     input: torch.Tensor,
```
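These fake (meta) kernels compute only output shapes and dtypes, so they are easy to sanity-check under FakeTensorMode. A minimal sketch, assuming `executorch.backends.cadence.aot.ops_registrations` has been imported so the `cadence` library and its fake kernels are registered:

```python
# Minimal sketch: shape inference via the registered fake kernel, assuming
# executorch.backends.cadence.aot.ops_registrations has been imported.
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode():
    X = torch.empty(5, 2, 3, dtype=torch.int8)  # batch of 5 [2, 3] matrices
    Y = torch.empty(5, 3, 4, dtype=torch.int8)  # matching batch of [3, 4]
    Z = torch.ops.cadence.quantized_matmul_asym8sxasym8s_asym8s(
        X, 0, Y, 0, None, 1, 0, 0, False
    )
    print(Z.shape)  # torch.Size([5, 2, 4]), per the shape rule above
```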

backends/cadence/aot/tests/test_type_dispatch_passes.py

Lines changed: 52 additions & 0 deletions
```diff
@@ -185,3 +185,55 @@ def test_uint8_dispatch_quantized_relu(self) -> None:
             ),
             1,
         )
+
+    def test_int8_dispatch_quantized_matmul(self) -> None:
+        """Test int8 x int8 inputs should dispatch to asym8sxasym8s_asym8s variant for quantized_matmul"""
+        x = torch.randint(-128, 127, (2, 3), dtype=torch.int8)
+        y = torch.randint(-128, 127, (3, 4), dtype=torch.int8)
+        bias = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32)
+        gm = single_op_builder(
+            placeholders=(x, y, bias),
+            op=exir_ops.edge.cadence.quantized_matmul.default,
+            args=(x, 0, y, 0, bias, 1, 0, 0, False),
+        )
+        p = CompileTimeTypeDispatchPass()
+        gm = cast(PassResult, p(gm)).graph_module
+        # Original op should be replaced
+        self.assertEqual(
+            count_node(gm, exir_ops.edge.cadence.quantized_matmul.default),
+            0,
+        )
+        # Should be replaced with int8 specific variant
+        self.assertEqual(
+            count_node(
+                gm,
+                exir_ops.edge.cadence.quantized_matmul_asym8sxasym8s_asym8s.default,
+            ),
+            1,
+        )
+
+    def test_uint8_dispatch_quantized_matmul(self) -> None:
+        """Test uint8 x uint8 inputs should dispatch to asym8uxasym8u_asym8u variant for quantized_matmul"""
+        x = torch.randint(0, 255, (2, 3), dtype=torch.uint8)
+        y = torch.randint(0, 255, (3, 4), dtype=torch.uint8)
+        bias = torch.randint(-2147483648, 2147483647, (4,), dtype=torch.int32)
+        gm = single_op_builder(
+            placeholders=(x, y, bias),
+            op=exir_ops.edge.cadence.quantized_matmul.default,
+            args=(x, 0, y, 0, bias, 1, 0, 0, False),
+        )
+        p = CompileTimeTypeDispatchPass()
+        gm = cast(PassResult, p(gm)).graph_module
+        # Original op should be replaced
+        self.assertEqual(
+            count_node(gm, exir_ops.edge.cadence.quantized_matmul.default),
+            0,
+        )
+        # Should be replaced with uint8 specific variant
+        self.assertEqual(
+            count_node(
+                gm,
+                exir_ops.edge.cadence.quantized_matmul_asym8uxasym8u_asym8u.default,
+            ),
+            1,
+        )
```

backends/cadence/aot/type_dispatch.py

Lines changed: 47 additions & 40 deletions
```diff
@@ -6,6 +6,9 @@

 # pyre-strict

+from dataclasses import dataclass
+from typing import Optional
+
 import torch
 from executorch.backends.cadence.aot.pass_utils import (
     CadencePassAttribute,

@@ -17,29 +20,42 @@
 from torch.fx.node import Argument


+@dataclass
+class OpConfig:
+    """Configuration for type dispatch operations."""
+
+    base_name: str
+    input_arg_idx: int = 0
+    weight_arg_idx: Optional[int] = None
+    variant: str = "per_tensor"
+
+
 @register_cadence_pass(CadencePassAttribute(opt_level=4))
 class CompileTimeTypeDispatchPass(ExportPass):
     """
     Replaces generic ops with ops that have explicit types.
     """

-    _BINARY_TYPE_DISPATCH_MAP: dict[tuple[torch.dtype, torch.dtype], str] = {
+    _TYPE_DISPATCH_MAP: dict[tuple[torch.dtype, ...], str] = {
+        (torch.int8,): "asym8s_asym8s",
+        (torch.uint8,): "asym8u_asym8u",
         (torch.int8, torch.int8): "asym8sxasym8s_asym8s",
         (torch.uint8, torch.uint8): "asym8uxasym8u_asym8u",
     }

-    _UNARY_TYPE_DISPATCH_MAP: dict[torch.dtype, str] = {
-        torch.int8: "asym8s_asym8s",
-        torch.uint8: "asym8u_asym8u",
-    }
-
-    _BINARY_SUPPORTED_OPS: dict[OpOverload, str] = {
-        exir_ops.edge.cadence.quantized_fully_connected.per_tensor: "quantized_fully_connected",
-        exir_ops.edge.cadence.quantized_linear.per_tensor: "quantized_linear",
-    }
-
-    _SUPPORTED_UNARY_OPS: dict[OpOverload, str] = {
-        exir_ops.edge.cadence.quantized_relu.per_tensor: "quantized_relu",
+    _SUPPORTED_OPS: dict[OpOverload, OpConfig] = {
+        exir_ops.edge.cadence.quantized_fully_connected.per_tensor: OpConfig(
+            "quantized_fully_connected", input_arg_idx=0, weight_arg_idx=1
+        ),
+        exir_ops.edge.cadence.quantized_linear.per_tensor: OpConfig(
+            "quantized_linear", input_arg_idx=0, weight_arg_idx=1
+        ),
+        exir_ops.edge.cadence.quantized_matmul.default: OpConfig(
+            "quantized_matmul", input_arg_idx=0, weight_arg_idx=2, variant="default"
+        ),
+        exir_ops.edge.cadence.quantized_relu.per_tensor: OpConfig(
+            "quantized_relu", input_arg_idx=0
+        ),
     }

     def call_operator(

@@ -49,37 +65,28 @@ def call_operator(
         kwargs: dict[str, Argument],
         meta: NodeMetadata,
     ) -> ProxyValue:
-        if op in self._BINARY_SUPPORTED_OPS:
-            # pyre-ignore[16]: None has no attribute `to_tensor`.
-            input_dtype = args[0].to_tensor().dtype
-            weight_dtype = args[1].to_tensor().dtype
-            dtype_pair = (input_dtype, weight_dtype)
-
-            if dtype_pair not in self._BINARY_TYPE_DISPATCH_MAP:
-                raise RuntimeError(
-                    f"Unsupported input types for {op}: {input_dtype} and {weight_dtype}"
-                )
-
-            base_op_name = self._BINARY_SUPPORTED_OPS[op]
-            type_suffix = self._BINARY_TYPE_DISPATCH_MAP[dtype_pair]
-
-            typed_op_name = f"{base_op_name}_{type_suffix}"
-            typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor
+        if op not in self._SUPPORTED_OPS:
+            return super().call_operator(op, args, kwargs, meta)

-            return super().call_operator(typed_op, args, kwargs, meta)
+        config = self._SUPPORTED_OPS[op]

-        elif op in self._SUPPORTED_UNARY_OPS:
-            input_dtype = args[0].to_tensor().dtype
+        # pyre-ignore[16]: None has no attribute `to_tensor`.
+        input_dtype = args[config.input_arg_idx].to_tensor().dtype

-            if input_dtype not in self._UNARY_TYPE_DISPATCH_MAP:
-                raise RuntimeError(f"Unsupported input type for {op}: {input_dtype}")
+        if config.weight_arg_idx is not None:
+            weight_dtype = args[config.weight_arg_idx].to_tensor().dtype
+            dtype_key = (input_dtype, weight_dtype)
+        else:
+            dtype_key = (input_dtype,)

-            base_op_name = self._SUPPORTED_UNARY_OPS[op]
-            type_suffix = self._UNARY_TYPE_DISPATCH_MAP[input_dtype]
+        if dtype_key not in self._TYPE_DISPATCH_MAP:
+            raise RuntimeError(f"Unsupported input types for {op}: {dtype_key}")

-            typed_op_name = f"{base_op_name}_{type_suffix}"
-            typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor
+        type_suffix = self._TYPE_DISPATCH_MAP[dtype_key]
+        typed_op_name = f"{config.base_name}_{type_suffix}"

-            return super().call_operator(typed_op, args, kwargs, meta)
+        typed_op = getattr(
+            getattr(exir_ops.edge.cadence, typed_op_name), config.variant
+        )

-        return super().call_operator(op, args, kwargs, meta)
+        return super().call_operator(typed_op, args, kwargs, meta)
```
