pytorch
diff --git a/‎backends/xnnpack/_passes/fuse_activation_pass.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/xnnpack/_passes/fuse_activation_pass.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/xnnpack/_passes/fuse_batch_norm_with_conv.py‎
Lines changed: 17 additions & 1 deletion b/‎backends/xnnpack/_passes/fuse_batch_norm_with_conv.py‎
Lines changed: 17 additions & 1 deletion
diff --git a/‎backends/xnnpack/operators/node_visitor.py‎
Lines changed: 30 additions & 1 deletion b/‎backends/xnnpack/operators/node_visitor.py‎
Lines changed: 30 additions & 1 deletion
diff --git a/‎backends/xnnpack/operators/op_conv2d.py‎
Lines changed: 22 additions & 11 deletions b/‎backends/xnnpack/operators/op_conv2d.py‎
Lines changed: 22 additions & 11 deletions
diff --git a/‎backends/xnnpack/partition/config/gemm_configs.py‎
Lines changed: 17 additions & 4 deletions b/‎backends/xnnpack/partition/config/gemm_configs.py‎
Lines changed: 17 additions & 4 deletions
diff --git a/‎backends/xnnpack/partition/configs.py‎
Lines changed: 4 additions & 0 deletions b/‎backends/xnnpack/partition/configs.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/xnnpack/runtime/XNNCompiler.cpp‎
Lines changed: 49 additions & 0 deletions b/‎backends/xnnpack/runtime/XNNCompiler.cpp‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎backends/xnnpack/serialization/runtime_schema.fbs‎
Lines changed: 1 addition & 0 deletions b/‎backends/xnnpack/serialization/runtime_schema.fbs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/xnnpack/serialization/schema.fbs‎
Lines changed: 1 addition & 0 deletions b/‎backends/xnnpack/serialization/schema.fbs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/xnnpack/serialization/xnnpack_graph_schema.py‎
Lines changed: 6 additions & 0 deletions b/‎backends/xnnpack/serialization/xnnpack_graph_schema.py‎
Lines changed: 6 additions & 0 deletions
@@ -68,6 +68,8 @@ def call(self, graph_module: torch.fx.GraphModule):
                         preceding_op.op == "call_function"
                         and preceding_op.target in self.FUSEABLE_OPS
                     ):
+                        if len(preceding_op.users) > 1:
+                            continue
                         # Delete activation, and embed metadata into preceding op
                         output_min_max = self.get_output_min_max_from_activation(
                             activation_node
 
@@ -5,12 +5,18 @@
 # LICENSE file in the root directory of this source tree.
 
 import operator
+from typing import cast, List
 
 import torch
 
+from executorch.backends.transforms import get_shape
 from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
 
-from executorch.backends.xnnpack.utils.utils import get_param_tensor, is_param_node
+from executorch.backends.xnnpack.utils.utils import (
+    get_input_node,
+    get_param_tensor,
+    is_param_node,
+)
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import PassResult
@@ -134,6 +140,16 @@ def can_fuse(
         Determine whether a batch norm node can be fused with a preceding conv node.
         """
 
+        is_transpose = conv.args[6]
+        kernel_node = get_input_node(conv, 1)
+        kernel_shape = get_shape(kernel_node)
+        stride = cast(List[int], conv.args[3])
+
+        # A transposed convolution is only considered fuseable when the two
+        # trailing kernel dims match the stride; otherwise fusion is rejected.
+        # NOTE(review): kernel_shape[-1] is compared with stride[0] and
+        # kernel_shape[-2] with stride[1]. If the kernel is laid out as
+        # (..., height, width) and stride as (height, width), these indices
+        # look swapped for non-square cases -- confirm the intended pairing.
+        if is_transpose and (
+            kernel_shape[-1] != stride[0] or kernel_shape[-2] != stride[1]
+        ):
+            return False
+
         # All the users of batchnorm node must be getitem ops. batchnorm
         # returns a 3-element tuple. Each user must only access the first
         # element of the tuple.
 
@@ -337,7 +337,7 @@ def _check_per_channel_group_params(
         # For now group quantization is only supported for 4b weights
         assert quant_params.is_qc4w, "Only 4b group quantization is supported"
 
-    def define_tensor(
+    def define_tensor(  # noqa: C901
         self,
         tensor: torch.fx.Node,
         xnn_graph: XNNGraph,
@@ -346,6 +346,8 @@ def define_tensor(
         swap_nc_for_depthwise_weights: bool = False,
         quant_params: Optional[QuantParams] = None,
         fp32_static_weights: bool = False,
+        swap_in_out_for_transpose_weights: bool = False,
+        groups: int = 1,
     ) -> None:
         """
         Defines an tensor value into the XNNGraph
@@ -365,6 +367,9 @@ def define_tensor(
                         swap will happen before converting to nhwc.
             quant_params: Quantization meta data for this tensor, None if it is not quantized
             fp32_static_weights: XNN_FLAG_FP32_STATIC_WEIGHTS for fp16 conv
+            swap_in_out_for_transpose_weights: bool to indicate whether the tensor should be
+                permuted and reshaped from (inc, oc/groups, height, width) to (oc, inc/groups, height, width)
+            groups: number of groups for swap_in_out_for_transpose_weights
         """
 
         if tensor in vals_to_ids:
@@ -397,12 +402,16 @@ def define_tensor(
             swap_nc_for_depthwise_weights,
             quant_params,
             fp32_static_weights,
+            swap_in_out_for_transpose_weights,
+            groups,
         )
 
         # convert tensor shape must reflect memory format, default is contiguous, so
         # only permute shape if we are converting the tensor to nhwc format
         if swap_nc_for_depthwise_weights:
             dims = [dims[1], dims[0]] + dims[2:]
+        if swap_in_out_for_transpose_weights:
+            dims = [dims[1] * groups, dims[0] // groups] + dims[2:]
         if convert_to_nhwc:
             check_or_raise(len(dims) == 4, "Converting to nhwc requires 4d tensor")
             dims = [dims[i] for i in PERM_NCHW_TO_NHWC]
@@ -433,6 +442,14 @@ def define_tensor(
             else:
                 assert f"Unsupported weight per channel quantization axis for depthwise conv2d: {quant_params.axis}, expecting 0."
 
+        if swap_in_out_for_transpose_weights and (
+            quant_params and quant_params.per_channel
+        ):
+            # Only axis 0 is accepted for per-channel quantized transpose-conv
+            # weights. A bare `assert "<message>"` always passes (a non-empty
+            # string is truthy), so the check is made explicit here.
+            axis_err = f"Unsupported weight per channel quantization axis for conv_transpose2d: {quant_params.axis}, expecting 0."
+            check_or_raise(quant_params.axis == 0, axis_err)
+            # Re-point the quantization axis at the last dim of the final dims.
+            quant_params.axis = len(dims) - 1
+
         # Serialize tensor value
         ser_val = (
             XValue(xvalue_union=tvalue)
@@ -495,6 +512,8 @@ def get_serialized_buffer_index(
         swap_nc_for_depthwise_weights: bool,
         quant_params: Optional[QuantParams],
         fp32_static_weights: bool = False,
+        swap_in_out_for_transpose_weights: bool = False,
+        groups: int = 1,
     ) -> int:
         """
         If tensor holds some constant data, serialize it and return the
@@ -546,6 +565,16 @@ def get_serialized_buffer_index(
                 dims=((1, 0) + tuple(range(2, const_val.dim())))
             ).contiguous()
 
+        if swap_in_out_for_transpose_weights:
+            shape = const_val.shape
+            const_val = const_val.reshape(
+                (groups, const_val.shape[0] // groups) + const_val.shape[1:]
+            )
+            const_val = const_val.permute((0, 2, 1) + tuple(range(3, const_val.dim())))
+            const_val = const_val.reshape(
+                (shape[1] * groups, shape[0] // groups) + shape[2:]
+            ).contiguous()
+
         if convert_to_nhwc:
             const_val = const_val.to(memory_format=torch.channels_last)
 
 
@@ -16,6 +16,7 @@
 from executorch.backends.xnnpack.operators.quant_params import QuantParams
 from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
     XNNConv2d,
+    XNNConvTranspose2d,
     XNNDepthwiseConv2d,
     XNNGraph,
     XNode,
@@ -52,21 +53,31 @@ def define_node(
         )  # NHWC input
         kwargs["input1_id"] = vals_to_ids[get_input_node(node, 0)]
 
-        # filter shape for pytorch convolution is (oc, inc/groups, height, width)
-        # shape for xnnpack convolution is (oc, height, width, inc/groups), to convert
-        # to the proper shape, this is essentially a NCHW to NHWC conversion
+        # filter shape for pytorch convolution is (oc, inc/groups, height, width),
+        # filter shape for pytorch transpose convolution is (inc, oc/groups, height, width),
+        # shape for xnnpack convolution is (oc, height, width, inc/groups),
+        # shape for xnnpack transpose convolution is (oc, height, width, inc/groups),
+        # to convert to the proper shape, this is essentially a NCHW to NHWC conversion
         kernel_node = get_input_node(node, 1)
         kernel_shape = get_shape(kernel_node)
         groups = cast(int, node.args[8])
-        group_input_channels = kernel_shape[1]
-        group_output_channels = int(kernel_shape[0] / groups)
+        is_transpose = node.args[6]
+
+        if is_transpose:
+            group_input_channels = int(kernel_shape[0] / groups)
+            group_output_channels = kernel_shape[1]
+        else:
+            group_input_channels = kernel_shape[1]
+            group_output_channels = int(kernel_shape[0] / groups)
 
         # XNNPACK expects the kernel's N and C dimensions to be swapped for
         # Depthwise Convolution, which occurs under the following conditions:
         # 1) groups = input_channels (i.e. group_input_channels = 1)
         # 2) output_channels is a positive integer multiple of input channels
-        is_depthwise_conv = (group_input_channels == 1) and (
-            group_output_channels % group_input_channels == 0
+        is_depthwise_conv = (
+            (group_input_channels == 1)
+            and (group_output_channels % group_input_channels == 0)
+            and not is_transpose
         )
         weight_quant_params = QuantParams.from_weights(
             kernel_node, self._exported_program
@@ -81,6 +92,8 @@ def define_node(
             swap_nc_for_depthwise_weights=is_depthwise_conv,
             quant_params=weight_quant_params,
             fp32_static_weights=fp32_static_weights,
+            swap_in_out_for_transpose_weights=is_transpose,
+            groups=groups,
         )
         kwargs["filter_id"] = vals_to_ids[get_input_node(node, 1)]
 
@@ -120,10 +133,6 @@ def define_node(
         if len(padding) == 1:
             padding = padding + padding
 
-        # args[6] = transposed
-        check_or_raise(
-            not cast(bool, node.args[6]), "No support for transposed convolution"
-        )
         # args[7] = output padding
         check_or_raise(
             all(out_pad == 0 for out_pad in cast(List[int], node.args[7])),
@@ -152,6 +161,8 @@ def define_node(
 
         if is_depthwise_conv:
             conv_node_type = XNNDepthwiseConv2d
+        elif is_transpose:
+            conv_node_type = XNNConvTranspose2d
         else:
             conv_node_type = XNNConv2d
 
 
@@ -9,6 +9,7 @@
 from typing import cast, List, Optional, Tuple
 
 import torch
+from executorch.backends.xnnpack.operators.quant_params import QuantParams
 from executorch.backends.xnnpack.partition.config.xnnpack_config import (
     ConfigPrecisionType,
     XNNPartitionerConfig,
@@ -327,11 +328,23 @@ def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
             why(node, "Only support 1D + 2D Conv")
             return False  # Only support 1D + 2D Conv
 
-        transposed = cast(bool, node.args[6])
-        if transposed:
-            why(node, "Transposed Conv is not supported")
-            return False  # Currently don't support transposed conv
+        kernel_node = get_input_node(node, 1)
+        weight_quant_params = QuantParams.from_weights(kernel_node, ep)
 
+        is_transpose = node.args[6]
+        groups = cast(int, node.args[8])
+        # Reject grouped transposed convs whose weights are per-channel quantized.
+        if (
+            is_transpose
+            and weight_quant_params is not None
+            and weight_quant_params.per_channel
+            and groups > 1
+        ):
+            why(
+                node,
+                "XNNPACK does not support per input channel quantization "
+                "for transpose convolutions with groups > 1",
+            )
+            return False
         return True
 
     def supported_precision_types(self):
 
@@ -73,6 +73,7 @@
     torch.nn.BatchNorm2d,
     torch.nn.BatchNorm1d,
     torch.nn.Conv2d,
+    torch.nn.ConvTranspose2d,
     torch.nn.Linear,
     torch.nn.functional.linear,
     torch.nn.PReLU,  # Without this, the PReLU weight becomes not a get_attr
@@ -130,8 +131,11 @@
     torch.nn.functional.conv1d,
     torch.ao.nn.quantized.reference.modules.conv.Conv1d,
     torch.nn.Conv2d,
+    torch.nn.ConvTranspose2d,
     torch.nn.functional.conv2d,
+    torch.nn.functional.conv_transpose2d,
     torch.ao.nn.quantized.reference.modules.conv.Conv2d,
+    torch.ao.nn.quantized.reference.modules.conv.ConvTranspose2d,
     torch.nn.BatchNorm1d,
     torch.nn.BatchNorm2d,
 ]
 
@@ -979,6 +979,54 @@ Error defineConv2dNode(
   return Error::Ok;
 }
 
+/*
+Define serialized conv_transpose2d node into the subgraph, using the remapped
+ids to map the serialized ids, to the new ids generated when defining the tensor
+value.
+
+The node is read as an XNNConvTranspose2d and forwarded to
+xnn_define_deconvolution_2d; the output clamp range comes from
+getOutputMinMax(node). Returns Error::Ok once XNNPACK reports
+xnn_status_success; any other status is handed to ET_CHECK_OR_RETURN_ERROR
+with the Internal error code.
+*/
+Error defineConvTranspose2dNode(
+    xnn_subgraph_t subgraph_ptr,
+    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
+    const NodePtr node,
+    const fb_xnnpack::XNNGraph* graph) noexcept {
+  MAYBE_UNUSED(graph);
+  auto graph_node = node->xnode_union_as_XNNConvTranspose2d();
+
+  std::pair<float, float> min_max = getOutputMinMax(node);
+  // NOTE(review): the serialized subsampling_* fields fill the argument slots
+  // right after the kernel size -- presumably the deconvolution stride;
+  // confirm against the xnn_define_deconvolution_2d signature.
+  // NOTE(review): remapped_ids.at() throws on a missing key while this
+  // function is noexcept -- confirm every id (including bias_id) is always
+  // present in the map.
+  xnn_status status = xnn_define_deconvolution_2d(
+      subgraph_ptr,
+      graph_node->padding_top(),
+      graph_node->padding_right(),
+      graph_node->padding_bottom(),
+      graph_node->padding_left(),
+      graph_node->adjustment_height(),
+      graph_node->adjustment_width(),
+      graph_node->kernel_height(),
+      graph_node->kernel_width(),
+      graph_node->subsampling_height(),
+      graph_node->subsampling_width(),
+      graph_node->dilation_height(),
+      graph_node->dilation_width(),
+      graph_node->groups(),
+      graph_node->group_input_channels(),
+      graph_node->group_output_channels(),
+      min_max.first,
+      min_max.second,
+      remapped_ids.at(graph_node->input1_id()),
+      remapped_ids.at(graph_node->filter_id()),
+      remapped_ids.at(graph_node->bias_id()),
+      remapped_ids.at(graph_node->output_id()),
+      graph_node->flags());
+  ET_CHECK_OR_RETURN_ERROR(
+      status == xnn_status_success,
+      Internal,
+      "Failed to create deconvolution node %i with code: %s",
+      node->debug_handle(),
+      xnn_status_to_string(status));
+
+  return Error::Ok;
+}
+
 /*
 Define serialized maxpool2d node into the subgraph, using the remapped ids
 to map the serialized ids, to the new ids generated when defining the
@@ -1840,6 +1888,7 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) {
     _DEFINE(StaticTranspose)
     _DEFINE(Clamp)
     _DEFINE(Conv2d)
+    _DEFINE(ConvTranspose2d)
     _DEFINE(Div)
     _DEFINE(StaticResizeBilinear2D)
     _DEFINE(StaticConstantPad)
 
@@ -137,6 +137,7 @@ union XNodeUnion {
   XNNScaledDotProductAttention,
   XNNBatchMatrixMultiply: _XNNNode2x1,
   XNNConcatenate5: _XNNCat,
+  XNNConvTranspose2d: _XNNNodeConv,
 }
 
 union XValueUnion {
 
@@ -133,6 +133,7 @@ union XNodeUnion {
   XNNScaledDotProductAttention,
   XNNBatchMatrixMultiply: _XNNNode2x1,
   XNNConcatenate5: _XNNCat,
+  XNNConvTranspose2d: _XNNNodeConv,
 }
 
 union XValueUnion {
 
@@ -103,6 +103,11 @@ class XNNConv2d(XNNNodeConv):
     pass
 
 
+@dataclass
+class XNNConvTranspose2d(XNNNodeConv):
+    # Transposed 2D convolution node; it declares no fields of its own and
+    # reuses everything inherited from XNNNodeConv.
+    pass
+
+
 @dataclass
 class XNNAdd(XNNNode2x1):
     pass
@@ -336,6 +341,7 @@ class XNNScaledDotProductAttention:
     XNNStaticTranspose,
     XNNClamp,
     XNNConv2d,
+    XNNConvTranspose2d,
     XNNDiv,
     XNNStaticResizeBilinear2D,
     XNNStaticConstantPad,
Original file line number	Diff line number	Diff line change
`@@ -137,6 +137,7 @@ union XNodeUnion {`
`137`	`137`	`XNNScaledDotProductAttention,`
`138`	`138`	`XNNBatchMatrixMultiply: _XNNNode2x1,`
`139`	`139`	`XNNConcatenate5: _XNNCat,`
	`140`	`+ XNNConvTranspose2d: _XNNNodeConv,`
`140`	`141`	`}`
`141`	`142`
`142`	`143`	`union XValueUnion {`
Original file line number	Diff line number	Diff line change
`@@ -133,6 +133,7 @@ union XNodeUnion {`
`133`	`133`	`XNNScaledDotProductAttention,`
`134`	`134`	`XNNBatchMatrixMultiply: _XNNNode2x1,`
`135`	`135`	`XNNConcatenate5: _XNNCat,`
	`136`	`+ XNNConvTranspose2d: _XNNNodeConv,`
`136`	`137`	`}`
`137`	`138`
`138`	`139`	`union XValueUnion {`