Update on "update llama runner to decode single token"
Right now, the eager runner does not print the generated response until all tokens have been generated. This is a poor experience, since the user must wait for the entire generation to finish before seeing any output.
This PR updates it to decode each new token immediately after it is generated.
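The idea can be sketched as a streaming decode loop: decode and print each token's text the moment it is produced, instead of buffering token ids and decoding once at the end. This is a minimal illustrative sketch, not the actual ExecuTorch runner code; `Tokenizer` and `generate_tokens` are hypothetical stand-ins for the real tokenizer and autoregressive generation loop.

```python
class Tokenizer:
    """Toy stand-in tokenizer: each token id maps to a fixed text piece."""
    vocab = {0: "Hello", 1: ",", 2: " world", 3: "!"}

    def decode(self, token_ids):
        return "".join(self.vocab[t] for t in token_ids)


def generate_tokens():
    """Hypothetical stand-in for the model's autoregressive loop,
    yielding one new token id per step."""
    yield from [0, 1, 2, 3]


def run_streaming(tokenizer):
    """Decode each new token immediately and emit it, rather than
    waiting until all tokens are generated."""
    pieces = []
    for token_id in generate_tokens():
        # Decode the single new token and flush it to the console right away.
        piece = tokenizer.decode([token_id])
        print(piece, end="", flush=True)
        pieces.append(piece)
    print()
    return "".join(pieces)


text = run_streaming(Tokenizer())
```

The key change is decoding `[token_id]` per step inside the loop (with a flushed, newline-free `print`) instead of calling `decode` once on the full sequence after generation completes.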
Differential Revision: [D65578306](https://our.internmc.facebook.com/intern/diff/D65578306/)
[ghstack-poisoned]
README.md (5 additions, 0 deletions)
```diff
@@ -43,6 +43,11 @@ We recommend using the latest release tag from the
 See [CONTRIBUTING.md](CONTRIBUTING.md) for details about issues, PRs, code
 style, CI jobs, and other development topics.
 
+
+To connect with us and other community members, we invite you to join the PyTorch Slack community by filling out this [form](https://docs.google.com/forms/d/e/1FAIpQLSeADnUNW36fjKjYzyHDOzEB_abKQE9b6gqqW9NXse6O0MWh0A/viewform). Once you've joined, you can:
+
+* Head to the `#executorch-general` channel for general questions, discussion, and community support.
+* Join the `#executorch-contributors` channel if you're interested in contributing directly to project development.
```