Commit 9c23038

Update on "[devtool] introduce datasink class to etdump"

This diff introduces the DataSink class, the class for managing the customized debug data storage pipeline. Details can be found in https://docs.google.com/document/d/1y_m32mKdj-OgLcLUz9TKhBW3PC3bBDYSBbeAH544EfM/edit?tab=t.0

Differential Revision: [D69583422](https://our.internmc.facebook.com/intern/diff/D69583422/)

[ghstack-poisoned]
2 parents 5da4e5e + f17da7f commit 9c23038
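The DataSink sources themselves are not among the hunks shown on this page, so the following is a purely hypothetical Python sketch of the idea described in the message: a pluggable destination that accepts debug payloads and reports where they landed. All names and signatures below are invented for illustration and are not the committed API (the real class lives in the C++ runtime).

# Hypothetical illustration only: these names and signatures are invented
# here and are NOT the committed DataSink API.
from abc import ABC, abstractmethod


class DataSink(ABC):
    """A pluggable destination for debug payloads produced during execution."""

    @abstractmethod
    def write(self, payload: bytes) -> int:
        """Store one payload; return its offset so ETDump can reference it."""


class InMemorySink(DataSink):
    """Simplest possible sink: append each payload to a growable buffer."""

    def __init__(self) -> None:
        self._buffer = bytearray()

    def write(self, payload: bytes) -> int:
        offset = len(self._buffer)
        self._buffer.extend(payload)
        return offset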

Note: large commits hide some content by default, so only a subset of the changed files is shown below.

52 files changed: +1207 −332 lines

CMakeLists.txt (8 additions, 0 deletions)

@@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
   OFF
 )
 
+option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
+  OFF
+)
+
 option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)
 
 option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
@@ -718,6 +722,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_LLM)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()
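Downstream builds opt in via the standard CMake mechanism, e.g. configuring with -DEXECUTORCH_BUILD_EXTENSION_LLM=ON (the flag name comes from the hunk above; it defaults to OFF), which pulls extension/llm/tokenizer into the build.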

backends/arm/TARGETS (4 additions, 1 deletion)

@@ -4,7 +4,10 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 python_library(
     name = "arm_partitioner",
     srcs = [
-        "arm_partitioner.py",
+        "ethosu_backend.py",
+        "ethosu_partitioner.py",
+        "tosa_backend.py",
+        "tosa_partitioner.py",
     ],
     typing = True,
     deps = [

backends/arm/runtime/targets.bzl (1 addition, 1 deletion)

@@ -12,7 +12,7 @@ def define_common_targets():
     )
     runtime.cxx_library(
         name = "arm_backend",
-        srcs = ["ArmBackendEthosU.cpp"],
+        srcs = ["EthosUBackend.cpp"],
         headers = [],
         compatible_with = ["ovr_config//cpu:arm32-embedded"],
         # arm_executor_runner.cpp needs to compile with executor as whole

backends/qualcomm/_passes/__init__.py (22 additions, 2 deletions)

@@ -2,35 +2,55 @@
 from .annotate_decomposed import AnnotateDecomposed
 from .annotate_quant_attrs import AnnotateQuantAttrs
 from .constant_i64_to_i32 import ConstantI64toI32
+from .convert_binary_op_with_scalar import ConvertBinaryOpsWithScalar
 from .convert_bmm_to_matmul import ConvertBmmToMatmul
 from .convert_interpolate_with_upsample2d import ConvertInterpolateWithUpsample2D
 from .convert_prelu import ConvertPReLU
 from .convert_to_linear import ConvertToLinear
+from .decompose_any import DecomposeAny
+from .decompose_einsum import DecomposeEinsum
+from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm
+from .decompose_silu import DecomposeSilu
 from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
 from .fold_qdq import FoldQDQ
+from .fuse_consecutive_transpose import FuseConsecutiveTranspose
+from .insert_io_qdq import InsertIOQDQ
+from .insert_requantize import InsertRequantize
 from .layout_transform import LayoutTransform
 from .recompose_pixel_unshuffle import RecomposePixelUnshuffle
 from .recompose_rms_norm import RecomposeRmsNorm
+from .reduce_dynamic_range import ReduceDynamicRange
 from .remove_redundancy import RemoveRedundancy
 from .replace_index_put_input import ReplaceIndexPutInput
+from .replace_inf_buffer import ReplaceInfBuffer
 from .tensor_i64_to_i32 import TensorI64toI32
 
 
 __all__ = [
     AnnotateAndQuantScalar,
     AnnotateDecomposed,
     AnnotateQuantAttrs,
+    ConstantI64toI32,
     ConvertBmmToMatmul,
+    ConvertBinaryOpsWithScalar,
     ConvertInterpolateWithUpsample2D,
     ConvertPReLU,
     ConvertToLinear,
+    DecomposeAny,
+    DecomposeEinsum,
+    DecomposeLinalgVectorNorm,
+    DecomposeSilu,
     ExpandBroadcastTensorShape,
     FoldQDQ,
-    ConstantI64toI32,
-    TensorI64toI32,
+    FuseConsecutiveTranspose,
+    InsertIOQDQ,
+    InsertRequantize,
     LayoutTransform,
     RecomposePixelUnshuffle,
     RecomposeRmsNorm,
+    ReduceDynamicRange,
     RemoveRedundancy,
     ReplaceIndexPutInput,
+    ReplaceInfBuffer,
+    TensorI64toI32,
 ]

backends/qualcomm/_passes/convert_to_linear.py (1 addition, 0 deletions)

@@ -39,6 +39,7 @@ class ConvertToLinear(ExportPass):
     mm = exir_ops.edge.aten.mm.default
 
     addmm_patterns = [
+        {view_copy: 1, permute_copy: 1, addmm: 1},
        {view_copy: 2, permute_copy: 1, addmm: 1},
        {permute_copy: 1, addmm: 1},
    ]
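To see where these op-count patterns come from, one can export a plain torch.nn.Linear and list the edge ops it decomposes into; the dictionaries above appear to count the ops of each matched partition. A minimal sketch (module and shapes are arbitrary illustrative choices, not from the commit):

import torch
from executorch.exir import to_edge


class TinyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(8, 4)

    def forward(self, x):
        return self.fc(x)


# Export and lower to the edge dialect, then list which ops Linear became;
# depending on input rank and decomposition, this yields some combination of
# view_copy, permute_copy, and addmm, i.e. what the addmm_patterns dicts count.
ep = to_edge(torch.export.export(TinyLinear(), (torch.randn(2, 3, 8),)))
for node in ep.exported_program().graph_module.graph.nodes:
    if node.op == "call_function":
        print(node.target)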
backends/qualcomm/_passes/decompose_any.py (new file, 76 additions, 0 deletions)

# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from executorch.exir import to_edge
from executorch.exir.pass_base import ExportPass, PassResult


class Any(torch.nn.Module):
    def __init__(self, dim, keepdim):
        super().__init__()
        self.dim = tuple(dim) if isinstance(dim, list) else dim
        self.keepdim = keepdim

    def forward(self, x):
        if self.dim is None:
            x = torch.flatten(x)
            self.dim = 0

        x = x.to(torch.bool).to(torch.int32)
        x = torch.sum(x, dim=self.dim, keepdim=self.keepdim, dtype=torch.int32)
        return torch.not_equal(x, torch.zeros(1, dtype=torch.int32))


class DecomposeAny(ExportPass):
    """
    Decompose for math equivalent op.
    """

    def __init__(self, quantization_capture=False) -> None:
        super().__init__()

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        graph = graph_module.graph
        for node in graph.nodes:
            if "any.dim" in str(node.target):
                dim = node.args[1] if len(node.args) > 1 else None
                keepdim = node.args[2] if len(node.args) > 2 else False
                model = Any(dim, keepdim)
                edge_mgr = to_edge(
                    torch.export.export(model, (node.args[0].meta["val"],))
                )
                decomposed_module = edge_mgr.exported_program()

                with graph.inserting_before(node):
                    # remap is used to map original node values to new node values,
                    # which ensures that reference to nodes are correctly updated in the new graph
                    remap = {"x": node.args[0]}

                    for decomposed_node in decomposed_module.graph.nodes:
                        # no need to copy existent 'output'
                        if decomposed_node.op == "output":
                            for user in node.users.copy():
                                # remap
                                user.replace_input_with(
                                    node,
                                    remap[decomposed_node.args[0][0]],
                                )
                        # no need to copy existent placeholders
                        elif decomposed_node.op == "placeholder":
                            # replace node map from string to graph node
                            remap[decomposed_node] = remap.pop(decomposed_node.name)
                        else:
                            remap[decomposed_node] = graph.node_copy(
                                decomposed_node,
                                arg_transform=lambda x, remap=remap: remap[x],
                            )

                graph.erase_node(node)

        graph.eliminate_dead_code()
        graph_module.recompile()
        return PassResult(graph_module, True)
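The decomposition in Any.forward can be checked numerically against torch.any outside of any graph machinery. A minimal standalone sketch (shape and dim are arbitrary illustrative choices):

import torch

x = torch.randn(2, 3, 4)
dim, keepdim = 1, True

# Reference semantics of any.dim: True where any element along `dim` is nonzero.
ref = torch.any(x, dim=dim, keepdim=keepdim)

# Mirror Any.forward above: bool -> int32 -> sum -> compare against zero.
counts = torch.sum(
    x.to(torch.bool).to(torch.int32), dim=dim, keepdim=keepdim, dtype=torch.int32
)
decomposed = torch.not_equal(counts, torch.zeros(1, dtype=torch.int32))

assert torch.equal(ref, decomposed)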

backends/qualcomm/_passes/decompose_einsum.py (1 addition, 1 deletion)

@@ -28,7 +28,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
 
                 with graph.inserting_before(node):
                     # remap is used to map original node values to new node values,
-                    # which ensures that reference to nodes are correclty updated in the new graph
+                    # which ensures that reference to nodes are correctly updated in the new graph
                     remap = {}
                     # Different from other nodes, einsum args[0] is the einsum equation,
                     # while input nodes are stored in args[1]
backends/qualcomm/_passes/decompose_linalg_vector_norm.py (new file, 85 additions, 0 deletions)

# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from executorch.exir import to_edge
from executorch.exir.pass_base import ExportPass, PassResult


class LinalgVectorNorm(torch.nn.Module):
    def __init__(self, exp, dim, keepdim):
        super().__init__()
        self.exp = exp
        self.dim = tuple(dim) if dim is not None else None
        self.keepdim = keepdim

    def forward(self, x):
        if self.dim is None:
            x = torch.flatten(x)
            self.dim = 0

        x = torch.abs(x)
        x = torch.pow(x, self.exp)
        x = torch.sum(x, dim=self.dim, keepdim=self.keepdim)
        return torch.pow(x, 1.0 / self.exp)


class DecomposeLinalgVectorNorm(ExportPass):
    """
    Decompose for math equivalent op.
    """

    def __init__(self, quantization_capture=False) -> None:
        super().__init__()
        self.quantization_capture = quantization_capture

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        graph = graph_module.graph
        for node in graph.nodes:
            if "linalg_vector_norm" in str(node.target):
                ord = node.args[1] if len(node.args) > 1 else 2.0
                dim = node.args[2] if len(node.args) > 2 else None
                keepdim = node.args[3] if len(node.args) > 3 else False
                model = LinalgVectorNorm(ord, dim, keepdim)
                if self.quantization_capture:
                    decomposed_module = torch.export.export(
                        model, (node.args[0].meta["val"],)
                    ).module()
                else:
                    edge_mgr = to_edge(
                        torch.export.export(model, (node.args[0].meta["val"],))
                    )
                    decomposed_module = edge_mgr.exported_program()

                with graph.inserting_before(node):
                    # remap is used to map original node values to new node values,
                    # which ensures that reference to nodes are correctly updated in the new graph
                    remap = {"x": node.args[0]}

                    for decomposed_node in decomposed_module.graph.nodes:
                        # no need to copy existent 'output'
                        if decomposed_node.op == "output":
                            for user in node.users.copy():
                                # remap
                                user.replace_input_with(
                                    node,
                                    remap[decomposed_node.args[0][0]],
                                )
                        # no need to copy existent placeholders
                        elif decomposed_node.op == "placeholder":
                            # replace node map from string to graph node
                            remap[decomposed_node] = remap.pop(decomposed_node.name)
                        else:
                            remap[decomposed_node] = graph.node_copy(
                                decomposed_node,
                                arg_transform=lambda x, remap=remap: remap[x],
                            )

                graph.erase_node(node)

        graph.eliminate_dead_code()
        graph_module.recompile()
        return PassResult(graph_module, True)
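As with DecomposeAny, the abs/pow/sum/pow chain in LinalgVectorNorm.forward can be sanity-checked against the ATen op directly. A minimal sketch (p, shape, and dim are arbitrary illustrative choices):

import torch

x = torch.randn(4, 8)
p, dim, keepdim = 2.0, (1,), False

ref = torch.linalg.vector_norm(x, ord=p, dim=dim, keepdim=keepdim)

# Mirror LinalgVectorNorm.forward above: |x|^p, sum over dim, then p-th root.
decomposed = torch.pow(
    torch.sum(torch.pow(torch.abs(x), p), dim=dim, keepdim=keepdim), 1.0 / p
)

assert torch.allclose(ref, decomposed)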

backends/qualcomm/_passes/decompose_silu.py (10 additions, 12 deletions)

@@ -7,7 +7,6 @@
 
 import torch
 from executorch.exir.pass_base import ExportPass, PassResult
-from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
 
 
 class DecomposeSilu(ExportPass):
@@ -22,24 +21,23 @@ def _copy_meta(self, meta: Dict):
 
     def call(self, graph_module: torch.fx.GraphModule):
         graph = graph_module.graph
-        partitions = get_source_partitions(
-            graph, [torch.nn.functional.silu, torch.ops.aten.silu.default]
-        )
-        for _, src_partitions in partitions.items():
-            for src_partition in src_partitions:
-
-                inputs = src_partition.input_nodes
-                silu_node = src_partition.nodes[0]
-                with graph_module.graph.inserting_after(inputs[0]):
+        for node in graph.nodes:
+            if (
+                node.op == "call_function"
+                and node.target == torch.ops.aten.silu.default
+            ):
+                silu_node = node
+                silu_node_input = node.args[0]
+                with graph_module.graph.inserting_after(silu_node_input):
                     sigmoid_node = graph.create_node(
-                        "call_function", torch.ops.aten.sigmoid, (inputs[0],)
+                        "call_function", torch.ops.aten.sigmoid, (silu_node_input,)
                     )
                     sigmoid_node.meta = self._copy_meta(silu_node.meta)
                     with graph_module.graph.inserting_after(sigmoid_node):
                         mul_node = graph.create_node(
                             "call_function",
                             torch.ops.aten.mul,
-                            (inputs[0], sigmoid_node),
+                            (silu_node_input, sigmoid_node),
                         )
                         mul_node.meta = self._copy_meta(silu_node.meta)
                         for user in silu_node.users.copy():
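The rewrite relies on the identity silu(x) = x * sigmoid(x), which is exactly what the inserted sigmoid and mul nodes reproduce. A one-line numerical check (input shape is an arbitrary illustrative choice):

import torch

x = torch.randn(5, 7)
# silu(x) = x * sigmoid(x): the pass replaces each aten.silu node with
# this sigmoid + mul pair.
assert torch.allclose(torch.nn.functional.silu(x), x * torch.sigmoid(x))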

backends/qualcomm/_passes/layout_transform.py (5 additions, 0 deletions)

@@ -33,8 +33,10 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.adaptive_avg_pool2d.default,
         exir_ops.edge.aten.avg_pool2d.default,
         exir_ops.edge.aten.convolution.default,
+        exir_ops.edge.aten.instance_norm.default,
         exir_ops.edge.aten.max_pool2d_with_indices.default,
         exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
+        exir_ops.edge.aten._native_batch_norm_legit.no_stats,
         exir_ops.edge.aten.native_group_norm.default,
         exir_ops.edge.aten.pixel_shuffle.default,
         exir_ops.edge.aten.pixel_unshuffle.default,
@@ -54,6 +56,7 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.eq.Scalar,
         exir_ops.edge.aten.eq.Tensor,
         exir_ops.edge.aten.full.default,
+        exir_ops.edge.aten.full_like.default,
         exir_ops.edge.aten.ge.Scalar,
         exir_ops.edge.aten.ge.Tensor,
         exir_ops.edge.aten.gelu.default,
@@ -75,6 +78,8 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.mean.dim,
         exir_ops.edge.aten.minimum.default,
         exir_ops.edge.aten.mul.Tensor,
+        exir_ops.edge.aten.ne.Scalar,
+        exir_ops.edge.aten.ne.Tensor,
         exir_ops.edge.aten.neg.default,
         exir_ops.edge.aten.pow.Tensor_Scalar,
         exir_ops.edge.aten.prelu.default,
