
Commit 2ae39a0

Add layernorm decomposition
- Decompose layernorm
- Add unittest for layernorm

Signed-off-by: Oscar Andersson <[email protected]>
Change-Id: Iccc382898cf247c560ef55c4711fab40d47f04dc
1 parent dcf549f commit 2ae39a0

7 files changed: +331 −11 lines changed

backends/arm/_passes/arm_pass_manager.py

Lines changed: 5 additions & 0 deletions

@@ -19,6 +19,9 @@
     ConvertSplitToSlicePass,
 )
 from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass
+from executorch.backends.arm._passes.decompose_layernorm_pass import (
+    DecomposeLayerNormPass,
+)
 from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass
 from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
 from executorch.backends.arm._passes.insert_squeeze_after_sum_pass import (
@@ -50,6 +53,7 @@ def transform_to_backend_pipeline(
         self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(RemoveClonePass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
+        self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
         self.add_pass(ConvertMeanDimToAveragePool())
         self.add_pass(DecomposeMeanDimPass())
@@ -65,6 +69,7 @@ def transform_to_backend_pipeline(
         return self._transform(exported_program.graph_module)

     def transform_for_annotation_pipeline(self, graph_module: torch.fx.GraphModule):
+        self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(DecomposeDivPass())
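Reviewer note on pass ordering: DecomposeLayerNormPass is inserted ahead of DecomposeVarPass and DecomposeMeanDimPass in both pipelines because the layernorm decomposition itself emits aten.var.correction and aten.mean.dim nodes, which those later passes lower further. A minimal sketch of a model this path applies to (the toy module below is an illustrative assumption, not part of the commit):

    # Illustrative toy module: its exported graph contains a layer_norm node,
    # which DecomposeLayerNormPass rewrites into mean/var/sub/add/rsqrt/mul
    # (plus view_copy) nodes before the var and mean passes run.
    import torch


    class ToyLayerNormModel(torch.nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.norm = torch.nn.LayerNorm(16)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.norm(x)


    model = ToyLayerNormModel().eval()
    example_input = (torch.randn(1, 8, 16),)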
backends/arm/_passes/decompose_layernorm_pass.py

Lines changed: 146 additions & 0 deletions

@@ -0,0 +1,146 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import operator
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+def get_layer_norm_decomposition(op) -> tuple:
+    if op == exir_ops.edge.aten.native_layer_norm.default:
+        return (
+            exir_ops.edge.aten.mean.dim,
+            exir_ops.edge.aten.sub.Tensor,
+            exir_ops.edge.aten.var.correction,
+            exir_ops.edge.aten.full.default,
+            exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten.rsqrt.default,
+            exir_ops.edge.aten.mul.Tensor,
+            exir_ops.edge.aten.view_copy.default,
+        )
+    if op == torch.ops.aten.layer_norm.default:
+        return (
+            torch.ops.aten.mean.dim,
+            torch.ops.aten.sub.Tensor,
+            torch.ops.aten.var.correction,
+            torch.ops.aten.full.default,
+            torch.ops.aten.add.Tensor,
+            torch.ops.aten.rsqrt.default,
+            torch.ops.aten.mul.Tensor,
+            torch.ops.aten.view_copy.default,
+        )
+    raise RuntimeError(f"Can't get layer_norm decomposition for op {op}")
+
+
+class DecomposeLayerNormPass(ExportPass):
+    """
+    layernorm is defined as: ((x - E[x]) / sqrt(Var[x] + eps)) * weights + bias
+    Decompose layernorm(x, normalized_shape, weights, bias, eps) to a sequence of:
+    mean      = op_mean(x, dims)         # E[x]
+    var       = op_var(x, dims)          # Var[x]
+    numerator = op_sub(x, mean)          # (x - E[x])
+    add       = op_add(var, eps)         # Var[x] + eps
+    rsqrt     = op_rsqrt(add)            # 1 / sqrt(Var[x] + eps)
+    mul       = op_mul(numerator, rsqrt) # ((x - E[x]) / sqrt(Var[x] + eps)) * weights
+    bias      = op_add(mul, bias)        # ((x - E[x]) / sqrt(Var[x] + eps)) * weights + bias
+
+    Source: https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html
+    """
+
+    def call(self, gm: torch.fx.GraphModule):
+        for node in gm.graph.nodes:
+            if node.op != "call_function" or node.target not in (
+                exir_ops.edge.aten.native_layer_norm.default,
+                torch.ops.aten.layer_norm.default,
+            ):
+                continue
+
+            # epsilon default value
+            epsilon = 1e-5
+            weights = None
+            bias = None
+            args = node.args
+            meta = node.meta
+            match len(args):
+                case 5:
+                    x, normalized_shape, weights, bias, epsilon = args
+                case 4:
+                    x, normalized_shape, weights, bias = args
+                case 3:
+                    x, normalized_shape, weights = args
+                case 2:
+                    x, normalized_shape = args
+
+            n_dims = len(normalized_shape)
+            if isinstance(meta["val"], tuple):
+                shape = meta["val"][0].size()
+            else:
+                shape = meta["val"].size()
+            dtype = meta["val"][0].dtype
+            rank = len(shape)
+            dims = list(range(-1, -1 * (n_dims + 1), -1))
+            dims = [dim % rank for dim in dims]
+            weights_reshaped_shape = [shape[i] if i in dims else 1 for i in range(rank)]
+            epsilon_reshaped_shape = [1] * rank
+
+            (
+                mean_op,
+                sub_op,
+                var_op,
+                full_op,
+                add_op,
+                rsqrt_op,
+                mul_op,
+                view_op,
+            ) = get_layer_norm_decomposition(node.target)
+            with gm.graph.inserting_before(node):
+                keepdim = True
+                mean = create_node(gm.graph, mean_op, args=(x, dims, keepdim))
+                sub = create_node(gm.graph, sub_op, args=(x, mean))
+                var = create_node(
+                    gm.graph,
+                    var_op,
+                    args=(x, dims),
+                    kwargs={"correction": 0, "keepdim": keepdim},
+                )
+                full = create_node(
+                    gm.graph,
+                    full_op,
+                    args=(epsilon_reshaped_shape, epsilon),
+                    kwargs={"dtype": dtype},
+                )
+                add0 = create_node(gm.graph, add_op, args=(var, full))
+                rsqrt = create_node(gm.graph, rsqrt_op, args=(add0,))
+                mul0 = create_node(gm.graph, mul_op, args=(sub, rsqrt))
+                if weights is not None:
+                    weights_reshaped = create_node(
+                        gm.graph, view_op, args=(weights, weights_reshaped_shape)
+                    )
+                    mul1 = create_node(gm.graph, mul_op, args=(mul0, weights_reshaped))
+                else:
+                    mul1 = mul0
+                output = mul1
+                if bias is not None:
+                    bias_reshaped_shape = weights_reshaped_shape
+                    bias_reshaped = create_node(
+                        gm.graph, view_op, args=(bias, bias_reshaped_shape)
+                    )
+                    output = create_node(gm.graph, add_op, args=(mul1, bias_reshaped))
+
+            users = [user for user in node.users if node != user]
+            node.replace_all_uses_with(output)
+            for user in users:
+                if user.target == operator.getitem:
+                    user.replace_all_uses_with(output)
+            gm.graph.erase_node(node)
+        gm.graph.eliminate_dead_code()
+        gm.recompile()
+        gm = super().call(gm).graph_module
+
+        return PassResult(gm, True)
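As a standalone sanity check (not part of the commit) that the decomposed sequence in the docstring reproduces layernorm, the same mean/var/rsqrt arithmetic can be compared against torch.nn.functional.layer_norm; the shapes and eps below are arbitrary:

    # Check that the decomposition matches PyTorch's layer_norm numerically.
    import torch

    x = torch.randn(2, 3, 16)
    normalized_shape = [16]
    weights = torch.randn(16)
    bias = torch.randn(16)
    epsilon = 1e-5

    dims = [-1]  # the last len(normalized_shape) dimensions
    mean = x.mean(dim=dims, keepdim=True)              # E[x]
    var = x.var(dim=dims, correction=0, keepdim=True)  # Var[x], correction=0 as in the pass
    decomposed = (x - mean) * torch.rsqrt(var + epsilon) * weights + bias

    reference = torch.nn.functional.layer_norm(x, normalized_shape, weights, bias, epsilon)
    torch.testing.assert_close(decomposed, reference)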

backends/arm/arm_partitioner.py

Lines changed: 1 addition & 0 deletions

@@ -53,6 +53,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.full.default,
             exir_ops.edge.aten.mul.Tensor,
             exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
+            exir_ops.edge.aten.native_layer_norm.default,
             exir_ops.edge.aten.avg_pool2d.default,
             exir_ops.edge.aten.sigmoid.default,
             exir_ops.edge.aten.mm.default,

backends/arm/quantizer/arm_quantizer_utils.py

Lines changed: 2 additions & 0 deletions

@@ -149,11 +149,13 @@ def is_share_obs_or_fq_op(op: Callable) -> bool:
         torch.ops.aten.squeeze.default,
         torch.ops.aten.squeeze_copy.dim,
         torch.ops.aten.unsqueeze.default,
+        torch.ops.aten.unsqueeze_copy.default,
         # TODO: remove?
         torch.ops.aten.adaptive_avg_pool2d.default,
         torch.ops.aten.avg_pool2d.default,
         torch.ops.aten.view_copy.default,
         torch.ops.aten.view.default,
+        torch.ops.aten.full.default,
         torch.ops.aten.slice.Tensor,
         torch.ops.aten.split.Tensor,
         torch.ops.aten.split_with_sizes.default,

backends/arm/quantizer/quantization_annotation/add_annotator.py

Lines changed: 5 additions & 10 deletions

@@ -6,8 +6,6 @@

 # pyre-unsafe

-import itertools
-import operator
 from typing import Callable, List, Optional

 import torch
@@ -16,7 +14,6 @@
 from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig
 from torch.ao.quantization.quantizer import QuantizationAnnotation
 from torch.fx import Node
-from torch.fx.passes.utils.source_matcher_utils import get_source_partitions


 @register_annotator("add")
@@ -25,14 +22,12 @@ def _annotate_add(
     quantization_config: QuantizationConfig,
     filter_fn: Optional[Callable[[Node], bool]] = None,
 ) -> Optional[List[List[Node]]]:
-    add_partitions = get_source_partitions(
-        gm.graph, [operator.add, torch.add, operator.iadd], filter_fn
-    )
-    add_partitions = list(itertools.chain.from_iterable(add_partitions.values()))
     annotated_partitions = []
-    for add_partition in add_partitions:
-        annotated_partitions.append(add_partition.nodes)
-        add_node = add_partition.output_nodes[0]
+    for node in gm.graph.nodes:
+        if node.target not in (torch.ops.aten.add.Tensor,):
+            continue
+        annotated_partitions.append(node)
+        add_node = node
         if arm_quantizer_utils.is_annotated(add_node):
             continue

