
Commit 3816878

Merge branch 'main' into Add-support-for-expm1
2 parents ffe8925 + 7535720

File tree

6 files changed (+254 lines, -44 lines)


backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@
 from .decompose_embedding_pass import DecomposeEmbeddingPass  # noqa  # noqa
 from .decompose_expm1_pass import DecomposeExpm1Pass  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
+from .decompose_glu_pass import DecomposeGluPass  # noqa
 from .decompose_grouped_conv import DecomposeGroupedConv  # noqa
 from .decompose_groupnorm_pass import DecomposeGroupNormPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions
@@ -42,6 +42,7 @@
     DecomposeEmbeddingPass,
     DecomposeExpm1Pass,
     DecomposeGeluPass,
+    DecomposeGluPass,
     DecomposeGroupedConv,
     DecomposeGroupNormPass,
     DecomposeLayerNormPass,
@@ -186,6 +187,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(FuseBatchnorm2DPass(exported_program))
         self.add_pass(ConvertMmToBmmPass())
+        self.add_pass(DecomposeGluPass())
         self.add_pass(DecomposeLinearPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeGroupNormPass())
@@ -266,6 +268,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeMeanDimPass(graph_module, self.tosa_spec))
         self.add_pass(DecomposeNotEqualPass())
         self.add_pass(DecomposeCosineSimilarityPass())
+        self.add_pass(DecomposeGluPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeLinearVectorNormPass())
backends/arm/_passes/decompose_glu_pass.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+# For FP case
+edge_glu = exir_ops.edge.aten.glu.default
+
+# For INT case
+aten_glu = torch.ops.aten.glu.default
+
+
+def get_ops(op):
+    """Returns the appropriate operator functions based on the input operator."""
+    if op == edge_glu:
+        return (
+            exir_ops.edge.aten.mul.Tensor,
+            exir_ops.edge.aten.sigmoid.default,
+            exir_ops.edge.aten.slice_copy.Tensor,
+        )
+    elif op == aten_glu:
+        return (
+            torch.ops.aten.mul.Tensor,
+            torch.ops.aten.sigmoid.default,
+            torch.ops.aten.slice_copy.Tensor,
+        )
+    else:
+        raise ValueError(f"Unsupported operator: {op}")
+
+
+class DecomposeGluPass(ArmPass):
+    """Decomposes the GLU operator into hadamard product and sigmoid."""
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in [edge_glu, aten_glu]:
+            return super().call_operator(op, args, kwargs, meta)
+
+        hadamard_prod, sigmoid, slice_op = get_ops(op)
+        X = args[0]
+
+        dim = args[1] if len(args) > 1 else kwargs.get("dim", -1)
+
+        if "val" not in X.node.meta:
+            raise Exception("Could not get dimension metadata in input.")
+
+        if dim < 0:
+            dim += X.node.meta["val"].dim()
+
+        n = X.node.meta["val"].size(dim)
+
+        if n % 2:
+            raise RuntimeError(
+                f"glu expects an even split along dim={dim}, got size {n}"
+            )
+
+        middle = n // 2
+
+        T1 = super().call_operator(
+            slice_op, (X, dim, 0, middle), {}, meta, updated=True
+        )
+
+        T2 = super().call_operator(
+            slice_op, (X, dim, middle, n), {}, meta, updated=True
+        )
+
+        T2_sigmoid = super().call_operator(sigmoid, (T2,), {}, meta, updated=True)
+
+        return super().call_operator(
+            hadamard_prod, (T1, T2_sigmoid), {}, meta, updated=True
+        )
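The decomposition mirrors the definition of GLU: the input is split in half along dim, and the first half is multiplied elementwise with the sigmoid of the second half. As a minimal sketch in plain PyTorch, independent of the pass infrastructure (the helper name glu_decomposed is made up for illustration), the equivalence can be checked against F.glu:

import torch
import torch.nn.functional as F


def glu_decomposed(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    # Same logic as the pass: split in half along `dim`,
    # then first_half * sigmoid(second_half).
    n = x.size(dim)
    assert n % 2 == 0, f"glu expects an even split along dim={dim}, got size {n}"
    middle = n // 2
    t1 = x.narrow(dim, 0, middle)
    t2 = x.narrow(dim, middle, n - middle)
    return t1 * torch.sigmoid(t2)


x = torch.randn(4, 6)
assert torch.allclose(glu_decomposed(x, dim=-1), F.glu(x, dim=-1))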

backends/arm/_passes/fuse_equal_placeholders_pass.py

Lines changed: 43 additions & 44 deletions
@@ -3,6 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import hashlib
+from collections import defaultdict
+
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import (
     get_constant_placeholder_kind,
@@ -21,7 +24,7 @@ class FuseEqualPlaceholdersPass(ExportPass):
     """
     This pass optimizes memory usage by finding constant placeholders
     pointing to identical tensors and fusing them to one single placeholder
-    with multiple users.
+    with multiple users, using a cache for faster comparison.
     """
 
     def __init__(self, exported_program: ExportedProgram):
@@ -30,58 +33,54 @@ def __init__(self, exported_program: ExportedProgram):
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         modified = False
-        const_placeholder_nodes = []
-        for node in graph_module.graph.nodes:
-            if is_param_node(self.exported_program, node):
-                const_placeholder_nodes.append(node)
-
-        while const_placeholder_nodes:
 
-            # Find equal tensors
-            node1 = const_placeholder_nodes.pop()
-            eq_nodes = [node1]
-            tensor1 = get_param_tensor(self.exported_program, node1)
-            if tensor1 is None:
+        # Build a cache of params: mapping hash_key -> list of (node, tensor)
+        hash_buckets = defaultdict(list)
+        for node in graph_module.graph.nodes:
+            if not is_param_node(self.exported_program, node):
                 continue
+            tensor = get_param_tensor(self.exported_program, node)
+            if tensor is None:
+                continue
+            # Create a lightweight fingerprint: dtype + shape + SHA1 of raw bytes
+            # Ensure tensor is on CPU and contiguous
+            t_cpu = tensor.detach().cpu().contiguous()
+            data_bytes = t_cpu.numpy().tobytes()
+            key = (
+                str(t_cpu.dtype),
+                tuple(t_cpu.shape),
+                hashlib.sha1(data_bytes).hexdigest(),
+            )
+            hash_buckets[key].append((node, t_cpu))
 
-            for node2 in const_placeholder_nodes:
-                tensor2 = get_param_tensor(self.exported_program, node2)
-                if tensor2 is None:
-                    continue
-
-                if (
-                    tensor1.dtype == tensor2.dtype
-                    and tensor1.shape == tensor2.shape
-                    and torch.allclose(tensor1, tensor2, atol=1e-08)
-                ):
-                    eq_nodes.append(node2)
+        # For each bucket with more than one entry, fuse:
+        for nodes_tensors in hash_buckets.values():
+            if len(nodes_tensors) < 2:
+                continue
 
-            if len(eq_nodes) > 1:
-                common_name = node1.name + "_common"
-                common_kind = get_constant_placeholder_kind(
-                    self.exported_program, node1
+            # Create a new placeholder from first in list of equal placeholders.
+            rep_node, rep_tensor = nodes_tensors[0]
+            common_name = rep_node.name + "_common"
+            common_kind = get_constant_placeholder_kind(self.exported_program, rep_node)
+            common_persistent = True
+            with graph_module.graph.inserting_before(rep_node):
+                common_node = create_constant_placeholder(
+                    self.exported_program,
+                    graph_module.graph,
+                    common_name,
+                    common_kind,
+                    rep_tensor,
+                    common_persistent,
                 )
-                common_persisten_buffer = True
-
-                with graph_module.graph.inserting_before(node1):
-                    common_node = create_constant_placeholder(
-                        self.exported_program,
-                        graph_module.graph,
-                        common_name,
-                        common_kind,
-                        tensor1,
-                        common_persisten_buffer,
-                    )
-
-                for eq_node in eq_nodes:
-                    eq_node.replace_all_uses_with(common_node)
-                    delete_constant_placeholder(self.exported_program, eq_node)
-                    if eq_node != node1:
-                        const_placeholder_nodes.remove(eq_node)
 
+            # Replace uses and delete duplicates
+            for node, _ in nodes_tensors:
+                node.replace_all_uses_with(common_node)
+                delete_constant_placeholder(self.exported_program, node)
             modified = True
 
         if modified:
             graph_module.recompile()
             graph_module = super().call(graph_module).graph_module
+
         return PassResult(graph_module=graph_module, modified=modified)
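The rewrite replaces the old pairwise search, which ran torch.allclose over every pair of constant placeholders (quadratic in the number of constants), with a single pass that buckets tensors by a fingerprint of dtype, shape, and a SHA-1 digest of the raw bytes. Only byte-identical tensors land in the same bucket, so the previous atol=1e-08 tolerance is effectively tightened to exact equality. A minimal sketch of the bucketing idea on plain named tensors, outside the pass infrastructure (bucket_by_fingerprint is a hypothetical helper, not part of the pass):

import hashlib
from collections import defaultdict

import torch


def bucket_by_fingerprint(tensors: dict) -> dict:
    # Group tensors whose dtype, shape and raw bytes are identical.
    buckets = defaultdict(list)
    for name, t in tensors.items():
        t_cpu = t.detach().cpu().contiguous()
        key = (
            str(t_cpu.dtype),
            tuple(t_cpu.shape),
            hashlib.sha1(t_cpu.numpy().tobytes()).hexdigest(),
        )
        buckets[key].append(name)
    return buckets


weights = {
    "w0": torch.ones(2, 2),
    "w1": torch.ones(2, 2),   # byte-identical to w0 -> same bucket
    "w2": torch.zeros(2, 2),
}
duplicates = [names for names in bucket_by_fingerprint(weights).values() if len(names) > 1]
print(duplicates)  # [['w0', 'w1']]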

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 2 additions & 0 deletions
@@ -259,6 +259,7 @@ def is_node_supported(
             exir_ops.edge.aten.masked_fill.Scalar,
             exir_ops.edge.aten.asinh.default,
             exir_ops.edge.aten.cosh.default,
+            exir_ops.edge.aten.glu.default,
         ]
 
         return supported
@@ -300,6 +301,7 @@ def is_node_supported(
             exir_ops.edge.aten.leaky_relu.default: None,
             exir_ops.edge.aten.round.default: None,
             exir_ops.edge.aten.addmm.default: None,
+            exir_ops.edge.aten.glu.default: None,
         }
 
         if node.target in needs_decomp_dict:

backends/arm/test/ops/test_glu.py

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+import torch.nn.functional as F
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU55PipelineINT,
+    EthosU85PipelineINT,
+    TosaPipelineFP,
+    TosaPipelineINT,
+    VgfPipeline,
+)
+
+aten_op = "torch.ops.aten.glu.default"
+exir_op = "executorch_exir_dialects_edge__ops_aten__glu_default"
+
+
+input_t1 = Tuple[torch.Tensor]
+
+test_data_suite = {
+    "zeros": [torch.zeros(10, 10, 2), -1],
+    "ones": [torch.ones(10, 10, 2), -1],
+    "rand": [torch.rand(10, 10, 2) - 0.5, -1],
+    "randn_pos": [torch.randn(10, 2) + 10, -1],
+    "randn_neg": [torch.randn(10, 2) - 10, -1],
+    "ramp": [torch.linspace(-16, 15.8, 160).reshape(-1, 2), -1],
+    "zeros_custom_dim": [torch.zeros(7, 10, 5), 1],
+    "rand_custom_dim": [torch.rand(10, 3, 3) - 0.5, 0],
+}
+
+
+class Glu(torch.nn.Module):
+
+    def forward(self, a: torch.Tensor, dim: int) -> torch.Tensor:
+        return F.glu(a, dim=dim)
+
+
+@common.parametrize(
+    "test_data",
+    test_data_suite,
+)
+def test_glu_tosa_FP(test_data: Tuple):
+    pipeline = TosaPipelineFP[input_t1](
+        Glu(),
+        (*test_data,),
+        aten_op,
+        exir_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize(
+    "test_data",
+    test_data_suite,
+)
+def test_glu_tosa_INT(test_data: Tuple):
+    pipeline = TosaPipelineINT[input_t1](
+        Glu(),
+        (*test_data,),
+        aten_op=[],
+        exir_op=exir_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize(
+    "test_data",
+    test_data_suite,
+)
+@common.XfailIfNoCorstone300
+def test_glu_u55_INT(test_data: Tuple):
+    pipeline = EthosU55PipelineINT[input_t1](
+        Glu(),
+        (*test_data,),
+        aten_ops=[],
+        exir_ops=exir_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize(
+    "test_data",
+    test_data_suite,
+)
+@common.XfailIfNoCorstone320
+def test_glu_u85_INT(test_data: Tuple):
+    pipeline = EthosU85PipelineINT[input_t1](
+        Glu(),
+        (*test_data,),
+        aten_ops=[],
+        exir_ops=exir_op,
+    )
+    pipeline.run()
+
+
+@common.parametrize(
+    "test_data",
+    test_data_suite,
+)
+@common.SkipIfNoModelConverter
+def test_glu_vgf_FP(test_data: input_t1):
+    pipeline = VgfPipeline[input_t1](
+        Glu(),
+        (*test_data,),
+        [],
+        [],
+        tosa_version="TOSA-1.0+FP",
+    )
+    pipeline.run()
+
+
+@common.parametrize(
+    "test_data",
+    test_data_suite,
+)
+@common.SkipIfNoModelConverter
+def test_glu_vgf_INT(test_data: input_t1):
+    pipeline = VgfPipeline[input_t1](
+        Glu(),
+        (*test_data,),
+        [],
+        [],
+        tosa_version="TOSA-1.0+INT",
+    )
+    pipeline.run()
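Assuming a source checkout of executorch with the Arm test dependencies installed, the new tests can be run selectively with pytest; the Ethos-U55/U85 and VGF variants are gated by the XfailIfNoCorstone*/SkipIfNoModelConverter decorators above, so they are expected to fail or be skipped when the Corstone FVPs or the model converter are not available:

python -m pytest backends/arm/test/ops/test_glu.py -k "tosa_FP or tosa_INT" -v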
