Commit 8c8a7df

Update
[ghstack-poisoned]
2 parents f121a8d + bcb0f18 commit 8c8a7df

87 files changed: +2647 / -874 lines changed


.ci/scripts/test_llama_lora.sh

Lines changed: 51 additions & 14 deletions
@@ -48,8 +48,17 @@ DOWNLOADED_PATH=$(
 --model_id "${HF_MODEL_REPO}" \
 --files "adapter_config.json" "adapter_model.pt" "consolidated.00.pth" "params.json" "tokenizer.model"
 )
-EXPORTED_MODEL_NAME="llama_3_2_1B_lora.pte"
-# Export model.
+# Build llama runner.
+cmake_install_executorch_libraries
+cmake_build_llama_runner
+
+# Constants.
+RUNTIME_ARGS="--tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
+PROMPT="What happens if you eat watermelon seeds?"
+EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
+
+# Export LoRA PTE file.
+MODEL_NAME="llama_3_2_1B_lora"
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
 base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
 base.params="${DOWNLOADED_PATH}/params.json" \
@@ -61,36 +70,64 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
 model.dtype_override="fp32" \
 backend.xnnpack.enabled=true \
 backend.xnnpack.extended_ops=true \
-export.output_name="${EXPORTED_MODEL_NAME}"
-
-# Build llama runner.
-cmake_install_executorch_libraries
-cmake_build_llama_runner
+export.output_name="${MODEL_NAME}.pte"
 
-PROMPT="What happens if you eat watermelon seeds?"
 # Run llama runner
-RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
-
 NOW=$(date +"%H:%M:%S")
 echo "Starting to run llama runner at ${NOW}"
 # shellcheck source=/dev/null
-cmake-out/examples/models/llama/llama_main --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_NAME}.pte --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
 NOW=$(date +"%H:%M:%S")
 echo "Finished at ${NOW}"
 
 RESULT=$(cat result.txt)
-EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
-
 if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
 echo "Expected result prefix: ${EXPECTED_PREFIX}"
 echo "Actual result: ${RESULT}"
+# Do not clean up files if test passes, as they're re-used in the next test.
 echo "Success"
-cleanup_files
 else
 echo "Expected result prefix: ${EXPECTED_PREFIX}"
 echo "Actual result: ${RESULT}"
 echo "Failure; results not the same"
+cleanup_files
+exit 1
+fi
 
+# Export LoRA PTE, PTD file.
+MODEL_SEPARATE="${MODEL_NAME}_separate"
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
+base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+base.params="${DOWNLOADED_PATH}/params.json" \
+base.adapter_checkpoint="${DOWNLOADED_PATH}/adapter_model.pt" \
+base.adapter_config="${DOWNLOADED_PATH}/adapter_config.json" \
+base.tokenizer_path="${DOWNLOADED_PATH}/tokenizer.model" \
+model.use_kv_cache=true \
+model.use_sdpa_with_kv_cache=true \
+model.dtype_override="fp32" \
+backend.xnnpack.enabled=true \
+backend.xnnpack.extended_ops=true \
+export.output_name="${MODEL_SEPARATE}.pte" \
+export.foundation_weights_file="${MODEL_SEPARATE}.ptd"
+
+# Run llama runner.
+NOW=$(date +"%H:%M:%S")
+echo "Starting to run llama runner at ${NOW}"
+# shellcheck source=/dev/null
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_SEPARATE}.pte --data_path=${MODEL_SEPARATE}.ptd --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
+NOW=$(date +"%H:%M:%S")
+echo "Finished at ${NOW}"
+
+RESULT2=$(cat result2.txt)
+if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
+echo "Expected result prefix: ${EXPECTED_PREFIX}"
+echo "Actual result: ${RESULT2}"
+echo "Success"
+cleanup_files
+else
+echo "Expected result prefix: ${EXPECTED_PREFIX}"
+echo "Actual result: ${RESULT2}"
+echo "Failure; results not the same"
 cleanup_files
 exit 1
 fi

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
@@ -315,7 +315,7 @@ jobs:
 bash examples/models/moshi/mimi/install_requirements.sh
 
 # reinstall executorch
-bash ./install_executorch.sh
+bash ./install_executorch.sh --minimal
 
 # run python unittest
 python -m unittest examples.models.moshi.mimi.test_mimi

.github/workflows/trunk.yml

Lines changed: 1 addition & 0 deletions
@@ -288,6 +288,7 @@ jobs:
 - test_arm_baremetal: test_models_tosa
 - test_arm_baremetal: test_models_ethos-u55
 - test_arm_baremetal: test_models_ethos-u85
+- test_arm_baremetal: test_smaller_stories_llama
 fail-fast: false
 with:
 runner: linux.2xlarge.memory

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@ pip-out/
 # Any exported models and profiling outputs
 *.bin
 *.model
+*.etdump
 tokenizer.json
 *.pte
 *.ptd
@@ -58,6 +59,7 @@ xcuserdata/
 /include/
 /share/
 /version.py
+*.csv
 
 # Android
 *.aar

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@
 from .decompose_div_pass import DecomposeDivPass # noqa
 from .decompose_embedding_pass import DecomposeEmbeddingPass # noqa # noqa
 from .decompose_gelu_pass import DecomposeGeluPass # noqa
+from .decompose_glu_pass import DecomposeGluPass # noqa
 from .decompose_grouped_conv import DecomposeGroupedConv # noqa
 from .decompose_groupnorm_pass import DecomposeGroupNormPass # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,7 @@
     DecomposeDivPass,
     DecomposeEmbeddingPass,
     DecomposeGeluPass,
+    DecomposeGluPass,
     DecomposeGroupedConv,
     DecomposeGroupNormPass,
     DecomposeLayerNormPass,
@@ -184,6 +185,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(FuseBatchnorm2DPass(exported_program))
         self.add_pass(ConvertMmToBmmPass())
+        self.add_pass(DecomposeGluPass())
         self.add_pass(DecomposeLinearPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeGroupNormPass())
@@ -264,6 +266,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeMeanDimPass(graph_module, self.tosa_spec))
         self.add_pass(DecomposeNotEqualPass())
         self.add_pass(DecomposeCosineSimilarityPass())
+        self.add_pass(DecomposeGluPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeLinearVectorNormPass())

backends/arm/_passes/decompose_glu_pass.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+# For FP case
+edge_glu = exir_ops.edge.aten.glu.default
+
+# For INT case
+aten_glu = torch.ops.aten.glu.default
+
+
+def get_ops(op):
+    """Returns the appropriate operator functions based on the input operator."""
+    if op == edge_glu:
+        return (
+            exir_ops.edge.aten.mul.Tensor,
+            exir_ops.edge.aten.sigmoid.default,
+            exir_ops.edge.aten.slice_copy.Tensor,
+        )
+    elif op == aten_glu:
+        return (
+            torch.ops.aten.mul.Tensor,
+            torch.ops.aten.sigmoid.default,
+            torch.ops.aten.slice_copy.Tensor,
+        )
+    else:
+        raise ValueError(f"Unsupported operator: {op}")
+
+
+class DecomposeGluPass(ArmPass):
+    """Decomposes the GLU operator into hadamard product and sigmoid."""
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in [edge_glu, aten_glu]:
+            return super().call_operator(op, args, kwargs, meta)
+
+        hadamard_prod, sigmoid, slice_op = get_ops(op)
+        X = args[0]
+
+        dim = args[1] if len(args) > 1 else kwargs.get("dim", -1)
+
+        if "val" not in X.node.meta:
+            raise Exception("Could not get dimension metadata in input.")
+
+        if dim < 0:
+            dim += X.node.meta["val"].dim()
+
+        n = X.node.meta["val"].size(dim)
+
+        if n % 2:
+            raise RuntimeError(
+                f"glu expects an even split along dim={dim}, got size {n}"
+            )
+
+        middle = n // 2
+
+        T1 = super().call_operator(
+            slice_op, (X, dim, 0, middle), {}, meta, updated=True
+        )
+
+        T2 = super().call_operator(
+            slice_op, (X, dim, middle, n), {}, meta, updated=True
+        )
+
+        T2_sigmoid = super().call_operator(sigmoid, (T2,), {}, meta, updated=True)
+
+        return super().call_operator(
+            hadamard_prod, (T1, T2_sigmoid), {}, meta, updated=True
+        )
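
For reference, the decomposition this new pass performs corresponds to the identity glu(x, dim) = a * sigmoid(b), where a and b are the first and second halves of x along dim. Below is a minimal standalone sketch of the same slice/sigmoid/mul sequence using plain aten ops, checked against torch.nn.functional.glu; the helper name glu_decomposed is illustrative and not part of the pass.

import torch

def glu_decomposed(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    # Mirror the pass: normalize dim, require an even split, then slice/sigmoid/mul.
    if dim < 0:
        dim += x.dim()
    n = x.size(dim)
    if n % 2:
        raise RuntimeError(f"glu expects an even split along dim={dim}, got size {n}")
    middle = n // 2
    t1 = torch.ops.aten.slice_copy.Tensor(x, dim, 0, middle)  # first half
    t2 = torch.ops.aten.slice_copy.Tensor(x, dim, middle, n)  # second half
    return torch.ops.aten.mul.Tensor(t1, torch.ops.aten.sigmoid.default(t2))

x = torch.randn(2, 8)
assert torch.allclose(glu_decomposed(x, dim=-1), torch.nn.functional.glu(x, dim=-1))

Because the decomposed form matches F.glu exactly, the pass can substitute it for the original operator and lower GLU through ops the backend already supports.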

backends/arm/_passes/fuse_equal_placeholders_pass.py

Lines changed: 43 additions & 44 deletions
@@ -3,6 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import hashlib
+from collections import defaultdict
+
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import (
     get_constant_placeholder_kind,
@@ -21,7 +24,7 @@ class FuseEqualPlaceholdersPass(ExportPass):
     """
     This pass optimizes memory usage by finding constant placeholders
     pointing to identical tensors and fusing them to one single placeholder
-    with multiple users.
+    with multiple users, using a cache for faster comparison.
     """
 
     def __init__(self, exported_program: ExportedProgram):
@@ -30,58 +33,54 @@ def __init__(self, exported_program: ExportedProgram):
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         modified = False
-        const_placeholder_nodes = []
-        for node in graph_module.graph.nodes:
-            if is_param_node(self.exported_program, node):
-                const_placeholder_nodes.append(node)
-
-        while const_placeholder_nodes:
 
-            # Find equal tensors
-            node1 = const_placeholder_nodes.pop()
-            eq_nodes = [node1]
-            tensor1 = get_param_tensor(self.exported_program, node1)
-            if tensor1 is None:
+        # Build a cache of params: mapping hash_key -> list of (node, tensor)
+        hash_buckets = defaultdict(list)
+        for node in graph_module.graph.nodes:
+            if not is_param_node(self.exported_program, node):
                 continue
+            tensor = get_param_tensor(self.exported_program, node)
+            if tensor is None:
+                continue
+            # Create a lightweight fingerprint: dtype + shape + SHA1 of raw bytes
+            # Ensure tensor is on CPU and contiguous
+            t_cpu = tensor.detach().cpu().contiguous()
+            data_bytes = t_cpu.numpy().tobytes()
+            key = (
+                str(t_cpu.dtype),
+                tuple(t_cpu.shape),
+                hashlib.sha1(data_bytes).hexdigest(),
+            )
+            hash_buckets[key].append((node, t_cpu))
 
-            for node2 in const_placeholder_nodes:
-                tensor2 = get_param_tensor(self.exported_program, node2)
-                if tensor2 is None:
-                    continue
-
-                if (
-                    tensor1.dtype == tensor2.dtype
-                    and tensor1.shape == tensor2.shape
-                    and torch.allclose(tensor1, tensor2, atol=1e-08)
-                ):
-                    eq_nodes.append(node2)
+        # For each bucket with more than one entry, fuse:
+        for nodes_tensors in hash_buckets.values():
+            if len(nodes_tensors) < 2:
+                continue
 
-            if len(eq_nodes) > 1:
-                common_name = node1.name + "_common"
-                common_kind = get_constant_placeholder_kind(
-                    self.exported_program, node1
+            # Create a new placeholder from first in list of equal placeholders.
+            rep_node, rep_tensor = nodes_tensors[0]
+            common_name = rep_node.name + "_common"
+            common_kind = get_constant_placeholder_kind(self.exported_program, rep_node)
+            common_persistent = True
+            with graph_module.graph.inserting_before(rep_node):
+                common_node = create_constant_placeholder(
+                    self.exported_program,
+                    graph_module.graph,
+                    common_name,
+                    common_kind,
+                    rep_tensor,
+                    common_persistent,
                 )
-                common_persisten_buffer = True
-
-                with graph_module.graph.inserting_before(node1):
-                    common_node = create_constant_placeholder(
-                        self.exported_program,
-                        graph_module.graph,
-                        common_name,
-                        common_kind,
-                        tensor1,
-                        common_persisten_buffer,
-                    )
-
-                for eq_node in eq_nodes:
-                    eq_node.replace_all_uses_with(common_node)
-                    delete_constant_placeholder(self.exported_program, eq_node)
-                    if eq_node != node1:
-                        const_placeholder_nodes.remove(eq_node)
 
+            # Replace uses and delete duplicates
+            for node, _ in nodes_tensors:
+                node.replace_all_uses_with(common_node)
+                delete_constant_placeholder(self.exported_program, node)
             modified = True
 
         if modified:
             graph_module.recompile()
             graph_module = super().call(graph_module).graph_module
+
         return PassResult(graph_module=graph_module, modified=modified)
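
The key change above is the fingerprinting step: instead of comparing every pair of placeholder tensors with torch.allclose, placeholders are bucketed by a (dtype, shape, SHA1-of-raw-bytes) key, so only byte-identical tensors land in the same bucket and pairwise comparison is avoided. A minimal standalone sketch of that bucketing on plain named tensors (the function name bucket_equal_tensors is illustrative, not part of the pass):

import hashlib
from collections import defaultdict

import torch

def bucket_equal_tensors(named_tensors):
    """Group tensor names whose dtype, shape, and raw bytes match exactly."""
    buckets = defaultdict(list)
    for name, tensor in named_tensors.items():
        t_cpu = tensor.detach().cpu().contiguous()
        key = (
            str(t_cpu.dtype),
            tuple(t_cpu.shape),
            hashlib.sha1(t_cpu.numpy().tobytes()).hexdigest(),
        )
        buckets[key].append(name)
    # Buckets with more than one entry are candidates for fusing into one placeholder.
    return [names for names in buckets.values() if len(names) > 1]

weights = {
    "w1": torch.ones(4, 4),
    "w2": torch.ones(4, 4),  # byte-identical to w1 -> same bucket
    "w3": torch.zeros(4, 4),
}
print(bucket_equal_tensors(weights))  # [['w1', 'w2']]

Note that hashing raw bytes is stricter than the previous torch.allclose(..., atol=1e-08) comparison: tensors that are merely close but not bit-identical will no longer be fused.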

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 2 additions & 0 deletions
@@ -258,6 +258,7 @@ def is_node_supported(
             exir_ops.edge.aten.masked_fill.Scalar,
             exir_ops.edge.aten.asinh.default,
             exir_ops.edge.aten.cosh.default,
+            exir_ops.edge.aten.glu.default,
         ]
 
         return supported
@@ -299,6 +300,7 @@ def is_node_supported(
             exir_ops.edge.aten.leaky_relu.default: None,
             exir_ops.edge.aten.round.default: None,
             exir_ops.edge.aten.addmm.default: None,
+            exir_ops.edge.aten.glu.default: None,
         }
 
         if node.target in needs_decomp_dict:
