pytorch
diff --git a/‎.ci/scripts/gather_benchmark_configs.py‎
Lines changed: 1 addition & 0 deletions b/‎.ci/scripts/gather_benchmark_configs.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/android-perf-private-device-experiment.yml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/android-perf-private-device-experiment.yml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.github/workflows/pull.yml‎
Lines changed: 14 additions & 13 deletions b/‎.github/workflows/pull.yml‎
Lines changed: 14 additions & 13 deletions
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 15 additions & 29 deletions b/‎.github/workflows/trunk.yml‎
Lines changed: 15 additions & 29 deletions
diff --git a/‎backends/arm/TARGETS‎
Lines changed: 2 additions & 0 deletions b/‎backends/arm/TARGETS‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 3 additions & 0 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎backends/arm/_passes/fuse_equal_placeholders_pass.py‎
Lines changed: 83 additions & 0 deletions b/‎backends/arm/_passes/fuse_equal_placeholders_pass.py‎
Lines changed: 83 additions & 0 deletions
@@ -21,6 +21,7 @@
     "apple_iphone_15": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/3b5acd2e-92e2-4778-b651-7726bafe129d",
     "apple_iphone_15+ios_18": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/12c8b15c-8d03-4e07-950d-0a627e7595b4",
     "samsung_galaxy_s22": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa",
+    "samsung_galaxy_s22_private": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/ea6b049d-1508-4233-9a56-5d9eacbe1078",
     "samsung_galaxy_s24": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db",
     "google_pixel_8_pro": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a",
     "google_pixel_3_private_rooted": "arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98d23ca8-ea9e-4fb7-b725-d402017b198d",
 
@@ -23,7 +23,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: google_pixel_3_private_rooted
+        default: samsung_galaxy_s22_private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -39,7 +39,7 @@ on:
         description: Target devices to run benchmark
         required: false
         type: string
-        default: google_pixel_3_private_rooted
+        default: samsung_galaxy_s22_private
       benchmark_configs:
         description: The list of configs used the benchmark
         required: false
@@ -58,5 +58,5 @@ jobs:
       contents: read
     with:
       models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
-      devices: google_pixel_3_private_rooted
+      devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}
@@ -481,37 +481,38 @@ jobs:
       build-tool: buck2
       docker-image: executorch-ubuntu-22.04-clang12
 
-  unittest-arm:
+  unittest-arm-backend-with-no-fvp:
+    name: unittest-arm-backend-with-no-fvp
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
+    strategy:
+      matrix:
+        include:
+          - test_arm_baremetal: test_pytest_ops
+          - test_arm_baremetal: test_pytest_models
+      fail-fast: false
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
       script: |
-        set -eux
-
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
 
-        BUILD_TOOL="cmake"
-
-        # Setup MacOS dependencies as there is no Docker support on MacOS atm
-        PYTHON_EXECUTABLE=python \
-        CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON" \
-        EXECUTORCH_BUILD_ARM_BAREMETAL=ON \
-        .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}"
+        source .ci/scripts/utils.sh
+        install_executorch "--use-pt-pinned-commit"
 
-        # Install Arm dependencies
         .ci/scripts/setup-arm-baremetal-tools.sh
 
-        # Run pytest without simulator
-        backends/arm/test/test_arm_baremetal.sh test_pytest
+        ARM_TEST=${{ matrix.test_arm_baremetal }}
+
+        # Test test_arm_baremetal.sh with test
+        backends/arm/test/test_arm_baremetal.sh "${ARM_TEST}"
 
   test-llama-runner-qnn-linux:
     name: test-llama-runner-qnn-linux
 
@@ -176,12 +176,22 @@ jobs:
         # Test selective build
         PYTHON_EXECUTABLE=python bash examples/portable/scripts/test_demo_backend_delegation.sh "${BUILD_TOOL}"
 
-  test-arm-backend-delegation:
-    name: test-arm-backend-delegation
+  test-arm-backend:
+    name: test-arm-backend
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
+    strategy:
+      matrix:
+        include:
+          - test_arm_baremetal: test_pytest_ops_ethosu_fvp
+          - test_arm_baremetal: test_pytest_models_ethosu_fvp
+          - test_arm_baremetal: test_run_ethosu_fvp
+          - test_arm_baremetal: test_models_tosa
+          - test_arm_baremetal: test_models_ethos-u55
+          - test_arm_baremetal: test_models_ethos-u85
+      fail-fast: false
     with:
       runner: linux.2xlarge.memory
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -202,34 +212,10 @@ jobs:
         # Hopefully this is high enough for this setup.
         sudo sysctl fs.inotify.max_user_watches=1048576 # 1024 * 1024
 
-        # Test ethos-u delegate examples with run.sh
-        backends/arm/test/test_arm_baremetal.sh test_full_ethosu_fvp
-
-
-  test-arm-reference-delegation:
-    name: test-arm-reference-delegation
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    permissions:
-      id-token: write
-      contents: read
-    with:
-      runner: linux.2xlarge.memory
-      docker-image: executorch-ubuntu-22.04-arm-sdk
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        source .ci/scripts/utils.sh
-        install_executorch "--use-pt-pinned-commit"
-
-        .ci/scripts/setup-arm-baremetal-tools.sh
+        ARM_TEST=${{ matrix.test_arm_baremetal }}
 
-        # Run arm unit tests using the simulator
-        backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp
+        # Test test_arm_baremetal.sh with test
+        backends/arm/test/test_arm_baremetal.sh "${ARM_TEST}"
 
   test-arm-cortex-m-size-test:
     name: test-arm-cortex-m-size-test
 
@@ -7,6 +7,8 @@ python_library(
         "ethosu_partitioner.py",
         "tosa_backend.py",
         "tosa_partitioner.py",
+        "vgf_backend.py",
+        "vgf_partitioner.py",
     ],
     deps = [
         ":arm_backend",
 
@@ -39,6 +39,7 @@
 )
 from .fuse_batchnorm2d_pass import FuseBatchnorm2DPass  # noqa
 from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass  # noqa
+from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass  # noqa
 from .fuse_quantized_activation_pass import FuseQuantizedActivationPass  # noqa
 from .insert_rescales_pass import InsertRescalePass  # noqa
 from .insert_table_ops import InsertTableOpsPass  # noqa
 
@@ -40,6 +40,7 @@
     FoldAndAnnotateQParamsPass,
     FuseBatchnorm2DPass,
     FuseConstantArgsPass,
+    FuseEqualPlaceholdersPass,
     FuseQuantizedActivationPass,
     InsertRescalePass,
     InsertTableOpsPass,
@@ -113,6 +114,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(FuseConstantArgsPass(exported_program))
 
         self.add_pass(InsertTableOpsPass(exported_program))
+        self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(AnnotateChannelsLastDimOrder())
         self.add_pass(InsertRescalePass())
 
@@ -164,6 +166,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(FuseViewCopyTransform())
         self.add_pass(FuseConstantArgsPass(exported_program))
         self.add_pass(InsertTableOpsPass(exported_program))
+        self.add_pass(FuseEqualPlaceholdersPass(exported_program))
         self.add_pass(AnnotateChannelsLastDimOrder())
         self.add_pass(InsertRescalePass())
 
 
@@ -0,0 +1,83 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import (
+    get_constant_placeholder_kind,
+    get_param_tensor,
+    is_param_node,
+)
+from executorch.backends.transforms.utils import (
+    create_constant_placeholder,
+    delete_constant_placeholder,
+)
+from executorch.exir import ExportedProgram
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class FuseEqualPlaceholdersPass(ExportPass):
+    """
+    This pass optimizes memory usage by finding constant placeholders
+    pointing to identical tensors and fusing them to one single placeholder
+    with multiple users.
+    """
+
+    def __init__(self, exported_program: ExportedProgram):
+        self.exported_program = exported_program
+        super().__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        modified = False
+        const_placeholder_nodes = []
+        for node in graph_module.graph.nodes:
+            if is_param_node(self.exported_program, node):
+                const_placeholder_nodes.append(node)
+
+        while const_placeholder_nodes:
+
+            # Find equal tensors
+            node1 = const_placeholder_nodes.pop()
+            eq_nodes = [node1]
+            tensor1 = get_param_tensor(self.exported_program, node1)
+            if tensor1 is None:
+                continue
+
+            for node2 in const_placeholder_nodes:
+                tensor2 = get_param_tensor(self.exported_program, node2)
+                if tensor2 is None:
+                    continue
+
+                if torch.equal(tensor1, tensor2):
+                    eq_nodes.append(node2)
+
+            if len(eq_nodes) > 1:
+                common_name = node1.name + "_common"
+                common_kind = get_constant_placeholder_kind(
+                    self.exported_program, node1
+                )
+                common_persisten_buffer = True
+
+                with graph_module.graph.inserting_before(node1):
+                    common_node = create_constant_placeholder(
+                        self.exported_program,
+                        graph_module.graph,
+                        common_name,
+                        common_kind,
+                        tensor1,
+                        common_persisten_buffer,
+                    )
+
+                for eq_node in eq_nodes:
+                    eq_node.replace_all_uses_with(common_node)
+                    delete_constant_placeholder(self.exported_program, eq_node)
+                    if eq_node != node1:
+                        const_placeholder_nodes.remove(eq_node)
+
+                modified = True
+
+        if modified:
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module=graph_module, modified=modified)
Original file line number	Diff line number	Diff line change
`@@ -39,6 +39,7 @@`
`39`	`39`	`)`
`40`	`40`	`from .fuse_batchnorm2d_pass import FuseBatchnorm2DPass # noqa`
`41`	`41`	`from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass # noqa`
	`42`	`+from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass # noqa`
`42`	`43`	`from .fuse_quantized_activation_pass import FuseQuantizedActivationPass # noqa`
`43`	`44`	`from .insert_rescales_pass import InsertRescalePass # noqa`
`44`	`45`	`from .insert_table_ops import InsertTableOpsPass # noqa`