From fbfc2b5bba14f7ba68f06345e1915d13ea299302 Mon Sep 17 00:00:00 2001
From: Yufeng Shi <yufeng.shi@arm.com>
Date: Mon, 1 Sep 2025 18:22:14 +0100
Subject: [PATCH] Arm backend: Add pass to handle int64 placeholders

- Add InsertInt32CastsAfterInt64PlaceholdersPass to insert an
  int64->int32 cast node after each int64 placeholder.
- Deprecate InsertCastForOpsWithInt64InputPass in favor of
  InsertInt32CastsAfterInt64PlaceholdersPass and replace its usages.

Change-Id: Ic092f57b56d1bab0e82205e5dd129c49e50862c0
Signed-off-by: Yufeng Shi <yufeng.shi@arm.com>
---
 backends/arm/README.md                        |  21 +--
 backends/arm/_passes/__init__.py              |   4 +-
 backends/arm/_passes/arm_pass_manager.py      |   4 +-
 ...rt_int32_casts_after_int64_placeholders.py | 122 ++++++++++++++++++
 .../_passes/insert_int64_input_cast_pass.py   | 109 ----------------
 .../test_CLIPTextModelWithProjection.py       |   9 +-
 .../test_SD3Transformer2DModel.py             |  18 ++-
 .../stable_diffusion/test_T5EncoderModel.py   |  20 +--
 backends/arm/test/models/test_llama.py        |   6 +-
 backends/arm/test/ops/test_embedding.py       |   6 +-
 ...32_casts_after_int64_placeholders_pass.py} |   4 +-
 11 files changed, 173 insertions(+), 150 deletions(-)
 create mode 100644 backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py
 delete mode 100644 backends/arm/_passes/insert_int64_input_cast_pass.py
 rename backends/arm/test/passes/{test_insert_int64_to_int32_cast_pass.py => test_insert_int32_casts_after_int64_placeholders_pass.py} (89%)

diff --git a/backends/arm/README.md b/backends/arm/README.md
index 3151e461730..d64424436b4 100644
--- a/backends/arm/README.md
+++ b/backends/arm/README.md
@@ -206,14 +206,6 @@ The current TOSA version does not support int64. However, int64 is commonly used
 - For quantized models, these transformations will be automatically handled during annotation before the export stage.
 
 List of model specific and optional passes:
-- InsertCastForOpsWithInt64InputPass
-    - Functionality:
-        - For LLMs such as LLama, some opeartors like aten.embedding have int64 input. In order to lower these operators to TOSA, this pass will insert a casting node that converts the input from int64 to int32.
-    - Supported Ops:
-        - aten.embedding.default, aten.slice_copy.Tensor
-    - Example usage:
-        - backends/arm/test/models/test_llama.py
-
 - ConvertInt64ConstOpsToInt32Pass
     - Functionalities:
       - Rewrites constant-producing ops that output int64 to instead output int32, when values are within int32 bounds.
@@ -244,3 +236,16 @@ List of model specific and optional passes:
     - Example usage:
       - (Functionality 1) backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
       - (Functionality 2) backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+
+- InsertInt32CastsAfterInt64PlaceholdersPass
+    - Functionalities:
+      - Inserts an int64 -> int32 cast immediately after each int64 placeholder (graph input).
+      - Redirects all uses of each int64 placeholder to its int32 cast output.
+      - Inserts local int32 -> int64 casts at call sites where an operator requires int64 inputs, e.g. `torch.nn.functional.one_hot`.
+    - Pass ordering:
+      - When used with `ConvertInt64ConstOpsToInt32Pass` and `ConvertInt64OutputOpsToInt32Pass`, run this pass last.
+      - Rationale: Those passes may cause retracing to re-infer some int64 placeholders as int32. Running this pass last casts only inputs that remain int64, minimizing inserted casts.
+    - Example usage:
+      - backends/arm/test/models/test_llama.py
+      - backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+      - backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
index f033e0d5322..f9e23f73cc5 100644
--- a/backends/arm/_passes/__init__.py
+++ b/backends/arm/_passes/__init__.py
@@ -75,8 +75,8 @@
 from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass  # noqa
 from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass  # noqa
 from .fuse_quantized_activation_pass import FuseQuantizedActivationPass  # noqa
-from .insert_int64_input_cast_pass import (  # noqa  # noqa
-    InsertCastForOpsWithInt64InputPass,
+from .insert_int32_casts_after_int64_placeholders import (  # noqa
+    InsertInt32CastsAfterInt64PlaceholdersPass,
 )
 from .insert_rescales_pass import InsertRescalePass  # noqa
 from .insert_table_ops import InsertTableOpsPass  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index c26cd8fb078..81e73e93842 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -76,7 +76,7 @@
     FuseConstantArgsPass,
     FuseEqualPlaceholdersPass,
     FuseQuantizedActivationPass,
-    InsertCastForOpsWithInt64InputPass,
+    InsertInt32CastsAfterInt64PlaceholdersPass,
     InsertRescalePass,
     InsertTableOpsPass,
     MatchArgDtypePass,
@@ -277,7 +277,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         )  # ConvertInt64ConstOpsToInt32Pass requires this pass to remove the assertation in Graph
         self.add_pass(ConvertInt64ConstOpsToInt32Pass())
         self.add_pass(ConvertInt64OutputOpsToInt32Pass())
-        self.add_pass(InsertCastForOpsWithInt64InputPass())
+        self.add_pass(InsertInt32CastsAfterInt64PlaceholdersPass())
         self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoundPass())
diff --git a/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py b/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py
new file mode 100644
index 00000000000..4b619af790c
--- /dev/null
+++ b/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py
@@ -0,0 +1,122 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+
+import logging
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import EdgeOpOverload, ExportPass, PassResult
+from torch._subclasses.fake_tensor import FakeTensor
+
+
+logger = logging.getLogger(__name__)
+
+
+class InsertInt32CastsAfterInt64PlaceholdersPass(ExportPass):
+    """
+    Insert an int64->int32 cast after each int64 placeholder.
+
+    Note: Overflow checks are not applied in this pass. It is the user's responsibility to ensure that values fit within
+    the int32 range.
+    """
+
+    # Ops that require i64 inputs → positions of args to upcast.
+    # Key: op overload; Value: zero-based indices of positional args that must be i64.
+    I64_INPUT_ARG_POSITIONS = {
+        torch.ops.aten.one_hot.default: (0,),
+    }
+
+    def _insert_callsite_i32_to_i64_casts(self, graph_module: torch.fx.GraphModule):
+        """
+        If an operator requires int64 inputs but dtype propagation (via call_operator)
+        produced int32, insert a local int32→int64 cast at the call site to satisfy
+        PyTorch's operator input validation.
+        """
+        modified = False
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.op != "call_function":
+                continue
+            if node.target not in self.I64_INPUT_ARG_POSITIONS:
+                continue
+
+            with graph.inserting_before(node):
+                arg_positions = self.I64_INPUT_ARG_POSITIONS.get(node.target)
+                args_list = list(node.args)
+                for pos in arg_positions:  # type: ignore[union-attr]
+                    input_arg = args_list[pos]
+                    to_copy_op = self._get_decomposition(graph)
+                    cast_node = graph_module.graph.create_node(
+                        "call_function",
+                        to_copy_op,
+                        (input_arg,),
+                        {"dtype": torch.int64},
+                    )
+                    cast_node.meta["val"] = node.meta["val"].to(torch.int64)
+                    args_list[pos] = cast_node
+                node.args = tuple(args_list)
+                modified = True
+        return modified
+
+    def _graph_uses_edge_ops(self, graph: torch.fx.Graph) -> bool:
+        for n in graph.nodes:
+            if n.op == "call_function":
+                if isinstance(n.target, EdgeOpOverload):
+                    return True
+        return False
+
+    def _get_decomposition(self, graph: torch.fx.Graph):
+        if self._graph_uses_edge_ops(graph):
+            return exir_ops.edge.dim_order_ops._to_dim_order_copy.default
+        else:
+            return torch.ops.dim_order_ops._to_dim_order_copy.default
+
+    def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool:
+        return isinstance(node_val, FakeTensor) and node_val.dtype == dtype
+
+    def _insert_placeholder_i64_to_i32_casts(self, graph_module: torch.fx.GraphModule):
+        modified = False
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.op != "placeholder":
+                continue
+            node_val = node.meta["val"]
+            if not self._is_tensor_of_dtype(node_val, torch.int64):
+                continue
+
+            to_copy_op = self._get_decomposition(graph)
+            with graph.inserting_after(node):
+                cast_after = create_node(
+                    graph,
+                    to_copy_op,
+                    args=(node,),
+                    kwargs={
+                        "dtype": torch.int32,
+                    },
+                )
+                users = [user for user in node.users if user != cast_after]
+                for user in users:
+                    user.replace_input_with(node, cast_after)
+                logger.warning(
+                    f"Inserting a casting node {cast_after.name} after {node.name} to cast int64 placeholder"
+                    f" to int32 for {node.name} defined in {node.meta.get('stack_trace','[no stack trace found]')}"
+                )
+                modified = True
+        return modified
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        modified = False
+        modified |= self._insert_placeholder_i64_to_i32_casts(graph_module)
+        modified |= self._insert_callsite_i32_to_i64_casts(graph_module)
+
+        if modified:
+            graph_module.graph.eliminate_dead_code()
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, modified)
diff --git a/backends/arm/_passes/insert_int64_input_cast_pass.py b/backends/arm/_passes/insert_int64_input_cast_pass.py
deleted file mode 100644
index 9577c920c1c..00000000000
--- a/backends/arm/_passes/insert_int64_input_cast_pass.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# pyre-unsafe
-
-
-import logging
-
-import torch
-from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass, PassResult
-
-from .arm_pass_utils import create_node, get_first_fake_tensor
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
-
-
-class InsertCastForOpsWithInt64InputPass(ExportPass):
-
-    aten_ops = (
-        torch.ops.aten.embedding.default,
-        torch.ops.aten.slice_copy.Tensor,
-    )
-    edge_ops = (
-        exir_ops.edge.aten.embedding.default,
-        exir_ops.edge.aten.slice_copy.Tensor,
-    )
-
-    def get_decomposition(self, op):
-        if op in self.edge_ops:
-            return exir_ops.edge.dim_order_ops._to_dim_order_copy.default
-
-        if op in self.aten_ops:
-            return torch.ops.dim_order_ops._to_dim_order_copy.default
-
-        raise RuntimeError(
-            f"[{self.__class__.__name__}] Can't get decomposition for op {op}"
-        )
-
-    def _check_aten_embedding_within_int32(self, weights, indices, node: torch.fx.Node):
-        weights_shape = get_first_fake_tensor(weights).shape
-        vocab_size = weights_shape[0]
-
-        # Essentially output = weight[indices] which means 0 <= indices[i] < vocab_size
-        # So should be good if vocab size or number embeddings is below max int32
-        if vocab_size >= torch.iinfo(torch.int32).max:
-            logger.warning(
-                f"[{node.name}] has size ({vocab_size}) that exceeds int32 limit,"
-                "so aten.embedding will not be lowered to TOSA."
-            )
-            return False
-
-        return True
-
-    def _insert_int32_cast_before_node(self, graph, node, original_input):
-        to_dim_order_copy_op = self.get_decomposition(node.target)
-        with graph.inserting_before(node):
-            cast_before = create_node(
-                graph,
-                to_dim_order_copy_op,
-                args=(original_input,),
-                kwargs={
-                    "dtype": torch.int32,
-                },
-            )
-            node.replace_input_with(original_input, cast_before)
-
-    def call(self, graph_module):
-        graph = graph_module.graph
-        modified_graph = False
-
-        for node in list(graph.nodes):
-            if node.op != "call_function":
-                continue
-            if node.target not in self.aten_ops + self.edge_ops:
-                continue
-
-            args = node.args
-
-            if node.target in (
-                exir_ops.edge.aten.embedding.default,
-                torch.ops.aten.embedding.default,
-            ):
-                weights = args[0]
-                indices = args[1]
-                if self._check_aten_embedding_within_int32(weights, indices, node):
-                    self._insert_int32_cast_before_node(graph, node, indices)
-                    modified_graph = True
-
-            elif node.target in (
-                exir_ops.edge.aten.slice_copy.Tensor,
-                torch.ops.aten.slice_copy.Tensor,
-            ):
-                # MLETORCH-829: Add range check for slice_copy
-                input_tensor = args[0]
-                fake_tensor = input_tensor.meta["val"]
-                if fake_tensor.dtype != torch.int64:
-                    continue
-
-                self._insert_int32_cast_before_node(graph, node, input_tensor)
-                modified_graph = True
-
-        if modified_graph:
-            graph_module.recompile()
-            graph_module = super().call(graph_module).graph_module
-        return PassResult(graph_module, modified_graph)
diff --git a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
index aa0f194590c..0e99f3f5bfa 100644
--- a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+++ b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
@@ -11,7 +11,7 @@
 from executorch.backends.arm._passes import (
     ConvertInt64ConstOpsToInt32Pass,
     ConvertInt64OutputOpsToInt32Pass,
-    InsertCastForOpsWithInt64InputPass,
+    InsertInt32CastsAfterInt64PlaceholdersPass,
 )
 
 from executorch.backends.arm.test import common
@@ -33,10 +33,9 @@ class TestCLIPTextModelWithProjection(unittest.TestCase):
     # for that is some assert ops are removed by passes in the
     # .to_executorch step, i.e. after Arm partitioner.
     ops_after_partitioner = {
-        "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 3,
-        "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1,
         "executorch_exir_dialects_edge__ops_aten_argmax_default": 1,
-        "torch.ops.higher_order.executorch_call_delegate": 1,
+        "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2,
+        "torch.ops.higher_order.executorch_call_delegate": 2,
     }
 
     def _prepare_inputs(
@@ -71,9 +70,9 @@ def test_CLIPTextModelWithProjection_tosa_FP(self):
                     example_inputs=text_encoder_model_inputs,
                     compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
                     transform_passes=[
-                        InsertCastForOpsWithInt64InputPass(),
                         ConvertInt64ConstOpsToInt32Pass(),
                         ConvertInt64OutputOpsToInt32Pass(),
+                        InsertInt32CastsAfterInt64PlaceholdersPass(),
                     ],
                 )
                 .export()
diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
index 880dc17166d..f9d814d044b 100644
--- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py
@@ -22,11 +22,8 @@ class TestSD3Transformer2DModel(unittest.TestCase):
     SD3Transformer2DModel is the transformer model used by Stable Diffusion 3.5 Medium
     """
 
-    # Adjust nbr below as we increase op support. Note: most of the delegates
-    # calls are directly consecutive to each other in the .pte. The reason
-    # for that is some assert ops are removed by passes in the
-    # .to_executorch step, i.e. after Arm partitioner.
-    ops_after_partitioner = {
+    # Adjust nbr below as we increase op support.
+    ops_after_partitioner_FP = {
         "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
         "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 1,
         "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2,
@@ -34,6 +31,13 @@ class TestSD3Transformer2DModel(unittest.TestCase):
         "torch.ops.higher_order.executorch_call_delegate": 1,
     }
 
+    ops_after_partitioner_INT = {
+        "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1,
+        "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2,
+        "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2,
+        "torch.ops.higher_order.executorch_call_delegate": 2,
+    }
+
     def _prepare_inputs(
         self,
         batch_size=2,
@@ -102,7 +106,7 @@ def test_SD3Transformer2DModel_tosa_FP(self):
                 )
                 .export()
                 .to_edge_transform_and_lower()
-                .check_count(self.ops_after_partitioner)
+                .check_count(self.ops_after_partitioner_FP)
                 .to_executorch()
                 .run_method_and_compare_outputs(
                     inputs=sd35_transformer2D_model_inputs,
@@ -125,7 +129,7 @@ def test_SD3Transformer2DModel_tosa_INT(self):
                 .quantize()
                 .export()
                 .to_edge_transform_and_lower()
-                .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+                .check_count(self.ops_after_partitioner_INT)
                 .to_executorch()
                 .run_method_and_compare_outputs(
                     inputs=sd35_transformer2D_model_inputs,
diff --git a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
index 7c1a45f27cb..22a47042eb1 100644
--- a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
+++ b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
@@ -10,7 +10,7 @@
 from executorch.backends.arm._passes import (
     ConvertInt64ConstOpsToInt32Pass,
     ConvertInt64OutputOpsToInt32Pass,
-    InsertCastForOpsWithInt64InputPass,
+    InsertInt32CastsAfterInt64PlaceholdersPass,
 )
 
 from executorch.backends.arm.test import common
@@ -27,16 +27,17 @@ class TestT5EncoderModel(unittest.TestCase):
     T5EncoderModel is one of the text_encoder used by Stable Diffusion 3.5 Medium
     """
 
-    # Adjust nbr below as we increase op support. Note: most of the delegates
-    # calls are directly consecutive to each other in the .pte. The reason
-    # for that is some assert ops are removed by passes in the
-    # .to_executorch step, i.e. after Arm partitioner.
-    ops_after_partitioner = {
+    # Adjust nbr below as we increase op support.
+    ops_after_partitioner_FP = {
         "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2,
-        "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1,
         "torch.ops.higher_order.executorch_call_delegate": 2,
     }
 
+    ops_after_partitioner_INT = {
+        "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 3,
+        "torch.ops.higher_order.executorch_call_delegate": 3,
+    }
+
     def _prepare_inputs(
         self,
         batch_size=12,
@@ -69,15 +70,15 @@ def test_T5EncoderModel_tosa_FP(self):
                     example_inputs=t5_encoder_model_inputs,
                     compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"),
                     transform_passes=[
-                        InsertCastForOpsWithInt64InputPass(),
                         ConvertInt64ConstOpsToInt32Pass(),
                         ConvertInt64OutputOpsToInt32Pass(),
+                        InsertInt32CastsAfterInt64PlaceholdersPass(),
                     ],
                 )
                 .export()
                 .to_edge_transform_and_lower()
                 .dump_operator_distribution()
-                .check_count(self.ops_after_partitioner)
+                .check_count(self.ops_after_partitioner_FP)
                 .to_executorch()
                 .run_method_and_compare_outputs(
                     inputs=t5_encoder_model_inputs,
@@ -97,6 +98,7 @@ def test_T5EncoderModel_tosa_INT(self):
                 .export()
                 .to_edge_transform_and_lower()
                 .dump_operator_distribution()
+                .check_count(self.ops_after_partitioner_INT)
                 .to_executorch()
                 .run_method_and_compare_outputs(
                     inputs=t5_encoder_model_inputs,
diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py
index 7732943d5fb..d47398be3b0 100644
--- a/backends/arm/test/models/test_llama.py
+++ b/backends/arm/test/models/test_llama.py
@@ -15,7 +15,7 @@
 
 import pytest
 import torch
-from executorch.backends.arm._passes import InsertCastForOpsWithInt64InputPass
+from executorch.backends.arm._passes import InsertInt32CastsAfterInt64PlaceholdersPass
 
 from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -112,7 +112,7 @@ def test_llama_tosa_FP():
             aten_op=[],
             exir_op=[],
             use_to_edge_transform_and_lower=True,
-            transform_passes=[InsertCastForOpsWithInt64InputPass()],
+            transform_passes=[InsertInt32CastsAfterInt64PlaceholdersPass()],
         )
         pipeline.run()
 
@@ -149,6 +149,7 @@ def test_llama_vgf_FP():
             exir_op=[],
             tosa_version="TOSA-1.0+FP",
             use_to_edge_transform_and_lower=True,
+            transform_passes=[InsertInt32CastsAfterInt64PlaceholdersPass()],
         )
         pipeline.run()
 
@@ -168,6 +169,5 @@ def test_llama_vgf_INT():
             exir_op=[],
             tosa_version="TOSA-1.0+INT",
             use_to_edge_transform_and_lower=True,
-            transform_passes=[InsertCastForOpsWithInt64InputPass()],
         )
         pipeline.run()
diff --git a/backends/arm/test/ops/test_embedding.py b/backends/arm/test/ops/test_embedding.py
index cb3983bd364..901fbbc0916 100644
--- a/backends/arm/test/ops/test_embedding.py
+++ b/backends/arm/test/ops/test_embedding.py
@@ -8,7 +8,7 @@
 
 import pytest
 import torch
-from executorch.backends.arm._passes import InsertCastForOpsWithInt64InputPass
+from executorch.backends.arm._passes import InsertInt32CastsAfterInt64PlaceholdersPass
 
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import (
@@ -68,7 +68,7 @@ def test_embedding_tosa_FP(test_input: input_params):
         op.aten_op,
         op.exir_op,
         use_to_edge_transform_and_lower=True,
-        transform_passes=[InsertCastForOpsWithInt64InputPass()],
+        transform_passes=[InsertInt32CastsAfterInt64PlaceholdersPass()],
     )
     pipeline.run()
 
@@ -101,7 +101,7 @@ def test_embedding_vgf_FP(test_input: input_params):
         op.exir_op,
         tosa_version="TOSA-1.0+FP",
         use_to_edge_transform_and_lower=True,
-        transform_passes=[InsertCastForOpsWithInt64InputPass()],
+        transform_passes=[InsertInt32CastsAfterInt64PlaceholdersPass()],
     )
     pipeline.run()
 
diff --git a/backends/arm/test/passes/test_insert_int64_to_int32_cast_pass.py b/backends/arm/test/passes/test_insert_int32_casts_after_int64_placeholders_pass.py
similarity index 89%
rename from backends/arm/test/passes/test_insert_int64_to_int32_cast_pass.py
rename to backends/arm/test/passes/test_insert_int32_casts_after_int64_placeholders_pass.py
index 6125e9b01cc..efc1bebb610 100644
--- a/backends/arm/test/passes/test_insert_int64_to_int32_cast_pass.py
+++ b/backends/arm/test/passes/test_insert_int32_casts_after_int64_placeholders_pass.py
@@ -6,7 +6,7 @@
 from typing import Tuple
 
 import torch
-from executorch.backends.arm._passes import InsertCastForOpsWithInt64InputPass
+from executorch.backends.arm._passes import InsertInt32CastsAfterInt64PlaceholdersPass
 
 from executorch.backends.arm.test.tester.test_pipeline import PassPipeline
 
@@ -40,7 +40,7 @@ def test_int64_model_tosa_FP():
         module.get_inputs(),
         ops_before_pass=op_checks_before,
         ops_after_pass=op_checks_after,
-        pass_list=[InsertCastForOpsWithInt64InputPass],
+        pass_list=[InsertInt32CastsAfterInt64PlaceholdersPass],
     )
     pipeline.pop_stage(-1)  # Do not compare output
     pipeline.run()