pytorch
diff --git a/‎.github/workflows/pull.yml‎
Lines changed: 7 additions & 0 deletions b/‎.github/workflows/pull.yml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎backends/apple/coreml/recipes/coreml_recipe_provider.py‎
Lines changed: 5 additions & 2 deletions b/‎backends/apple/coreml/recipes/coreml_recipe_provider.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎backends/apple/coreml/test/test_coreml_quantizer.py‎
Lines changed: 2 additions & 4 deletions b/‎backends/apple/coreml/test/test_coreml_quantizer.py‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎backends/apple/coreml/test/test_coreml_recipes.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/apple/coreml/test/test_coreml_recipes.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/apple/mps/test/test_mps_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎backends/apple/mps/test/test_mps_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/README.md‎
Lines changed: 13 additions & 8 deletions b/‎backends/arm/README.md‎
Lines changed: 13 additions & 8 deletions
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 2 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py‎
Lines changed: 122 additions & 0 deletions b/‎backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py‎
Lines changed: 122 additions & 0 deletions
diff --git a/‎backends/arm/_passes/insert_int64_input_cast_pass.py‎
Lines changed: 0 additions & 109 deletions b/‎backends/arm/_passes/insert_int64_input_cast_pass.py‎
Lines changed: 0 additions & 109 deletions
@@ -971,6 +971,13 @@ jobs:
         ./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear
         ./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
 
+        # "Classic" Operator tests
+        PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build
+        # TODO(ssjia): figure out how to run custom op tests in CI. Currently, they are
+        # failing due to the libstdc++.so.6 installed with conda not supporting
+        # GLIBCXX_3.4.30. These tests are still run in Meta internal CI.
+        # ./cmake-out/backends/vulkan/test/op_tests/vulkan_sdpa_test
+
         # Run e2e testing for selected operators. More operators will be tested via this
         # route in the future.
         python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*pt2e*"
 
@@ -69,6 +69,7 @@ def create_recipe(
                 recipe_type, activation_dtype=torch.float32, **kwargs
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL:
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type,
                 weight_dtype=torch.int4,
@@ -77,6 +78,7 @@ def create_recipe(
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP:
             group_size = kwargs.pop("group_size", 32)
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type,
                 weight_dtype=torch.int4,
@@ -85,11 +87,14 @@ def create_recipe(
                 **kwargs,
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL:
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS16, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type, weight_dtype=torch.int8, is_per_channel=True, **kwargs
             )
         elif recipe_type == CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP:
             group_size = kwargs.pop("group_size", 32)
+            # override minimum_deployment_target to ios18 for torchao (GH issue #13122)
+            self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
             return self._build_torchao_quantized_recipe(
                 recipe_type,
                 weight_dtype=torch.int8,
@@ -312,8 +317,6 @@ def _build_torchao_quantized_recipe(
             ao_quantization_configs=[config],
         )
 
-        # override minimum_deployment_target to ios18 for torchao (GH issue #13122)
-        self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
         lowering_recipe = self._get_coreml_lowering_recipe(**kwargs)
 
         return ExportRecipe(
 
@@ -15,7 +15,7 @@
 )
 
 from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
-from torch.export import export_for_training
+from torch.export import export
 from torchao.quantization.pt2e.quantize_pt2e import (
     convert_pt2e,
     prepare_pt2e,
@@ -32,9 +32,7 @@ def quantize_and_compare(
     ) -> None:
         assert quantization_type in {"PTQ", "QAT"}
 
-        pre_autograd_aten_dialect = export_for_training(
-            model, example_inputs, strict=True
-        ).module()
+        pre_autograd_aten_dialect = export(model, example_inputs, strict=True).module()
 
         quantization_config = LinearQuantizerConfig.from_dict(
             {
 
@@ -501,7 +501,7 @@ def test_minimum_deployment_target_validation(self):
             (CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}),
             (
                 CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL,
-                ct.target.iOS18,
+                ct.target.iOS16,
                 {},
             ),
             (CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}),
 
@@ -206,7 +206,7 @@ def lower_module_and_test_output(
 
         expected_output = model(*sample_inputs)
 
-        model = torch.export.export_for_training(
+        model = torch.export.export(
             model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True
         ).module()
 
 
@@ -206,14 +206,6 @@ The current TOSA version does not support int64. However, int64 is commonly used
 - For quantized models, these transformations will be automatically handled during annotation before the export stage.
 
 List of model specific and optional passes:
-- InsertCastForOpsWithInt64InputPass
-    - Functionality:
-        - For LLMs such as LLama, some opeartors like aten.embedding have int64 input. In order to lower these operators to TOSA, this pass will insert a casting node that converts the input from int64 to int32.
-    - Supported Ops:
-        - aten.embedding.default, aten.slice_copy.Tensor
-    - Example usage:
-        - backends/arm/test/models/test_llama.py
-
 - ConvertInt64ConstOpsToInt32Pass
     - Functionalities:
       - Rewrites constant-producing ops that output int64 to instead output int32, when values are within int32 bounds.
@@ -244,3 +236,16 @@ List of model specific and optional passes:
     - Example usage:
       - (Functionality 1) backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
       - (Functionality 2) backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+
+- InsertInt32CastsAfterInt64PlaceholdersPass
+    - Functionalities:
+      - Inserts an int64 -> int32 cast immediately after each int64 placeholder (graph input).
+      - Redirects all uses of each int64 placeholder to its int32 cast output.
+      - Inserts local int32 -> int64 casts at call sites where an operator requires int64 inputs, e.g. `torch.nn.functional.one_hot`.
+    - Pass ordering:
+      - When used with `ConvertInt64ConstOpsToInt32Pass` and `ConvertInt64OutputOpsToInt32Pass`, run this pass last.
+      - Rationale: Those passes may cause retracing to re-infer some int64 placeholders as int32. Running this pass last casts only inputs that remain int64, minimizing inserted casts.
+    - Example usage:
+      - backends/arm/test/models/test_llama.py
+      - backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py
+      - backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py
@@ -75,8 +75,8 @@
 from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass  # noqa
 from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass  # noqa
 from .fuse_quantized_activation_pass import FuseQuantizedActivationPass  # noqa
-from .insert_int64_input_cast_pass import (  # noqa  # noqa
-    InsertCastForOpsWithInt64InputPass,
+from .insert_int32_casts_after_int64_placeholders import (  # noqa
+    InsertInt32CastsAfterInt64PlaceholdersPass,
 )
 from .insert_rescales_pass import InsertRescalePass  # noqa
 from .insert_table_ops import InsertTableOpsPass  # noqa
 
@@ -76,7 +76,7 @@
     FuseConstantArgsPass,
     FuseEqualPlaceholdersPass,
     FuseQuantizedActivationPass,
-    InsertCastForOpsWithInt64InputPass,
+    InsertInt32CastsAfterInt64PlaceholdersPass,
     InsertRescalePass,
     InsertTableOpsPass,
     MatchArgDtypePass,
@@ -277,7 +277,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         )  # ConvertInt64ConstOpsToInt32Pass requires this pass to remove the assertation in Graph
         self.add_pass(ConvertInt64ConstOpsToInt32Pass())
         self.add_pass(ConvertInt64OutputOpsToInt32Pass())
-        self.add_pass(InsertCastForOpsWithInt64InputPass())
+        self.add_pass(InsertInt32CastsAfterInt64PlaceholdersPass())
         self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoundPass())
 
@@ -0,0 +1,122 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+
+import logging
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import EdgeOpOverload, ExportPass, PassResult
+from torch._subclasses.fake_tensor import FakeTensor
+
+
+logger = logging.getLogger(__name__)
+
+
+class InsertInt32CastsAfterInt64PlaceholdersPass(ExportPass):
+    """
+    Insert an int64->int32 cast after each int64 placeholder.
+
+    Note: Overflow checks are not applied in this pass. It is the user's responsibility to ensure that values fit within
+    the int32 range.
+    """
+
+    # Ops that require i64 inputs → positions of args to upcast.
+    # Key: op overload; Value: zero-based indices of positional args that must be i64.
+    I64_INPUT_ARG_POSITIONS = {
+        torch.ops.aten.one_hot.default: (0,),
+    }
+
+    def _insert_callsite_i32_to_i64_casts(self, graph_module: torch.fx.GraphModule):
+        """
+        If an operator requires int64 inputs but dtype propagation (via call_operator)
+        produced int32, insert a local int32→int64 cast at the call site to satisfy
+        PyTorch's operator input validation.
+        """
+        modified = False
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.op != "call_function":
+                continue
+            if node.target not in self.I64_INPUT_ARG_POSITIONS:
+                continue
+
+            with graph.inserting_before(node):
+                arg_positions = self.I64_INPUT_ARG_POSITIONS.get(node.target)
+                args_list = list(node.args)
+                for pos in arg_positions:  # type: ignore[union-attr]
+                    input_arg = args_list[pos]
+                    to_copy_op = self._get_decomposition(graph)
+                    cast_node = graph_module.graph.create_node(
+                        "call_function",
+                        to_copy_op,
+                        (input_arg,),
+                        {"dtype": torch.int64},
+                    )
+                    cast_node.meta["val"] = node.meta["val"].to(torch.int64)
+                    args_list[pos] = cast_node
+                node.args = tuple(args_list)
+                modified = True
+        return modified
+
+    def _graph_uses_edge_ops(self, graph: torch.fx.Graph) -> bool:
+        for n in graph.nodes:
+            if n.op == "call_function":
+                if isinstance(n.target, EdgeOpOverload):
+                    return True
+        return False
+
+    def _get_decomposition(self, graph: torch.fx.Graph):
+        if self._graph_uses_edge_ops(graph):
+            return exir_ops.edge.dim_order_ops._to_dim_order_copy.default
+        else:
+            return torch.ops.dim_order_ops._to_dim_order_copy.default
+
+    def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool:
+        return isinstance(node_val, FakeTensor) and node_val.dtype == dtype
+
+    def _insert_placeholder_i64_to_i32_casts(self, graph_module: torch.fx.GraphModule):
+        modified = False
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.op != "placeholder":
+                continue
+            node_val = node.meta["val"]
+            if not self._is_tensor_of_dtype(node_val, torch.int64):
+                continue
+
+            to_copy_op = self._get_decomposition(graph)
+            with graph.inserting_after(node):
+                cast_after = create_node(
+                    graph,
+                    to_copy_op,
+                    args=(node,),
+                    kwargs={
+                        "dtype": torch.int32,
+                    },
+                )
+                users = [user for user in node.users if user != cast_after]
+                for user in users:
+                    user.replace_input_with(node, cast_after)
+                logger.warning(
+                    f"Inserting a casting node {cast_after.name} after {node.name} to cast int64 placeholder"
+                    f" to int32 for {node.name} defined in {node.meta.get('stack_trace','[no stack trace found]')}"
+                )
+                modified = True
+        return modified
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        modified = False
+        modified |= self._insert_placeholder_i64_to_i32_casts(graph_module)
+        modified |= self._insert_callsite_i32_to_i64_casts(graph_module)
+
+        if modified:
+            graph_module.graph.eliminate_dead_code()
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, modified)
Original file line number	Diff line number	Diff line change
`@@ -501,7 +501,7 @@ def test_minimum_deployment_target_validation(self):`
`501`	`501`	`(CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}),`
`502`	`502`	`(`
`503`	`503`	`CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_CHANNEL,`
`504`		`- ct.target.iOS18,`
	`504`	`+ ct.target.iOS16,`
`505`	`505`	`{},`
`506`	`506`	`),`
`507`	`507`	`(CoreMLRecipeType.TORCHAO_INT8_WEIGHT_ONLY_PER_GROUP, ct.target.iOS18, {}),`