From fbfc2b5bba14f7ba68f06345e1915d13ea299302 Mon Sep 17 00:00:00 2001 From: Yufeng Shi Date: Mon, 1 Sep 2025 18:22:14 +0100 Subject: [PATCH] Arm backend: Add pass to handle int64 placeholders - Add InsertInt32CastsAfterInt64PlaceholdersPass to insert an int64->int32 cast node after each int64 placeholder. - Deprecate the use of InsertCastForOpsWithInt64InputPass and in favor of InsertInt32CastsAfterInt64PlaceholdersPass; replace its usage Change-Id: Ic092f57b56d1bab0e82205e5dd129c49e50862c0 Signed-off-by: Yufeng Shi --- backends/arm/README.md | 21 +-- backends/arm/_passes/__init__.py | 4 +- backends/arm/_passes/arm_pass_manager.py | 4 +- ...rt_int32_casts_after_int64_placeholders.py | 122 ++++++++++++++++++ .../_passes/insert_int64_input_cast_pass.py | 109 ---------------- .../test_CLIPTextModelWithProjection.py | 9 +- .../test_SD3Transformer2DModel.py | 18 ++- .../stable_diffusion/test_T5EncoderModel.py | 20 +-- backends/arm/test/models/test_llama.py | 6 +- backends/arm/test/ops/test_embedding.py | 6 +- ...32_casts_after_int64_placeholders_pass.py} | 4 +- 11 files changed, 173 insertions(+), 150 deletions(-) create mode 100644 backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py delete mode 100644 backends/arm/_passes/insert_int64_input_cast_pass.py rename backends/arm/test/passes/{test_insert_int64_to_int32_cast_pass.py => test_insert_int32_casts_after_int64_placeholders_pass.py} (89%) diff --git a/backends/arm/README.md b/backends/arm/README.md index 3151e461730..d64424436b4 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -206,14 +206,6 @@ The current TOSA version does not support int64. However, int64 is commonly used - For quantized models, these transformations will be automatically handled during annotation before the export stage. List of model specific and optional passes: -- InsertCastForOpsWithInt64InputPass - - Functionality: - - For LLMs such as LLama, some opeartors like aten.embedding have int64 input. 
In order to lower these operators to TOSA, this pass will insert a casting node that converts the input from int64 to int32. - - Supported Ops: - - aten.embedding.default, aten.slice_copy.Tensor - - Example usage: - - backends/arm/test/models/test_llama.py - - ConvertInt64ConstOpsToInt32Pass - Functionalities: - Rewrites constant-producing ops that output int64 to instead output int32, when values are within int32 bounds. @@ -244,3 +236,16 @@ List of model specific and optional passes: - Example usage: - (Functionality 1) backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py - (Functionality 2) backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py + +- InsertInt32CastsAfterInt64PlaceholdersPass + - Functionalities: + - Inserts an int64 -> int32 cast immediately after each int64 placeholder (graph input). + - Redirects all uses of each int64 placeholder to its int32 cast output. + - Inserts local int32 -> int64 casts at call sites where an operator requires int64 inputs, e.g. `torch.nn.functional.one_hot` + - Pass ordering: + - When used with `ConvertInt64ConstOpsToInt32Pass` and `ConvertInt64OutputOpsToInt32Pass`, run this pass last. + - Rationale: Those passes may cause retracing to re-infer some int64 placeholders as int32. Running this pass last casts only inputs that remain int64, minimizing inserted casts. 
+ - Example usage: + - backends/arm/test/models/test_llama.py + - backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py + - backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index f033e0d5322..f9e23f73cc5 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -75,8 +75,8 @@ from .fuse_constant_ops_pass import ComputeConstantOpsAOT, FuseConstantArgsPass # noqa from .fuse_equal_placeholders_pass import FuseEqualPlaceholdersPass # noqa from .fuse_quantized_activation_pass import FuseQuantizedActivationPass # noqa -from .insert_int64_input_cast_pass import ( # noqa # noqa - InsertCastForOpsWithInt64InputPass, +from .insert_int32_casts_after_int64_placeholders import ( # noqa + InsertInt32CastsAfterInt64PlaceholdersPass, ) from .insert_rescales_pass import InsertRescalePass # noqa from .insert_table_ops import InsertTableOpsPass # noqa diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index c26cd8fb078..81e73e93842 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -76,7 +76,7 @@ FuseConstantArgsPass, FuseEqualPlaceholdersPass, FuseQuantizedActivationPass, - InsertCastForOpsWithInt64InputPass, + InsertInt32CastsAfterInt64PlaceholdersPass, InsertRescalePass, InsertTableOpsPass, MatchArgDtypePass, @@ -277,7 +277,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): ) # ConvertInt64ConstOpsToInt32Pass requires this pass to remove the assertation in Graph self.add_pass(ConvertInt64ConstOpsToInt32Pass()) self.add_pass(ConvertInt64OutputOpsToInt32Pass()) - self.add_pass(InsertCastForOpsWithInt64InputPass()) + self.add_pass(InsertInt32CastsAfterInt64PlaceholdersPass()) self.add_pass(DecomposeEmbeddingPass()) self.add_pass(DecomposeScaledDotProductAttention()) self.add_pass(DecomposeRoundPass()) diff --git 
a/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py b/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py
new file mode 100644
index 00000000000..4b619af790c
--- /dev/null
+++ b/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py
@@ -0,0 +1,156 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+
+import logging
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import EdgeOpOverload, ExportPass, PassResult
+from torch._subclasses.fake_tensor import FakeTensor
+
+
+logger = logging.getLogger(__name__)
+
+
+class InsertInt32CastsAfterInt64PlaceholdersPass(ExportPass):
+    """
+    Insert an int64->int32 cast after each int64 placeholder.
+
+    All existing users of an int64 placeholder are redirected to the new int32
+    cast. Operators listed in I64_INPUT_ARG_POSITIONS additionally get a local
+    int32->int64 cast on the listed arguments.
+
+    Note: Overflow checks are not applied in this pass. It is the user's responsibility to ensure that values fit within
+    the int32 range.
+    """
+
+    # Ops that require i64 inputs → positions of args to upcast.
+    # Key: op overload; Value: zero-based indices of positional args that must be i64.
+    I64_INPUT_ARG_POSITIONS = {
+        torch.ops.aten.one_hot.default: (0,),
+    }
+
+    def _insert_callsite_i32_to_i64_casts(self, graph_module: torch.fx.GraphModule):
+        """
+        If an operator requires int64 inputs but dtype propagation (via call_operator)
+        produced int32, insert a local int32→int64 cast at the call site to satisfy
+        PyTorch's operator input validation.
+
+        Returns True if at least one cast was inserted.
+        """
+        graph = graph_module.graph
+        # Snapshot the call sites first: the graph is mutated while we walk them.
+        call_sites = [
+            node
+            for node in graph.nodes
+            if node.op == "call_function"
+            and node.target in self.I64_INPUT_ARG_POSITIONS
+        ]
+        if not call_sites:
+            return False
+
+        # Loop-invariant: depends on the dialect of the graph, not on the node.
+        to_copy_op = self._get_decomposition(graph)
+        for node in call_sites:
+            args_list = list(node.args)
+            with graph.inserting_before(node):
+                for pos in self.I64_INPUT_ARG_POSITIONS[node.target]:
+                    input_arg = args_list[pos]
+                    cast_node = graph.create_node(
+                        "call_function",
+                        to_copy_op,
+                        (input_arg,),
+                        {"dtype": torch.int64},
+                    )
+                    # A cast has the shape of its input, not of the consumer's
+                    # output. Fall back to the consumer's value only when the
+                    # input carries no tensor metadata.
+                    src_val = (
+                        input_arg.meta.get("val")
+                        if isinstance(input_arg, torch.fx.Node)
+                        else None
+                    )
+                    if not isinstance(src_val, torch.Tensor):
+                        src_val = node.meta["val"]
+                    cast_node.meta["val"] = src_val.to(torch.int64)
+                    args_list[pos] = cast_node
+            node.args = tuple(args_list)
+        return True
+
+    def _graph_uses_edge_ops(self, graph: torch.fx.Graph) -> bool:
+        """Return True if any call_function node targets an EdgeOpOverload."""
+        return any(
+            n.op == "call_function" and isinstance(n.target, EdgeOpOverload)
+            for n in graph.nodes
+        )
+
+    def _get_decomposition(self, graph: torch.fx.Graph):
+        """Pick the _to_dim_order_copy overload matching the graph's dialect."""
+        if self._graph_uses_edge_ops(graph):
+            return exir_ops.edge.dim_order_ops._to_dim_order_copy.default
+        return torch.ops.dim_order_ops._to_dim_order_copy.default
+
+    def _is_tensor_of_dtype(self, node_val, dtype: torch.dtype) -> bool:
+        """Return True if node_val is a FakeTensor of the given dtype."""
+        return isinstance(node_val, FakeTensor) and node_val.dtype == dtype
+
+    def _insert_placeholder_i64_to_i32_casts(self, graph_module: torch.fx.GraphModule):
+        """
+        Insert an int64->int32 cast right after each int64 placeholder and
+        redirect the placeholder's users to it.
+
+        Returns True if at least one cast was inserted.
+        """
+        graph = graph_module.graph
+        # Placeholders without a "val" entry are skipped instead of raising KeyError.
+        int64_placeholders = [
+            node
+            for node in graph.nodes
+            if node.op == "placeholder"
+            and self._is_tensor_of_dtype(node.meta.get("val"), torch.int64)
+        ]
+        if not int64_placeholders:
+            return False
+
+        # Loop-invariant: depends on the dialect of the graph, not on the node.
+        to_copy_op = self._get_decomposition(graph)
+        for node in int64_placeholders:
+            with graph.inserting_after(node):
+                cast_after = create_node(
+                    graph,
+                    to_copy_op,
+                    args=(node,),
+                    kwargs={
+                        "dtype": torch.int32,
+                    },
+                )
+            # The cast itself must keep reading from the placeholder.
+            users = [user for user in node.users if user != cast_after]
+            for user in users:
+                user.replace_input_with(node, cast_after)
+            logger.warning(
+                f"Inserting a casting node {cast_after.name} after {node.name} to cast int64 placeholder"
+                f" to int32 for {node.name} defined in {node.meta.get('stack_trace','[no stack trace found]')}"
+            )
+        return True
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        """
+        Run both rewrites (placeholder casts first, then call-site casts) and,
+        if the graph changed, clean it up and retrace through ExportPass.call.
+        """
+        modified = False
+        modified |= self._insert_placeholder_i64_to_i32_casts(graph_module)
+        modified |= self._insert_callsite_i32_to_i64_casts(graph_module)
+
+        if modified:
+            graph_module.graph.eliminate_dead_code()
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, modified)
diff --git a/backends/arm/_passes/insert_int64_input_cast_pass.py b/backends/arm/_passes/insert_int64_input_cast_pass.py
deleted file mode 100644
index 9577c920c1c..00000000000
--- a/backends/arm/_passes/insert_int64_input_cast_pass.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright 2025 Arm Limited and/or its affiliates.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
- -# pyre-unsafe - - -import logging - -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult - -from .arm_pass_utils import create_node, get_first_fake_tensor - -logger = logging.getLogger(__name__) -logger.setLevel(logging.WARNING) - - -class InsertCastForOpsWithInt64InputPass(ExportPass): - - aten_ops = ( - torch.ops.aten.embedding.default, - torch.ops.aten.slice_copy.Tensor, - ) - edge_ops = ( - exir_ops.edge.aten.embedding.default, - exir_ops.edge.aten.slice_copy.Tensor, - ) - - def get_decomposition(self, op): - if op in self.edge_ops: - return exir_ops.edge.dim_order_ops._to_dim_order_copy.default - - if op in self.aten_ops: - return torch.ops.dim_order_ops._to_dim_order_copy.default - - raise RuntimeError( - f"[{self.__class__.__name__}] Can't get decomposition for op {op}" - ) - - def _check_aten_embedding_within_int32(self, weights, indices, node: torch.fx.Node): - weights_shape = get_first_fake_tensor(weights).shape - vocab_size = weights_shape[0] - - # Essentially output = weight[indices] which means 0 <= indices[i] < vocab_size - # So should be good if vocab size or number embeddings is below max int32 - if vocab_size >= torch.iinfo(torch.int32).max: - logger.warning( - f"[{node.name}] has size ({vocab_size}) that exceeds int32 limit," - "so aten.embedding will not be lowered to TOSA." 
- ) - return False - - return True - - def _insert_int32_cast_before_node(self, graph, node, original_input): - to_dim_order_copy_op = self.get_decomposition(node.target) - with graph.inserting_before(node): - cast_before = create_node( - graph, - to_dim_order_copy_op, - args=(original_input,), - kwargs={ - "dtype": torch.int32, - }, - ) - node.replace_input_with(original_input, cast_before) - - def call(self, graph_module): - graph = graph_module.graph - modified_graph = False - - for node in list(graph.nodes): - if node.op != "call_function": - continue - if node.target not in self.aten_ops + self.edge_ops: - continue - - args = node.args - - if node.target in ( - exir_ops.edge.aten.embedding.default, - torch.ops.aten.embedding.default, - ): - weights = args[0] - indices = args[1] - if self._check_aten_embedding_within_int32(weights, indices, node): - self._insert_int32_cast_before_node(graph, node, indices) - modified_graph = True - - elif node.target in ( - exir_ops.edge.aten.slice_copy.Tensor, - torch.ops.aten.slice_copy.Tensor, - ): - # MLETORCH-829: Add range check for slice_copy - input_tensor = args[0] - fake_tensor = input_tensor.meta["val"] - if fake_tensor.dtype != torch.int64: - continue - - self._insert_int32_cast_before_node(graph, node, input_tensor) - modified_graph = True - - if modified_graph: - graph_module.recompile() - graph_module = super().call(graph_module).graph_module - return PassResult(graph_module, modified_graph) diff --git a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py index aa0f194590c..0e99f3f5bfa 100644 --- a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py +++ b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py @@ -11,7 +11,7 @@ from executorch.backends.arm._passes import ( ConvertInt64ConstOpsToInt32Pass, ConvertInt64OutputOpsToInt32Pass, - 
InsertCastForOpsWithInt64InputPass, + InsertInt32CastsAfterInt64PlaceholdersPass, ) from executorch.backends.arm.test import common @@ -33,10 +33,9 @@ class TestCLIPTextModelWithProjection(unittest.TestCase): # for that is some assert ops are removed by passes in the # .to_executorch step, i.e. after Arm partitioner. ops_after_partitioner = { - "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 3, - "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_argmax_default": 1, - "torch.ops.higher_order.executorch_call_delegate": 1, + "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2, + "torch.ops.higher_order.executorch_call_delegate": 2, } def _prepare_inputs( @@ -71,9 +70,9 @@ def test_CLIPTextModelWithProjection_tosa_FP(self): example_inputs=text_encoder_model_inputs, compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), transform_passes=[ - InsertCastForOpsWithInt64InputPass(), ConvertInt64ConstOpsToInt32Pass(), ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), ], ) .export() diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py index 880dc17166d..f9d814d044b 100644 --- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py +++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py @@ -22,11 +22,8 @@ class TestSD3Transformer2DModel(unittest.TestCase): SD3Transformer2DModel is the transformer model used by Stable Diffusion 3.5 Medium """ - # Adjust nbr below as we increase op support. Note: most of the delegates - # calls are directly consecutive to each other in the .pte. The reason - # for that is some assert ops are removed by passes in the - # .to_executorch step, i.e. after Arm partitioner. - ops_after_partitioner = { + # Adjust nbr below as we increase op support. 
+ ops_after_partitioner_FP = { "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, @@ -34,6 +31,13 @@ class TestSD3Transformer2DModel(unittest.TestCase): "torch.ops.higher_order.executorch_call_delegate": 1, } + ops_after_partitioner_INT = { + "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, + "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2, + "torch.ops.higher_order.executorch_call_delegate": 2, + } + def _prepare_inputs( self, batch_size=2, @@ -102,7 +106,7 @@ def test_SD3Transformer2DModel_tosa_FP(self): ) .export() .to_edge_transform_and_lower() - .check_count(self.ops_after_partitioner) + .check_count(self.ops_after_partitioner_FP) .to_executorch() .run_method_and_compare_outputs( inputs=sd35_transformer2D_model_inputs, @@ -125,7 +129,7 @@ def test_SD3Transformer2DModel_tosa_INT(self): .quantize() .export() .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .check_count(self.ops_after_partitioner_INT) .to_executorch() .run_method_and_compare_outputs( inputs=sd35_transformer2D_model_inputs, diff --git a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py index 7c1a45f27cb..22a47042eb1 100644 --- a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py +++ b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py @@ -10,7 +10,7 @@ from executorch.backends.arm._passes import ( ConvertInt64ConstOpsToInt32Pass, ConvertInt64OutputOpsToInt32Pass, - InsertCastForOpsWithInt64InputPass, + InsertInt32CastsAfterInt64PlaceholdersPass, ) from executorch.backends.arm.test import common @@ -27,16 +27,17 @@ class TestT5EncoderModel(unittest.TestCase): T5EncoderModel 
is one of the text_encoder used by Stable Diffusion 3.5 Medium """ - # Adjust nbr below as we increase op support. Note: most of the delegates - # calls are directly consecutive to each other in the .pte. The reason - # for that is some assert ops are removed by passes in the - # .to_executorch step, i.e. after Arm partitioner. - ops_after_partitioner = { + # Adjust nbr below as we increase op support. + ops_after_partitioner_FP = { "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2, - "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, "torch.ops.higher_order.executorch_call_delegate": 2, } + ops_after_partitioner_INT = { + "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 3, + "torch.ops.higher_order.executorch_call_delegate": 3, + } + def _prepare_inputs( self, batch_size=12, @@ -69,15 +70,15 @@ def test_T5EncoderModel_tosa_FP(self): example_inputs=t5_encoder_model_inputs, compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), transform_passes=[ - InsertCastForOpsWithInt64InputPass(), ConvertInt64ConstOpsToInt32Pass(), ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), ], ) .export() .to_edge_transform_and_lower() .dump_operator_distribution() - .check_count(self.ops_after_partitioner) + .check_count(self.ops_after_partitioner_FP) .to_executorch() .run_method_and_compare_outputs( inputs=t5_encoder_model_inputs, @@ -97,6 +98,7 @@ def test_T5EncoderModel_tosa_INT(self): .export() .to_edge_transform_and_lower() .dump_operator_distribution() + .check_count(self.ops_after_partitioner_INT) .to_executorch() .run_method_and_compare_outputs( inputs=t5_encoder_model_inputs, diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py index 7732943d5fb..d47398be3b0 100644 --- a/backends/arm/test/models/test_llama.py +++ b/backends/arm/test/models/test_llama.py @@ -15,7 +15,7 @@ import pytest import torch -from 
executorch.backends.arm._passes import InsertCastForOpsWithInt64InputPass +from executorch.backends.arm._passes import InsertInt32CastsAfterInt64PlaceholdersPass from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.test_pipeline import ( @@ -112,7 +112,7 @@ def test_llama_tosa_FP(): aten_op=[], exir_op=[], use_to_edge_transform_and_lower=True, - transform_passes=[InsertCastForOpsWithInt64InputPass()], + transform_passes=[InsertInt32CastsAfterInt64PlaceholdersPass()], ) pipeline.run() @@ -149,6 +149,7 @@ def test_llama_vgf_FP(): exir_op=[], tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, + transform_passes=[InsertInt32CastsAfterInt64PlaceholdersPass()], ) pipeline.run() @@ -168,6 +169,5 @@ def test_llama_vgf_INT(): exir_op=[], tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, - transform_passes=[InsertCastForOpsWithInt64InputPass()], ) pipeline.run() diff --git a/backends/arm/test/ops/test_embedding.py b/backends/arm/test/ops/test_embedding.py index cb3983bd364..901fbbc0916 100644 --- a/backends/arm/test/ops/test_embedding.py +++ b/backends/arm/test/ops/test_embedding.py @@ -8,7 +8,7 @@ import pytest import torch -from executorch.backends.arm._passes import InsertCastForOpsWithInt64InputPass +from executorch.backends.arm._passes import InsertInt32CastsAfterInt64PlaceholdersPass from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -68,7 +68,7 @@ def test_embedding_tosa_FP(test_input: input_params): op.aten_op, op.exir_op, use_to_edge_transform_and_lower=True, - transform_passes=[InsertCastForOpsWithInt64InputPass()], + transform_passes=[InsertInt32CastsAfterInt64PlaceholdersPass()], ) pipeline.run() @@ -101,7 +101,7 @@ def test_embedding_vgf_FP(test_input: input_params): op.exir_op, tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, - transform_passes=[InsertCastForOpsWithInt64InputPass()], + 
transform_passes=[InsertInt32CastsAfterInt64PlaceholdersPass()], ) pipeline.run() diff --git a/backends/arm/test/passes/test_insert_int64_to_int32_cast_pass.py b/backends/arm/test/passes/test_insert_int32_casts_after_int64_placeholders_pass.py similarity index 89% rename from backends/arm/test/passes/test_insert_int64_to_int32_cast_pass.py rename to backends/arm/test/passes/test_insert_int32_casts_after_int64_placeholders_pass.py index 6125e9b01cc..efc1bebb610 100644 --- a/backends/arm/test/passes/test_insert_int64_to_int32_cast_pass.py +++ b/backends/arm/test/passes/test_insert_int32_casts_after_int64_placeholders_pass.py @@ -6,7 +6,7 @@ from typing import Tuple import torch -from executorch.backends.arm._passes import InsertCastForOpsWithInt64InputPass +from executorch.backends.arm._passes import InsertInt32CastsAfterInt64PlaceholdersPass from executorch.backends.arm.test.tester.test_pipeline import PassPipeline @@ -40,7 +40,7 @@ def test_int64_model_tosa_FP(): module.get_inputs(), ops_before_pass=op_checks_before, ops_after_pass=op_checks_after, - pass_list=[InsertCastForOpsWithInt64InputPass], + pass_list=[InsertInt32CastsAfterInt64PlaceholdersPass], ) pipeline.pop_stage(-1) # Do not compare output pipeline.run()