pytorch
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 8 additions & 0 deletions
diff --git a/backends/arm/_passes/decompose_masked_fill.py b/backends/arm/_passes/decompose_masked_fill.py
Lines changed: 52 additions & 0 deletions
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
Lines changed: 0 additions & 1 deletion
diff --git a/backends/arm/test/ops/test_masked_fill.py b/backends/arm/test/ops/test_masked_fill.py
Lines changed: 144 additions & 0 deletions
diff --git a/backends/arm/test/ops/test_multihead_attention.py b/backends/arm/test/ops/test_multihead_attention.py
Lines changed: 37 additions & 0 deletions
diff --git a/backends/arm/test/tester/test_pipeline.py b/backends/arm/test/tester/test_pipeline.py
Lines changed: 2 additions & 10 deletions
diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
Lines changed: 4 additions & 4 deletions
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py
Lines changed: 6 additions & 1 deletion
@@ -40,6 +40,7 @@
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
 from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa
+from .decompose_masked_fill import DecomposeMaskedFill  # noqa
 from .decompose_maxpool2d_with_dilation import DecomposeMaxPool2DPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa
 
@@ -45,6 +45,7 @@
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
     DecomposeLinearVectorNormPass,
+    DecomposeMaskedFill,
     DecomposeMaxPool2DPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
@@ -113,6 +114,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(
             DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
         )
+
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
@@ -146,6 +148,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(SizeAdjustInputPass())
         self.add_pass(DecomposeSelectPass())
+
         self.add_pass(ConvertSqueezesToViewPass())
 
         self.add_pass(FuseViewCopyTransform())
@@ -160,6 +163,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         return self._transform(exported_program.graph_module)
 
     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
+        self.add_pass(DecomposeMaskedFill())
         self.add_pass(DecomposeRoundPass())
         self.add_pass(DecomposeAcoshPass())
         self.add_pass(DecomposeAsinPass())
@@ -285,4 +289,8 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(ReplaceInfValues())
         self.add_pass(DecomposeSumPass())
 
+        if not self.tosa_spec.is_U55_subset:
+            # Uses where which is not supported on Ethos-U55
+            self.add_pass(DecomposeMaskedFill())
+
         return self._transform(graph_module)
@@ -0,0 +1,52 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+
+import torch
+
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+edge_ops = (exir_ops.edge.aten.masked_fill.Scalar,)
+aten_ops = (torch.ops.aten.masked_fill.Scalar,)
+
+
+def _get_decomposition(op) -> tuple:
+    """
+    Return the (where, full_like) operator pair from the same dialect as op.
+    Raises RuntimeError if op is in neither edge_ops nor aten_ops.
+    """
+    if op in edge_ops:
+        edge = exir_ops.edge.aten
+        return (edge.where.self, edge.full_like.default)
+    if op not in aten_ops:
+        raise RuntimeError(f"Unable to get decomposition for op {op}")
+    aten = torch.ops.aten
+    return (aten.where.self, aten.full_like.default)
+
+
+class DecomposeMaskedFill(ArmPass):
+    """
+    Rewrite masked_fill.Scalar(x, mask, value) as
+    where(mask, full_like(x, value), x): elements selected by the boolean
+    mask take the scalar value, all other elements keep their value from x.
+    """
+
+    def call_operator(self, op, args, kwargs, meta, updated=False):
+        if op in edge_ops or op in aten_ops:
+            tensor, mask, fill_value = args
+            where_op, full_like_op = _get_decomposition(op)
+
+            # Turn the scalar into a tensor via full_like on the input tensor.
+            fill_tensor = super().call_operator(
+                full_like_op, (tensor, fill_value), {}, meta, True
+            )
+            where_args = (mask, fill_tensor, tensor)
+            return super().call_operator(where_op, where_args, kwargs, meta, True)
+
+        return super().call_operator(op, args, kwargs, meta, updated)
@@ -254,6 +254,7 @@ def is_node_supported(
             exir_ops.edge.aten.asin.default,
             exir_ops.edge.aten.atanh.default,
             exir_ops.edge.aten.addmm.default,
+            exir_ops.edge.aten.masked_fill.Scalar,
         ]
 
         return supported
 
@@ -500,7 +500,6 @@ def any_or_hardtanh_min_zero(n: Node):
     elif node.target in [operator.getitem]:
         if not is_output_annotated(node.args[0]):  # type: ignore[attr-defined, arg-type]
             return None
-
         shared_qspec = SharedQuantizationSpec(node.args[0])  # type: ignore[arg-type]
         quant_properties.quant_inputs = [_QuantProperty(0, shared_qspec)]  # type: ignore[arg-type]
         quant_properties.quant_output = _QuantProperty(0, shared_qspec)  # type: ignore[arg-type]
 
@@ -0,0 +1,144 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Tuple
+
+import torch
+
+from executorch.backends.arm.test import common
+
+from executorch.backends.arm.test.tester.test_pipeline import (
+    EthosU85PipelineBI,
+    OpNotSupportedPipeline,
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+
+aten_op = "torch.ops.aten.masked_fill.Scalar"  # torch.ops.<namespace>.<op>.<overload>
+exir_op = "executorch_exir_dialects_edge__ops_aten_masked_fill_Scalar"
+
+input_t = Tuple[torch.Tensor, torch.Tensor, float]
+
+
+class MaskedFill(torch.nn.Module):  # module under test: one torch.masked_fill call
+    def forward(
+        self, x: torch.Tensor, mask: torch.Tensor, value: float
+    ) -> torch.Tensor:
+        return torch.masked_fill(x, mask, value)  # value is a Python scalar
+
+
+test_modules = {  # case name -> factory returning (module, (x, mask, value))
+    "masked_fill_1": lambda: (
+        MaskedFill(),
+        (
+            torch.rand(1, 3, 4, 5),
+            (torch.rand(1, 3, 4, 5) < 0.5),  # boolean mask
+            -1.0,
+        ),
+    ),
+    "masked_fill_2": lambda: (
+        MaskedFill(),
+        (
+            torch.rand(1, 10, 10, 10),
+            (torch.rand(1, 10, 10, 10) > 0.75),
+            3.14,
+        ),
+    ),
+    "masked_fill_3_zero_fill": lambda: (
+        MaskedFill(),
+        (
+            torch.rand(1, 3, 4, 5),
+            torch.rand(1, 3, 4, 5) < 0.2,
+            0.0,
+        ),
+    ),
+    "masked_fill_4_full_mask": lambda: (
+        MaskedFill(),
+        (
+            torch.rand(1, 3, 4, 5),
+            torch.ones(1, 3, 4, 5, dtype=torch.bool),  # every element is masked
+            7.0,
+        ),
+    ),
+    "masked_fill_5_no_mask": lambda: (
+        MaskedFill(),
+        (
+            torch.rand(1, 3, 4, 5),
+            torch.zeros(1, 3, 4, 5, dtype=torch.bool),  # no element is masked
+            -3.0,
+        ),
+    ),
+    "masked_fill_6_scalar_broadcast": lambda: (  # NOTE(review): x and mask share shape
+        MaskedFill(),
+        (
+            torch.rand(1, 1, 1, 1),
+            torch.tensor([[[[True]]]]),
+            42.0,
+        ),
+    ),
+    "masked_fill_7_large_tensor": lambda: (  # NOTE(review): smaller than masked_fill_2
+        MaskedFill(),
+        (
+            torch.rand(1, 8, 8, 8),
+            torch.rand(1, 8, 8, 8) > 0.5,
+            -127.0,
+        ),
+    ),
+    "masked_fill_8_extreme_scalar_inf": lambda: (
+        MaskedFill(),
+        (
+            torch.rand(1, 3, 7, 5),
+            torch.rand(1, 3, 7, 5) > 0.5,
+            float("inf"),  # non-finite fill value
+        ),
+    ),
+}
+
+
+@common.parametrize("test_module", test_modules)
+def test_masked_fill_scalar_tosa_MI(test_module):
+    """Build TosaPipelineMI for each masked_fill case and run it."""
+    model, example_inputs = test_module()
+    TosaPipelineMI[input_t](model, example_inputs, aten_op=[]).run()
+
+
+@common.parametrize("test_module", test_modules)
+def test_masked_fill_scalar_tosa_BI(test_module):
+    """
+    Build TosaPipelineBI for each masked_fill case and run it.
+    No ATen op names are passed to the pipeline (aten_op is empty).
+    """
+    model, example_inputs = test_module()
+    bi_pipeline = TosaPipelineBI[input_t](model, example_inputs, aten_op=[])
+    bi_pipeline.run()
+
+
+@common.parametrize("test_module", test_modules)
+@common.XfailIfNoCorstone300
+def test_masked_fill_scalar_u55_BI(test_module):
+    """Run every case through OpNotSupportedPipeline with u55_subset=True."""
+    model, example_inputs = test_module()
+    # Op-name -> count mapping given to the pipeline; presumably the counts
+    # expected to stay outside any delegate (confirm in OpNotSupportedPipeline).
+    op_counts = {exir_op: 0, "executorch_exir_dialects_edge__ops_aten_where_self": 1}
+    options = dict(n_expected_delegates=0, quantize=True, u55_subset=True)
+    pipeline = OpNotSupportedPipeline[input_t](
+        model, example_inputs, op_counts, **options
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_module", test_modules)
+@common.XfailIfNoCorstone320
+def test_masked_fill_scalar_u85_BI(test_module):
+    """
+    Run every masked_fill case through EthosU85PipelineBI, passing no ATen
+    op names and exir_op as the only edge op name.
+    """
+    model, example_inputs = test_module()
+    EthosU85PipelineBI[input_t](
+        model, example_inputs, aten_ops=[], exir_ops=exir_op
+    ).run()
@@ -11,6 +11,7 @@
     EthosU85PipelineBI,
     TosaPipelineBI,
     TosaPipelineMI,
+    VgfPipeline,
 )
 
 
@@ -105,3 +106,39 @@ def test_multihead_attention_u85_BI(test_data: input_t1):
         per_channel_quantization=False,
     )
     pipeline.run()
+
+
+@common.parametrize("test_data", test_suite)
+@common.SkipIfNoModelConverter
+def test_multihead_attention_vgf_FP(test_data: input_t1):
+    """
+    Run each test_suite case through VgfPipeline using tosa_version
+    "TOSA-1.0+FP".
+    """
+    values, model = test_data()
+    # The case's values are unpacked three times to build the input tuple
+    # (presumably query, key and value - confirm against the module).
+    repeated_inputs = (*values, *values, *values)
+    pipeline = VgfPipeline[input_t1](
+        model, repeated_inputs, [], [], tosa_version="TOSA-1.0+FP"
+    )
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_suite)
+@common.SkipIfNoModelConverter
+def test_multihead_attention_vgf_INT(test_data: input_t1):
+    """Run each test_suite case through VgfPipeline with TOSA-1.0+INT."""
+    values, model = test_data()
+    # The case's values are unpacked three times to build the input tuple.
+    repeated_inputs = (*values, *values, *values)
+    pipeline = VgfPipeline[input_t1](
+        model,
+        repeated_inputs,
+        [],
+        [],
+        tosa_version="TOSA-1.0+INT",
+        # TODO: Per-channel quantization is broken (MLETORCH-1144)
+        per_channel_quantization=False,
+    )
+    pipeline.run()
@@ -854,7 +854,7 @@ def __init__(
         vgf_compiler_flags: Optional[str] = "",
         tosa_version: str = "TOSA-1.0+FP",
         symmetric_io_quantization: bool = False,
-        per_channel_quantization: bool = False,
+        per_channel_quantization: bool = True,
         use_to_edge_transform_and_lower: bool = True,
         custom_path: str = None,
         atol: float = 1e-03,
@@ -866,11 +866,6 @@ def __init__(
         ] = None,
     ):
 
-        if (
-            symmetric_io_quantization or per_channel_quantization
-        ) and tosa_version == "TOSA-1.0+FP":
-            raise ValueError("Dont configure quantization with FP TOSA profile.")
-
         tosa_profile = TosaSpecification.create_from_string(tosa_version)
         compile_spec = common.get_vgf_compile_spec(
             tosa_profile, compiler_flags=vgf_compiler_flags, custom_path=custom_path
@@ -887,18 +882,15 @@ def __init__(
             transform_passes=transform_passes,
         )
 
-        if symmetric_io_quantization or per_channel_quantization:
+        if "INT" in tosa_version:
             quantizer = VgfQuantizer(compile_spec)
             quantization_config = get_symmetric_quantization_config(
                 is_per_channel=per_channel_quantization
             )
             if symmetric_io_quantization:
                 quantizer.set_io(quantization_config)
             quant_stage = Quantize(quantizer, quantization_config)
-        else:
-            quant_stage = None
 
-        if "INT" in tosa_version:
             self.add_stage(self.tester.quantize, quant_stage, pos=0)
 
             self.add_stage_after(
 
@@ -59,7 +59,7 @@ def trace(
     dump_graphs: bool = False,
 ) -> ExportedProgram:
     """
-    Trace the model with export_for_training and return an ExportedProgram.
+    Trace the model with export and return an ExportedProgram.
     """
 
     # Make the model inference mode by calling model.eval()
@@ -83,9 +83,9 @@ def trace(
     remove_decompositions(decomp_table, ops_to_keep)
 
     # Export with dynamo
-    program = torch.export.export_for_training(
-        model, inputs, strict=True
-    ).run_decompositions(decomp_table)
+    program = torch.export.export(model, inputs, strict=True).run_decompositions(
+        decomp_table
+    )
 
     if dump_graphs:
         logging.info("Graph before quantization:")
 
@@ -23,15 +23,20 @@
 
 # noinspection PyProtectedMember
 functions_converters = {
+    exir_ops.edge.aten.abs.default: AbsConverter,  # noqa F405
+    exir_ops.edge.aten._adaptive_avg_pool2d.default: AdaptiveAvgPool2dConverter,  # noqa F405
     exir_ops.edge.aten.addmm.default: AddMMConverter,  # noqa F405
+    exir_ops.edge.aten.add.Tensor: AddTensorConverter,  # noqa F405
     exir_ops.edge.aten.avg_pool2d.default: AvgPool2dConverter,  # noqa F405
+    exir_ops.edge.aten.clone.default: CloneConverter,  # noqa F405
     exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter,  # noqa F405
     exir_ops.edge.aten.convolution.default: ConvolutionConverter,  # noqa F405
+    exir_ops.edge.aten.hardtanh.default: HardTanhConverter,  # noqa F405
     exir_ops.edge.aten.max_pool2d.default: MaxPool2dConverter,  # noqa F405
+    exir_ops.edge.aten.mean.dim: MeanDimConverter,  # noqa F405
     exir_ops.edge.aten.mm.default: MMConverter,  # noqa F405
     exir_ops.edge.aten.permute_copy.default: PermuteCopyConverter,  # noqa F405
     exir_ops.edge.aten.relu.default: ReLUConverter,  # noqa F405
-    exir_ops.edge.aten.hardtanh.default: HardTanhConverter,  # noqa F405
     exir_ops.edge.aten._softmax.default: SoftmaxConverter,  # noqa F405
     exir_ops.edge.aten.view_copy.default: ViewCopyConverter,  # noqa F405
 }
Original file line number	Diff line number	Diff line change
`@@ -254,6 +254,7 @@ def is_node_supported(`
`254`	`254`	`exir_ops.edge.aten.asin.default,`
`255`	`255`	`exir_ops.edge.aten.atanh.default,`
`256`	`256`	`exir_ops.edge.aten.addmm.default,`
	`257`	`+ exir_ops.edge.aten.masked_fill.Scalar,`
`257`	`258`	`]`
`258`	`259`
`259`	`260`	`return supported`