
Commit 3a77f70

Update on "[devtool] introduce datasink class to etdump"
This diff introduces the DataSink class, which manages the customized debug data storage pipeline. Details can be found in https://docs.google.com/document/d/1y_m32mKdj-OgLcLUz9TKhBW3PC3bBDYSBbeAH544EfM/edit?tab=t.0

Differential Revision: [D69583422](https://our.internmc.facebook.com/intern/diff/D69583422/)

[ghstack-poisoned]
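The DataSink code itself is not among the files listed below (this is a merge commit, and the diffs come from its two parents), so here is a minimal Python sketch of the kind of pipeline the description suggests: a base class with a single write path, and concrete sinks that choose the storage medium. Every name in this sketch is hypothetical and not taken from the commit.

    from abc import ABC, abstractmethod


    class DataSink(ABC):
        """Hypothetical base class for debug-data sinks: callers hand over
        opaque blobs; the sink decides where and how they are stored."""

        @abstractmethod
        def write(self, blob: bytes) -> int:
            """Store one blob and return a handle (here: a byte offset)."""


    class BufferDataSink(DataSink):
        """Illustrative in-memory sink that appends each blob to one buffer."""

        def __init__(self) -> None:
            self._buffer = bytearray()

        def write(self, blob: bytes) -> int:
            offset = len(self._buffer)  # handle = where this blob starts
            self._buffer.extend(blob)
            return offset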
2 parents 5af97e3 + da8c6b1 commit 3a77f70

File tree: 23 files changed, +709 −50 lines

backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 15 additions & 1 deletion

@@ -3,7 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.

 import logging
-from typing import List, Optional
+from typing import Callable, List, Optional, Tuple

 import coremltools as ct

@@ -104,3 +104,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
+
+    def ops_to_not_decompose(
+        self, ep: ExportedProgram
+    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        do_not_decompose = []
+        op_support = OperatorsSupportedForCoreMLBackend()
+        for node in ep.graph.nodes:
+            if (
+                node.op == "call_function"
+                and isinstance(node.target, torch._ops.OpOverload)
+                and op_support.is_node_supported(None, node)
+            ):
+                do_not_decompose.append(node.target)
+        return do_not_decompose, None
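For orientation, a minimal sketch of how this new hook gets used (it mirrors the test added below; `model` and `example_inputs` are assumed to be defined):

    import executorch.exir
    import torch

    from executorch.backends.apple.coreml.partition import CoreMLPartitioner

    ep = torch.export.export(model, example_inputs)
    partitioner = CoreMLPartitioner()

    # to_edge_transform_and_lower consults partitioner.ops_to_not_decompose(ep),
    # so ops the CoreML backend supports natively (e.g. SDPA) are kept whole
    # instead of being decomposed before delegation.
    edge_manager = executorch.exir.to_edge_transform_and_lower(
        ep, partitioner=[partitioner]
    )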

backends/apple/coreml/test/test_coreml_partitioner.py

Lines changed: 46 additions & 0 deletions

@@ -13,6 +13,7 @@

 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
+from executorch.exir.backend.utils import format_delegated_graph


 class TestCoreMLPartitioner(unittest.TestCase):
@@ -79,6 +80,50 @@ def test_vit_skip_conv(self):
             "getitem",
         ]

+    def test_ops_to_not_decompose(self):
+        class Model(torch.nn.Module):
+            def forward(self, q, k, v, mask):
+                return torch.ops.aten.scaled_dot_product_attention.default(
+                    q, k, v, attn_mask=mask
+                )
+
+        model = Model()
+        model.eval()
+
+        batch_size = 1
+        n_heads = 12
+        seq_len = 1
+        max_seq_length = 32
+        embedding_dim = 16
+        q = torch.randn(batch_size, n_heads, seq_len, embedding_dim)
+        k = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        mask = torch.randn(seq_len, max_seq_length)
+        example_inputs = (q, k, v, mask)
+        ep = torch.export.export(model, example_inputs)
+        coreml_partitioner = CoreMLPartitioner()
+
+        # Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph
+        edge_program_manager = executorch.exir.to_edge_transform_and_lower(
+            ep, partitioner=[coreml_partitioner]
+        )
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            in format_delegated_graph(
+                edge_program_manager.exported_program().graph_module
+            )
+        )
+
+        # Using to_edge flow, we expect SDPA will be decomposed and not show up in delegated graph
+        edge_program_manager2 = executorch.exir.to_edge(ep)
+        edge_program_manager2.to_backend(coreml_partitioner)
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            not in format_delegated_graph(
+                edge_program_manager2.exported_program().graph_module
+            )
+        )
+
     def test_buffer(self):
         embedding_dim = 3
         max_seq_len = 2
@@ -129,4 +174,5 @@ def forward(self, q, k_val, input_pos):
     test_runner = TestCoreMLPartitioner()
     test_runner.test_add_sub_skip_mm()
     test_runner.test_vit_skip_conv()
+    test_runner.test_ops_to_not_decompose()
     test_runner.test_buffer()

backends/arm/_passes/fuse_batchnorm2d_pass.py

Lines changed: 1 addition & 0 deletions

@@ -114,6 +114,7 @@ def try_set_param(
         if not try_set_param(conv_bias_node, fused_conv_bias) and try_set_param(
             bn_bias_node, fused_conv_bias
         ):
+            # pyre-ignore[60]
             # Conv didn't have bias but batchnorm did, steal bias from batchnorm.
             conv_args = (*conv.args[0:2], bn_bias_node, *conv.args[3:])
             conv.args = conv_args

backends/cadence/aot/functions_hifi.yaml

Lines changed: 10 additions & 0 deletions

@@ -204,11 +204,21 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_linear_out

+- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out
+
 - func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_relu_out

+- func: cadence::quantized_relu.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out
+
 - func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 2 additions & 2 deletions

@@ -76,8 +76,8 @@ target_include_directories(

 # Custom ops that are needed to run the test model.
 add_library(
-  custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp"
-  "quantize_per_tensor.cpp" "quantized_relu_out.cpp" "dequantize_per_tensor.cpp"
+  custom_ops "op_quantized_linear_out.cpp" "op_quantized_layer_norm.cpp"
+  "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}

backends/cadence/hifi/operators/op_clamp.cpp

Lines changed: 10 additions & 0 deletions

@@ -321,6 +321,16 @@ Tensor& clamp_Tensor_out(

   return out;
 }
+
+Tensor& clamp_tensor_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    const executorch::aten::optional<Tensor>& min_opt,
+    const executorch::aten::optional<Tensor>& max_opt,
+    Tensor& out) {
+  return clamp_Tensor_out(ctx, in, min_opt, max_opt, out);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

backends/cadence/hifi/operators/op_mean.cpp

Lines changed: 10 additions & 0 deletions

@@ -168,6 +168,16 @@ Tensor& mean_out(
   return out;
 }

+Tensor& mean_dim_out(
+    RuntimeContext& ctx,
+    const Tensor& in,
+    optional<ArrayRef<int64_t>> dim_list,
+    bool keepdim,
+    optional<ScalarType> dtype,
+    Tensor& out) {
+  return mean_out(ctx, in, dim_list, keepdim, dtype, out);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

backends/cadence/hifi/operators/op_quantized_relu_out.cpp

Lines changed: 40 additions & 0 deletions

@@ -75,6 +75,46 @@ void quantized_relu_per_tensor_out(
   }
 }

+void quantized_relu_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& in_zero_point,
+    const int64_t out_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    Tensor& output) {
+  int8_t _in_zero_point = in_zero_point.const_data_ptr<int8_t>()[0];
+  int32_t _out_multiplier = out_multiplier.const_data_ptr<int32_t>()[0];
+  int32_t _out_shift = out_shift.const_data_ptr<int32_t>()[0];
+
+  quantized_relu_per_tensor_out(
+      ctx,
+      input,
+      _in_zero_point,
+      out_zero_point,
+      _out_multiplier,
+      _out_shift,
+      output);
+}
+
+void quantized_relu_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const int64_t in_zero_point,
+    const int64_t out_zero_point,
+    const int64_t out_multiplier,
+    const int64_t out_shift,
+    Tensor& output) {
+  quantized_relu_per_tensor_out(
+      ctx,
+      input,
+      in_zero_point,
+      out_zero_point,
+      out_multiplier,
+      out_shift,
+      output);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

backends/cadence/hifi/operators/op_softmax.cpp

Lines changed: 9 additions & 0 deletions

@@ -194,6 +194,15 @@ Tensor& _softmax_out(
   return out;
 }

+Tensor& softmax_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    int64_t dim,
+    bool half_to_float,
+    Tensor& out) {
+  return _softmax_out(ctx, in, dim, half_to_float, out);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl

build/run_android_emulator.sh

Lines changed: 4 additions & 0 deletions

@@ -18,6 +18,8 @@ $ADB_PATH wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; d
 echo "List all running emulators"
 $ADB_PATH devices

+adb uninstall com.example.executorchllamademo || true
+adb uninstall com.example.executorchllamademo.test || true
 adb install -t app-debug.apk
 adb install -t app-debug-androidTest.apk

@@ -26,6 +28,8 @@ adb push model.pte /data/local/tmp/llama
 adb push tokenizer.bin /data/local/tmp/llama
 adb shell am instrument -w -r com.example.executorchllamademo.test/androidx.test.runner.AndroidJUnitRunner

+adb uninstall org.pytorch.executorch || true
+adb uninstall org.pytorch.executorch.test || true
 adb install -t android-test-debug.apk
 adb install -t android-test-debug-androidTest.apk
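A note on the added lines: adb uninstall exits nonzero when the package is not installed, so the appended || true keeps a clean emulator from aborting the script; uninstalling before each adb install -t simply guarantees the test apps start from a fresh state.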
