Commit 29a7c15

Update base for Update on "[ET-VK] Replace Uniform buffers with push constants for view op"
This diff replaces uniform buffers with push constants for the view op in the Vulkan backend of ExecuTorch. The changes update the GLSL code to use push constants instead of uniform buffers, and update the C++ code to pass the sizes as push constants to the shader.

Differential Revision: [D66733658](https://our.internmc.facebook.com/intern/diff/D66733658/)

[ghstack-poisoned]
2 parents 16ba998 + 32d842b commit 29a7c15
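For orientation, here is a hedged sketch of the pattern the commit message above describes, written as compute-shader GLSL. The buffer layout, block names, and field names are hypothetical and are not the actual ExecuTorch view shader; the point is only that sizes formerly read from a uniform buffer are declared as a push_constant block, which the C++ side then fills with vkCmdPushConstants while recording the dispatch.

#version 450
// Hedged illustration only: hypothetical names, not the real ExecuTorch shader.
layout(std430, set = 0, binding = 0) buffer OutBuf { float out_data[]; };
layout(std430, set = 0, binding = 1) readonly buffer InBuf { float in_data[]; };

// Before: sizes would arrive through a uniform buffer, e.g.
//   layout(set = 0, binding = 2) uniform Sizes { ivec4 out_sizes; ivec4 in_sizes; };
// After: the same data is supplied as push constants at command-record time.
layout(push_constant) uniform PushConstants {
  ivec4 out_sizes;
  ivec4 in_sizes;
} pc;

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

void main() {
  uint idx = gl_GlobalInvocationID.x;
  uint numel = uint(pc.out_sizes.x * pc.out_sizes.y * pc.out_sizes.z * pc.out_sizes.w);
  if (idx >= numel) {
    return;
  }
  // A view op only reinterprets tensor metadata, so the element copy is 1:1.
  out_data[idx] = in_data[idx];
}

Push constants avoid allocating and binding a small UBO per dispatch, which is why they are the usual choice for a few words of per-dispatch data such as tensor sizes.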

27 files changed: +1172 −132 lines


backends/arm/test/ops/test_depthwise_conv.py

Lines changed: 6 additions & 7 deletions
@@ -156,11 +156,14 @@
     ("two_dw_conv2d", two_dw_conv2d),
 ]
 
-testsuite_conv2d_u85_xfails = [
+testsuite_conv2d_u85 = [
     ("2x2_1x6x4x4_gp6_st1", dw_conv2d_2x2_1x6x4x4_gp6_st1),
     ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1),
     ("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1),
     ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias),
+]
+
+testsuite_conv2d_u85_xfails = [
     ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3),
     ("two_dw_conv2d", two_dw_conv2d),
 ]
@@ -284,7 +287,7 @@ def test_dw_conv1d_u55_BI(
             model.get_inputs(),
         )
 
-    @parameterized.expand(testsuite_conv1d[2:])
+    @parameterized.expand(testsuite_conv1d + testsuite_conv2d_u85)
     def test_dw_conv_u85_BI(
         self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False
     ):
@@ -296,12 +299,8 @@ def test_dw_conv_u85_BI(
             model.get_inputs(),
         )
 
-    testsuite_conv2d_u85_xfails.remove(
-        ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1)
-    ) # Works
-
     # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520
-    @parameterized.expand(testsuite_conv2d_u85_xfails + testsuite_conv1d[:2])
+    @parameterized.expand(testsuite_conv2d_u85_xfails)
     @conftest.expectedFailureOnFVP
     def test_dw_conv_u85_BI_xfails(
         self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False

backends/arm/test/ops/test_div.py

Lines changed: 28 additions & 2 deletions
@@ -183,8 +183,21 @@ def test_div_tosa_BI(
         test_data = (input_, other_)
         self._test_div_tosa_BI_pipeline(self.Div(), test_data)
 
+    @parameterized.expand(test_data_suite[:2])
+    def test_div_u55_BI(
+        self,
+        test_name: str,
+        input_: Union[torch.Tensor, torch.types.Number],
+        other_: Union[torch.Tensor, torch.types.Number],
+        rounding_mode: Optional[str] = None,
+    ):
+        test_data = (input_, other_)
+        self._test_div_ethos_BI_pipeline(
+            self.Div(), common.get_u55_compile_spec(), test_data
+        )
+
     # Numerical issues on FVP likely due to mul op, MLETORCH-521
-    @parameterized.expand(test_data_suite)
+    @parameterized.expand(test_data_suite[2:])
     @conftest.expectedFailureOnFVP
     def test_div_u55_BI_xfails(
         self,
@@ -198,8 +211,21 @@ def test_div_u55_BI_xfails(
             self.Div(), common.get_u55_compile_spec(), test_data
         )
 
+    @parameterized.expand(test_data_suite[:2])
+    def test_div_u85_BI(
+        self,
+        test_name: str,
+        input_: Union[torch.Tensor, torch.types.Number],
+        other_: Union[torch.Tensor, torch.types.Number],
+        rounding_mode: Optional[str] = None,
+    ):
+        test_data = (input_, other_)
+        self._test_div_ethos_BI_pipeline(
+            self.Div(), common.get_u85_compile_spec(), test_data
+        )
+
     # Numerical issues on FVP likely due to mul op, MLETORCH-521
-    @parameterized.expand(test_data_suite)
+    @parameterized.expand(test_data_suite[2:])
     @conftest.expectedFailureOnFVP
     def test_div_u85_BI_xfails(
         self,

backends/arm/test/ops/test_mul.py

Lines changed: 1 addition & 6 deletions
@@ -152,9 +152,7 @@ def test_mul_tosa_BI(
         test_data = (input_, other_)
         self._test_mul_tosa_BI_pipeline(self.Mul(), test_data)
 
-    # Numerical issues on FVP, MLETORCH-521
     @parameterized.expand(test_data_sute)
-    @conftest.expectedFailureOnFVP
     def test_mul_u55_BI(
         self,
         test_name: str,
@@ -166,10 +164,7 @@ def test_mul_u55_BI(
             common.get_u55_compile_spec(), self.Mul(), test_data
         )
 
-    # Numerical issues on FVP, MLETORCH-521
-    # test_data_sute[0] works on U85
-    @parameterized.expand(test_data_sute[1:])
-    @conftest.expectedFailureOnFVP
+    @parameterized.expand(test_data_sute)
    def test_mul_u85_BI(
         self,
         test_name: str,

backends/cadence/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -23,7 +23,6 @@ include(${EXECUTORCH_ROOT}/build/Utils.cmake)
 
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
-set(TARGET_DIR reference)
 
 if(EXECUTORCH_CADENCE_CPU_RUNNER)
   include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
@@ -61,6 +60,9 @@ if(EXECUTORCH_CADENCE_CPU_RUNNER)
     ${_common_include_directories}
   )
 
+  set(TARGET_DIR reference)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
+
   target_link_libraries(
     cadence_runner
     executorch

backends/cadence/aot/functions.yaml

Lines changed: 50 additions & 0 deletions
@@ -142,6 +142,41 @@
     - arg_meta: null
       kernel_name: torch::executor::where_out
 
+- op: transpose_copy.int_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::transpose_copy_int_out
+
+- op: eq.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::eq_scalar_out
+
+- op: logical_not.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::logical_not_out
+
+- op: any.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::any_out
+
+- op: native_group_norm.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::native_group_norm_out
+
+- op: sum.IntList_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::sum_dim_out
+
+- op: select_copy.int_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::select_copy_int_out
+
 # custom ops
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
@@ -183,3 +218,18 @@
   kernels:
     - arg_meta: null
      kernel_name: impl::reference::quantized_matmul_out
+
+- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_linear_per_tensor_out
+
+- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::im2row_out
+
+- func: cadence::quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_conv_per_tensor_out

backends/cadence/aot/remove_ops.py

Lines changed: 2 additions & 1 deletion
@@ -110,7 +110,8 @@ def call_operator(
 
         # Otherwise, we replace args[0] with cat_inputs.
         new_args = list(args)
-        new_args[0] = cat_inputs
+        # pyre error introduced after D66937105
+        new_args[0] = cat_inputs  # pyre-ignore[6]
         return super().call_operator(op, tuple(new_args), kwargs, meta)
 
 
backends/cadence/reference/operators/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
@@ -55,6 +55,16 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_expand_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_transpose_copy.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_eq.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_logical_not.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_any.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_native_group_norm.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sum.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_select_copy.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/select_copy_util.cpp"
 )
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)
@@ -78,6 +88,7 @@ add_library(
   "quantize_per_tensor.cpp"
   "dequantize_per_tensor.cpp"
   "quantized_matmul_out.cpp"
+  "im2row_out.cpp"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
