Commit a0ef05a

Merge branch 'main' into gh/jackzhxng/11/orig
2 parents: 8f1c751 + 18e9149

148 files changed: +7110, −1555 lines


.ci/scripts/unittest-buck2.sh

Lines changed: 6 additions & 4 deletions
@@ -15,7 +15,7 @@ buck2 query "//backends/apple/... + //backends/example/... + \
   //kernels/optimized/... + //kernels/portable/... + //kernels/quantized/... + \
   //kernels/test/... + //runtime/... + //schema/... + //test/... + //util/..."
 
-UNBUILDABLE_OPTIMIZED_OPS_REGEX="gelu|fft_r2c|log_softmax"
+UNBUILDABLE_OPTIMIZED_OPS_REGEX="_elu|gelu|fft|log_softmax"
 BUILDABLE_OPTIMIZED_OPS=$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX)
 
 # TODO: build prim_ops_test_cpp again once supported_features works in
@@ -24,6 +24,8 @@ BUILDABLE_KERNELS_PRIM_OPS_TARGETS=$(buck2 query //kernels/prim_ops/... | grep -
 # TODO: expand the covered scope of Buck targets.
 # //runtime/kernel/... is failing because //third-party:torchgen_files's shell script can't find python on PATH.
 # //runtime/test/... requires Python torch, which we don't have in our OSS buck setup.
-buck2 test $BUILDABLE_OPTIMIZED_OPS //kernels/portable/... \
-  $BUILDABLE_KERNELS_PRIM_OPS_TARGETS //runtime/backend/... //runtime/core/... \
-  //runtime/executor: //runtime/kernel/... //runtime/platform/...
+for op in "build" "test"; do
+  buck2 $op $BUILDABLE_OPTIMIZED_OPS //kernels/portable/... \
+    $BUILDABLE_KERNELS_PRIM_OPS_TARGETS //runtime/backend/... //runtime/core/... \
+    //runtime/executor: //runtime/kernel/... //runtime/platform/...
+done
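For context, the hunk replaces the single `buck2 test` invocation with a loop that first builds and then tests the same target set. A minimal Python sketch of that same build-then-test flow is shown below; the helper function, its name, and the example target list are illustrative, not part of the repository.

# Illustrative only: mirrors the shell loop added above, which runs
# `buck2 build` and then `buck2 test` over the same set of targets.
import subprocess

def build_then_test(targets: list[str]) -> None:
    for op in ("build", "test"):
        # Fail fast if either phase fails, like `set -e` does in the script.
        subprocess.run(["buck2", op, *targets], check=True)

build_then_test(["//kernels/portable/...", "//runtime/platform/..."])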

.lintrunner.toml

Lines changed: 4 additions & 0 deletions
@@ -271,6 +271,10 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
+    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
+    'kernels/portable/cpu/util/elementwise_util.h',
+    'kernels/portable/cpu/util/math_util.h',
+    'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.

CMakeLists.txt

Lines changed: 0 additions & 8 deletions
@@ -430,14 +430,6 @@ endif()
 
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)
 
-#
-# gflags: Commandline flag host library.
-#
-
-if(EXECUTORCH_BUILD_GFLAGS)
-  add_subdirectory(third-party/gflags)
-endif()
-
 # Install `executorch` library as well as `executorch-config.cmake` under
 # ${CMAKE_INSTALL_PREFIX}/
 install(

backends/apple/coreml/scripts/build_tests.sh

Lines changed: 1 addition & 2 deletions
@@ -33,8 +33,7 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_EXECUTORCH_BUILD_DIR_PATH" \
 -DPLATFORM=MAC_UNIVERSAL \
 -DDEPLOYMENT_TARGET=13.0 \
 -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
--DEXECUTORCH_BUILD_XNNPACK=OFF \
--DEXECUTORCH_BUILD_GFLAGS=OFF
+-DEXECUTORCH_BUILD_XNNPACK=OFF
 
 cmake --build "$CMAKE_EXECUTORCH_BUILD_DIR_PATH" -j9 -t executorch
 

backends/arm/_passes/scalars_to_attribute_pass.py

Lines changed: 1 addition & 1 deletion
@@ -12,8 +12,8 @@
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 
 from executorch.exir.pass_base import ExportPass, PassResult
-from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix
 from torch.fx import GraphModule, Node
+from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix
 
 
 class ScalarsToAttributePass(ExportPass):
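The only change here is that `get_new_attr_name_with_prefix` is now imported from torchao rather than torch.ao. A minimal sketch of how this helper is typically used to attach a scalar to a graph module as an attribute follows; the wrapper function, prefix string, and tensor conversion are hypothetical illustrations, and the sketch assumes the torchao helper keeps the behavior of its torch.ao.quantization.fx.utils predecessor (returning a callable that picks a fresh, non-clashing attribute name).

# Minimal sketch (assumption: torchao's get_new_attr_name_with_prefix behaves
# like the old torch.ao helper, generating an unused attribute name on the module).
import torch
from torch.fx import GraphModule
from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix

def register_scalar_as_attr(gm: GraphModule, value: float) -> str:
    get_name = get_new_attr_name_with_prefix("_scalar_constant")  # hypothetical prefix
    attr_name = get_name(gm)
    setattr(gm, attr_name, torch.tensor(value))  # store the scalar as a tensor attribute
    return attr_name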

backends/cadence/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ elseif(EXECUTORCH_FUSION_G3_OPT)
     ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 else()
   set(TARGET_DIR reference)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
 endif()
 
 
backends/cadence/aot/functions_hifi.yaml

Lines changed: 104 additions & 4 deletions
@@ -32,6 +32,36 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::add_out
 
+- op: bitwise_and.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::bitwise_and_Scalar_out
+
+- op: bitwise_and.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::bitwise_and_Tensor_out
+
+- op: bitwise_or.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::bitwise_or_Scalar_out
+
+- op: bitwise_or.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::bitwise_or_Tensor_out
+
+- op: bitwise_xor.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::bitwise_xor_Scalar_out
+
+- op: bitwise_xor.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::bitwise_xor_Tensor_out
+
 - op: bmm.out
   kernels:
     - arg_meta: null
@@ -65,27 +95,82 @@
 - op: embedding.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::embedding_out
+      kernel_name: cadence::impl::HiFi::embedding_out
+
+- op: eq.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::eq_tensor_out
+
+- op: fmod.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::fmod_Tensor_out
+
+- op: fmod.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::fmod_Scalar_out
 
 - op: full.out
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::full_out
 
-- op: gt.Scalar_out
+- op: ge.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::ge_scalar_out
+
+- op: ge.Tensor_out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::gt_scalar_out
+      kernel_name: cadence::impl::HiFi::ge_tensor_out
 
 - op: gelu.out
   kernels:
     - arg_meta: null
      kernel_name: torch::executor::gelu_out
 
+- op: gt.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::gt_scalar_out
+
+- op: gt.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::gt_tensor_out
+
 - op: hardtanh.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::hardtanh_out
+      kernel_name: cadence::impl::HiFi::hardtanh_out
+
+- op: le.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::le_scalar_out
+
+- op: le.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::le_tensor_out
+
+- op: lt.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::lt_scalar_out
+
+- op: lt.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::lt_tensor_out
+
+- op: masked_fill.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::masked_fill_scalar_out
 
 - op: max_pool2d_with_indices.out
   kernels:
@@ -117,6 +202,11 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::mul_out
 
+- op: ne.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::ne_tensor_out
+
 - op: permute_copy.out
   kernels:
     - arg_meta: null
@@ -147,6 +237,11 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::rsqrt_out
 
+- op: select_copy.int_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::select_copy_int_out
+
 - op: sigmoid.out
   kernels:
     - arg_meta: null
@@ -239,6 +334,11 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_fully_connected_out
 
+- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_matmul_out
+
 - func: cadence::quantized_fully_connected.per_tensor_out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
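Each entry in this YAML binds an out-variant operator to a cadence::impl::HiFi kernel, and the new cadence::quantized_matmul.out line spells out the full argument list. Purely as an illustration of what a call matching that signature would look like, a Python sketch follows; it assumes the op is also exposed under the `cadence` namespace on the Python side, which this YAML alone does not establish, and the shapes, zero points, and scaling values are made up.

# Illustrative sketch of a call matching the quantized_matmul.out schema above.
# Assumption: the Cadence op library is loaded and registers the op under
# torch.ops.cadence; this file only wires the C++ kernel to the schema.
import torch

X = torch.randint(-128, 127, (2, 4, 8), dtype=torch.int8)
Y = torch.randint(-128, 127, (2, 8, 16), dtype=torch.int8)
out = torch.empty((2, 4, 16), dtype=torch.int8)

torch.ops.cadence.quantized_matmul.out(
    X, 0,       # X and X_zero_point
    Y, 0,       # Y and Y_zero_point
    None,       # optional bias
    1, 0, 0,    # out_multiplier, out_shift, out_zero_point
    False,      # transposed
    out=out,
)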
