diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
index 3f98745093c..937e3e39bc1 100644
--- a/backends/cadence/aot/compiler.py
+++ b/backends/cadence/aot/compiler.py
@@ -235,12 +235,12 @@ def quantize_and_export_to_cadence(
 def export_to_executorch_gen_etrecord(
     model: torch.nn.Module,
     inputs: tuple[object, ...],
-    dump_graphs: bool = False,
     output_dir: Optional[str] = None,
     opt_level: int = 1,
+    dump_graphs: bool = False,
 ) -> ExecutorchProgramManager:
-    edge_prog_manager = export_to_edge(model, inputs)
     cadence_passes = get_cadence_passes(opt_level)
+    edge_prog_manager = export_to_edge(model, inputs, dump_graphs)
 
     # Run a couple required passes for quant/dequant ops
     cadence_prog_manager = edge_prog_manager.transform(
diff --git a/backends/cadence/hifi/operators/op_add.cpp b/backends/cadence/hifi/operators/op_add.cpp
index 43cb0d8cd62..ec0e48e3791 100644
--- a/backends/cadence/hifi/operators/op_add.cpp
+++ b/backends/cadence/hifi/operators/op_add.cpp
@@ -9,6 +9,8 @@
 #include
 #include
 #include
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include
 #include
 #include
@@ -121,7 +123,7 @@ Tensor& add_out(
   float alpha_val;
   torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
 
-  constexpr auto name = "add.out";
+  static constexpr const char op_name[] = "add.out";
   constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */
 
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
@@ -178,23 +180,25 @@ Tensor& add_out(
     return out;
   }
 
-  ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
-    ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
-      using CTYPE_IN = typename torch::executor::
-          promote_types<CTYPE_A, CTYPE_B>::type;
-      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-      CTYPE_IN alpha_val;
-      torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
-
-      ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
-        AddInner<
-            can_cast<CTYPE_IN, CTYPE_OUT>::value,
-            CTYPE_A,
-            CTYPE_B,
-            CTYPE_IN,
-            CTYPE_OUT>::run(a, b, alpha_val, out);
-      });
-    });
+  // Compute Dtype
+  ScalarType compute_type =
+      torch::executor::native::utils::get_compute_type(common_type);
+
+  ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    const CTYPE_COMPUTE val_alpha =
+        torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>(alpha);
+    torch::executor::native::utils::
+        apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+              return val_a + val_alpha * val_b;
+            },
+            ctx,
+            a,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            b,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            out,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
   });
 
   return out;
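The add_out change above is the template for the rest of this patch: the old triple-nested ET_SWITCH (one kernel instantiation per (a, b, out) dtype combination) collapses into a single switch over a compute dtype, with apply_bitensor_elementwise_fn loading each element as CTYPE_COMPUTE, applying the lambda, and storing to the output dtype. A minimal same-shape sketch of that pattern follows; the function name and signatures are simplified stand-ins for illustration, not the real torch::executor::native::utils API, which also handles broadcasting and dtype validation.

```cpp
#include <cstddef>
#include <vector>

// Hypothetical stand-in for utils::apply_bitensor_elementwise_fn: load both
// inputs as the compute type, apply `fn`, store as the output type.
template <typename CTYPE_COMPUTE, typename CTYPE_A, typename CTYPE_B,
          typename CTYPE_OUT, typename Fn>
void apply_bitensor_elementwise(const CTYPE_A* a, const CTYPE_B* b,
                                CTYPE_OUT* out, size_t numel, Fn fn) {
  for (size_t i = 0; i < numel; ++i) {
    const CTYPE_COMPUTE va = static_cast<CTYPE_COMPUTE>(a[i]);
    const CTYPE_COMPUTE vb = static_cast<CTYPE_COMPUTE>(b[i]);
    out[i] = static_cast<CTYPE_OUT>(fn(va, vb));
  }
}

int main() {
  std::vector<int> a{1, 2, 3}, b{4, 5, 6};
  std::vector<float> out(3);
  const float alpha = 2.0f; // alpha-scaled add, computed in float as above
  apply_bitensor_elementwise<float>(
      a.data(), b.data(), out.data(), a.size(),
      [alpha](float x, float y) { return x + alpha * y; });
  return 0;
}
```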
diff --git a/backends/cadence/hifi/operators/op_div.cpp b/backends/cadence/hifi/operators/op_div.cpp
index 88e670b432f..05f3db7ec31 100644
--- a/backends/cadence/hifi/operators/op_div.cpp
+++ b/backends/cadence/hifi/operators/op_div.cpp
@@ -9,6 +9,8 @@
 #include
 #include
 #include
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include
 #include
 #include
@@ -134,25 +136,26 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
       InvalidArgument,
       out);
 
-  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() {
-    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() {
-      ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() {
-        ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() {
-          torch::executor::
-              apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                  [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                    CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                    CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                    CTYPE_IN value = a_casted / b_casted;
-
-                    return static_cast<CTYPE_OUT>(value);
-                  },
-                  a,
-                  b,
-                  out);
-        });
-      });
-    });
+  // Compute Dtype
+  ScalarType compute_type =
+      torch::executor::native::utils::get_compute_type(common_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "div.out";
+
+  ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    torch::executor::native::utils::
+        apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+              return val_a / val_b;
+            },
+            ctx,
+            a,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            b,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            out,
+            torch::executor::native::utils::SupportedTensorDtypes::FLOATHBF16);
   });
 
   return out;
@@ -254,35 +257,59 @@ Tensor& div_out_mode(
     return out;
   }
 
-  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out_mode", CTYPE_A, [&]() {
-    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() {
-      ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() {
-        ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() {
-          torch::executor::
-              apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                  [mode](const CTYPE_A val_a, const CTYPE_B val_b) {
-                    CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                    CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                    CTYPE_IN value = a_casted / b_casted;
-                    if (mode.has_value() && mode.value() == "trunc") {
-                      value = std::trunc(value);
-                    } else if (mode.has_value() && mode.value() == "floor") {
-                      value = std::floor(value);
-                    }
-                    return static_cast<CTYPE_OUT>(value);
-                  },
-                  a,
-                  b,
-                  out);
-        });
-      });
-    });
+  bool div_by_zero_error = false;
+  const bool mode_is_trunc = (mode.has_value() && mode.value() == "trunc");
+  // Compute Dtype
+  ScalarType compute_type =
+      torch::executor::native::utils::get_compute_type(common_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "div.out_mode";
+
+  ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    torch::executor::native::utils::
+        apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [mode_is_trunc, &div_by_zero_error](
+                const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+              if (executorch::runtime::is_integral_type<
+                      CTYPE_COMPUTE,
+                      /*includeBool=*/true>::value) {
+                if (val_b == 0) {
+                  div_by_zero_error = true;
+                  return static_cast<CTYPE_COMPUTE>(0);
+                }
+              }
+              CTYPE_COMPUTE value = val_a / val_b;
+              if (mode_is_trunc) {
+                value = std::trunc(value);
+              } else {
+                // We established above that the mode is either trunc or
+                // floor, so it must be floor.
+                value =
+                    torch::executor::native::utils::floor_divide(val_a, val_b);
+              }
+              return value;
+            },
+            ctx,
+            a,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            b,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            out,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBF16);
   });
 
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      !div_by_zero_error,
+      InvalidArgument,
+      out,
+      "Div mode operation encountered integer division by zero");
+
   return out;
 }
 
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
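Two details of the div_out_mode rewrite above are easy to miss: the element lambda cannot raise errors from inside the loop, so integer division by zero only sets a captured flag that ET_KERNEL_CHECK_MSG inspects once afterwards, and floor mode rounds the quotient toward negative infinity, which differs from C++'s truncating `/` whenever the operands have mixed signs. A small self-contained illustration, with floor_divide as an assumed float-only stand-in for the utils helper:

```cpp
#include <cmath>
#include <cstdio>

// Stand-in for utils::floor_divide (assumed float semantics): round the
// quotient toward negative infinity rather than toward zero.
static float floor_divide(float a, float b) {
  return std::floor(a / b);
}

int main() {
  // trunc(-7 / 2) == -3, but floor_divide(-7, 2) == -4.
  std::printf("trunc: %g\n", std::trunc(-7.0f / 2.0f));
  std::printf("floor: %g\n", floor_divide(-7.0f, 2.0f));
  return 0;
}
```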
diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp
index 478e10da712..ed5ed3359e5 100644
--- a/backends/cadence/hifi/operators/op_mean.cpp
+++ b/backends/cadence/hifi/operators/op_mean.cpp
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include
 #include
 #include
 #include
@@ -141,11 +142,11 @@ Tensor& mean_dim_out(
     return out;
   }
 
-  ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
-    ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
+  ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] {
+    ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] {
       CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-      const size_t num = torch::executor::get_reduced_dim_product(in, dim_list);
-
+      const size_t num =
+          torch::executor::get_reduced_dim_product(in, dim_list);
       for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
         CTYPE_OUT sum = 0;
         if (in.numel() > 0) {
diff --git a/backends/cadence/hifi/operators/op_mul.cpp b/backends/cadence/hifi/operators/op_mul.cpp
index ad12606bdf6..396833dd1af 100644
--- a/backends/cadence/hifi/operators/op_mul.cpp
+++ b/backends/cadence/hifi/operators/op_mul.cpp
@@ -9,6 +9,8 @@
 #include
 #include
 #include
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include
 #include
 #include
@@ -144,20 +146,26 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
     return out;
   }
 
-  ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() {
-    ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() {
-      using CTYPE_IN = typename torch::executor::
-          promote_types<CTYPE_A, CTYPE_B>::type;
-      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-      ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() {
-        MulInner<
-            can_cast<CTYPE_IN, CTYPE_OUT>::value,
-            CTYPE_A,
-            CTYPE_B,
-            CTYPE_IN,
-            CTYPE_OUT>::run(a, b, out);
-      });
-    });
+  // Compute Dtype
+  ScalarType compute_type =
+      torch::executor::native::utils::get_compute_type(common_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "mul.out";
+
+  ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    torch::executor::native::utils::
+        apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+              return val_a * val_b;
+            },
+            ctx,
+            a,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            b,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            out,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
   });
 
   return out;
@@ -166,4 +174,4 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
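Every rewritten kernel in this patch funnels the promoted dtype through utils::get_compute_type before dispatching. Judging from the call sites, the helper's job is to pick the type the element lambda actually computes in; the sketch below captures that assumed behavior (the real function lives in the newly added dtype_util dependency, and its exact rules may differ):

```cpp
#include <cassert>

enum class ScalarType { Bool, Int, Float, Half, BFloat16 };

// Assumed behavior of utils::get_compute_type, inferred from the call sites
// above: reduced-precision float dtypes are widened to Float so the lambda
// runs in full precision; other dtypes compute as themselves.
ScalarType get_compute_type(ScalarType t) {
  return (t == ScalarType::Half || t == ScalarType::BFloat16)
      ? ScalarType::Float
      : t;
}

int main() {
  assert(get_compute_type(ScalarType::Half) == ScalarType::Float);
  assert(get_compute_type(ScalarType::Int) == ScalarType::Int);
  return 0;
}
```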
diff --git a/backends/cadence/hifi/operators/op_sigmoid.cpp b/backends/cadence/hifi/operators/op_sigmoid.cpp
index b9fa73b879f..35321cc27eb 100644
--- a/backends/cadence/hifi/operators/op_sigmoid.cpp
+++ b/backends/cadence/hifi/operators/op_sigmoid.cpp
@@ -9,6 +9,8 @@
 #include
 #include
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include
 #include
@@ -58,19 +60,27 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
     return out;
   }
 
-  ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() {
-    ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() {
-      torch::executor::apply_unary_map_fn(
-          [](const CTYPE_IN val_in) {
-            // perform math in double to preserve precision
-            double in_casted = static_cast<double>(val_in);
-            double out_val = 1.0 / (1.0 + exp(-in_casted));
-            return static_cast<CTYPE_OUT>(out_val);
-          },
-          in.const_data_ptr<CTYPE_IN>(),
-          out.mutable_data_ptr<CTYPE_OUT>(),
-          in.numel());
-    });
+  ScalarType compute_type =
+      executorch::runtime::isFloatingType(in.scalar_type()) ? in.scalar_type()
+                                                            : ScalarType::Float;
+  compute_type = torch::executor::native::utils::get_compute_type(compute_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "sigmoid.out";
+
+  ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    torch::executor::native::utils::
+        apply_unitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [](const CTYPE_COMPUTE val_in) {
+              CTYPE_COMPUTE out_val = static_cast<CTYPE_COMPUTE>(1.0) /
+                  (static_cast<CTYPE_COMPUTE>(1.0) + exp(-val_in));
+              return out_val;
+            },
+            ctx,
+            in,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            out,
+            torch::executor::native::utils::SupportedTensorDtypes::FLOATHBF16);
   });
 
   return out;
@@ -79,4 +89,4 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
diff --git a/backends/cadence/hifi/operators/op_sub.cpp b/backends/cadence/hifi/operators/op_sub.cpp
index 0a362dbf959..cf10e414354 100644
--- a/backends/cadence/hifi/operators/op_sub.cpp
+++ b/backends/cadence/hifi/operators/op_sub.cpp
@@ -9,6 +9,8 @@
 #include
 #include
 #include
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include
 #include
 #include
@@ -176,22 +178,28 @@ Tensor& sub_out(
     return out;
   }
 
-  ET_SWITCH_REALH_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
-    ET_SWITCH_REALH_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
-      using CTYPE_IN = typename torch::executor::
-          promote_types<CTYPE_A, CTYPE_B>::type;
-      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-      CTYPE_IN alpha_val;
-      torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
-      ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
-        SubInner<
-            can_cast<CTYPE_IN, CTYPE_OUT>::value,
-            CTYPE_A,
-            CTYPE_B,
-            CTYPE_IN,
-            CTYPE_OUT>::run(a, b, alpha_val, out);
-      });
-    });
+  // Compute Dtype
+  ScalarType compute_type =
+      torch::executor::native::utils::get_compute_type(common_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "sub.out";
+
+  ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    const CTYPE_COMPUTE val_alpha =
+        torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>(alpha);
+    torch::executor::native::utils::
+        apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+              return val_a - val_alpha * val_b;
+            },
+            ctx,
+            a,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBF16,
+            b,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBF16,
+            out,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBF16);
   });
 
   return out;
@@ -200,4 +208,4 @@ Tensor& sub_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
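Unlike add/mul/sub, sigmoid_out above picks its compute type from the input alone: integral inputs are promoted to Float before get_compute_type is applied, since 1/(1+e^-x) is only meaningful in floating point, and the output dtype set is correspondingly restricted to FLOATHBF16. A tiny standalone version of that promotion:

```cpp
#include <cmath>
#include <cstdio>

// Sigmoid is computed in float even for integral inputs, mirroring the
// compute_type selection in sigmoid_out above (integral -> Float).
static float sigmoid(float x) {
  return 1.0f / (1.0f + std::exp(-x));
}

int main() {
  const int inputs[] = {-2, 0, 2}; // e.g. an int32 input tensor
  for (int v : inputs) {
    std::printf("sigmoid(%d) = %f\n", v, sigmoid(static_cast<float>(v)));
  }
  return 0;
}
```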
diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp
index 06bd0bc3c9f..c94d2ee65c5 100644
--- a/backends/cadence/hifi/operators/op_where.cpp
+++ b/backends/cadence/hifi/operators/op_where.cpp
@@ -8,6 +8,8 @@
 
 #include
 #include
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include
 #include
@@ -148,28 +150,43 @@ Tensor& where_out(
     }
     return out;
   }
-  ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
-    ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
-      using CTYPE_OUT =
-          typename torch::executor::promote_types<CTYPE_A, CTYPE_B>::type;
-      torch::executor::
-          apply_ternary_elementwise_fn<CTYPE_A, CTYPE_B, uint8_t, CTYPE_OUT>(
-              [](const CTYPE_A val_a,
-                 const CTYPE_B val_b,
-                 const uint8_t val_c) {
-                CTYPE_OUT a_casted = static_cast<CTYPE_OUT>(val_a);
-                CTYPE_OUT b_casted = static_cast<CTYPE_OUT>(val_b);
-                return val_c ? a_casted : b_casted;
-              },
-              a,
-              b,
-              cond,
-              out);
-    });
+
+  // Compute Dtype
+  ScalarType compute_type =
+      torch::executor::native::utils::get_compute_type(common_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "where.self_out";
+
+  ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    torch::executor::native::utils::apply_tritensor_elementwise_fn<
+        CTYPE_COMPUTE,
+        op_name>(
+        [](const CTYPE_COMPUTE val_a,
+           const CTYPE_COMPUTE val_b,
+           const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; },
+        ctx,
+        a,
+        torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+        b,
+        torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+        cond,
+        torch::executor::native::utils::SupportedTensorDtypes::BOOL_OR_BYTE,
+        out,
+        torch::executor::native::utils::SupportedTensorDtypes::SAME_AS_COMMON);
   });
 
   return out;
 }
 
+Tensor& where_self_out(
+    RuntimeContext& ctx,
+    const Tensor& cond,
+    const Tensor& a,
+    const Tensor& b,
+    Tensor& out) {
+  return cadence::impl::HiFi::native::where_out(ctx, cond, a, b, out);
+}
+
 } // namespace native
 } // namespace HiFi
 } // namespace impl
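The where_out rewrite above routes cond through the BOOL_OR_BYTE dtype set and selects elementwise, while the new where_self_out entry point is just a thin wrapper providing the cond-first argument order of the where.self_out signature. A same-shape sketch of the selection (broadcasting and dtype conversion omitted):

```cpp
#include <cstddef>
#include <cstdio>

// Elementwise select as computed by the where_out lambda above:
// out[i] = cond[i] ? a[i] : b[i]. Same-shape sketch only; the real helper
// also handles broadcasting and the bool-or-byte cond dtypes.
template <typename T>
void where_select(const bool* cond, const T* a, const T* b, T* out, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    out[i] = cond[i] ? a[i] : b[i];
  }
}

int main() {
  const bool cond[] = {true, false, true};
  const float a[] = {1.f, 2.f, 3.f}, b[] = {10.f, 20.f, 30.f};
  float out[3];
  where_select(cond, a, b, out, 3);
  std::printf("%g %g %g\n", out[0], out[1], out[2]); // 1 20 3
  return 0;
}
```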
diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl
index 96f063728c8..6c671a5f24a 100644
--- a/backends/cadence/hifi/operators/targets.bzl
+++ b/backends/cadence/hifi/operators/targets.bzl
@@ -11,20 +11,230 @@ def define_common_targets():
     # Define build targets for all operators registered in the tables above.
 
     runtime.cxx_library(
-        name = "cadence_hifi_ops",
-        srcs = glob([
-            "*.cpp",
-        ]),
-        exported_headers = glob(["*.h"]),
+        name = "quantize_per_tensor",
+        srcs = [
+            "quantize_per_tensor.cpp"
+        ],
+        platforms = CXX,
+        deps = [
+            "//executorch/kernels/portable/cpu/util:all_deps",
+            "//executorch/kernels/portable/cpu/pattern:all_deps",
+            "//executorch/runtime/kernel:kernel_includes",
+            "//executorch/kernels/portable/cpu:scalar_utils",
+            "//executorch/backends/cadence/hifi/kernels:kernels",
+            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
+        ],
+        visibility = [
+            "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "dequantize_per_tensor",
+        srcs = [
+            "dequantize_per_tensor.cpp"
+        ],
+        platforms = CXX,
+        deps = [
+            "//executorch/kernels/portable/cpu/util:all_deps",
+            "//executorch/kernels/portable/cpu/pattern:all_deps",
+            "//executorch/runtime/kernel:kernel_includes",
+            "//executorch/kernels/portable/cpu:scalar_utils",
+            "//executorch/backends/cadence/hifi/kernels:kernels",
+            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
+        ],
+        visibility = [
+            "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "quantized_layer_norm",
+        srcs = [
+            "quantized_layer_norm.cpp"
+        ],
+        exported_headers = ["operators.h"],
+        platforms = CXX,
+        deps = [
+            "//executorch/kernels/portable/cpu/util:all_deps",
+            "//executorch/kernels/portable/cpu/pattern:all_deps",
+            "//executorch/runtime/kernel:kernel_includes",
+            "//executorch/kernels/portable/cpu:scalar_utils",
+            "//executorch/backends/cadence/hifi/kernels:kernels",
+            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
+        ],
+        visibility = [
+            "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "quantized_linear_out",
+        srcs = [
+            "quantized_linear_out.cpp"
+        ],
+        exported_headers = ["operators.h"],
+        platforms = CXX,
+        deps = [
+            "//executorch/kernels/portable/cpu/util:all_deps",
+            "//executorch/kernels/portable/cpu/pattern:all_deps",
+            "//executorch/runtime/kernel:kernel_includes",
+            "//executorch/kernels/portable/cpu:scalar_utils",
+            "//executorch/backends/cadence/hifi/kernels:kernels",
+            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions"
+        ],
+        visibility = [
+            "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "op_add",
+        srcs = [
+            "op_add.cpp",
+        ],
         platforms = CXX,
         deps = [
             "//executorch/kernels/portable/cpu/util:all_deps",
             "//executorch/kernels/portable/cpu/pattern:all_deps",
             "//executorch/runtime/kernel:kernel_includes",
             "//executorch/kernels/portable/cpu:scalar_utils",
-            "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib",
-            "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common",
             "//executorch/backends/cadence/hifi/kernels:kernels",
+            "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions",
+            "//executorch/kernels/portable/cpu/util:dtype_util",
+            "//executorch/kernels/portable/cpu/util:elementwise_util",
+        ],
+        visibility = [
+            "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+
+    runtime.cxx_library(
+        name = "op_mul",
+        srcs = [
+            "op_mul.cpp",
+        ],
+        platforms = CXX,
+        deps = [
+            "//executorch/kernels/portable/cpu/util:all_deps",
+            "//executorch/kernels/portable/cpu/pattern:all_deps",
+            "//executorch/runtime/kernel:kernel_includes",
+            "//executorch/kernels/portable/cpu:scalar_utils",
"//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/kernels/portable/cpu/util:dtype_util", + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "op_sub", + srcs = [ + "op_sub.cpp", + ], + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/kernels/portable/cpu/util:dtype_util", + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "op_div", + srcs = [ + "op_div.cpp", + ], + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/kernels/portable/cpu/util:dtype_util", + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "op_sigmoid", + srcs = [ + "op_sigmoid.cpp", + ], + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/kernels/portable/cpu/util:dtype_util", + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "op_tanh", + srcs = [ + "op_tanh.cpp", + ], + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + + runtime.cxx_library( + name = "op_where", + srcs = [ + "op_where.cpp", + ], + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" ], visibility = [ "//executorch/backends/cadence/...", diff --git a/backends/cadence/hifi/third-party/nnlib/TARGETS b/backends/cadence/hifi/third-party/nnlib/TARGETS new file mode 100644 index 00000000000..67f2bab681a --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", 
"define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/third-party/nnlib/targets.bzl b/backends/cadence/hifi/third-party/nnlib/targets.bzl new file mode 100644 index 00000000000..615eacaa666 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/targets.bzl @@ -0,0 +1,18 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "nnlib-extensions", + srcs = native.glob(["*.c", "*.cpp"]), + exported_headers = glob(["*.h"]), + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib", + ], + ) diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c index 9eab22b05b7..2a18d57e99f 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c @@ -25,7 +25,6 @@ #include "xa_nnlib_err_chk.h" #include "xa_nnlib_kernels_api.h" - #if HAVE_VFPU static void internal_elm_add_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, const FLOAT32 * __restrict__ p_inp1, @@ -425,4 +424,3 @@ WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, return 0; } - diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c index b9aa102a15f..e11fccbba52 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c @@ -20,11 +20,10 @@ ******************************************************************************/ #include "xa_type_def.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" -#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" -#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" #if HAVE_VFPU static void internal_elm_mul_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c index 6a7f6d0f77d..426d60b0742 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c @@ -20,10 +20,10 @@ ******************************************************************************/ #include "xa_type_def.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" -#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +// #include "xa_nn_basic_state.h" #include "xa_nnlib_kernels_api.h" @@ -835,4 +835,4 @@ 
 
   return 0;
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
index 5978a92d269..fcc89260be4 100644
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
@@ -428,7 +428,7 @@ static inline void xa_nn_reduce_sum_4D_f32_f32(const FLOAT32 * __restrict__ p_in
           p_wsrc2 = (xtfloatx2 *)(p_scr_in + (itr_n * plane_size) + (itr_h * wc_plane_size) + (itr_w * temp_inp_c));
           p_dst = (xtfloatx2 *)(p_scratch + (itr_n * hw_plane_size) + (itr_h * temp_inp_w) + itr_w);
           align_src = AE_LA64_PP(p_wsrc2);
-          xtfloatx2 i1 = AE_MOVXTFLOATX2_FROMF32X2(AE_MOVDA32(0));
+          xtfloatx2 i1 = XT_AE_MOVXTFLOATX2_FROMF32X2(AE_MOVDA32(0));
           for(itr_c = 0; itr_c < (temp_inp_c >> 2); itr_c++)
           {
             xtfloatx2 j1, j2;
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index eb4873d1d17..1dc36afce23 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -88,7 +88,7 @@ def define_common_targets():
         deps = [
             "//executorch/runtime/kernel:kernel_includes",
         ],
-        visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."],
+        visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/...", "@EXECUTORCH_CLIENTS"],
     )
 
     runtime.cxx_library(
@@ -103,7 +103,7 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu:scalar_utils",
             "//executorch/runtime/kernel:kernel_includes",
         ],
-        visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."],
+        visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/...", "@EXECUTORCH_CLIENTS"],
     )
 
     runtime.cxx_library(