From b354f46b9f5b14f75fc7ab2bcf14485dc50986a5 Mon Sep 17 00:00:00 2001
From: Hardik Sharma <hardiksharma@meta.com>
Date: Thu, 12 Dec 2024 15:22:55 -0800
Subject: [PATCH 1/5] Allow backends/cadence to use TestUtil.h. (#7304)

Summary:

Create a separate buck target for `TestUtil.h` that can be used by backends. The current `test_util` target has dependencies that don't compile with the Xtensa toolchain.

Reviewed By: zonglinpeng

Differential Revision: D67128600
---
 kernels/test/targets.bzl | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 2dd019e1b3e..18fa646aec4 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -41,6 +41,29 @@ def define_common_targets():
 
     for aten_kernel in (True, False):
         aten_suffix = "_aten" if aten_kernel else ""
+        runtime.cxx_library(
+            name = "gtest_utils" + aten_suffix,  # Exports only TestUtil.h so backends can depend on it.
+            exported_headers = [
+                "TestUtil.h",
+            ],
+            visibility = [
+                "//executorch/kernels/...",
+                "@EXECUTORCH_CLIENTS",
+            ],
+            preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_kernel else [],
+            exported_deps = [
+                "//executorch/runtime/core:core",
+                "//executorch/runtime/kernel:kernel_includes",
+                "//executorch/test/utils:utils" + aten_suffix,
+                "//executorch/runtime/platform:pal_interface",
+            ],
+            fbcode_exported_deps = [
+                "//common/gtest:gtest",
+            ],
+            xplat_exported_deps = [
+                "//third-party/googletest:gtest_main",
+            ],
+        )
         runtime.cxx_library(
             name = "test_util" + aten_suffix,
             srcs = [
@@ -49,7 +72,6 @@ def define_common_targets():
             ],
             exported_headers = [
                 "BinaryLogicalOpTest.h",
-                "TestUtil.h",
                 "UnaryUfuncRealHBBF16ToFloatHBF16Test.h",
             ],
             visibility = [
@@ -59,6 +81,7 @@ def define_common_targets():
             preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_kernel else [],
             exported_deps = [
                 ":supported_features_header",
+                ":gtest_utils" + aten_suffix,
                 "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
                 "//executorch/runtime/core/exec_aten/testing_util:tensor_util" + aten_suffix,
                 "//executorch/runtime/kernel:kernel_includes",

From 4145467ae38f73162236aa38bbf5e8e5f7c59c71 Mon Sep 17 00:00:00 2001
From: Hardik Sharma <hardiksharma@meta.com>
Date: Thu, 12 Dec 2024 15:22:55 -0800
Subject: [PATCH 2/5] Xtensa ISS PAL layer for logging/timing. (#7311)

Summary:

Overrides for `et_pal*` weak symbols for logging/timing with xtensa ISS.

Reviewed By: zonglinpeng

Differential Revision: D67128599
---
 backends/cadence/runtime/TARGETS     |  3 +
 backends/cadence/runtime/et_pal.cpp  | 90 ++++++++++++++++++++++++++++
 backends/cadence/runtime/targets.bzl | 15 +++++
 3 files changed, 108 insertions(+)
 create mode 100644 backends/cadence/runtime/et_pal.cpp
 create mode 100644 backends/cadence/runtime/targets.bzl

diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS
index 95a7bdc3694..4055f1922a1 100644
--- a/backends/cadence/runtime/TARGETS
+++ b/backends/cadence/runtime/TARGETS
@@ -1,3 +1,4 @@
+load(":targets.bzl", "define_common_targets")
 load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 
 oncall("odai_jarvis")
@@ -22,3 +23,5 @@ python_library(
         "//executorch/exir:lib",
     ],
 )
+
+define_common_targets()
diff --git a/backends/cadence/runtime/et_pal.cpp b/backends/cadence/runtime/et_pal.cpp
new file mode 100644
index 00000000000..fdf058f05b3
--- /dev/null
+++ b/backends/cadence/runtime/et_pal.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#if defined(XTENSA)
+
+#include <stdio.h>
+#include <sys/times.h>
+
+#include <xtensa/sim.h>
+
+#include <executorch/runtime/platform/platform.h>
+
+#define ET_LOG_OUTPUT_FILE stdout
+
+void et_pal_emit_log_message(
+    et_timestamp_t timestamp,
+    et_pal_log_level_t level,
+    const char* filename,
+    ET_UNUSED const char* function,
+    size_t line,
+    const char* message,
+    ET_UNUSED size_t length) {
+  // NOTE(review): treats ticks as nanoseconds, but the XTENSA et_pal_current_ticks() returns tms_utime from times(); confirm the unit.
+  timestamp /= 1000; // To microseconds
+  int us = timestamp % 1000000;
+  timestamp /= 1000000; // To seconds
+  int sec = timestamp % 60;
+  timestamp /= 60; // To minutes
+  int min = timestamp % 60;
+  timestamp /= 60; // To hours
+  int hour = timestamp;
+
+  fprintf(
+      ET_LOG_OUTPUT_FILE,
+      "%c %02d:%02d:%02d.%06d executorch:%s:%d] %s\n",
+      static_cast<char>(level),
+      hour,
+      min,
+      sec,
+      us,
+      filename,
+      static_cast<int>(line),
+      message);
+  fflush(ET_LOG_OUTPUT_FILE);
+}
+
+et_timestamp_t et_pal_current_ticks(void) {
+  struct tms curr_time;
+  times(&curr_time);
+  return curr_time.tms_utime; // Per POSIX, user CPU time in clock ticks (not wall-clock time).
+}
+
+void et_pal_init(void) {
+  xt_iss_client_command("all", "enable");
+}
+
+#else
+
+#include <time.h>
+
+#include <cstdio>
+#include <cstdlib>
+
+#include <executorch/runtime/platform/platform.h>
+
+#define ET_LOG_OUTPUT_FILE stderr
+
+#define NSEC_PER_USEC 1000UL
+#define USEC_IN_SEC 1000000UL
+#define NSEC_IN_USEC 1000UL
+#define NSEC_IN_SEC (NSEC_IN_USEC * USEC_IN_SEC)
+
+et_timestamp_t et_pal_current_ticks(void) {
+  // Non-XTENSA build: CLOCK_REALTIME expressed in nanoseconds.
+  struct timespec ts;
+  if (clock_gettime(CLOCK_REALTIME, &ts) != 0) {
+    fprintf(ET_LOG_OUTPUT_FILE, "Could not get time\n");
+    fflush(ET_LOG_OUTPUT_FILE);
+    std::abort();
+  }
+  // Widen first: a 32-bit `unsigned long` product would wrap at sec * 1e9.
+  return (static_cast<et_timestamp_t>(ts.tv_sec) * NSEC_IN_SEC) + ts.tv_nsec;
+}
+
+#endif
diff --git a/backends/cadence/runtime/targets.bzl b/backends/cadence/runtime/targets.bzl
new file mode 100644
index 00000000000..dabe42ad824
--- /dev/null
+++ b/backends/cadence/runtime/targets.bzl
@@ -0,0 +1,15 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    runtime.cxx_library(
+        name = "et_pal",
+        srcs = ["et_pal.cpp"],
+        link_whole = True,  # Presumably so the et_pal_* overrides are always linked in; confirm.
+        visibility = [
+            "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+        exported_deps = [
+            "//executorch/runtime/platform:platform",
+        ],
+    )

From 9d2157e438c4977f5f7f8b8f686eb63ce2d7d474 Mon Sep 17 00:00:00 2001
From: Hardik Sharma <hardiksharma@meta.com>
Date: Thu, 12 Dec 2024 15:22:55 -0800
Subject: [PATCH 3/5] Use macro `XT_KERNEL_CHECK` to handle errors returned by
 nnlib. (#7312)

Summary:

Add an `XT_KERNEL_CHECK` macro, built on `ET_KERNEL_CHECK_MSG`, to detect error codes returned by xa_nn* library calls.

Reviewed By: zonglinpeng

Differential Revision: D67128597
---
 .../cadence/fusion_g3/operators/op_add.cpp    | 119 ++++++++++++++----
 .../fusion_g3/operators/tests/test_op_add.cpp |  33 +++--
 2 files changed, 120 insertions(+), 32 deletions(-)

diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp
index 0a7c7e7e035..683323b2662 100644
--- a/backends/cadence/fusion_g3/operators/op_add.cpp
+++ b/backends/cadence/fusion_g3/operators/op_add.cpp
@@ -13,18 +13,27 @@
 #include <executorch/runtime/platform/assert.h>
 #include <xa_nnlib_kernels_api.h>
 
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
-using executorch::runtime::canCast;
-using torch::executor::Error;
-using torch::executor::KernelRuntimeContext;
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::canCast;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
 
 namespace cadence {
 namespace impl {
 namespace G3 {
 namespace native {
 
+// Runs nnlib `kernel(args...)`; a nonzero status trips ET_KERNEL_CHECK_MSG.
+#define XT_KERNEL_CHECK(ctx, out, kernel, ...)                  \
+  do { /* scope `ret` so expansions cannot collide */           \
+    const auto ret = kernel(__VA_ARGS__);                       \
+    ET_KERNEL_CHECK_MSG(                                        \
+        ctx, ret == 0, InvalidArgument, out,                    \
+        "Failed to run kernel: " #kernel "(" #__VA_ARGS__ ")"); \
+  } while (0)
+
 Tensor& add_out(
     KernelRuntimeContext& ctx,
     const Tensor& a,
@@ -121,13 +130,30 @@ Tensor& add_out(
     torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
 
     if ((a.numel() == 1) && (alpha_val == 1)) {
-      xa_nn_elm_add_scalar_32x32_32(
-          out_data, inp2_data, inp1_data[0], alpha_val, out.numel());
+      XT_KERNEL_CHECK(
+          ctx,
+          out,
+          xa_nn_elm_add_scalar_32x32_32,
+          out_data,
+          inp2_data,
+          inp1_data[0],
+          alpha_val,
+          out.numel());
     } else if (b.numel() == 1) {
-      xa_nn_elm_add_scalar_32x32_32(
-          out_data, inp1_data, inp2_data[0], alpha_val, out.numel());
+      XT_KERNEL_CHECK(
+          ctx,
+          out,
+          xa_nn_elm_add_scalar_32x32_32,
+          out_data,
+          inp1_data,
+          inp2_data[0],
+          alpha_val,
+          out.numel());
     } else if (broadcast) {
-      xa_nn_elm_add_broadcast_5D_32x32_32(
+      XT_KERNEL_CHECK(
+          ctx,
+          out,
+          xa_nn_elm_add_broadcast_5D_32x32_32,
           out_data,
           out_shape,
           inp1_data,
@@ -137,8 +163,15 @@ Tensor& add_out(
           max_dim,
           alpha_val);
     } else {
-      xa_nn_elm_add_32x32_32(
-          out_data, inp1_data, inp2_data, alpha_val, out.numel());
+      XT_KERNEL_CHECK(
+          ctx,
+          out,
+          xa_nn_elm_add_32x32_32,
+          out_data,
+          inp1_data,
+          inp2_data,
+          alpha_val,
+          out.numel());
     }
   } else if ((compute_type == ScalarType::Float) && (optimized)) {
     const float* const inp1_data = a.const_data_ptr<float>();
@@ -149,13 +182,30 @@ Tensor& add_out(
     torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
 
     if ((a.numel() == 1) && (alpha_val == 1.0)) {
-      xa_nn_elm_add_scalar_f32xf32_f32(
-          out_data, inp2_data, inp1_data[0], alpha_val, out.numel());
+      XT_KERNEL_CHECK(
+          ctx,
+          out,
+          xa_nn_elm_add_scalar_f32xf32_f32,
+          out_data,
+          inp2_data,
+          inp1_data[0],
+          alpha_val,
+          out.numel());
     } else if (b.numel() == 1) {
-      xa_nn_elm_add_scalar_f32xf32_f32(
-          out_data, inp1_data, inp2_data[0], alpha_val, out.numel());
+      XT_KERNEL_CHECK(
+          ctx,
+          out,
+          xa_nn_elm_add_scalar_f32xf32_f32,
+          out_data,
+          inp1_data,
+          inp2_data[0],
+          alpha_val,
+          out.numel());
     } else if (broadcast) {
-      xa_nn_elm_add_broadcast_5D_f32xf32_f32(
+      XT_KERNEL_CHECK(
+          ctx,
+          out,
+          xa_nn_elm_add_broadcast_5D_f32xf32_f32,
           out_data,
           out_shape,
           inp1_data,
@@ -165,8 +215,15 @@ Tensor& add_out(
           max_dim,
           alpha_val);
     } else {
-      xa_nn_elm_add_f32xf32_f32(
-          out_data, inp1_data, inp2_data, alpha_val, out.numel());
+      XT_KERNEL_CHECK(
+          ctx,
+          out,
+          xa_nn_elm_add_f32xf32_f32,
+          out_data,
+          inp1_data,
+          inp2_data,
+          alpha_val,
+          out.numel());
     }
   } else {
     ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
@@ -242,8 +299,15 @@ Tensor& add_scalar_out(
 
     int* const out_data = out.mutable_data_ptr<int>();
 
-    xa_nn_elm_add_scalar_32x32_32(
-        out_data, inp1_data, inp2_val, alpha_val, out.numel());
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_add_scalar_32x32_32,
+        out_data,
+        inp1_data,
+        inp2_val,
+        alpha_val,
+        out.numel());
 
   } else if (compute_type == ScalarType::Float) {
     const float* const inp1_data = a.const_data_ptr<float>();
@@ -255,8 +319,15 @@ Tensor& add_scalar_out(
 
     float* const out_data = out.mutable_data_ptr<float>();
 
-    xa_nn_elm_add_scalar_f32xf32_f32(
-        out_data, inp1_data, inp2_val, alpha_val, out.numel());
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_elm_add_scalar_f32xf32_f32,
+        out_data,
+        inp1_data,
+        inp2_val,
+        alpha_val,
+        out.numel());
 
   } else {
     ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
diff --git a/backends/cadence/fusion_g3/operators/tests/test_op_add.cpp b/backends/cadence/fusion_g3/operators/tests/test_op_add.cpp
index 06bf4bf4ec1..cbc419d47e1 100644
--- a/backends/cadence/fusion_g3/operators/tests/test_op_add.cpp
+++ b/backends/cadence/fusion_g3/operators/tests/test_op_add.cpp
@@ -10,6 +10,8 @@
 #include <stdio.h>
 
 #include <executorch/backends/cadence/fusion_g3/operators/operators.h>
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
@@ -24,24 +26,19 @@ namespace {
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
+using ::executorch::aten::TensorImpl;
+using ::executorch::runtime::Error;
 using ::executorch::runtime::KernelRuntimeContext;
 using ::executorch::runtime::runtime_init;
 using ::executorch::runtime::testing::TensorFactory;
-using ::testing::Test;
 
-class FusionG3OperatorTest : public Test {
+class FusionG3OperatorTest : public OperatorTest {
  public:
-  void SetUp() override {
-    runtime_init();
-  }
-
  protected:
   Tensor&
   add_out(const Tensor& a, const Tensor& b, const Scalar& alpha, Tensor& out) {
     return cadence::impl::G3::native::add_out(context_, a, b, alpha, out);
   }
-
-  KernelRuntimeContext context_;
 };
 
 TEST_F(FusionG3OperatorTest, TwoDimFloatTensorAddTest) {
@@ -77,6 +74,26 @@ TEST_F(FusionG3OperatorTest, AddWithBroadcastTest) {
   EXPECT_TENSOR_EQ(out, tf.full(size_a, 2));
 }
 
+TEST_F(FusionG3OperatorTest, KernelCheckTest) {
+  TensorFactory<ScalarType::Float> tf;
+  // Broadcast add.
+  const std::vector<TensorImpl::SizesType> sizeOfA{1, 3, 2, 4}, sizeOfB{2, 4};
+  const Tensor b = tf.ones(sizeOfB);
+  Tensor out = tf.zeros(sizeOfA);
+  // Create a null tensor to force kernel check failure.
+  TensorImpl nullTensorImpl(
+      b.scalar_type(),
+      b.dim(),
+      const_cast<TensorImpl::SizesType*>(b.sizes().data()),
+      // Use nullptr to force kernel check failure.
+      /*data=*/nullptr,
+      const_cast<TensorImpl::DimOrderType*>(b.dim_order().data()));
+  Tensor nullTensor(&nullTensorImpl);
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, add_out(tf.ones(sizeOfA), nullTensor, 1, out));
+}
+
 } // namespace
 } // namespace native
 } // namespace G3

From 0a2e2f36c8213b8c96b6b4171d4342d5687076a5 Mon Sep 17 00:00:00 2001
From: Hardik Sharma <hardiksharma@meta.com>
Date: Thu, 12 Dec 2024 15:22:55 -0800
Subject: [PATCH 4/5] Separate buck targets per operator. (#7314)

Summary:

Keep targets separate so we only compile the operators we need.

Reviewed By: zonglinpeng

Differential Revision: D67128598
---
 .../cadence/fusion_g3/operators/targets.bzl   | 65 +++++++++++++------
 1 file changed, 46 insertions(+), 19 deletions(-)

diff --git a/backends/cadence/fusion_g3/operators/targets.bzl b/backends/cadence/fusion_g3/operators/targets.bzl
index 9318b369370..47d035d420d 100644
--- a/backends/cadence/fusion_g3/operators/targets.bzl
+++ b/backends/cadence/fusion_g3/operators/targets.bzl
@@ -1,6 +1,45 @@
 load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
+def define_operator(name: str, deps: list[str] | None = None) -> None:
+    op_name = "op_{}".format(name)
+
+    # Deps used by all operators.
+    common_deps = [
+        "//executorch/kernels/portable/cpu/util:all_deps",
+        "//executorch/kernels/portable/cpu/pattern:all_deps",
+        "//executorch/runtime/kernel:kernel_includes",
+        "//executorch/kernels/portable/cpu:scalar_utils",
+        "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib_common",
+        "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib",
+    ]
+    if deps == None:
+        deps = []
+
+    runtime.cxx_library(
+        name = op_name,
+        srcs = [op_name + ".cpp"],
+        platforms = CXX,
+        visibility = [
+            "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+        deps = deps + common_deps,
+        exported_deps = [
+            ":operators_header",
+        ],
+    )
+
+OPERATORS = [
+    "add",
+    "cat",
+    "dequantize",
+    "mul",
+    "native_layer_norm",
+    "quantize",
+    "softmax",
+]
+
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
 
@@ -11,28 +50,16 @@ def define_common_targets():
     # Define build targets for all operators registered in the tables above.
 
     runtime.cxx_library(
-        name = "cadence_g3_ops",
-        srcs = glob([
-            "*.cpp",
-        ]),
-        exported_headers = glob([
-            "*.h",
-        ]),
-        platforms = CXX,
-        deps = [
-            "//executorch/kernels/portable/cpu/util:all_deps",
-            "//executorch/kernels/portable/cpu/pattern:all_deps",
-            "//executorch/runtime/kernel:kernel_includes",
-            "//executorch/kernels/portable/cpu:scalar_utils",
-            "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib_common",
-            "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib",
-        ],
+        name = "operators_header",
+        exported_headers = ["operators.h"],
         visibility = [
             "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
         ],
         exported_deps = [
-            "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib_common",
-            "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/kernel:kernel_runtime_context",
         ],
     )
+
+    for op in OPERATORS:
+        define_operator(op)

From 9d9f728ffa5e570cfde3bbcf8754db71c637b73d Mon Sep 17 00:00:00 2001
From: Hardik Sharma <hardiksharma@meta.com>
Date: Thu, 12 Dec 2024 15:22:55 -0800
Subject: [PATCH 5/5] FusionG3 operators. (#7315)

Summary:

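Clean up header order and `using` declarations in the FusionG3 operators to match the style guide. Also fixes the `ASYM_DEQUANTIZE_IMPL_TESNOR` macro-name typo, adds `#pragma once` to `operators.h`, and adds missing end-of-file newlines.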

Reviewed By: zonglinpeng

Differential Revision: D67128499
---
 .../cadence/fusion_g3/operators/op_add.cpp    |  5 ++-
 .../cadence/fusion_g3/operators/op_cat.cpp    | 17 ++++----
 .../fusion_g3/operators/op_dequantize.cpp     | 24 ++++++-----
 .../cadence/fusion_g3/operators/op_mul.cpp    | 17 ++++----
 .../operators/op_native_layer_norm.cpp        | 20 +++++----
 .../fusion_g3/operators/op_quantize.cpp       | 43 ++++++++++---------
 .../fusion_g3/operators/op_softmax.cpp        | 18 ++++----
 .../cadence/fusion_g3/operators/operators.h   |  2 +
 .../fusion_g3/operators/tests/test_op_add.cpp |  2 +
 9 files changed, 83 insertions(+), 65 deletions(-)

diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp
index 683323b2662..a68cef54b44 100644
--- a/backends/cadence/fusion_g3/operators/op_add.cpp
+++ b/backends/cadence/fusion_g3/operators/op_add.cpp
@@ -6,12 +6,15 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/backends/cadence/fusion_g3/operators/operators.h>
+
+#include <xa_nnlib_kernels_api.h>
+
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
-#include <xa_nnlib_kernels_api.h>
 
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp
index 7fae3fa29c4..f0f327c024b 100644
--- a/backends/cadence/fusion_g3/operators/op_cat.cpp
+++ b/backends/cadence/fusion_g3/operators/op_cat.cpp
@@ -6,16 +6,17 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <cstring>
+
+#include <xa_nnlib_kernels_api.h>
+
 #include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include <xa_nnlib_kernels_api.h>
-#include <cstring>
 
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
-using torch::executor::Error;
-using torch::executor::KernelRuntimeContext;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
 
 /* ScalarType in Executorch do not have support for below data types.
  * So, creating a placeholder for these data types. Once, ScalarTypes is
@@ -194,4 +195,4 @@ Tensor& cat_out(
 } // namespace native
 } // namespace G3
 } // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
index cd5d4a753ef..ed5b3125ac4 100644
--- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
@@ -6,18 +6,20 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/kernels/portable/cpu/util/reduce_util.h>
-#include <executorch/runtime/kernel/kernel_includes.h>
-#include <xa_nnlib_kernels_api.h>
 #include <algorithm>
 #include <cinttypes>
 #include <cmath>
 
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
-using torch::executor::Error;
-using torch::executor::KernelRuntimeContext;
+#include <xa_nnlib_kernels_api.h>
+
+#include <executorch/kernels/portable/cpu/util/reduce_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
 
 template <typename T>
 using optional = exec_aten::optional<T>;
@@ -185,7 +187,7 @@ void dequantize_impl(
       if (axis == NULL) {
 // calculate the dequantized output, cast scale to float to match fbgemm
 // behavior
-#define ASYM_DEQUANTIZE_IMPL_TESNOR(IN_CTYPE, OUT_CTYPE, out_dtype)            \
+#define ASYM_DEQUANTIZE_IMPL_TENSOR(IN_CTYPE, OUT_CTYPE, out_dtype)            \
   case ScalarType::out_dtype: {                                                \
     /* Hoist these function calls out of our inner loop because they might not \
      * get inlined without LTO, particularly in ATen mode. */                  \
@@ -201,7 +203,7 @@ void dequantize_impl(
 #define ASYM_CALCULATE_INT_TYPE_TENSOR(IN_CTYPE, in_dtype)               \
   case ScalarType::in_dtype:                                             \
     switch (out.scalar_type()) {                                         \
-      ET_FORALL_FLOAT_TYPES_WITH(IN_CTYPE, ASYM_DEQUANTIZE_IMPL_TESNOR); \
+      ET_FORALL_FLOAT_TYPES_WITH(IN_CTYPE, ASYM_DEQUANTIZE_IMPL_TENSOR); \
       default:                                                           \
         ET_CHECK_MSG(                                                    \
             false,                                                       \
@@ -219,7 +221,7 @@ void dequantize_impl(
                 static_cast<int8_t>(input.scalar_type()));
         }
 #undef ASYM_CALCULATE_INT_TYPE_TENSOR
-#undef ASYM_DEQUANTIZE_IMPL_TESNOR
+#undef ASYM_DEQUANTIZE_IMPL_TENSOR
       } else {
         // a list contains all dimensions except axis
         int64_t dims[input.dim() - 1];
diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp
index 914ecf9d7e4..840cb16c7cf 100644
--- a/backends/cadence/fusion_g3/operators/op_mul.cpp
+++ b/backends/cadence/fusion_g3/operators/op_mul.cpp
@@ -6,18 +6,19 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <xa_nnlib_kernels_api.h>
+
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
-#include <xa_nnlib_kernels_api.h>
 
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
-using executorch::runtime::canCast;
-using torch::executor::Error;
-using torch::executor::KernelRuntimeContext;
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::canCast;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
 
 namespace cadence {
 namespace impl {
@@ -238,4 +239,4 @@ Tensor& mul_scalar_out(
 } // namespace native
 } // namespace G3
 } // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
diff --git a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
index 68d111795c0..a5fbe31eee5 100644
--- a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
+++ b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp
@@ -6,18 +6,20 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <cmath>
+#include <tuple>
+
+#include <xa_nnlib_kernels_api.h>
+
 #include <executorch/kernels/portable/cpu/util/normalization_ops_util.h>
 #include <executorch/kernels/portable/cpu/vec_ops.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include <xa_nnlib_kernels_api.h>
-#include <cmath>
-#include <tuple>
 
-using Tensor = exec_aten::Tensor;
-using ScalarType = exec_aten::ScalarType;
-using IntArrayRef = exec_aten::ArrayRef<int64_t>;
-using torch::executor::Error;
-using torch::executor::KernelRuntimeContext;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
 
 namespace cadence {
 namespace impl {
@@ -255,4 +257,4 @@ std::tuple<Tensor&, Tensor&, Tensor&> native_layer_norm_out(
 } // namespace native
 } // namespace G3
 } // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp
index 399e1be25a1..fc206b67cd6 100644
--- a/backends/cadence/fusion_g3/operators/op_quantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_quantize.cpp
@@ -6,18 +6,21 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/kernels/portable/cpu/util/reduce_util.h>
-#include <executorch/runtime/kernel/kernel_includes.h>
-#include <xa_nnlib_kernels_api.h>
 #include <algorithm>
 #include <cinttypes>
 #include <cmath>
 
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
-using torch::executor::Error;
-using torch::executor::KernelRuntimeContext;
+#include <xa_nnlib_kernels_api.h>
+
+#include <executorch/kernels/portable/cpu/util/reduce_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+using ::executorch::aten::ArrayRef;
+using ::executorch::aten::optional;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
 
 /* ScalarType in Executorch do not have support for below data types.
  * So, creating a placeholder for these data types. Once, ScalarTypes is
@@ -142,7 +145,7 @@ void quantize_impl(
     int* axis,
     int quant_min,
     int quant_max) {
-  const exec_aten::ArrayRef<Tensor::SizesType> input_size = input.sizes();
+  const ArrayRef<Tensor::SizesType> input_size = input.sizes();
 
   int kTensorDimensionLimit = 5;
 
@@ -301,8 +304,8 @@ void quantize_impl(
           }
         }
 
-        exec_aten::optional<exec_aten::ArrayRef<int64_t>> optional_dim_list{
-            exec_aten::ArrayRef<int64_t>{dims, size_t(input.dim() - 1)}};
+        optional<ArrayRef<int64_t>> optional_dim_list{
+            ArrayRef<int64_t>{dims, size_t(input.dim() - 1)}};
 
 // Actual quantization logic
 // input, out are the input and output tensors
@@ -487,8 +490,8 @@ void quantize_impl(
           }
         }
 
-        exec_aten::optional<exec_aten::ArrayRef<int64_t>> optional_dim_list{
-            exec_aten::ArrayRef<int64_t>{dims, size_t(input.dim() - 1)}};
+        optional<ArrayRef<int64_t>> optional_dim_list{
+            ArrayRef<int64_t>{dims, size_t(input.dim() - 1)}};
 
 // Actual quantization logic
 // input, out are the input and output tensors
@@ -565,9 +568,9 @@ Tensor& quantize_per_tensor_out(
     int64_t quant_max,
     ScalarType dtype,
     Tensor& out) {
-  torch::executor::Error err = resize_tensor(out, input.sizes());
+  Error err = resize_tensor(out, input.sizes());
   ET_CHECK_MSG(
-      err == torch::executor::Error::Ok,
+      err == Error::Ok,
       "Failed to resize out Tensor in quantize_per_tensor_out");
 
   // check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
@@ -600,7 +603,7 @@ Tensor& quantize_per_tensor_tensor_args_out(
   // after ET_KERNEL_CHECK is fully implemented and properly allows non fatal
   // failures.
   if (scale.scalar_type() != ScalarType::Double) {
-    context.fail(torch::executor::Error::InvalidArgument);
+    context.fail(Error::InvalidArgument);
     return out;
   }
   ET_CHECK_MSG(
@@ -657,7 +660,7 @@ Tensor& quantize_per_channel_out(
     int64_t quant_max,
     ScalarType dtype,
     Tensor& out) {
-  torch::executor::Error err = resize_tensor(out, input.sizes());
+  Error err = resize_tensor(out, input.sizes());
 
   // normalize axis
   ET_CHECK_MSG(
@@ -671,7 +674,7 @@ Tensor& quantize_per_channel_out(
   }
 
   ET_CHECK_MSG(
-      err == torch::executor::Error::Ok,
+      err == Error::Ok,
       "Failed to resize out Tensor in quantize_per_channel_out");
 
   ET_CHECK_MSG(
@@ -776,9 +779,9 @@ Tensor& quantize_per_token_out(
           input_strides.data(),
           executorch::runtime::TensorShapeDynamism::STATIC);
   Tensor reshaped_input(&reshaped_input_impl);
-  torch::executor::Error err = resize_tensor(out, input.sizes());
+  Error err = resize_tensor(out, input.sizes());
   ET_CHECK_MSG(
-      err == torch::executor::Error::Ok,
+      err == Error::Ok,
       "Failed to resize out Tensor in quantize_per_channel_out");
 #endif
 
diff --git a/backends/cadence/fusion_g3/operators/op_softmax.cpp b/backends/cadence/fusion_g3/operators/op_softmax.cpp
index 9b51bdd22a3..9f343481508 100644
--- a/backends/cadence/fusion_g3/operators/op_softmax.cpp
+++ b/backends/cadence/fusion_g3/operators/op_softmax.cpp
@@ -6,18 +6,20 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <cmath>
+
+#include <xa_nnlib_kernels_api.h>
+
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include <xa_nnlib_kernels_api.h>
-#include <cmath>
 
-using exec_aten::Scalar;
-using exec_aten::ScalarType;
-using exec_aten::Tensor;
-using torch::executor::Error;
-using torch::executor::KernelRuntimeContext;
+using ::executorch::aten::ArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
 
 namespace cadence {
 namespace impl {
@@ -51,7 +53,7 @@ Tensor& _softmax_out(
   dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim;
 
   int inp_shapes[in.dim()];
-  const exec_aten::ArrayRef<Tensor::SizesType> in_size = in.sizes();
+  const ArrayRef<Tensor::SizesType> in_size = in.sizes();
   for (int i = 0; i < in.dim(); i++) {
     inp_shapes[i] = in_size[i];
   }
diff --git a/backends/cadence/fusion_g3/operators/operators.h b/backends/cadence/fusion_g3/operators/operators.h
index fc4d5ff6252..9d7f7b9c30e 100644
--- a/backends/cadence/fusion_g3/operators/operators.h
+++ b/backends/cadence/fusion_g3/operators/operators.h
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#pragma once
+
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/kernel/kernel_runtime_context.h>
 
diff --git a/backends/cadence/fusion_g3/operators/tests/test_op_add.cpp b/backends/cadence/fusion_g3/operators/tests/test_op_add.cpp
index cbc419d47e1..bba778035b6 100644
--- a/backends/cadence/fusion_g3/operators/tests/test_op_add.cpp
+++ b/backends/cadence/fusion_g3/operators/tests/test_op_add.cpp
@@ -8,6 +8,8 @@
 
 #include <gtest/gtest.h>
 #include <stdio.h>
+#include <sys/times.h>
+#include <xtensa/sim.h>
 
 #include <executorch/backends/cadence/fusion_g3/operators/operators.h>
 #include <executorch/kernels/test/TestUtil.h>