diff --git a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
index 833606fb651..daffecda1bf 100644
--- a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
+++ b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
@@ -31,25 +31,24 @@ void dequantize_per_tensor_out(
 
   if (input.scalar_type() == ScalarType::Byte) {
     const uint8_t* input_data = input.const_data_ptr<uint8_t>();
-    impl::vision::native::kernels::dequantize<uint8_t>(
+    kernels::dequantize<uint8_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Char) {
     const int8_t* input_data = input.const_data_ptr<int8_t>();
-    impl::vision::native::kernels::dequantize<int8_t>(
-        out_data, input_data, scale, zero_point, numel);
+    kernels::dequantize<int8_t>(out_data, input_data, scale, zero_point, numel);
   } else if (
       input.scalar_type() == ScalarType::Bits16 ||
       input.scalar_type() == ScalarType::UInt16) {
     const uint16_t* input_data = input.const_data_ptr<uint16_t>();
-    impl::vision::native::kernels::dequantize<uint16_t>(
+    kernels::dequantize<uint16_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Short) {
     const int16_t* input_data = input.const_data_ptr<int16_t>();
-    impl::vision::native::kernels::dequantize<int16_t>(
+    kernels::dequantize<int16_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Int) {
     const int32_t* input_data = input.const_data_ptr<int32_t>();
-    impl::vision::native::kernels::dequantize<int32_t>(
+    kernels::dequantize<int32_t>(
         out_data, input_data, scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(
diff --git a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
index 8d209af24b1..cd72d2de2b5 100644
--- a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
+++ b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
@@ -33,25 +33,25 @@ void quantize_per_tensor_out(
 
   if (out.scalar_type() == ScalarType::Byte) {
     uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
-    impl::vision::native::kernels::quantize<uint8_t>(
+    kernels::quantize<uint8_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (out.scalar_type() == ScalarType::Char) {
     int8_t* out_data = out.mutable_data_ptr<int8_t>();
-    impl::vision::native::kernels::quantize<int8_t>(
+    kernels::quantize<int8_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (
       out.scalar_type() == ScalarType::Bits16 ||
       out.scalar_type() == ScalarType::UInt16) {
     uint16_t* out_data = out.mutable_data_ptr<uint16_t>();
-    impl::vision::native::kernels::quantize<uint16_t>(
+    kernels::quantize<uint16_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (out.scalar_type() == ScalarType::Short) {
     int16_t* out_data = out.mutable_data_ptr<int16_t>();
-    impl::vision::native::kernels::quantize<int16_t>(
+    kernels::quantize<int16_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (out.scalar_type() == ScalarType::Int) {
     int32_t* out_data = out.mutable_data_ptr<int32_t>();
-    impl::vision::native::kernels::quantize<int32_t>(
+    kernels::quantize<int32_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(
diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
index 6ffb36aa836..1e1e6c8cdc7 100644
--- a/backends/cadence/vision/operators/op_quantized_conv_out.cpp
+++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
@@ -141,8 +141,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
             if (quantized) {
               float val = bias_scale * acc;
               out_plane[_oh * ow + _ow] =
-                  ::impl::vision::native::kernels::quantize<OT>(
-                      val, inv_out_scale, out_zero_point);
+                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
             } else {
               out_plane[_oh * ow + _ow] = acc;
             }
@@ -267,8 +266,8 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic(
             }
             if (quantized) {
               float val = bias_scale * acc;
-              out_line[_oc] = ::impl::vision::native::kernels::quantize<OT>(
-                  val, inv_out_scale, out_zero_point);
+              out_line[_oc] =
+                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
             } else {
               out_line[_oc] = acc;
             }
@@ -530,6 +529,80 @@ void quantized_conv_per_tensor_out(
   }
 }
 
+void quantized_conv2d_nchw_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    Tensor& out) {
+  quantized_conv_per_tensor_out(
+      ctx,
+      input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      groups,
+      in_zero_point,
+      weight_zero_point,
+      bias_scale,
+      output_scale,
+      output_zero_point,
+      out_multiplier,
+      out_shift,
+      false, // channel_last = false for NCHW
+      out);
+}
+
+void quantized_conv2d_nhwc_per_tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    Tensor& out) {
+  quantized_conv_per_tensor_out(
+      ctx,
+      input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      groups,
+      in_zero_point,
+      weight_zero_point,
+      bias_scale,
+      output_scale,
+      output_zero_point,
+      out_multiplier,
+      out_shift,
+      true, // channel_last = true for NHWC
+      out);
+}
+
 } // namespace native
 } // namespace vision
 } // namespace impl
diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp
index e2963bdcffe..58ca33c6a0b 100644
--- a/backends/cadence/vision/operators/op_softmax.cpp
+++ b/backends/cadence/vision/operators/op_softmax.cpp
@@ -6,13 +6,13 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <api.h>
 #include <executorch/backends/cadence/vision/kernels/kernels.h>
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include <idma_init.h>
+#include <include/api.h>
+#include <include_private/idma_init.h>
 #include <stdio.h>
 
 using executorch::aten::ScalarType;
diff --git a/backends/cadence/vision/operators/quantized_ops.h b/backends/cadence/vision/operators/quantized_ops.h
index b42e45b0b3d..a7251724c53 100644
--- a/backends/cadence/vision/operators/quantized_ops.h
+++ b/backends/cadence/vision/operators/quantized_ops.h
@@ -49,7 +49,7 @@ inline __attribute__((always_inline)) void quantized_linear_per_tensor_(
             (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point;
         sum += x * w;
       }
-      out_data[i * out_dim + j] = ::impl::vision::native::kernels::quantize<T>(
+      out_data[i * out_dim + j] = impl::vision::kernels::quantize<T>(
           sum, requant_scale, out_zero_point);
     }
   }
@@ -121,8 +121,8 @@ inline __attribute__((always_inline)) void quantized_linear_per_channel_(
       // Compute the out_scale from out_multiplier and out_shift
       const float out_scale =
           -out_multiplier_data[j] * 1.0 / (1 << 31) * pow(2, out_shift_data[j]);
-      out_data[i * out_dim + j] = ::impl::vision::native::kernels::quantize<T>(
-          sum, out_scale, out_zero_point);
+      out_data[i * out_dim + j] =
+          impl::vision::kernels::quantize<T>(sum, out_scale, out_zero_point);
     }
   }
 }
diff --git a/backends/cadence/vision/operators/targets.bzl b/backends/cadence/vision/operators/targets.bzl
index b12118a9c47..2dd47e12bd2 100644
--- a/backends/cadence/vision/operators/targets.bzl
+++ b/backends/cadence/vision/operators/targets.bzl
@@ -21,6 +21,25 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
     if deps == None:
         deps = []
 
+    # Determine which headers to export based on operator name
+    exported_headers = ["operators.h"]
+    
+    # Add quantized_ops.h header for quantized operators
+    quantized_ops = [
+        "quantized_fully_connected_out",
+        "quantized_matmul_out", 
+        "quantized_layer_norm",
+        "quantized_relu_out",
+        "quantized_conv_out",
+        "quantized_linear_out",
+        "quantize_per_tensor",
+        "dequantize_per_tensor",
+        "requantize_out"
+    ]
+    
+    if name in quantized_ops:
+        exported_headers.append("quantized_ops.h")
+
     runtime.cxx_library(
         name = op_name,
         srcs = [op_name + ".cpp"],
@@ -31,7 +50,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         ],
         compatible_with = ["ovr_config//cpu:xtensa"],
         deps = deps + common_deps,
-        exported_headers = ["operators.h"],
+        exported_headers = exported_headers,
     )
 
 OPERATORS = [
diff --git a/backends/cadence/vision/third-party/include_private/idma_init.h b/backends/cadence/vision/third-party/include_private/idma_init.h
index ee0666842fd..841a39cf891 100644
--- a/backends/cadence/vision/third-party/include_private/idma_init.h
+++ b/backends/cadence/vision/third-party/include_private/idma_init.h
@@ -1,31 +1,36 @@
 #ifndef __IDMA__INIT_H__
 #define __IDMA__INIT_H__
 
-#include "dtypes.h"
+#include "../include/dtypes.h"
 #include "common.h"
 
-#define IDMA_BUFF_SIZE 16384 // 16 kb DRAM storage. Assume 4 buffers (2 input and 2 output)
+#define IDMA_BUFF_SIZE \
+  16384 // 16 kb DRAM storage. Assume 4 buffers (2 input and 2 output)
 
 #ifndef PLACE_IN_DRAM0
-	#define PLACE_IN_DRAM0 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram0.data")))
+#define PLACE_IN_DRAM0 \
+  __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram0.data")))
 #endif
 
 #ifndef PLACE_IN_DRAM1
-	#define PLACE_IN_DRAM1 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram1.data")))
+#define PLACE_IN_DRAM1 \
+  __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram1.data")))
 #endif
 
 float32_t data_dram0[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM0;
 float32_t data_dram1[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM1;
 
-float32_t *inpData[2] = {&data_dram0[0], &data_dram1[0]};
-float32_t *outData[2] = {&data_dram0[IDMA_BUFF_SIZE / 4], &data_dram1[IDMA_BUFF_SIZE / 4]};
+float32_t* inpData[2] = {&data_dram0[0], &data_dram1[0]};
+float32_t* outData[2] = {
+    &data_dram0[IDMA_BUFF_SIZE / 4],
+    &data_dram1[IDMA_BUFF_SIZE / 4]};
 
 IDMA_BUFFER_DEFINE(buffer_idma_ch0, 1, IDMA_2D_DESC);
 IDMA_BUFFER_DEFINE(buffer_idma_ch1, 1, IDMA_2D_DESC);
 
-idma_buffer_t * descbuf[] = {
-  buffer_idma_ch0,
-  buffer_idma_ch1,
+idma_buffer_t* descbuf[] = {
+    buffer_idma_ch0,
+    buffer_idma_ch1,
 };
 
-#endif // __IDMA__INIT_H__
\ No newline at end of file
+#endif // __IDMA__INIT_H__
diff --git a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
index 413b6f10567..27487c75d6c 100644
--- a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
+++ b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c
@@ -63,33 +63,33 @@ y[N]   result, Q7.8 or floating point
 x,y    Must not overlap
 -------------------------------------------------------------------------*/
 
-#define IVP_ADDSN_2X32(b_, c_)                                                 \
-  ({                                                                           \
-    xb_vecN_2x32v a_;                                                          \
-    xb_vecN_2x64w tmp_a_;                                                      \
-    tmp_a_ = IVP_MULN_2X32(b_, 1);                                             \
-    IVP_MULAN_2X32(tmp_a_, c_, 1);                                             \
-    a_ = IVP_PACKVRN_2X64W(tmp_a_, 0);                                         \
-    a_;                                                                        \
+#define IVP_ADDSN_2X32(b_, c_)         \
+  ({                                   \
+    xb_vecN_2x32v a_;                  \
+    xb_vecN_2x64w tmp_a_;              \
+    tmp_a_ = IVP_MULN_2X32(b_, 1);     \
+    IVP_MULAN_2X32(tmp_a_, c_, 1);     \
+    a_ = IVP_PACKVRN_2X64W(tmp_a_, 0); \
+    a_;                                \
   })
 
 #if !HAVE_VFPU
-DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t *x, int N))
+DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t* x, int N))
 #else
-void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
+void vsoftmaxf(float32_t* y, const float32_t* x, int N) {
 #if !defined(IVP_MULN_2X32)
 #else
-  const int *pTbl = (const int *)expftbl_Q30;
+  const int* pTbl = (const int*)expftbl_Q30;
 #endif
-  const xb_vecN_2xf32 *restrict pX;
-  xb_vecN_2xf32 *restrict pY;
+  const xb_vecN_2xf32* restrict pX;
+  xb_vecN_2xf32* restrict pY;
   xb_vecN_2xf32 norm, ysum, xmax;
   int n;
   valign al_X, al_R, al_Y;
   if (N < 0)
     return;
   xmax = minusInff.f;
-  pX = (const xb_vecN_2xf32 *)x;
+  pX = (const xb_vecN_2xf32*)x;
   al_X = IVP_LAN_2XF32_PP(pX);
   al_Y = IVP_ZALIGN();
   for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) {
@@ -99,17 +99,17 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
   }
   if (N & (IVP_SIMD_WIDTH / 2 - 1)) {
     xb_vecN_2xf32 x;
-    IVP_LAVN_2XF32_XP(x, al_X, pX,
-                      sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
-    IVP_MAXNUMN_2XF32T(xmax, xmax, x,
-                       IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
+    IVP_LAVN_2XF32_XP(
+        x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+    IVP_MAXNUMN_2XF32T(
+        xmax, xmax, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
   }
 
   xmax = IVP_REPN_2XF32(IVP_RMAXNUMN_2XF32(xmax), 0);
   __Pragma("no_reorder");
   ysum = 0.f;
-  pX = (const xb_vecN_2xf32 *)x;
-  pY = (xb_vecN_2xf32 *)y;
+  pX = (const xb_vecN_2xf32*)x;
+  pY = (xb_vecN_2xf32*)y;
   al_X = IVP_LAN_2XF32_PP(pX);
   {
     vboolN_2 bnan;
@@ -163,8 +163,8 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
     }
     if (N & (IVP_SIMD_WIDTH / 2 - 1)) {
       xb_vecN_2xf32 x;
-      IVP_LAVN_2XF32_XP(x, al_X, pX,
-                        sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+      IVP_LAVN_2XF32_XP(
+          x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
       x = IVP_SUBN_2XF32(x, xmax);
       bnan |= IVP_UNN_2XF32(x, x);
       {
@@ -206,18 +206,18 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
         zout = IVP_MULN_2XF32(gf, IVP_MOVN_2XF32_FROMN_2X32(exp));
         x = zout;
       }
-      IVP_ADDN_2XF32T(ysum, ysum, x,
-                      IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
-      IVP_SAVN_2XF32_XP(x, al_Y, pY,
-                        sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+      IVP_ADDN_2XF32T(
+          ysum, ysum, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1))));
+      IVP_SAVN_2XF32_XP(
+          x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
     }
     IVP_SAPOSN_2XF32_FP(al_Y, pY);
     ysum = IVP_MOVN_2XF32T(qNaNf.f, ysum, bnan);
   }
   norm = XT_RECIP_S(IVP_RADDN_2XF32(ysum));
   __Pragma("no_reorder");
-  pX = (const xb_vecN_2xf32 *)y;
-  pY = (xb_vecN_2xf32 *)y;
+  pX = (const xb_vecN_2xf32*)y;
+  pY = (xb_vecN_2xf32*)y;
 
   al_R = IVP_LAN_2XF32_PP(pX);
 
@@ -229,11 +229,11 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) {
   }
   if (N & (IVP_SIMD_WIDTH / 2 - 1)) {
     xb_vecN_2xf32 x;
-    IVP_LAVN_2XF32_XP(x, al_R, pX,
-                      sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+    IVP_LAVN_2XF32_XP(
+        x, al_R, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
     x = IVP_MULN_2XF32(x, norm);
-    IVP_SAVN_2XF32_XP(x, al_Y, pY,
-                      sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
+    IVP_SAVN_2XF32_XP(
+        x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1)));
   }
   IVP_SAPOSN_2XF32_FP(al_Y, pY);
 
diff --git a/backends/cadence/vision/third-party/library/tables/expf_tbl.c b/backends/cadence/vision/third-party/library/tables/expf_tbl.c
index 0ed5dd22257..f1c6f3d44ae 100644
--- a/backends/cadence/vision/third-party/library/tables/expf_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/expf_tbl.c
@@ -42,22 +42,28 @@
    p(order)=p(order)-(sum(p)-2);
 */
 const int32_t ALIGN_2SIMD expftbl_Q30[8] = {
-    234841,    1329551,   10400465,   59570027,
-    257946177, 744260763, 1073741824, 0 /* Padding to allow for vector loads */
+    234841,
+    1329551,
+    10400465,
+    59570027,
+    257946177,
+    744260763,
+    1073741824,
+    0 /* Padding to allow for vector loads */
 };
 
 const union ufloat32uint32 ALIGN_2SIMD
     expfminmax[2] = /* minimum and maximum arguments of expf() input */
     {
         {0xc2ce8ed0}, /*-1.0327893066e+002f */
-        {0x42b17218}  /* 8.8722839355e+001f */
+        {0x42b17218} /* 8.8722839355e+001f */
 };
 
 const int32_t invln2_Q30 = 1549082005L; /* 1/ln(2), Q30 */
 
 const union ufloat32uint32 ALIGN_2SIMD log2_e[2] = {
     {0x3fb8aa3b}, /* 1.4426950216      */
-    {0x32a57060}  /* 1.9259629891e-008 */
+    {0x32a57060} /* 1.9259629891e-008 */
 };
 
 /*
@@ -70,5 +76,10 @@ p(order)=p(order)-(sum(p)-2);
 num2hex(single(p));
 */
 const union ufloat32uint32 ALIGN_2SIMD expftblf[] = {
-    {0x39655635}, {0x3aa24c7a}, {0x3c1eb2d1}, {0x3d633ddb},
-    {0x3e75ff24}, {0x3f317212}, {0x3f800000}};
+    {0x39655635},
+    {0x3aa24c7a},
+    {0x3c1eb2d1},
+    {0x3d633ddb},
+    {0x3e75ff24},
+    {0x3f317212},
+    {0x3f800000}};
diff --git a/backends/cadence/vision/third-party/library/tables/inff_tbl.c b/backends/cadence/vision/third-party/library/tables/inff_tbl.c
index 9b2bf62e6bf..8464ee9f549 100644
--- a/backends/cadence/vision/third-party/library/tables/inff_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/inff_tbl.c
@@ -31,7 +31,7 @@
 #include "dtypes.h"
 
 const union ufloat32uint32 minusInff = {0xff800000}; /* -Inf */
-const union ufloat32uint32 plusInff = {0x7f800000};  /* +Inf */
+const union ufloat32uint32 plusInff = {0x7f800000}; /* +Inf */
 const union ufloat32uint32 realmaxf = {
     0x7f7fffff}; /* maximum floating point number */
 const union ufloat32uint32 realminf = {
diff --git a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
index 27c5f437b9a..f165234fce4 100644
--- a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
+++ b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c
@@ -27,9 +27,9 @@
 */
 
 /* Portable data types. */
-#include "dtypes.h"
 /* NaN values for single precision routines. */
 #include "nanf_tbl.h"
+#include "dtypes.h"
 
 const union ufloat32uint32 sNaNf = {0x7f800001}; /* Signalling NaN          */
 const union ufloat32uint32 qNaNf = {0x7fc00000}; /* Quiet NaN               */
diff --git a/backends/cadence/vision/third-party/targets.bzl b/backends/cadence/vision/third-party/targets.bzl
index 6bbb7da8d49..26a097010d5 100644
--- a/backends/cadence/vision/third-party/targets.bzl
+++ b/backends/cadence/vision/third-party/targets.bzl
@@ -16,7 +16,7 @@ def define_common_targets():
             "include/*.h", 
             "include_private/*.h"
         ]),
-        header_namespace = "backends/cadence/vision/third-party",
+        header_namespace = "",
         visibility = [
             "//executorch/backends/cadence/...",
             "@EXECUTORCH_CLIENTS",
@@ -28,7 +28,11 @@ def define_common_targets():
         }),
         compiler_flags = select({
             "DEFAULT": ["-UCOMPILER_XTENSA"],  # Ensure COMPILER_XTENSA is not defined for non-Xtensa builds
-            "ovr_config//cpu:xtensa": ["-DCOMPILER_XTENSA"],
+            "ovr_config//cpu:xtensa": [
+                "-DCOMPILER_XTENSA",
+                "-Ixplat/executorch/backends/cadence/vision/third-party/include",
+                "-Ixplat/executorch/backends/cadence/vision/third-party/include_private",
+            ],
         }),
         define_static_target = True,
     )