diff --git a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp index 833606fb651..daffecda1bf 100644 --- a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp +++ b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp @@ -31,25 +31,24 @@ void dequantize_per_tensor_out( if (input.scalar_type() == ScalarType::Byte) { const uint8_t* input_data = input.const_data_ptr(); - impl::vision::native::kernels::dequantize( + kernels::dequantize( out_data, input_data, scale, zero_point, numel); } else if (input.scalar_type() == ScalarType::Char) { const int8_t* input_data = input.const_data_ptr(); - impl::vision::native::kernels::dequantize( - out_data, input_data, scale, zero_point, numel); + kernels::dequantize(out_data, input_data, scale, zero_point, numel); } else if ( input.scalar_type() == ScalarType::Bits16 || input.scalar_type() == ScalarType::UInt16) { const uint16_t* input_data = input.const_data_ptr(); - impl::vision::native::kernels::dequantize( + kernels::dequantize( out_data, input_data, scale, zero_point, numel); } else if (input.scalar_type() == ScalarType::Short) { const int16_t* input_data = input.const_data_ptr(); - impl::vision::native::kernels::dequantize( + kernels::dequantize( out_data, input_data, scale, zero_point, numel); } else if (input.scalar_type() == ScalarType::Int) { const int32_t* input_data = input.const_data_ptr(); - impl::vision::native::kernels::dequantize( + kernels::dequantize( out_data, input_data, scale, zero_point, numel); } else { ET_CHECK_MSG( diff --git a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp index 8d209af24b1..cd72d2de2b5 100644 --- a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp +++ b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp @@ -33,25 +33,25 @@ void quantize_per_tensor_out( if (out.scalar_type() == ScalarType::Byte) { uint8_t* out_data = out.mutable_data_ptr(); - impl::vision::native::kernels::quantize( + kernels::quantize( out_data, input_data, 1. / scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Char) { int8_t* out_data = out.mutable_data_ptr(); - impl::vision::native::kernels::quantize( + kernels::quantize( out_data, input_data, 1. / scale, zero_point, numel); } else if ( out.scalar_type() == ScalarType::Bits16 || out.scalar_type() == ScalarType::UInt16) { uint16_t* out_data = out.mutable_data_ptr(); - impl::vision::native::kernels::quantize( + kernels::quantize( out_data, input_data, 1. / scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Short) { int16_t* out_data = out.mutable_data_ptr(); - impl::vision::native::kernels::quantize( + kernels::quantize( out_data, input_data, 1. / scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Int) { int32_t* out_data = out.mutable_data_ptr(); - impl::vision::native::kernels::quantize( + kernels::quantize( out_data, input_data, 1. / scale, zero_point, numel); } else { ET_CHECK_MSG( diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp index 6ffb36aa836..1e1e6c8cdc7 100644 --- a/backends/cadence/vision/operators/op_quantized_conv_out.cpp +++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp @@ -141,8 +141,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( if (quantized) { float val = bias_scale * acc; out_plane[_oh * ow + _ow] = - ::impl::vision::native::kernels::quantize( - val, inv_out_scale, out_zero_point); + kernels::quantize(val, inv_out_scale, out_zero_point); } else { out_plane[_oh * ow + _ow] = acc; } @@ -267,8 +266,8 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic( } if (quantized) { float val = bias_scale * acc; - out_line[_oc] = ::impl::vision::native::kernels::quantize( - val, inv_out_scale, out_zero_point); + out_line[_oc] = + kernels::quantize(val, inv_out_scale, out_zero_point); } else { out_line[_oc] = acc; } @@ -530,6 +529,80 @@ void quantized_conv_per_tensor_out( } } +void quantized_conv2d_nchw_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out) { + quantized_conv_per_tensor_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + false, // channel_last = false for NCHW + out); +} + +void quantized_conv2d_nhwc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out) { + quantized_conv_per_tensor_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + true, // channel_last = true for NHWC + out); +} + } // namespace native } // namespace vision } // namespace impl diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp index e2963bdcffe..58ca33c6a0b 100644 --- a/backends/cadence/vision/operators/op_softmax.cpp +++ b/backends/cadence/vision/operators/op_softmax.cpp @@ -6,13 +6,13 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include #include #include #include -#include +#include +#include #include using executorch::aten::ScalarType; diff --git a/backends/cadence/vision/operators/quantized_ops.h b/backends/cadence/vision/operators/quantized_ops.h index b42e45b0b3d..a7251724c53 100644 --- a/backends/cadence/vision/operators/quantized_ops.h +++ b/backends/cadence/vision/operators/quantized_ops.h @@ -49,7 +49,7 @@ inline __attribute__((always_inline)) void quantized_linear_per_tensor_( (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point; sum += x * w; } - out_data[i * out_dim + j] = ::impl::vision::native::kernels::quantize( + out_data[i * out_dim + j] = impl::vision::kernels::quantize( sum, requant_scale, out_zero_point); } } @@ -121,8 +121,8 @@ inline __attribute__((always_inline)) void quantized_linear_per_channel_( // Compute the out_scale from out_multiplier and out_shift const float out_scale = -out_multiplier_data[j] * 1.0 / (1 << 31) * pow(2, out_shift_data[j]); - out_data[i * out_dim + j] = ::impl::vision::native::kernels::quantize( - sum, out_scale, out_zero_point); + out_data[i * out_dim + j] = + impl::vision::kernels::quantize(sum, out_scale, out_zero_point); } } } diff --git a/backends/cadence/vision/operators/targets.bzl b/backends/cadence/vision/operators/targets.bzl index b12118a9c47..2dd47e12bd2 100644 --- a/backends/cadence/vision/operators/targets.bzl +++ b/backends/cadence/vision/operators/targets.bzl @@ -21,6 +21,25 @@ def define_operator(name: str, deps: list[str] | None = None) -> None: if deps == None: deps = [] + # Determine which headers to export based on operator name + exported_headers = ["operators.h"] + + # Add quantized_ops.h header for quantized operators + quantized_ops = [ + "quantized_fully_connected_out", + "quantized_matmul_out", + "quantized_layer_norm", + "quantized_relu_out", + "quantized_conv_out", + "quantized_linear_out", + "quantize_per_tensor", + "dequantize_per_tensor", + "requantize_out" + ] + + if name in quantized_ops: + exported_headers.append("quantized_ops.h") + runtime.cxx_library( name = op_name, srcs = [op_name + ".cpp"], @@ -31,7 +50,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None: ], compatible_with = ["ovr_config//cpu:xtensa"], deps = deps + common_deps, - exported_headers = ["operators.h"], + exported_headers = exported_headers, ) OPERATORS = [ diff --git a/backends/cadence/vision/third-party/include_private/idma_init.h b/backends/cadence/vision/third-party/include_private/idma_init.h index ee0666842fd..841a39cf891 100644 --- a/backends/cadence/vision/third-party/include_private/idma_init.h +++ b/backends/cadence/vision/third-party/include_private/idma_init.h @@ -1,31 +1,36 @@ #ifndef __IDMA__INIT_H__ #define __IDMA__INIT_H__ -#include "dtypes.h" +#include "../include/dtypes.h" #include "common.h" -#define IDMA_BUFF_SIZE 16384 // 16 kb DRAM storage. Assume 4 buffers (2 input and 2 output) +#define IDMA_BUFF_SIZE \ + 16384 // 16 kb DRAM storage. Assume 4 buffers (2 input and 2 output) #ifndef PLACE_IN_DRAM0 - #define PLACE_IN_DRAM0 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram0.data"))) +#define PLACE_IN_DRAM0 \ + __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram0.data"))) #endif #ifndef PLACE_IN_DRAM1 - #define PLACE_IN_DRAM1 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram1.data"))) +#define PLACE_IN_DRAM1 \ + __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram1.data"))) #endif float32_t data_dram0[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM0; float32_t data_dram1[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM1; -float32_t *inpData[2] = {&data_dram0[0], &data_dram1[0]}; -float32_t *outData[2] = {&data_dram0[IDMA_BUFF_SIZE / 4], &data_dram1[IDMA_BUFF_SIZE / 4]}; +float32_t* inpData[2] = {&data_dram0[0], &data_dram1[0]}; +float32_t* outData[2] = { + &data_dram0[IDMA_BUFF_SIZE / 4], + &data_dram1[IDMA_BUFF_SIZE / 4]}; IDMA_BUFFER_DEFINE(buffer_idma_ch0, 1, IDMA_2D_DESC); IDMA_BUFFER_DEFINE(buffer_idma_ch1, 1, IDMA_2D_DESC); -idma_buffer_t * descbuf[] = { - buffer_idma_ch0, - buffer_idma_ch1, +idma_buffer_t* descbuf[] = { + buffer_idma_ch0, + buffer_idma_ch1, }; -#endif // __IDMA__INIT_H__ \ No newline at end of file +#endif // __IDMA__INIT_H__ diff --git a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c index 413b6f10567..27487c75d6c 100644 --- a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c +++ b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c @@ -63,33 +63,33 @@ y[N] result, Q7.8 or floating point x,y Must not overlap -------------------------------------------------------------------------*/ -#define IVP_ADDSN_2X32(b_, c_) \ - ({ \ - xb_vecN_2x32v a_; \ - xb_vecN_2x64w tmp_a_; \ - tmp_a_ = IVP_MULN_2X32(b_, 1); \ - IVP_MULAN_2X32(tmp_a_, c_, 1); \ - a_ = IVP_PACKVRN_2X64W(tmp_a_, 0); \ - a_; \ +#define IVP_ADDSN_2X32(b_, c_) \ + ({ \ + xb_vecN_2x32v a_; \ + xb_vecN_2x64w tmp_a_; \ + tmp_a_ = IVP_MULN_2X32(b_, 1); \ + IVP_MULAN_2X32(tmp_a_, c_, 1); \ + a_ = IVP_PACKVRN_2X64W(tmp_a_, 0); \ + a_; \ }) #if !HAVE_VFPU -DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t *x, int N)) +DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t* x, int N)) #else -void vsoftmaxf(float32_t *y, const float32_t *x, int N) { +void vsoftmaxf(float32_t* y, const float32_t* x, int N) { #if !defined(IVP_MULN_2X32) #else - const int *pTbl = (const int *)expftbl_Q30; + const int* pTbl = (const int*)expftbl_Q30; #endif - const xb_vecN_2xf32 *restrict pX; - xb_vecN_2xf32 *restrict pY; + const xb_vecN_2xf32* restrict pX; + xb_vecN_2xf32* restrict pY; xb_vecN_2xf32 norm, ysum, xmax; int n; valign al_X, al_R, al_Y; if (N < 0) return; xmax = minusInff.f; - pX = (const xb_vecN_2xf32 *)x; + pX = (const xb_vecN_2xf32*)x; al_X = IVP_LAN_2XF32_PP(pX); al_Y = IVP_ZALIGN(); for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) { @@ -99,17 +99,17 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) { } if (N & (IVP_SIMD_WIDTH / 2 - 1)) { xb_vecN_2xf32 x; - IVP_LAVN_2XF32_XP(x, al_X, pX, - sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); - IVP_MAXNUMN_2XF32T(xmax, xmax, x, - IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); + IVP_LAVN_2XF32_XP( + x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_MAXNUMN_2XF32T( + xmax, xmax, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); } xmax = IVP_REPN_2XF32(IVP_RMAXNUMN_2XF32(xmax), 0); __Pragma("no_reorder"); ysum = 0.f; - pX = (const xb_vecN_2xf32 *)x; - pY = (xb_vecN_2xf32 *)y; + pX = (const xb_vecN_2xf32*)x; + pY = (xb_vecN_2xf32*)y; al_X = IVP_LAN_2XF32_PP(pX); { vboolN_2 bnan; @@ -163,8 +163,8 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) { } if (N & (IVP_SIMD_WIDTH / 2 - 1)) { xb_vecN_2xf32 x; - IVP_LAVN_2XF32_XP(x, al_X, pX, - sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_LAVN_2XF32_XP( + x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); x = IVP_SUBN_2XF32(x, xmax); bnan |= IVP_UNN_2XF32(x, x); { @@ -206,18 +206,18 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) { zout = IVP_MULN_2XF32(gf, IVP_MOVN_2XF32_FROMN_2X32(exp)); x = zout; } - IVP_ADDN_2XF32T(ysum, ysum, x, - IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); - IVP_SAVN_2XF32_XP(x, al_Y, pY, - sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_ADDN_2XF32T( + ysum, ysum, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); + IVP_SAVN_2XF32_XP( + x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); } IVP_SAPOSN_2XF32_FP(al_Y, pY); ysum = IVP_MOVN_2XF32T(qNaNf.f, ysum, bnan); } norm = XT_RECIP_S(IVP_RADDN_2XF32(ysum)); __Pragma("no_reorder"); - pX = (const xb_vecN_2xf32 *)y; - pY = (xb_vecN_2xf32 *)y; + pX = (const xb_vecN_2xf32*)y; + pY = (xb_vecN_2xf32*)y; al_R = IVP_LAN_2XF32_PP(pX); @@ -229,11 +229,11 @@ void vsoftmaxf(float32_t *y, const float32_t *x, int N) { } if (N & (IVP_SIMD_WIDTH / 2 - 1)) { xb_vecN_2xf32 x; - IVP_LAVN_2XF32_XP(x, al_R, pX, - sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_LAVN_2XF32_XP( + x, al_R, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); x = IVP_MULN_2XF32(x, norm); - IVP_SAVN_2XF32_XP(x, al_Y, pY, - sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_SAVN_2XF32_XP( + x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); } IVP_SAPOSN_2XF32_FP(al_Y, pY); diff --git a/backends/cadence/vision/third-party/library/tables/expf_tbl.c b/backends/cadence/vision/third-party/library/tables/expf_tbl.c index 0ed5dd22257..f1c6f3d44ae 100644 --- a/backends/cadence/vision/third-party/library/tables/expf_tbl.c +++ b/backends/cadence/vision/third-party/library/tables/expf_tbl.c @@ -42,22 +42,28 @@ p(order)=p(order)-(sum(p)-2); */ const int32_t ALIGN_2SIMD expftbl_Q30[8] = { - 234841, 1329551, 10400465, 59570027, - 257946177, 744260763, 1073741824, 0 /* Padding to allow for vector loads */ + 234841, + 1329551, + 10400465, + 59570027, + 257946177, + 744260763, + 1073741824, + 0 /* Padding to allow for vector loads */ }; const union ufloat32uint32 ALIGN_2SIMD expfminmax[2] = /* minimum and maximum arguments of expf() input */ { {0xc2ce8ed0}, /*-1.0327893066e+002f */ - {0x42b17218} /* 8.8722839355e+001f */ + {0x42b17218} /* 8.8722839355e+001f */ }; const int32_t invln2_Q30 = 1549082005L; /* 1/ln(2), Q30 */ const union ufloat32uint32 ALIGN_2SIMD log2_e[2] = { {0x3fb8aa3b}, /* 1.4426950216 */ - {0x32a57060} /* 1.9259629891e-008 */ + {0x32a57060} /* 1.9259629891e-008 */ }; /* @@ -70,5 +76,10 @@ p(order)=p(order)-(sum(p)-2); num2hex(single(p)); */ const union ufloat32uint32 ALIGN_2SIMD expftblf[] = { - {0x39655635}, {0x3aa24c7a}, {0x3c1eb2d1}, {0x3d633ddb}, - {0x3e75ff24}, {0x3f317212}, {0x3f800000}}; + {0x39655635}, + {0x3aa24c7a}, + {0x3c1eb2d1}, + {0x3d633ddb}, + {0x3e75ff24}, + {0x3f317212}, + {0x3f800000}}; diff --git a/backends/cadence/vision/third-party/library/tables/inff_tbl.c b/backends/cadence/vision/third-party/library/tables/inff_tbl.c index 9b2bf62e6bf..8464ee9f549 100644 --- a/backends/cadence/vision/third-party/library/tables/inff_tbl.c +++ b/backends/cadence/vision/third-party/library/tables/inff_tbl.c @@ -31,7 +31,7 @@ #include "dtypes.h" const union ufloat32uint32 minusInff = {0xff800000}; /* -Inf */ -const union ufloat32uint32 plusInff = {0x7f800000}; /* +Inf */ +const union ufloat32uint32 plusInff = {0x7f800000}; /* +Inf */ const union ufloat32uint32 realmaxf = { 0x7f7fffff}; /* maximum floating point number */ const union ufloat32uint32 realminf = { diff --git a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c index 27c5f437b9a..f165234fce4 100644 --- a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c +++ b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c @@ -27,9 +27,9 @@ */ /* Portable data types. */ -#include "dtypes.h" /* NaN values for single precision routines. */ #include "nanf_tbl.h" +#include "dtypes.h" const union ufloat32uint32 sNaNf = {0x7f800001}; /* Signalling NaN */ const union ufloat32uint32 qNaNf = {0x7fc00000}; /* Quiet NaN */ diff --git a/backends/cadence/vision/third-party/targets.bzl b/backends/cadence/vision/third-party/targets.bzl index 6bbb7da8d49..26a097010d5 100644 --- a/backends/cadence/vision/third-party/targets.bzl +++ b/backends/cadence/vision/third-party/targets.bzl @@ -16,7 +16,7 @@ def define_common_targets(): "include/*.h", "include_private/*.h" ]), - header_namespace = "backends/cadence/vision/third-party", + header_namespace = "", visibility = [ "//executorch/backends/cadence/...", "@EXECUTORCH_CLIENTS", @@ -28,7 +28,11 @@ def define_common_targets(): }), compiler_flags = select({ "DEFAULT": ["-UCOMPILER_XTENSA"], # Ensure COMPILER_XTENSA is not defined for non-Xtensa builds - "ovr_config//cpu:xtensa": ["-DCOMPILER_XTENSA"], + "ovr_config//cpu:xtensa": [ + "-DCOMPILER_XTENSA", + "-Ixplat/executorch/backends/cadence/vision/third-party/include", + "-Ixplat/executorch/backends/cadence/vision/third-party/include_private", + ], }), define_static_target = True, )