From 887579a1ef4c6033eb8f1263fbd3ffc5c353fca4 Mon Sep 17 00:00:00 2001 From: Zonglin Peng Date: Fri, 1 Nov 2024 12:58:41 -0700 Subject: [PATCH 1/2] port add ops, create new buck target, add op_add kernel modification Summary: done the three things as titled Differential Revision: D65300260 --- backends/cadence/hifi/operators/targets.bzl | 121 ++++++++++++++++++ .../cadence/hifi/third-party/nnlib/TARGETS | 5 + .../hifi/third-party/nnlib/targets.bzl | 18 +++ .../nnlib/xa_nn_elm_add_f32_broadcast.c | 2 - .../nnlib/xa_nn_elm_mul_f32_broadcast.c | 9 +- 5 files changed, 148 insertions(+), 7 deletions(-) create mode 100644 backends/cadence/hifi/third-party/nnlib/TARGETS create mode 100644 backends/cadence/hifi/third-party/nnlib/targets.bzl diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index 96f063728c8..c474018fd10 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -31,3 +31,124 @@ def define_common_targets(): "@EXECUTORCH_CLIENTS", ], ) + + runtime.cxx_library( + name = "op_add", + srcs = glob([ + "op_add.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + + runtime.cxx_library( + name = "op_mul", + srcs = glob([ + "op_mul.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "op_sub", + srcs = glob([ + "op_sub.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "op_div", + srcs = glob([ + "op_div.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "op_sigmoid", + srcs = glob([ + "op_sigmoid.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "op_tanh", + srcs = glob([ + "op_tanh.cpp", + ]), + platforms = CXX, + deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/hifi/kernels:kernels", + "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) diff --git a/backends/cadence/hifi/third-party/nnlib/TARGETS b/backends/cadence/hifi/third-party/nnlib/TARGETS new file mode 100644 index 00000000000..67f2bab681a --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/TARGETS @@ -0,0 +1,5 @@ +load("targets.bzl", "define_common_targets") + +oncall("odai_jarvis") + +define_common_targets() diff --git a/backends/cadence/hifi/third-party/nnlib/targets.bzl b/backends/cadence/hifi/third-party/nnlib/targets.bzl new file mode 100644 index 00000000000..615eacaa666 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/targets.bzl @@ -0,0 +1,18 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "nnlib-extensions", + srcs = native.glob(["*.c", "*.cpp"]), + exported_headers = glob(["*.h"]), + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + deps = [ + "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib", + ], + ) diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c index 9eab22b05b7..2a18d57e99f 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c @@ -25,7 +25,6 @@ #include "xa_nnlib_err_chk.h" #include "xa_nnlib_kernels_api.h" - #if HAVE_VFPU static void internal_elm_add_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, const FLOAT32 * __restrict__ p_inp1, @@ -425,4 +424,3 @@ WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, return 0; } - diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c index b9aa102a15f..e11fccbba52 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c @@ -20,11 +20,10 @@ ******************************************************************************/ #include "xa_type_def.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" -#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" -#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" #if HAVE_VFPU static void internal_elm_mul_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, From 4ed5dbcc5b17c0a6b4bd7a225709a0521a23a709 Mon Sep 17 00:00:00 2001 From: Zonglin Peng Date: Fri, 1 Nov 2024 15:00:43 -0700 Subject: [PATCH 2/2] linked sub mul div tanh sigmoid from oss (#6619) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/6619 titled Differential Revision: D65359613 --- backends/cadence/hifi/operators/targets.bzl | 2 - .../nnlib/xa_nn_elm_div_mode_f32_broadcast.c | 56 +++++++++---------- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index c474018fd10..036069bb5e0 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -123,7 +123,6 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:all_deps", "//executorch/kernels/portable/cpu/pattern:all_deps", "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/backends/cadence/hifi/kernels:kernels", "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" ], @@ -143,7 +142,6 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:all_deps", "//executorch/kernels/portable/cpu/pattern:all_deps", "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", "//executorch/backends/cadence/hifi/kernels:kernels", "//executorch/backends/cadence/hifi/third-party/nnlib:nnlib-extensions" ], diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c index 95b449f43f7..17c619d150e 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c @@ -54,7 +54,7 @@ WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, XT_LASX2IP(x1, inp1_a, inp1); XT_LASX2IP(x2, inp2_a, inp2); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, out); } } @@ -66,7 +66,7 @@ WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, XT_LASX2IP(x1, inp1_a, inp1); XT_LASX2IP(x2, inp2_a, inp2); y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, out); } } @@ -80,9 +80,9 @@ WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, XT_LSIP(a2, (xtfloat *)inp2, 0); a = XT_DIV_S(a1, a2); if(mode == 0) - a = FITRUNC_S(a); + a = XT_FITRUNC_S(a); else - a = FIFLOOR_S(a); + a = XT_FIFLOOR_S(a); XT_SSI(a, (xtfloat *)out, 0); } @@ -138,7 +138,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -149,7 +149,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -166,7 +166,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LASX2IP(x1, vinp1, p_a); XT_LASX2IP(x2, vinp2, p_b); y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -177,7 +177,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LASX2IP(x1, vinp1, p_a); XT_LASX2IP(x2, vinp2, p_b); y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -189,9 +189,9 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); c0 = XT_DIV_S(b0, a0); if(mode == 0) - c0 = FITRUNC_S(c0); + c0 = XT_FITRUNC_S(c0); else - c0 = FIFLOOR_S(c0); + c0 = XT_FIFLOOR_S(c0); XT_SSI(c0, (xtfloat *)p_c, 0); } } @@ -213,7 +213,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -224,7 +224,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -241,7 +241,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LASX2IP(x1, vinp1, p_a); XT_LASX2IP(x2, vinp2, p_b); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -252,7 +252,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LASX2IP(x1, vinp1, p_a); XT_LASX2IP(x2, vinp2, p_b); y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -264,9 +264,9 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); c0 = XT_DIV_S(a0, b0); if(mode == 0) - c0 = FITRUNC_S(c0); + c0 = XT_FITRUNC_S(c0); else - c0 = FIFLOOR_S(c0); + c0 = XT_FIFLOOR_S(c0); XT_SSI(c0, (xtfloat *)p_c, 0); } } @@ -302,7 +302,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -312,7 +312,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -328,7 +328,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LASX2IP(x1, inp1_a, p_a); y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -338,7 +338,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LASX2IP(x1, inp1_a, p_a); y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -349,9 +349,9 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); out = XT_DIV_S(x2, a0_7); if(mode == 0) - out = FITRUNC_S(out); + out = XT_FITRUNC_S(out); else - out = FIFLOOR_S(out); + out = XT_FIFLOOR_S(out); XT_SSI(out, (xtfloat *)p_c, 0); } } @@ -366,7 +366,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -376,7 +376,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -392,7 +392,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LASX2IP(x1, inp1_a, p_a); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -402,7 +402,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LASX2IP(x1, inp1_a, p_a); y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -413,9 +413,9 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); out = XT_DIV_S(a0_7, x2); if(mode == 0) - out = FITRUNC_S(out); + out = XT_FITRUNC_S(out); else - out = FIFLOOR_S(out); + out = XT_FIFLOOR_S(out); XT_SSI(out, (xtfloat *)p_c, 0); } }