From 0bf646ee59605f487f6309079e0d8d8234d3cf28 Mon Sep 17 00:00:00 2001 From: "cmadhira@cadence.com" Date: Fri, 8 Nov 2024 22:33:57 +0530 Subject: [PATCH 1/4] Added Fusion G3 NN library with kernels related to add, mul, quantize, dequantize, cat, layer norm, softmax to backends/cadence folder. Added operators to backends/cadence folder --- backends/cadence/CMakeLists.txt | 7 +- backends/cadence/aot/functions_fusion_g3.yaml | 119 +++ .../fusion_g3/operators/CMakeLists.txt | 85 ++ .../cadence/fusion_g3/operators/op_add.cpp | 247 +++++ .../cadence/fusion_g3/operators/op_cat.cpp | 149 +++ .../fusion_g3/operators/op_dequantize.cpp | 807 +++++++++++++++++ .../cadence/fusion_g3/operators/op_mul.cpp | 226 +++++ .../operators/op_native_layer_norm.cpp | 276 ++++++ .../fusion_g3/operators/op_quantize.cpp | 797 +++++++++++++++++ .../fusion_g3/operators/op_softmax.cpp | 117 +++ .../third-party/nnlib/CMakeLists.txt | 19 + .../algo/common/include/xa_api_defs.h | 51 ++ .../common/include/xa_nnlib_common_internal.h | 149 +++ .../common/include/xa_nnlib_definitions.h | 46 + .../algo/common/include/xa_nnlib_err_chk.h | 65 ++ .../algo/common/src/xa_nnlib_common_api.c | 43 + .../algo/kernels/activations/xa_nn_softmax.c | 508 +++++++++++ .../algo/kernels/basic/xa_nn_elm_add_32x32.c | 655 ++++++++++++++ .../algo/kernels/basic/xa_nn_elm_add_f32.c | 618 +++++++++++++ .../basic/xa_nn_elm_dequantize_asym16_f32.c | 172 ++++ .../basic/xa_nn_elm_dequantize_asym16u_f32.c | 170 ++++ .../basic/xa_nn_elm_dequantize_asym4_f32.c | 174 ++++ .../basic/xa_nn_elm_dequantize_asym4u_f32.c | 177 ++++ .../basic/xa_nn_elm_dequantize_asym8_f32.c | 170 ++++ .../basic/xa_nn_elm_dequantize_asym8u_f32.c | 174 ++++ .../basic/xa_nn_elm_dequantize_sym16_f32.c | 148 +++ .../basic/xa_nn_elm_dequantize_sym16u_f32.c | 154 ++++ .../basic/xa_nn_elm_dequantize_sym4_f32.c | 157 ++++ .../basic/xa_nn_elm_dequantize_sym4u_f32.c | 158 ++++ .../basic/xa_nn_elm_dequantize_sym8_f32.c | 153 ++++ .../basic/xa_nn_elm_dequantize_sym8u_f32.c | 157 ++++ .../algo/kernels/basic/xa_nn_elm_mul_32x32.c | 601 +++++++++++++ .../algo/kernels/basic/xa_nn_elm_mul_f32.c | 625 +++++++++++++ .../basic/xa_nn_elm_quantize_f32_asym16.c | 219 +++++ .../basic/xa_nn_elm_quantize_f32_asym16u.c | 221 +++++ .../basic/xa_nn_elm_quantize_f32_asym4.c | 219 +++++ .../basic/xa_nn_elm_quantize_f32_asym4u.c | 220 +++++ .../basic/xa_nn_elm_quantize_f32_asym8.c | 219 +++++ .../basic/xa_nn_elm_quantize_f32_asym8u.c | 220 +++++ .../basic/xa_nn_elm_quantize_f32_sym16.c | 199 +++++ .../basic/xa_nn_elm_quantize_f32_sym16u.c | 201 +++++ .../basic/xa_nn_elm_quantize_f32_sym4.c | 203 +++++ .../basic/xa_nn_elm_quantize_f32_sym4u.c | 201 +++++ .../basic/xa_nn_elm_quantize_f32_sym8.c | 200 +++++ .../basic/xa_nn_elm_quantize_f32_sym8u.c | 200 +++++ .../algo/kernels/norm/xa_nn_layer_norm.c | 845 ++++++++++++++++++ .../xa_nnlib/algo/kernels/reorg/xa_nn_cat.c | 136 +++ .../algo/kernels/tables/include/expf_tbl.h | 44 + .../algo/kernels/tables/src/expf_tbl.c | 50 ++ .../nnlib/nnlib-FuG3/xa_nnlib/build/common.mk | 151 ++++ .../nnlib-FuG3/xa_nnlib/build/detect_core.mk | 36 + .../xa_nnlib/build/ldscript_nnlib.txt | 48 + .../nnlib/nnlib-FuG3/xa_nnlib/build/makefile | 42 + .../xa_nnlib/build/makefile_nn_lib_fusion_g3 | 102 +++ .../xa_nnlib/build/symbols_nnlib.txt | 39 + .../xa_nnlib/doc/FusionG3-NNLib-API.pdf | Bin 0 -> 1034936 bytes .../doc/FusionG3-NNLib-Kernel-Testreport.pdf | Bin 0 -> 734144 bytes .../doc/FusionG3-NNLib-Performance.pdf | Bin 0 -> 526057 bytes .../include/nnlib/xa_nnlib_kernels_api.h | 347 
+++++++ .../nnlib-FuG3/xa_nnlib/include/xa_type_def.h | 84 ++ 60 files changed, 12619 insertions(+), 1 deletion(-) create mode 100644 backends/cadence/aot/functions_fusion_g3.yaml create mode 100644 backends/cadence/fusion_g3/operators/CMakeLists.txt create mode 100644 backends/cadence/fusion_g3/operators/op_add.cpp create mode 100644 backends/cadence/fusion_g3/operators/op_cat.cpp create mode 100644 backends/cadence/fusion_g3/operators/op_dequantize.cpp create mode 100644 backends/cadence/fusion_g3/operators/op_mul.cpp create mode 100644 backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp create mode 100644 backends/cadence/fusion_g3/operators/op_quantize.cpp create mode 100644 backends/cadence/fusion_g3/operators/op_softmax.cpp create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/CMakeLists.txt create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_api_defs.h create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_common_internal.h create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_definitions.h create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/src/xa_nnlib_common_api.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/activations/xa_nn_softmax.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_add_32x32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_add_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym16_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym16u_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym4_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym4u_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym8_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym8u_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym16_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym16u_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym4_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym4u_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym8_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym8u_f32.c create mode 100644 
backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_mul_32x32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_mul_f32.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym16.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym16u.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym4.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym4u.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym8.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym8u.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym16.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym16u.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym4.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym4u.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym8.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym8u.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/norm/xa_nn_layer_norm.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/reorg/xa_nn_cat.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/tables/include/expf_tbl.h create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/tables/src/expf_tbl.c create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/common.mk create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/detect_core.mk create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/ldscript_nnlib.txt create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/makefile create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/makefile_nn_lib_fusion_g3 create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/symbols_nnlib.txt create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/doc/FusionG3-NNLib-API.pdf create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/doc/FusionG3-NNLib-Kernel-Testreport.pdf create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/doc/FusionG3-NNLib-Performance.pdf create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h create mode 100644 backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/include/xa_type_def.h diff --git 
a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 3c1aa2945ab..3cd880622cd 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -76,7 +76,12 @@ endif() if(EXECUTORCH_NNLIB_OPT) set(TARGET_DIR hifi) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) +endif() + +if(EXECUTORCH_FUSION_G3_OPT) + set(TARGET_DIR fusion_g3) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) endif() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) diff --git a/backends/cadence/aot/functions_fusion_g3.yaml b/backends/cadence/aot/functions_fusion_g3.yaml new file mode 100644 index 00000000000..796efc9f223 --- /dev/null +++ b/backends/cadence/aot/functions_fusion_g3.yaml @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This yaml file contains operators that are also defined by the ATen library. +# For lean mode: +# - Codegen'd target `executorch_generated_lib` will be reading all the information +# from this file, including operator schema and kernel metadata. +# - Selective build target `codegen:executorch_defined_ops` now is selecting all the +# operators in this file, by dumping all the op names into `selected_operators.yaml`. +# +# See the README.md file in executorch/kernels/portable for a description of the syntax used +# by this file. + + +# aten ops +- op: _to_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::to_copy_out + +- op: _softmax.out + kernels: + - arg_meta: null + kernel_name: impl::FusionG3::softmax_out + +- op: add.out + kernels: + - arg_meta: null + kernel_name: impl::FusionG3::add_out + +- op: add.Scalar_out + kernels: + - arg_meta: null + kernel_name: impl::FusionG3::add_scalar_out + +- op: bmm.out + kernels: + - arg_meta: null + kernel_name: torch::executor::bmm_out + +- op: cat.out + kernels: + - arg_meta: null + kernel_name: impl::FusionG3::cat_out + +- op: clone.out + kernels: + - arg_meta: null + kernel_name: torch::executor::clone_out + +- op: div.out + kernels: + - arg_meta: null + kernel_name: torch::executor::div_out + +- op: div.out_mode + kernels: + - arg_meta: null + kernel_name: torch::executor::div_out_mode + +- op: embedding.out + kernels: + - arg_meta: null + kernel_name: torch::executor::embedding_out + +- op: full.out + kernels: + - arg_meta: null + kernel_name: torch::executor::full_out + +- op: mul.out + kernels: + - arg_meta: null + kernel_name: impl::FusionG3::mul_out + +- op: mul.Scalar_out + kernels: + - arg_meta: null + kernel_name: impl::FusionG3::mul_scalar_out + +- op: permute_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::permute_copy_out + +- op: sigmoid.out + kernels: + - arg_meta: null + kernel_name: torch::executor::sigmoid_out + +- op: slice_copy.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::slice_copy_Tensor_out + +- op: split_with_sizes_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::split_with_sizes_copy_out + +- op: sub.out + kernels: + - arg_meta: null + kernel_name: torch::executor::sub_out + +- op: view_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::view_copy_out + +- op: where.self_out + kernels: + - arg_meta: null + kernel_name: torch::executor::where_out + +- op: native_layer_norm.out + kernels: + - arg_meta: null + kernel_name: 
impl::FusionG3::native_layer_norm_out + diff --git a/backends/cadence/fusion_g3/operators/CMakeLists.txt b/backends/cadence/fusion_g3/operators/CMakeLists.txt new file mode 100644 index 00000000000..ce542e1aa40 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/CMakeLists.txt @@ -0,0 +1,85 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) +include(${EXECUTORCH_ROOT}/build/Codegen.cmake) + +if(NOT PYTHON_EXECUTABLE) + resolve_python_executable() +endif() + +# ATen compliant ops that are needed to run this model. +set(_aten_ops__srcs + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_add.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_mul.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_cat.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_softmax.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_native_layer_norm.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_quantize.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_dequantize.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp" +) +add_library(aten_ops_cadence ${_aten_ops__srcs}) +target_link_libraries(aten_ops_cadence PUBLIC executorch) +target_link_libraries(aten_ops_cadence PRIVATE xa_nnlib) + +# Let files say "include ". +set(_common_include_directories ${EXECUTORCH_ROOT}/..) + +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} + ${_common_include_directories} + ${EXECUTORCH_ROOT}/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/ + ${EXECUTORCH_ROOT}/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/include/nnlib + ${EXECUTORCH_ROOT}/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/include + ${EXECUTORCH_ROOT}/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/tables/include +) + +# Generate C++ bindings to register kernels into both PyTorch (for AOT) and +# Executorch (for runtime). Here select all ops in functions.yaml +gen_selected_ops( + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML + "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions_fusion_g3.yaml" "" "" +) +generate_bindings_for_kernels( + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_fusion_g3.yaml +) +message("Generated files ${gen_command_sources}") + +gen_operators_lib( + LIB_NAME "cadence_ops_lib" KERNEL_LIBS DEPS aten_ops_cadence +) diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp new file mode 100644 index 00000000000..ef2bf8de6fc --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_add.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::KernelRuntimeContext; +using executorch::runtime::canCast; +using torch::executor::Error; + +namespace impl { +namespace FusionG3 { +namespace native { + + +Tensor& add_out(KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + const Scalar& alpha, + Tensor& out) +{ + // Common Dtype + ScalarType common_type = executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + torch::executor::check_alpha_type(torch::executor::native::utils:: + get_scalar_dtype(alpha), common_type)), + InvalidArgument, + out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, executorch::runtime::tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + // Compute Dtype + ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "add.out"; + + const exec_aten::ArrayRef a_size = a.sizes(); + const exec_aten::ArrayRef b_size = b.sizes(); + const exec_aten::ArrayRef out_size = out.sizes(); + + int kTensorDimensionLimit = 5; + + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + /* input shapes and output shapes */ + for(auto i = 0; i < a_size.size(); i++) + { + inp1_shape[i] = a_size[i]; + } + + for(auto i = 0; i < b_size.size(); i++) + { + inp2_shape[i] = b_size[i]; + } + + for(auto i = 0; i < out_size.size(); i++) + { + out_shape[i] = out_size[i]; + } + + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = 
!out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + + if(compute_type == ScalarType::Int) + { + const int* const inp1_data = a.const_data_ptr(); + const int* const inp2_data = b.const_data_ptr(); + int* const out_data = out.mutable_data_ptr(); + + int alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + if(broadcast) + { + xa_nn_elm_add_broadcast_5D_32x32_32(out_data, out_shape, + inp1_data, inp1_shape, inp2_data, inp2_shape, max_dim, alpha_val); + } + else + { + xa_nn_elm_add_32x32_32(out_data, inp1_data, inp2_data, alpha_val, out.numel()); + } + } + else if(compute_type == ScalarType::Float) + { + const float* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + if(broadcast) + { + xa_nn_elm_add_broadcast_5D_f32xf32_f32(out_data, out_shape, inp1_data, + inp1_shape, inp2_data, inp2_shape, max_dim, alpha_val); + } + else + { + xa_nn_elm_add_f32xf32_f32(out_data, inp1_data, inp2_data, alpha_val, out.numel()); + } + } + else + { + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_alpha = torch::executor::native::utils:: + scalar_to(alpha); + torch::executor::native::utils:: + apply_bitensor_elementwise_fn( + [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return val_a + val_alpha * val_b; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + b, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16); + }); + } + + return out; +} + +Tensor& add_scalar_out(KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + const Scalar& alpha, + Tensor& out) +{ + // Common Dtype + ScalarType common_type = torch::executor::native::utils:: + promote_type_with_scalar(a.scalar_type(), b); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (common_type == out.scalar_type() && + torch::executor::check_alpha_type(torch::executor::native::utils:: + get_scalar_dtype(alpha), common_type)), + InvalidArgument, + out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, executorch::runtime::tensors_have_same_dim_order(a, out), + InvalidArgument, out); + + // Resize + ET_KERNEL_CHECK( + ctx, executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, out); + + + + // Compute Dtype + ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "add.Scalar_out"; + + if(compute_type == ScalarType::Int) + { + const int* const inp1_data = a.const_data_ptr(); + int inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + int alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + int* const out_data = out.mutable_data_ptr(); + + xa_nn_elm_add_scalar_32x32_32(out_data, inp1_data, inp2_val, + alpha_val, out.numel()); + } + else if(compute_type == ScalarType::Float) + { + const float* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + + float alpha_val; + torch::executor::native::utils::extract_scalar(alpha, &alpha_val); + + float* const out_data = 
out.mutable_data_ptr(); + + xa_nn_elm_add_scalar_f32xf32_f32(out_data, inp1_data, inp2_val, + alpha_val, out.numel()); + } + else + { + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils:: + apply_unitensor_elementwise_fn( + [b, alpha](const CTYPE_COMPUTE val_a) { + CTYPE_COMPUTE val_b = torch::executor::native:: + utils::scalar_to(b); + CTYPE_COMPUTE val_alpha = torch::executor::native:: + utils::scalar_to(alpha); + return val_a + val_alpha * val_b; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils:: + SupportedTensorDtypes::SAME_AS_COMMON); + }); + } + return out; +} + +} // namespace native +} // namespace FusionG3 +} // namespace impl + diff --git a/backends/cadence/fusion_g3/operators/op_cat.cpp b/backends/cadence/fusion_g3/operators/op_cat.cpp new file mode 100644 index 00000000000..217ada9caa2 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_cat.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::KernelRuntimeContext; +using torch::executor::Error; + + +namespace impl { +namespace FusionG3 { +namespace native { + + +Tensor& cat_out(KernelRuntimeContext& ctx, + exec_aten::ArrayRef tensors, + int64_t dim, + Tensor& out) +{ + if (dim < 0) + { + dim += out.dim(); + } + + ET_KERNEL_CHECK(ctx, torch::executor::check_cat_args(tensors, dim, out), InvalidArgument, out); + + int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + torch::executor::get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + + + const signed char *inp_tensors[tensors.size()]; + const int *inp_tensors_shapes[tensors.size()]; + + int inp_shapes_size[tensors.size()]; + + int temp_sizes[tensors.size()][kTensorDimensionLimit]; + exec_aten::ArrayRef temp_size; + + for(int i = 0; i < tensors.size(); i++) + { + inp_tensors[i] = tensors[i].const_data_ptr(); + temp_size = tensors[i].sizes(); + + for(int j = 0; j < temp_size.size(); j++) + { + temp_sizes[i][j] = temp_size[j]; + } + inp_tensors_shapes[i] = temp_sizes[i]; //input shapes + inp_shapes_size[i] = temp_size.size(); //number of input dimensions + } + + signed char *out_data = out.mutable_data_ptr(); + + const exec_aten::ArrayRef out_size = out.sizes(); + int out_shapes[kTensorDimensionLimit]; + for(int i = 0; i < out_size.size(); i++) //output shapes + { + out_shapes[i] = out_size[i]; + } + + if(out.scalar_type() == ScalarType::Int) + { + xa_nn_cat(out_data, out_shapes, inp_tensors, inp_tensors_shapes, + inp_shapes_size[0], tensors.size(), (int)dim, sizeof(int)); + } + else if(out.scalar_type() == ScalarType::Short) + { + xa_nn_cat(out_data, out_shapes, inp_tensors, inp_tensors_shapes, + inp_shapes_size[0], tensors.size(), (int)dim, sizeof(short)); + } + else if(out.scalar_type() == ScalarType::Char) + { + xa_nn_cat(out_data, out_shapes, inp_tensors, inp_tensors_shapes, + inp_shapes_size[0], 
tensors.size(), (int)dim, sizeof(char)); + + } + else + { + // Special handling when all inputs are 1D-empty tensors for aten consistency + // In that case, just return an 1D-empty tensor without checking dim + bool all_1d_empty = true; + for (size_t i = 0; i < tensors.size(); ++i) + { + if (tensors[i].numel() != 0 || tensors[i].dim() != 1) + { + all_1d_empty = false; + break; + } + } + if (all_1d_empty) + { + return out; + } + + const size_t outer = executorch::runtime::getLeadingDims(out, dim); + const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim); + const size_t ninputs = tensors.size(); + + const auto out_type = out.scalar_type(); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "cat.out", CTYPE_OUT, [&] { + CTYPE_OUT* out_ptr = out.mutable_data_ptr(); + for (size_t i = 0; i < outer; ++i) { + for (size_t j = 0; j < ninputs; ++j) { + const auto in_type = tensors[j].scalar_type(); + ET_SWITCH_REALHB_TYPES(in_type, ctx, "cat.out", CTYPE_IN, [&] { + if (tensors[j].numel() == 0) { + return; + } + size_t inner = tensors[j].size(dim) * dim_stride; + const CTYPE_IN* const in_ptr = + tensors[j].const_data_ptr() + i * inner; + + for (size_t k = 0; k < inner; ++k) { + out_ptr[k] = static_cast(in_ptr[k]); + } + out_ptr += inner; + }); + } + } + }); + } + + return out; +} + +} // namespace native +} // namespace FusionG3 +} // namespace impl \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp new file mode 100644 index 00000000000..c1fe7936947 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp @@ -0,0 +1,807 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::KernelRuntimeContext; +using torch::executor::Error; + +template +using optional = exec_aten::optional; +/* ScalarType in Executorch do not have support for below data types. + * So, creating a placeholder for these data types. Once, ScalarTypes is + * updated to have support for below data types, these can be removed and + * operator need to be updated accordingly + */ + + enum datatype { + Ushort = 20, + Bits4u = 21, + Bits4 = 22 + }; + +/** + * For an input tensor, use the scale and zero_point arguments to quantize it. + */ +namespace impl { +namespace FusionG3 { +namespace native { + +namespace { + +/** + * Asserts that the parameters are valid. 
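+ *
+ * Concretely (a summary of the checks implemented below): the input tensor must
+ * use one of the supported integer ScalarTypes, its dtype must equal the dtype
+ * argument, the out tensor's dtype must equal out_dtype when that is provided,
+ * and quant_min must be <= quant_max.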
+ */ +void check_dequantize_per_tensor_args(const Tensor& input, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + exec_aten::optional& out_dtype, + Tensor& out) +{ + ET_CHECK_MSG( + input.scalar_type() == ScalarType::Byte || + input.scalar_type() == ScalarType::Char || + input.scalar_type() == ScalarType::Bits16 || + input.scalar_type() == ScalarType::Short || + input.scalar_type() == (ScalarType) Ushort || + input.scalar_type() == (ScalarType) Bits4 || + input.scalar_type() == (ScalarType) Bits4u || + input.scalar_type() == ScalarType::Int, + + "input.scalar_type() %" PRId8 " is not supported:", + static_cast(input.scalar_type())); + + ET_CHECK_MSG( + input.scalar_type() == dtype, + "input.scalar_type() %" PRId8 " does not match the dtype argument:", + static_cast(input.scalar_type())); + + if (out_dtype.has_value()) { + ET_CHECK_MSG( + out.scalar_type() == out_dtype.value(), + "output_dtype must match the dtype of the out tensor"); + } + + ET_CHECK_MSG( + quant_min <= quant_max, + "quant min: %" PRId64 " is greater than quant max: %" PRId64, + quant_min, + quant_max); +} + +} // namespace + + +/* Local function which calls the kernels based on the input datatype */ +void Dequantize_impl(Tensor& out, + const Tensor& input, + float *scale_data, + int *zero_point_data, + int *axis, + exec_aten::optional out_dtype) +{ + const exec_aten::ArrayRef input_size = input.sizes(); + + int kTensorDimensionLimit = 5; + + int inp_shape[kTensorDimensionLimit]; + + for(auto i = 0; i < input_size.size(); i++) + { + inp_shape[i] = input_size[i]; + } + + bool is_asym_dequant = 0; + + if(zero_point_data != NULL) //asymmetric dequant + { + if(axis != NULL) //channel + { + for(int i = 0; i < input.size(*axis) ; i++) + { + if(zero_point_data[i] != 0) + { + is_asym_dequant |= 1; + } + } + } + else + { + if(*zero_point_data != 0) //tensor + { + is_asym_dequant |= 1; + } + } + } + float* out_data = out.mutable_data_ptr(); + + if(is_asym_dequant) + { + if (input.scalar_type() == ScalarType::Byte) + { + const uint8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_asym8u_f32( + out_data, input_data, inp_shape, input.dim(), axis, + zero_point_data, scale_data); + } + else if (input.scalar_type() == ScalarType::Char) + { + const int8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_asym8_f32( + out_data, input_data, inp_shape, input.dim(), axis, + zero_point_data, scale_data); + } + else if (input.scalar_type() == (ScalarType) Ushort) + { + const uint16_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_asym16u_f32( + out_data, input_data, inp_shape, input.dim(), axis, + zero_point_data, scale_data); + } + else if (input.scalar_type() == ScalarType::Short) + { + const int16_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_asym16_f32( + out_data, input_data, inp_shape, input.dim(), axis, + zero_point_data, scale_data); + } + else if (input.scalar_type() == (ScalarType) Bits4u) + { + const uint8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_asym4u_f32( + out_data, input_data, inp_shape, input.dim(), axis, + zero_point_data, scale_data); + } + else if (input.scalar_type() == (ScalarType) Bits4) + { + const int8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_asym4_f32( + out_data, input_data, inp_shape, input.dim(), axis, + zero_point_data, scale_data); + } + else + { + if(axis == NULL) + { + // calculate the dequantized output, cast scale to float to match fbgemm + // behavior + #define ASYM_DEQUANTIZE_IMPL_TESNOR(IN_CTYPE, 
OUT_CTYPE, out_dtype) \ + case ScalarType::out_dtype: { \ + /* Hoist these function calls out of our inner loop because they might not \ + * get inlined without LTO, particularly in ATen mode. */ \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + const auto input_numel = input.numel(); \ + for (size_t i = 0; i < input_numel; i++) { \ + out_data_ptr[i] = static_cast( \ + (input_data_ptr[i] - static_cast(*zero_point_data)) * \ + static_cast(*scale_data)); \ + } \ + } break; + #define ASYM_CALCULATE_INT_TYPE_TENSOR(IN_CTYPE, in_dtype) \ + case ScalarType::in_dtype: \ + switch (out.scalar_type()) { \ + ET_FORALL_FLOAT_TYPES_WITH(IN_CTYPE, ASYM_DEQUANTIZE_IMPL_TESNOR); \ + default: \ + ET_CHECK_MSG( \ + false, \ + "Unhandled output dtype %" PRId8, \ + static_cast(out.scalar_type())); \ + } \ + break; + switch (input.scalar_type()) { + ET_FORALL_INT_TYPES(ASYM_CALCULATE_INT_TYPE_TENSOR); + ASYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, Bits16); + default: + ET_CHECK_MSG( + false, + "Unhandled input dtype %" PRId8, + static_cast(input.scalar_type())); + } + #undef ASYM_CALCULATE_INT_TYPE_TENSOR + #undef ASYM_DEQUANTIZE_IMPL_TESNOR + } + else + { + // a list contains all dimensions except axis + int64_t dims[input.dim() - 1]; + for (int64_t i = 0; i < input.dim() - 1; i++) + { + if (i < *axis) + { + dims[i] = i; + } + else + { + dims[i] = i + 1; + } + } + + exec_aten::optional> optional_dim_list{ + exec_aten::ArrayRef{dims, size_t(input.dim() - 1)}}; + + // Actual dequantization logic + // input, out are the input and output tensors + // channel_ix is the index along the axis dimension. 0 <= channel_ix < + // input.size(axis). + // i.e. if the tensor has shape (N,C,H,W), axis being 1, then channel_ix + // will be 0, 1, 2, ... C-1 + // in_ix is the flat index of the element you are dequantizing. 
+ // in other words you are dequantizing in_data[in_ix] + #define ASYM_DEQUANTIZE_IMPL_CHANNEL(CTYPE_IN, CTYPE_OUT, out_dtype) \ + case ScalarType::out_dtype: \ + if (input.dim() == 1) { \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + ET_CHECK_MSG( \ + *axis == 0, "Axis must be 0 for a single dimensional tensors"); \ + const optional dim; \ + torch::executor::apply_over_dim( \ + [input_data_ptr, out_data_ptr, zero_point_data, scale_data]( \ + size_t numel, size_t stride, size_t base_ix) { \ + for (size_t i = 0; i < numel; i++) { \ + size_t current_ix = base_ix * stride + i; \ + float _scale = scale_data[current_ix]; \ + int64_t zero_point = 0; \ + if (zero_point_data != nullptr) { \ + zero_point = zero_point_data[current_ix]; \ + } \ + out_data_ptr[current_ix] = \ + static_cast( \ + input_data_ptr[current_ix] - zero_point) * \ + _scale; \ + } \ + }, \ + input, \ + dim); \ + break; \ + } \ + for (size_t channel_ix = 0; channel_ix < input.size(*axis); ++channel_ix) { \ + float _scale = scale_data[channel_ix]; \ + int64_t _zero_point = 0; \ + if (zero_point_data != nullptr) { \ + _zero_point = zero_point_data[channel_ix]; \ + } \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + torch::executor::apply_over_dim_list( \ + [input_data_ptr, out_data_ptr, _scale, _zero_point](size_t in_ix) { \ + out_data_ptr[in_ix] = static_cast( \ + (input_data_ptr[in_ix] - _zero_point) * _scale); \ + }, \ + input, \ + optional_dim_list, \ + channel_ix); \ + } \ + break; + #define ASYM_CALCULATE_INT_TYPE_CHANNEL(IN_CTYPE, in_dtype) \ + case ScalarType::in_dtype: \ + switch (out.scalar_type()) { \ + ET_FORALL_FLOAT_TYPES_WITH(IN_CTYPE, ASYM_DEQUANTIZE_IMPL_CHANNEL); \ + default: \ + ET_CHECK_MSG( \ + false, \ + "Unhandled output dtype %" PRId8, \ + static_cast(out.scalar_type())); \ + } \ + break; + switch (input.scalar_type()) { + ET_FORALL_INT_TYPES(ASYM_CALCULATE_INT_TYPE_CHANNEL); + ASYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, Bits16); + default: + ET_CHECK_MSG( + false, + "Unhandled input dtype %" PRId8, + static_cast(input.scalar_type())); + } + #undef ASYM_CALCULATE_INT_TYPE_CHANNEL + #undef ASYM_DEQUANTIZE_IMPL_CHANNEL + } + } + } + else + { + if (input.scalar_type() == ScalarType::Byte) + { + const uint8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_sym8u_f32( + out_data, input_data, inp_shape, input.dim(), axis, scale_data); + } + else if (input.scalar_type() == ScalarType::Char) + { + const int8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_sym8_f32( + out_data, input_data, inp_shape, input.dim(), axis, scale_data); + } + else if (input.scalar_type() == (ScalarType) Ushort) + { + const uint16_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_sym16u_f32( + out_data, input_data, inp_shape, input.dim(), axis, scale_data); + } + else if (input.scalar_type() == ScalarType::Short) + { + const int16_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_sym16_f32( + out_data, input_data, inp_shape, input.dim(), axis, scale_data); + } + else if (input.scalar_type() == (ScalarType) Bits4u) + { + const uint8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_sym4u_f32( + out_data, input_data, inp_shape, input.dim(), axis, scale_data); + } + else if (input.scalar_type() == (ScalarType) Bits4) + { + const int8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_sym4_f32( + out_data, input_data, inp_shape, input.dim(), axis, 
scale_data); + } + else + { + if(axis == NULL) + { + // calculate the dequantized output, cast scale to float to match fbgemm + // behavior + #define SYM_DEQUANTIZE_IMPL_TESNOR(IN_CTYPE, OUT_CTYPE, out_dtype) \ + case ScalarType::out_dtype: { \ + /* Hoist these function calls out of our inner loop because they might not \ + * get inlined without LTO, particularly in ATen mode. */ \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + const auto input_numel = input.numel(); \ + for (size_t i = 0; i < input_numel; i++) { \ + out_data_ptr[i] = static_cast( \ + (input_data_ptr[i] - static_cast(*zero_point_data)) * \ + static_cast(*scale_data)); \ + } \ + } break; + #define SYM_CALCULATE_INT_TYPE_TENSOR(IN_CTYPE, in_dtype) \ + case ScalarType::in_dtype: \ + switch (out.scalar_type()) { \ + ET_FORALL_FLOAT_TYPES_WITH(IN_CTYPE, SYM_DEQUANTIZE_IMPL_TESNOR); \ + default: \ + ET_CHECK_MSG( \ + false, \ + "Unhandled output dtype %" PRId8, \ + static_cast(out.scalar_type())); \ + } \ + break; + switch (input.scalar_type()) { + ET_FORALL_INT_TYPES(SYM_CALCULATE_INT_TYPE_TENSOR); + SYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, Bits16); + default: + ET_CHECK_MSG( + false, + "Unhandled input dtype %" PRId8, + static_cast(input.scalar_type())); + } + #undef SYM_DEQUANTIZE_IMPL_TESNOR + #undef SYM_CALCULATE_INT_TYPE_TENSOR + } + else + { + // a list contains all dimensions except axis + int64_t dims[input.dim() - 1]; + for (int64_t i = 0; i < input.dim() - 1; i++) + { + if (i < *axis) + { + dims[i] = i; + } + else + { + dims[i] = i + 1; + } + } + + exec_aten::optional> optional_dim_list{ + exec_aten::ArrayRef{dims, size_t(input.dim() - 1)}}; + + // Actual dequantization logic + // input, out are the input and output tensors + // channel_ix is the index along the axis dimension. 0 <= channel_ix < + // input.size(axis). + // i.e. if the tensor has shape (N,C,H,W), axis being 1, then channel_ix + // will be 0, 1, 2, ... C-1 + // in_ix is the flat index of the element you are dequantizing. 
+ // in other words you are dequantizing in_data[in_ix] + #define SYM_DEQUANTIZE_IMPL_CHANNEL(CTYPE_IN, CTYPE_OUT, out_dtype) \ + case ScalarType::out_dtype: \ + if (input.dim() == 1) { \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + ET_CHECK_MSG( \ + *axis == 0, "Axis must be 0 for a single dimensional tensors"); \ + const optional dim; \ + torch::executor::apply_over_dim( \ + [input_data_ptr, out_data_ptr, zero_point_data, scale_data]( \ + size_t numel, size_t stride, size_t base_ix) { \ + for (size_t i = 0; i < numel; i++) { \ + size_t current_ix = base_ix * stride + i; \ + float _scale = scale_data[current_ix]; \ + int64_t zero_point = 0; \ + if (zero_point_data != nullptr) { \ + zero_point = zero_point_data[current_ix]; \ + } \ + out_data_ptr[current_ix] = \ + static_cast( \ + input_data_ptr[current_ix] - zero_point) * \ + _scale; \ + } \ + }, \ + input, \ + dim); \ + break; \ + } \ + for (size_t channel_ix = 0; channel_ix < input.size(*axis); ++channel_ix) { \ + float _scale = scale_data[channel_ix]; \ + int64_t _zero_point = 0; \ + if (zero_point_data != nullptr) { \ + _zero_point = zero_point_data[channel_ix]; \ + } \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + torch::executor::apply_over_dim_list( \ + [input_data_ptr, out_data_ptr, _scale, _zero_point](size_t in_ix) { \ + out_data_ptr[in_ix] = static_cast( \ + (input_data_ptr[in_ix] - _zero_point) * _scale); \ + }, \ + input, \ + optional_dim_list, \ + channel_ix); \ + } \ + break; + #define SYM_CALCULATE_INT_TYPE_CHANNEL(IN_CTYPE, in_dtype) \ + case ScalarType::in_dtype: \ + switch (out.scalar_type()) { \ + ET_FORALL_FLOAT_TYPES_WITH(IN_CTYPE, SYM_DEQUANTIZE_IMPL_CHANNEL); \ + default: \ + ET_CHECK_MSG( \ + false, \ + "Unhandled output dtype %" PRId8, \ + static_cast(out.scalar_type())); \ + } \ + break; + switch (input.scalar_type()) { + ET_FORALL_INT_TYPES(SYM_CALCULATE_INT_TYPE_CHANNEL); + SYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, Bits16); + default: + ET_CHECK_MSG( + false, + "Unhandled input dtype %" PRId8, + static_cast(input.scalar_type())); + } + #undef SYM_DEQUANTIZE_IMPL_CHANNEL + #undef SYM_CALCULATE_INT_TYPE_CHANNEL + } + } + } +} + +/** + * Dequantizes the input tensor according to the formula (input - zero_point) * + * scale + * + * NOTE: quant_min and quant_max are not used in computation, but rather + * metadata that is passed around which can be useful for pattern matching. See + * https://github.com/pytorch/pytorch/pull/87093#discussion_r1000841181 for more + * info. 
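+ *
+ * Worked example (illustrative numbers only, not taken from the kernels below):
+ * with scale = 0.5 and zero_point = 128, an int8 input value of 130
+ * dequantizes to (130 - 128) * 0.5 = 1.0f.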
+ */ +Tensor& dequantize_per_tensor_out( + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + exec_aten::optional out_dtype, + Tensor& out) +{ + torch::executor::Error err = resize_tensor(out, input.sizes()); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in dequantize_per_tensor_out"); + + check_dequantize_per_tensor_args( + input, quant_min, quant_max, dtype, out_dtype, out); + + float scale_data = (float)scale; + int zero_point_data = (int)zero_point; + + Dequantize_impl(out, + input, + &scale_data, + &zero_point_data, + NULL, + out_dtype); + + return out; +} + +Tensor& dequantize_per_tensor_tensor_args_out(const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + exec_aten::optional out_dtype, + Tensor& out) +{ + ET_CHECK_MSG( + scale.scalar_type() == ScalarType::Double, + "Expected scale to be Double tensor received: %" PRId8, + static_cast(scale.scalar_type())); + ET_CHECK_MSG( + zero_point.scalar_type() == ScalarType::Long, + "Expected zero_point to be Long tensor received: %" PRId8, + static_cast(zero_point.scalar_type())); + ET_CHECK_MSG( + scale.numel() == 1, + "Expected scale to only have one element received: %zd", + ssize_t(scale.numel())); + ET_CHECK_MSG( + zero_point.numel() == 1, + "Expected zero_point to only have one element received: %zd", + ssize_t(zero_point.numel())); + + dequantize_per_tensor_out( + input, + scale.const_data_ptr()[0], + zero_point.const_data_ptr()[0], + quant_min, + quant_max, + dtype, + out_dtype, + out); + + return out; +} + +Tensor& dequantize_per_channel_out(const Tensor& input, + const Tensor& scale, + const exec_aten::optional& opt_zero_points, + int64_t axis, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + exec_aten::optional out_dtype, + Tensor& out) +{ + torch::executor::Error err = resize_tensor(out, input.sizes()); + + // normalize axis + ET_CHECK_MSG( + executorch::runtime::tensor_has_dim(input, axis), + "axis %zd is not legal; it should be -input.dim() <= axis < input.dim() %zd", + ssize_t(axis), + ssize_t(input.dim())); + + if (axis < 0) + { + axis += executorch::runtime::nonzero_dim(input); + } + + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in dequantize_per_channel_out"); + + ET_CHECK_MSG( + scale.scalar_type() == ScalarType::Double, + "scale.scalar_type() %" PRId8 " is not double type", + static_cast(scale.scalar_type())); + + ET_CHECK_MSG( + scale.numel() == input.size(axis), + "scale.numel() %zd != input.size(axis) %zd", + ssize_t(scale.numel()), + ssize_t(input.size(axis))); + + if (opt_zero_points.has_value()) { + auto zero_point = opt_zero_points.value(); + ET_CHECK_MSG( + zero_point.scalar_type() == ScalarType::Long, + "zero_point.scalar_type() %" PRId8 " is not integer type", + static_cast(zero_point.scalar_type())); + + ET_CHECK_MSG( + zero_point.numel() == input.size(axis), + "zero_point.numel() %zd != input.size(axis) %zd", + ssize_t(zero_point.numel()), + ssize_t(input.size(axis))); + } + + check_dequantize_per_tensor_args( + input, quant_min, quant_max, dtype, out_dtype, out); + + int *axis_ptr = (int *)&axis; + + const double* scale_dt = scale.const_data_ptr(); + const int64_t* zero_point_dt; + int zero_point_data[input.size(axis)]; + int *zero_point_ptr; + if (opt_zero_points.has_value()) + { + zero_point_dt = opt_zero_points.value().const_data_ptr(); + zero_point_ptr = 
&zero_point_data[0]; + for(int i = 0; i < scale.numel(); i++) + { + zero_point_ptr[i] = (int)zero_point_dt[i]; + } + } + else + { + zero_point_ptr = nullptr; + } + float scale_data[input.size(axis)]; + for(int i = 0; i < scale.numel(); i++) + { + scale_data[i] = (float)scale_dt[i]; + } + Dequantize_impl(out, + input, + scale_data, + zero_point_ptr, + axis_ptr, + out_dtype); + + return out; +} + +Tensor& dequantize_per_channel_out( + KernelRuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const exec_aten::optional& opt_zero_points, + int64_t axis, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + exec_aten::optional out_dtype, + Tensor& out) +{ + (void)context; + torch::executor::Error err = resize_tensor(out, input.sizes()); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in dequantize_per_channel_out"); + return dequantize_per_channel_out( + input, + scale, + opt_zero_points, + axis, + quant_min, + quant_max, + dtype, + out_dtype, + out); +} + +Tensor& dequantize_per_tensor_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + exec_aten::optional out_dtype, + Tensor& out) +{ + // TODO(larryliu): Add a context arg to the real op function and remove this + // wrapper + (void)context; + return dequantize_per_tensor_out( + input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out); +} + +Tensor& dequantize_per_tensor_tensor_args_out( + KernelRuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + exec_aten::optional out_dtype, + Tensor& out) +{ + // TODO(larryliu): Add a context arg to the real op function and remove this + // wrapper + (void)context; + return dequantize_per_tensor_tensor_args_out( + input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out); +} + +Tensor& dequantize_per_token_out( + const Tensor& input, + const Tensor& scale, + const Tensor& zero_points, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + ScalarType out_dtype, + Tensor& out) +{ + // Refactor this into a util + size_t num_channels = 1; + for (size_t i = 0; i < input.dim() - 1; i++) + { + num_channels *= input.size(i); + } + // This unfortunate change is needed because we compile op_quantize for aten + // mode as well + std::array input_sizes; + input_sizes[0] = static_cast(num_channels); + input_sizes[1] = + static_cast(input.size(input.dim() - 1)); +#ifdef USE_ATEN_LIB + Tensor reshaped_input = at::from_blob( + input.mutable_data_ptr(), + input_sizes, + at::TensorOptions(input.scalar_type())); +#else + std::array input_dim_order{0, 1}; + std::array input_strides; + executorch::runtime::dim_order_to_stride_nocheck( + input_sizes.data(), input_dim_order.data(), 2, input_strides.data()); + void* input_data = input.mutable_data_ptr(); + torch::executor::TensorImpl reshaped_input_impl = executorch::runtime::etensor::TensorImpl( + input.scalar_type(), + 2, + input_sizes.data(), + input_data, + input_dim_order.data(), + input_strides.data(), + executorch::runtime::TensorShapeDynamism::STATIC); + Tensor reshaped_input(&reshaped_input_impl); + torch::executor::Error err = resize_tensor(out, input.sizes()); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in dequantize_per_channel_out"); +#endif + + return dequantize_per_channel_out( + reshaped_input, + scale, + 
zero_points, + 0, /* axis */ + quant_min, + quant_max, + dtype, + out_dtype, + out); +} + +Tensor& dequantize_per_token_out( + KernelRuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_points, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + ScalarType out_dtype, + Tensor& out) { + (void)context; + return dequantize_per_token_out( + input, scale, zero_points, quant_min, quant_max, dtype, out_dtype, out); +} + +} // namespace native +} // namespace FusionG3 +} // namespace impl diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp new file mode 100644 index 00000000000..347886e03e6 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_mul.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::KernelRuntimeContext; +using executorch::runtime::canCast; +using torch::executor::Error; + +namespace impl { +namespace FusionG3 { +namespace native { + +Tensor& mul_out(KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) +{ + // Common Dtype + ScalarType common_type = executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type()); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, canCast(common_type, out.scalar_type()), InvalidArgument, out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, executorch::runtime::tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + + // Resize + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + // Compute Dtype + ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "mul.out"; + + const exec_aten::ArrayRef a_size = a.sizes(); + const exec_aten::ArrayRef b_size = b.sizes(); + const exec_aten::ArrayRef out_size = out.sizes(); + + int kTensorDimensionLimit = 5; + + int inp1_shape[kTensorDimensionLimit]; + int inp2_shape[kTensorDimensionLimit]; + int out_shape[kTensorDimensionLimit]; + + /* input shapes and output shapes */ + for(auto i = 0; i < a_size.size(); i++) + { + inp1_shape[i] = a_size[i]; + } + + for(auto i = 0; i < b_size.size(); i++) + { + inp2_shape[i] = b_size[i]; + } + + for(auto i = 0; i < out_size.size(); i++) + { + out_shape[i] = out_size[i]; + } + + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? 
a.dim() : b.dim(); + + if(compute_type == ScalarType::Int) + { + const int* const inp1_data = a.const_data_ptr(); + const int* const inp2_data = b.const_data_ptr(); + int* const out_data = out.mutable_data_ptr(); + + if(broadcast) + { + xa_nn_elm_mul_broadcast_5D_32x32_32(out_data, out_shape, inp1_data, + inp1_shape, inp2_data, inp2_shape, max_dim); + } + else + { + xa_nn_elm_mul_32x32_32(out_data, inp1_data, inp2_data, out.numel()); + } + } + else if(compute_type == ScalarType::Float) + { + const float* const inp1_data = a.const_data_ptr(); + const float* const inp2_data = b.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + + if(broadcast) + { + xa_nn_elm_mul_broadcast_5D_f32xf32_f32(out_data, out_shape, inp1_data, + inp1_shape, inp2_data, inp2_shape, max_dim); + } + else + { + xa_nn_elm_mul_f32xf32_f32(out_data, inp1_data, inp2_data, out.numel()); + } + } + else + { + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + torch::executor::native::utils::apply_bitensor_elementwise_fn( + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return val_a * val_b; + }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + b, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16); + }); + } + + return out; +} + +Tensor& mul_scalar_out(KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) +{ + // Common Dtype + ScalarType common_type = torch::executor::native::utils:: + promote_type_with_scalar(a.scalar_type(), b); + + // Check Common Dtype + ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, executorch::runtime::tensors_have_same_dim_order(a, out), + InvalidArgument, out); + + // Resize + ET_KERNEL_CHECK( + ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out); + + // Compute Dtype + ScalarType compute_type = torch::executor::native:: + utils::get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "mul.Scalar_out"; + + if(compute_type == ScalarType::Int) + { + const int* const inp1_data = a.const_data_ptr(); + int inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + int* const out_data = out.mutable_data_ptr(); + + xa_nn_elm_mul_scalar_32x32_32(out_data, inp1_data, inp2_val, out.numel()); + } + else if(compute_type == ScalarType::Float) + { + const float* const inp1_data = a.const_data_ptr(); + float inp2_val; + torch::executor::native::utils::extract_scalar(b, &inp2_val); + float* const out_data = out.mutable_data_ptr(); + + xa_nn_elm_mul_scalar_f32xf32_f32(out_data, inp1_data, inp2_val, out.numel()); + } + else + { + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_b = torch::executor::native:: + utils::scalar_to(b); + torch::executor::native::utils:: + apply_unitensor_elementwise_fn( + [val_b](const CTYPE_COMPUTE val_a) { return val_a * val_b; }, + ctx, + a, + torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + out, + torch::executor::native::utils::SupportedTensorDtypes::SAME_AS_COMMON); + }); + } + + return out; +} + +} // namespace impl +} // namespace FusionG3 +} // namespace native + + + + + + + + + + + + + + + + + + + diff --git a/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp 
b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp new file mode 100644 index 00000000000..7a7e240924e --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_native_layer_norm.cpp @@ -0,0 +1,276 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + + +using torch::executor::KernelRuntimeContext; +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; +using IntArrayRef = exec_aten::ArrayRef; +using torch::executor::Error; + +namespace impl { +namespace FusionG3 { +namespace native { + +namespace { + +template +void layer_norm( + const Tensor& input, + IntArrayRef normalized_shape, + const exec_aten::optional& weight, + const exec_aten::optional& bias, + CTYPE eps, + Tensor& out, + Tensor& mean, + Tensor& rstd) { + size_t dim = input.dim() - normalized_shape.size(); + size_t dim_size = input.size(dim); + + size_t leading = executorch::runtime::getLeadingDims(input, dim); + size_t normalized = executorch::runtime::getTrailingDims(input, dim) * dim_size; + + if (leading == 0) + { + return; + } + + CTYPE* out_data = out.mutable_data_ptr(); + CTYPE* mean_data = mean.mutable_data_ptr(); + CTYPE* rstd_data = rstd.mutable_data_ptr(); + + if (normalized == 0) + { + for (int i = 0; i < leading; ++i) + { + mean_data[i] = static_cast(0); + rstd_data[i] = static_cast(NAN); + } + return; + } + + const CTYPE* input_data = input.const_data_ptr(); + const CTYPE* weight_data; + if (weight.has_value()) + { + weight_data = weight.value().const_data_ptr(); + } + else + { + weight_data = nullptr; + } + const CTYPE* bias_data; + if (bias.has_value()) + { + bias_data = bias.value().const_data_ptr(); + } + else + { + bias_data = nullptr; + } + + for (int i = 0; i < leading; ++i) + { + const CTYPE* x = input_data + i * normalized; + CTYPE* y = out_data + i * normalized; + + // compute E[X] and Var[x] = E[x^2] - E[x]^2 + CTYPE sum = torch::executor::reduce_add(x, normalized); + CTYPE sq_sum = torch::executor::vec_powerf(x, normalized); + CTYPE mean_value = sum / normalized; + CTYPE variance = sq_sum / normalized - mean_value * mean_value; + CTYPE std = std::sqrt(variance + eps); + + // Calculate the elements of output + for (int j = 0; j < normalized; ++j) + { + CTYPE w = weight_data ? weight_data[j] : static_cast(1); + CTYPE b = bias_data ? bias_data[j] : static_cast(0); + y[j] = (x[j] - mean_value) / std * w + b; + } + + mean_data[i] = mean_value; + rstd_data[i] = 1.0 / std; + } +} + +} // namespace + +// native_layer_norm.out(Tensor input, int[] normalized_shape, Tensor? weight, +// Tensor? bias, float eps, *, Tensor(a!) out, Tensor(b!) mean_out, Tensor(c!) 
+// rstd_out) -> (Tensor(a!), Tensor(b!), Tensor(c!)) +// As a reference, there's math_native_layer_norm in ATen: +// https://www.internalfb.com/code/fbsource/[2da5b17b086554c6cd0c3ab08a35aeec2a8bad8c]/xplat/caffe2/aten/src/ATen/native/layer_norm.cpp?lines=188 +std::tuple native_layer_norm_out( + KernelRuntimeContext& ctx, + const Tensor& input, + IntArrayRef normalized_shape, + const exec_aten::optional& weight, + const exec_aten::optional& bias, + double eps, + Tensor& out, + Tensor& mean_out, + Tensor& rstd_out) +{ + (void)ctx; + + std::tuple ret_val(out, mean_out, rstd_out); + + ET_KERNEL_CHECK( + ctx, + torch::executor::check_layer_norm_args( + input, normalized_shape, weight, bias, out, mean_out, rstd_out), + InvalidArgument, + ret_val); + + // Only support default dim order for now. + // TODO: Support other dim orders. + ET_KERNEL_CHECK( + ctx, executorch::runtime::tensor_is_default_dim_order(input), InvalidArgument, ret_val); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(input, out, mean_out, rstd_out), + InvalidArgument, + ret_val); + + if (weight.has_value()) + { + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(input, weight.value()), + InvalidArgument, + ret_val); + } + + if (bias.has_value()) + { + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(input, bias.value()), + InvalidArgument, + ret_val); + } + int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit; + Tensor::SizesType mean_rstd_sizes[kTensorDimensionLimit]; + size_t mean_rstd_ndim = 0; + torch::executor::get_layer_norm_out_target_size( + input, normalized_shape, mean_rstd_sizes, &mean_rstd_ndim); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, input.sizes()) == Error::Ok, + InvalidArgument, + ret_val); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(mean_out, {mean_rstd_sizes, + mean_rstd_ndim}) == Error::Ok, + InvalidArgument, + ret_val); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::resize_tensor(rstd_out, {mean_rstd_sizes, + mean_rstd_ndim}) == Error::Ok, + InvalidArgument, + ret_val); + + + int input_shape[kTensorDimensionLimit]; + for(int i = 0; i < input.dim(); i++) + { + input_shape[i] = input.size(i); + } + + if(out.scalar_type() == ScalarType::Float) + { + float * const out_data = out.mutable_data_ptr(); + float * const mean_data = mean_out.mutable_data_ptr(); + float * const rstd_data = rstd_out.mutable_data_ptr(); + const float * const inp_data = input.const_data_ptr(); + int dim = input.dim() - normalized_shape.size(); + + int num_elm = 1; + for(int i = 0; i < normalized_shape.size() ; i++) + { + num_elm *= normalized_shape[i]; + } + + float *weight_data; + if (weight.has_value()) + { + weight_data = weight.value().mutable_data_ptr(); + } + else + { + weight_data = (float *)malloc(num_elm * sizeof(float)); + for(int i = 0; i < num_elm; i++) + { + weight_data[i] = 1; + } + } + float *bias_data; + if (bias.has_value()) + { + bias_data = bias.value().mutable_data_ptr(); + } + else + { + bias_data = (float *)malloc(num_elm * sizeof(float)); + for(int i = 0; i < num_elm; i++) + { + bias_data[i] = 0; + } + } + + xa_nn_native_layer_norm_f32_f32(out_data, mean_data, rstd_data, + inp_data, input_shape, input.dim(), + dim, weight_data, bias_data, + (float)eps); + + if (!bias.has_value()) + { + free(bias_data); + } + if (!weight.has_value()) + { + free(weight_data); + } + } + else + { + ET_SWITCH_FLOAT_TYPES( + input.scalar_type(), ctx, "native_layer_norm.out", CTYPE, [&]() { + layer_norm( + 
input, + normalized_shape, + weight, + bias, + eps, + out, + mean_out, + rstd_out); + }); + } + + return ret_val; + +} + +} // namespace native +} // namespace FusionG3 +} // namespace impl diff --git a/backends/cadence/fusion_g3/operators/op_quantize.cpp b/backends/cadence/fusion_g3/operators/op_quantize.cpp new file mode 100644 index 00000000000..537137aca33 --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_quantize.cpp @@ -0,0 +1,797 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::KernelRuntimeContext; +using torch::executor::Error; + +/* ScalarType in Executorch do not have support for below data types. + * So, creating a placeholder for these data types. Once, ScalarTypes is + * updated to have support for below data types, these can be removed and + * operator need to be updated accordingly + */ + enum datatype { + Ushort = 20, + Bits4u = 21, + Bits4 = 22 + }; + + +/** + * For an input tensor, use the scale and zero_point arguments to quantize it. + */ + +namespace impl { +namespace FusionG3 { +namespace native { + + +namespace { + +/** + * Asserts that the parameters are valid. + */ +void check_quantize_per_tensor_args(const Tensor& input, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) +{ + // Ensure self and out has the same shape + ET_CHECK_MSG( + torch::executor::isFloatingType(input.scalar_type()), + "input.scalar_type() %" PRId8 " is not floating type", + static_cast(input.scalar_type())); + + int32_t quant_min_lower_bound = 0, quant_max_upper_bound = 0; + ScalarType out_dtype = out.scalar_type(); + ET_CHECK_MSG( + out_dtype == dtype, + "out.scalar_type() %" PRId8 " is not matching dtype argument %" PRId8, + static_cast(out_dtype), + static_cast(dtype)); + + if (out_dtype == ScalarType::Byte) + { + quant_min_lower_bound = + static_cast(std::numeric_limits::min()); + quant_max_upper_bound = + static_cast(std::numeric_limits::max()); + } + else if (dtype == ScalarType::Char) + { + quant_min_lower_bound = + static_cast(std::numeric_limits::min()); + quant_max_upper_bound = + static_cast(std::numeric_limits::max()); + } + else if (dtype == ScalarType::Bits16) + { + quant_min_lower_bound = std::numeric_limits::min(); + quant_max_upper_bound = std::numeric_limits::max(); + } + else if (dtype == ScalarType::Short) + { + quant_min_lower_bound = std::numeric_limits::min(); + quant_max_upper_bound = std::numeric_limits::max(); + } + else if (dtype == (ScalarType)Ushort) + { + quant_min_lower_bound = std::numeric_limits::min(); + quant_max_upper_bound = std::numeric_limits::max(); + } + else if (dtype == (ScalarType)Bits4u) + { + quant_min_lower_bound = std::numeric_limits::min(); + quant_max_upper_bound = std::numeric_limits::max(); + /* Minimum and maximum values fo unsigned 4-bit data type */ + quant_min_lower_bound = quant_min_lower_bound >> 4; + quant_max_upper_bound = quant_max_upper_bound >> 4; + } + else if (dtype == (ScalarType)Bits4) + { + quant_min_lower_bound = std::numeric_limits::min(); + quant_max_upper_bound = std::numeric_limits::max(); + /* Minimum and maximum values fo signed 4-bit data type */ + quant_min_lower_bound = quant_min_lower_bound >> 4; + quant_max_upper_bound = 
quant_max_upper_bound >> 4; + } + else if (dtype == ScalarType::Int) + { + quant_min_lower_bound = std::numeric_limits::min(); + quant_max_upper_bound = std::numeric_limits::max(); + } + else + { + ET_CHECK_MSG( + false, "Unsupported dtype: %" PRId8, static_cast(out_dtype)); + } + + ET_CHECK_MSG( + quant_min >= quant_min_lower_bound, + "quant_min out of bound for dtype, expected quant_min_lower_bound: %" PRId32 + " actual quant_min: %" PRId64, + quant_min_lower_bound, + quant_min); + + ET_CHECK_MSG( + quant_max <= quant_max_upper_bound, + "quant_max out of bound for dtype, expected quant_max_upper_bound: %" PRId32 + " actual quant_max: %" PRId64, + quant_max_upper_bound, + quant_max); +}/* check_quantize_per_tensor_args */ + +} // namespace + +template +T quantize_val( + double scale, + int64_t zero_point, + K value, + int64_t quant_min, + int64_t quant_max) +{ + int64_t qvalue; + float inv_scale = 1.0f / static_cast(scale); + qvalue = static_cast( + static_cast(zero_point) + + std::nearbyint(static_cast(inv_scale * value))); + + qvalue = std::max(qvalue, quant_min); + qvalue = std::min(qvalue, quant_max); + return static_cast(qvalue); +} + + +/* Local function which calls the kernels based on the output datatype */ +void quantize_impl(Tensor& out, + const Tensor& input, + float *scale_data, + int *zero_point_data, + int *axis, + int quant_min, + int quant_max) +{ + const exec_aten::ArrayRef input_size = input.sizes(); + + int kTensorDimensionLimit = 5; + + int inp_shape[kTensorDimensionLimit]; + + for(auto i = 0; i < input_size.size(); i++) + { + inp_shape[i] = input_size[i]; + } + + const float* input_data = input.const_data_ptr(); + + bool is_asym_quant = 0; + + if(zero_point_data != NULL) //asymmetric quant + { + if(axis != NULL) //channel + { + for(int i = 0; i < input.size(*axis) ; i++) + { + if(zero_point_data[i] != 0) + { + is_asym_quant |= 1; + } + } + } + else + { + if(*zero_point_data != 0) //tensor + { + is_asym_quant |= 1; + } + } + } + + if(is_asym_quant) + { + if (out.scalar_type() == ScalarType::Byte) + { + uint8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_asym8u( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, zero_point_data, quant_min, quant_max); + } + else if (out.scalar_type() == ScalarType::Char) + { + int8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_asym8( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, zero_point_data, quant_min, quant_max); + } + else if (out.scalar_type() == (ScalarType)Ushort) + { + uint16_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_asym16u( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, zero_point_data, quant_min, quant_max); + } + else if (out.scalar_type() == ScalarType::Short) + { + int16_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_asym16( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, zero_point_data, quant_min, quant_max); + } + else if (out.scalar_type() == (ScalarType)Bits4u) + { + uint8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_asym4u( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, zero_point_data, quant_min, quant_max); + } + else if (out.scalar_type() == (ScalarType)Bits4) + { + int8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_asym4( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, zero_point_data, quant_min, quant_max); + } + else + { + if(axis == NULL) + { + // Vector quantization + // calculate the 
quantized input + #define ASYM_QUANTIZE_IMPL_TENSOR(IN_CTYPE, OUT_CTYPE, out_dtype) \ + case ScalarType::out_dtype: { \ + /* Hoist these function calls out of our inner loop because they might not \ + * get inlined without LTO, particularly in ATen mode. */ \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + const auto input_numel = input.numel(); \ + for (size_t i = 0; i < input_numel; i++) { \ + IN_CTYPE value = input_data_ptr[i]; \ + out_data_ptr[i] = quantize_val( \ + (double)*scale_data, (int64_t)*zero_point_data, value, \ + (int64_t)quant_min, (int64_t)quant_max); \ + } \ + } break; + #define ASYM_CALCULATE_FLOAT_TYPE_TENSOR(IN_CTYPE, in_dtype) \ + case ScalarType::in_dtype: \ + switch (out.scalar_type()) { \ + ET_FORALL_INT_TYPES_WITH(IN_CTYPE, ASYM_QUANTIZE_IMPL_TENSOR); \ + ASYM_QUANTIZE_IMPL_TENSOR(IN_CTYPE, uint16_t, Bits16) \ + default: \ + ET_CHECK_MSG( \ + false, \ + "Unhandled output dtype %" PRId8, \ + static_cast(out.scalar_type())); \ + } \ + break; + + switch (input.scalar_type()) { + ET_FORALL_FLOAT_TYPES(ASYM_CALCULATE_FLOAT_TYPE_TENSOR); + default: + ET_CHECK_MSG( + false, + "Unhandled input dtype %" PRId8, + static_cast(input.scalar_type())); + } + + } + else + { + // Channel based quantization + // a list contains all dimensions except axis + int64_t dims[input.dim() - 1]; + for (int64_t i = 0; i < input.dim() - 1; i++) + { + if (i < *axis) + { + dims[i] = i; + } + else + { + dims[i] = i + 1; + } + } + + exec_aten::optional> optional_dim_list{ + exec_aten::ArrayRef{dims, size_t(input.dim() - 1)}}; + + // Actual quantization logic + // input, out are the input and output tensors + // channel_ix is the index along the axis dimension. 0 <= channel_ix < + // input.size(axis). + // i.e. if the tensor has shape (N,C,H,W), axis being 1, then channel_ix + // will be 0, 1, 2, ... C-1 + // in_ix is the flat index of the element you are quantizing. 
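+ // (Illustrative example: for a (2,3,4,5) input with axis = 1, channel_ix runs 0..2
+ // and, for each value, in_ix sweeps the 2*4*5 positions belonging to that channel;
+ // every element is mapped via quantize_val, i.e.
+ // clamp(nearbyint(value / scale_data[channel_ix]) + zero_point_data[channel_ix], quant_min, quant_max).)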
+ // in other words you are quantizing in_data[in_ix] + #define ASYM_QUANTIZE_IMPL_CHANNEL(CTYPE_IN, CTYPE_OUT, out_dtype) \ + case ScalarType::out_dtype: \ + for (size_t channel_ix = 0; channel_ix < input.size(*axis); ++channel_ix) { \ + double _scale = (double)scale_data[channel_ix]; \ + int64_t _zero_point = (int64_t)zero_point_data[channel_ix]; \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + torch::executor::apply_over_dim_list( \ + [input_data_ptr, \ + out_data_ptr, \ + _scale, \ + _zero_point, \ + quant_min, \ + quant_max](size_t in_ix) { \ + out_data_ptr[in_ix] = quantize_val( \ + _scale, \ + _zero_point, \ + input_data_ptr[in_ix], \ + quant_min, \ + quant_max); \ + }, \ + input, \ + optional_dim_list, \ + channel_ix); \ + } \ + break; + #define ASYM_CALCULATE_FLOAT_TYPE_CHANNEL(CTYPE_IN, in_dtype) \ + case ScalarType::in_dtype: \ + switch (out.scalar_type()) { \ + ET_FORALL_INT_TYPES_WITH(CTYPE_IN, ASYM_QUANTIZE_IMPL_CHANNEL); \ + ASYM_QUANTIZE_IMPL_CHANNEL(CTYPE_IN, uint16_t, Bits16) \ + default: \ + ET_CHECK_MSG( \ + false, \ + "Unhandled output dtype %" PRId8, \ + static_cast(out.scalar_type())); \ + } \ + break; + + switch (input.scalar_type()) { + ET_FORALL_FLOAT_TYPES(ASYM_CALCULATE_FLOAT_TYPE_CHANNEL); + default: + ET_CHECK_MSG( + false, + "Unhandled input dtype %" PRId8, + static_cast(input.scalar_type())); + } + } + + #undef ASYM_CALCULATE_FLOAT_TYPE_TENSOR + #undef ASYM_CALCULATE_FLOAT_TYPE_CHANNEL + #undef ASYM_ASYM_QUANTIZE_IMPL_CHANNEL_TENSOR + #undef ASYM_ASYM_QUANTIZE_IMPL_CHANNEL_CHANNEL + } + } + else + { + if (out.scalar_type() == ScalarType::Byte) + { + uint8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_sym8u( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, quant_min, quant_max); + } + else if (out.scalar_type() == ScalarType::Char) + { + int8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_sym8( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, quant_min, quant_max); + } + else if (out.scalar_type() == (ScalarType) Ushort) + { + uint16_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_sym16u( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, quant_min, quant_max); + } + else if (out.scalar_type() == ScalarType::Short) + { + int16_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_sym16( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, quant_min, quant_max); + } + else if (out.scalar_type() == (ScalarType) Bits4u) + { + uint8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_sym4u( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, quant_min, quant_max); + } + else if (out.scalar_type() == (ScalarType) Bits4) + { + int8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_sym4( + out_data, input_data, inp_shape, input.dim(), axis, + scale_data, quant_min, quant_max); + } + else + { + if(axis == NULL) + { + // calculate the quantized input + #define SYM_QUANTIZE_IMPL_TENSOR(IN_CTYPE, OUT_CTYPE, out_dtype) \ + case ScalarType::out_dtype: { \ + /* Hoist these function calls out of our inner loop because they might not \ + * get inlined without LTO, particularly in ATen mode. 
*/ \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + const auto input_numel = input.numel(); \ + for (size_t i = 0; i < input_numel; i++) { \ + IN_CTYPE value = input_data_ptr[i]; \ + out_data_ptr[i] = quantize_val( \ + (double)*scale_data, (int64_t)*zero_point_data, value, \ + (int64_t)quant_min, (int64_t)quant_max); \ + } \ + } break; + #define SYM_CALCULATE_FLOAT_TYPE_TENSOR(IN_CTYPE, in_dtype) \ + case ScalarType::in_dtype: \ + switch (out.scalar_type()) { \ + ET_FORALL_INT_TYPES_WITH(IN_CTYPE, SYM_QUANTIZE_IMPL_TENSOR); \ + SYM_QUANTIZE_IMPL_TENSOR(IN_CTYPE, uint16_t, Bits16) \ + default: \ + ET_CHECK_MSG( \ + false, \ + "Unhandled output dtype %" PRId8, \ + static_cast(out.scalar_type())); \ + } \ + break; + + switch (input.scalar_type()) { + ET_FORALL_FLOAT_TYPES(SYM_CALCULATE_FLOAT_TYPE_TENSOR); + default: + ET_CHECK_MSG( + false, + "Unhandled input dtype %" PRId8, + static_cast(input.scalar_type())); + } + + } + else + { + // a list contains all dimensions except axis + int64_t dims[input.dim() - 1]; + for (int64_t i = 0; i < input.dim() - 1; i++) + { + if (i < *axis) + { + dims[i] = i; + } + else + { + dims[i] = i + 1; + } + } + + exec_aten::optional> optional_dim_list{ + exec_aten::ArrayRef{dims, size_t(input.dim() - 1)}}; + + // Actual quantization logic + // input, out are the input and output tensors + // channel_ix is the index along the axis dimension. 0 <= channel_ix < + // input.size(axis). + // i.e. if the tensor has shape (N,C,H,W), axis being 1, then channel_ix + // will be 0, 1, 2, ... C-1 + // in_ix is the flat index of the element you are quantizing. + // in other words you are quantizing in_data[in_ix] + #define SYM_QUANTIZE_IMPL_CHANNEL(CTYPE_IN, CTYPE_OUT, out_dtype) \ + case ScalarType::out_dtype: \ + for (size_t channel_ix = 0; channel_ix < input.size(*axis); ++channel_ix) { \ + double _scale = (double)scale_data[channel_ix]; \ + int64_t _zero_point = (int64_t)zero_point_data[channel_ix]; \ + auto* out_data_ptr = out.mutable_data_ptr(); \ + const auto* input_data_ptr = input.const_data_ptr(); \ + torch::executor::apply_over_dim_list( \ + [input_data_ptr, \ + out_data_ptr, \ + _scale, \ + _zero_point, \ + quant_min, \ + quant_max](size_t in_ix) { \ + out_data_ptr[in_ix] = quantize_val( \ + _scale, \ + _zero_point, \ + input_data_ptr[in_ix], \ + quant_min, \ + quant_max); \ + }, \ + input, \ + optional_dim_list, \ + channel_ix); \ + } \ + break; + #define SYM_CALCULATE_FLOAT_TYPE_CHANNEL(CTYPE_IN, in_dtype) \ + case ScalarType::in_dtype: \ + switch (out.scalar_type()) { \ + ET_FORALL_INT_TYPES_WITH(CTYPE_IN, SYM_QUANTIZE_IMPL_CHANNEL); \ + SYM_QUANTIZE_IMPL_CHANNEL(CTYPE_IN, uint16_t, Bits16) \ + default: \ + ET_CHECK_MSG( \ + false, \ + "Unhandled output dtype %" PRId8, \ + static_cast(out.scalar_type())); \ + } \ + break; + + switch (input.scalar_type()) { + ET_FORALL_FLOAT_TYPES(SYM_CALCULATE_FLOAT_TYPE_CHANNEL); + default: + ET_CHECK_MSG( + false, + "Unhandled input dtype %" PRId8, + static_cast(input.scalar_type())); + } + } + #undef SYM_CALCULATE_FLOAT_TYPE_TENSOR + #undef SYM_CALCULATE_FLOAT_TYPE_CHANNEL + #undef SYM_ASYM_QUANTIZE_IMPL_CHANNEL_TENSOR + #undef SYM_ASYM_QUANTIZE_IMPL_CHANNEL_CHANNEL + } + } +} + +// Quantize the input tensor +Tensor& quantize_per_tensor_out(KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) +{ + torch::executor::Error err = resize_tensor(out, 
input.sizes()); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in quantize_per_tensor_out"); + + check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out); + + float scale_data = (float)scale; + int zero_point_data = (int)zero_point; + quantize_impl(out, + input, + &scale_data, + &zero_point_data, + NULL, + (int) quant_min, + (int) quant_max); + + return out; +} + + +Tensor& quantize_per_tensor_tensor_args_out(KernelRuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) +{ + // Temporary change to allow not fatal failure for now to unblock some + // expected failure tests that are dying instead of failure. Will revisit + // after ET_KERNEL_CHECK is fully implemented and properly allows non fatal + // failures. + if (scale.scalar_type() != ScalarType::Double) + { + context.fail(torch::executor::Error::InvalidArgument); + return out; + } + ET_CHECK_MSG( + scale.scalar_type() == ScalarType::Double, + "Expected scale to be Double tensor received: %" PRId8, + static_cast(scale.scalar_type())); + ET_CHECK_MSG( + zero_point.scalar_type() == ScalarType::Long, + "Expected zero_point to be Long tensor received: %" PRId8, + static_cast(zero_point.scalar_type())); + ET_CHECK_MSG( + scale.numel() == 1, + "Exepcted scale to only have one element received: %zd", + ssize_t(scale.numel())); + ET_CHECK_MSG( + zero_point.numel() == 1, + "Exepcted zero_point to only have one element received: %zd", + ssize_t(zero_point.numel())); + + quantize_per_tensor_out(context, + input, + scale.const_data_ptr()[0], + zero_point.const_data_ptr()[0], + quant_min, + quant_max, + dtype, + out); + + return out; +} + +Tensor& quantize_per_tensor_tensor_args_out(const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) +{ + auto context = torch::executor::RuntimeContext(); + auto& res = quantize_per_tensor_tensor_args_out( + context, input, scale, zero_point, quant_min, quant_max, dtype, out); + ET_CHECK(context.failure_state() == Error::Ok); + return res; +} + +Tensor& quantize_per_channel_out(const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t axis, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) +{ + torch::executor::Error err = resize_tensor(out, input.sizes()); + + // normalize axis + ET_CHECK_MSG( + executorch::runtime::tensor_has_dim(input, axis), + "axis %zd is not legal it should be -input.dim() <= axis < input.dim() %zd", + ssize_t(axis), + ssize_t(input.dim())); + + if (axis < 0) + { + axis += executorch::runtime::nonzero_dim(input); + } + + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in quantize_per_channel_out"); + + ET_CHECK_MSG( + scale.scalar_type() == ScalarType::Double, + "scale.scalar_type() %" PRId8 " is not double type", + static_cast(scale.scalar_type())); + + ET_CHECK_MSG( + scale.numel() == input.size(axis), + "scale.numel() %zd != input.size(axis) %zd", + scale.numel(), + input.size(axis)); + + ET_CHECK_MSG( + zero_point.scalar_type() == ScalarType::Long, + "zero_point.scalar_type() %" PRId8 " is not integer type", + static_cast(zero_point.scalar_type())); + + ET_CHECK_MSG( + zero_point.numel() == input.size(axis), + "zero_point.numel() %zd != input.size(axis) %zd", + zero_point.numel(), + input.size(axis)); + + check_quantize_per_tensor_args(input, 
quant_min, quant_max, dtype, out); + + + const double* scale_dt = scale.const_data_ptr(); + const int64_t* zero_point_dt = zero_point.const_data_ptr(); + + float scale_data[input.size(axis)]; + int zero_point_data[input.size(axis)]; + + for(int i = 0; i < scale.numel(); i++) + { + scale_data[i] = (float)scale_dt[i]; + zero_point_data[i] = (int)zero_point_dt[i]; + } + + int *axis_ptr = (int *)&axis; + + quantize_impl(out, + input, + scale_data, + zero_point_data, + axis_ptr, + (int) quant_min, + (int) quant_max); + + return out; +} + +Tensor& quantize_per_channel_out(KernelRuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t axis, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) +{ + (void)context; + return quantize_per_channel_out( + input, scale, zero_point, axis, quant_min, quant_max, dtype, out); +} + +Tensor& quantize_per_token_out( + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) +{ + size_t num_tokens = 1; + for (size_t i = 0; i < input.dim() - 1; i++) + { + num_tokens *= input.size(i); + } + // This unfortunate change is needed because we compile op_quantize for aten + // mode as well +#ifdef USE_ATEN_LIB + std::vector sizes(2); + sizes[0] = num_tokens; + sizes[1] = input.size(input.dim() - 1); + Tensor reshaped_input = at::from_blob( + input.mutable_data_ptr(), sizes, at::TensorOptions(input.scalar_type())); +#else + std::array input_dim_order{0, 1}; + std::array input_sizes; + input_sizes[0] = num_tokens; + input_sizes[1] = input.size(input.dim() - 1); + std::array input_strides; + executorch::runtime::dim_order_to_stride_nocheck( + input_sizes.data(), input_dim_order.data(), 2, input_strides.data()); + void* input_data = input.mutable_data_ptr(); + torch::executor::TensorImpl reshaped_input_impl = executorch::runtime::etensor::TensorImpl( + input.scalar_type(), + 2, + input_sizes.data(), + input_data, + input_dim_order.data(), + input_strides.data(), + executorch::runtime::TensorShapeDynamism::STATIC); + Tensor reshaped_input(&reshaped_input_impl); + torch::executor::Error err = resize_tensor(out, input.sizes()); + ET_CHECK_MSG( + err == torch::executor::Error::Ok, + "Failed to resize out Tensor in quantize_per_channel_out"); +#endif + + return quantize_per_channel_out( + reshaped_input, scale, zero_point, 0, quant_min, quant_max, dtype, out); +} + +Tensor& quantize_per_token_out( + KernelRuntimeContext& context, + const Tensor& input, + const Tensor& scale, + const Tensor& zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) +{ + (void)context; + return quantize_per_token_out( + input, scale, zero_point, quant_min, quant_max, dtype, out); +} + +}; // namespace native +}; // namespace FusionG3 +}; // namespace impl \ No newline at end of file diff --git a/backends/cadence/fusion_g3/operators/op_softmax.cpp b/backends/cadence/fusion_g3/operators/op_softmax.cpp new file mode 100644 index 00000000000..01dcdfafa9d --- /dev/null +++ b/backends/cadence/fusion_g3/operators/op_softmax.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::KernelRuntimeContext; +using torch::executor::Error; + +namespace impl { +namespace FusionG3 { +namespace native { + +Tensor& softmax_out( + KernelRuntimeContext& ctx, + const Tensor& in, + int64_t dim, + bool half_to_float, + Tensor& out) +{ + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + torch::executor::check_softmax_args(in, dim, half_to_float, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, executorch::runtime::tensors_have_same_dim_order(in, out), InvalidArgument, out); + + // Adjust for negative dim + dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim; + + int inp_shapes[in.dim()]; + const exec_aten::ArrayRef in_size = in.sizes(); + for(int i = 0; i < in.dim(); i++) + { + inp_shapes[i] = in_size[i]; + } + + if(out.scalar_type() == ScalarType::Float) + { + const float * const inp_data = in.const_data_ptr(); + float * const out_data = out.mutable_data_ptr(); + int axis = dim; + xa_nn_softmax_f32_f32(out_data, inp_data, inp_shapes, + in.dim(), &axis); + } + else + { + ET_SWITCH_FLOATH_TYPES(in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + torch::executor::apply_over_dim( + [in_data, out_data]( + const size_t size, const size_t stride, const size_t base) { + // calculate max in softmax dim. During softmax computation each + // value is subtracted by the maximum in value before calling exp + // to preserve numerical stability. + const CTYPE max_in = torch::executor::apply_unary_reduce_fn( + [](const CTYPE val_in, CTYPE val_accum) { + return std::max(val_in, val_accum); + }, + in_data + base, + size, + stride); + + const CTYPE temp_sum = torch::executor:: + apply_unary_map_reduce_fn( + [max_in](const CTYPE val_in) { + return std::exp(val_in - max_in); + }, + [](const CTYPE mapped_in, CTYPE val_accum) { + return val_accum + mapped_in; + }, + in_data + base, + size, + stride); + + torch::executor::apply_unary_map_fn( + [max_in, temp_sum](const CTYPE val_in) { + return std::exp(val_in - max_in) / temp_sum; + }, + in_data + base, + out_data + base, + size, + stride); + }, + in, + dim); + }); + } + + return out; +} + +} // namespace native +} // namespace FusionG3 +} // namespace impl \ No newline at end of file diff --git a/backends/cadence/fusion_g3/third-party/nnlib/CMakeLists.txt b/backends/cadence/fusion_g3/third-party/nnlib/CMakeLists.txt new file mode 100644 index 00000000000..dd88be148a4 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.10.0) +project(cadence_nnlib) + +add_custom_target( + nnlib_target ALL + COMMAND + make install_nnlib -f makefile -C + ${EXECUTORCH_ROOT}/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build + OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/obj + LIBDIR=${CMAKE_CURRENT_BINARY_DIR}/lib -j8 +) + +add_library(xa_nnlib STATIC IMPORTED GLOBAL) +add_dependencies(xa_nnlib nnlib_target) + +set_property( + TARGET xa_nnlib PROPERTY IMPORTED_LOCATION + "${CMAKE_CURRENT_BINARY_DIR}/lib/xa_nnlib.a" +) diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_api_defs.h 
b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_api_defs.h new file mode 100644 index 00000000000..daf56bc0e90 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_api_defs.h @@ -0,0 +1,51 @@ +/******************************************************************************* +* Copyright (c) 2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ + +#ifndef __XA_API_DEFS_H__ +#define __XA_API_DEFS_H__ + +/*****************************************************************************/ +/* Constant hash defines */ +/*****************************************************************************/ +/* API version */ +#define XA_APIVERSION_MAJOR 1 +#define XA_APIVERSION_MINOR 0 + +/* last compatible version */ +/* sometimes a new API version is just for a bugfix, or a added feature in */ +/* this case it is better to use a newer version even though a library was */ +/* made for an older version, library API can then be upgraded to newer API */ +/* version after checking for compatibility or by adding features */ +#define XA_LASTCOMP_APIVERSION_MAJOR 1 +#define XA_LASTCOMP_APIVERSION_MINOR 0 + +#define XA_STR(str) #str +#define XA_MAKE_VERSION_STR(maj, min) XA_STR(maj) "." XA_STR(min) +#define XA_APIVERSION XA_MAKE_VERSION_STR(\ + XA_APIVERSION_MAJOR, \ + XA_APIVERSION_MINOR) + +#define XA_LAST_COMP_APIVERSION XA_MAKE_VERSION_STR(\ + XA_LASTCOMP_APIVERSION_MAJOR, \ + XA_LASTCOMP_APIVERSION_MINOR) + +#endif /* __XA_API_DEFS_H__ */ diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_common_internal.h b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_common_internal.h new file mode 100644 index 00000000000..4aff9d5b7ae --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_common_internal.h @@ -0,0 +1,149 @@ +/******************************************************************************* +* Copyright (c) 2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ +#ifndef __XA_NNLIB_COMMON_INTERNAL_H__ +#define __XA_NNLIB_COMMON_INTERNAL_H__ + +#include +#include +#include +#include +#include + +/* floating point units detection flags on PDXNX/Fusion G3/Fusion G6 cores */ +#if defined (PDX_MUL_MXF32) +#define HAVE_SP_VFPU 1 /* single precision FPU is selected */ +#else +#define HAVE_SP_VFPU 0 /* single precision FPU is NOT selected */ +#endif +#if defined (PDX_MUL_M2XF64) +#define HAVE_DP_VFPU 1 /* double precision FPU is selected */ +#else +#define HAVE_DP_VFPU 0 /* double precision FPU is NOT selected */ +#endif +/* scalar FPUs flags */ +#define HAVE_SP_FPU XCHAL_HAVE_FP +#define HAVE_DP_FPU XCHAL_HAVE_DFP + +#if (HAVE_SP_FPU) +#include +#endif +#if (HAVE_DP_FPU) +#include +#endif + +#ifndef PDX_M +#define PDX_M 4 /* SIMD width in 32-bit elements */ +#endif + +#if PDX_M==4 +#define PDX_2M (PDX_M*2) +#define PDX_4M (PDX_M*4) +#define PDX_M2 (PDX_M/2) +#define PDX_M4 (PDX_M/4) +/* log2(PDX_M) */ +#define LOG2_PDX_M 2 +#define LOG2_PDX_4M (LOG2_PDX_M+2) +#define LOG2_PDX_2M (LOG2_PDX_M+1) +#define LOG2_PDX_M2 (LOG2_PDX_M-1) +#define LOG2_PDX_M4 (LOG2_PDX_M-2) +#elif PDX_M==8 +#define PDX_2M (PDX_M*2) +#define PDX_4M (PDX_M*4) +#define PDX_M2 (PDX_M/2) +#define PDX_M4 (PDX_M/4) +/* log2(PDX_M) */ +#define LOG2_PDX_M 3 +#define LOG2_PDX_4M (LOG2_PDX_M+2) +#define LOG2_PDX_2M (LOG2_PDX_M+1) +#define LOG2_PDX_M2 (LOG2_PDX_M-1) +#define LOG2_PDX_M4 (LOG2_PDX_M-2) +#else +#error unsupported PDX_M +#endif + +#define UNSUPPORTED_PARAM -1 +#define MAX_DIMS 5 +#define MASK_LOG2_PDX_4M 15 +#define SIZE_OF_INT sizeof(WORD32) +#define SIZE_OF_INT16 sizeof(WORD16) +#define SIZE_OF_INT8 sizeof(WORD8) +#define SIZE_OF_FLOAT sizeof(FLOAT32) +/* log2(size of int) */ +#define LOG2_SIZE_INT 2 +/* log2(size of float) */ +#define LOG2_SIZE_FLOAT 2 +#define NAN 0x7fc00000 + +#define INT16_LOWER_LIMIT -32768 +#define INT16_UPPER_LIMIT 32767 +#define UINT16_LOWER_LIMIT 0 +#define UINT16_UPPER_LIMIT 65535 +#define INT8_LOWER_LIMIT -128 +#define INT8_UPPER_LIMIT 127 +#define UINT8_LOWER_LIMIT 0 +#define UINT8_UPPER_LIMIT 255 +#define INT4_LOWER_LIMIT -8 +#define INT4_UPPER_LIMIT 7 +#define UINT4_LOWER_LIMIT 0 +#define UINT4_UPPER_LIMIT 15 + +#define SHIFT_FACTOR_4_BIT 4 +#define SCALE_FACTOR_4_BIT 16 + +/* Macros for constants */ +#define CONST_ONE 1 +#define CONST_TWO 2 +#define CONST_THREE 3 +#define CONST_FOUR 4 +#define CONST_FIVE 5 +#define CONST_SIX 6 + +#define LOOP_UNROLL_BY_8 8 +#define IS_NOT_32_MULTIPLE 31 +#define SEL_INDEX 30 + +#define Q24_SHIFT_BITS 24 // Bit shift for Q24 representation +#define FRACTIONAL_COMPONENT_SHIFT 22 // Bit shift for fractional component extraction +#define EXPONENT_SHIFT_BITS 54 // Bit shift for extracting exponent +#define POLYNOMIAL_APPROXIMATION_SHIFT 31 // Bit shift for polynomial approximation +#define EXPONENT_BIAS 127 // Bias for exponent in floating-point representation +#define Q31_SHIFT_BITS 30 // Bit shift for Q31 representation +#define Q24_SHIFT_BITS_MINUS_ONE 23 + 
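+/* Illustrative note on the fixed-point shift constants above (assumed Qm.n
+ * convention): an integer v with n fractional bits represents v / 2^n, so a
+ * float x maps to roughly (WORD32)(x * (1 << n)); e.g. 1.5f with
+ * n == Q24_SHIFT_BITS (24) becomes 25165824. */
+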
+#define IS_ALIGN(p) ((((int)(p))&0x7) == 0) +#define ALIGN(x) __attribute__((aligned(x))) +#define ALIGN_PDX_4M __attribute__((aligned(PDX_4M))) /* alignment on PDX_M*4 byte boundary */ + +/*----------------------------------------------------- + Common constants +-----------------------------------------------------*/ + +#define M_PI_FLT 3.14159265358979323846f +#define M_PI_DBL 3.14159265358979323846 + +#ifdef __cplusplus +#define externC extern "C" +#else +#define externC extern +#endif + +#endif /* __XA_NNLIB_COMMON_INTERNAL_H__ */ diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_definitions.h b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_definitions.h new file mode 100644 index 00000000000..3a132b72c79 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_definitions.h @@ -0,0 +1,46 @@ +/******************************************************************************* +* Copyright (c) 2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ + + +#ifndef __XA_NNLIB_DEFINITIONS_H__ +#define __XA_NNLIB_DEFINITIONS_H__ + +#include "xa_api_defs.h" + +/* Identification Strings */ + +#define LIBNAME "FusionG3 Neural Network Library" +#define LIBVERSION "1.0.0" + +#define LIB_APIVERSION_MAJOR 1 +#define LIB_APIVERSION_MINOR 0 + +#if LIB_APIVERSION_MAJOR != XA_APIVERSION_MAJOR || \ +LIB_APIVERSION_MINOR != XA_APIVERSION_MINOR +#error "Version Mismatch" +#endif + +#define LIB_APIVERSION XA_MAKE_VERSION_STR(\ + LIB_APIVERSION_MAJOR, \ + LIB_APIVERSION_MINOR) + +#endif /* __XA_NNLIB_DEFINITIONS_H__ */ diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h new file mode 100644 index 00000000000..6694921defe --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h @@ -0,0 +1,65 @@ +/******************************************************************************* +* Copyright (c) 2024 Cadence Design Systems, Inc. 
+* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ + +#ifndef __XA_NNLIB_ERR_CHK_H__ +#define __XA_NNLIB_ERR_CHK_H__ + +#ifndef NULL +#define NULL (void *)0 +#endif /* NULL */ + +#ifndef DISABLE_ARG_CHK + +#define XA_NNLIB_ARG_CHK_PTR(_ptr, _err) \ +do { \ + if((_ptr) == NULL) return (_err); \ +} while(0) + +#define XA_NNLIB_ARG_CHK_ALIGN(_ptr, _align, _err) \ +do { \ + if(((unsigned int)(_ptr) & ((_align) - 1)) != 0) return (_err); \ +} while(0) + +#define XA_NNLIB_ARG_CHK_COND(_cond, _err) \ +do { \ + if((_cond)) return (_err); \ +} while(0) + +#define XA_NNLIB_ARG_CLIP(_val, _min, _max) \ +do { \ + if(_val < _min){ \ + _val = _min; \ + } \ + if(_val > _max){ \ + _val = _max; \ + } \ +} while(0) + +#else /* DISABLE_ARG_CHK */ + +#define XA_NNLIB_ARG_CHK_PTR(_ptr, _err) +#define XA_NNLIB_ARG_CHK_ALIGN(_ptr, _align, _err) +#define XA_NNLIB_ARG_CHK_COND(_cond, _err) +#define XA_NNLIB_ARG_CLIP(_val,_min, _max) + +#endif /* DISABLE_ARG_CHK */ +#endif /* __XA_NNLIB_ERR_CHK_H__ */ diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/src/xa_nnlib_common_api.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/src/xa_nnlib_common_api.c new file mode 100644 index 00000000000..bb4f8966103 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/common/src/xa_nnlib_common_api.c @@ -0,0 +1,43 @@ +/******************************************************************************* +* Copyright (c) 2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ + +#include "xa_nnlib_definitions.h" + +const char lib_name[] = LIBNAME; +const char lib_ver[] = LIBVERSION; +const char api_ver[] = LIB_APIVERSION; + + +const char * xa_nnlib_get_lib_name_string(void) +{ + return lib_name; +} + +const char * xa_nnlib_get_lib_version_string(void) +{ + return lib_ver; +} + +const char * xa_nnlib_get_lib_api_version_string(void) +{ + return api_ver; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/activations/xa_nn_softmax.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/activations/xa_nn_softmax.c new file mode 100644 index 00000000000..596413c1741 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/activations/xa_nn_softmax.c @@ -0,0 +1,508 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "expf_tbl.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +#define EXPONENT(x0, out_val) \ +{ \ + xb_vecMxf32 approx; \ + xb_vecMx32 in_int, frac, exp, temp, exp0, exp1; \ + in_int = PDX_TRUNC32_MXF32(x0, Q24_SHIFT_BITS); \ + xb_vecMx80 temp0 = PDX_MULW_MX32(in_int, invln2_q30); \ + xb_vecMx80 temp1 = PDX_SRAI_MX80(temp0, FRACTIONAL_COMPONENT_SHIFT); \ + frac = PDX_SRLI_MX32(PDX_PACKV_MX80(temp1), CONST_ONE); \ + temp1 = PDX_SRAI_MX80(temp0, EXPONENT_SHIFT_BITS); \ + exp = PDX_PACKV_MX80(temp1); \ + xb_vecMx32 f2 = PDX_PACKSIV_MX80(PDX_MULW_MX32(frac, frac), \ + POLYNOMIAL_APPROXIMATION_SHIFT); \ + xb_vecMx32 y1 = PDX_LSR_32_I(expftbl_q30, 0); \ + xb_vecMx32 y2 = PDX_LSR_32_I(expftbl_q30, SIZE_OF_INT); \ + xb_vecMx32 c1, c2; \ + c1 = PDX_LSR_32_I(expftbl_q30, CONST_TWO * SIZE_OF_INT); \ + temp = PDX_PACKSIV_MX80(PDX_MULW_MX32(f2, y1), POLYNOMIAL_APPROXIMATION_SHIFT); \ + y1 = PDX_ADD_MX32(c1, temp); \ + c2 = PDX_LSR_32_I(expftbl_q30, CONST_THREE * SIZE_OF_INT); \ + temp = PDX_PACKSIV_MX80(PDX_MULW_MX32(f2, y2), POLYNOMIAL_APPROXIMATION_SHIFT); \ + y2 = PDX_ADD_MX32(c2, temp); \ + c1 = PDX_LSR_32_I(expftbl_q30, CONST_FOUR * SIZE_OF_INT); \ + temp = PDX_PACKSIV_MX80(PDX_MULW_MX32(f2, y1), POLYNOMIAL_APPROXIMATION_SHIFT); \ + y1 = PDX_ADD_MX32(c1, temp); \ + c2 = PDX_LSR_32_I(expftbl_q30, CONST_FIVE * SIZE_OF_INT); \ + temp = PDX_PACKSIV_MX80(PDX_MULW_MX32(f2, y2), POLYNOMIAL_APPROXIMATION_SHIFT); \ + y2 = PDX_ADD_MX32(c2, temp); \ + c1 = PDX_LSR_32_I(expftbl_q30, CONST_SIX * SIZE_OF_INT); \ + temp = PDX_PACKSIV_MX80(PDX_MULW_MX32(f2, y1), POLYNOMIAL_APPROXIMATION_SHIFT); \ + y1 = PDX_ADD_MX32(c1, temp); \ + xb_vecMx32 g = PDX_ADD_MX32(y1, PDX_PACKSIV_MX80(PDX_MULW_MX32(frac, y2), \ + POLYNOMIAL_APPROXIMATION_SHIFT)); \ + approx = PDX_FLOATF32_MX32(g, Q31_SHIFT_BITS); \ + exp1 = PDX_SRAI_MX32(exp, CONST_ONE); \ + exp0 = PDX_SUB_MX32(exp, exp1); \ + exp0 = PDX_ADD_MX32(EXPONENT_BIAS, exp0); \ + exp1 = PDX_ADD_MX32(EXPONENT_BIAS, exp1); \ + exp0 = PDX_SLLI_MX32(exp0, Q24_SHIFT_BITS_MINUS_ONE); \ + exp1 = PDX_SLLI_MX32(exp1, Q24_SHIFT_BITS_MINUS_ONE); \ + xb_vecMxf32 scale0 = PDX_MOV_MXF32_FROM_4MX8(PDX_MOV_4MX8_FROM_MX32(exp0)); \ + xb_vecMxf32 scale1 = PDX_MOV_MXF32_FROM_4MX8(PDX_MOV_4MX8_FROM_MX32(exp1)); \ + out_val = PDX_MUL_MXF32(approx, scale0); \ + out_val = PDX_MUL_MXF32(out_val, scale1); \ +} + +WORD32 xa_nn_softmax_f32_f32(FLOAT32 *p_out, + const FLOAT32 *p_inp, + const WORD32 *p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis) +{ + WORD32 i, j, dim = 0; + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_inp_dims <= 0), UNSUPPORTED_PARAM); + if (p_axis != NULL) + { + XA_NNLIB_ARG_CHK_COND((*p_axis < 0), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((*p_axis >= num_inp_dims), UNSUPPORTED_PARAM); + } + + for (i = 0; i < num_inp_dims; i++) + { + XA_NNLIB_ARG_CHK_COND((p_inp_shape[i] <= 0), UNSUPPORTED_PARAM); + } + + /* Leading dimensions across which softmax calculation is repeated */ + WORD32 leading_dim = CONST_ONE; + /* number of elements for which softmax is computed */ + WORD32 inner_count = CONST_ONE; + /* Stride with which the elements are loaded */ + WORD32 inner_stride = CONST_ONE; + + if (p_axis != NULL) + { + dim = *p_axis; + 
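+ /* Worked example (illustrative): with p_inp_shape = {2, 3, 4, 5} and
+ *p_axis = 2, the code below gives inner_count = 4, leading_dim = 2 * 3 = 6
+ and inner_stride = 5; for each of the 6 * 5 (leading, stride) pairs the
+ kernel computes softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x)),
+ subtracting the running maximum before exp for numerical stability. */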
inner_count = p_inp_shape[dim]; + + /* Calculate number of elements of leading dimensions */ + for (int i = 0; i < dim; i++) + { + leading_dim *= p_inp_shape[i]; + } + + for (int i = dim + 1; i < num_inp_dims; i++) + { + inner_stride *= p_inp_shape[i]; + } + } + else /* if p_axis is NULL, then softmax is calculated over entire input dimensions */ + { + for (i = 0; i < num_inp_dims; i++) + { + inner_count *= p_inp_shape[i]; + } + } + + if (inner_stride == CONST_ONE) + { + xb_vecMxf32 x0, x1, max_vec; + xb_vecMxf32 *restrict p_out_mxf32 = (xb_vecMxf32 *)p_out; + xb_vecMxf32 *restrict p_out_exp_mxf32 = (xb_vecMxf32 *)p_out; + xb_vecMxf32 *restrict p_out_softmax_mxf32 = p_out_exp_mxf32; + + /* Calculate number of remaining elements after processing inner loop*/ + WORD32 rem_elem_bytes = (inner_count & (PDX_M - 1)) << LOG2_SIZE_FLOAT; + + /* Set 4-way vboolM vector bit to true based on remaining elements */ + xb_vecMx32 rem_elem_vec = rem_elem_bytes; + xb_vecMx32 list_rem_bytes = {4, 8, 12, 16}; + + /* Set flag only for the remaining elements */ + vboolM bool_vec = PDX_GE_MX32(rem_elem_vec, list_rem_bytes); + + const xb_vecMxf32 *restrict p_in_exp_mxf32 = (const xb_vecMxf32 *)p_inp; + valign ax_inp = PDX_LA_MXF32_PP(p_in_exp_mxf32); + + /* Loop count of maximum value calculation*/ + WORD32 count = inner_count - PDX_M; + + /* Offset from base address to load inputs for maximum value calculation */ + WORD32 offset2 = ((count - (count & (PDX_2M - 1))) >> 1) + PDX_M; + + const FLOAT32 *p_inp_out_itr = p_inp; + for (i = 0; i < leading_dim; i++) + { + const xb_vecMxf32 *restrict p_in_mxf32 = (const xb_vecMxf32 *)p_inp_out_itr; + const xb_vecMxf32 *restrict p_in_max_mxf32 = + (const xb_vecMxf32 *)(p_inp_out_itr + offset2); + + p_inp_out_itr += inner_count; + + /* Align load priming of input */ + valign ax = PDX_LA_MXF32_PP(p_in_mxf32); + valign ax_max = PDX_LA_MXF32_PP(p_in_max_mxf32); + + FLOAT32 max_elem = 0; + + /* Aligning load input (4-way) */ + PDX_LA_MXF32_IP(max_vec, ax, p_in_mxf32); + + /* Calculate maximum value among elements for which softmax will be computed */ + + /* Loop runs for inner_count/8 iterations */ + for (j = 0; j < (count >> LOG2_PDX_2M); j++) + { + /* Load input (4-way) */ + PDX_LA_MXF32_IP(x0, ax, p_in_mxf32); + PDX_LA_MXF32_IP(x1, ax_max, p_in_max_mxf32); + + /* Calculate max value 4-way */ + x0 = PDX_MAXNUM_MXF32(x0, x1); + max_vec = PDX_MAXNUM_MXF32(max_vec, x0); + } + + if ((count & (PDX_2M - 1)) >= PDX_M) + { + /* Load input (4-way) */ + PDX_LA_MXF32_IP(x1, ax_max, p_in_max_mxf32); + + /* Calculate max value 4-way */ + max_vec = PDX_MAXNUM_MXF32(max_vec, x1); + } + + vboolM a; + + PDX_LAV_MXF32_XP(x0, ax_max, p_in_max_mxf32, rem_elem_bytes); + PDX_MAXNUM_MXF32_T(max_vec, max_vec, x0, bool_vec); + + PDX_RBMAXNUM_MXF32(a, max_elem, max_vec); + max_vec = max_elem; + + valign align_z = PDX_Z_ALIGN(); + + FLOAT32 exp_sum = 0; + xb_vecMxf32 exp_sum_mxf32 = PDX_ZERO_MXF32(); + xb_vecMxf32 out_val; + xb_vecMxf32 inv_exp_sum_mxf32; + + /* Calculate exponent of each element */ + for (j = 0; j < inner_count >> LOG2_PDX_M; j++) + { + /* Aligning load input (4-way) */ + PDX_LA_MXF32_IP(x0, ax_inp, p_in_exp_mxf32); + + /* Sub max value from each input element */ + x0 = PDX_SUB_MXF32(x0, max_vec); + + /* Calculate exponent */ + EXPONENT(x0, out_val); + + /* Accumulate the exp values */ + exp_sum_mxf32 = PDX_ADD_MXF32(exp_sum_mxf32, out_val); + + /* Store output */ + PDX_SA_MXF32_IP(out_val, align_z, p_out_mxf32); + } + + if (rem_elem_bytes > 0) + { + /* Load remaining input data */ + 
PDX_LAV_MXF32_XP(x0, ax_inp, p_in_exp_mxf32, rem_elem_bytes); + + /* x[j] - mean_value */ + PDX_SUB_MXF32_T(x0, x0, max_vec, bool_vec); + + /* Calculate exponent */ + EXPONENT(x0, out_val); + + /* Accumulate the exp values */ + PDX_ADD_MXF32_T(exp_sum_mxf32, exp_sum_mxf32, out_val, bool_vec); + + /* Store the normalized data */ + PDX_SAV_MXF32_XP(out_val, align_z, p_out_mxf32, rem_elem_bytes); + } + + PDX_SAPOS_MXF32_FP(align_z, p_out_mxf32); + + exp_sum = PDX_RADD_MXF32(exp_sum_mxf32); + + FLOAT32 inv_exp_sum; + PDX_DIV_F32_T(inv_exp_sum, 1, exp_sum, 1); + inv_exp_sum_mxf32 = inv_exp_sum; + + /* Align load priming of output */ + valign a_out = PDX_LA_MXF32_PP(p_out_exp_mxf32); + + align_z = PDX_Z_ALIGN(); + + /* Compute softmax for each element */ + for (j = 0; j < inner_count >> LOG2_PDX_M; j++) + { + /* Load exp values of each element (4-way) */ + PDX_LA_MXF32_IP(x0, a_out, p_out_exp_mxf32); + + /* Calculate the softmax */ + x0 = PDX_MUL_MXF32(x0, inv_exp_sum_mxf32); + + /* Store the softmax */ + PDX_SA_MXF32_IP(x0, align_z, p_out_softmax_mxf32); + } + + /* Load remaining input data */ + PDX_LAV_MXF32_XP(x0, a_out, p_out_exp_mxf32, rem_elem_bytes); + + /* Calculate the softmax */ + x0 = PDX_MUL_MXF32(x0, inv_exp_sum_mxf32); + + /* Store the softmax */ + PDX_SAV_MXF32_XP(x0, align_z, p_out_softmax_mxf32, rem_elem_bytes); + PDX_SAPOS_MXF32_FP(align_z, p_out_softmax_mxf32); + } + } + else + { + WORD32 rem_elem_bytes; + valign ax, ax_inp; + xb_vecMxf32 x0, x1, max_vec; + WORD32 k; + WORD32 offset = inner_stride * inner_count; + WORD32 inner_stride_bytes = inner_stride << LOG2_SIZE_FLOAT; + + const FLOAT32 *p_inp1 = p_inp; + const FLOAT32 *p_out1 = p_out; + /* number of remaining elements to be processed */ + WORD32 rem_elem = (inner_stride & (PDX_M - 1)); + xb_vecMxf32 *restrict p_out_mxf32; + + for (i = 0; i < leading_dim; i++) + { + const FLOAT32 *p_inp2 = p_inp1; + const FLOAT32 *p_out2 = p_out1; + for (j = 0; j < inner_stride - rem_elem; j += 4) + { + p_inp2 = p_inp1 + j; + p_out2 = p_out1 + j; + + const FLOAT32 *p_inp3; + const xb_vecMxf32 *restrict p_in_mxf32 = + (const xb_vecMxf32 *)(p_inp2); + const xb_vecMxf32 *restrict p_in_max_mxf32 = + (const xb_vecMxf32 *)(p_inp2 + inner_stride); + + ax = PDX_LA_MXF32_PP(p_in_mxf32); + PDX_LA_MXF32_XP(max_vec, ax, p_in_mxf32, inner_stride_bytes * 2); + + /* inner_count -> group of elements on which softmax is computed */ + for (k = 0; k < (inner_count - 1) >> 1; k++) + { + /* Align load priming of input */ + ax_inp = PDX_LA_MXF32_PP(p_in_max_mxf32); + ax = PDX_LA_MXF32_PP(p_in_mxf32); + + /* Load input elements with stride "inner_stride" */ + PDX_LA_MXF32_XP(x1, ax_inp, p_in_max_mxf32, + inner_stride_bytes * 2); + PDX_LA_MXF32_XP(x0, ax, p_in_mxf32, inner_stride_bytes * 2); + + /* Calculate maximum across each lane of vector */ + x0 = PDX_MAXNUM_MXF32(x0, x1); + max_vec = PDX_MAXNUM_MXF32(x0, max_vec); + } + + WORD32 rem = ((inner_count - 1) & (1)); + + if (rem) + { + /* Align load priming of input */ + ax = PDX_LA_MXF32_PP(p_in_max_mxf32); + + /* Load input elements with stride "inner_stride" */ + PDX_LA_MXF32_XP(x0, ax, p_in_max_mxf32, inner_stride_bytes * 2); + + /* Calculate maximum across each lane of vector */ + max_vec = PDX_MAXNUM_MXF32(x0, max_vec); + } + + /* Calculate exponent of each element */ + xb_vecMxf32 exp_sum_mxf32 = PDX_ZERO_MXF32(); + valign align_z = PDX_Z_ALIGN(); + xb_vecMxf32 out_val; + p_inp3 = p_inp2; + const FLOAT32 *p_out3 = p_out2; + + p_in_mxf32 = (const xb_vecMxf32 *)(p_inp3); + + for (k = 0; k < inner_count; 
k++) + { + p_out_mxf32 = (xb_vecMxf32 *)p_out3; + /* Align load priming of input */ + ax = PDX_LA_MXF32_PP(p_in_mxf32); + + /* Load input elements with stride "inner_stride" */ + PDX_LA_MXF32_XP(x0, ax, p_in_mxf32, inner_stride_bytes); + + /* Sub max value from each input element */ + x0 = PDX_SUB_MXF32(x0, max_vec); + + /* Calculate exponent */ + EXPONENT(x0, out_val); + + /* Accumulate the exp values */ + exp_sum_mxf32 = PDX_ADD_MXF32(exp_sum_mxf32, out_val); + + /* Store output */ + PDX_SA_MXF32_IP(out_val, align_z, p_out_mxf32); + PDX_SAPOS_MXF32_FP(align_z, p_out_mxf32); + p_out3 += inner_stride; + } + + xb_vecMxf32 inv_exp_sum_mxf32; + inv_exp_sum_mxf32 = PDX_DIV_MXF32(1, exp_sum_mxf32); + + /* Compute softmax */ + align_z = PDX_Z_ALIGN(); + const xb_vecMxf32 *restrict p_out_exp_mxf32 = (xb_vecMxf32 *)(p_out2); + + for (k = 0; k < inner_count; k++) + { + xb_vecMxf32 *restrict p_out_softmax_mxf32 = (xb_vecMxf32 *)p_out2; + /* Align load priming */ + ax = PDX_LA_MXF32_PP(p_out_exp_mxf32); + + /* Aligning load exp values of each element (4-way) */ + PDX_LA_MXF32_XP(x0, ax, p_out_exp_mxf32, inner_stride_bytes); + + /* Calculate the softmax */ + x0 = PDX_MUL_MXF32(x0, inv_exp_sum_mxf32); + + /* Store the softmax */ + PDX_SA_MXF32_IP(x0, align_z, p_out_softmax_mxf32); + PDX_SAPOS_MXF32_FP(align_z, p_out_softmax_mxf32); + p_out2 += inner_stride; + } + } + + /* Process remaining elements */ + rem_elem_bytes = rem_elem * SIZE_OF_FLOAT; + p_inp2 = p_inp1 + j; + p_out2 = p_out1 + j; + + const FLOAT32 *p_inp3 = p_inp2; + const xb_vecMxf32 *restrict p_in_mxf32 = (const xb_vecMxf32 *)(p_inp3); + + ax = PDX_LA_MXF32_PP(p_in_mxf32); + PDX_LAV_MXF32_XP(max_vec, ax, p_in_mxf32, rem_elem_bytes); + + /* Calculate maximum among group of elements on which softmax is computed */ + for (k = 0; k < inner_count - 1; k++) + { + p_inp3 += inner_stride; + p_in_mxf32 = (const xb_vecMxf32 *)(p_inp3); + + /* Align load priming of input */ + ax = PDX_LA_MXF32_PP(p_in_mxf32); + + /* Load input elements with stride "inner_stride" */ + PDX_LAV_MXF32_XP(x0, ax, p_in_mxf32, rem_elem_bytes); + + /* Calculate maximum across each lane of vector */ + max_vec = PDX_MAXNUM_MXF32(x0, max_vec); + } + + /* Calculate exponent of each element */ + xb_vecMxf32 exp_sum_mxf32 = PDX_ZERO_MXF32(); + valign align_z = PDX_Z_ALIGN(); + xb_vecMxf32 out_val; + p_inp3 = p_inp2; + const FLOAT32 *p_out3 = p_out2; + + /* Calculate exp of group of elements on which softmax is calculated */ + for (k = 0; k < inner_count; k++) + { + /* const Float32 *p_inp3 = p_inp2 + k*inner_stride; */ + const xb_vecMxf32 *restrict p_in_mxf32 = (const xb_vecMxf32 *)(p_inp3); + xb_vecMxf32 *restrict p_out_mxf32 = (xb_vecMxf32 *)p_out3; + + p_inp3 += inner_stride; + p_out3 += inner_stride; + + /* Align load priming of input */ + ax = PDX_LA_MXF32_PP(p_in_mxf32); + + /* Load input elements with stride "inner_stride" */ + PDX_LAV_MXF32_XP(x0, ax, p_in_mxf32, rem_elem_bytes); + + /* Sub max value from each input element */ + x0 = PDX_SUB_MXF32(x0, max_vec); + + /* Calculate exponent */ + EXPONENT(x0, out_val); + + /* Accumulate the exp values */ + exp_sum_mxf32 = PDX_ADD_MXF32(exp_sum_mxf32, out_val); + + /* Store output */ + PDX_SAV_MXF32_XP(out_val, align_z, p_out_mxf32, rem_elem_bytes); + PDX_SAPOS_MXF32_FP(align_z, p_out_mxf32); + } + + xb_vecMxf32 inv_exp_sum_mxf32; + inv_exp_sum_mxf32 = PDX_DIV_MXF32(1, exp_sum_mxf32); + + /* Compute softmax */ + p_inp3 = p_out2; + p_out3 = p_out2; + align_z = PDX_Z_ALIGN(); + + /* Calculate softmax of group of elements */ + for 
(k = 0; k < inner_count; k++) + { + const xb_vecMxf32 *restrict p_out_exp_mxf32 = (const xb_vecMxf32 *)(p_inp3); + xb_vecMxf32 *restrict p_out_softmax_mxf32 = (xb_vecMxf32 *)p_out3; + + p_inp3 += inner_stride; + p_out3 = p_inp3; + /* Align load priming */ + ax = PDX_LA_MXF32_PP(p_out_exp_mxf32); + + /* Load exp values of each element (4-way) */ + PDX_LAV_MXF32_XP(x0, ax, p_out_exp_mxf32, rem_elem_bytes); + + /* Calculate the softmax */ + x0 = PDX_MUL_MXF32(x0, inv_exp_sum_mxf32); + + /* Store the softmax */ + PDX_SAV_MXF32_XP(x0, align_z, p_out_softmax_mxf32, rem_elem_bytes); + PDX_SAPOS_MXF32_FP(align_z, p_out_softmax_mxf32); + } + + p_inp1 = p_inp1 + offset; + p_out1 = p_out1 + offset; + } + } + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_add_32x32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_add_32x32.c new file mode 100644 index 00000000000..3688746295a --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_add_32x32.c @@ -0,0 +1,655 @@ +/******************************************************************************* +* Copyright (c) 2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_add_32x32_32(WORD32 *p_out, + const WORD32 *p_inp1, + const WORD32 *p_inp2, + WORD32 alpha, + WORD32 num_elm) +{ + WORD32 n, m; + + xb_vecMx32 x0, y0, z0; + valign ax, ay, az; + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), UNSUPPORTED_PARAM); + + const xb_vecMx32 *restrict px = (const xb_vecMx32 *)p_inp1; + const xb_vecMx32 *restrict py = (const xb_vecMx32 *)p_inp2; + xb_vecMx32 *restrict pz = (xb_vecMx32 *)p_out; + + /* Move from scalar register to 4-way 32bit vec register with replicate */ + xb_vecMx32 vec_alpha = PDX_MOVR32_A32(alpha); + + /* Align load priming */ + ax = PDX_LA_MX32_PP(px); + ay = PDX_LA_MX32_PP(py); + + /* Zeroing align register */ + az = PDX_Z_ALIGN(); + + /* Loop runs for num_elm /4 iterations */ + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + /* Load 2 inputs */ + PDX_LA_MX32_IP(x0, ax, px); + PDX_LA_MX32_IP(y0, ay, py); + + /* 4-way (Input2 * alpha) */ + y0 = PDX_MUL_MX32(y0, vec_alpha); + /* 4-way Add (Input1 + alpha*Input2) */ + z0 = PDX_ADD_MX32(x0, y0); + /* Aligning store (4-way 32bit elements) + * with post increment addressing + */ + PDX_SA_MX32_IP(z0, az, pz); + } + + /* Process remaining elements */ + m = (num_elm & (PDX_M - CONST_ONE)) << LOG2_SIZE_INT; + PDX_LAV_MX32_XP(x0, ax, px, m); + PDX_LAV_MX32_XP(y0, ay, py, m); + y0 = PDX_MUL_MX32(y0, vec_alpha); + z0 = PDX_ADD_MX32(x0, y0); + PDX_SAV_MX32_XP(z0, az, pz, m); + PDX_SAPOS_MX32_FP(az, pz); + + return 0; +} + +WORD32 xa_nn_elm_add_scalar_32x32_32(WORD32 *p_out, + const WORD32 *p_inp1, + const WORD32 inp2, + WORD32 alpha, + WORD32 num_elm) +{ + WORD32 n, m; + + xb_vecMx32 x0, y0, z0; + valign ax, az; + const xb_vecMx32 *restrict px = (const xb_vecMx32 *)p_inp1; + xb_vecMx32 *restrict pz = (xb_vecMx32 *)p_out; + WORD32 in2 = alpha * inp2; + + /* Vectorize input2 for SIMD */ + y0 = in2; + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), UNSUPPORTED_PARAM); + + /* Align load priming */ + ax = PDX_LA_MX32_PP(px); + + /* Zeroing align register */ + az = PDX_Z_ALIGN(); + + /* loop iterates for multiple of LOG2_PDX_M */ + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + /* Aligning load Input1 */ + PDX_LA_MX32_IP(x0, ax, px); + + /* 4-way Add (Input1 + alpha*Input2) */ + z0 = PDX_ADD_MX32(x0, y0); + + /* Aligning store (4-way 32bit elements) + * with post increment addressing + */ + PDX_SA_MX32_IP(z0, az, pz); + } + + /* Remaining elements after processing the loop */ + m = (num_elm & (PDX_M - CONST_ONE)) << LOG2_SIZE_INT; + + /* Variable aligining load */ + PDX_LAV_MX32_XP(x0, ax, px, m); + + /* Add remaining 
elements */ + z0 = PDX_ADD_MX32(x0, y0); + + /* Variable aligining store and flush */ + PDX_SAV_MX32_XP(z0, az, pz, m); + PDX_SAPOS_MX32_FP(az, pz); + + return 0; +} + +static inline void shapes_convert_5D(WORD32 *const __restrict__ p_5d_out_shape, + WORD32 *const __restrict__ p_5d_inp1_shape, /* new input1 shapes */ + WORD32 *const __restrict__ p_5d_inp2_shape, /* new input2 shapes */ + const WORD32 *const __restrict__ p_out_shape, + const WORD32 *const __restrict__ p_inp1_shape, /* original input1 shapes */ + const WORD32 *const __restrict__ p_inp2_shape, /* original input1 shapes */ + const WORD32 num_inp_dims) +{ + /* Convert number of dimension less than 5D to 5D */ + for (WORD32 i = 0; i < num_inp_dims; i++) + { + p_5d_out_shape[i + MAX_DIMS - num_inp_dims] = p_out_shape[i]; + p_5d_inp1_shape[i + MAX_DIMS - num_inp_dims] = p_inp1_shape[i]; + p_5d_inp2_shape[i + MAX_DIMS - num_inp_dims] = p_inp2_shape[i]; + } +} + +static inline WORD32 check_shapes(const WORD32 *const p_inp1_shape, + const WORD32 *const p_inp2_shape, + const WORD32 *const p_out_shape) +{ + /* Check the shapes of input and output */ + for (WORD32 i = 0; i < MAX_DIMS; i++) + { + if (((p_inp1_shape[i] != p_inp2_shape[i]) + && (p_inp1_shape[i] != CONST_ONE) + && (p_inp2_shape[i] != CONST_ONE)) + || (p_out_shape[i] + != (p_inp1_shape[i] > p_inp2_shape[i] ? + p_inp1_shape[i] : p_inp2_shape[i]))) + { + return UNSUPPORTED_PARAM; + } + } + return 0; +} + +static inline void strides_calculation(const WORD32 *const inp1_shape, + const WORD32 *const inp2_shape, + WORD32 *const inp1_strides, + WORD32 *const inp2_strides) +{ + inp1_strides[MAX_DIMS - CONST_ONE] = CONST_ONE; + inp2_strides[MAX_DIMS - CONST_ONE] = CONST_ONE; + for (WORD32 i = MAX_DIMS - CONST_TWO; i >= 0; i--) + { + inp1_strides[i] = inp1_strides[i + CONST_ONE] + * inp1_shape[i + CONST_ONE]; + inp2_strides[i] = inp2_strides[i + CONST_ONE] + * inp2_shape[i + CONST_ONE]; + } +} + +static inline void internal_elm_add_broadcast_2D_32x32_32( + WORD32 *__restrict__ p_out, + const WORD32 *__restrict__ p_inp1, + const WORD32 *__restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + const WORD32 *input1_shapes, + const WORD32 *input2_shapes, + WORD32 alpha) +{ + + WORD32 n, m; + + xb_vecMx32 x0, x1, y0, y1, z0, z1; + + xb_vecMx32 vec_alpha = PDX_MOVR32_A32(alpha); + + const xb_vecMx32 *__restrict__ px = (const xb_vecMx32 *)&p_inp1[0]; + const xb_vecMx32 *__restrict__ py = (const xb_vecMx32 *)&p_inp2[0]; + + valign ax, ax0, ax1, ay, ay0, ay1, az, az0, az1; + ax = PDX_LA_MX32_PP(px); + ay = PDX_LA_MX32_PP(py); + az = PDX_Z_ALIGN(); + + const WORD32 *px_baseptr = &p_inp1[0]; + const xb_vecMx32 *__restrict__ px0 = (const xb_vecMx32 *)&px_baseptr[0]; + const xb_vecMx32 *__restrict__ px1 = (const xb_vecMx32 *)(&px_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + ax0 = PDX_LA_MX32_PP(px0); + ax1 = PDX_LA_MX32_PP(px1); + + const WORD32 *py_baseptr = &p_inp2[0]; + const xb_vecMx32 *__restrict__ py0 = (const xb_vecMx32 *)&py_baseptr[0]; + const xb_vecMx32 *__restrict__ py1 = (const xb_vecMx32 *)(&py_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + ay0 = PDX_LA_MX32_PP(py0); + ay1 = PDX_LA_MX32_PP(py1); + + WORD32 *pz_baseptr = &p_out[0]; + xb_vecMx32 *__restrict__ pz0 = (xb_vecMx32 *)&pz_baseptr[0]; + xb_vecMx32 *__restrict__ pz1 = (xb_vecMx32 *)(&pz_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + az0 = PDX_Z_ALIGN(); + az1 = PDX_Z_ALIGN(); + + if (input1_shapes[3] == CONST_ONE) + { + for (WORD32 i = 0; i < out_lc - CONST_ONE; i += CONST_TWO) + { + for (n = 0; n < (in_lc >> 
LOG2_PDX_M); n++) + { + PDX_LA_MX32_IP(x0, ax0, px); + + PDX_LA_MX32_IP(y0, ay0, py0); + PDX_LA_MX32_IP(y1, ay1, py1); + + y0 = PDX_MUL_MX32(y0, vec_alpha); + z0 = PDX_ADD_MX32(x0, y0); + + y1 = PDX_MUL_MX32(y1, vec_alpha); + z1 = PDX_ADD_MX32(x0, y1); + + PDX_SA_MX32_IP(z0, az0, pz0); + PDX_SA_MX32_IP(z1, az1, pz1); + } + m = (in_lc & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MX32_XP(x0, ax, px, m); + PDX_LAV_MX32_XP(y0, ay0, py0, m); + PDX_LAV_MX32_XP(y1, ay1, py1, m); + + y0 = PDX_MUL_MX32(y0, vec_alpha); + z0 = PDX_ADD_MX32(x0, y0); + + y1 = PDX_MUL_MX32(y1, vec_alpha); + z1 = PDX_ADD_MX32(x0, y1); + + PDX_SAV_MX32_XP(z0, az0, pz0, m); + PDX_SAV_MX32_XP(z1, az1, pz1, m); + PDX_SAPOS_MX32_FP(az0, pz0); + PDX_SAPOS_MX32_FP(az1, pz1); + + px = (const xb_vecMx32 *)&p_inp1[0]; + ax = PDX_LA_MX32_PP(px); + } + if (out_lc % CONST_TWO != 0) + { + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + PDX_LA_MX32_IP(y1, ay1, py1); + PDX_LA_MX32_IP(x0, ax, px); + + y1 = PDX_MUL_MX32(y1, vec_alpha); + z0 = PDX_ADD_MX32(x0, y1); + + PDX_SA_MX32_IP(z0, az1, pz1); + } + m = (in_lc & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MX32_XP(y1, ay1, py1, m); + PDX_LAV_MX32_XP(x0, ax, px, m); + + y1 = PDX_MUL_MX32(y1, vec_alpha); + z0 = PDX_ADD_MX32(x0, y1); + + PDX_SAV_MX32_XP(z0, az1, pz1, m); + PDX_SAPOS_MX32_FP(az1, pz1); + } + } + else + { + for (WORD32 i = 0; i < out_lc - CONST_ONE; i += CONST_TWO) + { + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + PDX_LA_MX32_IP(y0, ay, py); + + PDX_LA_MX32_IP(x0, ax0, px0); + PDX_LA_MX32_IP(x1, ax1, px1); + + y0 = PDX_MUL_MX32(y0, vec_alpha); + z0 = PDX_ADD_MX32(x0, y0); + + z1 = PDX_ADD_MX32(x1, y0); + + PDX_SA_MX32_IP(z0, az0, pz0); + PDX_SA_MX32_IP(z1, az1, pz1); + } + m = (in_lc & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MX32_XP(y0, ay, py, m); + PDX_LAV_MX32_XP(x0, ax0, px0, m); + PDX_LAV_MX32_XP(x1, ax1, px1, m); + + y0 = PDX_MUL_MX32(y0, vec_alpha); + z0 = PDX_ADD_MX32(x0, y0); + + z1 = PDX_ADD_MX32(x1, y0); + + PDX_SAV_MX32_XP(z0, az0, pz0, m); + PDX_SAV_MX32_XP(z1, az1, pz1, m); + PDX_SAPOS_MX32_FP(az0, pz0); + PDX_SAPOS_MX32_FP(az1, pz1); + + py = (const xb_vecMx32 *)&p_inp2[0]; + ay = PDX_LA_MX32_PP(py); + } + if (out_lc % CONST_TWO != 0) + { + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + PDX_LA_MX32_IP(y0, ay, py); + PDX_LA_MX32_IP(x1, ax1, px1); + + y0 = PDX_MUL_MX32(y0, vec_alpha); + z0 = PDX_ADD_MX32(x1, y0); + + PDX_SA_MX32_IP(z0, az1, pz1); + } + m = (in_lc & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MX32_XP(y0, ay, py, m); + PDX_LAV_MX32_XP(x0, ax1, px1, m); + y0 = PDX_MUL_MX32(y0, vec_alpha); + z0 = PDX_ADD_MX32(x0, y0); + PDX_SAV_MX32_XP(z0, az1, pz1, m); + PDX_SAPOS_MX32_FP(az1, pz1); + } + } +} + +static inline void internal_elm_add_broadcast_1D_scalar_32x32_32( + WORD32 *__restrict__ p_out, + const WORD32 *__restrict__ p_inp1, + const WORD32 *__restrict__ p_inp2, + WORD32 num_elm, + const WORD32 *__restrict__ input1_shapes, + const WORD32 inp1_const, + const WORD32 alpha) +{ + + xb_vecMx32 i1, i2, y, z, vec_alpha = alpha; + xb_vecMx32 *restrict p_i1 = (xb_vecMx32*) p_inp1; + xb_vecMx32 *restrict p_i2 = (xb_vecMx32*) p_inp2; + xb_vecMx32 *restrict p_o = (xb_vecMx32*) p_out; + + valign ax, az; + az = PDX_Z_ALIGN(); + + WORD32 m; + + if ((input1_shapes[4] == CONST_ONE) || (inp1_const == CONST_ONE)) + { + i1 = PDX_LSR_32_I(p_inp1, 0); + ax = PDX_LA_MX32_PP(p_i2); + for(WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA_MX32_IP(i2, ax, p_i2); + y = PDX_MUL_MX32(i2, vec_alpha); + z = PDX_ADD_MX32(i1, y); 
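+            /* Each 32-bit lane now holds
+             * z = inp1 (broadcast scalar) + alpha * inp2[lane].
+             * Illustrative values only: inp1 = 5, alpha = 2 and an input2
+             * lane set {1, 2, 3, 4} give z = {7, 9, 11, 13}.
+             */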
+ PDX_SA_MX32_IP(z, az, p_o); + } + m = (num_elm & (PDX_M - CONST_ONE)) * SIZE_OF_INT; + PDX_LAV_MX32_XP(i2, ax, p_i2, m); + y = PDX_MUL_MX32(i2, vec_alpha); + z = PDX_ADD_MX32(i1, y); + PDX_SAV_MX32_XP(z, az, p_o, m); + } + else + { + i2 = PDX_LSR_32_I(p_inp2, 0); + y = PDX_MUL_MX32(i2, vec_alpha); + ax = PDX_LA_MX32_PP(p_i1); + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA_MX32_IP(i1, ax, p_i1); + z = PDX_ADD_MX32(i1, y); + PDX_SA_MX32_IP(z, az, p_o); + } + m = (num_elm & (PDX_M - CONST_ONE)) * SIZE_OF_INT; + PDX_LAV_MX32_XP(i1, ax, p_i1, m); + z = PDX_ADD_MX32(i1, y); + PDX_SAV_MX32_XP(z, az, p_o, m); + } + PDX_SAPOS_MX32_FP(az, p_o); +} + + +WORD32 xa_nn_elm_add_broadcast_5D_32x32_32(WORD32 *__restrict__ p_out, + const WORD32 *const p_out_shape, + const WORD32 *__restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const WORD32 *__restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 num_inp_dims, + WORD32 alpha) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* UNSUPPORTED_PARAM input checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || + (num_inp_dims > MAX_DIMS)), UNSUPPORTED_PARAM); + + /* 5D shapes initialization */ + WORD32 p_5d_out_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE}; + WORD32 p_5d_inp1_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE}; + WORD32 p_5d_inp2_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE}; + + shapes_convert_5D(p_5d_out_shape, p_5d_inp1_shape, p_5d_inp2_shape, + p_out_shape, p_inp1_shape, p_inp2_shape, num_inp_dims); + + /* Check shapes for broadcast compatibility */ + WORD32 error = 0; + error = check_shapes(p_5d_inp1_shape, p_5d_inp2_shape, p_5d_out_shape); + if (error) + { + return UNSUPPORTED_PARAM; + } + + /* strides calculation */ + WORD32 inp1_strides[MAX_DIMS], inp2_strides[MAX_DIMS]; + strides_calculation(p_5d_inp1_shape, p_5d_inp2_shape, inp1_strides, + inp2_strides); + + /* check for broadcast need */ + WORD32 need_broadcast = 0; + WORD32 inp1_const = CONST_ONE, inp2_const = CONST_ONE; + for (int i = 0; i < MAX_DIMS; i++) + { + if (p_5d_inp1_shape[i] != p_5d_inp2_shape[i]) + { + if (p_5d_inp1_shape[i] == CONST_ONE) + { + inp1_strides[i] = 0; + } + else + { + inp2_strides[i] = 0; + } + need_broadcast = CONST_ONE; + } + + if (p_5d_inp1_shape[i] != CONST_ONE) + inp1_const &= 0; + if (p_5d_inp2_shape[i] != CONST_ONE) + inp2_const &= 0; + } + + const WORD32 *__restrict__ p_inp1_base = p_inp1; + const WORD32 *__restrict__ p_inp2_base = p_inp2; + WORD32 *p_out_base = p_out; + + /* if broadcast is not needed */ + if (need_broadcast == 0) + { + xa_nn_elm_add_32x32_32( + p_out_base, + p_inp1_base, + p_inp2_base, + alpha, + p_5d_out_shape[0] * inp1_strides[0]); + } + + /* if 
broadcast is needed */ + else if (inp1_const == CONST_ONE || inp2_const == CONST_ONE) + { + internal_elm_add_broadcast_1D_scalar_32x32_32(p_out_base, + p_inp1_base, + p_inp2_base, + p_5d_out_shape[0] * p_5d_out_shape[1] * p_5d_out_shape[2] + * p_5d_out_shape[3] * p_5d_out_shape[4], + p_5d_inp1_shape, + inp1_const, + alpha); + } + /* check if 4th dim in both inputs is the same */ + else if (inp1_strides[4] == inp2_strides[4]) + { + WORD32 in_lc, out_lc; + /* check if 3rd dim needs to be broadcasted */ + if (inp1_strides[3] == 0 || inp2_strides[3] == 0) + { + /* Repeat the 4th dime as the 3rd dim needs to be broadcasted */ + in_lc = p_5d_out_shape[4]; + out_lc = p_5d_out_shape[3]; + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const WORD32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const WORD32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const WORD32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const WORD32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + internal_elm_add_broadcast_2D_32x32_32(p_out_base, + p_inp1_itr1, + p_inp2_itr1, + out_lc, + in_lc, + p_5d_inp1_shape, + p_5d_inp2_shape, + alpha); + p_out_base += in_lc * out_lc; + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + else + { + /* 3rd and 4th dimensions need not be broadcasted. The lower + * dimension broadcasting (0th, 1st, 2nd) will be taken care + * while calculating the input addresses */ + in_lc = p_5d_out_shape[3] * p_5d_out_shape[4]; + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const WORD32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const WORD32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const WORD32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const WORD32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + xa_nn_elm_add_32x32_32(p_out_base, + p_inp1_itr1, + p_inp2_itr1, + alpha, + in_lc); + p_out_base += in_lc; + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + } + else + { + /* If the last dim itself is broadcastable */ + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const WORD32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const WORD32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const WORD32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const WORD32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + const WORD32 *__restrict__ p_inp1_itr2 = p_inp1_itr1; + const WORD32 *__restrict__ p_inp2_itr2 = p_inp2_itr1; + for (WORD32 itr3 = 0; itr3 < p_5d_out_shape[3]; itr3++) + { + internal_elm_add_broadcast_1D_scalar_32x32_32( + p_out_base, + p_inp1_itr2, + p_inp2_itr2, + p_5d_out_shape[4], + p_5d_inp1_shape, + inp1_const, + alpha); + p_out_base += p_5d_out_shape[4]; + p_inp1_itr2 += inp1_strides[3]; + p_inp2_itr2 += inp2_strides[3]; + } + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += 
inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_add_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_add_f32.c new file mode 100644 index 00000000000..959cadab0a1 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_add_f32.c @@ -0,0 +1,618 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + ******************************************************************************/ +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_add_f32xf32_f32(FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + FLOAT32 alpha, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), UNSUPPORTED_PARAM); + + WORD32 n, m; + + xb_vecMxf32 x0, y0; + valign ax, ay, az; + + const xb_vecMxf32 *restrict p_x = (const xb_vecMxf32*) p_inp1; + const xb_vecMxf32 *restrict p_y = (const xb_vecMxf32*) p_inp2; + xb_vecMxf32 *restrict p_z = (xb_vecMxf32*) p_out; + + xb_vecMxf32 vec_alpha = alpha; + + ax = PDX_LA_MXF32_PP(p_x); + ay = PDX_LA_MXF32_PP(p_y); + az = PDX_Z_ALIGN(); + + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + PDX_LA_MXF32_IP(x0, ax, p_x); + PDX_LA_MXF32_IP(y0, ay, p_y); + PDX_MULA_MXF32(x0, y0, vec_alpha); + PDX_SA_MXF32_IP(x0, az, p_z); + } + m = (num_elm & (PDX_M - CONST_ONE)) << LOG2_SIZE_FLOAT; + PDX_LAV_MXF32_XP(x0, ax, p_x, m); + PDX_LAV_MXF32_XP(y0, ay, p_y, m); + PDX_MULA_MXF32(x0, y0, vec_alpha); + PDX_SAV_MXF32_XP(x0, az, p_z, m); + PDX_SAPOS_MXF32_FP(az, p_z); + + return 0; +} + +WORD32 xa_nn_elm_add_scalar_f32xf32_f32(FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 inp2, + FLOAT32 alpha, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + + /* Pointer alignment checks 
*/ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), UNSUPPORTED_PARAM); + + WORD32 n, m; + + xb_vecMxf32 x, y, vec_alpha; + vec_alpha = alpha; + y = inp2; + valign ax, az; + + const xb_vecMxf32 *restrict p_x = (const xb_vecMxf32*) p_inp1; + xb_vecMxf32 *restrict p_z = (xb_vecMxf32*) p_out; + + /* Align load priming */ + ax = PDX_LA_MXF32_PP(p_x); + + /* Zeroing align register */ + az = PDX_Z_ALIGN(); + + /* loop iterates for multiple of LOG2_PDX_M */ + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + /* Aligning load input1 (4-way 32 bit) */ + PDX_LA_MXF32_IP(x, ax, p_x); + /* Add input1 and (input2 * alpha) */ + PDX_MULA_MXF32(x, y, vec_alpha); + /* Aligning store (4-way 32bit elements) + * with post increment addressing + */ + PDX_SA_MXF32_IP(x, az, p_z); + } + + /* Remaining elements after processing the loop */ + m = (num_elm & (PDX_M - CONST_ONE)) << LOG2_SIZE_FLOAT; + + /* Variable aligining load */ + PDX_LAV_MXF32_XP(x, ax, p_x, m); + + PDX_MULA_MXF32(x, y, vec_alpha); + /* Variable aligining store and flush */ + PDX_SAV_MXF32_XP(x, az, p_z, m); + PDX_SAPOS_MXF32_FP(az, p_z); + + return 0; +} + +static inline void shapes_convert_5D(WORD32 *const __restrict__ p_5d_out_shape, + WORD32 *const __restrict__ p_5d_inp1_shape, /* new input1 shapes */ + WORD32 *const __restrict__ p_5d_inp2_shape, /* new input2 shapes */ + const WORD32 *const __restrict__ p_out_shape, + const WORD32 *const __restrict__ p_inp1_shape, /* original input1 shapes */ + const WORD32 *const __restrict__ p_inp2_shape, /* original input1 shapes */ + const WORD32 num_inp_dims) +{ + /* Convert number of dimension less than 5D to 5D */ + for (WORD32 i = 0; i < num_inp_dims; i++) + { + p_5d_out_shape[i + MAX_DIMS - num_inp_dims] = p_out_shape[i]; + p_5d_inp1_shape[i + MAX_DIMS - num_inp_dims] = p_inp1_shape[i]; + p_5d_inp2_shape[i + MAX_DIMS - num_inp_dims] = p_inp2_shape[i]; + } +} + +static inline WORD32 check_shapes(const WORD32 *const p_inp1_shape, + const WORD32 *const p_inp2_shape, + const WORD32 *const p_out_shape) +{ + /* Check the shapes of input and output */ + for (WORD32 i = 0; i < MAX_DIMS; i++) + { + if (((p_inp1_shape[i] != p_inp2_shape[i]) + && (p_inp1_shape[i] != CONST_ONE) + && (p_inp2_shape[i] != CONST_ONE)) + || (p_out_shape[i] + != (p_inp1_shape[i] > p_inp2_shape[i] ? 
+ p_inp1_shape[i] : p_inp2_shape[i]))) + { + return UNSUPPORTED_PARAM; + } + } + return 0; +} + +static inline void strides_calculation(const WORD32 *const inp1_shape, + const WORD32 *const inp2_shape, + WORD32 *const inp1_strides, + WORD32 *const inp2_strides) +{ + inp1_strides[MAX_DIMS - CONST_ONE] = CONST_ONE; + inp2_strides[MAX_DIMS - CONST_ONE] = CONST_ONE; + for (WORD32 i = MAX_DIMS - CONST_TWO; i >= 0; i--) + { + inp1_strides[i] = inp1_strides[i + CONST_ONE] + * inp1_shape[i + CONST_ONE]; + inp2_strides[i] = inp2_strides[i + CONST_ONE] + * inp2_shape[i + CONST_ONE]; + } +} + +static inline void internal_elm_add_broadcast_2D_f32xf32_f32( + FLOAT32 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp1, + const FLOAT32 *__restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + const WORD32 *input1_shapes, + const WORD32 *input2_shapes, + FLOAT32 alpha) +{ + WORD32 n, m; + + xb_vecMxf32 x0, x1, y0, y1; + + xb_vecMxf32 vec_alpha = alpha; + + const xb_vecMxf32 *__restrict__ p_x = (const xb_vecMxf32*) &p_inp1[0]; + const xb_vecMxf32 *__restrict__ p_y = (const xb_vecMxf32*) &p_inp2[0]; + + valign ax, ax0, ax1, ay, ay0, ay1, az, az0, az1; + ax = PDX_LA_MXF32_PP(p_x); + ay = PDX_LA_MXF32_PP(p_y); + az = PDX_Z_ALIGN(); + + const FLOAT32 *__restrict__ p_x_baseptr = &p_inp1[0]; + const xb_vecMxf32 *__restrict__ p_x0 = (const xb_vecMxf32*) &p_x_baseptr[0]; + const xb_vecMxf32 *__restrict__ p_x1 = (const xb_vecMxf32*) (&p_x_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + ax0 = PDX_LA_MXF32_PP(p_x0); + ax1 = PDX_LA_MXF32_PP(p_x1); + + const FLOAT32 *__restrict__ p_y_baseptr = &p_inp2[0]; + const xb_vecMxf32 *__restrict__ p_y0 = (const xb_vecMxf32*) &p_y_baseptr[0]; + const xb_vecMxf32 *__restrict__ p_y1 = (const xb_vecMxf32*) (&p_y_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + ay0 = PDX_LA_MXF32_PP(p_y0); + ay1 = PDX_LA_MXF32_PP(p_y1); + + FLOAT32 *__restrict__ p_z_baseptr = &p_out[0]; + xb_vecMxf32 *__restrict__ p_z0 = (xb_vecMxf32*) &p_z_baseptr[0]; + xb_vecMxf32 *__restrict__ p_z1 = (xb_vecMxf32*) (&p_z_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + az0 = PDX_Z_ALIGN(); + az1 = PDX_Z_ALIGN(); + + if (input1_shapes[3] == CONST_ONE) + { + /* input1_shapes[3] is 1 */ + for (WORD32 i = 0; i < out_lc - CONST_ONE; i += CONST_TWO) + { + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + PDX_LA_MXF32_IP(x0, ax, p_x); + x1 = x0; + + PDX_LA_MXF32_IP(y0, ay0, p_y0); + PDX_LA_MXF32_IP(y1, ay1, p_y1); + + PDX_MULA_MXF32(x0, y0, vec_alpha); + PDX_SA_MXF32_IP(x0, az0, p_z0); + + PDX_MULA_MXF32(x1, y1, vec_alpha); + PDX_SA_MXF32_IP(x1, az1, p_z1); + } + m = (in_lc & (PDX_M - CONST_ONE)) * SIZE_OF_FLOAT; + PDX_LAV_MXF32_XP(x0, ax, p_x, m); + x1 = x0; + + PDX_LAV_MXF32_XP(y0, ay0, p_y0, m); + PDX_LAV_MXF32_XP(y1, ay1, p_y1, m); + + PDX_MULA_MXF32(x0, y0, vec_alpha); + PDX_SAV_MXF32_XP(x0, az0, p_z0, m); + + PDX_MULA_MXF32(x1, y1, vec_alpha); + PDX_SAV_MXF32_XP(x1, az1, p_z1, m); + + PDX_SAPOS_MXF32_FP(az0, p_z0); + PDX_SAPOS_MXF32_FP(az1, p_z1); + + p_x = (const xb_vecMxf32*) &p_inp1[0]; + ax = PDX_LA_MXF32_PP(p_x); + } + if (out_lc % CONST_TWO != 0) + { + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + PDX_LA_MXF32_IP(y1, ay1, p_y1); + PDX_LA_MXF32_IP(x0, ax, p_x); + PDX_MULA_MXF32(x0, y1, vec_alpha); + PDX_SA_MXF32_IP(x0, az1, p_z1); + } + m = (in_lc & (PDX_M - CONST_ONE)) * SIZE_OF_FLOAT; + PDX_LAV_MXF32_XP(y1, ay1, p_y1, m); + PDX_LAV_MXF32_XP(x0, ax, p_x, m); + PDX_MULA_MXF32(x0, y1, vec_alpha); + PDX_SAV_MXF32_XP(x0, az1, p_z1, m); + PDX_SAPOS_MXF32_FP(az1, p_z1); + } + } + else + { + /* 
input2_shapes[3] is 1 */ + for (WORD32 i = 0; i < out_lc - CONST_ONE; i += CONST_TWO) + { + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + PDX_LA_MXF32_IP(y0, ay, p_y); + + PDX_LA_MXF32_IP(x0, ax0, p_x0); + PDX_LA_MXF32_IP(x1, ax1, p_x1); + + PDX_MULA_MXF32(x0, y0, vec_alpha); + PDX_MULA_MXF32(x1, y0, vec_alpha); + + PDX_SA_MXF32_IP(x0, az0, p_z0); + PDX_SA_MXF32_IP(x1, az1, p_z1); + } + m = (in_lc & (PDX_M - CONST_ONE)) * SIZE_OF_FLOAT; + PDX_LAV_MXF32_XP(y0, ay, p_y, m); + PDX_LAV_MXF32_XP(x0, ax0, p_x0, m); + PDX_LAV_MXF32_XP(x1, ax1, p_x1, m); + PDX_MULA_MXF32(x0, y0, vec_alpha); + PDX_MULA_MXF32(x1, y0, vec_alpha); + PDX_SAV_MXF32_XP(x0, az0, p_z0, m); + PDX_SAV_MXF32_XP(x1, az1, p_z1, m); + PDX_SAPOS_MXF32_FP(az0, p_z0); + PDX_SAPOS_MXF32_FP(az1, p_z1); + + p_y = (const xb_vecMxf32*) &p_inp2[0]; + ay = PDX_LA_MXF32_PP(p_y); + } + if (out_lc % CONST_TWO != 0) + { + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + PDX_LA_MXF32_IP(y0, ay, p_y); + PDX_LA_MXF32_IP(x1, ax1, p_x1); + PDX_MULA_MXF32(x1, y0, vec_alpha); + PDX_SA_MXF32_IP(x1, az1, p_z1); + } + m = (in_lc & (PDX_M - CONST_ONE)) * SIZE_OF_FLOAT; + PDX_LAV_MXF32_XP(y0, ay, p_y, m); + PDX_LAV_MXF32_XP(x0, ax1, p_x1, m); + PDX_MULA_MXF32(x0, y0, vec_alpha); + PDX_SAV_MXF32_XP(x0, az1, p_z1, m); + PDX_SAPOS_MXF32_FP(az1, p_z1); + } + } +} + +static inline void internal_elm_add_broadcast_1D_scalar_f32xf32_f32( + FLOAT32 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp1, + const FLOAT32 *__restrict__ p_inp2, + WORD32 num_elm, + const WORD32 *__restrict__ input1_shapes, + const WORD32 inp1_const, + const FLOAT32 alpha) +{ + + xb_vecMxf32 x0, x1, y, vec_alpha = alpha; + xb_vecMxf32 *restrict p_i1 = (xb_vecMxf32*) p_inp1; + xb_vecMxf32 *restrict p_i2 = (xb_vecMxf32*) p_inp2; + xb_vecMxf32 *restrict p_o = (xb_vecMxf32*) p_out; + + valign ax, az; + az = PDX_Z_ALIGN(); + + WORD32 m; + + if ((input1_shapes[4] == CONST_ONE) || (inp1_const == CONST_ONE)) + { + x0 = PDX_LSR_F32_I(p_inp1, 0); + ax = PDX_LA_MXF32_PP(p_i2); + for(WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA_MXF32_IP(y, ax, p_i2); + x1 = x0; + PDX_MULA_MXF32(x1, y, vec_alpha); + PDX_SA_MXF32_IP(x1, az, p_o); + } + m = (num_elm & (PDX_M - CONST_ONE)) * SIZE_OF_FLOAT; + PDX_LAV_MXF32_XP(y, ax, p_i2, m); + x1 = x0; + PDX_MULA_MXF32(x1, y, vec_alpha); + PDX_SAV_MXF32_XP(x1, az, p_o, m); + } + else + { + y = PDX_LSR_F32_I(p_inp2, 0); + ax = PDX_LA_MXF32_PP(p_i1); + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA_MXF32_IP(x0, ax, p_i1); + PDX_MULA_MXF32(x0, y, vec_alpha); + PDX_SA_MXF32_IP(x0, az, p_o); + } + m = (num_elm & (PDX_M - CONST_ONE)) * SIZE_OF_FLOAT; + PDX_LAV_MXF32_XP(x0, ax, p_i1, m); + PDX_MULA_MXF32(x0, y, vec_alpha); + PDX_SAV_MXF32_XP(x0, az, p_o, m); + } + PDX_SAPOS_MXF32_FP(az, p_o); +} + +WORD32 xa_nn_elm_add_broadcast_5D_f32xf32_f32(FLOAT32 *__restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 *__restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 *__restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 num_inp_dims, + FLOAT32 alpha) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), 
UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* UNSUPPORTED_PARAM input checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* 5D shapes initialization */ + WORD32 p_5d_out_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, + CONST_ONE, CONST_ONE}; + WORD32 p_5d_inp1_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, + CONST_ONE, CONST_ONE}; + WORD32 p_5d_inp2_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, + CONST_ONE, CONST_ONE}; + + shapes_convert_5D(p_5d_out_shape, p_5d_inp1_shape, p_5d_inp2_shape, + p_out_shape, p_inp1_shape, p_inp2_shape, num_inp_dims); + + /* Check shapes for broadcast compatibility */ + WORD32 error = 0; + error = check_shapes(p_5d_inp1_shape, p_5d_inp2_shape, p_5d_out_shape); + if (error) + { + return UNSUPPORTED_PARAM; + } + + /* Strides calculation */ + WORD32 inp1_strides[MAX_DIMS], inp2_strides[MAX_DIMS]; + strides_calculation(p_5d_inp1_shape, p_5d_inp2_shape, inp1_strides, + inp2_strides); + + /* Check for broadcast need */ + WORD32 need_broadcast = 0; + WORD32 inp1_const = CONST_ONE, inp2_const = CONST_ONE; + for (int i = 0; i < MAX_DIMS; i++) + { + if (p_5d_inp1_shape[i] != p_5d_inp2_shape[i]) + { + if (p_5d_inp1_shape[i] == CONST_ONE) + { + inp1_strides[i] = 0; + } + else + { + inp2_strides[i] = 0; + } + need_broadcast = CONST_ONE; + } + + if (p_5d_inp1_shape[i] != CONST_ONE) inp1_const &= 0; + if (p_5d_inp2_shape[i] != CONST_ONE) inp2_const &= 0; + } + + const FLOAT32 *__restrict__ p_inp1_base = p_inp1; + const FLOAT32 *__restrict__ p_inp2_base = p_inp2; + FLOAT32 *p_out_base = p_out; + + /* If broadcast is not needed */ + if (need_broadcast == 0) + { + xa_nn_elm_add_f32xf32_f32( + p_out_base, + p_inp1_base, + p_inp2_base, + alpha, + p_5d_out_shape[0] * inp1_strides[0]); + } + + /* If broadcast is needed */ + else if (inp1_const == CONST_ONE || inp2_const == CONST_ONE) + { + internal_elm_add_broadcast_1D_scalar_f32xf32_f32( + p_out_base, + p_inp1_base, + p_inp2_base, + p_5d_out_shape[0] * p_5d_out_shape[1] * p_5d_out_shape[2] + * p_5d_out_shape[3] * p_5d_out_shape[4], + p_5d_inp1_shape, + inp1_const, + alpha); + } + /* Check if 4th dim in both inputs is the same */ + else if (inp1_strides[4] == inp2_strides[4]) + { + WORD32 in_lc, out_lc; + /* Check if 3rd dim needs to be broadcasted */ + if (inp1_strides[3] == 0 || inp2_strides[3] == 0) + { + /* Repeat the 4th dimension as the 3rd dimension needs to be broadcasted */ + in_lc = p_5d_out_shape[4]; + out_lc = p_5d_out_shape[3]; + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const FLOAT32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const FLOAT32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + internal_elm_add_broadcast_2D_f32xf32_f32( + p_out_base, + p_inp1_itr1, + p_inp2_itr1, + out_lc, + in_lc, + p_5d_inp1_shape, + p_5d_inp2_shape, + alpha); + + p_out_base += in_lc * out_lc; + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 
+= inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + else + { + /* 3rd and 4th dimensions need not be broadcasted. The lower + * dimension broadcasting (0th, 1st, 2nd) will be taken care + * while calculating the input addresses */ + in_lc = p_5d_out_shape[3] * p_5d_out_shape[4]; + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const FLOAT32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const FLOAT32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + xa_nn_elm_add_f32xf32_f32( + p_out_base, + p_inp1_itr1, + p_inp2_itr1, + alpha, + in_lc); + p_out_base += in_lc; + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + } + else + { + /* If the last dim itself is broadcastable */ + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const FLOAT32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const FLOAT32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + const FLOAT32 *__restrict__ p_inp1_itr2 = p_inp1_itr1; + const FLOAT32 *__restrict__ p_inp2_itr2 = p_inp2_itr1; + for (WORD32 itr3 = 0; itr3 < p_5d_out_shape[3]; itr3++) + { + internal_elm_add_broadcast_1D_scalar_f32xf32_f32( + p_out_base, + p_inp1_itr2, + p_inp2_itr2, + p_5d_out_shape[4], + p_5d_inp1_shape, + inp1_const, + alpha); + p_out_base += p_5d_out_shape[4]; + p_inp1_itr2 += inp1_strides[3]; + p_inp2_itr2 += inp2_strides[3]; + } + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym16_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym16_f32.c new file mode 100644 index 00000000000..a9388a06e35 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym16_f32.c @@ -0,0 +1,172 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_asym16_f32(FLOAT32 *__restrict__ p_out, + const WORD16 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + WORD32 *p_inp_zero_bias, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_zero_bias, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD16), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[0] < INT16_LOWER_LIMIT) || + (p_inp_zero_bias[0] > INT16_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + for (WORD32 i = 0; i < p_inp_shape[*p_axis]; i++) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[i] < INT16_LOWER_LIMIT) || + (p_inp_zero_bias[i] > INT16_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const WORD16 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMx16 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 leading_dim_idx; + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_16 = m * SIZE_OF_INT16; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMx32 x0, b0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = p_inp_scale[axis_index]; + xb_vecMx32 d_inp_zero_bias = p_inp_zero_bias[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + /* This loop iterates over the leading dims. + * All the elements are dequantized at a time for + * single scale and zero_bias once loaded + */ + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMx16*) inp_base; + align_a = PDX_LA_MX16_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA32_MX16_IP(x0, align_a, inp_base_p); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + PDX_LAV32_MX16_XP(x0, align_a, inp_base_p, m_16); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym16u_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym16u_f32.c new file mode 100644 index 00000000000..cb5b1a8f7e1 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym16u_f32.c @@ -0,0 +1,170 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_asym16u_f32(FLOAT32 *__restrict__ p_out, + const UWORD16 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + WORD32 *p_inp_zero_bias, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_zero_bias, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(UWORD16), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[0] < UINT16_LOWER_LIMIT) || + (p_inp_zero_bias[0] > UINT16_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + for (WORD32 i = 0; i < p_inp_shape[*p_axis]; i++) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[i] < UINT16_LOWER_LIMIT) || + (p_inp_zero_bias[i] > UINT16_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const UWORD16 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMxu16 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_16 = m * SIZE_OF_INT16; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + WORD32 leading_dim_idx; + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxu32 x0; + xb_vecMx32 b0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = p_inp_scale[axis_index]; + xb_vecMxu32 d_inp_zero_bias = p_inp_zero_bias[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMxu16*) inp_base; + align_a = PDX_LA_MXU16_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LAU32_MX16_IP(x0, align_a, inp_base_p); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + PDX_LAVU32_MX16_XP(x0, align_a, inp_base_p, m_16); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym4_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym4_f32.c new file mode 100644 index 00000000000..299ef7979f2 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym4_f32.c @@ -0,0 +1,174 @@ +/****************************************************************************** + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_asym4_f32(FLOAT32 *__restrict__ p_out, + const WORD8 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + WORD32 *p_inp_zero_bias, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_zero_bias, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[0] < INT4_LOWER_LIMIT) || + (p_inp_zero_bias[0] > INT4_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks */ + // axis should be in the range [0,num_inp_dims-1] + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + for (WORD32 i = 0; i < p_inp_shape[*p_axis]; i++) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[i] < INT4_LOWER_LIMIT) || + (p_inp_zero_bias[i] > INT4_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const WORD8 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMx8 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 leading_dim_idx; + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_8 = m * SIZE_OF_INT8; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMx32 x0, b0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = (p_inp_scale[axis_index] + / SCALE_FACTOR_4_BIT); + xb_vecMx32 d_inp_zero_bias = (p_inp_zero_bias[axis_index] + << SHIFT_FACTOR_4_BIT); + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + /* This loop iterates over the leading dims. + * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMx8*) inp_base; + align_a = PDX_LA_MX8_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA32_MX8_IP(x0, align_a, inp_base_p); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + PDX_LAV32_MX8_XP(x0, align_a, inp_base_p, m_8); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym4u_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym4u_f32.c new file mode 100644 index 00000000000..5d4df3303f2 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym4u_f32.c @@ -0,0 +1,177 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_asym4u_f32(FLOAT32 *__restrict__ p_out, + const UWORD8 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + WORD32 *p_inp_zero_bias, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_zero_bias, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(UWORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[0] < UINT4_LOWER_LIMIT) || + (p_inp_zero_bias[0] > UINT4_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + for (WORD32 i = 0; i < p_inp_shape[*p_axis]; i++) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[i] < UINT4_LOWER_LIMIT) || + (p_inp_zero_bias[i] > UINT4_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
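+         * Note on the 4-bit path below (informal sketch): the per-channel
+         * scale is pre-divided by SCALE_FACTOR_4_BIT and the zero point is
+         * pre-shifted by SHIFT_FACTOR_4_BIT. Assuming those constants are
+         * 16.0f and 4, and that the 8-bit load presents each 4-bit code q
+         * as q << 4, this is equivalent to the usual form, since
+         * ((q << 4) - (zero_bias << 4)) * (scale / 16) == (q - zero_bias) * scale.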
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const UWORD8 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMxu8 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 leading_dim_idx; + + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_8 = m * SIZE_OF_INT8; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxu32 x0; + xb_vecMx32 b0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = (p_inp_scale[axis_index] + / SCALE_FACTOR_4_BIT); + xb_vecMxu32 d_inp_zero_bias = (p_inp_zero_bias[axis_index] + << SHIFT_FACTOR_4_BIT); + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + /* This loop iterates over the leading dims. + * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMxu8*) inp_base; + align_a = PDX_LA_MXU8_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LAU32_MX8_IP(x0, align_a, inp_base_p); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + PDX_LAVU32_MX8_XP(x0, align_a, inp_base_p, m_8); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym8_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym8_f32.c new file mode 100644 index 00000000000..1ed811c0549 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym8_f32.c @@ -0,0 +1,170 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_asym8_f32(FLOAT32 *__restrict__ p_out, + const WORD8 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + WORD32 *p_inp_zero_bias, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_zero_bias, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[0] < INT8_LOWER_LIMIT) || + (p_inp_zero_bias[0] > INT8_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + for (WORD32 i = 0; i < p_inp_shape[*p_axis]; i++) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[i] < INT8_LOWER_LIMIT) || + (p_inp_zero_bias[i] > INT8_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
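+         * The SIMD loops below implement, per channel c (illustration only),
+         * the scalar reference
+         *     out[k] = (FLOAT32)(inp[k] - p_inp_zero_bias[c]) * p_inp_scale[c]
+         * with inp read as signed 8-bit codes.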
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const WORD8 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMx8 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 leading_dim_idx; + + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_8 = m * SIZE_OF_INT8; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMx32 x0, b0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = p_inp_scale[axis_index]; + xb_vecMx32 d_inp_zero_bias = p_inp_zero_bias[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMx8*) inp_base; + align_a = PDX_LA_MX8_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA32_MX8_IP(x0, align_a, inp_base_p); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + PDX_LAV32_MX8_XP(x0, align_a, inp_base_p, m_8); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym8u_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym8u_f32.c new file mode 100644 index 00000000000..f07bc9dd710 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_asym8u_f32.c @@ -0,0 +1,174 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_asym8u_f32(FLOAT32 *__restrict__ p_out, + const UWORD8 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + WORD32 *p_inp_zero_bias, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_zero_bias, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(UWORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[0] < UINT8_LOWER_LIMIT) || + (p_inp_zero_bias[0] > UINT8_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + for (WORD32 i = 0; i < p_inp_shape[*p_axis]; i++) + { + XA_NNLIB_ARG_CHK_COND( + ((p_inp_zero_bias[i] < UINT8_LOWER_LIMIT) || + (p_inp_zero_bias[i] > UINT8_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
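+         * With an axis supplied, the tensor is therefore walked as
+         * leading_dims blocks per channel, each holding num_elm
+         * (= trailing_dims) contiguous elements, across axis_count channels.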
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const UWORD8 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMxu8 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 leading_dim_idx; + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_8 = m * SIZE_OF_INT8; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxu32 x0; + xb_vecMx32 b0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = p_inp_scale[axis_index]; + xb_vecMxu32 d_inp_zero_bias = p_inp_zero_bias[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + /* This loop iterates over the leading dims. + * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMxu8*) inp_base; + align_a = PDX_LA_MXU8_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LAU32_MX8_IP(x0, align_a, inp_base_p); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + PDX_LAVU32_MX8_XP(x0, align_a, inp_base_p, m_8); + b0 = PDX_SUB_MX32(x0, d_inp_zero_bias); + y0 = PDX_MUL_MXF32(b0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym16_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym16_f32.c new file mode 100644 index 00000000000..411d030b97a --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym16_f32.c @@ -0,0 +1,148 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_sym16_f32(FLOAT32 *__restrict__ p_out, + const WORD16 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD16), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks */ + /* axis should be in the range [0,num_inp_dims-1] */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
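+         * Being a symmetric variant, no zero point is involved; the loops
+         * below simply compute out[k] = (FLOAT32)inp[k] * p_inp_scale[c]
+         * for each channel c.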
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + const WORD16 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + const xb_vecMx16 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_16 = m * SIZE_OF_INT16; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + WORD32 leading_dim_idx; + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMx32 x0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = p_inp_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMx16*) inp_base; + align_a = PDX_LA_MX16_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA32_MX16_IP(x0, align_a, inp_base_p); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + PDX_LAV32_MX16_XP(x0, align_a, inp_base_p, m_16); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym16u_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym16u_f32.c new file mode 100644 index 00000000000..bf175df96c0 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym16u_f32.c @@ -0,0 +1,154 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_sym16u_f32(FLOAT32 *__restrict__ p_out, + const UWORD16 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(UWORD16), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const UWORD16 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMxu16 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_16 = m * SIZE_OF_INT16; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + WORD32 leading_dim_idx; + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxu32 x0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = p_inp_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + /* This loop iterates over the leading dims. 
+ * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMxu16*) inp_base; + align_a = PDX_LA_MXU16_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LAU32_MX16_IP(x0, align_a, inp_base_p); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + PDX_LAVU32_MX16_XP(x0, align_a, inp_base_p, m_16); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym4_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym4_f32.c new file mode 100644 index 00000000000..9613402a10c --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym4_f32.c @@ -0,0 +1,157 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_sym4_f32(FLOAT32 *__restrict__ p_out, + const WORD8 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const WORD8 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMx8 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_8 = m * SIZE_OF_INT8; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + WORD32 leading_dim_idx; + xb_vecMx32 x0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = (p_inp_scale[axis_index] + / SCALE_FACTOR_4_BIT); + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + /* This loop iterates over the leading dims. 
+ * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMx8*) inp_base; + align_a = PDX_LA_MX8_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + /* Unroll the loop by x4 for SIMD */ + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA32_MX8_IP(x0, align_a, inp_base_p); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + /* Remaining iterations */ + PDX_LAV32_MX8_XP(x0, align_a, inp_base_p, m_8); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym4u_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym4u_f32.c new file mode 100644 index 00000000000..c9aa8ff3a12 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym4u_f32.c @@ -0,0 +1,158 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_sym4u_f32(FLOAT32 *__restrict__ p_out, + const UWORD8 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(UWORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const UWORD8 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMxu8 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_8 = m * SIZE_OF_INT8; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + WORD32 leading_dim_idx; + xb_vecMxu32 x0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = (p_inp_scale[axis_index] + / SCALE_FACTOR_4_BIT); + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + /* This loop iterates over the leading dims. 
+ * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMxu8*) inp_base; + align_a = PDX_LA_MXU8_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + /* Unroll the loop by x4 for SIMD */ + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LAU32_MX8_IP(x0, align_a, inp_base_p); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + /* Remaining iterations */ + PDX_LAVU32_MX8_XP(x0, align_a, inp_base_p, m_8); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym8_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym8_f32.c new file mode 100644 index 00000000000..517c9183e07 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym8_f32.c @@ -0,0 +1,153 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_sym8_f32(FLOAT32 *__restrict__ p_out, + const WORD8 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
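+         * Illustrative calls (hypothetical buffers, not part of this file):
+         *   per-tensor : xa_nn_elm_dequantize_sym8_f32(out, inp, shape, 4,
+         *                                              NULL, &scale);
+         *   per-channel: WORD32 axis = 1;
+         *                xa_nn_elm_dequantize_sym8_f32(out, inp, shape, 4,
+         *                                              &axis, scales);
+         * where scales holds p_inp_shape[1] per-channel scale values.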
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const WORD8 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMx8 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_8 = m * SIZE_OF_INT8; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + WORD32 leading_dim_idx; + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMx32 x0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = p_inp_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMx8*) inp_base; + align_a = PDX_LA_MX8_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + /* Unroll the loop by x4 for SIMD */ + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LA32_MX8_IP(x0, align_a, inp_base_p); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + /* Remaining iterations */ + PDX_LAV32_MX8_XP(x0, align_a, inp_base_p, m_8); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym8u_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym8u_f32.c new file mode 100644 index 00000000000..217bf98b4eb --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_dequantize_sym8u_f32.c @@ -0,0 +1,157 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_dequantize_sym8u_f32(FLOAT32 *__restrict__ p_out, + const UWORD8 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_inp_scale) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(UWORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + /* Base pointers that points to the first element in the channel */ + const UWORD8 *__restrict__ inp_base; + FLOAT32 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + const xb_vecMxu8 *__restrict__ inp_base_p; + xb_vecMxf32 *__restrict__ out_base_p; + + WORD32 m = (num_elm & (PDX_M - CONST_ONE)); + WORD32 m_8 = m * SIZE_OF_INT8; + WORD32 m_32 = m * SIZE_OF_FLOAT; + + WORD32 leading_dim_idx; + valign align_a, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxu32 x0; + xb_vecMxf32 y0; + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + xb_vecMxf32 d_inp_scale = p_inp_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + /* This loop iterates over the leading dims. 
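+         * The inner loop below consumes PDX_M lanes per iteration; the
+         * remaining num_elm % PDX_M elements are covered by the
+         * variable-length load/store (m_8 / m_32 bytes) that follows it.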
+ * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (leading_dim_idx = 0; leading_dim_idx < leading_dims; + leading_dim_idx++) + { + inp_base_p = (const xb_vecMxu8*) inp_base; + align_a = PDX_LA_MXU8_PP(inp_base_p); + out_base_p = (xb_vecMxf32*) out_base; + + /* Unroll the loop by x4 for SIMD */ + for (WORD32 i = 0; i < (num_elm >> LOG2_PDX_M); i++) + { + PDX_LAU32_MX8_IP(x0, align_a, inp_base_p); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SA_MXF32_IP(y0, align_out, out_base_p); + } + /* Remaining iterations */ + PDX_LAVU32_MX8_XP(x0, align_a, inp_base_p, m_8); + y0 = PDX_MUL_MXF32(x0, d_inp_scale); + PDX_SAV_MXF32_XP(y0, align_out, out_base_p, m_32); + PDX_SAPOS_MXF32_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_mul_32x32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_mul_32x32.c new file mode 100644 index 00000000000..3ec109cafdf --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_mul_32x32.c @@ -0,0 +1,601 @@ +/******************************************************************************* +* Copyright (c) 2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_mul_scalar_32x32_32(WORD32 *__restrict__ p_out, + const WORD32 *__restrict__ p_inp1, + const WORD32 inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), UNSUPPORTED_PARAM); + + WORD32 n, m; + + /* Declaration of SIMD variables */ + xb_vecMx32 x0, y0, z0; + + /* Declaration of valign registers */ + valign ax, az; + + /* Initialization of SIMD pointers */ + const xb_vecMx32 *__restrict__ p_x = (const xb_vecMx32 *)p_inp1; + xb_vecMx32 *__restrict__ p_z = (xb_vecMx32 *)p_out; + + /* Initialization of valign registers */ + ax = PDX_LA_MX32_PP(p_x); + az = PDX_Z_ALIGN(); + + /* Vectorize the inp2 for SIMD operation */ + y0 = inp2; + + /* Unroll the loop by x4 for SIMD */ + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + PDX_LA_MX32_IP(x0, ax, p_x); + z0 = PDX_MUL_MX32(x0, y0); + PDX_SA_MX32_IP(z0, az, p_z); + } + /* Remaining iterations */ + m = (num_elm & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MX32_XP(x0, ax, p_x, m); + z0 = PDX_MUL_MX32(x0, y0); + PDX_SAV_MX32_XP(z0, az, p_z, m); + PDX_SAPOS_MX32_FP(az, p_z); + + return 0; + +} /* xa_nn_elm_mul_scalar_32x32_32() */ + +WORD32 xa_nn_elm_mul_32x32_32( + WORD32 *__restrict__ p_out, + const WORD32 *__restrict__ p_inp1, + const WORD32 *__restrict__ p_inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), UNSUPPORTED_PARAM); + + WORD32 n, m; + + /* Declaration of SIMD variables */ + xb_vecMx32 x0, y0, z0; + + /* Declaration of valign registers */ + valign ax, ay, az; + + /* Initialization of SIMD pointers */ + const xb_vecMx32 *__restrict__ p_x = (const xb_vecMx32 *)p_inp1; + const xb_vecMx32 *__restrict__ p_y = (const xb_vecMx32 *)p_inp2; + xb_vecMx32 *__restrict__ p_z = (xb_vecMx32 *)p_out; + + /* Initialization of valign registers */ + ax = PDX_LA_MX32_PP(p_x); + ay = PDX_LA_MX32_PP(p_y); + az = PDX_Z_ALIGN(); + + /* Unroll the loop by x4 for SIMD */ + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + PDX_LA_MX32_IP(x0, ax, p_x); + PDX_LA_MX32_IP(y0, ay, p_y); + z0 = PDX_MUL_MX32(x0, y0); + PDX_SA_MX32_IP(z0, az, p_z); + } + /* Remaining iterations */ + m = (num_elm & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MX32_XP(x0, ax, p_x, m); + PDX_LAV_MX32_XP(y0, ay, p_y, m); + z0 = PDX_MUL_MX32(x0, y0); + PDX_SAV_MX32_XP(z0, az, p_z, m); + PDX_SAPOS_MX32_FP(az, p_z); + + return 0; + +} /* xa_nn_elm_mul_32x32_32() */ + +static inline void shapes_convert_5D(WORD32 *const __restrict__ p_5d_out_shape, + WORD32 *const __restrict__ p_5d_inp1_shape, // new input1 shapes + WORD32 *const __restrict__ 
p_5d_inp2_shape, // new input2 shapes + const WORD32 *const __restrict__ p_out_shape, + const WORD32 *const __restrict__ p_inp1_shape, // original input1 shapes + const WORD32 *const __restrict__ p_inp2_shape, // original input1 shapes + const WORD32 num_inp_dims) +{ + /* convert number of dimension less than 5D to 5D */ + for (WORD32 i = 0; i < num_inp_dims; i++) + { + p_5d_out_shape[i + MAX_DIMS - num_inp_dims] = p_out_shape[i]; + p_5d_inp1_shape[i + MAX_DIMS - num_inp_dims] = p_inp1_shape[i]; + p_5d_inp2_shape[i + MAX_DIMS - num_inp_dims] = p_inp2_shape[i]; + } +} + +static inline WORD32 check_shapes(const WORD32 *const p_inp1_shape, + const WORD32 *const p_inp2_shape, + const WORD32 *const p_out_shape) +{ + /* Check the shapes of input and output */ + for (WORD32 i = 0; i < MAX_DIMS; i++) + { + if (((p_inp1_shape[i] != p_inp2_shape[i]) + && (p_inp1_shape[i] != CONST_ONE) + && (p_inp2_shape[i] != CONST_ONE)) + || (p_out_shape[i] + != (p_inp1_shape[i] > p_inp2_shape[i] ? + p_inp1_shape[i] : p_inp2_shape[i]))) + { + return UNSUPPORTED_PARAM; + } + } + return 0; +} + +static inline void strides_calculation(const WORD32 *const inp1_shape, + const WORD32 *const inp2_shape, + WORD32 *const inp1_strides, + WORD32 *const inp2_strides) +{ + inp1_strides[MAX_DIMS - CONST_ONE] = CONST_ONE; + inp2_strides[MAX_DIMS - CONST_ONE] = CONST_ONE; + + /* Calculation of strides */ + for (WORD32 i = MAX_DIMS - CONST_TWO; i >= 0; i--) + { + inp1_strides[i] = inp1_strides[i + CONST_ONE] + * inp1_shape[i + CONST_ONE]; + inp2_strides[i] = inp2_strides[i + CONST_ONE] + * inp2_shape[i + CONST_ONE]; + } +} + +static inline void internal_elm_mul_broadcast_2D_32x32_32( + WORD32 *__restrict__ p_out, + const WORD32 *__restrict__ p_inp1, + const WORD32 *__restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + const WORD32 *input1_shapes, + const WORD32 *input2_shapes) +{ + WORD32 n, m; + + /* Declaration of SIMD variables */ + xb_vecMx32 x0, y0, y1, z0, z1; + + /* Initialization of SIMD pointers */ + const xb_vecMx32 *__restrict__ p_x; + + /* Declaration of valign registers */ + valign ax, ay0, ay1, az0, az1; + + WORD32 *pz_baseptr = &p_out[0]; + /* base address calculation for output */ + xb_vecMx32 *__restrict__ p_z0 = (xb_vecMx32 *)&pz_baseptr[0]; + /* Middle address calculation for output */ + xb_vecMx32 *__restrict__ p_z1 = (xb_vecMx32 *)(&pz_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + /* priming */ + az0 = PDX_Z_ALIGN(); + az1 = PDX_Z_ALIGN(); + + const WORD32 *px_baseptr; + xb_vecMx32 *p_inp; + + /* pointer for base address for input1 */ + const xb_vecMx32 *__restrict__ p_y0; + /* pointer for middle address for input1 */ + const xb_vecMx32 *__restrict__ p_y1; + /* if the last dim of input1 itself is broadcastable */ + if (input1_shapes[3] == CONST_ONE) + { + p_x = (const xb_vecMx32 *)p_inp1; + p_inp = (xb_vecMx32 *)p_inp1; + px_baseptr = p_inp2; + /* base address calculation for input1 */ + p_y0 = (const xb_vecMx32 *)&px_baseptr[0]; + /* Middle address calculation for input1*/ + p_y1 = (const xb_vecMx32 *)(&px_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + } + else /* if the last dim of input2 itself is broadcastable */ + { + p_x = (const xb_vecMx32 *)p_inp2; + p_inp = (xb_vecMx32 *) p_inp2; + px_baseptr = p_inp1; + /* base address calculation for input2 */ + p_y0 = (const xb_vecMx32 *)&px_baseptr[0]; + /* Middle address calculation for input2 */ + p_y1 = (const xb_vecMx32 *)(&px_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + } + /* priming */ + ax = PDX_LA_MX32_PP(p_x); + ay0 = 
PDX_LA_MX32_PP(p_y0); + ay1 = PDX_LA_MX32_PP(p_y1); + + /* Unroll the loop by x2 for SIMD */ + for (WORD32 i = 0; i < out_lc - CONST_ONE; i += CONST_TWO) + { + /* unroll the loop by x4 for SIMD */ + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + /* load the 4 elements from input1 */ + PDX_LA_MX32_IP(x0, ax, p_x); + + /* load the 4 elements from input2 base address */ + PDX_LA_MX32_IP(y0, ay0, p_y0); + /* load the 4 elements from input2 Middle address */ + PDX_LA_MX32_IP(y1, ay1, p_y1); + + /* Multiplication of x0 and y0 */ + z0 = PDX_MUL_MX32(x0, y0); + /* Multiplication of x0 and y1 */ + z1 = PDX_MUL_MX32(x0, y1); + + /* Store the output */ + PDX_SA_MX32_IP(z0, az0, p_z0); + PDX_SA_MX32_IP(z1, az1, p_z1); + } + /* Remaining iterations of inner loop */ + m = (in_lc & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MX32_XP(x0, ax, p_x, m); + PDX_LAV_MX32_XP(y0, ay0, p_y0, m); + PDX_LAV_MX32_XP(y1, ay1, p_y1, m); + z0 = PDX_MUL_MX32(x0, y0); + z1 = PDX_MUL_MX32(x0, y1); + PDX_SAV_MX32_XP(z0, az0, p_z0, m); + PDX_SAV_MX32_XP(z1, az1, p_z1, m); + PDX_SAPOS_MX32_FP(az0, p_z0); + PDX_SAPOS_MX32_FP(az1, p_z1); + + /* Input1 Pointer updates to base address as input1 is broadcasted */ + p_x = (const xb_vecMx32 *)&p_inp[0]; + ax = PDX_LA_MX32_PP(p_x); + } + /* loop through remaining iterations of outer loop */ + if (out_lc % CONST_TWO != 0) + { + /* unroll the loop by x4 for SIMD */ + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + PDX_LA_MX32_IP(y1, ay1, p_y1); + PDX_LA_MX32_IP(x0, ax, p_x); + z0 = PDX_MUL_MX32(x0, y1); + PDX_SA_MX32_IP(z0, az1, p_z1); + } + /* Remaining iterations */ + m = (in_lc & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MX32_XP(y1, ay1, p_y1, m); + PDX_LAV_MX32_XP(x0, ax, p_x, m); + z0 = PDX_MUL_MX32(x0, y1); + PDX_SAV_MX32_XP(z0, az1, p_z1, m); + PDX_SAPOS_MX32_FP(az1, p_z1); + } +} + +static inline void internal_elm_mul_broadcast_1D_scalar_32x32_32( + WORD32 *__restrict__ p_out, + const WORD32 *__restrict__ p_inp1, + const WORD32 *__restrict__ p_inp2, + WORD32 num_elm, + const WORD32 *input1_shapes, + const WORD32 *input2_shapes) +{ + WORD32 elm; + WORD32 *p_elm; + WORD32 n, m; + + /* Declaration of SIMD variables */ + xb_vecMx32 x0, y0, z0; + + /* Declaration of valign registers */ + valign ax, az; + + /* if the last dim of input1 itself is broadcastable */ + if (input1_shapes[4] == CONST_ONE) + { + elm = p_inp1[0]; + p_elm = (WORD32 *)p_inp2; + } + else /* if the last dim of input2 itself is broadcastable */ + { + elm = p_inp2[0]; + p_elm = (WORD32 *)p_inp1; + } + + /* Initialization of SIMD pointers */ + const xb_vecMx32 *p_x = (const xb_vecMx32 *)p_elm; + xb_vecMx32 *restrict p_z = (xb_vecMx32 *)p_out; + + /* Initialization of valign registers */ + ax = PDX_LA_MX32_PP(p_x); + az = PDX_Z_ALIGN(); + + /* vectorize the elm for SIMD */ + y0 = elm; + + /* unroll the loop by x4 for SIMD */ + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + PDX_LA_MX32_IP(x0, ax, p_x); + z0 = PDX_MUL_MX32(x0, y0); + PDX_SA_MX32_IP(z0, az, p_z); + } + /* Remaining iterations */ + m = (num_elm & (PDX_M - CONST_ONE)) * sizeof(*p_elm); + PDX_LAV_MX32_XP(x0, ax, p_x, m); + z0 = PDX_MUL_MX32(x0, y0); + PDX_SAV_MX32_XP(z0, az, p_z, m); + PDX_SAPOS_MX32_FP(az, p_z); +} + +WORD32 xa_nn_elm_mul_broadcast_5D_32x32_32(WORD32 *__restrict__ p_out, + const WORD32 *const p_out_shape, + const WORD32 *__restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const WORD32 *__restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 num_inp_dims) +{ + /* NULL pointer checks */ + 
XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* UNSUPPORTED_PARAM input checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || + (num_inp_dims > MAX_DIMS)), UNSUPPORTED_PARAM); + + /* 5D shapes initialization */ + WORD32 p_5d_out_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE}; + WORD32 p_5d_inp1_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE}; + WORD32 p_5d_inp2_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE}; + + shapes_convert_5D(p_5d_out_shape, p_5d_inp1_shape, p_5d_inp2_shape, + p_out_shape, p_inp1_shape, p_inp2_shape, num_inp_dims); + + /* Check shapes for broadcast compatibility */ + WORD32 error = 0; + error = check_shapes(p_5d_inp1_shape, p_5d_inp2_shape, p_5d_out_shape); + if (error) + { + return UNSUPPORTED_PARAM; + } + + /* strides calculation */ + WORD32 inp1_strides[MAX_DIMS], inp2_strides[MAX_DIMS]; + strides_calculation(p_5d_inp1_shape, p_5d_inp2_shape, + inp1_strides, inp2_strides); + + /* check for broadcast need */ + WORD32 need_broadcast = 0; + WORD32 inp1_const = CONST_ONE, inp2_const = CONST_ONE; + for (int i = 0; i < MAX_DIMS; i++) + { + if (p_5d_inp1_shape[i] != p_5d_inp2_shape[i]) + { + if (p_5d_inp1_shape[i] == CONST_ONE) + { + inp1_strides[i] = 0; + } + else + { + inp2_strides[i] = 0; + } + need_broadcast = CONST_ONE; + } + + if (p_5d_inp1_shape[i] != CONST_ONE) + inp1_const &= 0; + if (p_5d_inp2_shape[i] != CONST_ONE) + inp2_const &= 0; + } + + const WORD32 *__restrict__ p_inp1_base = p_inp1; + const WORD32 *__restrict__ p_inp2_base = p_inp2; + WORD32 *p_out_base = p_out; + + /* if broadcast is not needed */ + if (need_broadcast == 0) + { + xa_nn_elm_mul_32x32_32( + p_out_base, + p_inp1_base, + p_inp2_base, + p_5d_out_shape[0] * inp1_strides[0]); + } + + /* if broadcast is needed */ + else if (inp1_const == CONST_ONE || inp2_const == CONST_ONE) + { + WORD32 elm; + WORD32 *p_elm; + + if (inp1_const == CONST_ONE) + { + elm = p_inp1[0]; + p_elm = (WORD32 *)p_inp2; + } + else + { + elm = p_inp2[0]; + p_elm = (WORD32 *)p_inp1; + } + + WORD32 elm_num = p_5d_out_shape[0] * p_5d_out_shape[1] * + p_5d_out_shape[2] * p_5d_out_shape[3] * p_5d_out_shape[4]; + + xa_nn_elm_mul_scalar_32x32_32( + p_out_base, + p_elm, + elm, + elm_num); + } + /* check if 4th dim in both inputs is the same */ + else if (inp1_strides[4] == inp2_strides[4]) + { + WORD32 in_lc, out_lc; + /* check if 3rd dim needs to be broadcasted */ + if (inp1_strides[3] == 0 || inp2_strides[3] == 0) + { + /* Repeat the 4th dim as the 3rd dim needs to be broadcasted */ + in_lc = p_5d_out_shape[4]; + out_lc = p_5d_out_shape[3]; + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const WORD32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const WORD32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for 
(WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const WORD32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const WORD32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + internal_elm_mul_broadcast_2D_32x32_32( + p_out_base, + p_inp1_itr1, + p_inp2_itr1, + out_lc, + in_lc, + p_5d_inp1_shape, + p_5d_inp2_shape); + p_out_base += in_lc * out_lc; + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + else + { + /* 3rd and 4th dimensions need not be broadcasted. The lower + * dimension broadcasting (0th, 1st, 2nd) will be taken care + * while calculating the input addresses */ + in_lc = p_5d_out_shape[3] * p_5d_out_shape[4]; + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const WORD32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const WORD32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const WORD32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const WORD32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + xa_nn_elm_mul_32x32_32( + p_out_base, + p_inp1_itr1, + p_inp2_itr1, + in_lc); + p_out_base += in_lc; + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + } + else + { + /* if the last dim itself is broadcastable */ + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const WORD32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const WORD32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const WORD32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const WORD32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + const WORD32 *__restrict__ p_inp1_itr2 = p_inp1_itr1; + const WORD32 *__restrict__ p_inp2_itr2 = p_inp2_itr1; + for (WORD32 itr3 = 0; itr3 < p_5d_out_shape[3]; itr3++) + { + internal_elm_mul_broadcast_1D_scalar_32x32_32( + p_out_base, + p_inp1_itr2, + p_inp2_itr2, + p_5d_out_shape[4], + p_5d_inp1_shape, + p_5d_inp2_shape); + p_out_base += p_5d_out_shape[4]; + p_inp1_itr2 += inp1_strides[3]; + p_inp2_itr2 += inp2_strides[3]; + } + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_mul_f32.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_mul_f32.c new file mode 100644 index 00000000000..6cd51f3de7e --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_mul_f32.c @@ -0,0 +1,625 @@ +/******************************************************************************* +* Copyright (c) 2024 Cadence Design Systems, Inc. 
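xa_nn_elm_mul_broadcast_5D_32x32_32 above, like its f32 counterpart later in this patch, pads every shape to five dimensions, zeroes the stride of each broadcast dimension, and then dispatches to a no-broadcast, scalar, 2D, or per-row 1D path. Ignoring those fast paths, the reference semantics are the plain strided loop below (a sketch under those assumptions, not the vendor code):

#include <stdint.h>

/* Reference semantics of the 5D broadcast multiply (illustrative sketch).
 * Shapes are padded to five dimensions and the stride of any broadcast
 * dimension is set to 0, exactly as the kernel does before dispatching. */
static void mul_broadcast_5d_ref(int32_t *out,
                                 const int32_t *in1, const int32_t *in2,
                                 const int32_t out_shape[5],
                                 const int32_t s1[5], const int32_t s2[5])
{
    int32_t idx = 0;
    for (int32_t d0 = 0; d0 < out_shape[0]; d0++)
     for (int32_t d1 = 0; d1 < out_shape[1]; d1++)
      for (int32_t d2 = 0; d2 < out_shape[2]; d2++)
       for (int32_t d3 = 0; d3 < out_shape[3]; d3++)
        for (int32_t d4 = 0; d4 < out_shape[4]; d4++)
        {
            out[idx++] =
                in1[d0*s1[0] + d1*s1[1] + d2*s1[2] + d3*s1[3] + d4*s1[4]] *
                in2[d0*s2[0] + d1*s2[1] + d2*s2[2] + d3*s2[3] + d4*s2[4]];
        }
}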
+* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_mul_scalar_f32xf32_f32(FLOAT32 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp1, + const FLOAT32 inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), UNSUPPORTED_PARAM); + + WORD32 n, m; + + /* Declaration of SIMD variables */ + xb_vecMxf32 x0 = 0, z0 = 0; + + /* Declaration of valign registers */ + valign ax, az; + + /* Initialization of SIMD pointers */ + const xb_vecMxf32 *p_x = (const xb_vecMxf32 *)p_inp1; + xb_vecMxf32 *restrict p_z = (xb_vecMxf32 *)p_out; + + /* Initialization of valign registers */ + ax = PDX_LA_MXF32_PP(p_x); + az = PDX_Z_ALIGN(); + + /* Vectorize the inp2 for SIMD operation */ + xb_vecMxf32 y0 = inp2; + + /* Unroll the loop by x4 for SIMD */ + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + PDX_LA_MXF32_IP(x0, ax, p_x); + z0 = PDX_MUL_MXF32(x0, y0); + PDX_SA_MXF32_IP(z0, az, p_z); + } + /* Remaining iterations */ + m = (num_elm & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MXF32_XP(x0, ax, p_x, m); + z0 = PDX_MUL_MXF32(x0, y0); + PDX_SAV_MXF32_XP(z0, az, p_z, m); + PDX_SAPOS_MXF32_FP(az, p_z); + + return 0; +} /* xa_nn_elm_mul_scalar_f32xf32_f32() */ + +WORD32 xa_nn_elm_mul_f32xf32_f32(FLOAT32 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp1, + const FLOAT32 *__restrict__ p_inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), UNSUPPORTED_PARAM); + + WORD32 n, m; + + /* Declaration of SIMD variables */ + xb_vecMxf32 x0 = 0, y0 = 0, z0 = 0; + + /* Declaration of valign registers */ + valign ax, ay, az; + + /* Initialization of SIMD pointers */ + const xb_vecMxf32 *__restrict__ p_x = (const 
xb_vecMxf32 *)p_inp1; + const xb_vecMxf32 *__restrict__ p_y = (const xb_vecMxf32 *)p_inp2; + xb_vecMxf32 *__restrict__ p_z = (xb_vecMxf32 *)p_out; + + /* Initialization of valign registers */ + ax = PDX_LA_MXF32_PP(p_x); + ay = PDX_LA_MXF32_PP(p_y); + az = PDX_Z_ALIGN(); + + /* Unroll the loop by x4 for SIMD */ + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + PDX_LA_MXF32_IP(x0, ax, p_x); + PDX_LA_MXF32_IP(y0, ay, p_y); + z0 = PDX_MUL_MXF32(x0, y0); + PDX_SA_MXF32_IP(z0, az, p_z); + } + /* Remaining iterations */ + m = (num_elm & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MXF32_XP(x0, ax, p_x, m); + PDX_LAV_MXF32_XP(y0, ay, p_y, m); + z0 = PDX_MUL_MXF32(x0, y0); + PDX_SAV_MXF32_XP(z0, az, p_z, m); + PDX_SAPOS_MXF32_FP(az, p_z); + + return 0; +} /* xa_nn_elm_mul_f32xf32_f32() */ + +static inline void shapes_convert_5D(WORD32 *const __restrict__ p_5d_out_shape, + WORD32 *const __restrict__ p_5d_inp1_shape, // new input1 shapes + WORD32 *const __restrict__ p_5d_inp2_shape, // new input2 shapes + const WORD32 *const __restrict__ p_out_shape, + const WORD32 *const __restrict__ p_inp1_shape, // original input1 shapes + const WORD32 *const __restrict__ p_inp2_shape, // original input1 shapes + const WORD32 num_inp_dims) +{ + /* convert the any dimension to 5D */ + for (WORD32 i = 0; i < num_inp_dims; i++) + { + p_5d_out_shape[i + MAX_DIMS - num_inp_dims] = p_out_shape[i]; + p_5d_inp1_shape[i + MAX_DIMS - num_inp_dims] = p_inp1_shape[i]; + p_5d_inp2_shape[i + MAX_DIMS - num_inp_dims] = p_inp2_shape[i]; + } +} + +static inline WORD32 check_shapes(const WORD32 *const p_inp1_shape, + const WORD32 *const p_inp2_shape, + const WORD32 *const p_out_shape) +{ + /* Check the shapes of input and output */ + for (WORD32 i = 0; i < MAX_DIMS; i++) + { + if (((p_inp1_shape[i] != p_inp2_shape[i]) + && (p_inp1_shape[i] != CONST_ONE) + && (p_inp2_shape[i] != CONST_ONE)) + || (p_out_shape[i] + != (p_inp1_shape[i] > p_inp2_shape[i] ? 
+ p_inp1_shape[i] : p_inp2_shape[i]))) + { + return UNSUPPORTED_PARAM; + } + } + return 0; +} + +static inline void strides_calculation( + const WORD32 *const inp1_shape, + const WORD32 *const inp2_shape, + WORD32 *const inp1_strides, + WORD32 *const inp2_strides) +{ + inp1_strides[MAX_DIMS - CONST_ONE] = CONST_ONE; + inp2_strides[MAX_DIMS - CONST_ONE] = CONST_ONE; + + /* Calculation of strides */ + for (WORD32 i = MAX_DIMS - CONST_TWO; i >= 0; i--) + { + inp1_strides[i] = inp1_strides[i + CONST_ONE] + * inp1_shape[i + CONST_ONE]; + inp2_strides[i] = inp2_strides[i + CONST_ONE] + * inp2_shape[i + CONST_ONE]; + } +} + +static inline void internal_elm_mul_broadcast_2D_f32xf32_f32( + FLOAT32 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp1, + const FLOAT32 *__restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + const WORD32 *input1_shapes, + const WORD32 *input2_shapes) +{ + WORD32 n, m; + + /* Declaration of SIMD variables */ + xb_vecMxf32 x0, y0, y1, z0, z1; + + /* Initialization of SIMD pointers */ + const xb_vecMxf32 *__restrict__ p_x; + + /* Declaration of valign registers */ + valign ax, ay0, ay1, az0, az1; + + FLOAT32 *pz_baseptr = &p_out[0]; + /* base address calculation for output */ + xb_vecMxf32 *__restrict__ p_z0 = (xb_vecMxf32*) &pz_baseptr[0]; + /* Middle address calculation for output */ + xb_vecMxf32 *__restrict__ p_z1 = (xb_vecMxf32*) (&pz_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + /* priming */ + az0 = PDX_Z_ALIGN(); + az1 = PDX_Z_ALIGN(); + + const FLOAT32 *px_baseptr; + xb_vecMxf32 *p_inp; + + /* pointer for base address for input1 */ + const xb_vecMxf32 *__restrict__ p_y0; + /* pointer for middle address for input1 */ + const xb_vecMxf32 *__restrict__ p_y1; + /* if the last dim of input1 itself is broadcastable */ + if (input1_shapes[3] == CONST_ONE) + { + p_x = (const xb_vecMxf32*) p_inp1; + p_inp = (xb_vecMxf32*) p_inp1; + px_baseptr = p_inp2; + /* base address calculation for input1 */ + p_y0 = (const xb_vecMxf32*) &px_baseptr[0]; + /* Middle address calculation for input1*/ + p_y1 = (const xb_vecMxf32*) (&px_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + + } + else /* if the last dim of input2 itself is broadcastable */ + { + p_x = (const xb_vecMxf32*) p_inp2; + p_inp = (xb_vecMxf32*) p_inp2; + px_baseptr = p_inp1; + /* base address calculation for input2 */ + p_y0 = (const xb_vecMxf32*) &px_baseptr[0]; + /* Middle address calculation for input2 */ + p_y1 = (const xb_vecMxf32*) (&px_baseptr[0] + + ((out_lc / CONST_TWO) * in_lc)); + } + /* priming */ + ax = PDX_LA_MXF32_PP(p_x); + ay0 = PDX_LA_MXF32_PP(p_y0); + ay1 = PDX_LA_MXF32_PP(p_y1); + + /* Unroll the loop by x2 for SIMD */ + for (WORD32 i = 0; i < out_lc - CONST_ONE; i += CONST_TWO) + { + /* unroll the loop by x4 for SIMD */ + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + /* load the 4 elements from input1 */ + PDX_LA_MXF32_IP(x0, ax, p_x); + + /* load the 4 elements from input2 base address */ + PDX_LA_MXF32_IP(y0, ay0, p_y0); + /* load the 4 elements from input2 Middle address */ + PDX_LA_MXF32_IP(y1, ay1, p_y1); + + /* Multiplication of x0 and y0 */ + z0 = PDX_MUL_MXF32(x0, y0); + /* Multiplication of x0 and y1 */ + z1 = PDX_MUL_MXF32(x0, y1); + + /* Store the output */ + PDX_SA_MXF32_IP(z0, az0, p_z0); + PDX_SA_MXF32_IP(z1, az1, p_z1); + } + /* Remaining iterations of inner loop */ + m = (in_lc & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MXF32_XP(x0, ax, p_x, m); + PDX_LAV_MXF32_XP(y0, ay0, p_y0, m); + PDX_LAV_MXF32_XP(y1, ay1, p_y1, m); + z0 = PDX_MUL_MXF32(x0, y0); + z1 = 
PDX_MUL_MXF32(x0, y1); + PDX_SAV_MXF32_XP(z0, az0, p_z0, m); + PDX_SAV_MXF32_XP(z1, az1, p_z1, m); + PDX_SAPOS_MXF32_FP(az0, p_z0); + PDX_SAPOS_MXF32_FP(az1, p_z1); + + /* Input1 Pointer updates to base address as input1 is broadcasted */ + p_x = (const xb_vecMxf32*) &p_inp[0]; + ax = PDX_LA_MXF32_PP(p_x); + } + /* loop through remaining iterations of outer loop */ + if (out_lc % CONST_TWO != 0) + { + /* unroll the loop by x4 for SIMD */ + for (n = 0; n < (in_lc >> LOG2_PDX_M); n++) + { + PDX_LA_MXF32_IP(y1, ay1, p_y1); + PDX_LA_MXF32_IP(x0, ax, p_x); + z0 = PDX_MUL_MXF32(x0, y1); + PDX_SA_MXF32_IP(z0, az1, p_z1); + } + /* Remaining iterations */ + m = (in_lc & (PDX_M - CONST_ONE)) * sizeof(*p_inp1); + PDX_LAV_MXF32_XP(y1, ay1, p_y1, m); + PDX_LAV_MXF32_XP(x0, ax, p_x, m); + z0 = PDX_MUL_MXF32(x0, y1); + PDX_SAV_MXF32_XP(z0, az1, p_z1, m); + PDX_SAPOS_MXF32_FP(az1, p_z1); + } +} + +static inline void internal_elm_mul_broadcast_1D_scalar_f32xf32_f32( + FLOAT32 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp1, + const FLOAT32 *__restrict__ p_inp2, + WORD32 num_elm, + const WORD32 *input1_shapes, + const WORD32 *input2_shapes) +{ + FLOAT32 elm; + FLOAT32 *p_elm; + + /* Declaration of SIMD variables */ + xb_vecMxf32 x0, z0; + valign ax, az; + WORD32 n, m = 0; + + /* if the last dim of input1 itself is broadcastable */ + if (input1_shapes[4] == CONST_ONE) + { + elm = p_inp1[0]; + p_elm = (FLOAT32 *)p_inp2; + } + else /* if the last dim of input2 itself is broadcastable */ + { + elm = p_inp2[0]; + p_elm = (FLOAT32 *)p_inp1; + } + + /* Initialization of SIMD pointers */ + const xb_vecMxf32 *p_x = (const xb_vecMxf32 *)p_elm; + xb_vecMxf32 *restrict p_z = (xb_vecMxf32 *)p_out; + + /* Initialization of valign registers */ + ax = PDX_LA_MXF32_PP(p_x); + az = PDX_Z_ALIGN(); + + /* vectorize the elm for SIMD */ + xb_vecMxf32 y0 = elm; + + if((num_elm & IS_NOT_32_MULTIPLE) == 0) + { + /* unroll the loop by x4 for SIMD */ + for (n = 0; n < (num_elm >> LOG2_PDX_M); n++) + { + PDX_LA_MXF32_IP(x0, ax, p_x); + z0 = PDX_MUL_MXF32(x0, y0); + PDX_SA_MXF32_IP(z0, az, p_z); + } + } + else + { + + /* unroll the loop by x8 for SIMD */ + for (n = 0; n < (num_elm >> LOG2_PDX_2M); n++) + { + PDX_LA_MXF32_IP(x0, ax, p_x); + z0 = PDX_MUL_MXF32(x0, y0); + PDX_SA_MXF32_IP(z0, az, p_z); + + PDX_LA_MXF32_IP(x0, ax, p_x); + z0 = PDX_MUL_MXF32(x0, y0); + PDX_SA_MXF32_IP(z0, az, p_z); + } + num_elm = num_elm - (n - CONST_ONE) * LOOP_UNROLL_BY_8; + if(num_elm >> LOG2_PDX_M) + { + PDX_LA_MXF32_IP(x0, ax, p_x); + z0 = PDX_MUL_MXF32(x0, y0); + PDX_SA_MXF32_IP(z0, az, p_z); + } + + /* Remaining iterations */ + m = (num_elm & (PDX_M - CONST_ONE)) * sizeof(*p_elm); + PDX_LAV_MXF32_XP(x0, ax, p_x, m); + z0 = PDX_MUL_MXF32(x0, y0); + PDX_SAV_MXF32_XP(z0, az, p_z, m); + PDX_SAPOS_MXF32_FP(az, p_z); + } +} + +WORD32 xa_nn_elm_mul_broadcast_5D_f32xf32_f32(FLOAT32 *__restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 *__restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 *__restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + WORD32 num_inp_dims) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), 
UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* UNSUPPORTED_PARAM input checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || + (num_inp_dims > MAX_DIMS)), UNSUPPORTED_PARAM); + + /* 5D shapes initialization */ + WORD32 p_5d_out_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE}; + WORD32 p_5d_inp1_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE}; + WORD32 p_5d_inp2_shape[MAX_DIMS] = {CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE, CONST_ONE}; + + shapes_convert_5D(p_5d_out_shape, p_5d_inp1_shape, p_5d_inp2_shape, + p_out_shape, p_inp1_shape, p_inp2_shape, num_inp_dims); + + /* Check shapes for broadcast compatibility */ + WORD32 error = 0; + error = check_shapes(p_5d_inp1_shape, p_5d_inp2_shape, p_5d_out_shape); + if (error) + { + return UNSUPPORTED_PARAM; + } + + /* strides calculation */ + WORD32 inp1_strides[MAX_DIMS], inp2_strides[MAX_DIMS]; + strides_calculation(p_5d_inp1_shape, p_5d_inp2_shape, inp1_strides, + inp2_strides); + + /* check for broadcast need */ + WORD32 need_broadcast = 0; + WORD32 inp1_const = CONST_ONE, inp2_const = CONST_ONE; + for (int i = 0; i < MAX_DIMS; i++) + { + if (p_5d_inp1_shape[i] != p_5d_inp2_shape[i]) + { + if (p_5d_inp1_shape[i] == CONST_ONE) + { + inp1_strides[i] = 0; + } + else + { + inp2_strides[i] = 0; + } + need_broadcast = CONST_ONE; + } + + if (p_5d_inp1_shape[i] != CONST_ONE) + inp1_const &= 0; + if (p_5d_inp2_shape[i] != CONST_ONE) + inp2_const &= 0; + } + + const FLOAT32 *__restrict__ p_inp1_base = p_inp1; + const FLOAT32 *__restrict__ p_inp2_base = p_inp2; + FLOAT32 *p_out_base = p_out; + + /* if broadcast is not needed */ + if (need_broadcast == 0) + { + xa_nn_elm_mul_f32xf32_f32( + p_out_base, + p_inp1_base, + p_inp2_base, + p_5d_out_shape[0] * inp1_strides[0]); + } + + /* if broadcast is needed */ + else if (inp1_const == CONST_ONE || inp2_const == CONST_ONE) + { + FLOAT32 elm; + FLOAT32 *p_elm; + + if (inp1_const == CONST_ONE) + { + elm = p_inp1[0]; + p_elm = (FLOAT32 *)p_inp2; + } + else + { + elm = p_inp2[0]; + p_elm = (FLOAT32 *)p_inp1; + } + + WORD32 elm_num = p_5d_out_shape[0] * p_5d_out_shape[1] * + p_5d_out_shape[2] * p_5d_out_shape[3] * p_5d_out_shape[4]; + + xa_nn_elm_mul_scalar_f32xf32_f32( + p_out_base, + p_elm, + elm, + elm_num); + } + /* check if 4th dim in both inputs is the same */ + else if (inp1_strides[4] == inp2_strides[4]) + { + WORD32 in_lc, out_lc; + /* check if 3rd dim needs to be broadcasted */ + if (inp1_strides[3] == 0 || inp2_strides[3] == 0) + { + /* Repeat the 4th dimension as the + * 3rd dimension needs to be broadcasted + */ + in_lc = p_5d_out_shape[4]; + out_lc = p_5d_out_shape[3]; + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const FLOAT32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const FLOAT32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + internal_elm_mul_broadcast_2D_f32xf32_f32( + p_out_base, + p_inp1_itr1, + p_inp2_itr1, + out_lc, + in_lc, + p_5d_inp1_shape, + 
p_5d_inp2_shape); + p_out_base += in_lc * out_lc; + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + else + { + /* 3rd and 4th dimensions need not be broadcasted. The lower + * dimension broadcasting (0th, 1st, 2nd) will be taken care + * while calculating the input addresses */ + in_lc = p_5d_out_shape[3] * p_5d_out_shape[4]; + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const FLOAT32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const FLOAT32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + xa_nn_elm_mul_f32xf32_f32( + p_out_base, + p_inp1_itr1, + p_inp2_itr1, + in_lc); + p_out_base += in_lc; + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + } + else + { + /* if the last dim itself is broadcastable */ + for (WORD32 itr0 = 0; itr0 < p_5d_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_itr0 = p_inp1_base; + const FLOAT32 *__restrict__ p_inp2_itr0 = p_inp2_base; + for (WORD32 itr1 = 0; itr1 < p_5d_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_itr1 = p_inp1_itr0; + const FLOAT32 *__restrict__ p_inp2_itr1 = p_inp2_itr0; + for (WORD32 itr2 = 0; itr2 < p_5d_out_shape[2]; itr2++) + { + const FLOAT32 *__restrict__ p_inp1_itr2 = p_inp1_itr1; + const FLOAT32 *__restrict__ p_inp2_itr2 = p_inp2_itr1; + for (WORD32 itr3 = 0; itr3 < p_5d_out_shape[3]; itr3++) + { + internal_elm_mul_broadcast_1D_scalar_f32xf32_f32( + p_out_base, + p_inp1_itr2, + p_inp2_itr2, + p_5d_out_shape[4], + p_5d_inp1_shape, + p_5d_inp2_shape); + p_out_base += p_5d_out_shape[4]; + p_inp1_itr2 += inp1_strides[3]; + p_inp2_itr2 += inp2_strides[3]; + } + p_inp1_itr1 += inp1_strides[2]; + p_inp2_itr1 += inp2_strides[2]; + } + p_inp1_itr0 += inp1_strides[1]; + p_inp2_itr0 += inp2_strides[1]; + } + p_inp1_base += inp1_strides[0]; + p_inp2_base += inp2_strides[0]; + } + } + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym16.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym16.c new file mode 100644 index 00000000000..fd78f1bc5f0 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym16.c @@ -0,0 +1,219 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
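To make the calling convention concrete, here is a hypothetical use of xa_nn_elm_mul_broadcast_5D_f32xf32_f32 defined above, multiplying a 2x3 tensor by a 1x3 row that is broadcast across the rows. The buffer contents, the helper name, and the assumption that xa_nnlib_kernels_api.h declares the kernel and the FLOAT32/WORD32 types are illustrative:

#include "xa_nnlib_kernels_api.h"   /* assumed to declare the kernel and types */

/* Hypothetical usage sketch. Returns 0 on success, UNSUPPORTED_PARAM on
 * invalid arguments, per the checks at the top of the kernel. */
static WORD32 example_broadcast_mul(void)
{
    FLOAT32 a[6]   = {1, 2, 3, 4, 5, 6};
    FLOAT32 b[3]   = {10, 20, 30};
    FLOAT32 out[6];
    WORD32 a_shape[2]   = {2, 3};
    WORD32 b_shape[2]   = {1, 3};
    WORD32 out_shape[2] = {2, 3};

    return xa_nn_elm_mul_broadcast_5D_f32xf32_f32(
        out, out_shape, a, a_shape, b, b_shape, /* num_inp_dims */ 2);
}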
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_asym16(WORD16 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 *p_out_zero_bias, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_zero_bias, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD16), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= -32768 + * quant_max should be <= 32767 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < INT16_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > INT16_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero. + * out_zero_bias should be in the range [-32768,32767] + */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[0] < INT16_LOWER_LIMIT) || + (p_out_zero_bias[0] > INT16_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + /* Invalid input checks. + * axis should be in the range [0,num_inp_dims-1]. + * out_scale should not be equal to zero. + * out_zero_bias should be in the range [-32768,32767]. 
+ */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[i] < INT16_LOWER_LIMIT) || + (p_out_zero_bias[0] > INT16_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + xb_vecMx32 d_out_zero_bias; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + WORD16 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMx16 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_16 = num_scalar_ops * SIZE_OF_INT16; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = quant_min; + xb_vecMx32 max = quant_max; + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + d_out_zero_bias = p_out_zero_bias[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. 
+ * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMx16*) out_base; + + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + PDX_SA32_MX16_IP(clamped, align_out, out_base_p); + + } + + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAV32_MX16_XP(clamped, align_out, out_base_p, m_16); + PDX_SAPOS_MX16_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + return 0; +} + diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym16u.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym16u.c new file mode 100644 index 00000000000..e55e202c288 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym16u.c @@ -0,0 +1,221 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
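The vector loop in xa_nn_elm_quantize_f32_asym16 above maps each float to q = clamp(round(x / out_scale) + out_zero_bias, quant_min, quant_max), with round-to-nearest selected through the SCF register. A scalar sketch of the same arithmetic (illustrative, not the vendor code):

#include <math.h>
#include <stdint.h>

/* Scalar sketch of the asym16 quantize step:
 * q = clamp(round(x / out_scale) + out_zero_bias, quant_min, quant_max). */
static int16_t quantize_f32_asym16_ref(float x, float out_scale,
                                       int32_t out_zero_bias,
                                       int32_t quant_min, int32_t quant_max)
{
    int32_t q = (int32_t)lrintf(x / out_scale) + out_zero_bias;
    if (q < quant_min) { q = quant_min; }
    if (q > quant_max) { q = quant_max; }
    return (int16_t)q;
}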
+ ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_asym16u(UWORD16 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 *p_out_zero_bias, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_zero_bias, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD16), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= 0 + * quant_max should be <= 65535 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < UINT16_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > UINT16_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero + * out_zero_bias should be in the range [0,65535] + */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[0] < UINT16_LOWER_LIMIT) || + (p_out_zero_bias[0] > UINT16_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + * out_zero_bias should be in the range [0,65535] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[i] < UINT16_LOWER_LIMIT) || + (p_out_zero_bias[0] > UINT16_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + xb_vecMx32 d_out_zero_bias; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + UWORD16 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMxu16 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_16 = num_scalar_ops * SIZE_OF_INT16; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = quant_min; + xb_vecMx32 max = quant_max; + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + d_out_zero_bias = p_out_zero_bias[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. + * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMxu16*) out_base; + + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAU32_MX16_IP(clamped, align_out, out_base_p); + + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAVU32_MX16_XP(clamped, align_out, out_base_p, m_16); + PDX_SAPOS_MXU16_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym4.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym4.c new file mode 100644 index 00000000000..1b4f3e26f73 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym4.c @@ -0,0 +1,219 @@ 
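When p_axis is non-NULL, each of these quantize kernels decomposes the tensor into leading_dims x axis_count x trailing_dims, applies one scale/zero_bias pair per index along the axis, and advances by length_per_step = p_inp_shape[axis] * trailing_dims between blocks that share a channel. A scalar sketch of that traversal for the 16-bit case (illustrative names, not the vendor code):

#include <math.h>
#include <stdint.h>

/* Sketch of the per-axis traversal shared by these quantize kernels when
 * p_axis is non-NULL. Each index along the axis ("channel") has its own
 * scale and zero_bias; trailing contiguous elements are handled per step,
 * and step elements are skipped to reach the next block of that channel. */
static void quantize_per_axis_ref(int16_t *out, const float *inp,
                                  const int32_t *shape, int32_t ndim,
                                  int32_t axis,
                                  const float *scales,
                                  const int32_t *zero_biases,
                                  int32_t qmin, int32_t qmax)
{
    int32_t leading = 1, trailing = 1;
    for (int32_t i = 0; i < axis; i++)
    {
        leading *= shape[i];
    }
    for (int32_t i = axis + 1; i < ndim; i++)
    {
        trailing *= shape[i];
    }
    int32_t step = shape[axis] * trailing;           /* length_per_step */

    for (int32_t c = 0; c < shape[axis]; c++)        /* channel (axis index) */
    {
        for (int32_t l = 0; l < leading; l++)        /* leading dims */
        {
            for (int32_t t = 0; t < trailing; t++)   /* contiguous run */
            {
                int32_t off = l * step + c * trailing + t;
                int32_t q = (int32_t)lrintf(inp[off] / scales[c])
                            + zero_biases[c];
                q = q < qmin ? qmin : (q > qmax ? qmax : q);
                out[off] = (int16_t)q;
            }
        }
    }
}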
+/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_asym4(WORD8 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 *p_out_zero_bias, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_zero_bias, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= -8, + * quant_max should be <= 7, + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < INT4_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > INT4_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero, + * out_zero_bias should be in the range [-8,7] + */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[0] < INT4_LOWER_LIMIT) || + (p_out_zero_bias[0] > INT4_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + /* Invalid input checks + * axis 
should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + * out_zero_bias should be in the range [-8,7] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[i] < INT4_LOWER_LIMIT) || + (p_out_zero_bias[0] > INT4_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + xb_vecMx32 d_out_zero_bias; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + WORD8 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMx8 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_8 = num_scalar_ops * SIZE_OF_INT8; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = PDX_SLLI_MX32(quant_min, SHIFT_FACTOR_4_BIT); + xb_vecMx32 max = PDX_SLLI_MX32(quant_max, SHIFT_FACTOR_4_BIT); + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + d_out_zero_bias = PDX_SLLI_MX32(p_out_zero_bias[axis_index], + SHIFT_FACTOR_4_BIT); + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. 
+ * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMx8*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, SHIFT_FACTOR_4_BIT); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SA32_MX8_IP(clamped, align_out, out_base_p); + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, SHIFT_FACTOR_4_BIT); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAV32_MX8_XP(clamped, align_out, out_base_p, m_8); + PDX_SAPOS_MX8_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym4u.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym4u.c new file mode 100644 index 00000000000..482dab94d2e --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym4u.c @@ -0,0 +1,220 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
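The signed 4-bit variant above differs from the 16-bit kernels mainly in its limits (INT4_LOWER_LIMIT/INT4_UPPER_LIMIT) and in the SHIFT_FACTOR_4_BIT pre-shifts applied to the clamp limits and zero bias. A scalar view of that clamp stage, under the assumption that PDX_TRUNC32_MXF32(v, n) yields an integer with n fractional bits, i.e. trunc(v * 2^n) (sketch only, not the vendor code):

#include <math.h>
#include <stdint.h>

/* Scalar view of the clamp stage in the signed 4-bit quantizer, assuming
 * the truncation intrinsic produces a value with 4 fractional bits. The
 * zero bias and clamp limits are pre-shifted by 4 so that all comparisons
 * happen in the same fixed-point domain; the result below is therefore
 * still scaled by 16, as it is in the vector register. */
static int32_t quantize_f32_asym4_fx_ref(float x, float scale,
                                         int32_t zero_bias,
                                         int32_t qmin, int32_t qmax)
{
    int32_t v_fx  = (int32_t)lrintf(x / scale) << 4;  /* 4 fractional bits */
    int32_t zb_fx = zero_bias << 4;
    int32_t lo    = qmin << 4;
    int32_t hi    = qmax << 4;

    int32_t q_fx = v_fx + zb_fx;
    if (q_fx < lo) { q_fx = lo; }
    if (q_fx > hi) { q_fx = hi; }
    return q_fx;
}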
+ ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_asym4u(UWORD8 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 *p_out_zero_bias, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_zero_bias, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= 0 + * quant_max should be <= 15 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < UINT4_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > UINT4_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero + * out_zero_bias should be in the range [0,15] + */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[0] < UINT4_LOWER_LIMIT) || + (p_out_zero_bias[0] > UINT4_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + * out_zero_bias should be in the range [0,15] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); // + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[i] < UINT4_LOWER_LIMIT) || + (p_out_zero_bias[0] > UINT4_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
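+     * For example, with p_inp_shape = {2, 3, 4} and axis = 1:
+     * leading_dims = 2, trailing_dims = num_elm = 4, axis_count = 3 and
+     * length_per_step = 3 * 4 = 12, so each (scale, zero_bias) pair is
+     * applied to two blocks of 4 contiguous elements that are 12 elements
+     * apart.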
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + xb_vecMx32 d_out_zero_bias; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + UWORD8 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMxu8 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_8 = num_scalar_ops * SIZE_OF_INT8; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = PDX_SLLI_MX32(quant_min, SHIFT_FACTOR_4_BIT); + xb_vecMx32 max = PDX_SLLI_MX32(quant_max, SHIFT_FACTOR_4_BIT); + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + d_out_zero_bias = PDX_SLLI_MX32(p_out_zero_bias[axis_index], + SHIFT_FACTOR_4_BIT); + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims + * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMxu8*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, SHIFT_FACTOR_4_BIT); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAU32_MX8_IP(clamped, align_out, out_base_p); + + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, SHIFT_FACTOR_4_BIT); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAVU32_MX8_XP(clamped, align_out, out_base_p, m_8); + PDX_SAPOS_MXU8_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym8.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym8.c new file mode 100644 index 00000000000..581e8a10de3 --- /dev/null +++ 
b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym8.c @@ -0,0 +1,219 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_asym8(WORD8 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 *p_out_zero_bias, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_zero_bias, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= -128 + * quant_max should be <= 127 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < INT8_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > INT8_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero + * out_zero_bias should be in the range [-128,127] + */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[0] < INT8_LOWER_LIMIT) || + (p_out_zero_bias[0] > INT8_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; 
i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + * out_zero_bias should be in the range [-128,127] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[i] < INT8_LOWER_LIMIT) || + (p_out_zero_bias[0] > INT8_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + xb_vecMx32 d_out_zero_bias; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + WORD8 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMx8 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_8 = num_scalar_ops * SIZE_OF_INT8; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = quant_min; + xb_vecMx32 max = quant_max; + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + d_out_zero_bias = p_out_zero_bias[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. 
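+         * The inner loop below consumes num_simd4_ops full vectors of PDX_M
+         * floats; the remaining num_scalar_ops elements are handled once
+         * after it by the variable-length load/store pair, bounded by the
+         * byte counts m_32 (input floats) and m_8 (output bytes).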
+ * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMx8*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SA32_MX8_IP(clamped, align_out, out_base_p); + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAV32_MX8_XP(clamped, align_out, out_base_p, m_8); + PDX_SAPOS_MX8_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym8u.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym8u.c new file mode 100644 index 00000000000..a4f57a82a2c --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_asym8u.c @@ -0,0 +1,220 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_asym8u(UWORD8 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 *p_out_zero_bias, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_zero_bias, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_zero_bias, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= 0 + * quant_max should be <= 255 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < UINT8_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > UINT8_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero + * out_zero_bias should be in the range [0,255] + */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[0] < UINT8_LOWER_LIMIT) || + (p_out_zero_bias[0] > UINT8_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + * out_zero_bias should be in the range [0,255] + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND( + ((p_out_zero_bias[i] < UINT8_LOWER_LIMIT) || + (p_out_zero_bias[0] > UINT8_UPPER_LIMIT)), + UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale and zero_bias values to get + * the next base addresses. 
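+     * When p_axis is NULL the whole tensor is treated as one channel:
+     * axis_count and leading_dims stay CONST_ONE, length_per_step stays 0
+     * and num_elm is the total element count, so the loops below make a
+     * single pass using p_out_scale[0] and p_out_zero_bias[0].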
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + xb_vecMx32 d_out_zero_bias; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + UWORD8 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMxu8 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_8 = num_scalar_ops * SIZE_OF_INT8; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = quant_min; + xb_vecMx32 max = quant_max; + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + d_out_zero_bias = p_out_zero_bias[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. + * All the elements are quantized at a time for + * single scale and zero_bias once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMxu8*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAU32_MX8_IP(clamped, align_out, out_base_p); + + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + d_out32 = PDX_ADD_MX32(d_out32, d_out_zero_bias); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAVU32_MX8_XP(clamped, align_out, out_base_p, m_8); + PDX_SAPOS_MXU8_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym16.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym16.c new file mode 100644 index 00000000000..c37ffcb0205 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym16.c @@ -0,0 +1,199 @@ 
+/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_sym16(WORD16 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD16), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= -32768 + * quant_max should be <= 32767 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < INT16_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > INT16_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 
i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale value to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + WORD16 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMx16 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_16 = num_scalar_ops * SIZE_OF_INT16; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = quant_min; + xb_vecMx32 max = quant_max; + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. 
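+         * The symmetric variant has no zero_bias term, so each element is
+         * just round(inp / out_scale) clamped to [quant_min, quant_max] and
+         * written through the 16-bit store path (tail byte count m_16
+         * instead of m_8).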
+ * All the elements are quantized at a time for + * single scale once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMx16*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SA32_MX16_IP(clamped, align_out, out_base_p); + + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAV32_MX16_XP(clamped, align_out, out_base_p, m_16); + PDX_SAPOS_MX16_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym16u.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym16u.c new file mode 100644 index 00000000000..9503ad9bf25 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym16u.c @@ -0,0 +1,201 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_sym16u(UWORD16 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD16), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= 0 + * quant_max should be <= 65535 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < UINT16_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > UINT16_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale value to get + * the next base addresses. 
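+     * Just before the main loops the kernel saves the SCF register
+     * (PDX_MOV32_SCF), masks its rounding-mode field to zero so the
+     * float-to-integer conversion rounds to the nearest integer, and
+     * restores the saved value just before returning.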
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + UWORD16 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMxu16 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_16 = num_scalar_ops * SIZE_OF_INT16; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = quant_min; + xb_vecMx32 max = quant_max; + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. + * All the elements are quantized at a time for + * single scale once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMxu16*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAU32_MX16_IP(clamped, align_out, out_base_p); + + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAVU32_MX16_XP(clamped, align_out, out_base_p, m_16); + PDX_SAPOS_MXU16_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym4.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym4.c new file mode 100644 index 00000000000..ab9d907b327 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym4.c @@ -0,0 +1,203 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_sym4(WORD8 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= -8 + * quant_max should be <= 7 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < INT4_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > INT4_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* + * out_scale should not be equal to zero + */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < 
num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale value to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + WORD8 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMx8 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_8 = num_scalar_ops * SIZE_OF_INT8; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = PDX_SLLI_MX32(quant_min, SHIFT_FACTOR_4_BIT); + xb_vecMx32 max = PDX_SLLI_MX32(quant_max, SHIFT_FACTOR_4_BIT); + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. 
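+         * Although the quantized values fit in 4 bits, each result occupies
+         * a full output byte here (the output stride and the tail byte
+         * count m_8 are both in units of SIZE_OF_INT8); the limits are
+         * handled in the SHIFT_FACTOR_4_BIT scaled domain as in the other
+         * 4-bit kernels.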
+ * All the elements are quantized at a time for + * single scale once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMx8*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, SHIFT_FACTOR_4_BIT); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SA32_MX8_IP(clamped, align_out, out_base_p); + + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, SHIFT_FACTOR_4_BIT); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAV32_MX8_XP(clamped, align_out, out_base_p, m_8); + PDX_SAPOS_MX8_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym4u.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym4u.c new file mode 100644 index 00000000000..4eb70e5e82f --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym4u.c @@ -0,0 +1,201 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_sym4u(UWORD8 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= 0 + * quant_max should be <= 15 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < UINT4_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > UINT4_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale value to get + * the next base addresses. 
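+     * As a quick numeric check: with out_scale = 0.5, an input of 2.6
+     * gives round(2.6 / 0.5) = round(5.2) = 5, which lies inside the
+     * unsigned 4-bit range, while an input of 9.0 gives 18 and is clamped
+     * to quant_max.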
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + UWORD8 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMxu8 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_8 = num_scalar_ops * SIZE_OF_INT8; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = PDX_SLLI_MX32(quant_min, SHIFT_FACTOR_4_BIT); + xb_vecMx32 max = PDX_SLLI_MX32(quant_max, SHIFT_FACTOR_4_BIT); + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. + * All the elements are quantized at a time for + * single scale once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMxu8*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, SHIFT_FACTOR_4_BIT); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAU32_MX8_IP(clamped, align_out, out_base_p); + + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, SHIFT_FACTOR_4_BIT); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAVU32_MX8_XP(clamped, align_out, out_base_p, m_8); + PDX_SAPOS_MXU8_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym8.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym8.c new file mode 100644 index 00000000000..34bf8ad7531 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym8.c @@ -0,0 +1,200 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_sym8(WORD8 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= -128 + * quant_max should be <= 127 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < INT8_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > INT8_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < 
num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale value to get + * the next base addresses. + */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + WORD8 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMx8 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_8 = num_scalar_ops * SIZE_OF_INT8; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = quant_min; + xb_vecMx32 max = quant_max; + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. 
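+         * When num_elm is a multiple of PDX_M the tail code after the inner
+         * loop still executes, but m_32 and m_8 are then zero, so the
+         * variable-length load and store are bounded to zero bytes and no
+         * extra elements are read or written.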
+ * All the elements are quantized at a time for + * single scale once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMx8*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SA32_MX8_IP(clamped, align_out, out_base_p); + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAV32_MX8_XP(clamped, align_out, out_base_p, m_8); + PDX_SAPOS_MX8_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym8u.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym8u.c new file mode 100644 index 00000000000..8d52610aa2b --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/basic/xa_nn_elm_quantize_f32_sym8u.c @@ -0,0 +1,200 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ ******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_elm_quantize_f32_sym8u(UWORD8 *__restrict__ p_out, + const FLOAT32 *__restrict__ p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 *p_axis, + FLOAT32 *p_out_scale, + WORD32 quant_min, + WORD32 quant_max) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_scale, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_scale, sizeof(FLOAT32), UNSUPPORTED_PARAM); + + /* Invalid input checks + * quant_min should be >= 0 + * quant_max should be <= 255 + * num_inp_dims should be greater than 0 and less than or equal to 5 + */ + XA_NNLIB_ARG_CHK_COND((quant_min < UINT8_LOWER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max > UINT8_UPPER_LIMIT), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((quant_max < quant_min), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > MAX_DIMS)), + UNSUPPORTED_PARAM); + + /* Number of elements to be processed with a stride of 1 */ + WORD32 num_elm = CONST_ONE; + /* Number of leading dimensions of axis */ + WORD32 leading_dims = CONST_ONE; + /* Number of trailing dimensions of axis */ + WORD32 trailing_dims = CONST_ONE; + WORD32 length_per_step = 0; + WORD32 axis_count = CONST_ONE; + + if (p_axis == NULL) + { + /* out_scale should not be equal to zero */ + XA_NNLIB_ARG_CHK_COND((0 == *p_out_scale), UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < num_inp_dims; i++) + { + num_elm *= p_inp_shape[i]; + } + } + else + { + WORD32 axis = *p_axis; + /* Invalid input checks + * axis should be in the range [0,num_inp_dims-1] + * out_scale should not be equal to zero + */ + XA_NNLIB_ARG_CHK_COND(((axis < 0) || (axis >= num_inp_dims)), + UNSUPPORTED_PARAM); + for (WORD32 i = 0; i < p_inp_shape[axis]; i++) + { + XA_NNLIB_ARG_CHK_COND((0 == p_out_scale[i]), UNSUPPORTED_PARAM); + } + + /* Calculating leading dims */ + for (WORD32 i = 0; i < axis; i++) + { + leading_dims *= p_inp_shape[i]; + } + + /* Calculating trailing dims */ + for (WORD32 i = axis + CONST_ONE; i < num_inp_dims; i++) + { + trailing_dims *= p_inp_shape[i]; + } + + num_elm = trailing_dims; + + /* Number of elements to be skipped after trailing number of + * elements quantized with a scale value to get + * the next base addresses. 
+ */ + length_per_step = p_inp_shape[axis] * trailing_dims; + + /* Length of the dimension along axis */ + axis_count = p_inp_shape[axis]; + } + + xb_vecMxf32 d_inp, d_out_scale; + + /* Base pointers that points to the first element in the channel */ + const FLOAT32 *__restrict__ inp_base; + UWORD8 *__restrict__ out_base; + + /* Vector pointers for the base pointers */ + xb_vecMxf32 *__restrict__ inp_base_p; + xb_vecMxu8 *__restrict__ out_base_p; + + /* Calculating number of simd and scalar operations */ + WORD32 num_simd4_ops = (num_elm >> LOG2_PDX_M); + WORD32 num_scalar_ops = (num_elm & (PDX_M - CONST_ONE)); + + /* Calculating multiples of 32-bits and 16-bits */ + WORD32 m_32 = num_scalar_ops * SIZE_OF_FLOAT; + WORD32 m_8 = num_scalar_ops * SIZE_OF_INT8; + + valign align_inp, align_out; + align_out = PDX_Z_ALIGN(); + + xb_vecMxf32 d_inp_t; + + xb_vecMx32 d_out32, clamped; + xb_vecMx32 min = quant_min; + xb_vecMx32 max = quant_max; + + xb_vecMxf32 d_one_over_out_scale, d_one = PDX_CONST_MXF32(CONST_ONE); + + /* Setting rounding mode to zero - rounding to nearest integer */ + xb_int32 actual_scf = PDX_MOV32_SCF(); + xb_int32 converted_scf = PDX_AND_32(actual_scf, 0xFFFFFCFF); + PDX_MOVSCF_32(converted_scf); + + /* Outermost loop iterates over the channels */ + for (WORD32 axis_index = 0; axis_index < axis_count; axis_index++) + { + d_out_scale = p_out_scale[axis_index]; + inp_base = p_inp + (axis_index * trailing_dims); + out_base = p_out + (axis_index * trailing_dims); + + d_one_over_out_scale = PDX_DIV_MXF32(d_one, d_out_scale); + + /* This loop iterates over the leading dims. + * All the elements are quantized at a time for + * single scale once loaded + */ + for (WORD32 leading_dims_index = 0; leading_dims_index < leading_dims; + leading_dims_index++) + { + inp_base_p = (xb_vecMxf32*) inp_base; + align_inp = PDX_LA_MXF32_PP(inp_base_p); + out_base_p = (xb_vecMxu8*) out_base; + for (WORD32 i = 0; i < num_simd4_ops; i++) + { + PDX_LA_MXF32_IP(d_inp, align_inp, inp_base_p); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAU32_MX8_IP(clamped, align_out, out_base_p); + + } + PDX_LAV_MXF32_XP(d_inp, align_inp, inp_base_p, m_32); + d_inp_t = PDX_MUL_MXF32(d_inp, d_one_over_out_scale); + d_inp_t = PDX_FIRINT_MXF32(d_inp_t); + d_out32 = PDX_TRUNC32_MXF32(d_inp_t, 0); + clamped = PDX_MIN_MX32(d_out32, max); + clamped = PDX_MAX_MX32(clamped, min); + + PDX_SAVU32_MX8_XP(clamped, align_out, out_base_p, m_8); + PDX_SAPOS_MXU8_FP(align_out, out_base_p); + + inp_base = inp_base + length_per_step; + out_base = out_base + length_per_step; + + } + } + + /* Resetting the original scf */ + PDX_MOVSCF_32(actual_scf); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/norm/xa_nn_layer_norm.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/norm/xa_nn_layer_norm.c new file mode 100644 index 00000000000..441204f54c9 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/norm/xa_nn_layer_norm.c @@ -0,0 +1,845 @@ +/******************************************************************************* +* Copyright (c) 2024 Cadence Design Systems, Inc. 
+* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +/* + * + * Mean = (x0 + x1+ .. +xn-1)/n + * Variance = ((x0*x0 + x1*x1 + .. +xn-1*xn-1)/n - (Mean*Mean)) + * std = sqrt(Variance + eps) + * norm = (((x - Mean)/std) * weight) + bias + * + * */ + +WORD32 xa_nn_native_layer_norm_f32_f32(FLOAT32 *p_out, + FLOAT32 *p_mean, + FLOAT32 *p_std, + const FLOAT32 *p_inp, + const WORD32 *const p_inp_shape, + WORD32 num_inp_dims, + WORD32 axis, + const FLOAT32 *p_weight, + const FLOAT32 *p_bias, + FLOAT32 eps) +{ + + WORD32 i, j, m; +#ifdef ENABLE_HIGH_PRECISION + xtfloat *p_a0 = (xtfloat *)p_inp; +#endif + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_inp, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_weight, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_bias, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_mean, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_std, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, UNSUPPORTED_PARAM); + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_weight, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_mean, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_std, sizeof(FLOAT32), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((eps <= 0), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((num_inp_dims <= 0), UNSUPPORTED_PARAM); + + XA_NNLIB_ARG_CHK_COND((axis < 0), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((axis >= num_inp_dims), UNSUPPORTED_PARAM); + + const xb_vecMxf32 *restrict p_in_mxf32 = (const xb_vecMxf32 *)p_inp; + const xb_vecMxf32 *restrict p_in1_mxf32 = p_in_mxf32; + xb_vecMxf32 *restrict p_out_mxf32 = (xb_vecMxf32 *)p_out; + xb_vecMxf32 *restrict p_rstd_mxf32 = (xb_vecMxf32 *)p_std; + xb_vecMxf32 *restrict p_mean_mxf32 = (xb_vecMxf32 *)p_mean; + const xb_vecMxf32 *restrict p_weight_mxf32; + const xb_vecMxf32 *restrict p_bias_mxf32; + + /* Initialize number of elements of leading and normalized shapes */ + WORD32 leading_dim = CONST_ONE; + WORD32 norm_dim = CONST_ONE; + + /* Calculate number of elements of leading dimensions */ + for (int i = 0; i < axis; i++) + { + leading_dim *= 
p_inp_shape[i]; + } + + /* Calculate number of elements of the shape to be normalized */ + for (int i = axis; i < num_inp_dims; i++) + { + norm_dim *= p_inp_shape[i]; + } + + if (!leading_dim) + { + return UNSUPPORTED_PARAM; + } + + if (!norm_dim) + { + valign ax, ay; + xb_vecMxf32 *restrict p_mean_mxf32 = (xb_vecMxf32 *)p_mean; + /* Zeroing align registers */ + ax = PDX_Z_ALIGN(); + ay = PDX_Z_ALIGN(); + + FLOAT32 rstd = NAN; + + /* Initialize the mean output with 0 */ + xb_vecMxf32 x; + x = PDX_ZERO_MXF32(); + + /* Initialize the inverse std output with NAN */ + xb_vecMxf32 y = PDX_MOV_MXF32_FROM_F32(rstd); + y = PDX_REP_MXF32(y, 0); + + /* loop runs for leading_dim/4 iterations */ + for (i = 0; i < (leading_dim >> LOG2_PDX_M); i++) + { + /* mean and inverse std values are being stored */ + PDX_SA_MXF32_IP(x, ax, p_mean_mxf32); + PDX_SA_MXF32_IP(y, ay, p_rstd_mxf32); + } + + /* Store the remaining mean and rstd values after processing the loop */ + m = (leading_dim & (PDX_M - CONST_ONE)) << LOG2_SIZE_FLOAT; + PDX_SAV_MXF32_XP(x, ax, p_mean_mxf32, m); + PDX_SAPOS_MXF32_FP(ax, p_mean_mxf32); + + PDX_SAV_MXF32_XP(y, ay, p_rstd_mxf32, m); + PDX_SAPOS_MXF32_FP(ay, p_rstd_mxf32); + + return UNSUPPORTED_PARAM; + } + + xb_vecMxf32 sum_mxf32; + xb_vecMxf32 sq_sum_mxf32; + xb_vecMxf32 mean_vec; + xb_vecMxf32 std_vec; + xb_vecMxf32 rstd_vec; + + FLOAT32 sum; + FLOAT32 sq_sum; + FLOAT32 mean; + FLOAT32 variance; + FLOAT32 inv_normalized; + + /* Align load priming of output and inverse std */ + valign a_out = PDX_Z_ALIGN(); + valign a_rstd = PDX_Z_ALIGN(); + valign a_mean = PDX_Z_ALIGN(); + + valign a_out_dim1 = PDX_Z_ALIGN(); + valign a_out_dim2 = PDX_Z_ALIGN(); + valign a_out_dim3 = PDX_Z_ALIGN(); + + /* Calculate inverse of number of normalized elements */ + PDX_DIV_F32_T(inv_normalized, CONST_ONE, norm_dim, CONST_ONE); + +#ifndef ENABLE_HIGH_PRECISION + xb_vecMxf32 inv_norm_vec = inv_normalized; + xb_vecMxf32 eps_vec = eps; +#endif + xb_vecMxf32 x0; + + valign ax; + + /* Calculate number of remaining inputs */ + m = (norm_dim & (PDX_M - CONST_ONE)) << LOG2_SIZE_FLOAT; + + xb_vecMxf32 sum_mxf32_1, sum_mxf32_2, sum_mxf32_3, sum_mxf32_4; + xb_vecMxf32 sq_sum_mxf32_1, sq_sum_mxf32_2, sq_sum_mxf32_3, sq_sum_mxf32_4; + + const xb_vecMxf32 *restrict p_in_mxf32_st1; + const xb_vecMxf32 *restrict p_in_mxf32_st2; + const xb_vecMxf32 *restrict p_in_mxf32_st3; + + xb_vecMxf32 *restrict p_out_mxf32_dim1; + xb_vecMxf32 *restrict p_out_mxf32_dim2; + xb_vecMxf32 *restrict p_out_mxf32_dim3; + + WORD32 offset_dim = 4 * norm_dim; + + valign ax_st1, ax_st2, ax_st3; + const FLOAT32 *p_inp1, *p_inp2, *p_inp3, *p_inp4; + FLOAT32 *p_out1, *p_out2, *p_out3, *p_out4; + + p_inp1 = p_inp; + p_inp2 = p_inp + norm_dim; + p_inp3 = p_inp + CONST_TWO * norm_dim; + p_inp4 = p_inp + CONST_THREE * norm_dim; + + p_out1 = p_out; + p_out2 = p_out + norm_dim; + p_out3 = p_out + CONST_TWO * norm_dim; + p_out4 = p_out + CONST_THREE * norm_dim; + + /* Loop runs for leading_dim/4 iterations */ + for (i = 0; i < leading_dim >> LOG2_PDX_M; i++) + { + FLOAT32 sum1, sum2, sum3; + FLOAT32 sq_sum1, sq_sum2, sq_sum3; + xb_vecMxf32 x1, x2, x3; + xb_vecMxf32 b1, b2, b3; + xb_vecMxf32 mean_vec1, mean_vec2, mean_vec3, mean_vec4; +#ifdef ENABLE_HIGH_PRECISION + xb_vecMxf32 std_vec1, std_vec2, std_vec3, std_vec4; +#else + xb_vecMxf32 rstd_vec1, rstd_vec2, rstd_vec3, rstd_vec4; + xb_vecMxf32 w1, w2, w3, w4; +#endif + + /* 4 series of computations are done together */ + p_in_mxf32 = (const xb_vecMxf32 *)p_inp1; + p_in_mxf32_st1 = (const xb_vecMxf32 
*)p_inp2; + p_in_mxf32_st2 = (const xb_vecMxf32 *)p_inp3; + p_in_mxf32_st3 = (const xb_vecMxf32 *)p_inp4; + + p_out_mxf32 = (xb_vecMxf32 *)p_out1; + p_out_mxf32_dim1 = (xb_vecMxf32 *)p_out2; + p_out_mxf32_dim2 = (xb_vecMxf32 *)p_out3; + p_out_mxf32_dim3 = (xb_vecMxf32 *)p_out4; + + p_out1 += offset_dim; + p_out2 += offset_dim; + p_out3 += offset_dim; + p_out4 += offset_dim; + + ax = PDX_LA_MXF32_PP(p_in_mxf32); + ax_st1 = PDX_LA_MXF32_PP(p_in_mxf32_st1); + ax_st2 = PDX_LA_MXF32_PP(p_in_mxf32_st2); + ax_st3 = PDX_LA_MXF32_PP(p_in_mxf32_st3); + + /* Reset sum and sq_sum vectors to zero */ + sum_mxf32_1 = PDX_ZERO_MXF32(); + sq_sum_mxf32_1 = PDX_ZERO_MXF32(); + + sum_mxf32_2 = PDX_ZERO_MXF32(); + sq_sum_mxf32_2 = PDX_ZERO_MXF32(); + + sum_mxf32_3 = PDX_ZERO_MXF32(); + sq_sum_mxf32_3 = PDX_ZERO_MXF32(); + + sum_mxf32_4 = PDX_ZERO_MXF32(); + sq_sum_mxf32_4 = PDX_ZERO_MXF32(); + +#ifdef ENABLE_HIGH_PRECISION + // Loop runs for norm_dim iterations + xtfloat a0 = 0; + xtfloat a1 = 0; + xtfloat a2 = 0; + xtfloat a3 = 0; + + xtfloat *p_a0 = (xtfloat *)p_inp1; + xtfloat *p_a1 = (xtfloat *)p_inp2; + xtfloat *p_a2 = (xtfloat *)p_inp3; + xtfloat *p_a3 = (xtfloat *)p_inp4; + + sum = 0, sq_sum = 0; + sum1 = 0, sq_sum1 = 0; + sum2 = 0, sq_sum2 = 0; + sum3 = 0, sq_sum3 = 0; + +#pragma no_reorder + for (j = 0; j < (norm_dim); j++) + { + xtfloat_loadip(a0, p_a0, 4); + xtfloat_loadip(a1, p_a1, 4); + xtfloat_loadip(a2, p_a2, 4); + xtfloat_loadip(a3, p_a3, 4); + + sum = sum + a0; + sq_sum = sq_sum + XT_MUL_S(a0, a0); + + sum1 = sum1 + a1; + sq_sum1 = sq_sum1 + XT_MUL_S(a1, a1); + + sum2 = sum2 + a2; + sq_sum2 = sq_sum2 + XT_MUL_S(a2, a2); + + sum3 = sum3 + a3; + sq_sum3 = sq_sum3 + XT_MUL_S(a3, a3); + } +#else + + /* Loop runs for norm_dim/4 iterations */ + for (j = 0; j < (norm_dim >> LOG2_PDX_M); j++) + { + /* Aligning load input (4-way) */ + PDX_LA_MXF32_IP(x0, ax, p_in_mxf32); + PDX_LA_MXF32_IP(x1, ax_st1, p_in_mxf32_st1); + PDX_LA_MXF32_IP(x2, ax_st2, p_in_mxf32_st2); + PDX_LA_MXF32_IP(x3, ax_st3, p_in_mxf32_st3); + + /* Add all the inputs of the dimension to be normalized */ + sum_mxf32_1 = PDX_ADD_MXF32(sum_mxf32_1, x0); + sum_mxf32_2 = PDX_ADD_MXF32(sum_mxf32_2, x1); + sum_mxf32_3 = PDX_ADD_MXF32(sum_mxf32_3, x2); + sum_mxf32_4 = PDX_ADD_MXF32(sum_mxf32_4, x3); + + /* Calculate the sum of squares of the inputs */ + PDX_MULA_MXF32(sq_sum_mxf32_1, x0, x0); + PDX_MULA_MXF32(sq_sum_mxf32_2, x1, x1); + PDX_MULA_MXF32(sq_sum_mxf32_3, x2, x2); + PDX_MULA_MXF32(sq_sum_mxf32_4, x3, x3); + } + + x0 = 0; + x1 = 0; + x2 = 0; + x3 = 0; + + /* Load remaining inputs */ + PDX_LAV_MXF32_XP(x0, ax, p_in_mxf32, m); + PDX_LAV_MXF32_XP(x1, ax_st1, p_in_mxf32_st1, m); + PDX_LAV_MXF32_XP(x2, ax_st2, p_in_mxf32_st2, m); + PDX_LAV_MXF32_XP(x3, ax_st3, p_in_mxf32_st3, m); + + /* Add all the remaining inputs of the dimension to be normalized */ + sum_mxf32_1 = PDX_ADD_MXF32(sum_mxf32_1, x0); + sum_mxf32_2 = PDX_ADD_MXF32(sum_mxf32_2, x1); + sum_mxf32_3 = PDX_ADD_MXF32(sum_mxf32_3, x2); + sum_mxf32_4 = PDX_ADD_MXF32(sum_mxf32_4, x3); + + /* Calculate the sum of squares of the remaining inputs */ + PDX_MULA_MXF32(sq_sum_mxf32_1, x0, x0); + PDX_MULA_MXF32(sq_sum_mxf32_2, x1, x1); + PDX_MULA_MXF32(sq_sum_mxf32_3, x2, x2); + PDX_MULA_MXF32(sq_sum_mxf32_4, x3, x3); + + sum = PDX_RADD_MXF32(sum_mxf32_1); + sq_sum = PDX_RADD_MXF32(sq_sum_mxf32_1); + + sum1 = PDX_RADD_MXF32(sum_mxf32_2); + sq_sum1 = PDX_RADD_MXF32(sq_sum_mxf32_2); + + sum2 = PDX_RADD_MXF32(sum_mxf32_3); + sq_sum2 = PDX_RADD_MXF32(sq_sum_mxf32_3); + + sum3 = 
PDX_RADD_MXF32(sum_mxf32_4); + sq_sum3 = PDX_RADD_MXF32(sq_sum_mxf32_4); +#endif + +#ifdef ENABLE_HIGH_PRECISION + /* Calculate mean */ + xtfloat mean1; + xtfloat mean2; + xtfloat mean3; + xtfloat mean4; + + PDX_DIV_F32_T(mean1,sum,norm_dim,1); + PDX_DIV_F32_T(mean2,sum1,norm_dim,1); + PDX_DIV_F32_T(mean3,sum2,norm_dim,1); + PDX_DIV_F32_T(mean4,sum3,norm_dim,1); + + sum_mxf32 = mean1; + sum_mxf32_1 = mean2; + sum_mxf32_2 = mean3; + sum_mxf32_3 = mean4; + + sum_mxf32 = PDX_SELI_MXF32(sum_mxf32_1, sum_mxf32, SEL_INDEX ); + sum_mxf32_2 = PDX_SELI_MXF32(sum_mxf32_3, sum_mxf32_2, SEL_INDEX ); + + /* Mean of each dimension */ + mean_vec = PDX_SELI_MXF32(sum_mxf32_2, sum_mxf32, SEL_INDEX ); + + /* Calculate variance */ + xtfloat variance ; + PDX_DIV_F32_T(variance,sq_sum,norm_dim,1); + variance -= mean1 * mean1; + variance = variance + eps; + xtfloat std1 = variance; + + PDX_DIV_F32_T(variance,sq_sum1,norm_dim,1); + variance -= mean2 * mean2; + variance = variance + eps; + xtfloat std2 = variance; + + PDX_DIV_F32_T(variance,sq_sum2,norm_dim,1); + variance -= mean3 * mean3; + variance = variance + eps; + xtfloat std3 = variance; + + PDX_DIV_F32_T(variance,sq_sum3,norm_dim,1); + variance -= mean4 * mean4; + variance = variance + eps; + xtfloat std4 = variance; + + sq_sum_mxf32 = std1; + sq_sum_mxf32_1 = std2; + sq_sum_mxf32_2 = std3; + sq_sum_mxf32_3 = std4; + + sq_sum_mxf32 = PDX_SELI_MXF32(sq_sum_mxf32_1, sq_sum_mxf32, SEL_INDEX ); + sq_sum_mxf32_2 = PDX_SELI_MXF32(sq_sum_mxf32_3, sq_sum_mxf32_2, SEL_INDEX ); + + /* std of each dimension */ + std_vec = PDX_SELI_MXF32(sq_sum_mxf32_2, sq_sum_mxf32, SEL_INDEX ); +#else + sum_mxf32 = sum; + sum_mxf32_1 = sum1; + sum_mxf32_2 = sum2; + sum_mxf32_3 = sum3; + + sum_mxf32 = PDX_SELI_MXF32(sum_mxf32_1, sum_mxf32, SEL_INDEX ); + sum_mxf32_2 = PDX_SELI_MXF32(sum_mxf32_3, sum_mxf32_2, SEL_INDEX ); + + /* Sum values of each dimension */ + sum_mxf32 = PDX_SELI_MXF32(sum_mxf32_2, sum_mxf32, SEL_INDEX ); + + sq_sum_mxf32 = sq_sum; + sq_sum_mxf32_1 = sq_sum1; + sq_sum_mxf32_2 = sq_sum2; + sq_sum_mxf32_3 = sq_sum3; + + sq_sum_mxf32 = PDX_SELI_MXF32(sq_sum_mxf32_1, sq_sum_mxf32, SEL_INDEX ); + sq_sum_mxf32_2 = PDX_SELI_MXF32(sq_sum_mxf32_3, sq_sum_mxf32_2, SEL_INDEX ); + + /* Sum of squares of each dimension */ + sq_sum_mxf32 = PDX_SELI_MXF32(sq_sum_mxf32_2, sq_sum_mxf32, SEL_INDEX ); + + /* Calculate mean */ + mean_vec = PDX_MUL_MXF32(sum_mxf32, inv_norm_vec); + + /* Calculate variance */ + std_vec = PDX_MUL_MXF32(sq_sum_mxf32, inv_norm_vec); + PDX_MULS_MXF32(std_vec, mean_vec, mean_vec); + std_vec = PDX_ADD_MXF32(std_vec, eps_vec); +#endif + + /* Calculate std */ + std_vec = PDX_SQRT_MXF32(std_vec); + + /* Calculate inverse std */ + rstd_vec = PDX_DIV_MXF32(CONST_ONE, std_vec); + + /* Store mean and inverse std output for each normalized shape of four dims */ + PDX_SA_MXF32_IP(rstd_vec, a_rstd, p_rstd_mxf32); + PDX_SA_MXF32_IP(mean_vec, a_mean, p_mean_mxf32); + + mean_vec1 = PDX_REP_MXF32(mean_vec,0); + mean_vec2 = PDX_REP_MXF32(mean_vec, CONST_ONE); + mean_vec3 = PDX_REP_MXF32(mean_vec, CONST_TWO); + mean_vec4 = PDX_REP_MXF32(mean_vec, CONST_THREE); + +#ifdef ENABLE_HIGH_PRECISION + std_vec1 = PDX_REP_MXF32(std_vec,0); + std_vec2 = PDX_REP_MXF32(std_vec, CONST_ONE); + std_vec3 = PDX_REP_MXF32(std_vec, CONST_TWO); + std_vec4 = PDX_REP_MXF32(std_vec, CONST_THREE); +#else + rstd_vec1 = PDX_REP_MXF32(rstd_vec,0); + rstd_vec2 = PDX_REP_MXF32(rstd_vec, CONST_ONE); + rstd_vec3 = PDX_REP_MXF32(rstd_vec, CONST_TWO); + rstd_vec4 = PDX_REP_MXF32(rstd_vec, CONST_THREE); 
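+        /* rstd_vec1..rstd_vec4 now hold the inverse std of each of the four rows, replicated across all lanes, so the normalization below can be fused as (x - mean) * (rstd * w) + b */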
+#endif + + xb_vecMxf32 w0, b0; + p_weight_mxf32 = (const xb_vecMxf32 *)p_weight; + p_bias_mxf32 = (const xb_vecMxf32 *)p_bias; + + /* Align load priming of weight and bias */ + valign a_weight = PDX_LA_MXF32_PP(p_weight_mxf32); + valign a_bias = PDX_LA_MXF32_PP(p_bias_mxf32); + + p_in_mxf32 = (const xb_vecMxf32 *)p_inp1; + p_in_mxf32_st1 = (const xb_vecMxf32 *)p_inp2; + p_in_mxf32_st2 = (const xb_vecMxf32 *)p_inp3; + p_in_mxf32_st3 = (const xb_vecMxf32 *)p_inp4; + + p_inp1 += offset_dim; + p_inp2 += offset_dim; + p_inp3 += offset_dim; + p_inp4 += offset_dim; + + ax = PDX_LA_MXF32_PP(p_in_mxf32); + ax_st1 = PDX_LA_MXF32_PP(p_in_mxf32_st1); + ax_st2 = PDX_LA_MXF32_PP(p_in_mxf32_st2); + ax_st3 = PDX_LA_MXF32_PP(p_in_mxf32_st3); + + /* + * The layer norm computations for 4 series are performed + * in two separate loops i.e. Each loop processes 2 series + * because performing all 4 series in a single loop would + * require 10 align registers. However fusion_g3 has only + * 8 align registers available. + * To avoid an additional cycle caused by alignment priming + * along with loads within a single loop, the 4 series of + * computations are split into two loops. + * + */ + + /* Calculate normalized values of first two dimensions */ +#ifdef ENABLE_HIGH_PRECISION +#pragma no_reorder +#endif + for (j = 0; j < (norm_dim >> LOG2_PDX_M); j++) + { + // Load weight */ + PDX_LA_MXF32_IP(w0, a_weight, p_weight_mxf32); + + /* Load bias */ + PDX_LA_MXF32_IP(b0, a_bias, p_bias_mxf32); + + /* Load input of each dimension */ + PDX_LA_MXF32_IP(x0, ax, p_in_mxf32); + PDX_LA_MXF32_IP(x1, ax_st1, p_in_mxf32_st1); + + /* x[j] - mean_value */ + x0 = PDX_SUB_MXF32(x0, mean_vec1); + x1 = PDX_SUB_MXF32(x1, mean_vec2); + +#ifdef ENABLE_HIGH_PRECISION + /* (x[j] - mean_value) / std */ + x0 = PDX_DIV_MXF32(x0, std_vec1); + x1 = PDX_DIV_MXF32(x1, std_vec2); + + b1 = b0; + + /* (x[j] - mean_value)/std * w + b */ + x0 = PDX_MUL_MXF32(x0, w0); + b0 = PDX_ADD_MXF32(x0, b0); + x1 = PDX_MUL_MXF32(x1, w0); + b1 = PDX_ADD_MXF32(x1, b1); + +#else + /* (1/std)*w -> rstd * w */ + w1 = PDX_MUL_MXF32(rstd_vec1, w0); + w2 = PDX_MUL_MXF32(rstd_vec2, w0); + + b1 = b0; + + /* (x[j] - mean_value) * (1/std * w) + b */ + PDX_MULA_MXF32(b0, x0, w1); + PDX_MULA_MXF32(b1, x1, w2); +#endif + /* Store the normalized data */ + PDX_SA_MXF32_IP(b0, a_out, p_out_mxf32); + PDX_SA_MXF32_IP(b1, a_out_dim1, p_out_mxf32_dim1); + } + + p_weight_mxf32 = (const xb_vecMxf32 *)p_weight; + p_bias_mxf32 = (const xb_vecMxf32 *)p_bias; + a_weight = PDX_LA_MXF32_PP(p_weight_mxf32); + a_bias = PDX_LA_MXF32_PP(p_bias_mxf32); + + /* Calculate normalized values of next two dimensions */ + +#ifdef ENABLE_HIGH_PRECISION +#pragma no_reorder +#endif + for (j = 0; j < (norm_dim >> LOG2_PDX_M); j++) + { + /* Load weight */ + PDX_LA_MXF32_IP(w0, a_weight, p_weight_mxf32); + + /* Load bias */ + PDX_LA_MXF32_IP(b0, a_bias, p_bias_mxf32); + + /* Load input of each dimension */ + PDX_LA_MXF32_IP(x2, ax_st2, p_in_mxf32_st2); + PDX_LA_MXF32_IP(x3, ax_st3, p_in_mxf32_st3); + + /* x[j] - mean_value */ + x2 = PDX_SUB_MXF32(x2, mean_vec3); + x3 = PDX_SUB_MXF32(x3, mean_vec4); +#ifdef ENABLE_HIGH_PRECISION + /* (x[j] - mean_value) / rstd */ + x2 = PDX_DIV_MXF32(x2, std_vec3); + x3 = PDX_DIV_MXF32(x3, std_vec4); + + b3 = b0; + + /* (x[j] - mean_value)/std * w + b */ + x2 = PDX_MUL_MXF32(x2, w0); + b0 = PDX_ADD_MXF32(x2, b0); + x3 = PDX_MUL_MXF32(x3, w0); + b3 = PDX_ADD_MXF32(x3, b3); +#else + /* (1/std)*w -> rstd * w */ + w3 = PDX_MUL_MXF32(rstd_vec3, w0); + w4 = PDX_MUL_MXF32(rstd_vec4, 
w0); + + b3 = b0; + + /* (x[j] - mean_value) * (1/std * w) + b */ + PDX_MULA_MXF32(b0, x2, w3); + PDX_MULA_MXF32(b3, x3, w4); +#endif + /* Store the normalized data */ + PDX_SA_MXF32_IP(b0, a_out_dim2, p_out_mxf32_dim2); + PDX_SA_MXF32_IP(b3, a_out_dim3, p_out_mxf32_dim3); + } + + /* Load remaining input data */ + PDX_LAV_MXF32_XP(x0, ax, p_in_mxf32, m); + PDX_LAV_MXF32_XP(x1, ax_st1, p_in_mxf32_st1, m); + PDX_LAV_MXF32_XP(x2, ax_st2, p_in_mxf32_st2, m); + PDX_LAV_MXF32_XP(x3, ax_st3, p_in_mxf32_st3, m); + + /* Load weight */ + PDX_LAV_MXF32_XP(w0, a_weight, p_weight_mxf32, m); + + /* Load bias */ + PDX_LAV_MXF32_XP(b0, a_bias, p_bias_mxf32, m); + + /* x[j] - mean_value */ + x0 = PDX_SUB_MXF32(x0, mean_vec1); + x1 = PDX_SUB_MXF32(x1, mean_vec2); + x2 = PDX_SUB_MXF32(x2, mean_vec3); + x3 = PDX_SUB_MXF32(x3, mean_vec4); + +#ifdef ENABLE_HIGH_PRECISION + /* (x[j] - mean_value) / std */ + x0 = PDX_DIV_MXF32(x0, std_vec1); + x1 = PDX_DIV_MXF32(x1, std_vec2); + x2 = PDX_DIV_MXF32(x2, std_vec3); + x3 = PDX_DIV_MXF32(x3, std_vec4); + + b1 = b0; + b2 = b0; + b3 = b0; + + // (x[j] - mean_value)/std * w + b; + x0 = PDX_MUL_MXF32(x0, w0); + b0 = PDX_ADD_MXF32(x0, b0); + x1 = PDX_MUL_MXF32(x1, w0); + b1 = PDX_ADD_MXF32(x1, b1); + x2 = PDX_MUL_MXF32(x2, w0); + b2 = PDX_ADD_MXF32(x2, b2); + x3 = PDX_MUL_MXF32(x3, w0); + b3 = PDX_ADD_MXF32(x3, b3); +#else + /* (1/std)*w -> rstd * w */ + w1 = PDX_MUL_MXF32(rstd_vec1, w0); + w2 = PDX_MUL_MXF32(rstd_vec2, w0); + w3 = PDX_MUL_MXF32(rstd_vec3, w0); + w4 = PDX_MUL_MXF32(rstd_vec4, w0); + + b1 = b0; + b2 = b0; + b3 = b0; + + /* (x[j] - mean_value) * (1/std * w) + b */ + PDX_MULA_MXF32(b0, x0, w1); + PDX_MULA_MXF32(b1, x1, w2); + PDX_MULA_MXF32(b2, x2, w3); + PDX_MULA_MXF32(b3, x3, w4); +#endif + + /* Store the normalized data */ + PDX_SAV_MXF32_XP(b0, a_out, p_out_mxf32, m); + PDX_SAV_MXF32_XP(b1, a_out_dim1, p_out_mxf32_dim1, m); + PDX_SAV_MXF32_XP(b2, a_out_dim2, p_out_mxf32_dim2, m); + PDX_SAV_MXF32_XP(b3, a_out_dim3, p_out_mxf32_dim3, m); + + PDX_SAPOS_MXF32_FP(a_out, p_out_mxf32); + PDX_SAPOS_MXF32_FP(a_out_dim1, p_out_mxf32_dim1); + PDX_SAPOS_MXF32_FP(a_out_dim2, p_out_mxf32_dim2); + PDX_SAPOS_MXF32_FP(a_out_dim3, p_out_mxf32_dim3); + } + + i = i*4; + + p_in_mxf32 = (const xb_vecMxf32 *)p_inp1; + p_in1_mxf32 = (const xb_vecMxf32 *)p_inp1; + + p_out_mxf32 = (xb_vecMxf32 *)p_out1; + +#ifdef ENABLE_HIGH_PRECISION + p_a0 = (xtfloat *)p_inp1; +#endif + + /* Align load priming */ +#ifndef ENABLE_HIGH_PRECISION + ax = PDX_LA_MXF32_PP(p_in_mxf32); +#endif + valign ax_inp = PDX_LA_MXF32_PP(p_in1_mxf32); + + /* Process remaining leading dimensions */ + for (; i < leading_dim; i++) + { + /* Reset sum and sq_sum vectors to zero */ + sum_mxf32 = PDX_ZERO_MXF32(); + sq_sum_mxf32 = PDX_ZERO_MXF32(); + +#ifdef ENABLE_HIGH_PRECISION + xtfloat a0 = 0; + sum = 0, sq_sum = 0; +#pragma no_reorder + + /* Loop runs for norm_dim iterations */ + for (j = 0; j < (norm_dim); j++) + { + xtfloat_loadip(a0, p_a0, 4); + sum = sum + a0; + sq_sum = sq_sum + XT_MUL_S(a0, a0); + } +#else + /* Loop runs for norm_dim/4 iterations */ + for (j = 0; j < (norm_dim >> LOG2_PDX_M); j++) + { + /* Aligning load input (4-way) */ + PDX_LA_MXF32_IP(x0, ax, p_in_mxf32); + + /* Add all the inputs of the dimension to be normalized */ + sum_mxf32 = PDX_ADD_MXF32(sum_mxf32, x0); + + /* Calculate the sum of squares of the inputs */ + PDX_MULA_MXF32(sq_sum_mxf32, x0, x0); + } + + x0 = 0; + /* Load remaining inputs */ + PDX_LAV_MXF32_XP(x0, ax, p_in_mxf32, m); + + /* Add all the remaining inputs of the 
dimension to be normalized */ + sum_mxf32 = PDX_ADD_MXF32(sum_mxf32, x0); + + /* Calculate the sum of squares of the remaining inputs */ + PDX_MULA_MXF32(sq_sum_mxf32, x0, x0); + + sum = PDX_RADD_MXF32(sum_mxf32); + sq_sum = PDX_RADD_MXF32(sq_sum_mxf32); +#endif + +#ifdef ENABLE_HIGH_PRECISION + mean = PDX_DIV_MXF32(sum,norm_dim); + variance = PDX_DIV_MXF32(sq_sum,norm_dim); +#else + mean = sum * inv_normalized; + variance = sq_sum * inv_normalized; +#endif + /* Calculate mean */ + mean_vec = mean; + + /* Calculate variance */ + variance -= mean * mean; + variance = variance + eps; + std_vec = variance; + + /* Calculate std */ + std_vec = PDX_SQRT_MXF32(std_vec); + + /* Calculate inverse std */ + rstd_vec = PDX_DIV_MXF32(1, std_vec); + + /* Store inverse std output for each normalized shape */ + PDX_SAV_MXF32_XP(rstd_vec, a_rstd, p_rstd_mxf32, SIZE_OF_FLOAT); + + /* Store mean */ + p_mean[i] = mean; + + xb_vecMxf32 w0, b0; + p_weight_mxf32 = (const xb_vecMxf32 *)p_weight; + p_bias_mxf32 = (const xb_vecMxf32 *)p_bias; + + // Align load priming of weight and bias + valign a_weight = PDX_LA_MXF32_PP(p_weight_mxf32); + valign a_bias = PDX_LA_MXF32_PP(p_bias_mxf32); + +#ifdef ENABLE_HIGH_PRECISION +#pragma no_reorder +#endif + /* Calculate normalized values */ + for (j = 0; j < (norm_dim >> LOG2_PDX_M); j++) + { + /* Load input */ + PDX_LA_MXF32_IP(x0, ax_inp, p_in1_mxf32); + + /* Load weight */ + PDX_LA_MXF32_IP(w0, a_weight, p_weight_mxf32); + + /* Load bias */ + PDX_LA_MXF32_IP(b0, a_bias, p_bias_mxf32); + + /* x[j] - mean_value */ + x0 = PDX_SUB_MXF32(x0, mean_vec); + +#ifdef ENABLE_HIGH_PRECISION + /* (x[j] - mean_value) / std */ + x0 = PDX_DIV_MXF32(x0, std_vec); + + /* (x[j] - mean_value)/std * w + b */ + x0 = PDX_MUL_MXF32(x0, w0); + b0 = PDX_ADD_MXF32(x0, b0); + +#else + /* (1/std)*w -> rstd * w */ + w0 = PDX_MUL_MXF32(rstd_vec, w0); + + /* (x[j] - mean_value) * (1/std * w) + b */ + PDX_MULA_MXF32(b0, x0, w0); +#endif + + /* Store the normalized data */ + PDX_SA_MXF32_IP(b0, a_out, p_out_mxf32); + } + + /* Load remaining input data */ + PDX_LAV_MXF32_XP(x0, ax_inp, p_in1_mxf32, m); + + /* Load weight */ + PDX_LAV_MXF32_XP(w0, a_weight, p_weight_mxf32, m); + + /* Load bias */ + PDX_LAV_MXF32_XP(b0, a_bias, p_bias_mxf32, m); + + /* x[j] - mean_value */ + x0 = PDX_SUB_MXF32(x0, mean_vec); + +#ifdef ENABLE_HIGH_PRECISION + /* (x[j] - mean_value) / std */ + x0 = PDX_DIV_MXF32(x0, std_vec); + + /* (x[j] - mean_value)/std * w + b */ + x0 = PDX_MUL_MXF32(x0, w0); + b0 = PDX_ADD_MXF32(x0, b0); + +#else + /* (1/std)*w -> rstd * w */ + w0 = PDX_MUL_MXF32(rstd_vec, w0); + + /* (x[j] - mean_value) * (1/std * w) + b */ + PDX_MULA_MXF32(b0, x0, w0); +#endif + + /* Store the normalized data */ + PDX_SAV_MXF32_XP(b0, a_out, p_out_mxf32, m); + PDX_SAPOS_MXF32_FP(a_out, p_out_mxf32); + } + + PDX_SAPOS_MXF32_FP(a_rstd, p_rstd_mxf32); + + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/reorg/xa_nn_cat.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/reorg/xa_nn_cat.c new file mode 100644 index 00000000000..d0d0d08f633 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/reorg/xa_nn_cat.c @@ -0,0 +1,136 @@ +/******************************************************************************* + * Copyright (c) 2018-2024 Cadence Design Systems, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * *******************************************************************************/ + +#include "xa_type_def.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_internal.h" + +WORD32 xa_nn_cat(WORD8 *__restrict__ p_out, + const WORD32 *const p_out_shape, + const WORD8 **pp_inps, + const WORD32 *const*pp_inps_shape, + WORD32 num_inp_dims, + WORD32 num_inp, + WORD32 axis, + WORD32 elm_size) +{ + /* NULL Pointer Checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, UNSUPPORTED_PARAM); + + /* Pointer Alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, elm_size, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), UNSUPPORTED_PARAM); + + for (int i = 0; i < num_inp; i++) + { + XA_NNLIB_ARG_CHK_PTR(pp_inps[i], UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps[i], elm_size, UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), + UNSUPPORTED_PARAM); + } + + /* Invalid Input checks */ + XA_NNLIB_ARG_CHK_COND((axis < 0) || (axis >= num_inp_dims), + UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((num_inp_dims <= 0), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((num_inp <= 0), UNSUPPORTED_PARAM); + XA_NNLIB_ARG_CHK_COND((elm_size <= 0) || (elm_size == CONST_THREE) + || (elm_size > CONST_FOUR), UNSUPPORTED_PARAM); + + /* Calculate outer_size and inner_size based on axis */ + WORD32 outer_size = CONST_ONE; + WORD32 inner_size = CONST_ONE; + + for (WORD32 i = 0; i < axis; i++) + { + outer_size *= p_out_shape[i]; + } + + for (WORD32 i = axis + 1; i < num_inp_dims; i++) + { + inner_size *= p_out_shape[i]; + } + + WORD8 *ptmp_out = p_out; + + /* Calculate the total size in bytes for the inner dimensions of the tensor */ + WORD32 inner_size_bytes = inner_size * elm_size; + + /* Loop over each input tensor */ + for (int i = 0; i < num_inp; i++) + { + /* + * Calculate the number of elements to copy based + * on the shape of the current input along the axis + */ + WORD32 copy_size = pp_inps_shape[i][axis] * inner_size_bytes; + + const WORD8 *__restrict__ p_i = pp_inps[i]; + WORD8 *__restrict__ p_o = ptmp_out; + + valign align_in = PDX_LA_4MX8_PP((xb_vec4Mx8*) p_i); + + /* Number of full chunks in copy_size */ + WORD32 t_full_chunks = copy_size >> LOG2_PDX_4M; + + /* Remaining bytes after full chunks */ + WORD32 t_remainder = copy_size & MASK_LOG2_PDX_4M; + + xb_vec4Mx8 *in_ptr = (xb_vec4Mx8*) p_i; + xb_vec4Mx8 *out_ptr = (xb_vec4Mx8*) p_o; + + /* Loop over each slice in the outer dimension */ + 
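+        /* Each slice copies copy_size bytes of this input; the input is consumed sequentially, while the output jumps by one full output row (p_out_shape[axis] * inner_size_bytes) per slice */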
for (WORD32 k = 0; k < outer_size; k++) + { + p_o = ptmp_out + k * p_out_shape[axis] * inner_size_bytes; + valign align_out = PDX_Z_ALIGN(); + + xb_vec4Mx8 x0; + + /* Process full vector chunks */ + for (WORD32 m = 0; m < t_full_chunks; m++) + { + PDX_LA_4MX8_IP(x0, align_in, in_ptr); + PDX_SA_4MX8_IP(x0, align_out, out_ptr); + } + + /* Handle any remaining elements */ + PDX_LAV_4MX8_XP(x0, align_in, in_ptr, t_remainder); + PDX_SAV_4MX8_XP(x0, align_out, out_ptr, t_remainder); + + /* Store the remaining data if any */ + PDX_SAPOS_4MX8_FP(align_out, out_ptr); + out_ptr = (xb_vec4Mx8*) (ptmp_out + + (k + CONST_ONE) * p_out_shape[axis] * inner_size * elm_size); + } + + /* + * Update the input pointer and the output + * pointer after processing one tensor + */ + in_ptr += copy_size * elm_size; + ptmp_out += pp_inps_shape[i][axis] * inner_size * elm_size; + } + return 0; +} diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/tables/include/expf_tbl.h b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/tables/include/expf_tbl.h new file mode 100644 index 00000000000..34465152dc2 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/tables/include/expf_tbl.h @@ -0,0 +1,44 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + ******************************************************************************/ + +/* + tables for expf(x) approximation +*/ +#ifndef __EXPFTBL_H__ +#define __EXPFTBL_H__ + +#include "xa_type_def.h" +/* + polynomial coefficients for 2^x in range 0...1 + + derived by MATLAB code: + order=6; + x=(0:pow2(1,-16):1); + y=2.^x; + p=polyfit(x,y,6); + p(order+1)=1; + p(order)=p(order)-(sum(p)-2); +*/ +extern const WORD32 expftbl_q30[8]; +extern const WORD32 invln2_q30; /* 1/ln(2), Q30 */ + +#endif /* __EXPFTBL_H__ */ diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/tables/src/expf_tbl.c b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/tables/src/expf_tbl.c new file mode 100644 index 00000000000..947bc14906b --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/algo/kernels/tables/src/expf_tbl.c @@ -0,0 +1,50 @@ +/******************************************************************************* + * Copyright (c) 2024 Cadence Design Systems, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to use this Software with Cadence processor cores only and + * not with any other processors and platforms, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + ******************************************************************************/ + +/* + tables for expf(x) approximation +*/ +#include "expf_tbl.h" + +/* + polynomial coefficients for 2^x in range 0...1 + + derived by MATLAB code: + order=6; + x=(0:pow2(1,-16):1); + y=2.^x; + p=polyfit(x,y,6); + p(order+1)=1; + p(order)=p(order)-(sum(p)-2); +*/ +const WORD32 expftbl_q30[8]= +{ 234841, + 1329551, + 10400465, //0.009686187840998172760009765625 + 59570027, //0.055478910915553569793701171875 + 257946177, //0.240231097675859928131103515625 + 744260763, //0.693146849982440471649169921875 + 1073741824, //1 + 0 /* Padding to allow for vector loads */ +}; + +const WORD32 invln2_q30=1549082005L; /* 1/ln(2), Q30 */ diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/common.mk b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/common.mk new file mode 100644 index 00000000000..4b4de080b9b --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/common.mk @@ -0,0 +1,151 @@ +# +# Copyright (c) 2024 Cadence Design Systems, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to use this Software with Cadence processor cores only and +# not with any other processors and platforms, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+# + +QUIET = +WARNING_AS_ERROR ?= 1 +MAPFILE = map_$(CODEC_NAME).txt +LDSCRIPT = ldscript_$(CODEC_NAME).txt +SYMFILE = symbols_$(CODEC_NAME).txt +DETECTED_CORE?= + +AR = xt-ar $(XTCORE) +OBJCOPY = xt-objcopy $(XTCORE) +CC = xt-clang $(XTCORE) +CXX = xt-clang++ $(XTCORE) +ISS = xt-run $(XTCORE) +CONFIGDIR := $(shell $(ISS) --show-config=config) +include $(CONFIGDIR)/misc/hostenv.mk +GREPARGS = +WINNUL = +IFEXIST = + +has_mul16_tmp = $(shell $(GREP) $(GREPARGS)"IsaUseMul16 = 1" "$(XTENSA_SYSTEM)$(S)$(XTENSA_CORE)-params") +has_mul32_tmp = $(shell $(GREP) $(GREPARGS)"IsaUse32bitMul = 1" "$(XTENSA_SYSTEM)$(S)$(XTENSA_CORE)-params") +has_mul16=1 +has_mul32=1 +ifeq (,$(has_mul16_tmp)) +has_mul16=0 +endif +ifeq (,$(has_mul32_tmp)) +has_mul32=0 +endif +CFLAGS += -Wall +ifeq ($(WARNING_AS_ERROR),1) + CFLAGS += -Werror + ifneq ($(CC), xt-xcc) + CFLAGS += -Wno-parentheses-equality + endif +endif +ifeq "$(has_mul16)" "0" + CFLAGS += -mno-mul16 +endif +ifeq "$(has_mul32)" "0" + CFLAGS += -mno-mul32 -mno-div32 +endif +CFLAGS += -fsigned-char -fno-exceptions -mlongcalls -mcoproc -INLINE:requested -fno-zero-initialized-in-bss +CFLAGS += -mtext-section-literals +CFLAGS += -Wsign-compare + +OBJDIR = objs$(S)$(CODEC_NAME)$(DETECTED_CORE) +LIBDIR = $(ROOTDIR)$(S)lib + +OBJ_LIBOBJS = $(addprefix $(OBJDIR)/,$(LIBOBJS)) +OBJ_LIBOSOBJS = $(addprefix $(OBJDIR)/,$(LIBOSOBJS)) + +ALL_OBJS := \ + $(OBJ_LIBOBJS) \ + $(OBJ_LIBOSOBJS) \ + +ALL_DEPS := $(foreach dep,$(ALL_OBJS),${dep:%.o=%.d}) +-include $(ALL_DEPS) + +TEMPOBJ = temp.o + +LIBOBJ = $(OBJDIR)/xa_$(CODEC_NAME)$(DETECTED_CORE).o +LIB = xa_$(CODEC_NAME)$(DETECTED_CORE).a + +CFLAGS += $(EXTRA_CFLAGS) $(EXTRA_CFLAGS2) + +LIBLDFLAGS += \ + $(EXTRA_LIBLDFLAGS) + +ifeq ($(DEBUG),1) + NOSTRIP = 1 + OPT_O2 = -O0 -g + OPT_O3 = -O0 -g + OPT_OS = -O0 -g + OPT_O0 = -O0 -g + CFLAGS += -DDEBUG +else + OPT_O2 = -O2 -LNO:simd + OPT_O3 = -O3 -LNO:simd + OPT_OS = -Os + OPT_O0 = -O0 + CFLAGS += -DNDEBUG=1 +endif + +all: $(OBJDIR) $(LIB) + +install: $(LIB) + @echo "Installing $(LIB)" + $(QUIET) -$(MKPATH) "$(LIBDIR)" + $(QUIET) $(CP) $(LIB) "$(LIBDIR)" + +$(OBJDIR): + $(QUIET) -$(MKPATH) $@ + +ifeq ($(NOSTRIP), 1) +$(LIBOBJ): $(OBJ_LIBOBJS) $(OBJ_LIBOSOBJS) + @echo "Linking Objects" + $(QUIET) $(CC) -o $@ $^ \ + -Wl,-r,-Map,$(MAPFILE) --no-standard-libraries +else +$(LIBOBJ): $(OBJ_LIBOBJS) $(OBJ_LIBOSOBJS) + @echo "Linking Objects" + $(QUIET) $(CC) -o $@ $^ \ + -Wl,-r,-Map,$(MAPFILE) --no-standard-libraries \ + -Wl,--retain-symbols-file,$(SYMFILE) \ + -Wl,--script,$(LDSCRIPT) $(IPA_FLAGS) $(LIBLDFLAGS) + $(QUIET) $(OBJCOPY) --keep-global-symbols=$(SYMFILE) $@ $(TEMPOBJ) + $(QUIET) $(OBJCOPY) --strip-unneeded $(TEMPOBJ) $@ + $(QUIET) -$(RM) $(TEMPOBJ) +endif + + +$(OBJ_LIBOBJS): $(OBJDIR)/%.o: %.c + @echo "Compiling $<" + $(QUIET) $(CC) -o $@ $(OPT_O3) $(CFLAGS) $(INCLUDES) -c $< + +$(OBJ_LIBOSOBJS): $(OBJDIR)/%.o: %.c + @echo "Compiling $<" + $(QUIET) $(CC) -o $@ $(OPT_OS) $(CFLAGS) $(INCLUDES) -c $< + + + +$(LIB): %.a: $(OBJDIR)/%.o + @echo "Creating Library $@" + $(QUIET) $(AR) rc $@ $^ + +clean: + -$(RM) xa_$(CODEC_NAME)$(DETECTED_CORE).a xgcc_$(CODEC_NAME)$(DETECTED_CORE).a $(LIBDIR)$(S)xa_$(CODEC_NAME)$(DETECTED_CORE).a $(LIBDIR)$(S)xgcc_$(CODEC_NAME)$(DETECTED_CORE).a $(MAPFILE) + -$(RM) $(OBJDIR)$(S)*.o + -$(RM) $(ALL_DEPS) + -$(RM_R) $(LIBDIR) diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/detect_core.mk b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/detect_core.mk new file mode 100644 index 
00000000000..0fdec387134 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/detect_core.mk @@ -0,0 +1,36 @@ + + +ISS = xt-run $(XTCORE) +CONFIGDIR := $(shell $(ISS) --show-config=config) +include $(CONFIGDIR)/misc/hostenv.mk + +GREPARGS = +ifeq ($(HOSTTYPE),win) +GREPARGS = /c: +endif + +ifeq ("", "$(detected_core)") + +fusion_g3="0" +fusion_g3_tmp:=$(shell $(GREP) $(GREPARGS)"IsaUseFusionG = 1" "$(XTENSA_SYSTEM)$(S)$(XTENSA_CORE)-params") + +#check if the detected core is Fusion G3 + ifneq ("", "$(fusion_g3_tmp)") + detected_core=fusion_g3 + endif + +ifeq ("$(detected_core)", "fusion_g3") + fusion_g3=1 + CFLAGS+= -DCORE_FUG3=1 +else + $(error "$(fusion_g3_tmp)" Core Not Found) +endif +endif + +xclib_tmp:=$(shell $(GREP) $(GREPARGS)"SW_CLibrary = xclib" "$(XTENSA_SYSTEM)$(S)$(XTENSA_CORE)-params") +ifneq ("", "$(xclib_tmp)") + xclib=1 +else + xclib=0 +endif + diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/ldscript_nnlib.txt b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/ldscript_nnlib.txt new file mode 100644 index 00000000000..b061c273a5a --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/ldscript_nnlib.txt @@ -0,0 +1,48 @@ + +/* Reorg kernels */ +EXTERN(xa_nn_cat) + +/* Basic math kernels */ +EXTERN(xa_nn_elm_add_32x32_32) +EXTERN(xa_nn_elm_add_broadcast_5D_32x32_32) +EXTERN(xa_nn_elm_add_broadcast_5D_f32xf32_f32) +EXTERN(xa_nn_elm_add_f32xf32_f32) +EXTERN(xa_nn_elm_add_scalar_32x32_32) +EXTERN(xa_nn_elm_add_scalar_f32xf32_f32) +EXTERN(xa_nn_elm_dequantize_sym16_f32) +EXTERN(xa_nn_elm_dequantize_asym16_f32) +EXTERN(xa_nn_elm_dequantize_sym16u_f32) +EXTERN(xa_nn_elm_dequantize_asym16u_f32) +EXTERN(xa_nn_elm_dequantize_sym8_f32) +EXTERN(xa_nn_elm_dequantize_asym8_f32) +EXTERN(xa_nn_elm_dequantize_sym8u_f32) +EXTERN(xa_nn_elm_dequantize_asym8u_f32) +EXTERN(xa_nn_elm_dequantize_sym4_f32) +EXTERN(xa_nn_elm_dequantize_asym4_f32) +EXTERN(xa_nn_elm_dequantize_sym4u_f32) +EXTERN(xa_nn_elm_dequantize_asym4u_f32) +EXTERN(xa_nn_elm_mul_scalar_32x32_32) +EXTERN(xa_nn_elm_mul_32x32_32) +EXTERN(xa_nn_elm_mul_broadcast_5D_32x32_32) +EXTERN(xa_nn_elm_mul_scalar_f32xf32_f32) +EXTERN(xa_nn_elm_mul_f32xf32_f32) +EXTERN(xa_nn_elm_mul_broadcast_5D_f32xf32_f32) +EXTERN(xa_nn_elm_quantize_f32_asym16) +EXTERN(xa_nn_elm_quantize_f32_asym16u) +EXTERN(xa_nn_elm_quantize_f32_asym4) +EXTERN(xa_nn_elm_quantize_f32_asym4u) +EXTERN(xa_nn_elm_quantize_f32_asym8) +EXTERN(xa_nn_elm_quantize_f32_asym8u) +EXTERN(xa_nn_elm_quantize_f32_sym16) +EXTERN(xa_nn_elm_quantize_f32_sym16u) +EXTERN(xa_nn_elm_quantize_f32_sym4) +EXTERN(xa_nn_elm_quantize_f32_sym4u) +EXTERN(xa_nn_elm_quantize_f32_sym8) +EXTERN(xa_nn_elm_quantize_f32_sym8u) + +/* Normalization kernels */ +EXTERN(xa_nn_native_layer_norm_f32_f32) + +/* Activation kernels */ +EXTERN(xa_nn_softmax_f32_f32) + diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/makefile b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/makefile new file mode 100644 index 00000000000..77994982206 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/makefile @@ -0,0 +1,42 @@ +# +# Copyright (c) 2024 Cadence Design Systems, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to use this Software with Cadence processor cores only and +# not with any other processors and platforms, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# + +.PHONY: all install clean + +ifneq ($(BUILD_SCRATCH_SIZE_LIB), 1) +include detect_core.mk +endif + +all: nnlib + +install: install_nnlib + +clean: clean_nnlib + +nnlib: + $(MAKE) -f makefile_nn_lib_fusion_g3 all + +install_nnlib: nnlib + $(MAKE) -f makefile_nn_lib_fusion_g3 install + +clean_nnlib: + $(MAKE) -f makefile_nn_lib_fusion_g3 clean + diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/makefile_nn_lib_fusion_g3 b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/makefile_nn_lib_fusion_g3 new file mode 100644 index 00000000000..885e9ce583a --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/makefile_nn_lib_fusion_g3 @@ -0,0 +1,102 @@ +# +# Copyright (c) 2024 Cadence Design Systems, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to use this Software with Cadence processor cores only and +# not with any other processors and platforms, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# + +.PHONY: all install clean + +ROOTDIR = .. 
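+# ROOTDIR points at the xa_nnlib tree; CODEC_NAME below is consumed by common.mk, which derives the object directory (objs$(S)$(CODEC_NAME)$(DETECTED_CORE)) and the archive name (xa_$(CODEC_NAME)$(DETECTED_CORE).a) from it.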
+ +CODEC_NAME = nnlib + +ifeq ($(DISABLE_ARG_CHK), 1) +CFLAGS += -DDISABLE_ARG_CHK +endif +vpath %.c $(ROOTDIR)/algo/kernels/basic +vpath %.c $(ROOTDIR)/algo/common/src +vpath %.c $(ROOTDIR)/algo/kernels/norm +vpath %.c $(ROOTDIR)/algo/kernels/reorg +vpath %.c $(ROOTDIR)/algo/kernels/activations +vpath %.c $(ROOTDIR)/algo/kernels/tables/src + +COMMONOSOBJS = \ + xa_nnlib_common_api.o + + +BASICOBJS = \ + xa_nn_elm_add_32x32.o \ + xa_nn_elm_add_f32.o \ + xa_nn_elm_dequantize_sym16_f32.o \ + xa_nn_elm_dequantize_asym16_f32.o \ + xa_nn_elm_dequantize_sym16u_f32.o \ + xa_nn_elm_dequantize_asym16u_f32.o \ + xa_nn_elm_dequantize_sym8_f32.o \ + xa_nn_elm_dequantize_asym8_f32.o \ + xa_nn_elm_dequantize_sym8u_f32.o \ + xa_nn_elm_dequantize_asym8u_f32.o \ + xa_nn_elm_dequantize_sym4_f32.o \ + xa_nn_elm_dequantize_asym4_f32.o \ + xa_nn_elm_dequantize_sym4u_f32.o \ + xa_nn_elm_dequantize_asym4u_f32.o \ + xa_nn_elm_mul_32x32.o \ + xa_nn_elm_mul_f32.o \ + xa_nn_elm_quantize_f32_asym16.o \ + xa_nn_elm_quantize_f32_asym16u.o \ + xa_nn_elm_quantize_f32_asym4.o \ + xa_nn_elm_quantize_f32_asym4u.o \ + xa_nn_elm_quantize_f32_asym8.o \ + xa_nn_elm_quantize_f32_asym8u.o \ + xa_nn_elm_quantize_f32_sym16.o \ + xa_nn_elm_quantize_f32_sym16u.o \ + xa_nn_elm_quantize_f32_sym4.o \ + xa_nn_elm_quantize_f32_sym4u.o \ + xa_nn_elm_quantize_f32_sym8.o \ + xa_nn_elm_quantize_f32_sym8u.o \ + + +NORMOBJS = \ + xa_nn_layer_norm.o + +REORGOBJS = \ + xa_nn_cat.o + +ACTIVATIONSOBJS = \ + xa_nn_softmax.o + +TABLESOBJS = \ + expf_tbl.o + + +LIBOBJS = $(BASICOBJS) $(REORGOBJS) $(NORMOBJS) $(ACTIVATIONSOBJS) $(TABLESOBJS) +LIBOSOBJS = $(COMMONOSOBJS) + +INCLUDES = \ + -I$(ROOTDIR)/include \ + -I$(ROOTDIR)/algo/common/include \ + -I$(ROOTDIR)/include/nnlib \ + -I$(ROOTDIR)/algo/kernels/tables/include + +#CFLAGS += -DENABLE_HIGH_PRECISION +CFLAGS += \ + -ffunction-sections \ + +LIBLDFLAGS = -Wl,--gc-sections -Wl,--undefined=xa_nnlib_get_lib_name_string + + +include $(ROOTDIR)/build/common.mk diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/symbols_nnlib.txt b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/symbols_nnlib.txt new file mode 100644 index 00000000000..cd4e3bf1c88 --- /dev/null +++ b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/build/symbols_nnlib.txt @@ -0,0 +1,39 @@ +xa_nn_cat +xa_nn_elm_add_32x32_32 +xa_nn_elm_add_broadcast_5D_32x32_32 +xa_nn_elm_add_broadcast_5D_f32xf32_f32 +xa_nn_elm_add_f32xf32_f32 +xa_nn_elm_add_scalar_32x32_32 +xa_nn_elm_add_scalar_f32xf32_f32 +xa_nn_elm_dequantize_sym16_f32 +xa_nn_elm_dequantize_asym16_f32 +xa_nn_elm_dequantize_sym16u_f32 +xa_nn_elm_dequantize_asym16u_f32 +xa_nn_elm_dequantize_sym8_f32 +xa_nn_elm_dequantize_asym8_f32 +xa_nn_elm_dequantize_sym8u_f32 +xa_nn_elm_dequantize_asym8u_f32 +xa_nn_elm_dequantize_sym4_f32 +xa_nn_elm_dequantize_asym4_f32 +xa_nn_elm_dequantize_sym4u_f32 +xa_nn_elm_dequantize_asym4u_f32 +xa_nn_elm_mul_scalar_32x32_32 +xa_nn_elm_mul_32x32_32 +xa_nn_elm_mul_broadcast_5D_32x32_32 +xa_nn_elm_mul_scalar_f32xf32_f32 +xa_nn_elm_mul_f32xf32_f32 +xa_nn_elm_mul_broadcast_5D_f32xf32_f32 +xa_nn_elm_quantize_f32_asym16 +xa_nn_elm_quantize_f32_asym16u +xa_nn_elm_quantize_f32_asym4 +xa_nn_elm_quantize_f32_asym4u +xa_nn_elm_quantize_f32_asym8 +xa_nn_elm_quantize_f32_asym8u +xa_nn_elm_quantize_f32_sym16 +xa_nn_elm_quantize_f32_sym16u +xa_nn_elm_quantize_f32_sym4 +xa_nn_elm_quantize_f32_sym4u +xa_nn_elm_quantize_f32_sym8 +xa_nn_elm_quantize_f32_sym8u +xa_nn_native_layer_norm_f32_f32 
+xa_nn_softmax_f32_f32 \ No newline at end of file diff --git a/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/doc/FusionG3-NNLib-API.pdf b/backends/cadence/fusion_g3/third-party/nnlib/nnlib-FuG3/xa_nnlib/doc/FusionG3-NNLib-API.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b62fcdac8b28c5780f8fd4f3397ba8cd18134ad3 GIT binary patch literal 1034936
z5xdQPUcQNX)c2ZQp!@ogly<+td%F7`A|_c^wB8u$J+tUGUaD;p`dRsv%6a?l*6Pc&E%=drBB@uosoj@2 zmt_Lgm;Cf7NWJI)?qHzw?x9MlrBF2i-zMT^)Tu{BS%#8s>#gTitER-BKA;|*)$$}P zI>VYsHk8#>OigDd7Vug}FN7>YAoS3{9jBIO&v`}IrY?tTu)kB7#*JFNP|-h>5;+lG z^3Z79rNJ&hw77NgP)t#%QO3_8Ty?sg7ZgE~3B1z*O2@otLbn=?98iA+k|>}m0Kx|bfV-E~MQ5kU5KLAe1<6ir z5%K-LC2swX8-hL4OyRnva8EJMm(NQ0G|$Y+%1-$ieM4xV#HW7cHn|2kGV zkFH^Rmm?!B$XRV5Sc)O@n_RF-pjfGv=}qHuEM!d!wTP^QrvXrPtQ*X_gZvNvLDYL- zLeC*nWcN+x^n*22AZj3R>yki^`auUXo`M{XTZSMW3VZ{w5Cq~~$+9=(T=DXl*%*Bm zIX&Bg&bE-^&-_wot3y<5=#;r~i_pa+Zkf*Os4@pmUO91%$~DN`WRo@_eDzD>n&2)` zDcK3{cg{Dx;naR#2mQ7~L#J|S?&b4T)QH3b!M`KmcNaek5o(y*juJba(m(XB@lD^d ze^uT%GKnznM|SSBr>L`D*S3If(`p06N$YpnRQan}L=RT7H#(|0^p@u#jCjJ#SF8 zGNw(mp}5l0L!`Que^tYshNv#SYStWhrFBt)HMJnPnohEsmme)9ePQg@HosrB}I*@s#+y@Ala?dAdU_?({0TxaiRq&d>1mbgtdQw0(G z7|9*D$)l0!M6V-@?i?)3@nBb~`!aR{OUO8)_1ydzn-xnW3WlGMo_3}x*)hEC&PXu_ z)B-|WU#Ev|TDW-6L%n6=&hooMI84-|!;GV^r&-OhS7RKk&P4UBqF=WQGOeLInWpp( zZi}Du>KBUF_YZeq7na%&1bMBij8VDq6L+q1b|TKGMy4tF-eg-`)#eTQ*b{(w# z@z!%QGs=nMs@}I+ZV)RU{qX4a@_}4^^S0Mv_4f~7s+Pqr+a2gSks!b1%4XtA%E-8F z`wh>VE|(v5{Fa(Kx{qz~zQOpxzMF}^#gb!FuSV@YKd!vuLIY;v+S%#fUj0`4UNNk! zI2z|xaj3lg#b`=U!NIcQeyI@)f&$hZe#I}gQ~2)eS%~2`Z{W_AZZq6(1k#1%)k|A0 zzjxoYO!P1+iM{KAQPI7-9MYDlM>4Df<0_{{)3~`gq{xEH+L(6VYvB-KRP9Dg=&6`Z z&nR;jw#X{XxHRIoAl$~C`I~!4mb;4%4OP}WLgr5#KeZtxR2{+4(7W>E=G{hjLdP?< z@OZYN8mH?Y)atFW+E~a&IM+oNxRM)Nj7#=zQH8vchK>|IBp<1-@;koj^_k z9-|BxM^9N?hH{A&P1|OY@`6grkJ4|4S`-SJitq)o1AXU1N&05!2V5tiU8QJvud`WC zm+UeGdVOyl-qy#8upeMI_DS(N7T|3WczPWWmp4SnNu%AtjTC|k3S$vq zBt4deU(~UDn=j$QRy&>fq9Jp8O-NkP8Fhh zlcWJMG*ZM<1K%iv%1IM{4-*uZdt{{*h`|P7eGu2K%d~>B&d#O!nyH9>4-jUlZNTUi z`OrG$nxA{A(rV9H>MOeQxiG`o#%&}j9e_Z47A{WfN)Ys;s%{0Bq!-knJD&+h27L-=mfkSibi4l{~mL^<>EX;H~5F646F)^B~o zuF4!UlN#b^@ra>S2ht1Ub1#&k8STSI-4bh49ZQk#%T_6!E=tv6A8er+Oco)ZQbV;y zBDRBM%K4-fiYwq$e%9qyEs#dZ9D`#@Yz7>GgIaoj2t6zf`U$F~iT>aK6C9S5P=hfR z7sURTIhQZ2(!cC=r&B?Fcq;Ga72KKXjP}Xg0cLc4M9lfRy$)K>0AKm$vl0)gfFzF>pxss9kXzVWxR z5k6tI)j`&MJtI|jZI&PxwN%-LK-Go1z4x%rAUWd>&mX+f?YTBt>88QwD66R7N-+R& zCven%pl1Lffh9E+Np0_A9pJs;Q=`|ThjQ@54aY=MMEA?wPz%CR>O{54xm@HzgZye< z#oc0~_(t3Fc_7xS^5r`gsQOTk8b@K5AyCc3B%*!t1}chUJg@bLVZXoZGve8{Cjl2- zj&jLtv~OFJX*$VFqrSujeW8$d_qftqEhFh_1ebRry7f4uG1ixFW}1>`W=L(RiqqR$ z;F9Rpud{*T87^YJJGzG6x*3x{j_#uW7=<|^h5V5O5h#rD-mJnPj2dabr#sDN-MN_O z40qqn*wZJP>#kO(hrI9YRCC;(6rL=<&Bo;e<)dp==Htcj*X7AVpZ?rh@cwDW*__)a ztt~EVH@vgU-T`e5l&Ygg1eD-z8rxiU$A;O(o%fZOuk#L$z!fg~c+o+Ry@UNEtgvQc ztZ8YqcI!HgPP=uMw`D5R=cjLe&6k<8@wwZsncoCmW!Tq;cv7E!NBp|8xL`}#oMfp( z=bm(}xPd1g+z=RwEuT7NSD129w{ZUWjpKkU}zC_8SJF``zY_Q$b$1-+0{V;@b%xxxPlfBaQ~nf3Rx9`Ixxi(f$7Flix7MFSoDS*KDRm zz7n8(VqsO=vtPEnYh;{15-g=Fo$}i*s=C3ng3LamZu^xv>u%<$xS5!A-+y~#e2I0; zo*P_u+}2xdR#hpVo?!|4lW~t9ez!F5k#Rbu=T>L%-hi4rd4aB*8}suuCx25D{lMg0 zB|7eEqIy$4b>6Ga7|2YNM{-f;8Kg5-jf#1zvjfKJ1=qIs+fdsY#;SXL`szjt{Y!Sr zdLd6y%cGXI2Ihyka!HM6bFFGyq`ieyKSszQoL>(aLI0a?16Od>X z15m@r+?x+p_QmH6q|7^iHy`pExHCA<&%zZV+~Syu>;7QU%gGt=XEOC3LZ_x>iDvm3 zmBHw9^nrIEbS>vLd?ud7!FXFHpqZl8aGn^7BdEcPFXTOr?gPYdm}gW9DB9^`Rph~i zbw!vIMX--A#lWZ#6t>=Qh&Ug0i(Q!0-*(gzRF3C%St3D2M3NE>9|Cyu zJkrX1FK|KvoPmVv3g}OPOT67MqOL+vty`hDx1Im7|FCKKo`@40Pxw6tbe&wO`l6g- zIgP`teM^UOSdYoL@PiQ*iM5IRPV|$I>%iu2LEcTN6eamC7F1R!&FkiL1KK1k1|;3e zFfUs1W2!|z;kP2AtI8r0cGSDQsYliIbS53n*8MD&?e0QsS3 zQX8hNj_7M9PQCH#Vx@2psB=n3MCTAek#z=eh9D{3>*){3!a%?me4+pPgqj{;Ktb9; zvex1=^_UU}@`{r8cvyju7BB_kO_Iu&pkwjS&z^>_eU?!uDL*wLSl%;mIF|AAx~!t@ zdJHydBj%;W;KNq;PIQ)(X#f3@u&5UceXkm_D1#>>fWPyvAr)Xx_cQ3FkNB@5l0(xR zgFFQfa8R&I{hHmo<(o_yiD$_=YC{!!ya)?{92FZDA)z)hUvUTenbP8nL6lRAhz@+ZwyXrKE3Vxp~-{>l*NX0#a7**&=QBW 
z15c;wvWRxY{sj#Mz>THyZ)t-`r~EMn1WQP}CHO>WO4Z(co;T6HR~TS?Q}?MOEkC=# z4p%R&@q*OU@^rlkK4rL(6Gn-D5*mF{C-dG37 z<#Il!O6^?e_Pr^)!zq`%WKoW*wiWwQ_igX)G)^pWVF>mk#@J@5dUyAyugo>n3v#eQ z%mZ}R{_|Lx{}x5GvmTw>!5i_?^9yrWSUGP-X;kOJRw#+eWZ$B_P+xR|T@&~?(y+Q}XX`1w-{@_(N*nHI;be4ww*BV(7B>AHA zCv$Hf(c$3GLXuHy&a|n^(zfqB2}GPydHLI+QoB12r@Mp(vT6DsYlm8wc(d*zKL&Jb znjCbx7UE0(ZO3{}zh&gf>?sXfohx=y&-d_lm4CRY{=+lNUE2(OhO>0%C%W#+C1nrq zA8!v595!BO`~b@HZASNvdad%dRX(ScMDgBcv+LB-$Ee-kU!Hj2Yq1_XXMg;INE!FG zc+1RPO@XC(*BU$%FFthttr)V3@IN=N*I8ynz&@?IlEZFP-FDz(@deth+chq5glkGGG%AHRFNrqZ{C6P)y#RhkmBGu&uJkJh8QUnqjT<;N2{s%>+u ze@Cpo{UIba9twE$a0U8<4jH|PiD+||_t>$#>he(4q2grM>gvIHbRCnV*T21_U8kts zI7dBIMYTVCu3z1AuM%3jW4xMOwaF~@(-{xl${|v{slWbEn_%w%Pawc0+ zp8n^WUXq%pGmlm7rlLSFWwN3i_Qc60;ZC#YV0A=UC#c@!h!W~2!79@!%(Xt2+Co#$ zx3&3$7!e2fzQswnfKl2V|8kpSAMDWWm?K*KVcQXBNqO`OE65~->_#2J zlz*WX@?BkqtTwl4Kowpik+qv*K(`ODOD8L-h4(b9MzXt=L<#(9|1;``1)Gvy=YcxC z1nt^XP^apKVU{mv?lY|`vp_!~y!1Z>Io=--+}7Hg)BhH-L4m;Ur)(`ynxBk*{XY8= zqW|P8!d=dobm~$TM`0db4d!lP&m(Ps?9kbX zuKio4LmyskYfAo}ltMhPwf0)WqwMUS%Z0uSLKlcpoKl8Pqzp8Z)(j*#EZ*@Xnf6G1 z-$Y+qzY|!Kl#;yXAAslk{r0^1nm0T!l_hHpbHk9kU;WWoQncWZW;Ls zLa;0otZHGx>*isPMFboU02grXY<8OIYirZKcrfu^vZ%9z$CPh%g+c@7u0;$gw7zjj zhI%LlhD?@y^`&b1;mA$+0ZwP+&zOpH(A(qoD^A=Nl|GjfwD@{^b%>s?SZ}l}{4h0t z)kJC|vvs`!VY8;7!1ts3d~)FGxtFQ!SkeX28rc^nqs=D^(AUj~s`E0)kDH7F7v55x z7%tPgPrO^gikQdyVWk0$KN7o^SNo9?14@|AC-e~GSop@bM=LD}i!>(sbXtmfb*ZBz z5UL6H2j}iBDuDv#mZ=e+8xKNMVSfPUxrc$!C5rfx)JGsen)VA&dVD~Q$y3iQ$jpKE z;x6PBhdkPHKha!a%s|hzq~;dGKPzw_i~r8*R~6rfZn`X%c}z7nv4_iO?p~#X>T)Y! zWA?Gyk%)PWl{xzNf$PfC_Fk2+vTjF)vLc$k$i0z&%}$=0%ejnSU&Pv96fwBofsHos zXC2^PPG4`UE~C2sTI2rW+}edNmY(xjTXCIszIiFyug^I9I&HEzxH|LjLnp3c9KvnuLy?N49)A$H>0TJejs) z$1fj3E4mI@rM#~1`0!!K^hDzBlS^0X{z^zAcl3NXp&hbD8D|`yyeoSfvLskLVJ!6H z^<4%7EbYGNb@93jq(2Ov)3ivinsArv*m)K>w(k@Be5^M7!0$00SfenWaO23cRrB+2 zZ-0b5F^VZ##(8~>TJCHgP~qC+v;3Sn+a@Q?E;)YkKtfC_>*0Cj%e1#LKm4-O;ZVcf zQ>-iVm*%N#c2c#>&LK)C38Z&?+(mvLhTe!gy_e6nNLV*gu8sKcn+%InGGQMSaH3>2 zq9Ef8XXMf@wX{#!sit>SoiSIo=Ai7By!HL%@Us8`q!fNN&$0>0Uh>lEmD9~rk0=iA z;ZH)bZNK%Nbt?|~*?PmT^3VL!?0$k;&npMTBwS1y^IGv{$*X2N*}D%%KQ&&ZgsGMD zI!xNTa1)2AoAZpY*+GF3A6T41joJ~PLa)p9%M_0bANm+9+4}m?hmfsvbC*qi(4WCT zofU%z>NfjZUf|x=LrkzLueR@XCceXlSw5^QS^nzbp*M9t{x-pOLDC1=+CR{j&zL>_ zPS2ysxbnF{#+C0gmsjpA-W%cbBdPlE)(@=S);63$?5=Y;GOW_5T9SND8=+_G{dId@ z{Br2Z;H9`&>q6ha^j~+L-C2BC+IV~UZy#oiPW>?EI&gPe%FMFe&wfWdo7l15l+CD6 z?>_J@b#2ymqLr#_s*!5?c1pIlkA3gBTy0x+RxUzDX&QYahS;+&E&SxYW!v=HmcFM5 z6Yrf#$&ov+^&;^236{HE@KcZ;xcRza>imma(izEq^)$g7hcU((rKm>3u1$`2R{!Kp zB|4?k1VS@3Ke?vAkpzbpf0`F`mMEcPPCsEzzfyDy6Y^3MVkUF?S2e0;WBLhJ+erxy zPFi)W0Mfc)V!=y4*~vV*^9<*F4Olf?3xT)11lC5!*B(gTqN8`}q`W897n?LMG&-3P z0CXQpC8_CkS~`~Sy<4!3P^4=Pc(DRa>JFdtLi`#iZ&C*pkE+)q`r|}!;%zTj{W-9= zCRzti^$zYrR+2;%yG+R5-|Hq^$HQ4z6tr;nYM&gAf8k04T;0*Nt&X->INh8-(Uk?t zVIcF^!r^>&&Te8k;tB4$?WpR8LST<(l>GcASv%G%L~D8tC{W2GwHBbhI==lO@oGID ztppaE$LFx}qK~?PcORtq|I#1e-NqfO<<23KfN5WJ4(-edBo-?ze zyEGX&o>Yn`;8@T>QMdmVrEu`#8;;8n8_mO1#AMOpH*A*F9n@qFnXtOo^G1_yA5I8f z5Chh$X*q;>Skb^r{mA}a8BA;+WJBwfF)cMMk}H+VMcD12b=SfHQ=^h+@($Xgq1?Tt zaXJS*x@T(Cp84EJUn5{x&gq`1P2%468L89yo5R)= zM|Y&wCeAc)`ilIY89lU|Acn|Pir!xeu~?bD&&ZNANOaMok3C2dQh$&ih&Wme9f^@D*hj9MR@XC=JH2UY@NDcG4(g{n>i>(fw~lK%4&QzU zC@2z&pmf(L1xA;M#DEbSC7`6hrgRBNDj=g9M#t#xFpv(ZF%T({Zj_Yt`Qq>V;(49t zobx(=pxf9K`0DP@ec#vhz9N3=ZuP_HhTVBI>4UMcEi3gAy#DX}^fI6L1vT@$E9EJR zq*1E~kvEykf8r@iia(R}OIVIH+zH@gOYXniA91KzGYpw_6r3ohxK3Ta)}C%&7V(0+ z9jIg106~{JAaNzY0{^ofJMrCXRxV0dWSYbQiEKBS>y>w1+Ue_TfZ|^+zS%&uAJ~#ATNBWEY>}S9*G51@f5_ss*KG zJa`27a4OW6jaU_UG9;D)%75cMs1Oa7Ab7UUY1^~`&NMHm>zBc~vnQ1a^zoKBx?Csx z{isa7Uh6EecOaBGVTMw@=v!VritE1A(*v4j 
z@a6EY5z_^gXu-?CJ|b?M(*PDx>0|1q}pR@lnLS5Q>y+2Exwe?~*#;WH$w*7k{^Cu7XCvfy{x z#s(#7j&Drb3$M??4Y;Z}4t%BH!5{fmaZAdR(IKQx{4 z&Pvz#U?4{KXmeBByc=_S3Ey-?5}8VfM-L-V#GVu&oijhg z>kjgIl4d!-Hcpe*lu?L0akN<1V~1K(Oq#WP;4RMSgsx~kmX&%#4hi% z$bsfb4cX$0hj~7GH?!<;%=k|{ocAUzb|5CJkXK%lgDHF!>Pjodx@eH1W>|Rl83IRq z#wAtuZSDM z=%6U+Rl~@4Ceh7FmQJg^;-&Df?=S-BR*8cXqN$R%8Ceg0P9KyFG-*l*9Cpuw9aWf(*)|~ca)W1hq zEb+TjX8C4cbBa9)EKt-e@jDSuqB8{Fp9>#-yj1qU?zXzvo#6mEdILq2v z+UERhLREDy%`9~)LfKJ8+TANdw9`VYUcC0rAyk}YlWQC~U&RXbmMc84+VuwwityE| z$a)WSqzbsx7AOZi<@#wfUBv0+kOy*LCscw_8WSLZ<8#$`i+)ok1QY`a;)J)V{8gqv z=O><&l;aH4R3?Bz)l{boFqcwHjR9Q_c<~qcfYm(J~o-O{7Sa`;F117u~<4fehGB5|y>ZT)@YxXQxVRX4Q$V(- zIg0W8M~fJ1+7e_c7F8Al=wDnkj|6G+v4h-%3usXlFMv0s;sq?gZvuN7K>B{IONbX+ z5Chy(u^R*l@#YW8h#5R4HMSQ*X5ULn)xcOXsUYiguh!;(l;B}iGj>oMjYN0UsvyBv zJSRE;CN&0HNdg20|Hx7Z0#~R;7o+zGp)N-95W4_u*Q|A80Z0D zd`2y9Dqj_6z>Nf`k3j&*t&6(NSQGHP0NTkxTI@gOBmrm}cmRGD{5ZP<57?&wR{APAG`)(vnYnf1~p?1vR<<{MBHYJ=1E zni!u+E&a%y?6vEXAZf!VA_t$il}h?YrxthhHt8!Bwexvsrb%^zM(LWx8R2+skxK8t zof^sF;`i?)%yYk6EZ@TAQ0<5zz78Pz+IlN^!d*PqUiMjfagqw_IIQwu?|>6eAe&e{ zTIR&`nPp1k2o-d)kTX?`8AffGVTI+trGZh`Rmmm8I9)*mD%#Dj3xS=ry6j>lu6){{ z!a(D0&IVRvC+gg)4@#xJNh~#wete2~)9Ft-Q61cyP=-D}E-l{Buwr1T&H4Jy{|*bT zqV_stx^w(fdQK>{i0Nm<9`B_)YT*sPDDzuaAJQL*ugsV6{dkAtzK0~snh(snt8CI^ zzIq3*ZoDqu<#Mbg>{;~?_!0IOM4;DjFA)3Y{o9|>6(s8GK~OsSOvVQ217k1ZH%(KI z$W_ZVRyGjEs~5(6m3WGmsvUQdtDBmgqviD@{9B~N24-hc-EUS9EMo-}r0YxXvd>Hh znC=V_%e^cb&v4ro=Qllw{tIGm<2f}DPkYr!Kik~16TPcF;(xf|Yoj{u{?dnccu*j~ zc1+HHxL$v!@5Sz|?@*?+=Wb1{j(kKH_~&cJ6!=wo*a!o&o|+JLGW;m z>V4KW`z+f!vnNbuzZGyzpvyPAvY~t*_UB6YcIK&AxCv;_h6j8o)iqCU`;S2t6g9Pf zsN-ldEEs-@%3y`zYQ%sz~OwS6-haYb6 zyS~t8K|BeC7mJ%(fwJF-vUhy1>%*T`5pZIfzOqI4BlcN7`D5>idf$cezT*28-i8f# zZ5S^ZNRniXAJMPx1Xua)A4ebLi3t+ zcLrrom{VBYj}mM;j-Q4Xl^L|^koforRmyyo56&`{r z_i3!On)mG!+Xn2G&(=&rgRA0Zd{XsNWc2TS-_j|51Rm@3{ z`?TIsRAn>s-8k)zUJRwFdoldy&F4})d)Tkx5*m@sMH+Fa8q>)1ER=JG)GGGAG71(4 z<3_Pb$48uvu7dL(aY}UwCr?E~EE9TpYY1AU<`F>@`S6>n>aWdJjZLeW+W4Y^k_qD> z&3Q?3L0b-O$WJ;$$dlYCHT0G^+LTmXN5ok>@%rR4-AafQq|76N)W)uc0Dwr@M1tIb zde)qm3aYD{8AW^rS|IVHP}VA7w*o_x>HsRz`#|+WRa_w+{)y%7nuvjF#8A`%%FhSH zTp1(4h){%g2JFgO%cRA`CwR~rsZkWO)d$9Qsm9rrVf7EdPvA>9NBO#hF{5U+QZ^0G z{6V&Y372LycgQGo=p~r?sv-i$ZNNa3nDDeKUAH)E2Vktov4r z68d!K9@ATmw~ykI2Ze1t*5q3rsj_j zhDIUVbkU=L)KgfiEf=Hc41}_0LDAm9okSQ-L@U4mVhh(mJ0M-P+61ZLx)uOwux`!f2)Po_ zY)NGkWvNRYV1aQ&zzjwNSQkE$z(^J9J~DNL09!+B7lc2u3wT@AKwCIB4B&Qlg--l;2u7I20Vq`v|9kc@f%S%(V# z`3?SWS`J`|_U|#;8Z{Kt_g$E0fy2V5$F2^>neVJ}9$JE@Pl_bqD{*O$7SxB;7}3!X zTZmLcduRJ&R97qcq8JP{UiWH9`r>6J>C%Q!K8&Bfw0C$}LoJW=y7Z^mSI)PGFU$6g zDvzf(O%T2vMseaypXz_E{GgJyBZo}g%&(-EP*tl-$BpVY0bd9}T}Y)t|0Yk>x3h73tNfi zh)=@vkW#42;K=x^Q=CmRpJj1}!yJ72CTbeBwKx8N-z!=_s~h$KQV8skPZ(yIhgei%}TzA#7XSH19Pdx z--PK_#c4KzyAro_*R&^xww|#B%P?4Oew12KO#oS+dzRi|RkE*AtVz;warN#d1V`bP- zEQL2OomdB5g5M5HthS*XOp@J1nw%W=F>3>0N7cf^1wmo$>Lz!jyY&RCILtWz7l7ZKR0*Py-$D+Yy?4c_ zu5a;48v4}Luq1l%;KwxT#(cdg6*XBb?R11^X888(;>?8ir?aWQr}Ld?jQH+)yVnqE zMjzil%xzw5D|N8`1%YItqpAoUMaygCNmjv3DEqVMY^Li^hA8_#@UCkaEAbfDlUjH- zYIRIxhk+P=6k2Q|90gpEBC(^$2%#<5XNJT^-qYXu*(y}#Z)2y=FNF2orr$b>|1|OJ zt2y})L$UQxYTh>XGdxe7C9>Vyf*#&80LEaSmXMZ!wg5o=r(W;dglIF za6B6HbL^vY%a%2=RGxCj#x%?7%cW&VJvjWLkmXmBLa=a!?6)D3#i!~{D7?LT-|f^q zhGaagy~DGi|RHGOQ{HB3ErCn!=Q+!1#m1!rHLIGyx)-^fdJ14rIv}zBf(`r;q#%+O-K1% z3f$=7Sdj`*YFjaG$#oe2HnjcASHM6$tx=QA!PHyahB!h5!F4RYzK;}=ELNv9hGiaH zq&2yc5eC9v9p^dlHPcpz5(j|+PpTMmp!3IKVeKA>Wt851jxI?mY4}1sjd(-R&1*|~ zfr+7FX4Hg=be!)mn?wChkKKAg8`IMTwJqKEfk|8s;2zx9nc2@ zmUDVCM{n>?J3@>FAPE1h0)B>G&;_8kg-xdRf-=;W;lif&)aSYl;n%y-!dhKIQ>CPf z!phKw2%X&fbZApVwFVk6&D!vGDO{QA_+uB6foETt)PX7p+yHFsmKj$Dcd^@7F4`-n 
zJLw`YEI80Lzq;>Ey#@RPuROE=q4K;(1K8jTfp_96@-9Ik-Yc%$6>tL)-b zSzb_Xsj1NF!belXk6OJe(VaTx$fjaW+BXRPE>w^4ByVm6Ld^mSU^38evbltSi$RTl zJRcr!Y*1V;x~XRgH}!p~OH56A@X+mYG}v&&-Ntn9d?Nm#Zrn?XjV%qyNP^9YhHjN# zA_ZGLixA87`$W`RhROZz|9~NAQ+?ier(CoXP44>&5F#L-y1Cqn%4jF3>BntU_mM$k zz{=F!WOKjYFvH@mb>uhRs4~VroywDR=_>R$jgRg$6dib|i%%zAlSN2|KQn46)8TYbAj?{P z+G1asT0Y(G1v{(FERiZ?xQjFiwl;+Cbz3>* z?Pqj{k3F5z<75o9_G0S>kC%S>mgLr5qw?F+50u|HEYQ4s6;AIv+Lk{YOYFa#cNr8* z8GAD!I-^NfProapoi|vT^5YKD@c53S!X`suy)tPCx9VAGA5U{ZZ5?ij;sXJU{Y@CV zl0cy90;;oy@EQ`Rllhi{3w!wU?c+!h-YG-U&so@O(3xsSmK2xQx_JE7Sue;!RZ-JM zpV`#k&0W4u1v|sLsn!?Y8KN>(Z{2ijxp0IrtBL3*|F_kkwBZZx{?RZseJTqbEZY*Dh<%F-99# zl95vfo9q$|`Fi^c1G8D)=o|5LMw5)pi{}dkUFBhrREh7rJ0CytGnh0!xPI6m8LPp2 zTu#erqA~H7lTtX0XXF4oZuxbzZ{*=5Or(X`)A2znsnlj*`ssM@B%*O@%G6Xie-OTN;*zIJLT{C7sOPY zwDIli^Pg=c<(!RiR+Msc!{zR!4E$EOzZ-1e#U?Ar?{o3RI2i|B;b$Lxi(R6Y)F-7j4~vODL{K|tsXM){jr@!?Qes4YsJOW%Y>>+zCbjqT zR;IRb2_HUAWwLjBU6cI&vA=pj{T=!^@O0R&_K&N(I=@f-5{z!_W1NbKde%?jA<<(8 zg%)Imq)yfc7vS$^+PI7Wdr9~*(&bYEjsT3ehA(;R10&w92LJH%0ebU11dR|!6TqjL%%c$!0PGV$x0TEbxc>m0o|LLHpnj#!1?~fMIb*7->sOWJ z8-cJEA&^)QV=4@7NJ`4$B?k(cF&H540jLMOZhs13%0fE`BM{CAUbX_&h_QCSSc;31 ztV3XXIN9{6c_^(jzzxKYX2$kSI_vb+y|6gX;2+88Ml6#r>nc>L1yF z9}M6b02)ideX@ zHnNo(!HV+dS4g(eHv$@QmdF#mF}QQc(np) zp|1-^MAYHNXsEy|7kuJ(Gu_Wz4II8N#k-YXeidG2)N@h0$pg5HA};m5jC&zAHEw^L zCS}Ng@zOATD3#@BKd?YpewafPEC2^5<9y(B#1~RpRFHwfNeKCEtQME*a}ahxQ#(s-yjwZhg5tMfFD5#IyJ;A6sqh@)u-AJvYHtOFVyaEXrc=rKiyBSIPIgnnr&? zrCON@vb#sFKrs6!+ih`S-!ARK6*guh8oX6NC zgh1+AUUmV`q|+?NX)S$o*9RZ!=!X{ze3Ez*)>lM;pAm`{`xX2!*FbJ*(=}PM`U{sjJcVCCV`cAdoh)G zJ#&CGblkTyvW|^d{0l1AcW2|-Y2RzDjn?+AEzC`fg)Zoh{mOpn`hJ2#8IR#+GjOLO z?s+U(^3s-@AacB0^h-I*y&UbW^l4 zAcm7rvM)V(q>YS6d-Jl&4C5%HFe|_D7xa1eA|tw|Lq8K2gv`#GR5L|;N8aL6DNZaA zYfR>d`gF*r#4W$8K4hE_MvX5Szd@$(?oyxC;Bx)EMBpi}7)q+Yr;Slx`QHQjh`J@l zqlMTffy0%?PF5oG6G z+qMpiedm0l@$FBf%1313$jw`Es&d7Bn21Bj!E7dpWRBn?V|P}P5nO;TxeVy17Igrl zTVa4NIG+&h&WbHyYfp+n1Os$pB)H8c1ypDm0G=#SGbf=2NGIl`#!~-DE&8Vv%>(wy z>QebDF+h*|0jF0Th7ncfVg;72ytH+xS5WYfq__|wz`+N^Gyv=a8GvT5Oouh603#(K zO}QPE?MZgORRo*RbR9daLkHPtOAN-?`;ef$%mj=yBtn;Fj+a`Z@5^mA z-qg;_-4iW3v9x1t57ZN*nfK+pd|hJrrufvbJPJB$w^;bwv|A3Dk4P)$*{jS_JJ7kJ z!lm=8RV@B%tic=2ZHGC^bL|ctaFHI6K_WJmMO3dLDTjE4alFClZMtE3XnT>*z7(H< z;}hP6Sgal+rtJ7x@_tE;jD+!20hg*;s!Ezq66c5ETRP#Phegg>gw0G(8-3DvmG85+ zD0Csh6W8_brK^gKl06ws3wkw51*RKhNG=jfzgb9(`nGw0O}kdefy@~W0+l$&w^T$W z)`MO7W4SEBmNQK>uth$nm&z^r;Is!umPl-|-cqkPZhov9&i;0yIqz1jNz6hzb!Jf6 zbyIvB!HB`HJQv3PQX&>q2Ah5{n666Hq&vYQS$=jP_T= zby;RBe{eJ(R5%;*RLqG0@B5_33O`L$H=}WhdB&Km4(bG}on(Zac&iuZXho30ETO$s6#*AE)cmle~+RA*}^MbGI z5ci4o-Q|ktmmUFM7>B04pnM-$WzdZ`$E&B~RH!)oa{7*pCrxCrS&;1ewCDx-dU>qR zk@qkRhPjbKS-_}~05}viQ8%7vdp~6Y_X3 z&|kE>e!>5B%faGux`A0e`bUw-z4c+^EwMZoH+%K@W`DUP+T#G)2iM#VSJ5sL-S%rv zwL^@2tkYD!S+HVT^WM*c&~j>U$~{g@Tl-6{j%$iC~$oZ zTlz`o3#{W$20j0*{fV^u{gKW*oEJa5pK4E4pn0e1(_<8de`>y&nGpb*)v=RtYJpqL zm<6*I3fB!}fZS|1iT?dn-aG-T)1Yo8KXHK=ZUd-FEyr*XWqP2Z-@ zj)036auXVnsx4;2JFurk?Jh>G>*gi-yknxwaYlGlRZ(Z)$wnJ_c>7EQXLFqFAbWhxT*HZK2k zPj!zywq)|VQAc-LfV`zhmetehfo0k(XL%#mz|Bn5VOi_8lBX*9ANt|u;E9c&cU&YK zXpaky1XD8k!)(S@+?L_`PQW#)@Rk{Pp~K8N7^Oyp*{Hw4Y0rJcZR!|YePYw3!eG(p zxZ9I8k?z6$Od4rFYx|{Qg~~RPUETp4;_N!H(ub5(h`6 zI$z@omZgp=>>1GYWM!dzoqFohvBYNN6pKeD@Q+ta4ATBI>=;v601*XBu)led^%sG!u#iPDQS zI*5tzK<8&_B&HAEomFH}6?&le=uaJy#4#e=elrf_!U!$J}><1+99skESGun=29ZkoZiL32pNguLFF;?{ywz>M=|v!nt52E-tZK zMiys{5J9$Wfe5Ii#ae=NXP#rF!r3XFaKef)p= zQU1$a_}@-T9Rfw5xm}^%hhiAY*(#&GYsb|96kC4Gb>ge0nM;s%1<1!Dt+kg?X=6?# zvm5$DB<5T$+l0?7XeoaD@<+6iRKJXFP3D&GU?TR-Nqxspl08=`W?q@<_S@c80WX@Bx1;_!F2fhq{5gA7f_)b 
z-*S1=(8~HQjKSco0xhEfKEZtF>(We}@=B_ndr3{dwcP6>&v>{poyQ{9t$C;-1Vt)c zJz9AU6W07{JnX%0oxUH6J8qOhdV`BMHz^O^BPfjlkE52V0mNNWou&a-^sfF-T5tdhLD`{3zWpBj0J_BCrz$q(x~x*tH8VXxsqB^ z)mU%5h&st5)WMTUSIOdo0a0HM0+NB}KzURjp0PFu_3?m6x9Up8YU=baMF&{C`&~Z0 ztm+v^9^ zHP2HR#h7Sdd%2aHmt-cQVig$Q##EA2K*MMDL@Xr^Dkb)OZ$pMbm;Zcd{sg+Y z?YgnEF4bv)B8gdOV7&r&Jx7*+c@79w)X7TcFo+&hFtBIF_0Wkk3S$KlCx^MwHTRmP%Poc5Yztmxl=@E z^=E?@i^>^A$!;|nRXlU@pYQ`RDlIYl4G=m{IZ&}fg#oOpUHl8nEc2%Wsvn9DLa>fs z;@uzkPDzu7CWkt!r>QPhing}tm-~2LX$&;4T{p`7`Si)d^Qua1Z`oM#IGCgp$9wx= zzqP~T0&XaXEPhhYu55`f2)cINj!!W62>Jf}UgUcgdI2KAkeH)`JZGyo7ZfX0il-b( zU)wNFC&s~LrZ`cdOdH|gXHr$bUPg3=^E|?mLHoo6Bkwv0xWLrvZ^(|n&$C@`J1hoM zv2xB+pr+V+2Hp&=uj}UM(Q0>&1?C$=UUBjJ(DBg8_)FIp7w_NGk+gILMlGd+P+E?n zb(1mHrh2Q(NpoMK2Y*4&->Pi{j~NB?c}oyye{?xMqx1USD4&b1b?=`)WWA923qlxF zE)Y_+kq@xP`Kqc*-yi9{wU@8@=S%PD@8`~ZvZ6C4pUDAWkB`S?;<~G|&}&*rn$^=W z+Be#M9u=H2Z5A5@g}0Thd2kg^w%GVjs-zF>$a({KU{7KO!cEl)i| zFEqmWq)WrV6Kgo$l;qy;54B)fZM-OVa-noXv0K-kPPXo z{4;s$e^H1)qya!m$b*~&Ti5}~h`B$neiO0-6h%NT*GT|ydeHq!c$FTue?1ro+tvL_ zR8#;&(p+&>sanwP_1lpOn%JGD8AQ%7z4U>wH#G3_8~*;OYK;N z*F&&rB`rydM$>sl&yr*bw&|##Qn`+*F4Z7q_wi2G3N?NX$+3w&<`I*{_sr_jlwB;@N78`aC-1#}E&-FYdc^n@9)n@?M( z&$=;yTM?_k20=M6*Lk58-Wx!9CQ^*EB+Kk?E+US5Rt>CL!W~5(teVcDdS7vQSmYlz zCq8-k5wH2$xl562273PV)BvTxQ(Tn)CNm_%)>;z~LNdMyKN){2wCcFngUhi!5~K6C zGX3_#l-tEhit_?h?dY9ka4}j^_EODMsgbFC7*#gdV;t|<|3kfOci$OTXJ|>Ruqt-T ze5^4(9WNp#13E~}akuW)`vueBu72}l?^nl~vz|sqL9XYQ_J-HH`u!R_ai=Sa`=)BX zjrve74=Ew&Avb1>(KZ_UcrW%#x2r*}TK<02=SXG#B7;YZtbTs@{S^U*UV`*(rStWM z2KdHZk?C$-y*DRvB;~piyuvoJ?pyq*_D*A=6NPWFeV6kGr(@f%;G>Pglzj2^fs#Gt z_eGtn{pVFDGAdrcu?I>*Jady|9uz?}mAmJG%iXH!s%AUU#wKS!QLTn6sh`#kj>U!s zg)LpCQ_`ys#gIm5rxj024WY$|-`{vXe>Aks!K4uHK--TY+pdLAcn4T4@Z$sE$Pa&r z0j&tv525NRzyA@@70S3i7vok8F&)eFOeL1U?vozanVIi-t2 ze<#&D)6Bkoz*$27Gjp5+Bu$yly7l|5^3 zjKIt2pN#S6>%KYAL|IwaqLdun<>~fqj*zoisVI0&`pl`;PwBMO3 zN?nXX)CdWtfob`;QOI^)ja~9?|I#WG3AH=N2#=MgOH#w)(sOEx%R{lHuxlDG2u=(y ztSA>C^Tdg^L-OAq0$;_w@LO(bn9DUAE(P|?3Vcrv=MBiTW1X4;j+3PYan3POYVt*J zyH|NRWjW=DxlUBO=(+R0+G^mW{&aC(b-;I%+c=g(T-0U9RFDasmop>sNCt71c=lBx z0&j57(Mwo&53u8}=A!Kr)01moY4TM9ccKvnI}UkjpZbuUq+Sj?zAyz&DA#yD^}UG> z`xZY3Uzj0UBC(xyfL`M@XLF)em`mrSey%Y;=fgIoe!uh2Bk<|Rw0}WyeQ)YIH2wDd zJty^~xN0*|4}Hd4O%S~MoHFc8M#rb{$&9YX;FVVo6!DqkP9jfQ+0Tvhyh45mu{8*v zju{j>$DtEZlB4;Ox|k+(jDJ1PUOt?Ok@XGENrp7b?`L4!L6O8gDKT~&`0fMa1U>LX zc@HI>Yn``eKAahCG*5<^t9yLv6JqX>)64YBg`ZZQ7|&o(59vAK)|vz^yw1zp=tt#M z=H~N$pD$43O~fW%!sEHUr|9+<)~zoN#9AGGmd8%#Mo8e|gt!IL-c-%@4mvaQ2H$@c zIXs7<`F_~P_|2Y!H=01n0=)|0Nq@gmdr;RrL__nP_SA~x+|DFO>M@y%h;5=|>79@F z8_JnF)X_hH49;=fc#g-mCDy28Z8O2>i zi06Iiy?a>gTNxD8!%danp{mMZ_afN;f?mpw(#tSj7fYiT02k~1*70&kugOF$=iW1z zk%vnbfvHTuUJn^)H&FOwD8Ie?i7E(PwsEw+4APL8V}%RA?_|`W1-zdAJ2@P<6kfB; z(V73nAZ91CNTu5PgBlbt_64hk(@fy>xucr&C4i&S8wvWUU9j~+_N_*c^xH>$1cf%K zZ`V*NaN!=5FGvU3HYRnDefJU7N-|{jGcS8!mb$qbxtb(h5kavk7)HQ~a&q7!!|fD0 zaRb>ZmT`7cg-%@8=11c->82m?KtSg}6Q7Q=9(CgEE2v&cOB&>N9kK(>LR(T?cvo=& z+}9S-&$@FMS`f9--iCyl)!13|`&t~;pwY9jo7OgyZwAg(=La96ooW~FEny{BHw)WeuJZl`3r)0 z1Hpmu{XnIAoUzm~dw-DnlU@E^FtbsiS7^%v4DY@&Pg30{_}FpqPACTaW)!+y&TKro zO9FfjEIVZ;h?K3*`$6d@J0&}+(v?3|_gpdilRL9Jq~xgJ4jTDs5lVGZ=T%QGDCZNM z`0pkS^|SEAyIysf_Np3P_bvLvfR`@njs+BY@UvYa^9CWK8iCrW=w34j?j$6wOGugV zi?z_c#)mOiWqBKZHsBlRP!_yzk)3Ig_Z3o?oB6RrQKFs!lcB7i8zow|L%Wm zHm=f}{^b`v`Zsp@kCjIZ;}`|9ZV^@)@7yZ)pXeuOF*FZNqI+!j+@f+*C zkg(Rwk7Ey+%|D`X7eFwkk0&7X2}PZ<5KDjL4CR_9 zSuQA_ms6JDV=$u1Tyin0L{%JNnoUnZwSo0$NOnoVIsX<%WlO(Msj7DyL8m%)8#t^WAeb>S!^bWo&4Pg6x|AzMr! 
zG+po7H!+y?OQryfQo1d5$s>0GdDDHvtDazKq z=vDX5lNQXE-fjz$pV*5Ah;m&mhb&Oxp2(1?)j@Ea@N=o{__rX*$ES*=U{ zSbZeSE+&-9n-Z4>fp-(cN$7Y}2^$mp#6V4G4g<=KI=MlAt*oU>S}(kkMi@fpUjI)GdM6=C^`-SbfqlnHTI( zldaz+Y>%3V#uN(oOOq+tAZ3r|Zp0}sIOO12}Kk_;mv@$aj z%=e#*{RFItjX#&pVDnXMm#23siz^&<_`@ZlRt}vZ^EDjqx*NXoiDkNXB4@v=d!@IE z##uXl&_)#uBD6MCiw{fwNL@Q#lS5&oliGs?`_|7lI)xpS_ygFUgNEni6F98~<~lfJ z{O1&Andj=JPi8nhwmMv*Q+V5$$4vnL-4;52U`1YamWj{gZut9Yx(;hfzj4j+1bBny zWf@2z5R$Y!plqeG?18FLQM!FYw?{{EcpYIDCiMNQo!Y)OoP??Pci=@eDng|A8b_8B zu9PfXohrv?5mnEhh$96uFV^7Z=2ik1iqRmMgSDGwe?jl>wND$>^ADXl zHxK_7UyL`&ByaeUyY6r@6<4jh(IcghM6#-P$!-EpYU6lwW@gE}$PwU=eeCz^!}ria zzs>e!TtWwwRo`(JB<<&(g?ttB7!%xkF~ured?_yW!ukH&+2&c!)dP@$*ZyvP$dK?} zdkTT7*3vi?^yM`Sb|(4mr%A(=*kOxx<>k>2O-4UqRh0u&6zy5QZsFHF#IxVWUmy1L zJ1GVnlwq3ooUz5tu@7^4W{2PS^4_(q4%Mo?5$ZdWUhAQ_zN@!gKDY5KFEA@L8udyT z>M6WhDbQ1In&}n<@PB@h%%RG|n#Gp!x{5~(@@!^@(Q`_h2PF+2HM4`uoy|MYHIia% zO*Q89S#<>8-i|l#p_t#o4lld7htwT^#?aENU)ZIvS8lb+$eRsyQaoW?J98aVC4wq} zsX;|tVe_Ql(NltA*OFP~`sj>WA50{5?r?p?^w4H*ZEOG3+Wrfgz$X1Hv^eLL--rsz zN%G=}`4E`|VHJ3!0|iD$7Ji*U6(-Y=w?0g*xL zk|tHEZ{*q6zi-WfIz5Jj>y)bpRD;N})Rn2w)ztPpiao`-lJ|sL5W_EM9;6tVJADd@ zRZAsQJ@&IXEL@6@L(y!)%df`^npIo!@F|4dU;zYD^&aC!Fos_sa~C8H)4;Lr!m58s z$c4h`xJSooh~Cysn(KI!Z24c1II&#I;wc_uV)^tg9=?HU6kTdGdXdC+R`eyrCx{>O zTTgR}2QUW0L!=H^qc@+q?*+G7`lBB@ABYXUPQRP|+?<)lq%exCV5zho3kR1A*k#97-opwl&o(-EwNJ#3bKFY7CmmO~Q(}!(@)w;Y z^KzmQ;pUvT=MoObDK76WpH9%zS49pgp}8ldDI1D{t%+o!)r}5`vQmlVO6gb{Q0-zi z3n4ErJ{&!l+t|+NUbb~29_{&P*|pE;=WY$#tErsjAX?Xf43d15i5ksv)xMwZJLFf7 zDJVdwc|H)Fww0tU_M6aFCM+s0e5<~Ef3cZECUlJYw@YVK$45AECi#7Hv-_kJHx}`F zX#*~0g_Gk1+Qb>{%flkdjFpR8NOU?Gi*ZR!%YhhAI^~=Wa>hE-$D34F#>Y>e*;1|- zJ4rd~gbb8Q2MOb&kh1%0e3XJ0V0jBXy-=soe%e$%)#a9OiAkj7Cp}7aq3h9g;y${o zS&scBJ9+N+3HM-`bp;NuX`VWGM9|Q%wnQ8nXY){DW9QQLaOCY(d= z){EE&I|D4nw&||8oK*+cZltL4*}aw2mH)-oSw=L)ds0Tw%FiPK0;UXfjVN^MnP+$CXt9ve>|wuZa_tb@d8g!*&fJ_Y|-wv za9$j#R#;|f*$Hx;DU(CniQ0)!evhf+g5Lsx(-OlxU-$;No9Og=dPA31PcZ^gq3QYAh!@Q0{Lp4F&dMyE*DahSD=~OjO0f={iyswjZ;jwUlyI=k4i` zhqi~nEvQeeqKHRc)gB2icZNUv*yqK8mPK4D8w2EC^xQxFUoJFYKLD0nm>i7%sg3}2 zXliU;XtS1_ULOc#2d@MU$>fXwo9UQ1n219EM>7P2Xj7o<8Jf>p<@#;U#NNi)`}!Aj*#WA(j8rdr>RXwf&LoU&X0plRK6^<}TMt3_U6HOaw z=*MX_UeRf^G<_#Wngf;7m2T7_W7veg;}y13m}K5!ofvr5;5B_`TY!=`_oTekRV6yr z4=?7LHOlJcw(PuV6k%=V9N&I|yx{U$zgNrtV)Sy+ZpkhTr@t!2e}1LL2bkHtEAb*W z%U!t6#Q1-8Wp|ciSaRnw?{bxR)e9o=a_zRu!Wm47x2xuxRsp=of)xqWm z9PuYHpjACt`|A1ROZ$QDitH#Q@jZ(p`EI4Psi+m`b8Zzd-C;j!Q8}CbqK2&L@T~Gs zX9I#h&SzYK8e!8FC8W*qV|bj1{@mSO$DQU=9tw4J((f7$Q?dC=ZZ!P`!KGAu##F4M z*!hmm#OG9dZj&g~N|!b6W-*)N@$NVKVm;wsN+=n~hsiH4xcku0iL?BC+OVud8dj;4 zc}(wcns-%VFt`vNg(#;EK#rWDF+|*~aihM^(T(i#{hu@A0mC8v@miMes?L~yD>1S@ zq{mhDNmLtPZ@nmL@y*;ZIFQP>$sHApdF! 
z?Osuc@lTgG0S+UzySR*r@(Onm@1>l4TSG&^uKbDNT@ohYDiaP`U4vsDuK!TFf>HO1 zuOyT&+{e9C^^7hYJ&_d@P_mEs?-g+Mrj(nt@6@vTscBp}zz)Ero@T4f>tvw(vN5 z64ch99XdL*?3kY8&z3ztA%cxIIZrM`3bBxL*r|7cFNuPc#q7sZnP9=H??XB+fvwg* z6mY2t$9px04>(K97O_CBXFYFqS}y=k++@LRzyy#Z8` zacT;yrMl7xLchegiAZ%TYzR>d6Y`(W-ueLCSCCH)Lr-(T8O*=-=JBZfM$%wHXtAU~*UQfvaAfflHnQw_3yR!4Is(cN_B<p*vG2u?Gvusm3l_ z0}9KuyADh)b-jDPG$zYU?myq2vFxd6?&Q?v%Kk&Sn3Wix|2X^MA4*}$lWoJCX6MvH zpwjJpw##Ww=tU9B2E6;CL;+1;5%88&((HWlnRKeiZW2f6PwmRbQjN2=J%u8c z>(z_WeS^25KNW9fXtIkLfgQ7<1Wsj>P~?bJGO#%`9mRiX*FJ#?z`HJ{b^6_X*mU*( z+&3UdE+ht_<)LlcX=_@4Jr2RBt{MsezK0LEBD-nqz;fWI@H`KxD{t`hr_#E^)s<>` z10Nv;P-qdK~RNGaO1yEj=mx)l(~!jIR4|m>kmAT=z-etgwO$BiuGRP zuGZiJ_H_HOFYq=jTneb90hh9}%KyW&vMXxH@ou*|E{Ataw{-uSIj)%ToRYff3Vv=9 zZ*@@~7yl21SdodPq)aMiJOA&)L10n}Ygmdx>PBf!cWYYvT;SrTY9Gqx z?Tgr5mS?B`ggd99?q+7m%52XRx$h4pjL9m(63RELHaD9K?PV*dZS#K37XAZ3}=1oSLQ z+FUR(5ESnXSHlD+_}eN8ACC$6YPZYq=N>}~Qn8$%FueBrvzb(|c@-8~fthC|tNpl* zR)ZX49fP{?EIC?|e_emaZd>|sO&gv3vBhm7w8j-Q6sGk$&D6u2U&ii4jyNIiB@#VC zj?(ZipH_Q}-hg5wq24zM3+*Z)s@UTSUJ$+Zp?@qCM~wxz+4A2=Y}B3>?oWON8UY-M zSF9g>tklox#iSmN)NP!w8>AIRDixo#JdNRB%L$qcV&O=Dzv-?O!V!jFL1}#w88kA5 zpEfgGZk0vwM-#%E{yq;p{z{@llTtL84;e`^#aZhEATrBxSiC&Xl0#xOhIOZ*_5D9T zttqI^pL;09nx;@M<7Un%O4%Wc;4d&7W{84``G{^c0Y+*)d{sq7H#rm+Ae&&L*K$8& zt$gRD;Xj6ayS=i&Kwjf8E#$a__y_ z1|Jt4{s99k+Hs{RV>uD=!zau1`s?x>#=tit$C%o(ttu|q?RmRD6_}fsa0zFi$phQp zsvl%hgN};+kklP87@CBWaasp&p3cSlWebdc?EOO#gG*9H!=Az8{D$^jL6sQA5-Dka ze-#5Of1;0%glUNI?$eA5=D;B5+A*36Wol0N(%K1VfqCl z>&Wb(o7@WS++Qc+H4rT89GD{Ae2tNzh}>K!MUa^Nf^ zLBj4h?#>{hn8Sr{@FWE{k_7>DlGMDUN8fM(((eKAR|eFMdJ$A@jvirwt@q@g`t=5o zySZXu&n-10;_}yDaTAjF2~qVi5AD%0{#>YMx{B%|O2xs|DG+@9MUH`asJ(6aOSMV99bf za2I^71`$sGCmG^y=IfaYFZSswAei^Z{I7 zb7bss9LpvPz(>)N?`&z;#ND+v4W_Ry$v4~aSaZsH{jK7LBlPM0&v!w8NRGzme})M^ zkJ6~a{w(DK^5dWC-YI3hW?am<*&L(cRD=meGQ=3(`|#!aP%gE6Ci{teMNDUvxW+mq zNEH(4;XD-}kVQ+F=pK?&tMOVysk%4wuv7ZPGQeYMNB5N*#!Cc&Qbv!f5X#+sl5+d8 z`F&E#?4$(q+wL_WoOPzw(z1Df&>ruOdUhIwSXjt;yc$w(ciFWo^x$=r^!^Z*sK!G{ z?dGwIm-2_~G30KZsU^p4SM9upA&p$dvKButTL+ayHSfjA`}s zWqyVFW3R)e{q=(S^V3rO(+p~w@8br?9X`_ft%hRKRRj^@Z0Dn<87D(mHPw7Kbc3>P z@_r*QGIL#5oiQ%1GosP=!POItXFUq+L&(|r4C}ZgpO5FYX`@N2h`V_=t=I;PSE@^L zEXcpbwT3uNL~mKv--&4`r*5a5t^@jiP)8UgE}sWg?j`gNm3MN=OX-U6i@yC z4g;?ebQmQgO@xGo&LLkMQW($z(1F+^^HIc7tbCl6^rc^}72)Hz5Kf1eKR)1!IRta> zLT|(d+3=8*UTIdWvVmw$KvsIo;il1+CQN zwFpdg+9jE1`@-Mc=#BKR?tD4flh@dN?}3iHl?M&@Yzk0q`;l`7%YnX_)vDBbrpR87~OKpuT1af zR5URw|AYpCg#?AJDx^KA`8B=WZl*B;5CWQ2DnF8HzAUe5EdO~VgEr_2I*PaU*VCax zPth6thtj!i`H`D8YC@qi*Rvu_EA0C#VH{4>Tz~hMLH4-Fl(u20DY_5DxtxZUrnyAP z(H4f~T+QE~b+m%uH2k4=Z`Vhah@2l(^*z&6VX;t8#$55xD`M8DEbq${_e1bbNc+$Y zpicfApv%-y)s_8=c!91dB$CK#jxlSS(yny~&8w2kRmCwE)!E(-0&`*0CMlcAi$F>` zj#L?N@N2~x*`-c2G&vQXYW+?WhtMHEBJ7cP*NPJ+sF98KQhk*vN*H(Ar_R5Mts}55 zVD=0`wwi8H8@ zK3N;}J5Bnj7zn5e*7dhC8_cG$9Vp8QP|cc)I$a{TTF5FE;w%j-oaEgO)FI!s2ctQZ zTX1rd`An%J8O3wm)Hu{2N^45^YnXz3(wM^e&QUX*htkMND86#*Dsl2_^=Em&Oc_Zc zh6OCu1)jxIbaER4?$cP)IYsH2+-FcXJqv6V6~GV;AYyYd{_`z`{#Q8wVt^CmfL%Qe zq6v87bN}D_Fvx%1y>v}fMStxX&FRs1u0&C82UOVeQ(V6Y?v~p zLHB&@urqd<(|-&bb2?D2Xo`+y7BRTs`lTg4QxNDE(JPwz1EVT}TTVNi z`G1Q1dX**EK!0&o4+SV=vl29Sl#`$*Uj95&)Yw`YH4)?|*Ja>0`%c@5hzVR>f+u$_+D1i5(q zADjY0X9tF}U3o8w$(Q~L4#?iWVm1t{3e)m8PfI(5Vl&iXg5IF;ta+DHm!%c;BVj6* zx1AoLfH!TGO?oCTM}h1X>#EATGRxovQDDm0>tVE37|$o14qUQ?nr9L?%^4r#C*D0A ztXMZy@wVUUqOL6#t!)Sv$j%BG2pRT`Od%I~x~seAFBcXP!d%WXt3;|(0GY;aK_h!BA+yY~T!Al{OL8sGF(MA1dHiSo`CV7=V$iFz*@|IXo@!6%EVi#< zj^?lLIwM&If76H4_;Zb)k8HhdVf0a~WoiwBB@9waAqn=f3RWID*7xhtz{2!s&Z7w6 znOrMeAvhVhPcHD^c_&_xX`2LG9rqMH`s0v|qe12fQi|8ZebJGNSq@+FpWe)WjWMhg 
z;n;fGfBJJRQ4E}(k=;a0uqZZ|PZVvoi{jfOe}_Z{k5O!Fu#Aso=wnGFItb5~NsCUo z$@Anjf_wwM9uT>iuJ_mUouWqh(C(LA)wLhDy-&C4oF7r|GwIkD19!RMJy{pk1_txP zj^=mM{-K2WH~%Q1q!(L!m6R1(;b)*Soa}$0EjGj7+=Le@JfG*N7UR?2e@Oj0gkRlLQS@oS zn}Y}Zj^XWHtH|Pk39D_aLQ52*A6n4Xb?EYxc>YsH0+i7`j3_BQ+CAU7d$NDOP@97PP#rf!-{xCo&_cX2rj?ECL-ROjbDV>JESsr`B8(yNEx?o z0{e32#Sc&dF8!_TVJsKg=oXtv(#d-pn2iBS_VZhX>kN<89yVKt_Tq(8Sc>5gvsUQW zVYWo6qld%Lyr3p2ydEppKqN+?6j^Y-)q{O}8of{INwO?Yya4M{c^x4CjOpn4zz{C- zD2CPjVRhoMD1fQS?R81?S?D(VZ~1U)6WO`w=`-GCUGHi);~JUq z?D)j>=$l+$LSs&pF`D(M#o6kvB(}hzeJp#ER3(m)z^ZW4D?Y0KhAL9#n(vmA9xS6` zK+yiG>T52W*Yb0W#K+#d%g;+3YXy*{npZ-fsi|YIM(`$CJCN{>_Z*V3gx6N;C{;|$ zleRX*FK7nKHB)6jvn3WS_XCb${(YMpr>fh~ADMGyLD;dw#M76<>F_?}5yiPM$qycjeE2Y75LwML0p&Pz1q8ODM)3A|4;6)v`iAviA-GhP9^7^hz4tZ}P8 zfY=`9uUiUFk0F5@$Jpx?+jUyHxh3iG+5WI-N^Iv-UyYf*zcBV8%waI=4&9!%Yh_=i zTCKmViML={w-dt`4@DY#PLV28qcC(w5!YQu*%`> zabj(_ut7+bU9Yp1dn2Z2IzRqBEJQg)luq>T%q0AU;7E0xK0~gGtlv`PcTdQ6OmbR; zSxq5H=cgtJUc5JX6-PP5qOE@my2qw8LR8gLxf?DhO0hg=o4TELe1xO_7o=Rg{>>Ua{bk9|vtat^k0X2JCy{*GP_g^X@{3$vb_K67*R zJC+k1@I^bZar9q+7}w4U0ESL!aL7q@vg8WzT*=99T7&cmI#0uOO7hRew!H{V-El`2alTDK~MOlrGDk#I6@76|rmw3c@dg8Y>3O#Lkzta88 zX!%Z}h$Ch36FI8L>({F-f)bGDZl(<`&a9re%aEBjn@!!%RH=eJ^Ha$geV;E;+nw4a zG*K1yYg?lzUBSo0zipwT=-Sesz6=8OhprX!YJU&EJ&{heupCur<>H=X&1g*u)PW__ z@2ymIk_c6!&T#(n(5^yhFCY4*wTstL{0w4!S@HS#GZ9@|t0kT39Ugn3PaiGvx_!^Z4I$_$u#VHJ?-L?KAOyUgnw?JsERWzVt>lv3t;com;xY|o5`dUZ*3 zV~K%(Q)JlJ-E@sw6Yw- zW}{}&-htp!6%-b*ZEmA)-oo>T`7-BG1*pqmg8&2pFmAwZq;Y&fRJC2-Dmg(9jF3MaR>F$vOunH&fbSLo))6x!V z^jZ*>7)Vk)y=KS_;3X}Ir+0ej0eC?Ql-(LkfQ59gScd&Bzw*x$)Q|MPYL7W*KreRw=&b7G%JndNN_Vv;%v{)@YYf=k<`!5f`Cf>mR ztFC;wKup#fs{qZ3{37fzQf{XZ1r#vN6i%f2)TQ6nDV=_dTfBTB%K>xUgVs}??_*>f4Y_9rtn}>$ty;w`@#C=6t zCY=V&^55#a^>P2;B*s}m$#&FF`vDZaRCakC696`r9<<&fvF7SdZ^QYSn%f5&lE5`Tkc$qVUNg z61^Z|q2R~V^OiC^6J5+R+CABjA1oc$PkI^34dr#$&j%)!34IN|7TFq}weYR;2L56u z-<$6a&REO9$-hvPf(n-EL|lD-S!<;0>>=_K6VqCzZ9Gk)Rtn<8duz2Lg#_$w?W!Lz zJCPd0D7Vn_s|_Cx{pKms#0TE72cf#4`l%duUH zLQ$Z07u56qh40r}xUIz{l|kMMu`;jkw+>5QRyPJE4_aS5yBidH-mkw>7dx87wsp+U zy+C!1ReSMER0@=x?3J%viRbn2|&+^hTH|glKhm zODMnU&t}>q8_l{>D&f;KZLQ}!iUjjN>lrYOU4<8l0$~Y5(2?@LuQrCNO)mI`?g6%t z6!9L^JU6;O?lL_vhM~jpM>r{c_p8MmXWs)(i9a)@=*90pxtEAGH;%g&WheSHx`11)Y&eeP-9=C4RXk%MU z4oWas)h}(Znr`LU#)Iono7nTM z;%W-7dljoDq1?CW4VT9uaN~QX%?#l+_G)@AeR5h?0S2t;WRFKSfO&Yx#>+gvCX0y2 zeLY;I{PRo0XCfZe<&*VH$&szyqXzhDR3X7dH!iLO!AJmj>6c?=}gH1!O4*c=6+J~F35(y#a zAo9UJdz)@M-P*R{=U*8T!5!>->_Dt1N02%QVbWo6OwR7E1b~|}>!ixnm}hTdOv)sP z-K^$Lgl&IGt^IC}Djntj+obMBWqtp*y7~hK)tX92Z+p6mAA6bekC?}KGGd(1Vo;-u zQue%pGDq$jheP2x9^mL)!dS9w0geOwV=`}{k2U~eYe#Lw-0|3di8=!cKl;oB~0m#FKx78F02BLd>;93$2Mx*DVfn_VLsB6`i zPc6yS#`PDkiYydo>6fTX1+S{wJ_WBFC01cS+OOBI`V9Ho{0b>YT?Z-%!otLVRY`w| zz$Ml(^#3LfTn>9ISvJ(siF?K>F!K&L*cHd2MBk`ER0SZukCqZ15$h7ooZRwhJ= zIkqyWE)=In&6WsS+@*_pp50c29H%ZDLw5V`(l04g5r2qt16jOC9#|a{ix=HNQ1`eL z8XJ}<=hvArkv!DoTN}!Sg&BurH0guk%=g2+87w3s;(J+j+4oD055;b(c^W~ZTK+hf zZF3y&pNzsTD`{mzA2&emBbYh6E*6}>yr(m@b99}~-hH|wJBiRNAay`*I){I@xKJL# za)Z*-5S`oOgTkp0lMY>ZD!7|ImsJxmmTNn;tCzNkj3ra_Y0$c_m!1;eH4DfkI+dCw z?C!}3`%LU;`$ZPEgK20CUAWxE?5*Hvx^KzIYCq0mW%9Cleik4(?BG<3`b=G*i=kIIUBdlHsEq-Y5&EMTt z1En{#8?6CzFH_w^0@plCm*O0set%&*xZ^cZpfdnypyju?QnfAhb|=}`CT>&{3dRro zW8OJxZ?cCz!`wXS8DN!I0*!d{aeiHT1ecJs;o%lYGvHL>Ew;$#GrJ+(3|yk7Yrl$L zN$tA)6a?bO+uSFe&Sz$A;ReHLKQ6L^0^1AWf<)v2NR>vdjOx_%CX)7%5)hxbgqy^FQB+xM6$ISOzE5870LS6se) zRwFLA2bWd;@oL+9;}hbx7{Q*t_eZx#cNOv{CW!wu*}F-m;v7r zU>FyyK_F{tcnhJ3Hpc8+nzRNCukw~jQYd+)-Y4fEFu!A~gA;}+X)Klu5IAY%nqqh+ z_{J6c$Vkk7J|k~kQMz@#D^1!z%bZBg1u z*=VTXWIo)t7fN+tZ2d0v$8unw0e%Q3xFd-=Ul3?x4X)S^a8 
z6e*bcV02K?rX#K4x-uaQ{e(_O<#$#+#(FCO^QLJVS-BG`!#&cw^OZ}S{>V49#+oha z(BsH4i&fp3dhJ+WWBR;j$Gl@S&7CL}$FSuIgEw^oG?yMn+D=3ZC_WX#q?A!taKbyF zT79h(D^=MElAiOSFdJ-^_Mwec1ysu=-F5_>b}Il#T-u4qis&OSnxm*gg(_wwaaYv` zS<4`}tmT)+Ds*aZs#5jEi8((%-^P4jw7bW%LyO8F(tOTrx1SF*+#1$dT!v-Tm7 zoV*a*B0zGf^CBKz;Y8#_ggL!7qp`xeDha5qA4?OO|9d~zdxd`vX&`NzPQgF>t6<5F z*$y}!kHz3mEs9mu1Ssg~$687DSnlOQ|LH?a7a*^R+Dl`BIXyELMcLR?OZC4l`8*DUIvtUVH^rH!+s$cy;9W zy%whV31VxvBCw34g)kC6YRe<-G+X6CjWLSrx(Ml+*}=EXrdAUSz!>v5%*qH(BTlg$;_prtme!Z!Kh)KJpF^;1GaEk=oRJCy5*VhBsGPbhtEkz18p zqH~1@xIT~_{SQL)+w%cL%q_wc3)5`Ucav7zsXRPjsAFOpe_%zX6|m7E36 zHxGC8MjW3oymA>-JUcVZ_J37vCe5Z^8Zjb9DX#Xf(w9T~y%?nn3W%PSJf0t@BDn5HIU)eF$O;|rAdr0(tW@#+#18mqzvI9(1s z4dEO;I+*lXPi(OoZYMW#OGMhrkZaQV;4weHLpSclTl9$|+kjP{|t$23zY1&^oU`8l5dRG zhB^CM*ktbR(6tkco}H)i_OUQL+$~J}3XCt*F~05XOh%)4^Sz94Dc}*D6j5U2bQW&d{xM(mVp(jA5l znzd812Ql!k@tv}cx6yEuRVyM`R@)ZJ7TfB1ZDyfhmbE#p1;C>}TBL+e{S`A4_?>Di z@DOr$wa5XhIKzvWM~u0Rm{=t64hP%b;VXMW87sE6gAQj`2A;3eO#%Pce=0jm{^Fau z|DH5jy1m-oupBC&{18ARA)Kv7#6ZX@zV&dSySyQjX$h4q_Bl3AJ%s+*RaB%xEllr& zT7@gzSGpY;6GA!aMK6Y4r4*_w!QE3GOYt;t-A7$NOddyuPc_rbu?phq{OgxBu9vHbJV!$JRSt^kYfb2etmbknaNdZ_AB|TdvnD3rQAzJ zwa|lY!Z|X*0n8!S_!M{X=C&Vk?cB*ceh*-kL{5@4Em{c2)6$cCP7rQh3XsO2{tUvUlt&VO-*_G*1^i`$_^=FX5#$?o83o0j3VZwbjzR_B*ZiJ3Uh*p!)J@Aurvc z0zo^7j$<*1n$vXvMk{NvC9o=bLr^IuS-J9jN{^U{=%c;^>y$betN!w9^<@SJ-O@O} zD*iemvMc+{AOIOQDSl#mP>xOBBU^YG4lEQ_YSs4cn@J_s5r0{|d4s2zUt*V28k|nt z>KA3Nr{>za{X=0tG!wj{jvs1us$hCN;#d5sFY$KQp(NF3Y4gj8>E)kYc@_!fk6WbH zswpJMCt|5w|1z3Lh1s%O{rtu-G_FDXAfroRx zaM#S;Dq-SGQk?JkbAdTxYG$un@lPkvyn!^VZSzj3A$)tJ@Nyx_)K$%nJn7*AvMJWP zms6HX{cTY8x5`7{Jb#=2Vo%Pe`;Ypjjg8l(qKIWe`i?f|OLoS592vH>1z42)E6|`{ zg@~bICvo@W*Z3cvh`_i681A|Kg8dPa!IESN2>m78X}1pZsziOp`mfse(Ylko(A-(J|*k z_0tyYm};kzfHuo2Qkv2xYkDDaYWW?}g=X!R@$FpjEysyL?JLPXh3hIl-$ynTwG@(I zQVHd%6jiM?->9vU2~g~h{EJ)t&-ortHPg%z7vl@T-^3cKAY$Oc(uQyo`4G80ROvzKX+N+n|2Xyc78Hk7u+ z%A8Oo)K0Z1*H6m()uM8h z@b@WO6W3RXYCYTd5|R6b<&54ABOg1%J!0_*sX8s5x`8XISdAvcW!b2UvcLRoEz*Q} zORjt?@<`{qi<{$Wsi4@QooIhDbzaq3_WEp7>SBZ9xE&|Wd+tP&I(s8LWEJtGWG%|} zKNS1}rFP)k>Q;)UvW@~7=kF>zTzgLTz$5N?Q`eZb`(v=ETwZU&5hB~qD&$^++6mCmD*m`VU(djHf5SWSl-9Z9G)|vx3>x>ZnqD9O{ z-fejL>00Xx{QN-lOPvzt44EZZi$QB=rm+gF^&0xg@NUjyLbL?NCZiT!UuDt);?ZW~ zI#J&U=`^l3aU*C-S*(?R2ls@&R4l)J*68NL!DlJfKaG8on>8|IZu|3UmoV)98c{z5CoyxdN?Vdd3?l+5#cmt(!dH1f=1mJYHdhn@4okHMb87qWblw5tMa$X^=6m#A8x|!nUGj!YBLZ zxa++a-=|Ug^nkb^AQzjttHW4ll;oki=3!ujCp~ixJx|vLFGy(vXgC&zDPWyQmT8d!j{v%~-jcIi~y!GNsX`!8yCZ#ruUZME;}9;)%jNR3UpyK$?PPlsY2G&Mv9M+-ha zH#sn+Iw!aNaI8VAeEP)n1xW2`xofp44>u%Zt&>f-u~~`Wrmu(@{_4It_FFWaXbi%2 zo)kkv{J_6C^0pGa*T7;nD-*AOYli0>L`0yr(DLD zH+?*Nw7-d~-{Zr$juzR)ua^y3uSBwl3cp3((WJm8h#F0&c0+2iKk8E|q9;ag^6jqp z;cazpB{rH`U47P?JIQ3zpzHM|ksq^)n-)f;#53@t81 z$@n^*!JQ|4X8_xE$DGry>ND%_^+zdR?+i#iQq|Rij!J&V)%?-0+b&4hWyPYR!N06! 
ztc&hab5_oA4yF$6t3^7-8YFa-S?f>ZY!8ywQG=$KOW7yg`+@XV=!YCJUpLE_2 ztayfBHH^RH%KosmRgAt@)8=T2e;@o4CjP0pE>6paTStMi<5n+eTCc>-iS{nfJZNp& zg;^Ahz@HWFwv3bMK~ww|i41?9b~cXRh@{L(G>W;)buF%a@usiWn`h$~`pbL2`0ccw z7ebCc#)sF75gDgd`3r6=!$@3a9uU%E^zHM~<1ev&PLa&BXN)>($gd814D)Uey|qfd zLOMzT6Q9JQ@DD+`;^pGSXzy0*_sK%QX$rsY_Jg>!j!+_A;GeyGn&(Ts^>6-2*s_In z2}eeDG$_fdkyk-*wSQvhp@^)M z1++*F6Ph_N2Rer(INq)_JkS@?#Zgh<64I2nD0{D2zjxd_ou&MNwT=j$z<+Wf#|m2nqJOq-hXx-h9DbrHLh0Z zvHJKP;A2RMFuMMsxME29R$#;!{rt-Isw-loPiO_q?eBK&uXFq(QN)$>CK}Rf6D<~E z++6;vwnSF`CgNJK{2z*4NQ1uk1UBrRohCgW{11hT0Qi{4HOTmPja&}aY&l7NM2cGK z0*f@f$_8h_KLMgWS0~rsNbnqeq27yL?x85s`QQ(`fi|Wnl|E->?;q~K#A-Rpd%?Aw zV;uv&YWNyi1x9>?5n9WoVWvaA(7v!PvU^rmcI9b?{64*Rg(_$?dHHMp-WB|wk*2rS zfFt14j`w6s!aJ!A{Q-zIHGX|kd%J9L@CaRLbKB?I*t)T=o!_{;Zy$Jf4^aRhc6u)6 z37Srbsw(&O2C~!b2FRc#gVl-lF#lyG6*)nr-~<%Q+IDm%KYtHVyK1J&)Tq#wvBg(F zQt<5o5SuueKwFQ{_A(FEj@3x$bm(w4EnFvZCgC=gO;LSOtm59-kIP3mJ97VpDV7O5L8V>CkaJ0pE%- z3i>@h#g6s9N^R(J;y|BNMJS$r=+kc(QN1iryz`VZ6DIglbQs_d1Yx>^e6Ok-NlH`( zSZJEvj`TdywzaK+KT(iKXkH}qp*E&~lZ>PN*PgU-8225;2f%qqm7_$j!IN-Bn+cW9 z70NbxxnZ{f<$?l~zZZ1(1Dt;}sqS>#gTQ`6qedGbA3e>bD}=n_1eDxVouBco(ylJE z`ZDY#?*z;LVeGA=+IXXPUkU|UDA1Nbf#M!06nCdML4sRxcPSE_;>Dfd5+GQS;HA(~ z+}$a~o#IaKe1GSjv(7(vt-BVJkjX$M;pLtE?)~h~Lvsqi8$x2&LcME$fVjKk3d3N%JBjL9$j%igv06%8RTC<|{qa*Zf0-)De#$fD|!ErkL0 z1bF9-xnC=vMpLxW&sg*Ws{y{rD;$0tyC0t1j*J88v@y}>2VWfD+I?hR`-0imIB{5d zr0;YrsU*<`xMZfoITcp{8?)~J?4zM$KquP&L8oVg-7^o9t9PT3B8os6fUHCo%qjDW z-;v)@oX4tEs=5cT)ZDeH!%bX^(F810+`{@!WLY4nHrvy%zFMW|WXHEQo4ARNMJ7nk z@y0lH_m?+j$pgp}rD#FvjB?Y7Qh0>$dfu8ArhR?=RzZ@3M5n)pBL!n%}%k5UII=Xt9yOKZyTLaVOSJ6I5= zBwH+DMMP~6i7v}%R2tB)5yj3O)KrRIlJ1$_d)R0?W+MF0NRu=)ogGm7E?RGfVC+$D zwWISzsNMlmYL6koW$VV7(Q6i2EiG)OT-G;A66vaRfD;JW zOx&6v{mzW z-?8qG+^uS9Cl-e;BpjE5)uBe{FT8O0nw}<}!Z+_BFVK%hrgeL9isZ2kO?6N55Fhv% zeaRh2+op(_>E5$8v9Fhod4821XVYHUlRQm&A+Hx0xt+Id7zq0sipn84)AyAMCx>4l3f|n$VsMmAL3O`9=VSN$I5#kCV+RAxH68f%F||7`Eufmr30(%ggocep}>6@ z-wIpfn{R=GtO&q5;8t)_;gt4i0h(4cxP$*c~ViV42S!o2& z0uXK>%Nsp^?Qiqe*wz^3*z}u3{EzM!#l%mv{uP0H*0R59udcHWuDj$M?jr6!#7ty+ zJmdGJcxck|FXsJWn^5Ro{|ifkUm#V@{3|v5_^_AT{1HaA!5VR+(Q`H4J6@D7t^oiv zsGE56q-*}ZNn0T^%MargllyM3xW}BvG*DkbDaWhYnSJe26HCag@H0+mW_iL%lcBLG zADJ|@^6FthAn#?tGG4Ql$_t~ z^@Fx)-=HBM*^Yl@|0JpmiPyQb>qSlrb>k@^dQ)C_xyBQw3O1 zso1c|q;w)RwYpKku)f|M7On6n<0KCdR!|{SU4v!1`va9HFO`0RwTOv7CMzgW%gp?q z=@glUecn4Y9Yo++`jld)R-0B=es56wkPzQxz64WUqf;V4lo`yXq@&Qvz%v-OT$*~J zp#gc!*La(ye2XDj#C0J6>H_+66SHx01!@E6|Dl=SSZ)*@oOGB5u~=q3FoJql47Wmr z(*IXmjSWAjIe%Op1DL=YK+lxbxP-?giqSW2$KlFkd~kLSddk9N++9fx1r%WSaeAxX z0Oo7p3Bh9xT8p$5L7nfa4RZZz%K&QsmMiwQIFCEdqka=^@;ZE%%)HF6*30hqlgUzl z9TmCuSLs$A8*!=XZ{~u*Zrw)dAbX<73sL{wTu96D?KD-r(W-^|hEZ@#Ktpx~(>EN2 z^iYn#WR=r`iwC=;#ijMc{woCE70K@Hv~1xn+En97S51;HYzA)_rLvu_w$8*dlX)!RFZ!_FMa`g|35&EY`V}y7$I2=@tPURhqh6KW=P4xWlyEt2_1Q zslM(jZ1yfaHW*uUHL&;lgOTh%$`+f|z7A?ye8CVvRvZ1naNSpD-SCOMdQ;l|yjNZS zAbd@CVl%)CzsOo9BHl0XrPZ<9|9#|%i1vtHUM&PDAgYoeVZCKoT(US|H zq`UvJ)`s3r>(jnIT>1I-X7gJ4mohJNa;-&K zJmbaRa%k8@4RsE2Eq>6=$aL;dY?=TJG&_JOF!o! 
zu^5HD>qudTY|8B4PVs{&hw$Bxl{PvMa`ejB!_qnim_ze~A^Dn8gfm}3B>VD&yDA)Y z)bhbNtSTVs*Js0;y3OQMyM$)tGh~ea{S_pmxqn}j-Ix4N2A6F^ohg-4=;WekKp7Y{ z@4*kur+K1agS!GZX>#w8K>eYXSS}T@RPSHc<#k=7Owj|bkSg>oZyBiVU|hsZ>77vW zH8c{W9Sx>=tp~c33O`>%4al{971tk*Ca=Mrsd`2~B5pXfT=lwn@U8j9gAHdsFkK5( znxs(KX@_g{8uz55JOfq_bbkdDofW;heQeMRV=|A8RT=6j_)g)f`bc}d zR+w8#Ak}<7`xNq)Zny2Cd>_=!@w^up0m`$ydZ&lDv8)&G9DuQmSjby>up?jq-o8a; zdu7M=+(BC!7qHKEF@2^y@?-j-=|V%S&p$M_uz|MZOigHnhSpEQnY@I;##jYjPj*en zkD}7yl>j%(NS`R;_=*r8Hr6-V-&ZuOBpQE{fbu~{ zEP=fd7QI7#B6%>8?V+E$`l*pa3n zzZo?#6?zE*0o^QKbDkjfqjMkDW60@f6{hhFyl#hzvpP#v--eXun5}HimN=6-%ynwa zlw=hik1uM|;hv zCHaq%8xP|=Rsjro0ncBoIKVj=_=f3^j1XWzpYRW@6GHWuP|X>zuHbJ%RyrAdAqA2% zK%Msq?@K16`OTPx7z*dtgD@H7_W``xQVf-AdI4A&4CpOC_9K@3@wCnet~&IAb=Kq- zE^(7DnbFF$#*i55yX{5k2r-*gTKP@s)I^l%?ir14y^I8X>2uqqcS*XF8 z@L>8MPbD;E0!7G|U6sNh5;J2((^$E`np0|+X89g*aLL`;4h(HTq+M{4uMNHt9%WvC z#MH1JRDNpJ+b)aNTw@0&LgtXsutD@*p=^${KzZ#G7j_mU#t?U!K zq6l8o0z4cJ(N^n@e`wXM=Dgo}qsE%*Pg$|G;WnQ{5M8v9Uo6a%Ka4DFPN+l2LW3yG%`v8@AW1=W#7T!wHReMGV!PvSiwJ@0g zBkk2*Jz8O91E8*3G_b($*>(Y{74g|yZIr3c@1aalY^b`z$Hb_3_rx;87grkp#hV-7 zU@w5?rXJ4%<+G_YqFJMmrdid7#-QSC*DL$flQOEm5g#p16$t?#5Dwi5iIRKY0q(ou zFekORhWRB(9{x&`^g9Xjoz6hT4W3o93#_AJf+_7{{<@`kLj!UjSBYP*X46K+b3ByX z$)e5-zdF5ztzTPE&9o9koT9$J%bYu%IVHTQvM6ScK0sUC3i5B^t@!&luv-mK_5lv% zL+boRw7J`eUZ%7r)rVQ8bE56>KJ$~!HGcXi(1YhJ>HRiFkiaO>_x&U;fnH<9Z4sQa#?ZA@F>Np0Z!bMB zZ)cU=0bpyHZ;r?Jr*5sqKGEjsO9nO;JF+kx!YUfB`Wn)wpL@8Ejb_R+f`ebQ6_b+Q z_|6iQmzlD>@LwN~8rGyNWw3DLrtYttFCy)`sA3KB>!G}^>~8^7KQQZpg7v4xECPw!D$-=LpkfvZ%{v8v-${o=@GOy*@7#(G6r(JxyooBzdVB zztE<{8XtlKLuK=1iuse8R#H2JMxRcGLui)Mzb##u1(f8{>zCwDmE*mk7_?_bJ}oRS zgy*PDEfp&>@KPsC*@PEAvo$REGc3Yc5-OI5P^_)pDLqBFRZ=sSHUZ`k*HAW8X|eSQ zkqBQ>MK}xR1)!O*Z1HIX{TNQXCCeN`8Td{@`Cc9f467qzDyA`1LQS$^YS~8mjWZjs z?#rjUv+MLK?PI%Eitgz5W=g4Ca^`s7eG!HW0F&T^`9re20);!|4Ew75NcTRl&ZhT{ zmkQqT8*!#{w+?Wi*cIY0Atb<|Qvr7W(EI{Ca{t#|bx=CST}cALB}8ufJzzH?ks8=> zJMQuH_g_M}`=?U@CXW}Cz{Xmz6nOs>KwNb~_2ldz1NvD`Cx?n=aur=@L4*Y6nj zA0KfBlhs+HtG5{%Tx~tTx zdPFt=fl5^>k%xdwwG$|x-u{ipG zFgNIjG~bbb`-j$`BClzJtA~A$Q>Xo?dUNf3w^!Nay%})~_?84~PbAU2jWlEle*5Ie zcCq+(Yzs+w?NZX`tzr54v~`h*ZQ00-bl|hFJ}XInZ|OWzkyR?ovW8i3RuxA`|N6n!>7|5lccvv@t#ye0T83%ge1Dx;<11CPWnAHa;rR5hIxm$m5fc}F*sfPYWZVL{6H4i zlE<1tFU-BdADF$wxdMM{1q`a?3Z|fjx|$hFJ{9H`(|g)@0=?vg?b6WnO!$NT12K>Cg8YplS)p0dVgK9sd z5&z5LoCs^ziytpfiK=y!=)Fy+id6|~{2Osrd9qy|+x>8+ZJyZ}47anf%o%=|yctge zysP)#bKH(u3hZkQ1*N+OdOUhKIUnT#A94tgWmv8`)>Hgu-bSXwmgam!HBzpq61cu1 zdrKDeb%{`~etBrbmGuRTY|Fb@kUnBym;(rn3-w+>_PQe>m{-HGo3T?9;TPqO7#LOK46)s+|0q?K|DUQ=n1J=yFFkKbaqx!!3Ly_X_RKPIY=fCqzcQ*%a>$>m5@A)AXnXyS5$Ea_3X^5tx>ZwPUl$RJg`@p;nQj!luQyTlG;Tpm3}d2YvnpJ5Lf3QYUI zcK_x8c2ntlNm{-CpQHcRE#uj~T!z05p3Ld&UBnrZ2t?4P?h`SSO@2EqQ^3>9qmA7@(z>R z^-vL$d=?j*ek&J`frcl!1_#2OCZax*Lhi!c!Es3m!ZFm~r_$#qE1qB+3SjQ$_kX*) zjF|!WfjiwAt$oR_e*-d0b}4cx+~IvgIb{Na0uEOb8|TFfcrwCr`li(9fP%WzEfhTs z=+LwYVjgBHh%K|wR z3<^BD1<$eb+nggh&jE|^P->vt(&Ybj(LnpZ@3mgVcoYSXX)H*A!!>rqS+>R|)x*j~ zrgGh31OxHB?Qc#?Jw)Y^EXKU;*qd_jl`l8y9%PA7E5w<>cp!@#D$X1CMx*_gWb(c& z#8|U)13e(+z!#_5AbH6881Y-@5@RkvGhAFkoW-xeE$!Cm!m`lMuPyME`i(mPb5#7{ z#>pU{0DXoy(l|vDa|xx2hY52U#wzhErnm@}n(snbdaolkwQ54oY(J!AN$mZMPrsWo z#6AYKh4F7*fGquOngP4E7JpkW0(}#JtpY-!L0>7!pvm?@KA4N8S|yz4ztgY~JQ655 zPL;#cA$3>J*wxjhHi#0A)}ZCM2`UBgzGYxnw-Yss`HJ?bXz<7I^HfD|_Z^!W8`svu ziX0U^^lWmIN6;brwEnV%2&_d2d2~HrM>$uc!@kLe@7JFM+8UB&Dkz1DyIR>s2<#cDbRO*S{ zGMYZ_&O?3Ybma`i^VRn93^i*R}888#ogPq-WiMS$!r`JtFGT?KS zfVKNZAQ7*$6A^8E=3~R8jJ+E}Oj@{OYc5BaQpdZhlLMlL!|t7g;pL~^(aV8k~FyUzWc-Y*Gsq77`{I@ zPIe7{_&raAy1AWWDq<@IkoL<~YL49PP1_o+R;IOlVNb89t#5VkkPF{(#cYZBIB$ZE zFyhZ;83~E1PTrOJ_pD-I1P9Mfw0@q_RB|tm?~H*+ihDGlaTyC> 
zh&BA$f0;B<%Oqy~wEFjcx94r5@q5$vjBiu1F~p4(|6Dlf8+DNg*I_fAlwu->b=SYX z#1;S=_@oc+eqekpk*l&B2+kQ@*YYemX?v^sk+q5qUtnqXn<$z;{%?1F%n01vi|R&O zAkQy;ocPGA*dxxW8jJQk+zE7|t*IlDR2c zY0g!o!EuR%NnRH+)*c?Ne7mlR7}A}2U^ z6}$5(Wd)k3Vu21=8<0i_jyHNTOXssgvPUgy=SzU*%(hTP|HOwkD@lgAQ6fZEgiTM$ zQ`~zkuZ3<^)6q3CSi^I$&LEA5PNkIRZHnY+bg^NRh73={@EmHkj;gJjR+X;#MNh>E zwEL=C7&o)-j%vwzC5h_#%DD;*j2eCUz$yNjvhO918})J}$f)=X0}MEA%eQ&^PT}C+ z$x7nHEeEAkv4R$~tZu^aw5abTK|ZRQR5>Jg)0pC}MT`|Gb=)rxz9@5Y?0nHU;%SRz z#R@ZK`-ip-AKz@cKhyRzjy`0q}!JPciJXQG8PW;8H%o;d!ET`NiVT z(Ym;yR#}^F7I{uXa+D@lHTNj*R4-nBtfb5cixzx*bWp_N&a4!VSSWhiTlcI*R0-?#iLpUs@lCpdjnYE=w|>A+_~8{nX6k*LYSt$C!~v@FMs8DEwrdQDSoj_S zFQDvW*GLkUia%wSKd-j3mpwDEF|>}euVdzL!SdC1sM#}!`++6wDpgp-b9nXoXvC zi|;fAsQzf!eKdB}k3DrbRrzrPz0uB~V!lTL+Kh;~01$2&Fs00ts~5NV|D5Wdt}^%k zF2(3OqdI4ONFJEuq2pB+`g9?1l~?0VQ6fN7Ij~JeKvDtKmzdVb4J=rI<)P37sy5+O zD}A@F(zf$a{59Uka2+}(+gqF+HRY5j^KgzJ2MVyVyx`(QZZ)15Y9sawH7g;djzGz? zs)4`2iDo&u$thZ0Pd+}i7nB9>n-Ybd@mJzkm z4YKTp!(^V~o=EJO%6Vsw8^R~e_=0;Z=jr^IeX$zOtrPO6D2c|QIB!E820R@>w;1N#OnwY9|`EAqQdz?x+hSfdL14Pk@obvN&(IMO6 zjizk2j#?X|0(pE6_O=E~0qFuutIBY0wy##GizaNQwg$I0u$6f(;||r9&eKv+hfbSl zoeEk`Fh7b;soMrk(h>jy1xx0xs&HBJrtM9tRCA+6VuwO@v4%#gNb7r6zO`E!2gcvs z_R-(Kf=oA{EG|gJcHyH3uFj~;{od7O*bf18@w=_??=7+m9|Et29Vrc**kmU2wVJWC z49(-K>_JVBtM}z$Rd=KkSx&z#s|Xq-=it^h%oUvHUX5T7T(K(gca-_7DIUJJCN z>Pp*kLH>40#g|P>WU-6RZNhS!vV(47JNM#wLo9tyc}SshZe}0QQI~0A#-^|r zw-v3h{T-h|Fp$4;b60Vr%@S&=J|L>J<;hI{ahj{gpJmST+;||D{FY##(N)!xwbtYr z-3FTPL|V)z(7DOrLOQi)6}jkC;+v^>2a8iwN1OU2sN@5Mt;p0rG}`)r;ZHi0h0c*v z*8Oz(NWZCKE+*4kZ73?O%D+oSI&K+k=f7P;mCAT|PL*maD^erhR!6QL56c>Imb_8l z;?KXYP{N$2WJXR$0b@{`9u}Zw7&rR{K-yx!(|s=Qk;R=@pzX)jx^O}Kpq%CXlw5;Z+ZwJXwb;{fiyca+jvirkb^@phK9~*^QJ{~qU zVUQVWvzDI(!n1|wER2&}ionW#2~P&0rqm20k)RA--v1;Yjc^V-edbMkGcv_2kXdPp zU?fry>6$1zu9O+gE9LNCNqbr*_i5Cidw+pcHqn}J}3$M*y2eUv37XXaGD zz#=>U(Apu)SGaYY4x)fWD}8G6{=Z!c+j0+ov}|xr2@iHeZ3oHTtAIuN;@=|qWC4Y* zOu8h&N>Ou3<~r~jN=Vv(iAbX6{iplgdz>4{R;MF^FI(_yDOc-;DM>fa;cBg#3phrqL8=AQ;(Ye(3nnpy9?Es+=&WkM%+Z*5(Dz>iSF;pv+%Mc|vWfl&HRX3q9Lawn&LU@Q``h2BiQ_ zE{xa+ow`!GErou0<-$snzU8kGdfGqq_`?Iv!G?!J;z|#3!M@Df74WY@C z5Xpk?HuH%@Rz`E&vOnMfKErwy|1rJl^F}bU;=7lJIP<8}%Fy*tR1ri?Uz)*Vq%U9Z^Wj zwQ-VjMn*`_P4@_MKkx89^VR*Jqb@jtVa+>!{mG21eJ!msZN-9gT}0dU^W;pU$Dmbn z+N2dfOOr9=*Q|2RcuHCm7bw1lg|h2*ItPpJAKG&syLK;*zQe@$ef68C9|m`O_x1Ia zA6_1A1lP_=oh zgmk_LV5w ztylG1Xjt^&-jnrvAYC)9c_mnpeaYQ157A&&S{H>lFw{P5QeQW zk4x8J=O~glUxw@hMq7MeRrZ6f@raiTt!JMtLv}gZ_;6j#s0hL*Llo?f&yz9UjNVnslqsE ztqd~>{y;Q6{Yw-h7|~Je%hs;vhYB;xGOfzyRc+S%@>;A+M3VV}+6LyD<|@h<&&p5AD00h_D!}DA#HZ=aqwS zeuvWzZ~+6+eOSzwHm&PhsHj32yoXV+pGwh}j@3jIP2iCwJb6Oif=N|nLR3I6Y?mjb zg!J)(eR5w4K^f^qb7~<&Z`dDv&iitT2%pRo2-eIhzgZw}D%kC|)6CTF!8oY&~~SV_9#3gg)uZRwxa=PNjFlR~Q?zN|h3OH{#?i}H&i_{fUbN4fj*S44UD=gE&ZRZ(nE$q~Djn%-p|7SA0<4b6swW8j>on->^J zUi?l)UrsMWN}&m96)VsN`H&*()v2$L_A<57&z@eDeWd8D_7-Is0Vlnv=8n$n=D{?@ z(%CX^hN-b7svHih023bpT>}*-6|4m{Uyy2HZ`0aB6{eA9&%ZvJ;7^<*VkJjMhtN3f zIq8Ls@FXw4-WW_PVTvt`9V|b8$6+U|B>&MfHfKg+9u1!p`t+3w{l|L_A}%QalPD)0 z^A}>zyn_PUeQ~rx1g|WEWeS0OYdK(Eal!A9w^HsBd-*bCI`G@nA1CY+GDuw@ESu=R zYn_ZIDYax1`4m_A`xUkutss1%DK2CcJ?fP%isTL`GWbopI|au6Lpz4&m=Bo5o&Tn( z|2a|rGLafPQsvx|tW5$973N=u@O56;{q)GXHWSVigXBA3d<9-{DyiE}-kk2g z_?_=%T0hen!k$^zr1pNVNJV>^x|Lbl(V6v7!B}jq3il}mY550KlMbdKz?oiA{DhC8vBsif-XMFyeX269MIZ9*fP5f0>IiJ13FU37mvL1M0d9 zS)YiYct;`cm&zm@`#$M=Ny&}Pz^(SjpP24k?-~3d%u)Ujg{%HF}{;m1p546tH?0tvcymhSBMqnCGZZLv~@fC0#Lf=61A6M#|yXn2z$EnWb-Ft<^L00d-kq`vZ)_r5aFd<%)Cxd9PEx!+L*j{u76}FzElsmH==dWICA(quHOxFxm{iLC(k&FOaw~X7V6|m~zT<~4 zTl5@;Uq6h)W2+T*yR{jUKdCn2Lp5kbA^bc8umZb}9~Gr)*tA@K=K>ig47c)s=Dh2(F(Fk91 
zahJt($YgvUwZ?7ZhQ=MPaXb5*9AC-D8t-!tEN-n`Y;T^usyMhGXZ>k(uNbIlM0*Wr z7SD0Of)+sFcYrGrW`X^Rtak>9?5u>acnw@iUozmpWI3 zo%_e3&LQCk0q)o&!mU2+t3|*niDKldnZcu8-4i2jZ|M4v^=sMAT7lbXjX%EUEpZZ& zi;BtN=R6OERskbAb1OYew_h)+b-G_3w>2e`bC}+c9l#fiIXFH>`_53dlo+Zs_6TG) zVq_B_EU|6h2(%H}*wxynvv3bd(>3YuFqnsH(NS>g-uk_kLuL*^F+F}T*h8Uu-**?| zRZxf)mkCD4)8iQa0nSOW99HzBA?4@2OhCAlhwwO+l{amod<>~ynLDCBFaKOi7(3$1 z^CNsM&%QXYxaXJuWNXvR60_L(bpm#kXaNpf`t@(K^0Urv@j+wB>*$=;ljz7sq3Ds# zIol7HWkV9%c$2@Y6cTJ+g^Pmuh^hI86GQM}(5jTs)vt=*(pKG}47Ifg6NF(iHEGM>z59Z#Bls!fGWxg(Yo2S{{e%qdIRAYl*7 z@^ZtJVz}o!VfSoku`h+u+bxYmZFcLlT_S@ga{2ozT}o2SZ12hDP!1og8;=@v!PvPC z1w>0H(wk`bLR5-hYc`SudGLxq5F<2!Ts{m#n+}!OW)`H*TCO?%bx5qi@mQV(aVd}6FTNaZ3MKGl0%ce$L&=Z5nq_pBj>srRrI_PQkt@`H6{KOv> z+O|Q%`$o1E&)adjz%-;gu$ijF_GYMQaw#@1q!4Q}2qSe^wRBn4p3q0(TiT|pv^p%r zAxWXSu=&)2X=qeczn=D^MEL|&O0Ek`+j8^QFpSYW3hNoku8#iW(WW~6mqC@{^<&L> zni%_jVk!Py7Tx3QY-?VXJ%0Ae$X~h5ptQfE+vsM2cUz&Up9|`p>5MRH&iCa9FkZr< zi|iRT;rD3=i!IdAiLwRsVG9`jw;DuILpHF<7j(51jL);E7zMWB>bBth;A^0Xn_AOm z6uU{^m?v+oLUy{Z=MUr#m$DK(k5w$$uW<%-7GFVj2jNI2oE@qVvk3J~xfSeb4_YHj zL%ObG0eA1&>-y`78#Ro<091=wUy|Gix5m3(Ad-0LO?QSElAieT%*AC20h&N>N6vQ( zzi>_y-Svz%nnLM5T>w{xo|lfx1R>wKy&w4&Jf+2FPF^rb1^#~3)Nc23ITi|k4OsNV z&Mwqt3l&s%J^n1+!UJqg(uoh!d0%jgwnZ9^rUm>tzeM(Xiqw|J!Wk;?A4ei649?1j zNVQVPxw5S)6XXQTPhI}h{#c#XL$|EZqk}K&#Sf2%`%X27dQs(AeprF@8~;fzovA7( zoSKw>TkAk_hvT_%{=mY)0me0NAjVA!r%VY~_ACLHv@^b;KXYG59u^CMf8H#10^4ES zPaZ0g#=~M|?Ibv5^X0kYP!apGwlUMvWM0>#_lM1VmZrv<%C69ErQ?b&Wn& z_j`s(k5lV0ngWJi)ar3ufkNqn+T`OuL~sJ0k!uI^<8KDm6FC(0u=w$D!m)g(GS*UP zOj;;^!?>_v+9c#SrW!HuY6A!KG?LYciWBv^*PKumQyCogb@bI3%G?H027{-XZQeyz zvII`y6{$f#m>T4(ak! z?@Ev$DinV@s4%Z3Y$ZFl;{~nS*f}pB%Q2?&OAsdcBUfEznM*-0^qt%%a%A{uvV_N@ z{Xs2iNBoX&0dnj>$eLr3BO6dW4gJqJ;tMs7l&ieKL;|#x;xj2wE#-Lvz)N0S6s|IT zOgVs#Lumd7K^4uM4`h%2KQ=J81TGMkY^8SU10DZhLku3Ep$yK>=`L$4i0CE@vI5Ru zCX@k$(x?IX?dTj8l)MdPdi(aQ49xve_!oCY;%ib`Q7s^JF1NLj`^*1up_YqsgK4XR z#H65q|Dn15_l(nZVDo>7Y{o8C-Ec0S%8zX4_?rA4fSA7`@NTbIcg-{pC~N?0ZY^?r zaEVI30Z>rv=6@|Kbuw1hFTj86Pq3}Xxf3WPA$hE-CTcfM4T4dP zq}!Z7*e=1w6Zf@~p0vpU5o=zRFx^Z{yt+ys)+%s7GRD}W$Kl`}IQE?#myz7m#2Dr} zuxBzzh-w76WOu2Ck(rVwvsF`lFU`|p@ac4R>eSZo-AMS_>B_u2tK;RcrPA6nE-7gm z%C=g=^?_5qZL=ZYWXO_j1wqPm+U<OqmoxxB_WQ>pk-w94kW`*Z!-&(c z!(5HM%iM-*H<%4`YDz&CIM)(A8S8M2I9JuzFRm(oaO;CHa8h`!4HVJ^4p_#lEi{&&t?&rhHWyO#wE6$pC>YKdtf;b8D5BU3P zMQ$vCdyH_#^b)?Y7g49u(Y9{*#uL(DhvJP_}3G%2k9ZMbtFBx}iU6ec=h+K3&Tred5!L5Y{6caUh-?g{C;vI1$bMwHwmKy^YRc zo0g-kca!((kUE~LdRr7Ao7?EQKOVoWcT7v&xyMqOe6$NKo-?9zSm=>`8gW*((5-sk zSgFp6FHM$dopOR|`lDq*-hU4C*VgymUr?n|0V=ceu>dV?9=Pg_+4@SUf$-;h%Yc%c zo}>eLkF$r98)%?)-Z4%h8@d1P@4vi4zmWIJ0aX$=>PmN_?b`7NSO3tgs!`)iI#fAQ zvtw})Jow|ZMj}^%1Eop=vrALhYNN}>BUzWd;@!7utiQg$(#b~%QedjTdH=BI(j0iK zCR{k^(;jrcC(dr>BVNTl5_qf;TCKX0&mMtNR#nuD38lU&A*B-G6(x%|+pEu1KXOY4`^xMh|y~zyrAHN=F^@6oH$)y;&Zi*7546-|BPo=0o!6cPdd? z+|pa$mHovRx@9i8vu~^)0qoVj7SR;D@PIgt>Yd*lpvk;d6E0Gq>Fikw>_o1;H@}&2 zTj}s&QTJ5UDHbl=iKpn$7wK@GDQ6oc@4ub=J|@0hx-au{r+)J;paJU7B^(tR-$rd> zS|0mYsOb;~eSU;zCFd`|K=!Af0{;l*#6 zMU2G$u|<38#I!fn@3F5Z`&I<#=*QW^m#k1*p7rZxx#LxZ` zHXmO_o3Zo>tH|C%OV|c#B!tB0@)*`a1Mf?^v$^X6q>Pj?UIO`c; z%HTHFFKjD|u%}+-ysffKYO7D2BqB4#=Tywe?o< z)a91MS-i2zvK#0)1x9-Rr(iOyQ-%J=3+trfPbCVq$;p#I#583856#X3WAoMZq?(;+*=UjJCrwzQGooy+#lbXpuJxebYYy^4K7t|e#U z#`E+?(4SjC%a|CQpa2xs{I$F{Rj^z~71P~v*RpWqv|_9`o!S>N_9@pqIXfRO1h{S| z?yCSkDreNIc&}$&dv=|yz{GBXA&A7nR5Ve{7)8&8Y1+ubCi_YD_3`lfMisqx)s9H_ zq2VO^5;Urc@d4nGRs06S=2878$v&N*Kt5$%w#PsdGl70&Za_$}&TAXZEiSjxqr6?! 
z2AAIc^aqojz;7%VNt|$KiTbR=mp)#0k?Nh|BO4wGAZeW=Lr;x-OCWlt-VFojNHRCQ z#<04kXRs6PP)lV*GF^nm?MzsG@mOUv0bSyb9V6@uA?v)HPyFy9VxQH`^fM|!l@=GCgxCC=A&?8;CFF`kxPMd+IuGG!= zXJ6hBHio{Tp~4MZl(M?thM_%?1$qr$5~u|V8U0MCxj>k_WZ5pb-<8)`yeRGiz6$%y zgAC8ba|D9pQaYC=qd@ebRPPH)U3cX%*$EKD5+sxKEc zu*Rc5fpnktHUJNDf`GKm|B;}XZR9d9{JPc3N!&`en;jNmux=Cf88;rdKoX&mYi8-p zBm<<>_2AT_em4?Q7@35MHxX0HchuqMnZ|JZnHILhr`tTS=rtRfi~hT=$42T1P?gyy zW)aoreWdzuQuE{(v}uzdyKjxbaT~(=NEw+ORzAIF-$QaXzaq3FMpUSt!nP`%cu^LW zN?#~|8a4PQ&*M(nRpJ8Nn(q2}`F8XhIyOwnIq3cO)ZE!PA*r4Me<{-mtfIwVe&N+R z6ERgL{YazBi1YHKJ^lku)YmcS<&Iy3;+YJ|U>$%Fy}4);Xx3B}P7F$9N6w{qfadU@;0VwR*o+3qHBzWaO>=c<>lRq^KZNJre` zuiZ`Z2Z1a(L5$jc)lLfA(9cOi*0d=CqSzF%QWkT$3y^v=*U#NHGltSbiE<=I_R^#~ z278bf&!x=;D+i4^j=gUHbP0NRx_w8Q3@He=_VSCB%=ijV#9N_S)Vfu`n)T^KjnBX))VGI}E{-{d*nLP{m%-nh!rrzLXr+%Ch}V@W6htVDh%9DN@BO7R-GX3zh-s3pWE?pS&Tja=>mb2{;?rgIX z2w3w)KVUcM<6IC*&gVKuHj=L67GQXI#gmfV`g-YA$K?n0$0pEBV@$4-+^*rEQB=TL zUJ<#bBf!G8d-ieMQz)=@A#hOP%3oLGgOC~3_tL%jYOU-zRLDnfUc=%$nYsPY1qZ*f z_`mwnm3pN(l>uK2>psdyP;XNfmcFSKzn$wKDoP2MT5)t052p<%DRgv_y-O^A@Tp9o zP0m2vlfkt=>~V?85aWB=HX>bq_h3&H{nx;ekcX^1=1Nge`MGTsqro~*r~=9+gtz;p zozRl>z3>H?CSDUanFv#)jR)V~MbYZD&Lly>T}|B8Y&Y=bt--JZ!{e zP|L&%Zttr^V2StS=VCN@JKc0EpdU5$z~Fd7DeprFgJ3ww7M!1@OVey-Hx=VK>8J4j!`XYsv;Ft`!?xBeqP1zwST$nr zYKWK_X0`T+wrZ=JU_a#Bi@2GlfYK8IJLrt>8ix_&M91f85|%YmWYX#`LOM`XmXjA# zL8Ay>r9}<*zbz|z0ffm!+I|y5Lim{oZHZaWl28OcC%%=WoW3s~)L=#wYsOq0;iU)p z3bqx#o=#QwNaDJBJ}pPG?;b6wotTV`r`8(QNd@=Gf}C>V!Pl?hgM|}!Cq$f%?OKiA zU?^%|Kky6tky-g~I03Vw%Y+;1_^9#d3{E(n;$D*VX0CGZ+;o6e&Ajc!C)}#krqv5z z6Pde`xqDT*Wv4T3D=g}?gJlC+@GF3a=}H3Z-AoHe1Y@T<9f%rn+s$SW4_G7ugG z7RjWIrv2?I8xSO$#`Tqd1L^8PfZ(o7{Gfeyfm$K^I!hG|-9rV4ChUXyJ{4yRn(cW^ zmE|CXO-Ybv49T)|Cg4i1O-=E#+S8R+p`e1%I-6&HUOvw8kA*2pRz@>RYz9vFl<(W= z)=q7^*D2{eoYZn8CW|a(=VCshr;)|-SbxfMSS=dWvsa}xV*9PWIH4g#rw%4>y#sca>lNl1Nrtg|I=kJyVgX($n_IeMs>gvMEL94k%bTm3uWeENX)9 zQu1^~p1d{A>7ki5J5KANKGuS|TGVL))CZ7*hca9V6)uzC;^N(AV!-IcKKJc_q5b8g(c!0nlV2#h#p!KZ)=+}AwMl@IY zuPwrVLw73Qz-ivj!3FZoZCfM>%ERljxB=9s}+|-{RLZ!1FSEml;Bmw3a zZ0dg%H#}dlP!C(HH9JC2cqf)NIB)!T{M&)Jf+$iP{+s44*H3{~6U?lgjIhXi0Gkj^ zHkpxZSa-g!Ml{DMo>$?Bzks=s7s6F6ZGfGc1WhD(oLRp-H-Z zL=jc^o6;)GBHEYwHL+KVr9FmxU04BGJ-Do5C&dDKb_S?K+j3pPnp<`|JhuuS8~@rA zOUR?8;z$DlSAD4x*lAsn>~bOZa#(jzgx@`Tviq`-r?84F`Iqy-@D#sJC%4xLSSTjEXgX6*p9m zUG3s@<8ve(yS>nOAJn$b5~`%8U(^XE&=L_AXMUBzd4xCiZRjIF)P+#cj}3}_=A|?a zjC-F0+Qi)dD-0LBuHL* z{!^c?Ez7_bVEk&7iA&og!QpKnpO>U{yowfh! 
zU{2Q6?EHq!ycKlbxA3-5Qt@Qr(ALC#NNifrS>|bK^s8gr=+xmrhtBT&YXb}wM}Wqglc;r9opm!&~-Ikn4ozrJ`E6jNj^*>l#` zXsH8rVN9Vh?n9{Qk7ao?;hXB8yhyn7)He0;!py?SZ0`%=r!()GJ5ZOc`iK__o~s|T zdA%3Qq4`c`w$t@TZ8IRzHtWvp6dPZd$X>j3qrc8t4y!)G_aYeZCM2yLN;eC$kO=FJs#;(ej~h7?#>yd>OnN zN%sv5dhRqKAxxb)kf zXhBO_82f$|-(j@4fZu7?J_*}QRh}xwCztMDisN$DGBM$G)<3$tG05w_VaAGm&Mm4+ z^wXvpu-DRDhl&_^@6ZB1xq-J+8G`4F;-TG?KoR7{abdrHnO5s|4qw({@K3yQulzb+ zbggz2vSt}{U$6th7NkjRJZrDVhpt>4n)Qg2Fr6w8%t@5~2JQlZX99YdTgo11OMO4I zo+rE;Z#nXR>V9jrq@+Mi$iqKyc6`fJr~H=uox=oi&qALh8%?QPHAM^gE3H7hdQaO- zmZGRQwFs%4nD&#v5;(qL_05_d>-Y6cL(`twyTB>D*Tw;XMHP3A&G%&TqHvY+h<4%= zfABQO14~nx=U}61P0=V_EV2!u@trwSL1yi&$PZ&Z;~`vSi?3LC7Vc8>!zDOgtKe5sk`7GUIW;n}k*8%Ha!^~Ip|@H2)r)JBmb!3ahkcWUokRs>Hvat>F1pL@Bk=7Qv`+`p zS~-SOya=4gqS7?Ghu#OAnk`27t6KWUaNZbIQzeJ{z9Xs-L;%2eu>juW7U1W~)uizj zju8cQY&h>@0bg|)*rOElZ(n7_> zS-ioj8lhjsZ(U^1>Ps z9*^Os``oR3P-F4*qWGZLoB2&sBEgJ9L^tJe*)iI}%v=2DYZqdNHTPE=W(yzQaH*bp ziR-mmyrMY+_p19w3jzp8zBIr04({AKi$r@pAUJcsiZ>9Q=Wv(kU0@PusZ{@qx>|lY zC2Lf>PWilV^?azNwZ+*@X=fZl2GTl0C#(I7xMeG8ozsL}Q={)sw^VUzy+R4ITAK75 zj)puQtCzvbTXyyfLt#&o`&<3ORU6)2d4g*itBo*FmiTdOcdM2Qk0iu0a23aj9JGHw zb5JF5CyOig-azfm7J_+xh6XUF*5iz}`yQ^8Avg3fEaIZAYuuIMvw95{`d1qm?etv- zJpiic+vU_R zPBA_?VYHIh9puJ{i4n+h$3*5(_``ja)9SDa2b zuq)*T;#_uiyI{>6{g`$+we0lfda+{xgjuq>-xt`NAd^Ve+=&&-;){vwWo}Ngqxqxt zo&dZ~jvWFiPj-&Db7;j$Fso1fO_s|_q;pF5ahtCut>Wpj`>tJ!|BTCm6JN{b@yb@= z5xzXBPGn5R0o15ws+BhH2MUH)%3SrCdTue@GKlFP=O5m@}91>c`1YLtiZ`pYh@CuZ+ z+oJ6N@%GZzkC9(0oYfQYwl8D8-A9+>i{6H5R$^g=eVev%%K~)d5L{N+p{tqbao6?~ ziIwMcX0k1rzl^*4$~ zb=15>)&Xso=wm+l)^jZuGRnB%09qxCK{~~k!qy5o7i{^&85sv-Gyq8`pCR;v9tN6Q z4P3!G!2^U4uH12Os0<%y;>sez+!XYc4u$DWMOZFPl_fK37o=~3jhH)Ntmd!)FrmM! ztm!4~KH6*4%`8Xn`+nI7<5LX&`4ZyPlAn%P7$%(g32R+7LZkeHc&7h8Mq=9cAu~01 zb)UGvk!6R~n|`K>sklSd$3~MB#Tbx#=1}ChLTWJJc*G->1dt`cvH5`W|Cr&~Op_&Z zs6g!8{DeOdVNwc2qX1a|QHZ#8nF$En7*0t>*Y+Y4Mw>cjK@UPw1!!bSk?(1te0U0t zqVE8n!Uu{4GRW|YSbM%!kas4i&9{wagk~(x+#2z}vk?9#c7jYPN7UM=UjX|oiz=Kw zdJ(>c@5_72%Z1cb^MRb>L|FEn(0MuC&3^25geEYlFq)2fRbeVGv=j&aUdQ{$>jTy3 z?&Z^=y9z0WyfCOmS`;?u-BO3e7?YN+%|*maLg2cZzr%s^sjOFY(gGGBi&nRt1304I zP{FDyzf0GDf@a4x5V+_=R?ZQApni@8)VBhg6YuQqZt6Hd<)jTZ$NBp*;oRKzy$f(o z(cSf}Kyfl828iNREl11*n3<({&5<`AAgq#&qIoDBG|?Q{cGSh!%{-6~g1IRb1lUP{ zvIVG_eIQ&ebRmk0&v!3^eX|l{8WaVeva1SPX^kK~o6O8DgpwAEWydA495Q0tqicpv zF$iqaB}Qk{_=)k(p6BJ*`=8+fB9rA(y${kZkKiJ~Vu4n&P~S|RdKTyLNYo?#U|bDE zF8Q*>W$RFes7XgbOjq?)xSfO|I4St|rBan~wq6lg>cxTJHt8G1O{}SrJ~+-B2HD0E zrr%6rI412bs4CX}K2Uk5ck{xiOjxtg4m&(oTo|*PY%y!f49aJ)iw@W98Zle@G95*o z@Pjo~tc{Wl?ieH(fyxSP3wOcvFI2vjUvg36xNDH@?J{$eq3ZE~wr;DGgV9)mRpQ#y z75lxKq*@;t=T)CZ?13uxlU2k}*d2mbg<5TyS8619MUWhld)_PNZoJTsZCmfS%O)SX zt+X``Yiz~OUhQcer@}@k`Ikk%OMh3eCASxk)IXNF>*@dKc~)^>kiBf@uAXnkys+Vg zhi7(l`aizT4*%AWneytKUft``N^=$NJBtaaE(CAL9l`7X+sE#*K)_}0t5>*I;aQEu zl^aP0e+@GmBrDj|=JUQTlfbN4=uXGHicBR~-7YVRerq~f>qDx3sUCDqs1V{)@!S|8 zuA<~6!Sx=Sld~S)oK%8AJy5#0F$7OD>Z;1vmKIdZUELPj`bizUHAH`rQ`B8n&(yzn zY(B>(^LgIuj!h#vxQ5{oVC{2I?bk{y&%PCpE#NpE#Hti3iJgpRp~r?{pJaDLIySxH zoW0ct_955?y60+ctn`~;J8r==ztBXSJks!?1_GN0Eq)BCVz)|5rzW0HqomFd95zEz zrz#yD`#~L^HB7dkxHLLsQfDe{d%ffEhEH>N6@ZbFN#O4hJ&-5T8PLf3v8)Gv-!RtG z#LQ|9==>N{nqhDJIpB9eAvZLck)N=SM%*7sylG%`^7YInAPh^*>}Va&+Xh#dxTGy zk07=2#=e@R+kS7cnD+~@Z+HCmZ?}8KXg@CPS6L#c1|3$N5uZ;l;~XS}(1$DNEPu1n z<$gX(FY1(dtK#`y^(<=y64PZrQ?&@Vj~r}t4*s@8K6!TiDNQb57yh$4jMCmWdU(%o z3JA}ooyN=IEO09Ksw1!s3^~{cD^8-{GuN&zKDDw;n4#Mvnp593H}}-+hi)lNqvC3T z>1ok#l=dN=AGa;778|Y&T#V}vbQ1K_eZWjls$mAQBv0dukC{8 zAMnF>yW+OKH`9$Fb%FOcLv^1s6|A@}Bngnc8HFDr>L-^NMT|;WbS(A7!fl_wzy)=^ z%!EE;yx4;J)hEvDGf**5k)>$AC-{;eI|M++1SJ98OsplFF94|TzsAU!FPHe~W};ka 
z1H-Tt90BBA>G-^l-FPGBlxe3hIfmb0dl&I?BJ086B<1 zEIvcp)SVxd_y^s~#qoRi<-=d3$r!}CGQi?F#8o5T6Pze_$hCjfNNrXW^y4}s1_#Q5m0{lf;JYe z3hi_rYl$2Ww)A4kOyl;>RlCMPjK}3)yYys$c{LaE9`hg11N4TNC-)R#m1~!<7;H~` zut~5bp}NH(%omikf}EqRws&_+N)kx*7i9iR?Y6>PM|GfI-cf7a7l4PHwTPv?PnPXo zYxw0in$xh~C!-?s?OL^RYVYvTr;l89FSrINFolV5$hZLkGSuH;uclV-WKrPGq7qNA z|7`ZEr0HwIko9V8q|LW%H3wY!8L$38OUwEdXoUY6`dOsuU3&I@eoccV#H`!(C4)YV z=U*o5qBC{F85k7K-6~e4TrW(|4D%)ay%tsY5#H7~6l6+J zysoH!&-}4$aLSqWsL5-Iwr(Bz-*!oh6I`)ooXlnWYSJ^h#F?nyO^)UM`F603klo6Y zo}D*8r_@HWnsbz$i&2CG55WS);F!!o{ceJ`VNoPW4+C?h#^MyMI`*rW7*}>*lIiF=rex{=RvvnN% z11hf>|K9f@RXu|&#t$cbOWGGF+oSIms&Ln8b6IEWAq499ov#{o@#$x~U&!FuE$%Q{ z_uFl-~g|9#h36hY@f89Yt%azSI|uLIB+;gRM9_jHdIp zTkE~e+wRm>$BUzWWRQdqwz(e6?=nPy#cmA z#3&ZybbYCzN+wN)NFz(H2xgmjGWg#TB%zT{c*Lmw}dkza-w{r``!*d z^Sb&KOhL)57V&R3{%h+(c4DtGOQnhx33@;h^E)I|M%888UUdFf)&23{t&*X}tAHWj z-H94V{xUAb(_Rh&+`JhBM+E-)TCuA2Tl{D|{Pj{+&;Vh7ggtGEq8W;OGnQG~AgZ#YSf*DG6M-$4qyWmuTH-5bFZ1Gm*U)+<@ zUKMMRj_Tw+$3=MkC-I@P77xbtJ?_b_7%_NBxP8DDHxzuhQc>UJoqbFBOFPqj#DQCkliL-PXGpgE*QC*lWA{= z9ab;u#`E`~|H3#11WxZ)_Jsja5!TjzgZlQ~s7a2awxm;V%Z(*oeXz3Kmq2INX&Ijz z@u1U+_cj-1iU>EE9!3`@9#N0{BMZ@a6h%xQZc|$Ned=3^2J=*<^rd`kka?+8pIZmU z1Rz$G_weAf4uCnN0|Vf?>m*ap3<)+jk*PmB0&1$@tE>oQCzR(T0-+PM=FnKUlT0Nr zYDCooBhKo7BA2v*|AH~Wu}w$>v{wO?)wBZ?E#(p+j;6x8774Jc<~PgDU>KuzV_VM3 zEr94Gse}`%)nT#JLmxn%6pcD%R4Yd{r~}2w7Y0zNI)Q>__>ao;KOmt0S6a9K)WaE@ z!+c$XOBW_KF7v5fgV2ve5BZQ5^If++C?Y5iz~5I*+XC%QPA*<50^ znD|#_s`-5FnacfhX(Z|2&6ZiCKD$?lyjNU>#H}laATOS>Uv$&~|JvMmh&AQ{O*pN~ zRib)7?~NW1WpV>9sp#Wvi||0dM{y_2oGsns4H_%lJ`kpOLnUH7uYVMzeHT^r?V1%o zXI{L)yhz2bo7@NPWQ@JmBU?$Wjs0WMwQJCQ72GB=T~R?$#cOGXmVbfcGyMIFNvw={ zE#F>3meA7aH|sj(${0x}%Zy8D!u_G6&M3(ya!d9l=ji3e#z+r>bC!3VIAr$5qrX^| zg3^rEjViPKdLN9GkK7}jneF@0TXXv;Zq}OG>hD|a6KMPy#RV+5z3Q!XiZtrKM%k{w;Q9%OC2VKL`4{cdrt?eC-eJyVQn?&>8-)`l>*O5AdW#l5*(w;MlS`SLaQ z<#gpf{iR5O3Q~_+)2P%L6D@mT6&*BC&@rhaRuc4UBCiiB*Nc)v+jFL_*Zqutk8Sj} zw^kR={(W)oCO2z(z=N~b*z}i5R<2kp;D73JbuqUPTwB|CXO=vE4MasEE&VIjj9;m~ zpSpQ&YqjSv=IK3I^P5vo_V&P=m5HH7Q9F*7V>~J71JwI5$qWm^mtQDqAq9TaAUrr5 z4l-3@PYdv6RU{bQ9EHC;E7KkU{=4T?FQoPu^Mf4!Ofokt$m zzWhj%j?}f!|C=0g$n-Ud;c1?b$d#RbGCW;YGSYOhD4J8#y^)1p`0Zq1hyu6ZAb7CAb&{oeW=Du%th19Y-t!P&3hnG)Bm&lUgt zwK?z~H(H;goKF4P*v0F+>!44+08y)xu}!3gvY|)slm`i*o%G8Hy7AZwKxRJtpc{b= zd(SD)JXJ^syb3=xkNQ~zHgo0oF7-CU>G!MUXOgfM=r+^CoJ`-}yHmG8+F&OG!u1rJ zrFgWGK|6S>YakFg&L5>a9xGLvVrvkQU~#5@9+cUQ-r{ktZc4Vsk}(~3 zO9d~uPm3gC^o!x1rSW^$VGyi71JU*4`Opp3Wme_;g;>8AlFFj zm}RD0^_zB=I*cudJ`&(b{!P>O;`VHr?}nLW1h3BWG2&f!Z^BQ~jU}PW(ooGfLI~Xg@^;0gT*&@8 zAnK{9mFP(yXab%9+G#GCO$+eX3Izql&LXK}SosUwSZ4%Qo(^Gd0-k0l&q+eK zvVeVwZfBtiQZn_7K>oA?T~0<0js_33mE2e-umc@drjpLaN_*U~^LU)&TILZNj=2R% z@Rp9Frm-QhPNVl6jm&UoDPoMTvs}t)IdwV6m!!1xFhgagATr)Wc;mAZoiZLR+v4-DaG^C$ zkdH^ua&T57_yvWtwEm2a*v~P;H}$OeM2(%McGwIvN5Z}c`MDJKRR=opD%N#JcTOVg zpS@K6L1J)Cve$zth7VyMcFe(itJow^j&R%vrEWwqFV_2kRt1j3w>kDAo^UA%IpJn* zhIO`)OW-50W@2CP7QB>>a~dat+axUTIm>4fQsl14L1)fxZJii;_Kd2b<3kpgk&+;x2sdawGFiiCFAcI?$5VxNG>+Ip#Mw6}}30Wmk zY2vMushrk#J1LI`jU%!}l~UI%*Vsovy|}4)%$%e{!2|T(r$X%NybYc=Ju!PdlKwX@ zt3?^Mrr}m_=I8MZ>-5CD+27NT(8{`w$bMfoeg8-wvYU&GGV|N()$Ow2FEM@9gXtb79Zd&s8V&uG~bgZJhU-7&99*Kq|eGqcn%W5)CpS)Q& zu83Q+Ncfb4AjkbE_n#?qWE&92tlw2DxT*8#{A1DfD-~Pur!m>L%xT0!VxH6rj_ya1 zyq9lh#iv@t1KKJC|Gs4J^EFO|T!YXanIC#Fgxf!n*2O!Y^*rrdM(6+QQP&I#@%6~CD4iLA%7jd^6KBZ4*9)F0DeH@;& zpCje6cl4of)=jrb_qH>j&@0TLSO-^5-nn#LPwak%MWdqD5@vM&S!m&>p~k+T$7wNy z8*kk7AKwgk6LBADddm)O!8@Y?;O)PjH~W=SueRk%c%|$9d=L>!`vlQ%u|AvRPp?zco??k=QMOx5O^q2F@;I;$tAt(CBR|C*3h z#frf^35gNQO{$?EfZSMA`_ggOC8&gSP~&|yUw7JC{F^kv^ErH%1chl&G-R;c1z|s)d@h3VB&9$vCE6Kt`|DMi`=_lH$BgcQn%mjZI~l9_ 
zx;~dgMuB{#S!i}{KAp%jj{^Li=CkW#gzP;>S1Nag@E-&r{+&W15s;;eu=4s9-EMuG z*^?6ZD|F5vP2l^mO1`Ho#g)&=9Vn7uwjq(!Ri#@F?$WD{_-IBqF%)RQIs#`@3_}`c z9Z!_!mE&0=C-#)5?=ltBQq+QEZMrjWMbdy;h>D!X)^u@66vdg$+cJpxu7ReS^_7Dd~5} zil`D11@AP)Is-vAn$@(W$i*UHsiX&9LKrx38`|kBntd6C3k11mxI&P{SuGVH^?ow*kLOc3h?90c?5T?2hZ7(N8dR>rJtIGQ-X7+-r@+tw~3@0uDvu_g?XM+5QBN`+gD z4IjsDs+vL7E;=}m$5kT7k-U$02y=)r+u`#PBZ_|d6RpHyId0^*(Li~fvO_2+pFj)* z6hq^NAfyeuPHdBR+87kjbImx91+qdV`H2DN|E*{KlUn{CjGnBf?oGW>veF9;vN6us zEaNE3kTh0Ov^FPb#pHd0~3iUt9M}J!O|D7T9e>w(+@x?Q%LKV-fVvpiK(3%%xz%sXt zUzkC8HO{r-6|h(CR)&&pCtvb);moN)_6SwA1o%CvqOV`S2}7q|mQUwSd3hBYR)~z6 z{8{E`UrYw&)IKJH=FZB@cUH@NOuu}Im||j;MY#h0!s%v(%ewv^KR%nja@o?pi>T06 ze?pf1*s3R2cBQt6s@l$d38R8}xcV$n6lHaCj=m0a;76U~*%rfUg64>Na^_#3=%9L8 zm}q#-nKfEE8dY^G#R;;VBQs6E3X?(z4i6@4S#RY!ygUDFjuD>27U5z4^Ts64Y`v{g?&x`gy4ny@Rb#}CEF`BBB&jkBt5t79 zoT|@DcSzk#sZgIDD%6RQQQ+1MBFuD|+jGd2~=9-yVA^CBbz0O!@yp-9?p zJTD)i&V9hMya@M1;I7}KiWw~d)QS06=RW9B)j6&TmmQbtdBTz4;3=>gnyid8liod) z<1C>sIegr*qw~RnUEN{IQ#OW9$L?v<)r#N|pV*3r9_f?LB6Clwcl)|t-^qb4iz2Q| zRk_{*hzs&Mjhc%pF+WhpcK%}dPW*2|hVCPyVp0!y8%giN1wocg;p!t|f{u|%A^O=C z^r^CleSLC^!uFT^ImcqEA$nU&Cd%i7aMh0)vD~#*FXdy7kL%W@hoDKT<>^&?$~7Oy zbA>Ui*!6aU;2#AMU3}?$LSb4}mqP}&QL?pe;)e~bgDyYfc4S{?-jXNtSD0y-$~(Fs zQZTKR3I3~HKz!kb!_0((i*9u1xjxnRKctWBK7V$J*>XHCKi?3f!?|}Hhqyq(X|=t( z?nVEN=6-wVuP#DUQSE`wS=KKv`e&Vb<0; zjN5ZJPen9qw+{9ATJIWNs4yS}JtIMDfn=+=pOq}wBcr8Xie(tPq|+m1V$4xqckadW z{0r>A%n{#13SxVENWTwE5-I}F3s^<-tJvYXG?;ixOO@s`atPx4^Tp^K9% zg8&>>z_aanyH?2$hWFGGeR4oZF;e-O^{b}>`Bi4x%{))ukp|Y5`)CJjD2`1q;}e_2 zyNS`Z`1fV{V(2FEnd0oD9riN-!c0SI^=$+#iKkohvfV-h#@b9Pk;h1zEGdEPK7CZLYWb8tNat@FV3$%Sk=)EHto_?Nhs{Skbq_F(t!gdJ{dmo> z;dG?p`?LG7fH^JtxUFlsBd>&FBKkMbhT_9|0gs=rz?aFvdY3+5@+cI!pN$`5c{C&Z zfi)$+P8ypTE?^P`-A3?rRG*mKlT{U#OCp^wX0$*FSGD=J!a(jNIA5j`Z1jwSn-z$W z&`&<^3OWcA-e_uiWYL(q$SdRJ|EOv;6Y@+acaUb`7^Uw5uUXSB?r57x3{u^@ykhQF z`E&@Y9`!!M1Tbw(Fl)~A;yUuq^#T1tHCUYaZZeZ8GSOxl2w%8*K1+sT)`Z-yFuX+3 zv(jb>8&ZFt*I#oyz9|M?SR~YJo)~KV%d&#FV+^&M)HkRbV!X^uVz$Tr+ME_ zUCat4P?XL^0{<1$gw9eXgcU_mhPJ5#Fb<Rik0q=W2XokQ)D?l`}z@1Vu0K5ns z(sCB62=2ODNv6#sEEY8<`o zfP>=AyZh1z-wOSGKo6UQ1Y$7Bicm=@@LZ`dj$M^07pL{0Cjm6$6y2$f_5MqT8d02I zQN)-p$$S3CK7LToQW_B=3Rc9lbskpInw#g>*I)4O0nPGEWLn|h$qFAgl!Yx~Eiz}a zF6vpy+<-9v5f8DlmWc;MTN#a(XC~sKC?q3IAe6TpdyVC*6Ic~$3(Tee3tj!6dP|Fc zE1dr8s8XWbgT&&4lJP@ee{uJ@;$U9U(>a zvio*iLd?@|F+Qn%|7`BkcVm?z(*yZ=S#NA?a2EP1maYpxclw+58BbWVhVt+!jtcD_y6Z(dq*>w;Rbq zPP>87K=wBrJJi$43)*Frq;HwAVV40lbuZ8oJ+VxIW|rz^k(143B!9`-Nxq;eY*L10 z2>e9=Le=K*nXVFDwzGFRp>|0spOCIPaE2nLc77+&YQ!n$rPP$X)Ho92%cE5tJATNm zJ@vgx(5cT~K2!!WOvMJe_QzYZBdAkVX@kb7IMNF^yKml8sCES#rgQ3nc81R$%fbdFG^U(THPaai}4F^$oH-kI{RgIC{rVQ3S- zW;8hr<#cE3m%s^3>3Ze)LRDy;Z1hT2xF~%6E@fd<~fYux4o0LAQ!yyX5kUt&_@QVN(xF6#=a={+4=2+8#q1abY81q|8!&BAH+$ zuR<{EcNUt;NB04HK2d$mpW$#`kRI=uD@Qb+EZBLvA055xfu7Q^A?$@ zK#kO%ES?kT3zbN7bU2OV?BTuBbCr-VtFN9;SrV5B9G(Fq*wg;zVI5CelVd7~e&(sK(D%?ZB13 zFSn!v<7HI7y7r*xoE~@+#3OjGf5n<0%c+nAql(CTC&3$j{1#ZWfjpWXR2Co0&!`} z0e(|sO0#cbS`JD}v%^V8ZCF3LI$(Ud^mE0d2z$d_|W^-SvwYcf%N6wOUN8Q576I)0|0 zPo+I&iJI+zdgC^o?;_6rQy@plIQ|n!%|*@o^EZ!)sas9ffrse^ue69go%-G7KAd1+ zPHXVH3>p6r;*Ve)9L;H6Kda^`b3cqt7_>pWQdg3<4GWHo7tBlMzQ~4cTenEf(Y&4== zrHUqyxRooHS2LwD)l|Y=696M94Wth)Jw+-LlJE#~b~TMM0?Xx=HZd?p_y9O4YcBQV z1#wY!IG7?m7g%CU@c>vCefW|0;IJu3xJv(_^V0jtmIFnqX-R-@1043IISF;?)dQDT z3!=spYbZzP`TuaKjTby0ia;^msKFn?nREZ zc96O%7{Umop1^&=+*b^ENH*h^6Z_{;xJ^IxacAXf_Dq;tX{&ygYjGvOR1IJlCwKbG z5N{t=_yYVU^^B6Z4HKCax4;%i3#7>=%~}0qEN%m;pd$v1JBxY&H1oorwr`fPJOisp zH)dvz3;zS5wgBBBXy>9Y9x#$Q9Vh;mYue$lc&D3{ zB*2(ep^)_h?k_uzirIbPyZb6ve8^fh$963#bCI;xBCri?%fd;ci1@&eLs6UUrzrie 
zvJ2o(pzY2`mMy}=hu1%~X4ROjC$P|T5-PvBAZ$j0NSaz|A7s-Ww0X-aG)QN~$Jsib zZc1ktHDX#1pNrImYpJJ@#vA4P(29$GXC)N@9~dDd8eZW?Avw-Q!Ulra9a2xl#c)-~ zt1E(+inQbi3JEeBA^HwZstDUXS+nz;&bCrl*tI7nT-vHZ67x&G6&(_UFS_FNyCi~2FwXH!%-SwcVQYQk*sss1kF|>e> zc`<{3%$)R_ms8JnKna0oKk0bxuH^A&>{6!TkLY$;IGvE z+O}*uJMo?R?V}H-_lvz=a2h#qU))9S^^30U*Ca~Tng149Tr~PPd2M&(&QQ6D%>A!V zY)(F?Ra#6Po_lcLB%14hcWfYlJ|-mIvcqt3C8-bF(FFtuRroO+tM<>U#Pk)^^m2cX z-7ynQwyu1}khWC4cTm0hzJQ-ypk1~gU@N`wpz`(}iuv2~EvF0W)eFwjwitm>KgbqQ9wv95p4-xU{cA&FNrkuYvSeOKJ zzk{$!K{8{RZe4!9dCyyR*k$9y_T7%DOU*LoEj+jQX4{|dmZEI3bcpwS zWt!Q2XB#DqQft4&cT4+=rh3(9`BqFKLx^_{PQlLXazPByK&HVc!9JN^yRQV%qdZPL!Jo$f8LuJ~@(#o&3)lvLi+ z(Myp6ASJh_0HQ?UBe=GVK{Z*FtvV$^Us;sE>ld4%P$ax^9)WmJ8fVqrGW^Glh2WN4 zo(tfI0NL>m;6#t{{c|R;E+*0-urlSHe856z2x`HXS1Fy(#oh+i!%iEFHWCr)Dg>TI zhoF%44l$stdEc^OdF@=#kc#gbD>s`vE^^$rDu|VX0UL(NCzQeDb?T&K0Dj=Z@|;YN z10z$)#LSHeXGW&nfJ%fl4p6Hx<>Z^=>{&VSm8(TFRXTrYlmC+QBNsD(S7VM<26!%D z%pa%=V8Z}N7|1a;3rR)2paB*VfKNjB(+&LRNX-NwY$lXv~YArKNOXa@jUV3|LbI(j{brPWRgIy zZ+}+L&-iTp$=Y7}5-Q1B#79i%P8aihQST5GW-GSKWZJ;2HYQQ-%iE?h#p5A=Ryf${ zynpu`Ita8N5Y8?8Id-?Uzm(4afvvkEZW29Q@>+G&sWn(+-CM6`hk~F#yrs86EKNML zXuuEk$ozCWVE!ei6#MM|;q0x$ntb2?;UN+#f-#f^C1fB-cO%^!4Jy(bf^;b*f->oL zbc_Z?X;e^J8U{*=jP8`i-!*-I-oMZH`#jG-&v7`H_su;IJg@V5)v0=v_vY%nDNHJ1 zx^Q=DXHmg|UrsT2PYXixLuf-lWRH=Pm^*k3D;chk)~2XK3^KqfWKETJq%TU6m@Uek znblTOHDYyMJTBy~abkXnRSaGoM_iiuTBW4%*2;C~RR5VE&ON`3RpwJgUmzvm3lu$$ za813eStl^ay4u&Nm|NmlS~4Nx`l#tscl~&EzVS%)`YZ z_1k9z5|;S2_6k1?kcr8fQpNioH>ew)p&0N(J!VPQes`t)NK1XX6|B@(Vt2KT17pTu&{nCqr$%kEI%^&i zdZ_1I&a@ScL@v}_vve$njyvN~Jl1)?L59DlCrlpgsL&TZ+1 z3yDphq!GtMjt8Hy+hnCguC3cUg_a*MSEOob{a>K-=O{leT$D4P2)-ZV{hByR?{g`Q z@DuTxZ4uEN3M2LgEbmWwb?45pX4E{`QJI5D`$@ zh&Zv?K@nJNcEWSQQ<`{y&;SiU9Oy7L)CeUEn3mb05Q3nu(R>rEL)aP;S|H1l^xzQ) zAB0<$vqL9vD?@oK%W5F6%#$b{QLctgfy94+>uEmJW`tF&^gRdTBy)s~E>*y7BXA&Q z1Uw_8jer@GP9T_taBmxF@wTFtU|lIcIkGpB9jh#X-m(k2Uh9mM=pkt0|ce z3zzA&&=gjoV^|T!t*>I7Z2a=V4JOK`i`yYm2HLNp;QDLrunck?-fT}WHLjvF z-~<_Itda|g8o`S?5neJE?|py8Q9UgNk6qN3*P`8Rq7tsQq= zKCU%mv(e4vUy<~s(D<=F1)#p5$b$KT3{^-+-X>xFen`>6!KJBJmGHt<3K5uZ8II7q z5Qf%;H6or3amKl1A_5DXcvn{jpKd*8eBov+`ib3>#F2GW_S1xexBPpUez=zeZP|?~ zt!>o;>0Z<3`pwH78v1QVj;GYUoi1!$_f2S%cW=9Cj=Pz)rbQwRA6t%BU(r6AjY<2c zR4r_u7zy%S~R=l%JELF!J)a&=~O@!XTWMeQqY%lbfi zA-1f}Ff*h(pEbX%F=zK1@=eaXifs>jo55m8{uU#0(oS{?8PXQYFMFRWK#+^ZbFL_{ zJ&!F!TX0|Zy3q4f3Ocmw`OuxBMHXUOe5KSUBPV3!j4Qs&Qi88=)$G#ldJn2t#%NIk znmqog;{5EF#&9M@3cB7|Marq6XJ9alF2BL-(Tew|Pu|&Ru^VwX+Z;{}21=QV?_(EJ zPoc*@!&a0zYhL!wsF>d8ukja3+m;hsZ6Rh34<8Kbh2G?!r(xPzpJ??yh>#4tS>;W0 zh)ce^;H7V?fS1k8(kT@>`$fAgA4>!N74&^R=NC>9D>eF7{lKb4;<1W@!8(R*1A&Z_ zxjkziQL_2!f~wZf2Z-aeCej&wJ;rkl#jA$xu|K2hlRuY5_EE}x zFVp&Uz8%^^ET348at{5}rGAIO7q@u+&3!HI$mR?5vhw?w*A{wvwZc5m&|BIhH?TsX zc};Z4ifLFpxw-;Ut)L?p0_H$m)#OnndBpb_azt11D@IhldJ&$OmzaAqK9N@jPkO>t zQp{#Gr@;S%y68T}98nVfH}6+E?aF zX^RFiZ29qHWq{|y=}(kcK@lke;hl8 zgk3+u2m}<@f66hTl>-l_!{Kkwfii2Z%U=<{#4l&=2x#o8gg|W^d};szlv;lcKZcM@ zI?zpv2}?@pyH~1(W$jcn`XP5U^hh{H!OS*A&inxj_$*^<|36)>eJ(#|X8+$>uc3x9 zwIG;{&JZdp2@li`+6>G+dP-kDgEEYUnD=8~oJ4Mfo|oX`(IpHlfe+#o=FsZ@U>Iix z29fqPgj(zc%5aVl1keWmu^Rnv5%+($%m0>N@HPHCzAW|d$UyB53)Md+mWD7n#3p$^tpV8oXP(($*oL>+x1 zBp2Lyo{#(4W5<+XD1OfT5WLX1-q$bFa74M%HKGbcA*Dfs zM8gey4J6Hkb8Szrm-BMFY6>}~!qo%?@Rg<%u^bomWBA71iwjZ>vUfOmL<_0ed3d=K zTeArYI&iimsGdXMs5#+$C13C^O0s}c;w%$|S^}j4V)jBr3v4b8eP&RPy45nt(eM*) zrDC?bYpHaE>~_HUxjV;3QX?8lTZp6P*{FSk)>pP}r8H|E*NuKv!Rn?B5F>7}0q5r* zKSi*m#P&a}Jl%Hvae|r~^-36-vGsJy#UOnbBG@9;rrCb+NUaMdC5$BRabV2x)DQeP zj!zjK?uMJ$Lb5So?6x9<|O&M6P(g-uo4eKZF&m$)er5Io9$kiCiat zLlC3M>oGe8XJ4elIh2E*1qoorq)D7c~XQ>|XuScXjc5D|O@7Sc&n4%U=>yKo$RZGLgM 
z#kO2W{I3(_)D{ZT1l<^69uB-^*h7R}AI>x^6kVH359YWCh_HO|0%AmUP_z#89uY5c z>?4ZB7d~@&U8z+gLlGm0e#zhmQ`^g)62~Ny_mU=ew+api4T5mK*E_PN@eE!?Q>mNq zwVk}cI4Qjh^(Ze!EzX_1vyGb?by*0wXQnmU583!@{jRQ1ve=hHOf3U>D&A#Rh$2$# z^+79HVX4n*K<#B9O}pbQxezKLkG?4<3q5uShmip{;|n- z^Ei`J$+m4&(P<;var^%o@U2%qFA5#^hb(kF=`EC=~i+=bsT`$}!tF`6ZRP z;Y?u|xwzCo$;g`b;-cn-Yz4A`Ts6UVJSLaDZN3Mn0f{=CDkx|B?^ya^sk#x{(y*Vs zI6q`~6ZHz7TC3H}Z(GcP2){DaiEj6CBi-cDv*IIom_1x*t6#<6J zJbO?v3AKrU%@UNY{Bq|&)5r&OuF-@oKcV`{5en`}>#%nq zABlmZ`O)03>0Tni7Yqd}k{*yY0l)-K0|)a%}cYaZ&iHFFt46`p0#|6;ocf0?M%ZJt>q>l{Ds#TC+#PYpp#a)b{`3_FSRCp?L0 zfe<>>B+fNaXg%P)Ti{`$^q`#n8c4mFXbE9d6IMUtN)k?|Y|g|@rMKynW)&Y1(UntS z7u8eCqtap;a$s}C=TgR0R8-e)DYf7^qwuFaqrV|$gF7o>c^6rL%mTR$WQ?Y@_f%TG zd9h)OUU?tIbY^UnS#{C&sP!T+qC*92ZrUGmjggw+CL^!II&Q@xv7Zu%`~%L5&c6{> z7UkIH|FM;odgxgo>V~6WdKNM6k^%O9e#f@Yj`8`!SrgqHPv)88NjmnnJ60#7MP*+I zUz-+Olymlz(7n!nCsJiz+24s$8ggU)xlA+2QClZ5sRmdW!Hh zJFHMguYaXk*XNwB-ZGhrFS8kfviIAd+^O?eqis<-qy?4vsSrtLeXGnzaqN$vZUYnY z!ppDH(C2z9iNVSZzykt1st~##>*E5v`|~KGkIV)j>@3 znCgct%1^ED!m+hH8Le3K&uMXGmP6YJ6?gMVQdd4da_q&pmYWaAXwvy6r?Hos1Wu>T znb#_>(@cyz`{a}M$_&xB+_t-&zXT(GilZoxSyZRGnMD;7qft*rnY9EmN&7KY;j;Y0Mo!s*#|xi`9YFG%xGX8Ld`xhltIA-Oo> z&@VsUbW$JCw>d$PCEw9Tg&)SYH6HYqPw8rPb0p}g*%Hak#F->XG{OxbYNwm02}y2V z>}|J4ele05ZVP-|ie-O?d;g)dHo06TFNbQ$)fcS#vERuSTR4tgG*c3*Q{Bln^LQk zKhc3UR?bnGx-!m?9E3@eh>Q-K#l>vpdkVjQ>`49gfUJ8W%9K>e@nL`dd_UQ4DT^G0 zlFR}YQ{4XZvIhSG2@2TIK0o5P6vnn3Ssy^Tvm)&JvFNfC?`pYMxLx7=+p?Y-VXq_g z_i^>!J-)kT8}4MI7@<5M4bjAD0#G54P~QLRMzA6Nsr!Tn*Z#l`ps*2;5;Q~|W}iuT zAO}iwJcLm1z;Zy74WMvPsp-)$P@*JBTJ_Li2LSE|!Uz8Jney27yn#Ugis)%i@$dlK zMji{;1n>}+qrYvkCgDVAunQoZ2sHjmG}*5xz$Fo`00NEwzv503LKK`Hc)Jm**8)O( z{$t(%W=_zG#v4BbFQWO?yZ$^wof*(mQzaolCHFa}W5YpqK;mE6Bon-tBm|P_1^>AK zL9noxs&i&f(a?6YGMv9aGJ~Vy!0U+@6jO4|!2Q-sZ&tz98hBB+>9!|u5knY`?`-Di zmu0Z8EhrVuaC9{_ySvuP*PV(LxaIrGS2bx!5J(MdL%ag9Y06xtGz38sxI_RxfPGGK zZa#q#AlOuru@>}%yX>a(F>Els>G<;n)z>rAvB8|t7HGxCZex!Qkch;@eD@+pP!X4JEe;0C|6v}4fFBU zY;6xIKcDQ)Ht;^HrSDc=Iz_N;zppN)gHqfI)A%XbtV#tOtl-27d-! zTUB~2(YuBSrF216wN_9w;m)$^x@zLf-Ad_XvUpW;>HSPzb>D4uvq&oGCC1}$x3QZ^ z0fpI3wH#AmY6(fJOWZz=!{VdP6xUT}GclSy@cYvl!9vUy{$$9jekCx&&0xRJl-Ks#h+fcYZMh@4BYz!}JMfe;smxG1EFdp)%jw$gshN+wSIFovy8? 
zAN+;;OuXOA4X(I3OoPFJFN?ZCv?*w%!e?cAXkP+i;tLX2a$@B(uEa-y$2 ze-h78)|9mrcgbpSUVKeAYW|)_Lzsu3RB|N`|LonV&d0RG2NuJtFJ8KLtGskd0JK87 z>(kQ1K{aX`rCl+a56q<<1ASoecfkWWlBq&Y@L^v?eJ2=Kph%3LcKEWVsuI@#`DlELv;=e5;yN&izz_<*xY-x z(C}laA^DAYF3aU-L}UdwUqCzDF*!z<>8t6gkHF|rPR!9 zuOSsLNd$KrjyJ8t7Dto(^)0qFc}=O-#Dq4EcZ<*`=Y)#(?|;;h(v?gK#;5ydV@+IT zOW}qx30w`2!Xr7k=2Armm7R^>?nn(Kq>tj)^Ajg?uvWJ-M7VU1be?)>_+dd8FT(cnl*<^JOA}w(yskIYY~x#M21f4o4yL zgmGyj3@@?Is+?7N4zz2&a9urr@Bnc}*RK~hu(n&U$FMmX&#<>XP;kyon`|JPi~$kU zk$Fyj7Yx&)g3!5(MB?3^zLdn2&nr5Shj~@+MUapuMjE+A7q9DV8}e5PA8RQLO1~se~xfC0|39GfGV~5@fwE>nFvWLN_i#>~EtdE=me#45YJroZk5olq9Ihqw909 zB+Df=Buy86(;6v4va ziwQ{yPk(_G;1y4a49&i+FfY{RDjE4vKJWb&r7Yc9)dvg+uj^v%G>oV#$G)i^IRVVt z-g313Vd4;54exeKNI>mLFy?7a8Q6d+Pr?Hwud9w3C-+R=%zjR z8uY8Wc(QTQ&BQEh$8JvQZ4D)*4};Z%^9t`7H8dw-u3WXzC4?M{KO6`ow*_oflY!9_ z{`O*ani8)rJ>iPD`2p_(Y;x)2Dz07~0}yndZzF#4vOHs2knk1zllG5s%k zoN`z&%4I9Lw6k=;|HslZ4qNj4{T3!MpoLy(>>2B zOb|)Ng<1v=q*b7`cNuaBIsyMtB#kho2-h@0Ees?+%7BLdezpZ#VVM9ZdHd`pdJXXX zfK1FBj>kAttBZ2r<)*V5>a`@knAPgPYZj1OLQUy%Q?*`O10{ zV6VU50?}9?Yx*(VdS^#yqL~WhET)sUeX_q0>qOhJJQ>Hby*QeU!?d*;oeI;pl6 z2cKY^_Hxib;3X>GdaI%b$dMy<+}rJbv*3OAqO1$?J>zd_DP&d-AOy3`5Ob?c{}aV*C!LC4EtYC(tdTS@MA+Pi9}PbmDL)HFt?5GjK_lD%wJ zq>{g=;a1oM{W(#-AN$0R7yC0MjF%1P!9B*!EdFVM401^2&(-xym(d($D)T;nZ1c<2 z3Zn5#62r5@Z?NVjCyBEIhsx$chV2h-jSZx|sWmIrOq1v|s|xNKsC@NgL4}_7olNRZ zF`svjm=oQYVZ{@Z=%480rsGo&73INWd299~@zB=pA*i60VSh;gl>GVbEoIX#*gzi1R z9JR{Rj=-&H9|8JP;z$^8oVtl3YP7O7A8@y1TvUB}o-#hhj~Xtqi*LE_FsQ+>9aAi_9d;S1ZR;Ot?fe)~eOGGtz>xjzk_3D|_rDH}Pog&Q5x+6E$>bG~WKD4zu zPm&QHd-)zwu$q@8YLelnjb?MZ0i#7~I zA$n)kzU6Deshfmq+e{S)B7;8?xhMwgF1m{4;#r06RO_H1-r{(SLeRIMC@jrfsx0%@ z`fwqmTedDGGASVQ9%PemgwyQx;Pbt6^c-3d~_<@;RTE4^sm5zqE~oc@>TF8@tS$$HqL9L2L?6CYyK zo)u!fs`97olUs$02DWvHR-uwc?;;Z}ImNe{#7?V48mD@m3G~w$&LWdS1c}XWKpWrY z+UAmdG;nY+`m=BO>ukF#T&a5kT6H_*4FS5 zv7S_;TSjZq!DsCi>*=Cyez#{oKxbi`zD+zkKR#@|m3WXmbAq&vd)qs2aBsWE2YThZ z;4^1epBdjFucv3o?&BmM(ma#cE!yI@@TP5>m1kM+g*?H8r4%ei?hMQa`kW?QXb`7m zqujY76@5ME+hoq2F|XkqTMF-6-{Dq|ANX3bk#WbwY5v$I9!uxf>Ns7as?8&=*lFwQ zyJ>Os9_NT1hY%ll6>LdvA1}6i)+Ni=)IEE2fJJ0H*5v>G|E6QacD6SZt z$TCpX-XzEbfiCk%Jb6b>Z?!gm==)6#iM*I?DSS|0zGTn%byz}qT+LmPshx^B;!zpY z0s>he5n%Iq0CN5t)V~c}RX1y_9*k8{AnV8|6$(xG+GiGO-?de8xpt}=u^y%4yDa;Y z`CW>(esztL&o*?1e=ip&C28aT$>aTj>*ddg1mP6y;%I5aN}xopUPNzPbklc((bhtf zrI!~Yi*qLmf?dX)^JEnLlst4b1V6uuH;8whVu*}?Co}T^-g(@jY<-aX*oXtuS5d-g zagCZ0DkAf`F;8-19#`}mBB;0C+Q0DXa0WncNVevNRn@jF2%r^tdX8$^^2|VW$v3I< z4ak*b>NxHNpQ$L|k_04|ygUJG{*MKeAY}LhH~)(}g9l)D3=oNjqy*rXu=cfFyAVcr zAE@Or&x8|T0Jxsq1vMoOBoJuW#s+x;9w+39A=FLwrotr%QqsP)-OfnYr-7_df2-8FH0d;hidP0k%bNJd+$;gdOaH67XA7(&uX^8mhmm>{3|A8Iu(PjS6=AM z<-?)*kUzDgA7N^kXOA zbeZ_qG^}ra%2BqrF-+~|b!5Ocn;dgwQs-XRv$8U$L2Sx$r0g6bzOSeVyD}=AbEa8& ztXydGjQ0H;=Pev1Dty#d7WXM&1ZCBSqo{44@oc$suxL!Q=19LE_k9N(VxWDtBkEV; zE0PF)4IG#Ps~p?LIl&MV$XJI99;7{nk?TsN!(5hIgl48d{4dd5ps;(&s=&yzB|7*7 zum65b5Ob)r!-oOu4eZ>tfeZR^r#@c;MVE&^e4vCL(m$+jxQ&pX);cCyV40>2?IOZ8*=>B{Zy zDJeR7JodSiIBK0h<^E`J(s-IS_O8xe^7$N%texkiXD^XU%dhL(4DZzVpPT?GcU9nT z2p9I&s~y$t)7KxO)>xSr)fcq&uIg zw%)$H9q1>C8`xjh)Y)}0R|8X#h!2NgGAxCFwakxOxUaK8&C}dnXCqs&ScctQ+@)P3 zIw+lizsj(XH2j!~iH{-5MT#7Hlrw1Y)3%jyXEbbvA3hU;BY$ariiKIqntl;QByh6F z+_1`knXfX3qI_a9B+MAG&4xT8-3a3vT$$i+2-EO{OtNX;Gwzxjzae4%uaS9*QM=W* zFZe4SV&;x{rs)}3G#H2KNn_sKl8=j0G^+I3(XyS=;;5H$nX+WQoxm)QMixVJOTjP0 zpk`gow(y-Rx8ni^X_U!kwI7g}O^%-AYvG!U+#)&xb#{)6h=1DOnKlvf;bp%cs@BKF zoHcct;7{*oq*Hx#{g~~Vixq@bmV1Q?#WQh^cLDF@I9pMdIBG+LcPtTXx$8SWi1oUq zaczU_H{`iL_uefYF7x$^>%GDrPsOnh7Ee8TQb*mUW_*&z))zGr;#Phr_K34$iD^1Y zX?NO0<5aAFAgh&c*euW$bY5+`5z#tK7NcuB^s-~3M?x>vgZLQfHW@t?SwXH#L<`cp 
zqkNw_e|RBwtXdp1rb5&CwAptj6DeYdre*s~962-8zB>Kf$?_bd@eZ}}vdJ^zcXuWm zs^yKVMZaBF*^6~ahccWO=Dz#*o`arJ{Bxg3vIjq@lXzCXT|D~MBSF;XPn}*usR+2) zsV*qdUX-ur)i|51JH+X!ap)N9`Py!TAxT{-U5q)|vGYZUxrr#{MfF&h@F8j^Uu(Nc zom!XF?a9|ljr)h^lVyxDS_^WlJ&2=BUo))Xt4+@<4LyuIbVDfn>w=+l`3#uX^s=U; zW+V+K1}s)fTyLLFG5K*B;2E|xvvs|vPUBa?r*>OZ>VHGtXrJB9w9ZV&Z{$?z3$&%{2>nbmGIP`m;7 zvX6W<%mc4TI%vyvD^l@?gG}1BZSw|`_XT8j&yKsPts2ZErPt(1HN9z|5jM%CuRF14 zFCUPVc3DQ3t;eb!>bt57w#}o1uoF?ssz%hg>*=Wa8d`CzT`yU+HdlSx$pV{-g~k=x z8@lzgt^w+XtPR4HZqX|Ke7=3EkpaB>d6I4mZr_GuQs!B20fhGRt{9#ph?t2|yzS9$ z!C0OoM%3nACrvPa)5F;afPYX?0Yi}+%Gy5O!EVA`MX8!|k#8})cju3+ys1niZDn%mrar#C^~r2(dH_PP%o{`W!<;3I*~2qYK7*Ac!r+c920^p zLQ}%XEXD#@NK8V%Ya!{Cr&Md(W#+RAO-F1v9DJZ_;#@vwK`2dltL~g?8vW5??^d29 zxfQ2;tf+#7#CBA(eoQa#RQcPCZ{ZexD%3pHqZh8s92n!h4;s`f10~L%S0%JZ#D6&a z=D(y2F1`dLTX{fNuG1??01llV7{NH-mi=i<{&W!t0vY`WQy_5Wphx!w2h#?ElN3 z@aO7F0>SV?Q$QBKo2qdU?32~f& z>_Kbx%l+G@=_%b1ssK&act4^`5(saVWFAuHw&&(C!NiR{+1hi2uPuqfe+AF7V+*SL zKV?V&NkhRN<{r*1^X(xlrBJlL=UH>SzF9YY#2x!*U9rWe%WWbVF3?=^NdYjUcTE_f z7ckz&yf<#QAbuL}Hm+swLTC3}_)=}`CW|t##nsE|fgGp%6qTw{xDS!niPQECWP^m? z7kZMW)qsl;x{IEmUjn(O<>Q zzBXv3b=xiKwU=mh_Pgj_`ZGel^myMf@4`uk5m<%Z92!c!?I2!B6j9!Jh_JeKty;i3 zQnjt;gV)XMTep2PN+0Q8F;%(cn2&?M4WH`L*2#lW*E0C0arroMXdBJZuAWjGpP`Ql zUO+-tC-Y?^L|qu49Kk8(Y3F}@yg|$0wyH|TSW>SZ%9LCbySS;=W!>6VjjBw4jc39Q zH_o{2K)=kHjhUOJb*4)Q==Y)CD{ZwC2kp~+5RfRB87Z)?4poxuBi&`RXY4fG`?bBn z>=58lQ>?B%CD*P#34dKW;Gnk-I=01R)aq(m0SJ3(DvZ;Bd^bVgCeG@diciE zb4i^tIWeZ3%UrWGeA48(H}*ewOVSV@4hp8HA&>Eu5NVD?hGfiLHdp=Wm`<^SE$<#J z4mH}n6?>Y~UxKT0=6Y>p18IRehy#T8aj$tFm9#G!;w-*65P$7N8urnopbdw`v4SzC#k6Tuy58u?-kJ`+<*e%&omzZ{DEGyDj_aXJS1m+NX zlO=!pT9qWd!6DvS%gf}3GUEV-nmL&2bM`x}H1DbuiA;Y$(?Q{{onUh@^`!AO zOvvnUNLd+MQVqqpzkyls7-{&_yikgz+hdnSHz8y>yA9$y6PeH#arQ_*Ew0u9zrKqm z8Xm-VdaF@JXH&RU=-d6}T$WG+W;dVqK2w?1x?VNbr6%NWu~1v$h&WP>nw0D{XEaO7^~Xldag}s@Rd9B9hn4jXG+!f zKCAe4D{kloK=O646Q5&?=k8dhHJo3N&ksiQbe4C7vaBx17aDXtXsr`>bX-$ew<4+M zn~G|EKik)%AoSEwUNO`b?HXBtYfm0hUKH+ zT+ZXWMVu3DvxplsEdsC9n`A!WHluy0&&T`#ejBkJ&}g-MRq*sv{_Vdo@vdF=aynM}IV^v3J6GAug!^qx0Z?!nq{o6W