diff --git a/.gitmodules b/.gitmodules index 23ce5ff059b1b..001504ec9ed07 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "kompute"] path = ggml/src/ggml-kompute/kompute url = https://github.com/nomic-ai/kompute.git +[submodule "ggml-tsi-kernel"] + path = ggml-tsi-kernel + url = git@github.com:tsisw/ggml-tsi-kernel.git diff --git a/CMakeLists.txt b/CMakeLists.txt index ac3e9090336d9..a4d51cdbe2dc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,10 +5,61 @@ include(CheckIncludeFileCXX) #set(CMAKE_WARN_DEPRECATED YES) set(CMAKE_WARN_UNUSED_CLI YES) +if (GGML_TSAVORITE) + if (NOT DEFINED GGML_TSAVORITE_TARGET) + set(GGML_TSAVORITE_TARGET "posix") + endif() + if (NOT ${GGML_TSAVORITE_TARGET} STREQUAL fpga) + set(GGML_TSAVORITE_TARGET "posix") + endif() + + if (NOT DEFINED MLIR_COMPILER_DIR) + if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) + set (MLIR_COMPILER_DIR /proj/rel/sw/sdk-r.0.1.3/compiler) + message("MLIR_SDK_VERSION not set defaulting to ${MLIR_COMPILER_DIR}") + else() + set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) + endif() + endif() + + if (NOT DEFINED RUNTIME_DIR) + if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) + set (RUNTIME_DIR /proj/rel/sw/sdk-r.0.1.3/${GGML_TSAVORITE_TARGET}/runtime) + message("MLIR_SDK_VERSION not set defaulting to ${RUNTIME_DIR}") + else() + set (RUNTIME_DIR $ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) + endif() + endif() + + if (NOT DEFINED GGML_TSI_KERNEL_DIR) + set (GGML_TSI_KERNEL_DIR ${CMAKE_SOURCE_DIR}/ggml-tsi-kernel/${GGML_TSAVORITE_TARGET}) + endif() + + + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + set(CMAKE_CROSSCOMPILING ON) + set(ARCH_FLAGS -march=armv8-a) + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" "${RUNTIME_DIR}/../utils/lib/TsavRTShimCAPI.cpp.o") + message("Setting target as fpga") + elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "${MLIR_COMPILER_DIR}/lib/libTsavRTPosixShimCAPI.so") + message("Setting target as posix for tsavorite") + endif() + + set(GGML_TSAVORITE_TARGET "${GGML_TSAVORITE_TARGET}" CACHE STRING "Target for tsavorite") + set (TSAVORITE_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/ggml/src/ggml-tsavorite/include) + + include_directories(${TSAVORITE_INCLUDE_DIR}) + include_directories(${MLIR_COMPILER_DIR}/include/runtime/shim) + include_directories(${RUNTIME_DIR}/include) + message("tsavorite backend is enabled") +endif() + set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + #set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() @@ -82,9 +133,18 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd party libs -option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) +if (GGML_TSAVORITE) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) + else() + option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) + endif() +else() + option(LLAMA_CURL "llama: 
use libcurl to download model from an URL" ON)
+endif()
+
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
diff --git a/README.md b/README.md
index d1cb8d8336229..f222c9a1a8ae1 100644
--- a/README.md
+++ b/README.md
@@ -580,3 +580,59 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
 - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
 - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
+
+#### TSI compilation steps
+```bash
+#Pull the repo from tsisw as follows
+git clone git@github.com:tsisw/llama.cpp.git
+
+#Ensure prerequisites are met as follows
+cd llama.cpp/
+echo 'updating submodule'
+git submodule update --recursive --init
+cd ggml-tsi-kernel/
+module load tsi4 gcc/13.3.0
+export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.3
+echo 'creating python virtual env'
+/proj/local/Python-3.10.12/bin/python3 -m venv blob-creation
+source blob-creation/bin/activate
+echo 'installing mlir and python dependencies'
+pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt
+pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.3.0-py3-none-any.whl
+pip install onnxruntime-training
+
+#Build TSI kernels for the Tsavorite backend
+#First for FPGA
+cd fpga-kernel
+cmake -B build-fpga
+./create-all-kernels.sh
+#Then for the Posix use cases
+cd ../posix-kernel/
+./create-all-kernels.sh
+
+#Change directory to the top-level llama.cpp
+cd ../../
+
+#Compile for posix with build-posix as the target folder
+cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
+cmake --build build-posix --config Release
+
+#Compile for fpga with build-fpga as the target folder
+export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
+export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
+cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga
+cmake --build build-fpga --config Release
+
+#For an easy build one can also use tsi-pkg-build.sh, which creates an FPGA-specific tar bundle tsi-ggml.tz.
+#If you want to release the build, update TSI-VERSION in tsi-pkg-build.sh and add release as a parameter
+#when running ./tsi-pkg-build.sh. Note that it will overwrite what exists in /proj/rel/sw/ggml, so be sure
+#you want to do it. Example: ./tsi-pkg-build.sh release
+./tsi-pkg-build.sh
+
+```
+
+## References
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index a7ff3ac16c446..9eafc9bb2b659 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -145,8 +145,16 @@ endif ()
 target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_17) -target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +if (GGML_TSAVORITE) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${TLIBS} PUBLIC llama Threads::Threads) + else() + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) + endif() +else() + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +endif() # # copy the license files diff --git a/common/log.h b/common/log.h index c56bb50d95db0..0e23b669fcc22 100644 --- a/common/log.h +++ b/common/log.h @@ -90,11 +90,20 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) #define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__) #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) +#if ENABLE_LOG #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__) #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__) #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__) #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__) #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__) +#else +#define LOG_INF(...) +#define LOG_WRN(...) +#define LOG_ERR(...) +#define LOG_DBG(...) +#define LOG_CNT(...) +#endif +#define LOG_TSAVORITE(...) LOG_TMPL(GGML_LOG_LEVEL_TSAVORITE, 0, __VA_ARGS__) #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__) #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__) diff --git a/docs/build.md b/docs/build.md index c9027c0b580a5..e4de6070ee4cc 100644 --- a/docs/build.md +++ b/docs/build.md @@ -559,3 +559,66 @@ The GPU may still be used to accelerate some parts of the computation even when In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. To see a list of available devices, use the `--list-devices` option. Backends can be built as dynamic libraries that can be loaded dynamically at runtime. This allows you to use the same llama.cpp binary on different machines with different GPUs. To enable this feature, use the `GGML_BACKEND_DL` option when building. 
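For example, a minimal sketch (assuming the standard llama.cpp build layout and the `llama-cli` binary; paths and flags other than `GGML_BACKEND_DL`, `--list-devices`, and `--device` are illustrative) of building with dynamically loadable backends and then checking which devices are visible at runtime:

```bash
# configure and build with backends compiled as loadable libraries
cmake -B build -DGGML_BACKEND_DL=ON
cmake --build build --config Release

# list the backend devices available at runtime; pick one with --device
./build/bin/llama-cli --list-devices
```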
+
+## TSI compilation steps
+
+Following are the instructions to compile for the TSI FPGA and Posix backends.
+
+Pull the repo from tsisw as follows
+```bash
+git clone git@github.com:tsisw/llama.cpp.git -b FIR-699
+```
+
+Ensure prerequisites are met as follows
+```bash
+cd llama.cpp/
+git submodule update --recursive --init
+cd ggml-tsi-kernel/
+module load tsi4 gcc/13.3.0
+python3 -m venv blob-creation
+source blob-creation/bin/activate
+pip install -r /proj/rel/sw/sdk-r.0.1.3/compiler/python/requirements-common.txt
+pip install /proj/rel/sw/sdk-r.0.1.3/compiler/python/mlir_external_packages-1.3.0-py3-none-any.whl
+pip install onnxruntime-training
+```
+
+Build TSI kernels for the Tsavorite backend.
+First for FPGA
+```bash
+cd fpga-kernel
+cmake -B build-fpga
+./create-all-kernels.sh
+```
+Then for the Posix use cases
+```bash
+cd ../posix-kernel/
+./create-all-kernels.sh
+```
+
+Change directory to the top-level llama.cpp
+```bash
+cd ../../
+```
+
+Compile for posix with build-posix as the target folder
+```bash
+cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
+cmake --build build-posix --config Release
+```
+
+Compile for fpga with build-fpga as the target folder
+```bash
+export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
+export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
+cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga
+cmake --build build-fpga --config Release
+```
+For an easy build one can also use tsi-pkg-build.sh, which creates an FPGA-specific tar bundle tsi-ggml.tz.
+If you want to release the build, update TSI-VERSION in the file tsi-pkg-build.sh and add release as a parameter
+when running ./tsi-pkg-build.sh. Note that it will overwrite what exists in /proj/rel/sw/ggml, so be sure you want to do
+it.
Example ./tsi-pkg-build.sh release + +```bash +./tsi-pkg-build.sh +``` diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 15c5c68c6f402..0d9272b663d1a 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET llama-gguf-hash) add_executable(${TARGET} gguf-hash.cpp) +target_link_libraries(${TARGET} PRIVATE ${TLIBS}) install(TARGETS ${TARGET} RUNTIME) # clibs dependencies diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index fb04eb83f34ce..48365a0b054ce 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index fba78ceda6fd7..f7626a45dedd8 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -1,23 +1,23 @@ set(TARGET llama-lookup) add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-create) add_executable(${TARGET} lookup-create.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-merge) add_executable(${TARGET} lookup-merge.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-stats) add_executable(${TARGET} lookup-stats.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt index 567f7fbbbf43a..cdf65e58a9d7d 100644 --- a/examples/simple-chat/CMakeLists.txt +++ b/examples/simple-chat/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-simple-chat) add_executable(${TARGET} simple-chat.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 104ecabfd7236..a87dac20c82da 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -1,5 +1,24 @@ +# +# simple-ctx set(TARGET llama-simple) add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama ${TLIBS} ${CMAKE_THREAD_LIBS_INIT}) 
target_compile_features(${TARGET} PRIVATE cxx_std_17) + +# +if (GGML_TSAVORITE) + # + # tsavorite backend test cases + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o") + else() + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o") + endif() + # + # simple-backend-tsi + + set(TEST_TARGET simple-backend-tsi) + add_executable(${TEST_TARGET} simple-backend-tsi.cpp) + target_link_libraries(${TEST_TARGET} PRIVATE ggml ${TLIBS} dl rt) +endif() diff --git a/examples/simple/simple-backend-tsi.cpp b/examples/simple/simple-backend-tsi.cpp new file mode 100644 index 0000000000000..2f56f34168062 --- /dev/null +++ b/examples/simple/simple-backend-tsi.cpp @@ -0,0 +1,578 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-tsavorite.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NUM_INPUT_TENSORS 2 +#define NUM_INPUT_URINARY_TENSORS 1 +#define NUM_ELEMENTS 32 +#define NUM_ELEMENTS_SCALE 32*4 + 25 + +// index 0 for addition, index 1 for subtraction, index 2 for multiplication, index 3 for division +float test_input_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {1.1, 2.3, 3.2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SUB KERNEL + {2.2, 10.3, 10.4, 2.2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //MULT KERNEL + {1.1, 2.3, 3.2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //DIV KERNEL + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + // SQRT Kernel + {1, 4, 9.6, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024}, + //NEG Kernel + {1.1, -4.4, 10, -5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -23, 24, 25, -26, 27, -28, 29, -30, 31, -32.6}, + //ABS Kernel + {1.1, -4.4, 10, -5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -23, 24, 25, -26, 27, -28, 29, -30, 31, -32.6}, + //SIN Kernel + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6} +}; +float test_input_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {1.1, 2.2, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SUB KERNEL + {1.1, 2.2, 3.0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //MULT KERNEL + {1.1, 2.2, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //DIV KERNEL + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //Below ROW value not used for Unary OPS-SQRT, NEG, ABS, SIN + //SQRT KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //NEG KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 
31, 32}, + //ABS KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SIN Kernel input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} +}; + +float test_result[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {2.20, 4.50, 6.50, 8.00, 10.00, 12.00, 14.00, 16.00, 18.00, 20.00, 22.00, 24.00, 26.00, 28.00, 30.00, 32.00, 34.00, 36.00, 38.00, 40.00, 42.00, 44.00, 46.00, 48.00, 50.00, 52.00, 54.00, 56.00, 58.00, 60.00, 62.00, 64.00}, + //SUB KERNEL + {1.1, 8.1, 7.4, -1.8, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00}, + //MULT KERNEL + {1.21, 5.06, 10.56, 16.00, 25.00, 36.00, 49.00, 64.00, 81.00, 100.00, 121.00, 144.00, 169.00, 196.00, 225.00, 256.00, 289.00, 324.00, 361.00, 400.00, 441.00, 484.00, 529.00, 576.00, 625.00, 676.00, 729.00, 784.00, 841.00, 900.00, 961.00, 1024.00}, + //DIV KERNEL + {1.0, 2.0, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SQRT Kernel + {1, 2, 3.098387, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //NEG Kernel + {-1.1, 4.4, -10, 5, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, 23, -24, -25, 26, -27, 28, -29, 30, -31, 32.6}, + //ABS Kernel + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6}, + //SIN Kernel + {0.891207, -0.951602, -0.544021, -0.958924, -0.958924, -0.279416, 0.656987, 0.989358, 0.412118, -0.544021, -0.999990, -0.536573, 0.420167, 0.990607, 0.650288, -0.287903, -0.961398, -0.750987, 0.149877, 0.912945, 0.912945, 0.912945, -0.846220, -0.905578, -0.132352, 0.762559, 0.956376, 0.270906, -0.663634, -0.988032, -0.404039, 0.926149} +}; + +float test_input_scale_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + //ADD KERNEL + {1.3, 2.3, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + //SUB KERNEL + {8.5, 2.5, 3.5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 64, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 4, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 2, 4, 8, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + //MULT KERNEL + {1.5, 2.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //DIV KERNEL + {4.2, 8.4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SQRT KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //NEG KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //ABS KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SIN KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} +}; + +float test_input_scale_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + // ADD KERNEL + {1.3, 2.3, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + // SUB KERNEL + {1, 8, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 
29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 6, 8, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + // MULT KERNEL + {2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // DIV KERNEL + {2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //Below ROW value not used for Unary OPS-SQRT, NEG, ABS, SIN + //SQRT KERNEL input not used + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //NEG KERNEL input not used + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //ABS KERNEL input not used + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SIN KERNEL input not used + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} +}; +float 
test_result_scale[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + // ADD KERNEL + {2.6, 4.6, 6.6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50}, + // SUB KERNEL + {7.5, -5.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + -5, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + 1, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + // MULT KERNEL + {3, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // DIV KERNEL + {2.1, 4.2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // SQRT KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // NEG KERNEL + {1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 9, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 16, -25, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + // ABS KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // SIN KERNEL + {-0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.412118,-0.756802, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.287903,-0.132352, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471} +}; + +// This is a simple model with two tensors a and b +struct simple_model { + struct ggml_tensor * a; + struct ggml_tensor * b; + + // the backend to perform the computation (TSAVORITE) + ggml_backend_t backend = NULL; + + // the backend buffer to storage the tensors data of a and b + ggml_backend_buffer_t buffer; + + // the context to define the tensor information (dimensions, size, memory address) + struct ggml_context * ctx; +}; + + +static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + +static bool ggml_tsi_compare_two_float(float a, float b) { + float epsilon = 1e-5; + float absA = abs(a); + float absB = abs(b); + float diff = abs(a - b); + float minV = std::numeric_limits::min(); + float maxV = std::numeric_limits::max(); + + if (a == b) { // shortcut, handles infinities + return true; + } else if (a == 0 || b == 0 || (absA + absB < minV)) { + // a or b is zero or both are extremely close to it + // relative error is less meaningful here + return diff < (epsilon * minV); + } + // use relative error + return diff /std::min((absA + absB), maxV) < epsilon; +} + + +static bool load_model(simple_model & model, float * a, float * b, enum ggml_type data_type, int elements_A, int elements_B) { + ggml_log_set(ggml_log_callback_default, nullptr); + + // initialize the backend + fprintf(stderr, "%s: using TSavorite backend \n", __func__); + model.backend = ggml_backend_tsavorite_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_tsavorite_init() failed\n", __func__); + return false; + } + + int num_tensors; + + if (!b) + num_tensors = NUM_INPUT_URINARY_TENSORS; 
+ else + num_tensors = NUM_INPUT_TENSORS; + + // Since we are not passing the mem_buffer ggml context will create + /* .mem_buffer = params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size) */ + // mem_buffer for ctx is used for any object creation and used for tensor data if + // backend doesnt have own memory + // Since we are using backend memory hence i have removed extra bytes: 100, removed from mem_size at below + struct ggml_init_params params { + /*.mem_size =*/ (ggml_tensor_overhead() * num_tensors), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + fprintf(stderr, "\n Calculating mem_size %ld %d and creating ggml context \n", ggml_tensor_overhead(), num_tensors); + + // create context + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init failed\n", __func__); + return false; + } + + // create tensors + // // BELOW CODE NO CHANGE FOR tsavorite Backend + // Tensor just created with OBJ(Structure)+Tensor(structure) + // Still Buffer need to attached to Tensor since we are using Backend + // We will using tsi_alloc called under tsavorite-backend + + fprintf(stderr, "\n Creating input Tensor \n"); + + //int64_t ne[GGML_MAX_DIMS]; // number of elements + //size_t nb[GGML_MAX_DIMS]; // stride in bytes: + model.a = ggml_new_tensor_1d(model.ctx, data_type, elements_A); + if (b) + model.b = ggml_new_tensor_1d(model.ctx, data_type, elements_B); + + // create a backend buffer (backend memory) and alloc the tensors from the context + fprintf(stderr, "\n Creating Backend Buffer \n"); + + // Here at ggml Context we have only two input tensors, hence backend memory is + // created for two input tensors + model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); + + // load data from cpu memory to backend buffer + fprintf(stderr, "\n Loading Input Tensor Data to Backend Buffer \n"); + + // loading the data to tensor + ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a)); + if (b) + ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); + + // create a array to print input tensor + std::vector out_data(ggml_nelements(model.a)); + // bring the data from the backend memory + ggml_backend_tensor_get(model.a, out_data.data(), 0, ggml_nbytes(model.a)); + + + fprintf(stderr, "\nBringing tensor data from Backend buffer and printing %d tensor data:\n[", (int) model.a->ne[0]); + + for (int i = 0; i < model.a->ne[0] /* cols */; i++) { + fprintf(stderr, " %.2f", out_data[i]); + } + fprintf(stderr, " ]\n"); + return true; +} + +// build the compute graph +static struct ggml_cgraph * build_graph(const simple_model& model, enum ggml_tsavorite_kernel_type ops_type) { + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + + // create a temporally context to build the graph + struct ggml_context * ctx0 = ggml_init(params0); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + + struct ggml_tensor * result; + switch(ops_type) { + case GGML_TSAVORITE_KERNEL_TYPE_ADD: + result = ggml_add(ctx0, model.a, model.b); + break; + case GGML_TSAVORITE_KERNEL_TYPE_SUB: + result = ggml_sub(ctx0, model.a, model.b); + break; + case GGML_TSAVORITE_KERNEL_TYPE_MULT: + result = ggml_mul(ctx0, model.a, model.b); + break; + case GGML_TSAVORITE_KERNEL_TYPE_DIV: + result = 
ggml_div(ctx0, model.a, model.b); + break; + case GGML_TSAVORITE_KERNEL_TYPE_SQRT: + result = ggml_sqrt(ctx0, model.a); + break; + case GGML_TSAVORITE_KERNEL_TYPE_NEG: + result = ggml_neg(ctx0, model.a); + break; + case GGML_TSAVORITE_KERNEL_TYPE_ABS: + result = ggml_abs(ctx0, model.a); + break; + case GGML_TSAVORITE_KERNEL_TYPE_SIN: + result = ggml_sin(ctx0, model.a); + break; + default: + ggml_free(ctx0); + fprintf(stderr, "\n Non Supported Operation \n"); + return NULL; + } + // build operations nodes + ggml_build_forward_expand(gf, result); + + // delete the temporally context used to build the graph + ggml_free(ctx0); + return gf; +} + +// compute with backend +static struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr, enum ggml_tsavorite_kernel_type ops_type) { + // reset the allocator to free all the memory allocated during the previous inference + + fprintf(stderr, "\n Under Test case for compute API creating build_graph \n"); + struct ggml_cgraph * gf = build_graph(model, ops_type); + if (!gf) { + fprintf(stderr, "\ncompute failed\n"); + return NULL; + } + + // allocate tensors + ggml_gallocr_alloc_graph(allocr, gf); + + ggml_backend_graph_compute(model.backend, gf); + + // in this case, the output tensor is the last one in the graph + return ggml_graph_node(gf, -1); +} + +enum ggml_tsavorite_kernel_type convert_testcase_to_ops_type (const char *testCase) { + if (!strcmp(testCase,"add")) + return GGML_TSAVORITE_KERNEL_TYPE_ADD; + else if (!strcmp(testCase,"sub")) + return GGML_TSAVORITE_KERNEL_TYPE_SUB; + else if (!strcmp(testCase,"mult")) + return GGML_TSAVORITE_KERNEL_TYPE_MULT; + else if (!strcmp(testCase,"div")) + return GGML_TSAVORITE_KERNEL_TYPE_DIV; + else if (!strcmp(testCase,"sqrt")) + return GGML_TSAVORITE_KERNEL_TYPE_SQRT; + else if (!strcmp(testCase,"neg")) + return GGML_TSAVORITE_KERNEL_TYPE_NEG; + else if (!strcmp(testCase,"abs")) + return GGML_TSAVORITE_KERNEL_TYPE_ABS; + else if (!strcmp(testCase,"sin")) + return GGML_TSAVORITE_KERNEL_TYPE_SIN; + + fprintf(stderr, "\n un-supported test case %s hence running default test case which is add operation \n", testCase); + return GGML_TSAVORITE_KERNEL_TYPE_ADD; +} + +int main(int argc, char *argv[]) { + ggml_time_init(); + bool test_case_flag = true; + enum ggml_tsavorite_kernel_type ops_type; + simple_model model; + float *input1[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + float *input2[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + float *result_data[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + bool data_scale = false; + + int elements_A=0, elements_B=0; + int num_of_input_tensors; + + if (argc > 1) { + ops_type = convert_testcase_to_ops_type(argv[1]); + if (argc > 2 && !strcmp(argv[2], "scale")) + data_scale = true; + } else { + // Default Case + ops_type = convert_testcase_to_ops_type("add"); + } + if (ops_type == GGML_TSAVORITE_KERNEL_TYPE_SQRT || + ops_type == GGML_TSAVORITE_KERNEL_TYPE_NEG || + ops_type == GGML_TSAVORITE_KERNEL_TYPE_ABS || + ops_type == GGML_TSAVORITE_KERNEL_TYPE_SIN) + num_of_input_tensors = NUM_INPUT_URINARY_TENSORS; + else + num_of_input_tensors = NUM_INPUT_TENSORS; + + if (data_scale) { + input1[ops_type] = test_input_scale_1[ops_type]; + elements_A = NUM_ELEMENTS_SCALE; + if (num_of_input_tensors != NUM_INPUT_URINARY_TENSORS) { + input2[ops_type] = test_input_scale_2[ops_type]; + elements_B = NUM_ELEMENTS_SCALE; + } + result_data[ops_type] = test_result_scale[ops_type]; + } else { + input1[ops_type] = test_input_1[ops_type]; + elements_A = NUM_ELEMENTS; + if (num_of_input_tensors != 
NUM_INPUT_URINARY_TENSORS) { + input2[ops_type] = test_input_2[ops_type]; + elements_B = NUM_ELEMENTS; + } + result_data[ops_type] = test_result[ops_type]; + } + + if(!load_model(model, input1[ops_type], input2[ops_type], GGML_TYPE_F32, elements_A, elements_B)) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + // since tsavorite-backend init set the debug level to none, we are overwritting here + ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_DEBUG; + + ggml_gallocr_t allocr = NULL; + + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + + if (!allocr) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + + // create the worst case graph for memory usage estimation + struct ggml_cgraph * gf = build_graph(model, ops_type); + if (!gf) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + ggml_gallocr_reserve(allocr, gf); + size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); + + fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0); + + // perform computation + struct ggml_tensor * result = compute(model, allocr, ops_type); + if (!result) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + fprintf(stderr, "\n Compute Done \n"); + + std::vector out_data(ggml_nelements(result)); + + // bring the data from the backend memory + ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result)); + + // expected result: + + fprintf(stderr, "\n operation type: %d, num of elements %d \n", ops_type, (int) result->ne[0]); + + fprintf(stderr, "\n compute is also done \n"); + for (int i = 0; i < result->ne[0] /* cols */; i++) { + if (ggml_tsi_compare_two_float(out_data[i], result_data[ops_type][i])) { + continue; + } + test_case_flag = false; + fprintf(stderr, "\n result for index %d is not matching expected %f got %f \n", i, result_data[ops_type][i], out_data[i]); + } + + if (test_case_flag == false) { + fprintf(stderr, "\n\n TEST CASE FAILED \n\n"); + return -1; + } + fprintf(stderr, "\n\n TEST CASE PASSED \n\n"); + + // free memory + ggml_free(model.ctx); + + // release backend memory and free backend + //ggml_backend_buffer_free(model.buffer); + ggml_backend_free(model.backend); + return 0; +} diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel new file mode 160000 index 0000000000000..ea3a5d613e821 --- /dev/null +++ b/ggml-tsi-kernel @@ -0,0 +1 @@ +Subproject commit ea3a5d613e82129326c93a22eb3af871e6882530 diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4746d5cb76c08..93a72d6cc84e4 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -267,6 +267,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-kompute.h include/ggml-opt.h include/ggml-metal.h + include/ggml-tsavorite.h include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h new file mode 100644 index 0000000000000..238dcc428da88 --- /dev/null +++ b/ggml/include/ggml-tsavorite.h @@ -0,0 +1,191 @@ +// ------------------------------------------------------------------------------ +// Copyright (c) 2023 Tsavorite Scalable Intelligence, Inc . All rights reserved. 
+// +// +// This file is the confidential and proprietary property of +// Tsavorite Scalable Intelligence, Inc +// +// Possession or use of this file requires a written license from +// Tsavorite Scalable Intelligence, Inc + +/****************************************************************************** + * File: ggml-tsavorite.h + * Author TSI Inc + * + * Description: + * ***************************************************************************/ + +// +// +// Note: this description is outdated +// +// An interface allowing to compute ggml_cgraph with tSovrite +// +// This is a fully functional interface that extends ggml with Hardware Accelerator support for +// tSovrite devices. A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, +// etc.) +// +// How it works? +// +// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this +// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you +// use ggml_tsavorite_graph_compute() +// +// You only need to make sure that all memory buffers that you used during the graph creation +// are mapped to the device unified memory with the ggml_tsavorite_add_buffer() function. This +// mapping is used during the graph evaluation to determine the arguments of the compute kernels. +// +// Synchronization between device and host memory (for example for input and output tensors) +// is done with the ggml_tsavorite_set_tensor() and ggml_tsavorite_get_tensor() functions. +// + +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#include "TestModel.h" + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define TSAVORITE_KERNEL_SIZE 64 +#define TSAVORITE_DEVICE_MAX_BUF_LEN 1024 * 1024 * 128 + +enum ggml_tsavorite_input_tensors_count { + TSAVORITE_UNARY_INPUT_TENSORS = 1, + TSAVORITE_TWO_INPUT_TENSORS = 2 +}; + +enum ggml_tsavorite_log_type { + GGML_TSAVORITE_LOG_NONE, + GGML_TSAVORITE_LOG_CONT, + GGML_TSAVORITE_LOG_ERROR, + GGML_TSAVORITE_LOG_WARN, + GGML_TSAVORITE_LOG_DEBUG, + GGML_TSAVORITE_LOG_INFO, + GGML_TSAVORITE_LOG_ALL +}; + +enum ggml_tsavorite_kernel_mode { + GGML_TSAVORITE_KERNEL_MODE_CPU, + GGML_TSAVORITE_KERNEL_MODE_MLIR +}; + +enum ggml_tsavorite_kernel_mode ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR; +enum ggml_tsavorite_log_type ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ALL; +#define GGML_TSAVORITE_LOG_INFO(...) \ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_INFO) { \ + ggml_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__); \ + } \ + } while (0) +#define GGML_TSAVORITE_LOG_DEBUG(...) \ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_DEBUG) { \ + ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__); \ + } \ + } while (0) +#define GGML_TSAVORITE_LOG_WARN(...) \ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_WARN) { \ + ggml_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__); \ + } \ + } while (0) +#define GGML_TSAVORITE_LOG_ERROR(...) \ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_ERROR) { \ + ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__); \ + } \ + } while (0) +#define GGML_TSAVORITE_LOG_CONT(...) 
\ + do { \ + if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_CONT) { \ + ggml_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__); \ + } \ + } while (0) + +enum ggml_tsavorite_tensor_data_type { + GGML_TSAVORITE_TENSOR_HEADER, + GGML_TSAVORITE_TENSOR_LEAF1, + GGML_TSAVORITE_TENSOR_LEAF2, + GGML_TSAVORITE_TENSOR_NODE, + GGML_TSAVORITE_TENSOR_END_DATA +}; + +enum ggml_tsavorite_kernel_type { + GGML_TSAVORITE_KERNEL_TYPE_ADD, + GGML_TSAVORITE_KERNEL_TYPE_SUB, + GGML_TSAVORITE_KERNEL_TYPE_MULT, + GGML_TSAVORITE_KERNEL_TYPE_DIV, + GGML_TSAVORITE_KERNEL_TYPE_SQRT, + GGML_TSAVORITE_KERNEL_TYPE_NEG, + GGML_TSAVORITE_KERNEL_TYPE_ABS, + GGML_TSAVORITE_KERNEL_TYPE_SIN, + GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, + GGML_TSAVORITE_KERNEL_TYPE_SILU, + + GGML_TSAVORITE_KERNEL_TYPE_COUNT +}; + +// max memory buffers that can be mapped to the device +#define GGML_TSAVORITE_MAX_BUFFERS 64 + +// max number of TSAVORITECommandBuffer used to submit a graph for processing +#define GGML_TSAVORITE_MAX_COMMAND_BUFFERS 8 +#define tsi_nil 0 +#define TSI_UNUSED(x) (void)(x) + +typedef struct tensor_log_ { + uint32_t leaf1_len; + uint32_t leaf2_len; + uint32_t node_len; + enum ggml_tsavorite_tensor_data_type data_type; + enum ggml_tsavorite_kernel_type kernel_type; + uint64_t num_of_op; + FILE *log_file; + const ggml_tensor *tensor; +} tensor_log; + +extern void _mlir_ciface_txe_add_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_sub_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_mult_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_div_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_sqrt_host(void *a, void *res); +extern void _mlir_ciface_txe_neg_host(void *a, void *res); +extern void _mlir_ciface_txe_abs_host(void *a, void *res); +extern void _mlir_ciface_txe_sin_host(void *a, void *res); +extern void _mlir_ciface_txe_sigmoid_host(void *a, void *res); +extern void _mlir_ciface_txe_silu_host(void *a, void *res); +extern void ggml_tsi_log_tensor_data(tensor_log log_data); + +#define NUM_OF_TXES 1 +#define MEM_REF_DESCRIPTOR_RANK 1 + +// +// backend API +// user-code should use only these functions +// + +GGML_BACKEND_API ggml_backend_t ggml_backend_tsavorite_init(void); + +GGML_BACKEND_API bool ggml_backend_is_tsavorite(ggml_backend_t backend); + +GGML_BACKEND_API void ggml_backend_tsavorite_set_abort_callback(ggml_backend_t backend, + ggml_abort_callback abort_callback, + void *user_data); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_tsavorite_buffer_type(void); + +// capture all command buffers committed the next time `ggml_backend_graph_compute` is called +GGML_BACKEND_API void ggml_backend_tsavorite_capture_next_compute(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_tsavorite_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c81ff03fee810..e6830b63ba8e1 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -554,6 +554,7 @@ extern "C" { GGML_LOG_LEVEL_WARN = 3, GGML_LOG_LEVEL_ERROR = 4, GGML_LOG_LEVEL_CONT = 5, // continue previous log + GGML_LOG_LEVEL_TSAVORITE = 42, }; // this tensor... 
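For orientation, a minimal usage sketch of the public entry points declared in ggml-tsavorite.h above (this mirrors in miniature what simple-backend-tsi.cpp does in full; `ggml_backend_free` comes from the regular ggml-backend API, and error handling is reduced to a bail-out):

```cpp
#include "ggml-backend.h"
#include "ggml-tsavorite.h"

#include <cstdio>

int main(void) {
    // initialize the Tsavorite backend (the header assumes a single default device)
    ggml_backend_t backend = ggml_backend_tsavorite_init();
    if (backend == NULL || !ggml_backend_is_tsavorite(backend)) {
        fprintf(stderr, "tsavorite backend not available\n");
        return 1;
    }

    // buffer type used to allocate tensors in backend memory
    // (see the ggml_backend_alloc_ctx_tensors() call in simple-backend-tsi.cpp)
    ggml_backend_buffer_type_t buft = ggml_backend_tsavorite_buffer_type();
    (void) buft;

    ggml_backend_free(backend);
    return 0;
}
```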
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ddea5ad3891e5..0a14bbb74ced7 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -308,6 +308,7 @@ ggml_add_backend(CUDA) ggml_add_backend(HIP) ggml_add_backend(Kompute) ggml_add_backend(METAL) +ggml_add_backend(TSAVORITE) ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31514b5..f48a23bf83151 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -37,6 +37,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_TSAVORITE +#include "ggml-tsavorite.h" +#endif + #ifdef GGML_USE_SYCL #include "ggml-sycl.h" #endif @@ -166,6 +170,11 @@ struct ggml_backend_registry { #ifdef GGML_USE_METAL register_backend(ggml_backend_metal_reg()); #endif + +#ifdef GGML_USE_TSAVORITE + register_backend(ggml_backend_tsavorite_reg()); +#endif + #ifdef GGML_USE_SYCL register_backend(ggml_backend_sycl_reg()); #endif @@ -572,6 +581,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("hip", silent, dir_path); ggml_backend_load_best("kompute", silent, dir_path); ggml_backend_load_best("metal", silent, dir_path); + ggml_backend_load_best("tsavorite", silent, dir_path); ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path); ggml_backend_load_best("vulkan", silent, dir_path); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b30b4cb386f9f..1238093e41c81 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -939,8 +939,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else if (cur_backend_id != -1) { - ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); + // Below Code is Optimization which i am disabling for now since we have not implemented other + // Operation at tsavorite + } else if (cur_backend_id != -1 || (node->op == GGML_OP_UNARY)) { + //ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); + ggml_backend_sched_set_if_supported(sched, node, 0, node_backend_id); } } } diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 1d4259dae5ba7..2cbae62a1dddf 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -53,14 +53,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() endif() - if (GGML_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + message("Target is FPGA no GOMP linked") + else() + if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) - target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - else() - message(WARNING "OpenMP not found") + target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() endif() endif() diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index a19cfb14e0f9f..99c3475fc10cf 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -85,6 +85,7 @@ GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * #define GGML_LOG_ERROR(...) 
ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) +#define GGML_LOG_TSAVORITE(...) ggml_log_internal(GGML_LOG_LEVEL_TSAVORITE , __VA_ARGS__) #define GGML_DEBUG 0 diff --git a/ggml/src/ggml-tsavorite/CMakeLists.txt b/ggml/src/ggml-tsavorite/CMakeLists.txt new file mode 100644 index 0000000000000..323c37df14a8b --- /dev/null +++ b/ggml/src/ggml-tsavorite/CMakeLists.txt @@ -0,0 +1,9 @@ +message(STATUS "Tsavorite framework is found") +# +# tsavorite Kernel Library +add_compile_options(--std=c++20) +ggml_add_backend_library(ggml-tsavorite + ggml-tsavorite.cpp + ) + +target_link_libraries(ggml-tsavorite PRIVATE ${TLIBS} dl rt) diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp new file mode 100644 index 0000000000000..c49d02375921f --- /dev/null +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -0,0 +1,1911 @@ +// -----------------------------------------------------------------------------n +// Copyright (c) 2023 Tsavorite Scalable Intelligence, Inc . All rights reserved. +// +// +// This file is the confidential and proprietary property of +// Tsavorite Scalable Intelligence, Inc +// +// Possession or use of this file requires a written license from +// Tsavorite Scalable Intelligence, Inc + +/****************************************************************************** + * File: ggml-tsavorite.cpp + * Author TSI Inc + * + * Description: + * ***************************************************************************/ + +#include "ggml-tsavorite.h" +#include +#include +#include +#include +#include +#include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include "ggml.h" +#include "HostShimCAPI.h" +#include "tsi-rt/utils/Profiler.h" + +using namespace std; +namespace tsirt = ::tsi::runtime; +typedef struct _txe_device_t *txe_device_s; +typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s; +FILE *tsi_op_log_file; +uint64_t num_of_op; + +#ifdef USE_COMMAND_BUFFERS +typedef struct _txe_command_queue_t *txe_command_queue_s; +typedef struct _txe_dispatch_queue_t *txe_dispatch_queue_s; +typedef struct _txe_command_buffer_t *txe_command_buffer_s; +#endif /* USE_COMMAND_BUFFERS */ +typedef struct ggml_backend_tsavorite_buffer ggml_backend_tsavorite_buffer_s; + +struct _txe_device_t { + char name[100]; + uint32_t max_buf_len; + size_t recommended_max_working_set_size; + size_t current_allocated_size; + int reserved; + struct _stats { + struct _op_run_count { + // Each Kernel operation belong to one tensor. 
Below count will increment for each Node Tensor + uint64_t total_tensor_count; + // This counter increment whenever kernel call are made + uint64_t num_of_kernel_call; + // below field count all tensors whose num of elements are larger than kernel number of + // elements + uint64_t num_of_tensor_spilt; + // For Any application below field maintain smallest tensor num of elem + uint64_t min_num_of_elem; + // For Any application below field maintain largest tensor num of elem + uint64_t max_num_of_elem; + } op_run_count[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + } stats; +}; + +struct _txe_compute_pipeline_state_t { + void (*_mlir_fptr_2_input)(void *, void *, void *); + void (*_mlir_fptr_1_input)(void *, void *); + std::string kernel_name; + int reserved; +}; + +#ifdef USE_COMMAND_BUFFERS +struct _txe_command_queue_t { + int reserved; +}; + +struct _txe_dispatch_queue_t { + int reserved; +}; + +struct _txe_command_buffer_t { + int reserved; +}; +#endif /* USE_COMMAND_BUFFERS */ + +static txe_device_s tsi_system_default_device_create(); + +// kernels + +struct ggml_tsavorite_kernel { + txe_compute_pipeline_state_s pipeline; +}; + +struct ggml_backend_tsavorite_context { +#ifdef USE_COMMAND_BUFFERS + txe_command_queue_s queue; + + txe_dispatch_queue_s d_queue; +#endif /* USE_COMMAND_BUFFERS */ + + struct ggml_tsavorite_kernel kernels[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + + // capture state + bool capture_next_compute; + bool capture_started; + + // command buffer state + int n_cb; // number of extra threads used to submit the command buffers + int n_nodes_0; // number of nodes submitted by the main thread + int n_nodes_1; // remaining number of nodes submitted by the n_cb threads + int n_nodes_per_cb; + + struct ggml_cgraph *gf; + + // the callback given to the thread pool + // void (^encode_async)(size_t ith); + +#ifdef USE_COMMAND_BUFFERS + // n_cb command buffers + 1 used by the main thread + txe_command_buffer_s command_buffers[GGML_TSAVORITE_MAX_COMMAND_BUFFERS + 1]; +#endif /* USE_COMMAND_BUFFERS */ + + // abort ggml_tsavorite_graph_compute if callback returns true + ggml_abort_callback abort_callback; + void *abort_callback_data; + + // picking CPU compute example + int n_threads; + ggml_threadpool_t threadpool; + + uint8_t *work_data; + size_t work_size; +}; + +// global + +// initialized in ggml_backend_tsavorite_reg +static struct ggml_backend_reg g_ggml_backend_tsavorite_reg; +static struct ggml_backend_device g_ggml_backend_tsavorite_device; + +// information about a tSavorite device +// note: assumes single GPU device - the default one +// Need to Add Support for multiple GPU devices +static struct ggml_backend_tsavorite_device_context { + txe_device_s device; + int ref_count; + + char name[128]; +} g_ggml_ctx_dev_main = { + /*.device =*/tsi_nil, + /*.ref_count =*/0, + /*.name =*/"", +}; + +// temporarily defined here for compatibility between ggml-backend and the old API + +struct ggml_backend_tsavorite_buffer { + void *data; + size_t size; +}; + +struct ggml_backend_tsavorite_buffer_context { + void *all_data; + size_t all_size; + bool owned; + + // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap + int n_buffers; + ggml_backend_tsavorite_buffer_s buffers[GGML_TSAVORITE_MAX_BUFFERS]; +}; + +static txe_device_s tsi_system_default_device_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = (txe_device_s)malloc(sizeof(struct _txe_device_t)); + device->max_buf_len = TSAVORITE_DEVICE_MAX_BUF_LEN; + 
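// Note: the recommended working-set size below simply mirrors max_buf_len; this default device descriptor does not yet track a separate hardware memory limit.
+  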
device->recommended_max_working_set_size = TSAVORITE_DEVICE_MAX_BUF_LEN; + device->current_allocated_size = 0; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return device; +} + +static void tsi_device_free(txe_device_s device) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + free(device); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +#ifdef USE_COMMAND_BUFFERS +static txe_command_queue_s tsi_command_queue_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_command_queue_s cqueue = (txe_command_queue_s)malloc(sizeof(struct _txe_command_queue_t)); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return cqueue; +} + +static txe_dispatch_queue_s tsi_dispatch_queue_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_dispatch_queue_s dqueue = (txe_dispatch_queue_s)malloc(sizeof(struct _txe_dispatch_queue_t)); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return dqueue; +} + +static void tsi_command_queue_free(txe_command_queue_s cqueue) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (cqueue) + free(cqueue); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +static void tsi_dispatch_queue_free(txe_dispatch_queue_s dqueue) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (dqueue) + free(dqueue); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} +#endif /* USE_COMMAND_BUFFERS */ + +static void tsi_buffer_free(void *data) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (data) + free(data); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +static bool tsi_log_setup() { + tsi_op_log_file = fopen("tsi-op.txt", "w+"); + if (tsi_op_log_file == NULL) { + printf("Error Creating or opening log file\n"); + return false; + } + return true; +} + +void ggml_tsi_log_tensor_data(tensor_log log_data) { + if (!log_data.log_file) { + GGML_TSAVORITE_LOG_ERROR("%s: error: log file Cant be NULL\n", __func__); + return; + } + + switch (log_data.data_type) { + case GGML_TSAVORITE_TENSOR_HEADER: + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "#############################################################\n"); + fprintf(log_data.log_file, + "Tensor Number %ld and Type %d \n leaf1 len %d, leaf2 len %d, Node len %d\n", + log_data.num_of_op, log_data.kernel_type, log_data.leaf1_len, log_data.leaf2_len, + log_data.node_len); + fprintf(log_data.log_file, "############################################################\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + case GGML_TSAVORITE_TENSOR_LEAF1: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "leaf1 Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_LEAF2: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "leaf2 Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_NODE: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "Node Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_END_DATA: + fprintf(log_data.log_file, "DONE WITH THIS OPERATION %ld\n", log_data.num_of_op); + fprintf(log_data.log_file, "############################################################\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + default: + GGML_TSAVORITE_LOG_ERROR("%s: error: Invalid Data Type Passed\n", __func__); + return; + } + if (!log_data.tensor) { + 
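// Header and end-of-data records were already handled above; the remaining record types (leaf1/leaf2/node) need a tensor payload to dump.
+    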
GGML_TSAVORITE_LOG_ERROR("%s: error: tensor pointer is NULL\n", __func__); + return; + } + float *p; + int64_t count = (log_data.tensor->ne[0]) * (log_data.tensor->ne[1]) * (log_data.tensor->ne[2]) * + (log_data.tensor->ne[3]); + p = (float *)log_data.tensor->data; + if ((!p) || (count == 0)) { + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "Tensor Data is Empty"); + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + } + fprintf(tsi_op_log_file, "%.16f ", p[0]); + for (int64_t ii = 1; ii < count; ++ii) { + if (!(ii % 4)) + fprintf(log_data.log_file, "\n"); + fprintf(log_data.log_file, "%.16f ", p[ii]); + } + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fflush(log_data.log_file); + return; +} + +static void ggml_tsavorite_disp_stats(struct ggml_backend_tsavorite_context *ctx, + txe_device_s device) { + if (!ctx || !device) { + GGML_TSAVORITE_LOG_ERROR( + "At %s Either backend context or device or both are NULL, hence cant display Stats", + __func__); + return; + } + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + if (!ctx->kernels[i].pipeline) + continue; + GGML_TSAVORITE_LOG_CONT( + "\n %s Operation, total tensor: %lu Number of Kernel Call: %lu Number of tensor got " + "spilt: %lu Min Num of Elem %lu Max Num of Elem %lu \n", + ctx->kernels[i].pipeline->kernel_name.c_str(), + device->stats.op_run_count[i].total_tensor_count, + device->stats.op_run_count[i].num_of_kernel_call, + device->stats.op_run_count[i].num_of_tensor_spilt, + device->stats.op_run_count[i].min_num_of_elem, + device->stats.op_run_count[i].max_num_of_elem); + } + return; +} + +static void _mlir_ciface_txe_add_test (void *src0, void *src1, void *res) +{ + // MemRefDescriptor + if (!src0 || !src1 || !res) + return; + + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + srcP0 = (MemRefDescriptor *)src0; + srcP1 = (MemRefDescriptor *)src1; + nodeP = (MemRefDescriptor *)res; + + uint32_t count = srcP0->shape[Rank - 1]; + float *s0 = (float*)srcP0->data; + float *s1 = (float*)srcP1->data; + float *n = (float*)nodeP->data; + + for(uint32_t i=0; i < count; ++i) + n[i] = s0[i] + s1[i]; + //printf("\n Calling mlir_add cpu function-5 \n"); + return; +} + +static void _mlir_ciface_txe_mult_test (void *src0, void *src1, void *res) +{ + // MemRefDescriptor + if (!src0 || !src1 || !res) + return; + + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + srcP0 = (MemRefDescriptor *)src0; + srcP1 = (MemRefDescriptor *)src1; + nodeP = (MemRefDescriptor *)res; + + uint32_t count = srcP0->shape[Rank - 1]; + float *s0 = (float*)srcP0->data; + float *s1 = (float*)srcP1->data; + float *n = (float*)nodeP->data; + + for(uint32_t i=0; i < count; ++i) + n[i] = s0[i]*s1[i]; + return; +} + +static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_type kernel_type) { + txe_compute_pipeline_state_s kernel_pipeline = + (txe_compute_pipeline_state_s)calloc(1, sizeof(struct _txe_compute_pipeline_state_t)); + bool flag = false; + if (!kernel_pipeline) { + GGML_TSAVORITE_LOG_ERROR("Calloc failing while setting up kernel"); + return NULL; + } + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + switch (kernel_type) { + case GGML_TSAVORITE_KERNEL_TYPE_ADD: + if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) + 
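// CPU mode routes ADD through the local reference loop (_mlir_ciface_txe_add_test) instead of the TXE host kernel, which is handy for validating results without hardware.
+      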
kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add_test; + else + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add_host; + kernel_pipeline->kernel_name = "TXE_ADD"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SUB: + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_sub_host; + kernel_pipeline->kernel_name = "TXE_SUB"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_MULT: + if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult_test; + else + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult_host; + kernel_pipeline->kernel_name = "TXE_MULT"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_DIV: + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_div_host; + kernel_pipeline->kernel_name = "TXE_DIV"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SQRT: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sqrt_host; + kernel_pipeline->kernel_name = "TXE_SQRT"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_NEG: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_neg_host; + kernel_pipeline->kernel_name = "TXE_NEG"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_ABS: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_abs_host; + kernel_pipeline->kernel_name = "TXE_ABS"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SIN: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sin_host; + kernel_pipeline->kernel_name = "TXE_SIN"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sigmoid_host; + kernel_pipeline->kernel_name = "TXE_SIGMOID"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SILU: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_silu_host; + kernel_pipeline->kernel_name = "TXE_SILU"; + flag = true; + break; + default: + break; + } + if (!flag) { + GGML_TSAVORITE_LOG_INFO("Kernel %d not supported \n", kernel_type); + if (kernel_pipeline) { + free(kernel_pipeline); + kernel_pipeline = NULL; + } + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return kernel_pipeline; +} + +static void tsi_kernel_release(txe_compute_pipeline_state_s kernel_pipeline) { + // clear kernel_pipeline + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (kernel_pipeline) { + free(kernel_pipeline); + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +// acquire +static txe_device_s +ggml_backend_tsavorite_device_acq(struct ggml_backend_tsavorite_device_context *ctx) { + assert(ctx != NULL); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + if (ctx->device == tsi_nil) { + ctx->device = tsi_system_default_device_create(); + snprintf(ctx->name, sizeof("txe"), "txe"); + } + + ctx->ref_count++; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ctx->device; +} + +// release +static void ggml_backend_tsavorite_device_rel(struct ggml_backend_tsavorite_device_context *ctx) { + assert(ctx != NULL); + assert(ctx->ref_count > 0); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + ctx->ref_count--; + + // Need to define function txe_device_free + if (ctx->ref_count == 0) { + tsi_device_free(ctx->device); + ctx->device = tsi_nil; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +// We will use Unified Memory this memory is used for buffer +static void *ggml_tsavorite_host_malloc(size_t n) { + void *data = NULL; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + 
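// tsi_alloc returns unified memory that both the host and the TXE kernels can address, so this backend passes tensor pointers straight to the kernels without explicit host/device copies.
+  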
GGML_TSAVORITE_LOG_INFO("\n Allocating memory from tsi_alloc with size %ld \n", n); + data = tsi_alloc(n); + GGML_TSAVORITE_LOG_CONT("\n Allocating memory from tsi_alloc with size %ld starting memory %p\n", + n, data); + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return data; +} + +static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_dev_t dev) { + GGML_TSAVORITE_LOG_INFO("%s: Start\n", __func__); + // Open a file named "tsi-op.txt" in the current directory for writing + num_of_op = 0; + + if (tsi_log_setup() == false) + return NULL; + + std::string mainProfilerName = "GGML Tsavorite "; + tsirt::utils::TSIProfiler::initialize(); + tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName); + + // TSI Run time Initalization + tsi_initialize(NUM_OF_TXES, NULL); + + // init context + struct ggml_backend_tsavorite_context *ctx = (struct ggml_backend_tsavorite_context *)calloc( + 1, sizeof(struct ggml_backend_tsavorite_context)); + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)dev->context; + + // setup the devie context + txe_device_s device = ggml_backend_tsavorite_device_acq(ctx_dev); + GGML_TSAVORITE_LOG_INFO("%s: picking default device: %s\n", __func__, device->name); + for (uint32_t op = GGML_TSAVORITE_KERNEL_TYPE_ADD; op < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++op) { + device->stats.op_run_count[op].total_tensor_count = 0; + device->stats.op_run_count[op].num_of_kernel_call = 0; + device->stats.op_run_count[op].num_of_tensor_spilt = 0; + device->stats.op_run_count[op].min_num_of_elem = 0; + device->stats.op_run_count[op].max_num_of_elem = 0; + } + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; + ctx->work_data = NULL; + ctx->work_size = 0; + ctx->abort_callback = NULL; + ctx->abort_callback_data = NULL; + + // We dont need it for now, we will revisit +#ifdef USE_COMMAND_BUFFERS + // setting up backend context + ctx->queue = tsi_command_queue_create(); + ctx->d_queue = tsi_dispatch_queue_create(); +#endif /* USE_COMMAND_BUFFERS */ + + ctx->capture_next_compute = false; + ctx->capture_started = false; + + ctx->gf = tsi_nil; + // ctx->encode_async = tsi_nil; + +#ifdef USE_COMMAND_BUFFERS + for (int i = 0; i < GGML_TSAVORITE_MAX_COMMAND_BUFFERS; ++i) { + ctx->command_buffers[i] = tsi_nil; + } +#endif /* USE_COMMAND_BUFFERS */ + + // load TSavorite kernels + { + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + ctx->kernels[i].pipeline = tsi_nil; + } + +#define GGML_TSAVORITE_KERNEL(e, supported) \ + if (supported) { \ + ctx->kernels[e].pipeline = tsi_kernel_setup(e); \ + GGML_TSAVORITE_LOG_INFO(" TSAVORITE SUPPORTED KERNEL "); \ + } else { \ + GGML_TSAVORITE_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_" #e); \ + } + + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ADD, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SUB, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_MULT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_DIV, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SQRT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_NEG, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ABS, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIN, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SILU, true); + } + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ctx; +} + +static void 
ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + if (ctx->kernels[i].pipeline) { + tsi_kernel_release(ctx->kernels[i].pipeline); + ctx->kernels[i].pipeline = tsi_nil; + } + } + + // Block_release(ctx->encode_async); + // +#ifdef USE_COMMAND_BUFFERS + tsi_command_queue_free(ctx->queue); + + tsi_dispatch_queue_free(ctx->d_queue); +#endif /* USE_COMMAND_BUFFERS */ + + free(ctx); + + // TSI run time free + GGML_TSAVORITE_LOG_INFO("\n Calling tsi_finalize \n"); + // delay to allow any file operations to complete for runtime + + GGML_TSAVORITE_LOG_INFO("Delaying tsi_finalize for 2 sec"); + sleep(2); + tsi_finalize(); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + tsirt::utils::TSIProfiler::finalize(); + std::cout << "\nGGML Tsavorite Profiling Results:" << std::endl; + std::cout << tsirt::utils::TSIProfiler::getFormattedResults( + /*truncateFuncNames*/ true) + << std::endl; +} + +#if 0 +// finds the tSavorite buffer that contains the tensor data on the TXE device unified memory +// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the +// tSavorite buffer based on the host memory pointer +// +static ggml_backend_tsavorite_buffer_s ggml_tsavorite_get_buffer(struct ggml_tensor * t, size_t * offs) { + // GGML_TSAVORITE_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + const int64_t tsize = ggml_nbytes(t); + + ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; + + struct ggml_backend_tsavorite_buffer_context * buf_ctx = (struct ggml_backend_tsavorite_buffer_context *) buffer->context; + + // find the view that contains the tensor fully + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; + + // GGML_TSAVORITE_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { + *offs = (size_t) ioffs; + + // GGML_TSAVORITE_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return buf_ctx->buffers[i]; + } + } + + GGML_TSAVORITE_LOG_ERROR("%s: error: tensor '%s' buffer is tsi_nil\n", __func__, t->name); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return tsi_nil; +} +#endif + +static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_device_context *ctx_dev, + const struct ggml_tensor *op) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (!ctx_dev) + return false; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + for (size_t i = 0, n = 3; i < n; ++i) { + if (op->src[i] != NULL && op->src[i]->type != GGML_TYPE_F32) { + return false; + } + } + + if (op->type != GGML_TYPE_F32) + return false; + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQRT: + case GGML_OP_SIN: + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_SILU: + break; + default: + return false; + } + break; + default: + return false; + } + 
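// Reaching this point means every operand is F32 and the op is one of the element-wise ADD/SUB/MUL/DIV/SQRT/SIN ops or a supported unary (NEG/ABS/SIGMOID/SILU), so the TXE backend accepts it.
+  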
GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return true; +} + +/* +static void ggml_tsavorite_encode_node( + ggml_backend_t backend, + int idx, + tsi_command_encoder encoder) { +} +*/ + +static void ggml_tsavorite_decompose_unary_kernel_sin(uint32_t num_elem, ggml_tensor *src) { + float *p = (float *)(src->data); + for (uint32_t i = 0; i < num_elem; ++i) { + *p = (*p) / (2 * M_PI); + ++p; + } + return; +} + +static void ggml_tsavorite_decompose_unary_kernel(uint32_t num_elem, ggml_tensor *src, + ggml_tensor *node) { + switch (node->op) { + case GGML_OP_SIN: + ggml_tsavorite_decompose_unary_kernel_sin(num_elem, src); + break; + default: + break; + } + return; +} + +// nodes are intermediate which has multiple src tensors & operation +// Here we create multiple thread +// Each Thread run the command buffer & pick Tensor and execute and get the result back base on +// async or sync all Compute wil finish all tensors execution +static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, + struct ggml_cgraph *cgraph) { +#if 0 + GGML_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_context * ctx = backend->context; + struct ggml_backend_tsavorite_device_context * ctx_dev = backend->device->context; + + // number of nodes encoded by the main thread (empirically determined) + const int n_main = 128; + + // number of threads in addition to the main thread + const int n_cb = ctx->n_cb; + + // submit the ggml compute graph to the TXE by creating command buffers and encoding the ops in them + // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread + // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes + // each thread creates it's own command buffer and enqueues the ops in parallel + + GGML_LOG_INFO("End %s\n", __func__); + return GGML_STATUS_SUCCESS; +#endif + + struct ggml_backend_tsavorite_context *ctx = + (struct ggml_backend_tsavorite_context *)backend->context; + if (!ctx) { + GGML_LOG_ERROR("\n backend ctx is NULL \n"); + return GGML_STATUS_FAILED; + } + +#if 0 + struct ggml_cplan cplan = ggml_graph_plan(cgraph, ctx->n_threads, ctx->threadpool); + + if (ctx->work_size < cplan.work_size) { + delete[] ctx->work_data; + ctx->work_data = new uint8_t[cplan.work_size]; + if (ctx->work_data == NULL) { + ctx->work_size = 0; + return GGML_STATUS_ALLOC_FAILED; + } + ctx->work_size = cplan.work_size; + } + cplan.work_data = (uint8_t *)ctx->work_data; + + cplan.abort_callback = ctx->abort_callback; + cplan.abort_callback_data = ctx->abort_callback_data; +#endif + + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + + if (!device) { + GGML_TSAVORITE_LOG_ERROR("\n tsavorite device is NULL \n"); + return GGML_STATUS_FAILED; + } + // MemRefDescriptor + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + struct ggml_tensor *src0, *src1, *node; + uint32_t num_elem_src0, num_elem_src1, num_elem_node; + enum ggml_tsavorite_kernel_type kernel_type; + // This variable not needed since src0 or node will have max elem size + // and src1 size will min elem size + uint64_t max_num_of_elem, min_num_of_elem; + enum ggml_tsavorite_input_tensors_count num_of_input_tensors; + tensor_log log_data; + + for (int i = 0; i < cgraph->n_nodes; i++) { + node = cgraph->nodes[i]; + src0 = node->src[0]; + src1 = node->src[1]; + min_num_of_elem = 0; + max_num_of_elem = 0; + + switch (node->op) { + case 
GGML_OP_ADD: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD; + num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; + break; + case GGML_OP_SUB: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SUB; + num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; + break; + case GGML_OP_MUL: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_MULT; + num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; + break; + case GGML_OP_DIV: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_DIV; + num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; + break; + case GGML_OP_SQRT: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SQRT; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_OP_SIN: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIN; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_NEG: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_NEG; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_UNARY_OP_ABS: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ABS; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_UNARY_OP_SIGMOID: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIGMOID; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_UNARY_OP_SILU: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SILU; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + default: + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + break; + default: + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + + if (!ctx->kernels[kernel_type].pipeline || + (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input && + !ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input)) { + GGML_TSAVORITE_LOG_ERROR("Kernel Type %d, not supported \n", kernel_type); + return GGML_STATUS_ABORTED; + } + ++num_of_op; + + if (num_of_input_tensors == TSAVORITE_TWO_INPUT_TENSORS) { + if (node->src[0] && node->src[1]) { + if (!src0->data || !src1->data || !node->data) { + GGML_TSAVORITE_LOG_ERROR( + "One of tensor Data doesnt have memory leaf1 %p, leaf2 %p, node %p \n", src0->data, + src1->data, node->data); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + srcP0 = (MemRefDescriptor *)src0->data; + srcP1 = (MemRefDescriptor *)src1->data; + nodeP = (MemRefDescriptor *)node->data; + // This is for tsavorite MemRef Header hence getting header + --srcP0; + --srcP1; + --nodeP; + srcP0->data = srcP0->base = src0->data; + srcP1->data = srcP1->base = src1->data; + nodeP->data = nodeP->base = node->data; + // offset & shape size will be update base on Tensor Size + // TSAVORITE KERNEL CAN Take max of TSAVORITE_KERNEL_SIZE + // Hence we need to load tensor data at multiple iteration + // for large Tensor Dataset + srcP0->offset = 0; + srcP1->offset = 0; + nodeP->offset = 0; + + // currently _mlir_ as restriction to hold max of 64 elements, we need to spilt the work if + // its more than 64, i will address this at future PR Initalizing num_elem + num_elem_src0 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src0->nb[i] != 0; ++i) + num_elem_src0 *= src0->ne[i]; + + num_elem_src1 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src1->nb[i] != 0; ++i) + num_elem_src1 *= src1->ne[i]; + + num_elem_node = 1; + for (int i = 0; i < 
GGML_MAX_DIMS && node->nb[i] != 0; ++i) + num_elem_node *= node->ne[i]; + + if (!num_elem_src0 || !num_elem_src1 || !num_elem_node) { + GGML_TSAVORITE_LOG_ERROR("\nOne or more of Tensor length is zero of kernel_type %d\n", + kernel_type); + return GGML_STATUS_ABORTED; + } + + min_num_of_elem = max_num_of_elem = num_elem_src0; + + if (min_num_of_elem > num_elem_src1) + min_num_of_elem = num_elem_src1; + if (min_num_of_elem > num_elem_node) + min_num_of_elem = num_elem_node; + + if (max_num_of_elem < num_elem_src1) + max_num_of_elem = num_elem_src1; + if (max_num_of_elem < num_elem_node) + max_num_of_elem = num_elem_node; + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + bzero((char *)&log_data, sizeof(log_data)); + log_data.leaf1_len = num_elem_src0; + log_data.leaf2_len = num_elem_src1; + log_data.node_len = num_elem_node; + log_data.log_file = tsi_op_log_file; + log_data.num_of_op = num_of_op; + log_data.kernel_type = kernel_type; + + log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF1; + log_data.tensor = src0; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF2; + log_data.tensor = src1; + ggml_tsi_log_tensor_data(log_data); + } + + ggml_tensor *dst = node; + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + for (int ir = 0; ir < nr; ++ir) { + const int64_t i03 = ir / (ne02 * ne01); + const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01; + const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float *dst_ptr = (float *)((char *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1); + float *src0_ptr = (float *)((char *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01); + float *src1_ptr = (float *)((char *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11); + + for (int64_t r = 0; r < nr0; ++r) { + // While loop is added to handle the scenario when kernel number of elements + // less than ggml tensor number of elements.GGML tensor number of elements decided + // base on application like llama.cpp. 
Currently we have build Kernel elements + // statically hence we have MACRO: TSAVORITE_KERNEL_SIZE to track this + int count = 0; + while (count < ne10) { + int kernel_size; + srcP1->data = srcP1->base = (void *)(src1_ptr + count); + srcP0->data = srcP0->base = (void *)(src0_ptr + r * ne10 + count); + nodeP->data = nodeP->base = (void *)(dst_ptr + r * ne10 + count); + if ((count + TSAVORITE_KERNEL_SIZE) > ne10) + kernel_size = ne10 - count; + else + kernel_size = TSAVORITE_KERNEL_SIZE; + count += kernel_size; + srcP0->shape[Rank - 1] = kernel_size; + srcP1->shape[Rank - 1] = kernel_size; + nodeP->shape[Rank - 1] = kernel_size; + srcP0->strides[Rank - 1] = 0; + srcP1->strides[Rank - 1] = 0; + nodeP->strides[Rank - 1] = 0; + // kernel call + ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input(srcP0, srcP1, nodeP); + ++device->stats.op_run_count[kernel_type].num_of_kernel_call; + } + } + } + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + log_data.data_type = GGML_TSAVORITE_TENSOR_NODE; + log_data.tensor = node; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_END_DATA; + log_data.tensor = NULL; + ggml_tsi_log_tensor_data(log_data); + } + } + } + + if (num_of_input_tensors == TSAVORITE_UNARY_INPUT_TENSORS) { + if (node->src[0]) { + if (!src0->data || !node->data) { + GGML_TSAVORITE_LOG_ERROR( + "input or output tensor Data doesnt have memory leaf %p, node %p \n", src0->data, + node->data); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + srcP0 = (MemRefDescriptor *)src0->data; + nodeP = (MemRefDescriptor *)node->data; + // This is for tsavorite MemRef Header hence getting header + --srcP0; + --nodeP; + srcP0->data = srcP0->base = src0->data; + nodeP->data = nodeP->base = node->data; + // offset & shape size will be update base on Tensor Size + // TSAVORITE KERNEL CAN Take max of TSAVORITE_KERNEL_SIZE + // Hence we need to load tensor data at multiple iteration + // for large Tensor Dataset + srcP0->offset = 0; + nodeP->offset = 0; + + // currently _mlir_ as restriction to hold max of 64 elements, we need to spilt the work if + // its more than 64, i will address this at future PR Initalizing num_elem + num_elem_src0 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src0->nb[i] != 0; ++i) + num_elem_src0 *= src0->ne[i]; + max_num_of_elem = min_num_of_elem = num_elem_src0; + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + bzero((char *)&log_data, sizeof(log_data)); + log_data.leaf1_len = num_elem_src0; + log_data.leaf2_len = 0; + log_data.node_len = num_elem_src0; + log_data.log_file = tsi_op_log_file; + log_data.num_of_op = num_of_op; + log_data.kernel_type = kernel_type; + + log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF1; + log_data.tensor = src0; + ggml_tsi_log_tensor_data(log_data); + } + // While loop is added to handle the scenario when kernel number of elements + // less than ggml tensor number of elements.GGML tensor number of elements decided + // base on application like llama.cpp. 
Currently we have build Kernel elements statically + // hence we have MACRO: TSAVORITE_KERNEL_SIZE to track this + uint32_t count = 0; + + if (node->op == GGML_OP_SIN) { + ggml_tsavorite_decompose_unary_kernel(num_elem_src0, src0, node); + } + while (count < num_elem_src0) { + int kernel_size; + srcP0->data = srcP0->base = (void *)((float *)src0->data + count); + nodeP->data = nodeP->base = (void *)((float *)node->data + count); + if ((count + TSAVORITE_KERNEL_SIZE) > num_elem_src0) + kernel_size = num_elem_src0 - count; + else + kernel_size = TSAVORITE_KERNEL_SIZE; + count += kernel_size; + srcP0->shape[Rank - 1] = kernel_size; + nodeP->shape[Rank - 1] = kernel_size; + srcP0->strides[Rank - 1] = 0; + nodeP->strides[Rank - 1] = 0; + // kernel call + ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input(srcP0, nodeP); + ++device->stats.op_run_count[kernel_type].num_of_kernel_call; + } + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + log_data.data_type = GGML_TSAVORITE_TENSOR_NODE; + log_data.tensor = node; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_END_DATA; + log_data.tensor = NULL; + ggml_tsi_log_tensor_data(log_data); + } + } + } + if (min_num_of_elem > 0) { + ++device->stats.op_run_count[kernel_type].total_tensor_count; + + if (min_num_of_elem > TSAVORITE_KERNEL_SIZE) + ++device->stats.op_run_count[kernel_type].num_of_tensor_spilt; + + if (!(device->stats.op_run_count[kernel_type].min_num_of_elem) || + device->stats.op_run_count[kernel_type].min_num_of_elem > min_num_of_elem) + device->stats.op_run_count[kernel_type].min_num_of_elem = min_num_of_elem; + + if (!(device->stats.op_run_count[kernel_type].max_num_of_elem) || + device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem) + device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem; + } + } + + // This this need to implement correctly when we have mixture of CPU and accelerator operation + // return ggml_graph_compute(cgraph, &cplan); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +#if 0 +static const char * ggml_backend_tsavorite_buffer_get_name(ggml_backend_buffer_t buffer) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "tSavorite"; + + TSI_UNUSED(buffer); +} +#endif + +static void ggml_backend_tsavorite_buffer_free_buffer(ggml_backend_buffer_t buffer) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + +#if 0 + // ctx->all_data & tsi_buffer_free(ctx->buffers[i].data and same memory and created by tsi_alloc + // tsi_finalize called when ggml call backend free all memory + // this fucntion called when ggml free backend particular buffer, currently we cant provide this support + // and just return NoOps + // But at end there is no memory leak but memory can grow since we free at last once backend is shutdown + // We need to revisit this hence i kept the stuff under if 0 + for (int i = 0; i < ctx->n_buffers; i++) { + tsi_buffer_free(ctx->buffers[i].data); + } + ggml_backend_tsavorite_device_rel((struct ggml_backend_tsavorite_device_context *)buffer->buft->device->context); + + if (ctx->owned) { + free(ctx->all_data); + } 
+#endif + + free(ctx); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static void *ggml_backend_tsavorite_buffer_get_base(ggml_backend_buffer_t buffer) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ctx->all_data; +} + +static ggml_status ggml_backend_tsavorite_buffer_init_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor tensor_data_header; + tensor->data = (void *)(sizeof(tensor_data_header) + (char *)tensor->data); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return GGML_STATUS_SUCCESS; + + TSI_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor, uint8_t value, + size_t offset, size_t size) { + if (!tensor || !tensor->data) { + GGML_TSAVORITE_LOG_ERROR("\n tensor or data cant be null under func: %s\n", __func__); + return; + } + memset((char *)tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_set_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor, const void *data, + size_t offset, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + memcpy((char *)tensor->data + offset, data, size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + TSI_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_get_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor *tensor, void *data, + size_t offset, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + memcpy(data, (const char *)tensor->data + offset, size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + TSI_UNUSED(buffer); +} + +static bool ggml_backend_tsavorite_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor *src, + struct ggml_tensor *dst) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, (ggml_nbytes(src))); + return true; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return false; + + TSI_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + if (!ctx || !ctx->all_data) { + GGML_TSAVORITE_LOG_ERROR("\n ctx or all_data cant be null under func: %s\n", __func__); + return; + } + memset((char *)ctx->all_data, value, ctx->all_size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static struct ggml_backend_buffer_i ggml_backend_tsavorite_buffer_i = { + /* .free_buffer = */ ggml_backend_tsavorite_buffer_free_buffer, + /* .get_base = */ ggml_backend_tsavorite_buffer_get_base, + /* .init_tensor = */ ggml_backend_tsavorite_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_tsavorite_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_tsavorite_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_tsavorite_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_tsavorite_buffer_cpy_tensor, + /* .clear = */ ggml_backend_tsavorite_buffer_clear, + /* .reset = */ NULL, +}; + +// default buffer type + +static const char 
*ggml_backend_tsavorite_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "tsavorite"; + + TSI_UNUSED(buft); +} + +static void ggml_backend_tsavorite_log_allocated_size(txe_device_s device, size_t size_aligned) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); +#ifndef GGML_TSAVORITE_NDEBUG +#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) + GGML_TSAVORITE_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", __func__, + size_aligned / 1024.0 / 1024.0, + device.currentAllocatedSize / 1024.0 / 1024.0); +#endif +#endif + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + TSI_UNUSED(device); + TSI_UNUSED(size_aligned); +} + +static ggml_backend_buffer_t +ggml_backend_tsavorite_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)calloc( + 1, sizeof(struct ggml_backend_tsavorite_buffer_context)); + + const size_t size_page = sysconf(_SC_PAGESIZE); + GGML_TSAVORITE_LOG_CONT( + "ggml_backend_tsavorite_buffer_type_alloc_buffer is called from llama data Loader \n"); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + if (!device) + return NULL; + + ctx->all_data = ggml_tsavorite_host_malloc(size_aligned); + ctx->all_size = size_aligned; + ctx->owned = true; + ctx->n_buffers = 1; + GGML_TSAVORITE_LOG_INFO("\n\n\n\n Memory Starting address %p and size %ld \n\n\n", ctx->all_data, + ctx->all_size); + + if (ctx->all_data != NULL) { + GGML_TSAVORITE_LOG_CONT("\nAddress of Newly Created BUffer %p and size %ld \n", ctx->all_data, + ctx->all_size); + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + fprintf(tsi_op_log_file, "Address of Newly Created BUffer %p and size %ld \n", ctx->all_data, + ctx->all_size); + } + ctx->buffers[0].data = NULL; + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + memset((char *)ctx->all_data, 0, ctx->all_size); + } + + if (size_aligned > 0 && (ctx->all_data == NULL)) { + GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, + size_aligned / 1024.0 / 1024.0); + free(ctx); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + return NULL; + } + + // ggml_backend_tsavorite_log_allocated_size(device, size_aligned); + device->current_allocated_size += ctx->all_size; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ggml_backend_buffer_init(buft, ggml_backend_tsavorite_buffer_i, ctx, size); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return 32; + TSI_UNUSED(buft); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + const size_t max_size = device->max_buf_len; + ggml_backend_tsavorite_device_rel( + (struct 
ggml_backend_tsavorite_device_context *)buft->device->context); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return max_size; + + TSI_UNUSED(buft); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const struct ggml_tensor *tensor) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + if (!device) { + GGML_TSAVORITE_LOG_ERROR("\n tsavorite device is NULL \n"); + return 0; + } + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor tensor_data_header; + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + GGML_TSAVORITE_LOG_INFO( + "\n\n\n\n Calculating---- Alloc ----Size header %lu and data %lu \n\n\n\n ", + sizeof(tensor_data_header), ggml_nbytes(tensor)); + + return (sizeof(tensor_data_header) + ggml_nbytes(tensor)); + + TSI_UNUSED(buft); +} + +static bool ggml_backend_tsavorite_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + // For Now CPU is loading all data and then copy some tensor to Tsavorite Backend + // Once we have most of Operation supported by Tsavorite + // We will figure out to make tsavorite Backend also host + return false; + + TSI_UNUSED(buft); +} + +ggml_backend_buffer_type_t ggml_backend_tsavorite_buffer_type(void) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + static struct ggml_backend_buffer_type ggml_backend_buffer_type_tsavorite = { + /* .iface = */ { + /* .get_name = */ ggml_backend_tsavorite_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_tsavorite_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_tsavorite_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_tsavorite_buffer_type_get_max_size, + /* .get_alloc_size = */ + ggml_backend_tsavorite_buffer_type_get_alloc_size, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_tsavorite_buffer_type_is_host, + }, + /* .device = */ &g_ggml_backend_tsavorite_device, + /* .context = */ NULL, + }; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &ggml_backend_buffer_type_tsavorite; +} + +// backend + +static const char *ggml_backend_tsavorite_name(ggml_backend_t backend) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "Tsavorite"; + + TSI_UNUSED(backend); +} + +static void ggml_backend_tsavorite_free(ggml_backend_t backend) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (!backend || !backend->context || !backend->device || !backend->device->context) { + GGML_TSAVORITE_LOG_ERROR("At %s One of more pointer among: Backend, backend_context, " + "device_context or device are NULL", + __func__); + return; + } + struct ggml_backend_tsavorite_context *ctx = + (struct ggml_backend_tsavorite_context *)backend->context; + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)backend->device->context; + ggml_tsavorite_disp_stats(ctx, ctx_dev->device); + + ggml_backend_tsavorite_device_rel(ctx_dev); + ggml_tsavorite_free(ctx); + + free(backend); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static void ggml_backend_tsavorite_synchronize(ggml_backend_t backend) { +// We need to implement ASYN Method to take output of tensor data 
to input of other Tensor +// We will evaluate and implement at later PR +#ifdef SYNC_DEBUG + usleep(100000); +#endif /* SYNC_DEBUG */ + TSI_UNUSED(backend); +} + +static ggml_backend_buffer_type_t +ggml_backend_tsavorite_get_default_buffer_type(ggml_backend_t backend) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ggml_backend_tsavorite_buffer_type(); + + TSI_UNUSED(backend); +} + +static enum ggml_status ggml_backend_tsavorite_graph_compute(ggml_backend_t backend, + struct ggml_cgraph *cgraph) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ggml_tsavorite_graph_compute(backend, cgraph); +} + +static void ggml_backend_tsavorite_set_n_cb(ggml_backend_t backend, int n_cb) { + // GGML_ASSERT(ggml_backend_is_tsavorite(backend)); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + struct ggml_backend_tsavorite_context *ctx = + (struct ggml_backend_tsavorite_context *)backend->context; + + if (ctx->n_cb != n_cb) { + ctx->n_cb = MIN(n_cb, GGML_TSAVORITE_MAX_COMMAND_BUFFERS); + + if (ctx->n_cb > 2) { + GGML_TSAVORITE_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade " + "the performance in some cases\n", + __func__, n_cb); + } + } + +#if 0 + if (ctx->encode_async) { + Block_release(ctx->encode_async); + } +#endif + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static struct ggml_backend_i ggml_backend_tsavorite_i = { + /* .get_name = */ ggml_backend_tsavorite_name, + /* .free = */ ggml_backend_tsavorite_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ ggml_backend_tsavorite_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_tsavorite_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static ggml_guid_t ggml_backend_tsavorite_guid(void) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + static ggml_guid guid = {0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed, + 0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6}; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return &guid; +} + +// This need to be removed in the future +ggml_backend_t ggml_backend_tsavorite_init(void) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_tsavorite_reg(), 0); + + struct ggml_backend_tsavorite_context *ctx = ggml_tsavorite_init(dev); + if (ctx == NULL) { + GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + return NULL; + } + + ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend)); + if (backend) { + backend->guid = ggml_backend_tsavorite_guid(); + backend->iface = ggml_backend_tsavorite_i; + backend->device = dev; + backend->context = ctx; + } + // Will enable later + // ggml_backend_tsavorite_set_n_cb(backend, 1); + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return backend; +} + +bool ggml_backend_is_tsavorite(ggml_backend_t backend) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_tsavorite_guid()); +} + +void ggml_backend_tsavorite_set_abort_callback(ggml_backend_t backend, + ggml_abort_callback abort_callback, + void *user_data) { + 
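// Stores the abort callback and its user data in the backend context; per the context comment, graph compute should stop when this callback returns true (the abort path is still under #if 0 in ggml_tsavorite_graph_compute).
+  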
GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+  struct ggml_backend_tsavorite_context *ctx =
+      (struct ggml_backend_tsavorite_context *)backend->context;
+
+  ctx->abort_callback = abort_callback;
+  ctx->abort_callback_data = user_data;
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+void ggml_backend_tsavorite_capture_next_compute(ggml_backend_t backend) {
+  GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+  struct ggml_backend_tsavorite_context *ctx =
+      (struct ggml_backend_tsavorite_context *)backend->context;
+  ctx->capture_next_compute = true;
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+// backend device
+
+static const char *ggml_backend_tsavorite_device_get_name(ggml_backend_dev_t dev) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return "Tsavorite";
+
+  GGML_UNUSED(dev);
+}
+
+static const char *ggml_backend_tsavorite_device_get_description(ggml_backend_dev_t dev) {
+  // acq/rel just to populate ctx->name in case it hasn't been done yet
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  struct ggml_backend_tsavorite_device_context *ctx_dev =
+      (struct ggml_backend_tsavorite_device_context *)dev->context;
+  ggml_backend_tsavorite_device_acq(ctx_dev);
+  ggml_backend_tsavorite_device_rel(ctx_dev);
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+
+  return ctx_dev->name;
+}
+
+static void ggml_backend_tsavorite_device_get_memory(ggml_backend_dev_t dev, size_t *free,
+                                                     size_t *total) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+  if (!dev || !free || !total) {
+    GGML_TSAVORITE_LOG_INFO("One or more pointers (dev, free, total) are NULL\n");
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return;
+  }
+  *free = 0;
+  *total = 0;
+  struct ggml_backend_tsavorite_device_context *ctx_dev =
+      (struct ggml_backend_tsavorite_device_context *)dev->context;
+  if (ctx_dev) {
+    txe_device_s device = ggml_backend_tsavorite_device_acq(ctx_dev);
+    *total = device->recommended_max_working_set_size;
+    *free = *total - device->current_allocated_size;
+    GGML_TSAVORITE_LOG_CONT("\n TXE Device MEMORY Summary total %lu and free %lu \n", *total,
+                            *free);
+    ggml_backend_tsavorite_device_rel(ctx_dev);
+  }
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return;
+}
+
+// Currently we register the TXE accelerator as a GPU-type device
+static enum ggml_backend_dev_type ggml_backend_tsavorite_device_get_type(ggml_backend_dev_t dev) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+  return GGML_BACKEND_DEVICE_TYPE_GPU;
+
+  GGML_UNUSED(dev);
+}
+
+// Need to understand the scope of this API since it is not called directly here;
+// related call sites: struct llama_model_loader, llm_load_tensors,
+// and llama_new_context_with_model
+static void ggml_backend_tsavorite_device_get_props(ggml_backend_dev_t dev,
+                                                    struct ggml_backend_dev_props *props) {
+  GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+  props->name = ggml_backend_tsavorite_device_get_name(dev);
+  props->description = ggml_backend_tsavorite_device_get_description(dev);
+  props->type = ggml_backend_tsavorite_device_get_type(dev);
+  ggml_backend_tsavorite_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+  if (props) {
+    props->caps.async = false;
+    props->caps.host_buffer = false;
+    props->caps.buffer_from_host_ptr = false;
+    props->caps.events = false;
+  }
+ 
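// With the capability flags above, the device advertises no async submission, no host buffer type, no buffer-from-host-pointer support and no events; llama.cpp is therefore expected to allocate tensors through the plain buffer-type path (ggml_backend_alloc_ctx_tensors_from_buft).
+  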
GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static ggml_backend_t ggml_backend_tsavorite_device_init(ggml_backend_dev_t dev, + const char *params) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_context *ctx = ggml_tsavorite_init(dev); + if (ctx == NULL) { + GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + return NULL; + } + + ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend)); + + if (backend) { + backend->guid = ggml_backend_tsavorite_guid(); + backend->iface = ggml_backend_tsavorite_i; + backend->device = dev; + backend->context = ctx; + } + + ggml_backend_tsavorite_set_n_cb(backend, 1); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return backend; + + GGML_UNUSED(params); +} + +static ggml_backend_buffer_type_t +ggml_backend_tsavorite_device_get_buffer_type(ggml_backend_dev_t dev) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ggml_backend_tsavorite_buffer_type(); + + GGML_UNUSED(dev); +} + +// Currently for llama.cpp model below API it seems not used +// llama.cpp is using as part llm_load_tensors +// buffer_from_host_ptr_supported +// is_default_buft +// else they will be using +// ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); +// Need to revist when we will look at buffer section implementation +static ggml_backend_buffer_t ggml_backend_tsavorite_device_buffer_from_ptr(ggml_backend_dev_t dev, + void *ptr, size_t size, + size_t max_tensor_size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)calloc( + 1, sizeof(struct ggml_backend_tsavorite_buffer_context)); + + ctx->all_data = ptr; + ctx->all_size = size; + ctx->owned = false; + ctx->n_buffers = 0; + + const size_t size_page = sysconf(_SC_PAGESIZE); + + // page-align the data ptr + { + const uintptr_t offs = (uintptr_t)ptr % size_page; + ptr = (void *)((char *)ptr - offs); + size += offs; + } + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)dev->context; + txe_device_s device = ggml_backend_tsavorite_device_acq(ctx_dev); + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= device->max_buf_len) { + ctx->buffers[ctx->n_buffers].data = ptr; + ctx->buffers[ctx->n_buffers].size = size; + + // ggml_backend_tsavorite_log_allocated_size(device, size_aligned); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will + // fully fit into one of the views + const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * + size_page; // round-up 2 pages just in case + const size_t size_step = device->max_buf_len - size_ovlp; + const size_t size_view = device->max_buf_len; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? 
size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].data = (void *)((uint8_t *)ptr + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + // ggml_backend_tsavorite_log_allocated_size(device, size_step_aligned); + + if (i + size_step < size) { + GGML_TSAVORITE_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ggml_backend_buffer_init(ggml_backend_tsavorite_buffer_type(), + ggml_backend_tsavorite_buffer_i, ctx, size); +} + +// llama_build_graph -> ggml_backend_supports_op -> gml_backend_dev_supports_op +// basically if true then it will call ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, +// backend.get()); here is cur is tensor +static bool ggml_backend_tsavorite_device_supports_op(ggml_backend_dev_t dev, + const struct ggml_tensor *op) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)dev->context; + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ggml_tsavorite_supports_op(ctx_dev, op); +} + +// template +// static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {} +// ggml_backend_dev_supports_op(dev, op_tensor); +static bool ggml_backend_tsavorite_device_supports_buft(ggml_backend_dev_t dev, + ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return buft->iface.get_name == ggml_backend_tsavorite_buffer_type_get_name; + + TSI_UNUSED(dev); +} + +// // returns the backend that should be used for the node based on the current locations +// ggml_backend_sched_backend_id_from_cur -> ggml_backend_offload_op -> +static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, + const struct ggml_tensor *op) { + if (op->type != GGML_TYPE_F32) + return false; + switch (op->op) { + // case GGML_OP_NONE: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_SQRT: + case GGML_OP_SIN: + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_SILU: + break; + default: + return false; + } + break; + default: + return false; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return true; + TSI_UNUSED(dev); +} +#ifdef SYNC_DEBUG +static void ggml_backend_tsavorite_device_synchronize(ggml_backend_dev_t dev, + ggml_backend_event_t event) { + usleep(100); + TSI_UNUSED(dev); + TSI_UNUSED(event); +} +#endif /* SYNC_DEBUG */ + +static struct ggml_backend_device_i ggml_backend_tsavorite_device_i = { + /* .get_name = */ ggml_backend_tsavorite_device_get_name, + /* .get_description = */ ggml_backend_tsavorite_device_get_description, + /* .get_memory = */ ggml_backend_tsavorite_device_get_memory, + /* .get_type = */ ggml_backend_tsavorite_device_get_type, + /* .get_props = */ ggml_backend_tsavorite_device_get_props, + /* .init_backend = */ ggml_backend_tsavorite_device_init, + /* .get_buffer_type = */ ggml_backend_tsavorite_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_tsavorite_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_tsavorite_device_supports_op, + /* .supports_buft = */ ggml_backend_tsavorite_device_supports_buft, + /* .offload_op = */ ggml_backend_tsavorite_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, 
+ /* .event_synchronize = */ NULL, +}; + +// backend registry + +static const char *ggml_backend_tsavorite_reg_get_name(ggml_backend_reg_t reg) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "Tsavorite"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_tsavorite_reg_device_count(ggml_backend_reg_t reg) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_tsavorite_reg_device_get(ggml_backend_reg_t reg, + size_t index) { + GGML_ASSERT(index == 0); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &g_ggml_backend_tsavorite_device; + + GGML_UNUSED(reg); + GGML_UNUSED(index); +} + +static struct ggml_backend_reg_i ggml_backend_tsavorite_reg_i = { + /* .get_name = */ ggml_backend_tsavorite_reg_get_name, + /* .device_count = */ ggml_backend_tsavorite_reg_device_count, + /* .device_get = */ ggml_backend_tsavorite_reg_device_get, + /* .get_proc_address = */ NULL, +}; + + +ggml_backend_reg_t ggml_backend_tsavorite_reg(void) { + ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_NONE; + ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + g_ggml_backend_tsavorite_reg.iface = ggml_backend_tsavorite_reg_i; + g_ggml_backend_tsavorite_reg.context = NULL; + + g_ggml_backend_tsavorite_device.iface = ggml_backend_tsavorite_device_i; + g_ggml_backend_tsavorite_device.reg = &g_ggml_backend_tsavorite_reg; + g_ggml_backend_tsavorite_device.context = &g_ggml_ctx_dev_main; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &g_ggml_backend_tsavorite_reg; +} + +GGML_BACKEND_DL_IMPL(ggml_backend_tsavorite_reg) diff --git a/ggml/src/ggml-tsavorite/include/TestModel.h b/ggml/src/ggml-tsavorite/include/TestModel.h new file mode 100644 index 0000000000000..feff2539a96fa --- /dev/null +++ b/ggml/src/ggml-tsavorite/include/TestModel.h @@ -0,0 +1,217 @@ +#pragma once + +#include "HostShimCAPI.h" +#include +#include +#include +#include +#include +#include + +#define MAX_RESULT_VALUES_TO_PRINT 32 +template +struct MemRefDescriptor { + void *base; + void *data; + int64_t offset = 0; + int64_t shape[N]; + int64_t strides[N]; +} __attribute__((aligned(128))); + +template +class TestModel { +public: + TestModel(std::string name, int version, bool verbose = false) + : name_(name), version_(version), verbose_(verbose) {} + + ~TestModel() { + // free memory + for (int i = 0; i < NumInputs; i++) + tsi_dealloc(inputs[i].base); + for (int i = 0; i < NumOutputs; i++) + tsi_dealloc(outputs[i].base); + tsi_finalize(); + } + + template + void initRandom(size_t numElements, + std::array inputRange = {-10, 10}) { + static_assert(Rank == 1, + "initRandom(size_t) is only defined for Rank == 1"); + size_t inputSizes[2][Rank] = {{numElements}, {numElements}}; + size_t outputSizes[1][Rank] = {{numElements}}; + init(inputSizes, outputSizes, + /*initWithRandom=*/true, inputRange); + } + +#if 0 + template + void initFill(size_t numElements, ElType val) { + static_assert(Rank == 1, + "initRandom(size_t) is only defined for Rank == 1"); + size_t inputSizes[2][Rank] = {{numElements}, {numElements}}; + size_t outputSizes[1][Rank] = {{numElements}}; + init(inputSizes, outputSizes); + for (int i = 0; i < NumInputs; i++) { + auto nEls = getNumElements(inputs[i]); + for (size_t j = 0; j < nEls; j++) + 
static_cast(inputs[i].data)[j] = val; + } + } +#endif /* 0 */ + + template + void init(size_t inputSizes[NumInputs][Rank], + size_t outputSizes[NumOutputs][Rank], bool initWithRandom = false, + std::array inputRange = {-10, 10}) { + tsi_initialize(1); + + for (int i = 0; i < NumInputs; i++) + initMemRefDescriptor(inputs[i], inputSizes[i], + initWithRandom, inputRange, i); + + for (int i = 0; i < NumOutputs; i++) { + initMemRefDescriptor(outputs[i], outputSizes[i]); + // set default result values to -1 + auto nEls = getNumElements(outputSizes[i]); + std::fill((OutputsElType *)outputs[i].base, + (OutputsElType *)outputs[i].base + nEls, -1); + } + if (verbose_) { + printf("[%s v.%d] Allocated DRAM arrays (host VAs):", name_.c_str(), + version_); + for (int i = 0; i < NumInputs; i++) + printf(" ANOOP input%d = %p ", i, inputs[i].base); + for (int i = 0; i < NumOutputs; i++) + printf(" ANOOP-1 output%d = %p ", i, outputs[i].base); + printf("\n"); + } + } + + template + int validateResult(size_t index, ElType *expected, bool printErrs = false, + float tolerance = 1e-5) { + if (verbose_) { + printf("[%s v.%d] Model executed successfully. Validating result...", + name_.c_str(), version_); + } + + int retCode = 0; + size_t nEls = getNumElements(outputs[index].shape); + float sqrSumOfDiff = 0.0; + for (size_t j = 0; j < nEls; j++) { + sqrSumOfDiff += + std::pow(((ElType *)outputs[index].base)[j] - expected[j], 2); + if (std::abs(((ElType *)outputs[index].base)[j] - expected[j]) > + tolerance) { + retCode = 1; + if (printErrs && j < MAX_RESULT_VALUES_TO_PRINT) { + printf("Mismatch at index %d: expected %1.6f, got %1.6f\n", (int)j, + expected[j], ((ElType *)outputs[index].base)[j]); + } + if (retCode && j == MAX_RESULT_VALUES_TO_PRINT) + printf("... (more mismatches not printed; maximum %d reached) ...\n", + MAX_RESULT_VALUES_TO_PRINT); + } + } + // Compute the relative error: norm2(result) / norm2(expected) + float sqrSumExpected = 0.0; + for (size_t j = 0; j < nEls; j++) + sqrSumExpected += std::pow(expected[j], 2); + + float relativeErr = std::sqrt(sqrSumOfDiff) / std::sqrt(sqrSumExpected); + if (verbose_) { + retCode ? 
printf("\n[%s v.%d] FAILED [relative err=%1.6f]\n", + name_.c_str(), version_, relativeErr) + : printf("\n[%s v.%d] PASS [relative err=%1.6f]\n", name_.c_str(), + version_, relativeErr); + } + return retCode; + } + + size_t getNumElements(const MemRefDescriptor &memref) const { + return getNumElements(memref.shape); + } + + template + void writeToFile(void *data, size_t numElements, + const std::string &filename) { + std::ofstream ofs(filename, std::ios::binary); + if (!ofs) { + printf("[%s v.%d] Error opening file %s for writing.", name_.c_str(), + version_, filename.c_str()); + return; + } + ofs.write((char *)data, numElements * sizeof(ElType)); + ofs.close(); + } + + template + void readFromFile(void *data, size_t numElements, + const std::string &filename) { + std::ifstream ifs(filename, std::ios::binary); + if (!ifs) { + printf("[%s v.%d] Error opening file %s for reading.", name_.c_str(), + version_, filename.c_str()); + return; + } + ifs.read((char *)data, numElements * sizeof(ElType)); + ifs.close(); + } + + std::string getName() const { return name_; } + std::string getVersion() const { return std::to_string(version_); } + + MemRefDescriptor inputs[NumInputs]; + MemRefDescriptor outputs[NumOutputs]; + +private: + std::string name_; + int version_ = 1; + bool verbose_ = false; + + template + void initMemRefDescriptor(MemRefDescriptor &memref, size_t shape[Rank], + bool initWithRandom = false, + std::array inputRange = {-10, 10}, + int seed = 42) { + size_t nBytes = sizeof(ElType); + for (int i = 0; i < Rank; i++) { + nBytes *= shape[i]; + } + memref.base = tsi_alloc(nBytes); + memref.data = memref.base; + memref.offset = 0; + printf("\n checking Shape value %d \n\n", memref.shape[0]); +#if 0 + for (int i = 0; i < Rank; i++) { + memref.shape[i] = shape[i]; + memref.strides[i] = 1; + for (int j = i + 1; j < Rank; j++) { + memref.strides[i] *= shape[j]; + } + } + #endif + if (initWithRandom) { + std::mt19937 gen(seed); // fixed seed + std::uniform_real_distribution dist(inputRange[0], inputRange[1]); + for (size_t i = 0; i < getNumElements(shape); i++) { + static_cast(memref.data)[i] = static_cast(dist(gen)); + } + } + } + + size_t getNumElements(const int64_t shape[Rank]) const { + size_t numElements = 1; + printf("\n Anoop Rank %d and shape[Rank] %d \n\n", Rank, shape[Rank]); + for (int i = 0; i < Rank; i++) { + numElements *= shape[i]; + } + printf("\n numElements %d \n", numElements); + return numElements; + } + + size_t getNumElements(const size_t shape[Rank]) const { + return getNumElements(reinterpret_cast(shape)); + } +}; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 57d3e39adf758..134b7420de746 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -249,6 +249,7 @@ static void ggml_log_internal_v(enum ggml_log_level level, const char * format, void ggml_log_internal(enum ggml_log_level level, const char * format, ...) 
{ va_list args; va_start(args, format); + if (level == GGML_LOG_LEVEL_TSAVORITE) ggml_log_internal_v(level, format, args); va_end(args); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 85b4324b699e6..984dbf14d14ae 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2615,13 +2615,19 @@ void llama_perf_context_print(const llama_context * ctx) { const auto data = llama_perf_context(ctx); const double t_end_ms = 1e-3 * ggml_time_us(); - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + + LLAMA_LOG_TSAVORITE("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); + LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LLAMA_LOG_TSAVORITE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); } void llama_perf_context_reset(llama_context * ctx) { diff --git a/src/llama-impl.h b/src/llama-impl.h index 02b1d07f8400d..abc963a4a14e7 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -29,6 +29,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void * #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) +#define LLAMA_LOG_TSAVORITE(...) 
llama_log_internal(GGML_LOG_LEVEL_TSAVORITE, __VA_ARGS__) // // helpers diff --git a/src/llama-sampling.cpp index 804b11e0a943e..d012a0ce520e0 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2562,6 +2562,8 @@ void llama_perf_sampler_print(const struct llama_sampler * chain) { LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); + LLAMA_LOG_TSAVORITE("\n\n%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); } void llama_perf_sampler_reset(struct llama_sampler * chain) { diff --git a/tests/CMakeLists.txt index 083347d188880..1c8b8e29a822e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -174,4 +174,4 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) -target_link_libraries(${TEST_TARGET} PRIVATE llama) +target_link_libraries(${TEST_TARGET} PRIVATE ${TLIBS} llama stdc++) diff --git a/tools/flaskIfc/README.md b/tools/flaskIfc/README.md new file mode 100644 index 0000000000000..b7163dfaf4216 --- /dev/null +++ b/tools/flaskIfc/README.md @@ -0,0 +1,53 @@ +This tool provides an interface to the Tsavorite FPGA via a serial console + +The tool consists of the following files + +. +├── flaskCommon.py << Common code, currently not used +├── flaskIfc.py << Browser based console interface to TSI device +├── flaskXterm.py << Browser based terminal emulation +├── README.md << Readme file +└── serial_script.py << File with serial interface to console + + +The command to run the service on the FPGA machine is +``` +flask -A flaskIfc.py --debug run --port 5000 +``` + +This command runs a webserver at port number 5000 + +An example curl command to connect to this server and run a command is + +``` +curl "http://localhost:5000/serial?command=cd+%20/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/;./run_platform_test.sh" +``` + +In the above example, the command being run is + +``` +cd /usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin +./run_platform_test.sh +``` + +You can also get a full-fledged terminal within a browser by running the following + +``` +flask -A flaskXterm.py --debug run --port 5000 +``` + +You can connect to this flask terminal at + +``` +http://127.0.0.1:5000/terminal +``` + +If your machine does not have Flask and flask-terminal installed, these are the typical packages you will have to install +``` +sudo apt install flask +sudo pip install flask-terminal +pip install blinker +pip install flask +pip install flask-terminal +``` diff --git a/tools/flaskIfc/copy2fpga-setup.sh b/tools/flaskIfc/copy2fpga-setup.sh new file mode 100755 index 0000000000000..9ccbe55d3939f --- /dev/null +++ b/tools/flaskIfc/copy2fpga-setup.sh @@ -0,0 +1,12 @@ + +echo " Remove the device " +sudo bash -c "echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/remove" + +echo "rescan" +sudo bash -c "echo 1 > /sys/bus/pci/rescan" + +echo " dump the pci data" +/aws/proj/rel/sw/platform/release_v0.1.1.tsv026_04_15_2025/scripts/dump-pci.sh + +echo " set the pci bit to access mem" +sudo setpci -s 0000:01:00.0 COMMAND=0x02 diff --git a/tools/flaskIfc/copy2fpga-x86 b/tools/flaskIfc/copy2fpga-x86 new file
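The flaskIfc.py service described in the README above also exposes a /llama-cli endpoint for model inference over the same serial console. As a minimal sketch, assuming the service is running locally on port 5000 and the tiny-llama model file is installed on the target (host, port, and parameter values are illustrative, taken from the defaults and the example URL in flaskIfc.py):

```bash
# Trigger model inference through the flaskIfc /llama-cli endpoint (illustrative host/port).
# model, backend, tokens and prompt are required; repeat-penalty, batch-size, top-k, top-p,
# last-n, context-length and temp fall back to the DEFAULT_* values defined in flaskIfc.py.
curl "http://localhost:5000/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you"
```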
mode 100755 index 0000000000000..7a0e76a65b5b3 Binary files /dev/null and b/tools/flaskIfc/copy2fpga-x86 differ diff --git a/tools/flaskIfc/copy2fpga-x86.sh b/tools/flaskIfc/copy2fpga-x86.sh new file mode 100755 index 0000000000000..d214838f6b52c --- /dev/null +++ b/tools/flaskIfc/copy2fpga-x86.sh @@ -0,0 +1,9 @@ +#! /bin/bash +# This file runs the PCIE setup needed for file transfer. +# Also, it invokes the file transfer utility: copy2fpga-x86 +# Note: sudo permissions are needed for file transfer +# +echo " Inside copy2fpga-x86.sh " +sudo ./copy2fpga-setup.sh +echo "sudo ./copy2fpga-x86 $1" +sudo ./copy2fpga-x86 $1 diff --git a/tools/flaskIfc/flaskCommon.py b/tools/flaskIfc/flaskCommon.py new file mode 100644 index 0000000000000..eb93a63fcf395 --- /dev/null +++ b/tools/flaskIfc/flaskCommon.py @@ -0,0 +1,85 @@ + +from flask import Flask +from flask_terminal import terminal_blueprint, configure_logger +from flask import Flask, render_template, request +import serial + + +app = Flask(__name__) +app.logger = configure_logger('flask_terminal') + +app.config['SECRET_KEY'] = 'your_secret_key_here' + + +@app.route('/ping') +def ping(): + app.logger.info("Accessed /ping route") + try: + app.logger.info("Successfully returned 'pong'") + return 'pong', 200 + except Exception as e: + app.logger.error(f"Error in ping route: {e}", exc_info=True) + return "An error occurred", 500 + +#### +## IMPLEMENT SOME SORT OF SECURITY +## Around your application, below is an example +### +def is_authenticated(): + """Check if the user is authenticated based on a token stored in the session.""" + # Example logic for checking if a user is authenticated + return 'user_token' in session and session['user_token'] == 'your_secure_token' + +#@terminal_blueprint.before_request +#def before_request_func(): +# if not is_authenticated(): + # Redirect to login page or return an error +# current_app.logger.info("User not authenticated, redirecting to login.") +# return redirect('/login') # Adjusted to use a direct path + + +# Register the terminal blueprint +#app.register_blueprint(terminal_blueprint, url_prefix='/terminal') + +#if __name__ == '__main__': +# app.run(port=8080) + + + +try: + ser = serial.Serial('/dev/ttyUSB3', 921600) # Replace /dev/ttyUSB3 with your port and baud rate +except serial.SerialException as e: + print(f"Error opening serial port: {e}") + ser = None # Handle case where serial port cannot be opened + +@app.route('/send', methods=['POST']) +def send_data(): + if ser is None: + return "Serial port not available", 500 + data = request.form['data'] # Get data from the form + try: + ser.write(data.encode()) # Convert to bytes and send + return 'Data sent successfully' + except serial.SerialException as e: + return f"Error writing to serial port: {e}", 500 + + +@app.route('/receive') +def receive_data(): + if ser is None: + return "Serial port not available", 500 + try: + if ser.in_waiting > 0: + data = ser.readline().decode().strip() # Read and decode + return data + else: + return "No data available" + except serial.SerialException as e: + return f"Error reading from serial port: {e}", 500 + +# Register the terminal blueprint +#app.register_blueprint(terminal_blueprint, url_prefix='/terminal') + +if __name__ == '__main__': + app.run(port=8080) + diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py new file mode 100644 index 0000000000000..38e20b4d1dfbf --- /dev/null +++ b/tools/flaskIfc/flaskIfc.py @@ -0,0 +1,307 @@ +from flask import Flask, render_template, request +import subprocess 
+import threading +import time +from werkzeug.utils import secure_filename +import os +import subprocess +import mmap + + +job_status = {"running": False, "result": "", "thread": None} + +app = Flask(__name__) + +port = '/dev/ttyUSB3' +#port = '/dev/ttyUSB2' +baudrate = '921600' +#baudrate = '115200' +#exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/" +exe_path = "/usr/bin/tsi/v0.1.1*/bin/" + +DEFAULT_REPEAT_PENALTY = 1.5 +DEFAULT_BATCH_SIZE = 1024 +DEFAULT_TOP_K = 50 +DEFAULT_TOP_P = 0.9 +DEFAULT_LAST_N = 5 +DEFAULT_CONTEXT_LENGTH = 12288 +DEFAULT_TEMP = 0.0 + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/llama-cli', methods=['GET']) +def llama_cli_serial_command(): + + #./run_llama_cli.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" + model = request.args.get('model') + backend = request.args.get('backend') + tokens = request.args.get('tokens') + prompt = request.args.get('prompt') + repeat_penalty = request.args.get('repeat-penalty', DEFAULT_REPEAT_PENALTY) + batch_size = request.args.get('batch-size', DEFAULT_BATCH_SIZE) + top_k = request.args.get('top-k', DEFAULT_TOP_K) + top_p = request.args.get('top-p', DEFAULT_TOP_P) + last_n = request.args.get('last-n', DEFAULT_LAST_N) + context_length = request.args.get('context-length', DEFAULT_CONTEXT_LENGTH) + temp = request.args.get('temp', DEFAULT_TEMP) + + # Define the model path (update with actual paths) + model_paths = { + "tiny-llama": "tinyllama-vo-5m-para.gguf", + "Tiny-llama-F32": "Tiny-Llama-v0.3-FP32-1.1B-F32.gguf" + } + + model_path = model_paths.get(model, "") + if not model_path: + return f"
Error: Model path not found for '{model}'
" + + # Build llama-cli command + #command = [ + # "./llama-cli", + # "-p", prompt, + # "-m", model_path, + # "--device", backend, + # "--temp", "0", + # "--n-predict", tokens, + # "--repeat-penalty", "1", + # "--top-k", "0", + # "--top-p", "1" + #] + # URL to Test this end point is as follows + # http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you + script_path = "./run_llama_cli.sh" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend} {repeat_penalty} {batch_size} {top_k} {top_p} {last_n} {context_length} {temp}" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + +UPLOAD_FOLDER = './' # Directory where uploaded files will be stored +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER +os.makedirs(UPLOAD_FOLDER, exist_ok=True) # Create the upload folder if it doesn't exist + +@app.route('/upload-gguf', methods=['POST', 'GET']) +def upload_serial_command(): + if request.method == 'POST': + # Check if a file was submitted + if 'file' not in request.files: + return "No file part" + file = request.files['file'] + + # Check if the file is empty + if file.filename == '': + return "No file selected" + + # Save the file if it exists + if file: + filename = secure_filename(file.filename) + file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) + return "File uploaded successfully" + return render_template('upload.html') # Display the upload form + +# command = f"upload file" +# try: +# result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) +# return result.stdout, 200 +# except subprocess.CalledProcessError as e: +# return f"Error executing script: {e.stderr}", 500 + +@app.route('/uploadtofpga-file', methods=['GET', 'POST']) +def uploadtofpga_file(): + setupprints = "Before:Copy2fpga-setup.sh" + print(setupprints) + + if request.method == 'POST': + # Check if a file was submitted + if 'file' not in request.files: + return "No file part" + file = request.files['file'] + + # Check if the file is empty + if file.filename == '': + return "No file selected" + + # Save the file if it exists + if file: + filename = secure_filename(file.filename) + process = subprocess.Popen(["./copy2fpga-x86.sh", filename], text=True) + copy2fpgax86prints = "Starting copy2fpga-x86 sending file..." 
+ print (copy2fpgax86prints) + file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) + + script_path = "./recvFromHost " + command = f"cd {exe_path}; {script_path} {filename}" + def scriptRecvFromHost(): + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + job_status["result"] = result.stdout + print("FPGA Target ready to receive file: recvFromHost started..\n") + print(result.stdout) + recv_output = result.stdout + except subprocess.CalledProcessError as e: + job_status["result"] = f"Error: {e.stderr}" + finally: + job_status["running"] = False + thread = threading.Thread(target=scriptRecvFromHost) + job_status = {"running": True, "result": "", "thread": thread} + thread.start() + + stdout, stderr = process.communicate() + return render_template('uploadtofpga.html', apple = process, recvoutput=f"On FPGA Target, recvFromHost completed ; transf ered file:{filename} received") + return render_template('upload.html') # Display the upload form + +@app.route('/upload-file', methods=['GET', 'POST']) +def upload_file(): + if request.method == 'POST': + # Check if a file was submitted + if 'file' not in request.files: + return "No file part" + file = request.files['file'] + + # Check if the file is empty + if file.filename == '': + return "No file selected" + + # Save the file if it exists + if file: + filename = secure_filename(file.filename) + file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) + return "File uploaded successfully" + return render_template('upload.html') # Display the upload form + +@app.route('/restart-txe', methods=['GET']) +def restart_txe_serial_command(): + command = f"telnet localhost 8000\r\nclose all\r\n" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + time.sleep(5) + command = f"{exe_path}/../install/tsi-start\nyes\n" + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + +@app.route('/health-check', methods=['GET']) +def health_check_serial_command(): + command = f"free -h" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + +@app.route('/test', methods=['GET']) +def test_serial_command(): + command = f"test" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + +@app.route('/system-info', methods=['GET']) +def system_info_serial_command(): + + command = f"{exe_path}../install/tsi-version;lscpu" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + +@app.route('/submit', methods=['POST']) +def submit(): + global job_status + + if job_status["running"]: + 
return "
A model is already running. Please wait or abort.
" + + #./run_llama_cli.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" + model = request.form.get('model') + backend = request.form.get('backend') + tokens = request.form.get('tokens') + prompt = request.form.get('prompt') + repeat_penalty = request.form.get('repeat-penalty', DEFAULT_REPEAT_PENALTY) + batch_size = request.form.get('batch-size', DEFAULT_BATCH_SIZE) + top_k = request.form.get('top-k', DEFAULT_TOP_K) + top_p = request.form.get('top-p', DEFAULT_TOP_P) + last_n = request.form.get('last-n', DEFAULT_LAST_N) + context_length = request.form.get('context-length', DEFAULT_CONTEXT_LENGTH) + temp = request.form.get('temp', DEFAULT_TEMP) + + # Define the model path (update with actual paths) + model_paths = { + "tiny-llama": "tinyllama-vo-5m-para.gguf", + "Tiny-llama-F32": "Tiny-Llama-v0.3-FP32-1.1B-F32.gguf" + } + + model_path = model_paths.get(model, "") + if not model_path: + return f"
Error: Model path not found for '{model}'
" + + # Build llama-cli command + #command = [ + # "./llama-cli", + # "-p", prompt, + # "-m", model_path, + # "--device", backend, + # "--temp", "0", + # "--n-predict", tokens, + # "--repeat-penalty", "1", + # "--top-k", "0", + # "--top-p", "1" + #] + + script_path = "./run_llama_cli.sh" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend} {repeat_penalty} {batch_size} {top_k} {top_p} {last_n} {context_length} {temp}" + + + def run_script(): + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + job_status["result"] = result.stdout + except subprocess.CalledProcessError as e: + job_status["result"] = f"Error: {e.stderr}" + finally: + job_status["running"] = False + + thread = threading.Thread(target=run_script) + job_status = {"running": True, "result": "", "thread": thread} + thread.start() + + return render_template("processing.html") + +@app.route('/status') +def status(): + if job_status["running"]: + return "running" + else: + return "done" + +@app.route('/result') +def result(): + return render_template("result.html", output=job_status["result"]) + +@app.route('/abort') +def abort(): + global job_status + if job_status["running"] and job_status["thread"].is_alive(): + # Use subprocess.Popen + pid handling instead for real process termination + job_status["running"] = False + job_status["result"] = "Aborted by user." + return "
Job aborted.
Home" + return "
No job running.
Home" + +if __name__ == '__main__': + app.run(debug=True, port=5000) diff --git a/tools/flaskIfc/flaskXterm.py b/tools/flaskIfc/flaskXterm.py new file mode 100644 index 0000000000000..df7ecf391471d --- /dev/null +++ b/tools/flaskIfc/flaskXterm.py @@ -0,0 +1,43 @@ + +from flask import Flask +from flask_terminal import terminal_blueprint, configure_logger + + +app = Flask(__name__) +app.logger = configure_logger('flask_terminal') + +app.config['SECRET_KEY'] = 'your_secret_key_here' + + +@app.route('/ping') +def ping(): + app.logger.info("Accessed /ping route") + try: + app.logger.info("Successfully returned 'pong'") + return 'pong', 200 + except Exception as e: + app.logger.error(f"Error in ping route: {e}", exc_info=True) + return "An error occurred", 500 + +#### +## IMPLEMENT SOME SORT OF SECURITY +## Around your application, below is an example +### +def is_authenticated(): + """Check if the user is authenticated based on a token stored in the session.""" + # Example logic for checking if a user is authenticated + return 'user_token' in session and session['user_token'] == 'your_secure_token' + +#@terminal_blueprint.before_request +#def before_request_func(): +# if not is_authenticated(): + # Redirect to login page or return an error +# current_app.logger.info("User not authenticated, redirecting to login.") +# return redirect('/login') # Adjusted to use a direct path + + +# Register the terminal blueprint +app.register_blueprint(terminal_blueprint, url_prefix='/terminal') + +if __name__ == '__main__': + app.run(port=8080) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py new file mode 100644 index 0000000000000..9581376a8b16b --- /dev/null +++ b/tools/flaskIfc/serial_script.py @@ -0,0 +1,55 @@ +import serial +import sys + +def send_serial_command(port, baudrate, command): + try: + ser = serial.Serial(port, baudrate) + + ser.write((command + '\n').encode()) # Send command with newline + # Wait to read the serial port + data = '\0' + first_time = 1 + while True: + try: + # read byte by byte to find either a new line character or a prompt marker + # instead of new line using line = ser.readline() + line = b"" + while True: + byte = ser.read(1) # Read one byte at a time + if (byte == b"\n") or (byte == b"#"): # Stop when delimiter is found + break + line += byte + if line: # Check if line is not empty + read_next_line = line.decode('utf-8') + if ("run-platform-done" in read_next_line.strip()) or ("@agilex7_dk_si_agf014ea" in read_next_line.strip()) or ("imx8mpevk" in read_next_line.strip()): + break + if (first_time == 1) : + first_time = 0 + else: + data += read_next_line # Keep the line as-is with newline + else: + break # Exit loop if no data is received + except serial.SerialException as e: + ser.close() + return (f"Error reading from serial port: {e}") + except KeyboardInterrupt: + ser.close() + return ("Program interrupted by user") + ser.close() + print(data) + return data + + except serial.SerialException as e: + ser.close() + return f"Error: {e}" + +# This script can be run in standalone as well +if __name__ == "__main__": + if len(sys.argv) < 4: + print("Usage: python script.py ") + sys.exit(1) + + port = sys.argv[1] + baudrate = int(sys.argv[2]) + command = sys.argv[3] + response = send_serial_command(port, baudrate, command) diff --git a/tools/flaskIfc/templates/index.html b/tools/flaskIfc/templates/index.html new file mode 100644 index 0000000000000..9152167a86c44 --- /dev/null +++ b/tools/flaskIfc/templates/index.html @@ -0,0 +1,38 @@ + + + + 
TSAVORITE Web UI For Model Inference + + +

Model Inference Configuration

+
+ + + +

+ + + + +

+ + + + +

+ + +
+ +

+ + +
+ + diff --git a/tools/flaskIfc/templates/processing.html b/tools/flaskIfc/templates/processing.html new file mode 100644 index 0000000000000..15f609bee1712 --- /dev/null +++ b/tools/flaskIfc/templates/processing.html @@ -0,0 +1,22 @@ + + + + Processing + + + +

Model is running...

+ + + diff --git a/tools/flaskIfc/templates/result.html b/tools/flaskIfc/templates/result.html new file mode 100644 index 0000000000000..07c79c409f596 --- /dev/null +++ b/tools/flaskIfc/templates/result.html @@ -0,0 +1,12 @@ + + + + Model Output + + +

Model Response

+
{{ output }}
+
+ ⟵ Back to Form + + diff --git a/tools/flaskIfc/templates/upload.html b/tools/flaskIfc/templates/upload.html new file mode 100644 index 0000000000000..3368379f74754 --- /dev/null +++ b/tools/flaskIfc/templates/upload.html @@ -0,0 +1,4 @@ +
+ + +
diff --git a/tools/flaskIfc/templates/uploadtofpga.html b/tools/flaskIfc/templates/uploadtofpga.html new file mode 100644 index 0000000000000..97445c1b68622 --- /dev/null +++ b/tools/flaskIfc/templates/uploadtofpga.html @@ -0,0 +1,14 @@ + + + + File Transfer In Progress... + + +

File Transfer Started.

+

Running copy2fpga-x86.sh

+
{{ apple }}
+
{{ recvoutput }}
+
+ + + diff --git a/tools/main/main.cpp index 1bd2be2d94f51..26842116ec6df 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -41,6 +41,12 @@ static std::vector<llama_token> * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +static void my_logger(ggml_log_level level, const char *text, void *user_data) { + if (level == GGML_LOG_LEVEL_TSAVORITE) { + fprintf(stderr, "%s", text); // only forward Tsavorite-level log messages + } +} + static void print_usage(int argc, char ** argv) { (void) argc; @@ -120,6 +126,7 @@ int main(int argc, char ** argv) { LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } + llama_log_set(my_logger, nullptr); LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh new file mode 100755 index 0000000000000..64c577235b911 --- /dev/null +++ b/tsi-pkg-build.sh @@ -0,0 +1,89 @@ +#!/bin/bash +set -e + +#Ensure prerequisites are met as follows +echo 'updating submodule' +git submodule update --recursive --init +cd ggml-tsi-kernel/ +module load tsi4 gcc/13.3.0 +export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.4 +echo 'creating python virtual env' +/proj/local/Python-3.10.12/bin/python3 -m venv blob-creation +source blob-creation/bin/activate +echo 'installing mlir and python dependencies' +pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt +pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.3.0-py3-none-any.whl +pip install onnxruntime-training + +#build TSI kernels for the Tsavorite backend +#First for FPGA + +echo 'creating fpga kernel' +cd fpga-kernel +cmake -B build-fpga +./create-all-kernels.sh +#Then for the Posix use case + +echo 'creating posix kernel' +cd ../posix-kernel/ +./create-all-kernels.sh + +#Change directory to top level llama.cpp + +cd ../../ + +#Compile for posix with build-posix as a target folder + +echo 'building llama.cpp, ggml for tsavorite and other binaries for posix' +cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix +cmake --build build-posix --config Release + +#Compile for fpga with build-fpga as a target folder + +echo 'building llama.cpp, ggml for tsavorite and other binaries for fpga' +export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc" +export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++" +cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga +cmake --build build-fpga --config Release + + +echo 'creating tar bundle for fpga' +TSI_GGML_VERSION=0.0.3 +TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml +GGML_TSI_INSTALL_DIR=ggml-tsi-kernel +TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml/ +TSI_BLOB_INSTALL_DIR=$(pwd)/${GGML_TSI_INSTALL_DIR}/fpga-kernel/build-fpga + +if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR} ]; then + echo "${TSI_GGML_BUNDLE_INSTALL_DIR} exists" +else + echo "creating ${TSI_GGML_BUNDLE_INSTALL_DIR}" + mkdir ${TSI_GGML_BUNDLE_INSTALL_DIR} +fi +if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh ]; then + rm -fr ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh +fi + +cat > ./${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL +#!/bin/bash +export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:\$(pwd) +tsi_kernels=("add" "sub" "mult" "div" "abs" "inv" "neg" "sin" "sqrt" "sigmoid" "silu") + +for kernel in "\${tsi_kernels[@]}"; do + mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_\$kernel + cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_\$kernel/ -r +done +EOL +chmod +x
${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh +cp ${GGML_TSI_INSTALL_DIR}/fpga/blobs ${TSI_GGML_BUNDLE_INSTALL_DIR}/ -r +cp build-fpga/bin/llama-cli ${TSI_GGML_BUNDLE_INSTALL_DIR}/ +cp build-fpga/bin/libggml*.so ${TSI_GGML_BUNDLE_INSTALL_DIR}/ +cp build-fpga/bin/libllama*.so ${TSI_GGML_BUNDLE_INSTALL_DIR}/ +cp build-fpga/bin/simple-backend-tsi ${TSI_GGML_BUNDLE_INSTALL_DIR}/ + +tar -cvzf ${TSI_GGML_BUNDLE_INSTALL_DIR}-${TSI_GGML_VERSION}.tz ${TSI_GGML_BUNDLE_INSTALL_DIR}/* + +if [ "$1" == "Release" ] || [ "$1" == "release" ] +then + cp ${TSI_GGML_BUNDLE_INSTALL_DIR}-${TSI_GGML_VERSION}.tz ${TSI_GGML_RELEASE_DIR} +fi
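As a usage sketch (not part of this patch), the bundle produced above might be deployed on the FPGA target roughly as follows, assuming tsi-ggml-0.0.3.tz has already been copied to the board (for example with copy2fpga-x86.sh or the flaskIfc upload endpoint) and that a gguf model such as tinyllama-vo-5m-para.gguf is available there; the flags and paths are illustrative only:

```bash
# Unpack the bundle created by tsi-pkg-build.sh (file name assumes TSI_GGML_VERSION=0.0.3).
tar -xvzf tsi-ggml-0.0.3.tz
cd tsi-ggml

# ggml.sh appends the current directory to LD_LIBRARY_PATH and copies the kernel
# blobs into the txe_* kernel directories written into it at build time; source it
# so the environment change persists in the current shell.
source ./ggml.sh

# Run inference with the bundled llama-cli (model file and prompt are placeholders).
./llama-cli -m tinyllama-vo-5m-para.gguf -p "Hello How are you" -n 10 --temp 0
```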