From bb1f9812d5da63a33245b9f6bcad1f2769617701 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Fri, 23 May 2025 22:13:44 -0700 Subject: [PATCH 01/35] @FIR-702 - llama.cpp: Sync with latest opensource --- .gitmodules | 3 + CMakeLists.txt | 62 +- common/CMakeLists.txt | 10 +- examples/gguf-hash/CMakeLists.txt | 1 + examples/gguf/CMakeLists.txt | 2 +- examples/lookup/CMakeLists.txt | 8 +- examples/simple-chat/CMakeLists.txt | 2 +- examples/simple/CMakeLists.txt | 21 +- examples/simple/simple-backend-tsi.cpp | 578 ++++++ ggml-tsi-kernel | 1 + ggml/CMakeLists.txt | 1 + ggml/include/ggml-tsavorite.h | 189 ++ ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 10 + ggml/src/ggml-tsavorite/CMakeLists.txt | 8 + ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 1887 +++++++++++++++++++ ggml/src/ggml-tsavorite/include/TestModel.h | 217 +++ tests/CMakeLists.txt | 2 +- tsi-pkg-build.sh | 87 + 19 files changed, 3079 insertions(+), 11 deletions(-) create mode 100644 examples/simple/simple-backend-tsi.cpp create mode 160000 ggml-tsi-kernel create mode 100644 ggml/include/ggml-tsavorite.h create mode 100644 ggml/src/ggml-tsavorite/CMakeLists.txt create mode 100644 ggml/src/ggml-tsavorite/ggml-tsavorite.cpp create mode 100644 ggml/src/ggml-tsavorite/include/TestModel.h create mode 100755 tsi-pkg-build.sh diff --git a/.gitmodules b/.gitmodules index 23ce5ff059b1b..001504ec9ed07 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "kompute"] path = ggml/src/ggml-kompute/kompute url = https://github.com/nomic-ai/kompute.git +[submodule "ggml-tsi-kernel"] + path = ggml-tsi-kernel + url = git@github.com:tsisw/ggml-tsi-kernel.git diff --git a/CMakeLists.txt b/CMakeLists.txt index ac3e9090336d9..f9c146006c1a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,10 +5,59 @@ include(CheckIncludeFileCXX) #set(CMAKE_WARN_DEPRECATED YES) set(CMAKE_WARN_UNUSED_CLI YES) +if (GGML_TSAVORITE) + if (NOT DEFINED GGML_TSAVORITE_TARGET) + set(GGML_TSAVORITE_TARGET "posix") + endif() + if (NOT ${GGML_TSAVORITE_TARGET} STREQUAL fpga) + set(GGML_TSAVORITE_TARGET "posix") + endif() + + if (NOT DEFINED MLIR_COMPILER_DIR) + if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) + set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.0/compiler) + else() + set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) + endif() + endif() + + if (NOT DEFINED RUNTIME_DIR) + if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) + set (RUNTIME_DIR /proj/work/rel/sw/sdk-r.0.1.0/${GGML_TSAVORITE_TARGET}/runtime) + else() + set (RUNTIME_DIR $ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) + endif() + endif() + + if (NOT DEFINED GGML_TSI_KERNEL_DIR) + set (GGML_TSI_KERNEL_DIR ${CMAKE_SOURCE_DIR}/ggml-tsi-kernel/${GGML_TSAVORITE_TARGET}) + endif() + + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") + + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + set(CMAKE_CROSSCOMPILING ON) + set(ARCH_FLAGS -march=armv8-a) + message("Setting target as fpga") + elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") + list(APPEND TLIBS "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so") + message("Setting target as posix for tsavorite") + endif() + + set(GGML_TSAVORITE_TARGET "${GGML_TSAVORITE_TARGET}" CACHE STRING "Target for tsavorite") + set (TSAVORITE_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/ggml/src/ggml-tsavorite/include) + + include_directories(${TSAVORITE_INCLUDE_DIR}) + include_directories(${MLIR_COMPILER_DIR}/include/runtime/shim) + include_directories(${RUNTIME_DIR}/include) + message("tsavorite backend is enabled") +endif() + 
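For orientation: once this block is enabled and GGML_TSAVORITE is defined, application code can drive the new backend through the public entry points this patch declares in ggml/include/ggml-tsavorite.h. A minimal smoke-test sketch (illustrative, not part of the patch):

#include "ggml-tsavorite.h"
#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_tsavorite_init(); // declared by this patch
    if (!backend) {
        fprintf(stderr, "tsavorite backend unavailable\n");
        return 1;
    }
    if (ggml_backend_is_tsavorite(backend)) {
        fprintf(stderr, "tsavorite backend initialized\n");
    }
    ggml_backend_free(backend);
    return 0;
}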
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + #set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() @@ -82,9 +131,18 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd party libs -option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) +if (GGML_TSAVORITE) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) + else() + option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) + endif() +else() + option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) +endif() + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index a7ff3ac16c446..9eafc9bb2b659 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -145,8 +145,16 @@ endif () target_include_directories(${TARGET} PUBLIC .) target_compile_features (${TARGET} PUBLIC cxx_std_17) -target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +if (GGML_TSAVORITE) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${TLIBS} PUBLIC llama Threads::Threads) + else() + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) + endif() +else() + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +endif() # # copy the license files diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 15c5c68c6f402..0d9272b663d1a 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET llama-gguf-hash) add_executable(${TARGET} gguf-hash.cpp) +target_link_libraries(${TARGET} PRIVATE ${TLIBS}) install(TARGETS ${TARGET} RUNTIME) # clibs dependencies diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index fb04eb83f34ce..48365a0b054ce 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index fba78ceda6fd7..f7626a45dedd8 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -1,23 +1,23 @@ set(TARGET llama-lookup) add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-create) 
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt
index 567f7fbbbf43a..cdf65e58a9d7d 100644
--- a/examples/simple-chat/CMakeLists.txt
+++ b/examples/simple-chat/CMakeLists.txt
@@ -1,5 +1,5 @@
set(TARGET llama-simple-chat)
add_executable(${TARGET} simple-chat.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
index 104ecabfd7236..a87dac20c82da 100644
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -1,5 +1,24 @@
+#
+# simple-ctx
set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${TLIBS} ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+#
+if (GGML_TSAVORITE)
+    #
+    # tsavorite backend test cases
+    if (${GGML_TSAVORITE_TARGET} STREQUAL fpga)
+        file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o")
+    else()
+        file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o")
+    endif()
+    #
+    # simple-backend-tsi
+
+    set(TEST_TARGET simple-backend-tsi)
+    add_executable(${TEST_TARGET} simple-backend-tsi.cpp)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${TLIBS} dl rt)
+endif()
diff --git a/examples/simple/simple-backend-tsi.cpp b/examples/simple/simple-backend-tsi.cpp
new file mode 100644
index 0000000000000..2f56f34168062
--- /dev/null
+++ b/examples/simple/simple-backend-tsi.cpp
@@ -0,0 +1,578 @@
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-tsavorite.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <vector>
+
+#define NUM_INPUT_TENSORS 2
+#define NUM_INPUT_UNARY_TENSORS 1
+#define NUM_ELEMENTS 32
+#define NUM_ELEMENTS_SCALE (32*4 + 25)
+
+// rows are indexed by ggml_tsavorite_kernel_type:
+// index 0 for addition, index 1 for subtraction, index 2 for multiplication, index 3 for division
+float test_input_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = {
+    //ADD KERNEL
+    {1.1, 2.3, 3.2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
+    //SUB KERNEL
+    {2.2, 10.3, 10.4, 2.2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //MULT KERNEL + {1.1, 2.3, 3.2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //DIV KERNEL + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + // SQRT Kernel + {1, 4, 9.6, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024}, + //NEG Kernel + {1.1, -4.4, 10, -5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -23, 24, 25, -26, 27, -28, 29, -30, 31, -32.6}, + //ABS Kernel + {1.1, -4.4, 10, -5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -23, 24, 25, -26, 27, -28, 29, -30, 31, -32.6}, + //SIN Kernel + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6} +}; +float test_input_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {1.1, 2.2, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SUB KERNEL + {1.1, 2.2, 3.0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //MULT KERNEL + {1.1, 2.2, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //DIV KERNEL + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //Below ROW value not used for Unary OPS-SQRT, NEG, ABS, SIN + //SQRT KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //NEG KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //ABS KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SIN Kernel input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} +}; + +float test_result[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {2.20, 4.50, 6.50, 8.00, 10.00, 12.00, 14.00, 16.00, 18.00, 20.00, 22.00, 24.00, 26.00, 28.00, 30.00, 32.00, 34.00, 36.00, 38.00, 40.00, 42.00, 44.00, 46.00, 48.00, 50.00, 52.00, 54.00, 56.00, 58.00, 60.00, 62.00, 64.00}, + //SUB KERNEL + {1.1, 8.1, 7.4, -1.8, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00}, + //MULT KERNEL + {1.21, 5.06, 10.56, 16.00, 25.00, 36.00, 49.00, 64.00, 81.00, 100.00, 121.00, 144.00, 169.00, 196.00, 225.00, 256.00, 289.00, 324.00, 361.00, 400.00, 441.00, 484.00, 529.00, 576.00, 625.00, 676.00, 729.00, 784.00, 841.00, 900.00, 961.00, 1024.00}, + //DIV KERNEL + {1.0, 2.0, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SQRT Kernel + {1, 2, 3.098387, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //NEG Kernel + {-1.1, 4.4, -10, 5, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, 23, -24, -25, 26, -27, 28, -29, 30, -31, 32.6}, + 
//ABS Kernel + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6}, + //SIN Kernel + {0.891207, -0.951602, -0.544021, -0.958924, -0.958924, -0.279416, 0.656987, 0.989358, 0.412118, -0.544021, -0.999990, -0.536573, 0.420167, 0.990607, 0.650288, -0.287903, -0.961398, -0.750987, 0.149877, 0.912945, 0.912945, 0.912945, -0.846220, -0.905578, -0.132352, 0.762559, 0.956376, 0.270906, -0.663634, -0.988032, -0.404039, 0.926149} +}; + +float test_input_scale_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + //ADD KERNEL + {1.3, 2.3, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + //SUB KERNEL + {8.5, 2.5, 3.5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 64, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 4, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 2, 4, 8, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + //MULT KERNEL + {1.5, 2.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //DIV KERNEL + {4.2, 8.4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SQRT KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //NEG KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //ABS KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SIN KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} +}; + +float test_input_scale_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + // ADD KERNEL + {1.3, 2.3, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + // SUB KERNEL + {1, 8, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 6, 8, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + // MULT KERNEL + {2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // DIV KERNEL + {2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //Below ROW value not used for Unary OPS-SQRT, NEG, ABS, SIN + //SQRT KERNEL input not used + {1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //NEG KERNEL input not used + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //ABS KERNEL input not used + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SIN KERNEL input not used + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} +}; +float test_result_scale[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + // ADD KERNEL + {2.6, 4.6, 6.6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50}, + // SUB KERNEL + {7.5, -5.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + -5, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + 1, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + // MULT KERNEL + {3, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // DIV KERNEL + {2.1, 4.2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // SQRT KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // NEG KERNEL + {1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 9, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 16, -25, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + // ABS KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // SIN KERNEL + {-0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.412118,-0.756802, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.287903,-0.132352, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 
-0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471,
+     0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471,
+     0.841471, 0.841471, 0.841471}
+};
+
+// This is a simple model with two tensors a and b
+struct simple_model {
+    struct ggml_tensor * a;
+    struct ggml_tensor * b;
+
+    // the backend used to perform the computation (TSAVORITE)
+    ggml_backend_t backend = NULL;
+
+    // the backend buffer that stores the data of tensors a and b
+    ggml_backend_buffer_t buffer;
+
+    // the context that defines the tensor information (dimensions, size, memory address)
+    struct ggml_context * ctx;
+};
+
+static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
+static bool ggml_tsi_compare_two_float(float a, float b) {
+    float epsilon = 1e-5;
+    float absA = fabsf(a);
+    float absB = fabsf(b);
+    float diff = fabsf(a - b);
+    float minV = std::numeric_limits<float>::min();
+    float maxV = std::numeric_limits<float>::max();
+
+    if (a == b) { // shortcut, handles infinities
+        return true;
+    } else if (a == 0 || b == 0 || (absA + absB < minV)) {
+        // a or b is zero, or both are extremely close to it;
+        // relative error is less meaningful here
+        return diff < (epsilon * minV);
+    }
+    // use relative error
+    return diff / std::min((absA + absB), maxV) < epsilon;
+}
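The comparator above short-circuits on exact equality, uses an absolute bound near zero, and a relative bound everywhere else. A standalone illustration of those three regimes (illustrative values; assumes the function above is in scope):

#include <cstdio>

int main() {
    // expected output: 1 (exact match), 1 (relative error ~1.5e-7 < 1e-5), 0 (relative error ~5e-4 > 1e-5)
    printf("%d\n", ggml_tsi_compare_two_float(1.5f, 1.5f));
    printf("%d\n", ggml_tsi_compare_two_float(1000000.0f, 1000000.3f));
    printf("%d\n", ggml_tsi_compare_two_float(1.0f, 1.001f));
    return 0;
}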
+static bool load_model(simple_model & model, float * a, float * b, enum ggml_type data_type, int elements_A, int elements_B) {
+    ggml_log_set(ggml_log_callback_default, nullptr);
+
+    // initialize the backend
+    fprintf(stderr, "%s: using Tsavorite backend \n", __func__);
+    model.backend = ggml_backend_tsavorite_init();
+    if (!model.backend) {
+        fprintf(stderr, "%s: ggml_backend_tsavorite_init() failed\n", __func__);
+        return false;
+    }
+
+    int num_tensors;
+
+    if (!b)
+        num_tensors = NUM_INPUT_UNARY_TENSORS;
+    else
+        num_tensors = NUM_INPUT_TENSORS;
+
+    // Since we do not pass a mem_buffer, the ggml context will create one:
+    /*   .mem_buffer = params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size) */
+    // The context mem_buffer is used for object creation, and also for tensor data
+    // when the backend has no memory of its own. Since we are using backend memory,
+    // the extra bytes (100) have been removed from mem_size below.
+    struct ggml_init_params params {
+        /*.mem_size   =*/ (ggml_tensor_overhead() * num_tensors),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    fprintf(stderr, "\n Calculating mem_size %zu %d and creating ggml context \n", ggml_tensor_overhead(), num_tensors);
+
+    // create context
+    model.ctx = ggml_init(params);
+    if (!model.ctx) {
+        fprintf(stderr, "%s: ggml_init failed\n", __func__);
+        return false;
+    }
+
+    // create tensors
+    // The code below needs no change for the tsavorite backend: each tensor is
+    // created as metadata only (GGML object structure + tensor structure).
+    // A buffer still has to be attached to each tensor; since we are using the
+    // backend, the memory comes from tsi_alloc, called inside the tsavorite backend.
+
+    fprintf(stderr, "\n Creating input Tensor \n");
+
+    //int64_t ne[GGML_MAX_DIMS]; // number of elements
+    //size_t nb[GGML_MAX_DIMS]; // stride in bytes:
+    model.a = ggml_new_tensor_1d(model.ctx, data_type, elements_A);
+    if (b)
+        model.b = ggml_new_tensor_1d(model.ctx, data_type, elements_B);
+
+    // create a backend buffer (backend memory) and alloc the tensors from the context
+    fprintf(stderr, "\n Creating Backend Buffer \n");
+
+    // The ggml context holds only the two input tensors, hence backend memory is
+    // created for the two input tensors
+    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
+
+    // load data from cpu memory to backend buffer
+    fprintf(stderr, "\n Loading Input Tensor Data to Backend Buffer \n");
+
+    // loading the data to tensor
+    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
+    if (b)
+        ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
+
+    // create an array to print the input tensor
+    std::vector<float> out_data(ggml_nelements(model.a));
+    // bring the data from the backend memory
+    ggml_backend_tensor_get(model.a, out_data.data(), 0, ggml_nbytes(model.a));
+
+    fprintf(stderr, "\nBringing tensor data from Backend buffer and printing %d tensor data:\n[", (int) model.a->ne[0]);
+
+    for (int i = 0; i < model.a->ne[0] /* cols */; i++) {
+        fprintf(stderr, " %.2f", out_data[i]);
+    }
+    fprintf(stderr, " ]\n");
+    return true;
+}
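load_model() above follows the standard ggml pattern for backend-resident tensors: a metadata-only (no_alloc) context, tensor creation, a backend buffer, then a host-to-device upload. A condensed restatement of that pattern with a hypothetical helper name and most error handling elided (a real caller would keep ctx and buf alive for as long as the tensor is used):

// tsi_upload_one_tensor is a hypothetical helper, not part of this patch
static bool tsi_upload_one_tensor(ggml_backend_t backend, const float *host, int n_elem) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead(), // metadata for one tensor only
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,                   // tensor data lives in the backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elem);
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
    if (!buf) {
        ggml_free(ctx);
        return false;
    }
    ggml_backend_tensor_set(t, host, 0, ggml_nbytes(t)); // host -> backend memory copy
    return true;
}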
+// build the compute graph
+static struct ggml_cgraph * build_graph(const simple_model& model, enum ggml_tsavorite_kernel_type ops_type) {
+    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+    };
+
+    // create a temporary context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * result;
+    switch(ops_type) {
+        case GGML_TSAVORITE_KERNEL_TYPE_ADD:
+            result = ggml_add(ctx0, model.a, model.b);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_SUB:
+            result = ggml_sub(ctx0, model.a, model.b);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_MULT:
+            result = ggml_mul(ctx0, model.a, model.b);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_DIV:
+            result = ggml_div(ctx0, model.a, model.b);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_SQRT:
+            result = ggml_sqrt(ctx0, model.a);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_NEG:
+            result = ggml_neg(ctx0, model.a);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_ABS:
+            result = ggml_abs(ctx0, model.a);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_SIN:
+            result = ggml_sin(ctx0, model.a);
+            break;
+        default:
+            ggml_free(ctx0);
+            fprintf(stderr, "\n Unsupported Operation \n");
+            return NULL;
+    }
+    // build the operation nodes
+    ggml_build_forward_expand(gf, result);
+
+    // delete the temporary context used to build the graph
+    ggml_free(ctx0);
+    return gf;
+}
+
+// compute with backend
+static struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr, enum ggml_tsavorite_kernel_type ops_type) {
+    // reset the allocator to free all the memory allocated during the previous inference
+
+    fprintf(stderr, "\n Under Test case for compute API creating build_graph \n");
+    struct ggml_cgraph * gf = build_graph(model, ops_type);
+    if (!gf) {
+        fprintf(stderr, "\ncompute failed\n");
+        return NULL;
+    }
+
+    // allocate tensors
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    ggml_backend_graph_compute(model.backend, gf);
+
+    // in this case, the output tensor is the last one in the graph
+    return ggml_graph_node(gf, -1);
+}
+
+enum ggml_tsavorite_kernel_type convert_testcase_to_ops_type (const char *testCase) {
+    if (!strcmp(testCase,"add"))
+        return GGML_TSAVORITE_KERNEL_TYPE_ADD;
+    else if (!strcmp(testCase,"sub"))
+        return GGML_TSAVORITE_KERNEL_TYPE_SUB;
+    else if (!strcmp(testCase,"mult"))
+        return GGML_TSAVORITE_KERNEL_TYPE_MULT;
+    else if (!strcmp(testCase,"div"))
+        return GGML_TSAVORITE_KERNEL_TYPE_DIV;
+    else if (!strcmp(testCase,"sqrt"))
+        return GGML_TSAVORITE_KERNEL_TYPE_SQRT;
+    else if (!strcmp(testCase,"neg"))
+        return GGML_TSAVORITE_KERNEL_TYPE_NEG;
+    else if (!strcmp(testCase,"abs"))
+        return GGML_TSAVORITE_KERNEL_TYPE_ABS;
+    else if (!strcmp(testCase,"sin"))
+        return GGML_TSAVORITE_KERNEL_TYPE_SIN;
+
+    fprintf(stderr, "\n unsupported test case %s; running the default test case, which is the add operation \n", testCase);
+    return GGML_TSAVORITE_KERNEL_TYPE_ADD;
+}
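Note that the kernel enum in ggml-tsavorite.h (further down in this patch) also lists GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, which this harness never exercises. Wiring it in would plausibly take two additions, sketched here as hypothetical drop-in fragments (assuming ggml_sigmoid from ggml.h, fresh rows in the test tables, and counting it among the unary ops in main()):

// in convert_testcase_to_ops_type():
else if (!strcmp(testCase,"sigmoid"))
    return GGML_TSAVORITE_KERNEL_TYPE_SIGMOID;

// in the build_graph() switch, alongside the other unary ops:
case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID:
    result = ggml_sigmoid(ctx0, model.a);
    break;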
+int main(int argc, char *argv[]) {
+    ggml_time_init();
+    bool test_case_flag = true;
+    enum ggml_tsavorite_kernel_type ops_type;
+    simple_model model;
+    float *input1[GGML_TSAVORITE_KERNEL_TYPE_COUNT];
+    float *input2[GGML_TSAVORITE_KERNEL_TYPE_COUNT] = {NULL}; // stays NULL for unary ops, which load_model detects
+    float *result_data[GGML_TSAVORITE_KERNEL_TYPE_COUNT];
+    bool data_scale = false;
+
+    int elements_A=0, elements_B=0;
+    int num_of_input_tensors;
+
+    if (argc > 1) {
+        ops_type = convert_testcase_to_ops_type(argv[1]);
+        if (argc > 2 && !strcmp(argv[2], "scale"))
+            data_scale = true;
+    } else {
+        // Default Case
+        ops_type = convert_testcase_to_ops_type("add");
+    }
+    if (ops_type == GGML_TSAVORITE_KERNEL_TYPE_SQRT ||
+        ops_type == GGML_TSAVORITE_KERNEL_TYPE_NEG ||
+        ops_type == GGML_TSAVORITE_KERNEL_TYPE_ABS ||
+        ops_type == GGML_TSAVORITE_KERNEL_TYPE_SIN)
+        num_of_input_tensors = NUM_INPUT_UNARY_TENSORS;
+    else
+        num_of_input_tensors = NUM_INPUT_TENSORS;
+
+    if (data_scale) {
+        input1[ops_type] = test_input_scale_1[ops_type];
+        elements_A = NUM_ELEMENTS_SCALE;
+        if (num_of_input_tensors != NUM_INPUT_UNARY_TENSORS) {
+            input2[ops_type] = test_input_scale_2[ops_type];
+            elements_B = NUM_ELEMENTS_SCALE;
+        }
+        result_data[ops_type] = test_result_scale[ops_type];
+    } else {
+        input1[ops_type] = test_input_1[ops_type];
+        elements_A = NUM_ELEMENTS;
+        if (num_of_input_tensors != NUM_INPUT_UNARY_TENSORS) {
+            input2[ops_type] = test_input_2[ops_type];
+            elements_B = NUM_ELEMENTS;
+        }
+        result_data[ops_type] = test_result[ops_type];
+    }
+
+    if (!load_model(model, input1[ops_type], input2[ops_type], GGML_TYPE_F32, elements_A, elements_B)) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+    // the tsavorite backend init sets the debug level to none, so we override it here
+    ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_DEBUG;
+
+    ggml_gallocr_t allocr = NULL;
+
+    allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+
+    if (!allocr) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+
+    // create the worst case graph for memory usage estimation
+    struct ggml_cgraph * gf = build_graph(model, ops_type);
+    if (!gf) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+    ggml_gallocr_reserve(allocr, gf);
+    size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
+
+    fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
+
+    // perform computation
+    struct ggml_tensor * result = compute(model, allocr, ops_type);
+    if (!result) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+    fprintf(stderr, "\n Compute Done \n");
+
+    std::vector<float> out_data(ggml_nelements(result));
+
+    // bring the data from the backend memory
+    ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result));
+
+    // compare with the expected result
+    fprintf(stderr, "\n operation type: %d, num of elements %d \n", ops_type, (int) result->ne[0]);
+
+    for (int i = 0; i < result->ne[0] /* cols */; i++) {
+        if (ggml_tsi_compare_two_float(out_data[i], result_data[ops_type][i])) {
+            continue;
+        }
+        test_case_flag = false;
+        fprintf(stderr, "\n result for index %d is not matching: expected %f, got %f \n", i, result_data[ops_type][i], out_data[i]);
+    }
+
+    if (test_case_flag == false) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+    fprintf(stderr, "\n\n TEST CASE PASSED \n\n");
+
+    // free memory
+    ggml_free(model.ctx);
+
+    // release backend memory and free backend
+    //ggml_backend_buffer_free(model.buffer);
+    ggml_backend_free(model.backend);
+    return 0;
+}
diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel
new file mode 160000
index 0000000000000..f7a3ac1ee334c
--- /dev/null
+++ b/ggml-tsi-kernel
@@ -0,0 +1 @@
+Subproject commit f7a3ac1ee334c242958ccb2053ecc4854822d87e
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 4746d5cb76c08..93a72d6cc84e4 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -267,6 +267,7 @@ set(GGML_PUBLIC_HEADERS
    include/ggml-kompute.h
    include/ggml-opt.h
    include/ggml-metal.h
+    include/ggml-tsavorite.h
    include/ggml-rpc.h
    include/ggml-sycl.h
    include/ggml-vulkan.h
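A quick sanity check on the test dimensions: NUM_ELEMENTS_SCALE is 32*4 + 25 = 153, which exceeds TSAVORITE_KERNEL_SIZE (64, defined in the header that follows), so a "scale" tensor cannot be processed in a single kernel launch. The backend's statistics fields (num_of_tensor_spilt, later in ggml-tsavorite.cpp) suggest such tensors are split across launches; under an assumed ceil-divide split policy, the arithmetic works out as follows:

#include <cstdio>

int main() {
    const int kernel_elems = 64;         // TSAVORITE_KERNEL_SIZE from ggml-tsavorite.h
    const int tensor_elems = 32*4 + 25;  // NUM_ELEMENTS_SCALE = 153
    int launches = (tensor_elems + kernel_elems - 1) / kernel_elems; // ceil(153/64)
    printf("%d launches: 64 + 64 + 25 elements\n", launches);        // prints 3
    return 0;
}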
diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h
new file mode 100644
index 0000000000000..cd380ddf61ed3
--- /dev/null
+++ b/ggml/include/ggml-tsavorite.h
@@ -0,0 +1,189 @@
+// ------------------------------------------------------------------------------
+// Copyright (c) 2023 Tsavorite Scalable Intelligence, Inc. All rights reserved.
+//
+//
+// This file is the confidential and proprietary property of
+// Tsavorite Scalable Intelligence, Inc
+//
+// Possession or use of this file requires a written license from
+// Tsavorite Scalable Intelligence, Inc
+
+/******************************************************************************
+ * File: ggml-tsavorite.h
+ * Author TSI Inc
+ *
+ * Description:
+ * ***************************************************************************/
+
+//
+//
+// Note: this description is outdated
+//
+// An interface for computing a ggml_cgraph on Tsavorite hardware
+//
+// This is a fully functional interface that extends ggml with hardware-accelerator support for
+// Tsavorite devices. A similar interface can be created for other GPU backends (e.g. Vulkan,
+// CUDA, etc.)
+//
+// How does it work?
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the accelerator. Instead of using ggml_graph_compute(),
+// you use ggml_tsavorite_graph_compute()
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device unified memory with the ggml_tsavorite_add_buffer() function. This
+// mapping is used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_tsavorite_set_tensor() and ggml_tsavorite_get_tensor() functions.
+//
+
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#include "TestModel.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TSAVORITE_KERNEL_SIZE 64
+#define TSAVORITE_DEVICE_MAX_BUF_LEN (1024 * 1024 * 128)
+
+enum ggml_tsavorite_input_tensors_count {
+    TSAVORITE_UNARY_INPUT_TENSORS = 1,
+    TSAVORITE_TWO_INPUT_TENSORS = 2
+};
+
+enum ggml_tsavorite_log_type {
+    GGML_TSAVORITE_LOG_NONE,
+    GGML_TSAVORITE_LOG_CONT,
+    GGML_TSAVORITE_LOG_ERROR,
+    GGML_TSAVORITE_LOG_WARN,
+    GGML_TSAVORITE_LOG_DEBUG,
+    GGML_TSAVORITE_LOG_INFO,
+    GGML_TSAVORITE_LOG_ALL
+};
+
+enum ggml_tsavorite_kernel_mode {
+    GGML_TSAVORITE_KERNEL_MODE_CPU,
+    GGML_TSAVORITE_KERNEL_MODE_MLIR
+};
+
+enum ggml_tsavorite_kernel_mode ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR;
+enum ggml_tsavorite_log_type ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ALL;
+#define GGML_TSAVORITE_LOG_INFO(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_INFO) { \
+            ggml_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__); \
+        } \
+    } while (0)
+#define GGML_TSAVORITE_LOG_DEBUG(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_DEBUG) { \
+            ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__); \
+        } \
+    } while (0)
+#define GGML_TSAVORITE_LOG_WARN(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_WARN) { \
+            ggml_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__); \
+        } \
+    } while (0)
+#define GGML_TSAVORITE_LOG_ERROR(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_ERROR) { \
+            ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__); \
+        } \
+    } while (0)
+#define GGML_TSAVORITE_LOG_CONT(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_CONT) { \
+            ggml_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__); \
+        } \
+    } while (0)
+
+enum ggml_tsavorite_tensor_data_type {
+    GGML_TSAVORITE_TENSOR_HEADER,
+    GGML_TSAVORITE_TENSOR_LEAF1,
+    GGML_TSAVORITE_TENSOR_LEAF2,
+    GGML_TSAVORITE_TENSOR_NODE,
+    GGML_TSAVORITE_TENSOR_END_DATA
+};
+
+enum ggml_tsavorite_kernel_type {
+    GGML_TSAVORITE_KERNEL_TYPE_ADD,
+    GGML_TSAVORITE_KERNEL_TYPE_SUB,
+    GGML_TSAVORITE_KERNEL_TYPE_MULT,
+    GGML_TSAVORITE_KERNEL_TYPE_DIV,
+    GGML_TSAVORITE_KERNEL_TYPE_SQRT,
+    GGML_TSAVORITE_KERNEL_TYPE_NEG,
+    GGML_TSAVORITE_KERNEL_TYPE_ABS,
+    GGML_TSAVORITE_KERNEL_TYPE_SIN,
+    GGML_TSAVORITE_KERNEL_TYPE_SIGMOID,
+
+    GGML_TSAVORITE_KERNEL_TYPE_COUNT
+};
+
+// max memory buffers that can be mapped to the device
+#define GGML_TSAVORITE_MAX_BUFFERS 64
+
+// max number of command buffers used to submit a graph for processing
+#define GGML_TSAVORITE_MAX_COMMAND_BUFFERS 8
+#define tsi_nil 0
+#define TSI_UNUSED(x) (void)(x)
+
+typedef struct tensor_log_ {
+    uint32_t leaf1_len;
+    uint32_t leaf2_len;
+    uint32_t node_len;
+    enum ggml_tsavorite_tensor_data_type data_type;
+    enum ggml_tsavorite_kernel_type kernel_type;
+    uint64_t num_of_op;
+    FILE *log_file;
+    const ggml_tensor *tensor;
+} tensor_log;
+
+extern void _mlir_ciface_txe_add(void *a, void *b, void *res);
+extern void _mlir_ciface_txe_sub(void *a, void *b, void *res);
+extern void _mlir_ciface_txe_mult(void *a, void *b, void *res);
+extern void _mlir_ciface_txe_div(void *a, void *b, void *res);
+extern void _mlir_ciface_txe_sqrt(void *a, void *res);
+extern void _mlir_ciface_txe_neg(void *a, void *res);
+extern void _mlir_ciface_txe_abs(void *a, void *res);
+extern void _mlir_ciface_txe_sin(void *a, void *res);
+extern void _mlir_ciface_txe_sigmoid(void *a, void *res);
+extern void ggml_tsi_log_tensor_data(tensor_log log_data);
+
+#define NUM_OF_TXES 1
+#define MEM_REF_DESCRIPTOR_RANK 1
+
+//
+// backend API
+// user code should use only these functions
+//
+
+GGML_BACKEND_API ggml_backend_t ggml_backend_tsavorite_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_tsavorite(ggml_backend_t backend);
+
+GGML_BACKEND_API void ggml_backend_tsavorite_set_abort_callback(ggml_backend_t backend,
+                                                                ggml_abort_callback abort_callback,
+                                                                void *user_data);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_tsavorite_buffer_type(void);
+
+// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
+GGML_BACKEND_API void ggml_backend_tsavorite_capture_next_compute(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_tsavorite_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
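The tensor_log struct and ggml_tsi_log_tensor_data() declared above form the header's tracing hook. A sketch of how a call site might drive it for one ADD node (field values are illustrative; `node` is assumed to be a valid ggml_tensor *, and tsi_op_log_file is the FILE * opened by tsi_log_setup() in ggml-tsavorite.cpp below):

tensor_log log_data = {};
log_data.leaf1_len   = 32;                             // elements in src0
log_data.leaf2_len   = 32;                             // elements in src1
log_data.node_len    = 32;                             // elements in the result
log_data.kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD;
log_data.num_of_op   = 1;
log_data.log_file    = tsi_op_log_file;
log_data.tensor      = node;

log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;     // banner for this op first
ggml_tsi_log_tensor_data(log_data);
log_data.data_type = GGML_TSAVORITE_TENSOR_NODE;       // then dump the node data
ggml_tsi_log_tensor_data(log_data);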
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ddea5ad3891e5..0a14bbb74ced7 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -308,6 +308,7 @@ ggml_add_backend(CUDA)
ggml_add_backend(HIP)
ggml_add_backend(Kompute)
ggml_add_backend(METAL)
+ggml_add_backend(TSAVORITE)
ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31514b5..f48a23bf83151 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -37,6 +37,10 @@
#include "ggml-metal.h"
#endif

+#ifdef GGML_USE_TSAVORITE
+#include "ggml-tsavorite.h"
+#endif
+
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
@@ -166,6 +170,11 @@ struct ggml_backend_registry {
#ifdef GGML_USE_METAL
        register_backend(ggml_backend_metal_reg());
#endif
+
+#ifdef GGML_USE_TSAVORITE
+        register_backend(ggml_backend_tsavorite_reg());
+#endif
+
#ifdef GGML_USE_SYCL
        register_backend(ggml_backend_sycl_reg());
#endif
@@ -572,6 +581,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    ggml_backend_load_best("hip", silent, dir_path);
    ggml_backend_load_best("kompute", silent, dir_path);
    ggml_backend_load_best("metal", silent, dir_path);
+    ggml_backend_load_best("tsavorite", silent, dir_path);
    ggml_backend_load_best("rpc", silent, dir_path);
    ggml_backend_load_best("sycl", silent, dir_path);
    ggml_backend_load_best("vulkan", silent, dir_path);
diff --git a/ggml/src/ggml-tsavorite/CMakeLists.txt b/ggml/src/ggml-tsavorite/CMakeLists.txt
new file mode 100644
index 0000000000000..f58331fd68d30
--- /dev/null
+++ b/ggml/src/ggml-tsavorite/CMakeLists.txt
@@ -0,0 +1,8 @@
+message(STATUS "Tsavorite framework is found")
+#
+# tsavorite Kernel Library
+ggml_add_backend_library(ggml-tsavorite
+                         ggml-tsavorite.cpp
+                        )
+
+target_link_libraries(ggml-tsavorite PRIVATE ${TLIBS} dl rt)
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
new file mode 100644
index 0000000000000..7939a0f8cfa13
--- /dev/null
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -0,0 +1,1887 @@
+// ------------------------------------------------------------------------------
+// Copyright (c) 2023 Tsavorite Scalable Intelligence, Inc. All rights reserved.
+//
+//
+// This file is the confidential and proprietary property of
+// Tsavorite Scalable Intelligence, Inc
+//
+// Possession or use of this file requires a written license from
+// Tsavorite Scalable Intelligence, Inc
+
+/******************************************************************************
+ * File: ggml-tsavorite.cpp
+ * Author TSI Inc
+ *
+ * Description:
+ * ***************************************************************************/
+
+#include "ggml-tsavorite.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+#include "ggml.h"
+
+typedef struct _txe_device_t *txe_device_s;
+typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s;
+FILE *tsi_op_log_file;
+uint64_t num_of_op;
+
+#ifdef USE_COMMAND_BUFFERS
+typedef struct _txe_command_queue_t *txe_command_queue_s;
+typedef struct _txe_dispatch_queue_t *txe_dispatch_queue_s;
+typedef struct _txe_command_buffer_t *txe_command_buffer_s;
+#endif /* USE_COMMAND_BUFFERS */
+typedef struct ggml_backend_tsavorite_buffer ggml_backend_tsavorite_buffer_s;
+
+struct _txe_device_t {
+    char name[100];
+    uint32_t max_buf_len;
+    size_t recommended_max_working_set_size;
+    size_t current_allocated_size;
+    int reserved;
+    struct _stats {
+        struct _op_run_count {
+            // Each kernel operation belongs to one tensor.
Below count will increment for each Node Tensor + uint64_t total_tensor_count; + // This counter increment whenever kernel call are made + uint64_t num_of_kernel_call; + // below field count all tensors whose num of elements are larger than kernel number of + // elements + uint64_t num_of_tensor_spilt; + // For Any application below field maintain smallest tensor num of elem + uint64_t min_num_of_elem; + // For Any application below field maintain largest tensor num of elem + uint64_t max_num_of_elem; + } op_run_count[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + } stats; +}; + +struct _txe_compute_pipeline_state_t { + void (*_mlir_fptr_2_input)(void *, void *, void *); + void (*_mlir_fptr_1_input)(void *, void *); + std::string kernel_name; + int reserved; +}; + +#ifdef USE_COMMAND_BUFFERS +struct _txe_command_queue_t { + int reserved; +}; + +struct _txe_dispatch_queue_t { + int reserved; +}; + +struct _txe_command_buffer_t { + int reserved; +}; +#endif /* USE_COMMAND_BUFFERS */ + +static txe_device_s tsi_system_default_device_create(); + +// kernels + +struct ggml_tsavorite_kernel { + txe_compute_pipeline_state_s pipeline; +}; + +struct ggml_backend_tsavorite_context { +#ifdef USE_COMMAND_BUFFERS + txe_command_queue_s queue; + + txe_dispatch_queue_s d_queue; +#endif /* USE_COMMAND_BUFFERS */ + + struct ggml_tsavorite_kernel kernels[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + + // capture state + bool capture_next_compute; + bool capture_started; + + // command buffer state + int n_cb; // number of extra threads used to submit the command buffers + int n_nodes_0; // number of nodes submitted by the main thread + int n_nodes_1; // remaining number of nodes submitted by the n_cb threads + int n_nodes_per_cb; + + struct ggml_cgraph *gf; + + // the callback given to the thread pool + // void (^encode_async)(size_t ith); + +#ifdef USE_COMMAND_BUFFERS + // n_cb command buffers + 1 used by the main thread + txe_command_buffer_s command_buffers[GGML_TSAVORITE_MAX_COMMAND_BUFFERS + 1]; +#endif /* USE_COMMAND_BUFFERS */ + + // abort ggml_tsavorite_graph_compute if callback returns true + ggml_abort_callback abort_callback; + void *abort_callback_data; + + // picking CPU compute example + int n_threads; + ggml_threadpool_t threadpool; + + uint8_t *work_data; + size_t work_size; +}; + +// global + +// initialized in ggml_backend_tsavorite_reg +static struct ggml_backend_reg g_ggml_backend_tsavorite_reg; +static struct ggml_backend_device g_ggml_backend_tsavorite_device; + +// information about a tSavorite device +// note: assumes single GPU device - the default one +// Need to Add Support for multiple GPU devices +static struct ggml_backend_tsavorite_device_context { + txe_device_s device; + int ref_count; + + char name[128]; +} g_ggml_ctx_dev_main = { + /*.device =*/tsi_nil, + /*.ref_count =*/0, + /*.name =*/"", +}; + +// temporarily defined here for compatibility between ggml-backend and the old API + +struct ggml_backend_tsavorite_buffer { + void *data; + size_t size; +}; + +struct ggml_backend_tsavorite_buffer_context { + void *all_data; + size_t all_size; + bool owned; + + // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap + int n_buffers; + ggml_backend_tsavorite_buffer_s buffers[GGML_TSAVORITE_MAX_BUFFERS]; +}; + +static txe_device_s tsi_system_default_device_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = (txe_device_s)malloc(sizeof(struct _txe_device_t)); + device->max_buf_len = TSAVORITE_DEVICE_MAX_BUF_LEN; + 
device->recommended_max_working_set_size = TSAVORITE_DEVICE_MAX_BUF_LEN; + device->current_allocated_size = 0; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return device; +} + +static void tsi_device_free(txe_device_s device) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + free(device); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +#ifdef USE_COMMAND_BUFFERS +static txe_command_queue_s tsi_command_queue_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_command_queue_s cqueue = (txe_command_queue_s)malloc(sizeof(struct _txe_command_queue_t)); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return cqueue; +} + +static txe_dispatch_queue_s tsi_dispatch_queue_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_dispatch_queue_s dqueue = (txe_dispatch_queue_s)malloc(sizeof(struct _txe_dispatch_queue_t)); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return dqueue; +} + +static void tsi_command_queue_free(txe_command_queue_s cqueue) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (cqueue) + free(cqueue); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +static void tsi_dispatch_queue_free(txe_dispatch_queue_s dqueue) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (dqueue) + free(dqueue); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} +#endif /* USE_COMMAND_BUFFERS */ + +static void tsi_buffer_free(void *data) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (data) + free(data); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +static bool tsi_log_setup() { + tsi_op_log_file = fopen("tsi-op.txt", "w+"); + if (tsi_op_log_file == NULL) { + printf("Error Creating or opening log file\n"); + return false; + } + return true; +} + +void ggml_tsi_log_tensor_data(tensor_log log_data) { + if (!log_data.log_file) { + GGML_TSAVORITE_LOG_ERROR("%s: error: log file Cant be NULL\n", __func__); + return; + } + + switch (log_data.data_type) { + case GGML_TSAVORITE_TENSOR_HEADER: + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "#############################################################\n"); + fprintf(log_data.log_file, + "Tensor Number %ld and Type %d \n leaf1 len %d, leaf2 len %d, Node len %d\n", + log_data.num_of_op, log_data.kernel_type, log_data.leaf1_len, log_data.leaf2_len, + log_data.node_len); + fprintf(log_data.log_file, "############################################################\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + case GGML_TSAVORITE_TENSOR_LEAF1: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "leaf1 Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_LEAF2: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "leaf2 Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_NODE: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "Node Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_END_DATA: + fprintf(log_data.log_file, "DONE WITH THIS OPERATION %ld\n", log_data.num_of_op); + fprintf(log_data.log_file, "############################################################\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + default: + GGML_TSAVORITE_LOG_ERROR("%s: error: Invalid Data Type Passed\n", __func__); + return; + } + if (!log_data.tensor) { + 
GGML_TSAVORITE_LOG_ERROR("%s: error: tensor pointer is NULL\n", __func__); + return; + } + float *p; + int64_t count = (log_data.tensor->ne[0]) * (log_data.tensor->ne[1]) * (log_data.tensor->ne[2]) * + (log_data.tensor->ne[3]); + p = (float *)log_data.tensor->data; + if ((!p) || (count == 0)) { + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "Tensor Data is Empty"); + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + } + fprintf(tsi_op_log_file, "%.16f ", p[0]); + for (int64_t ii = 1; ii < count; ++ii) { + if (!(ii % 4)) + fprintf(log_data.log_file, "\n"); + fprintf(log_data.log_file, "%.16f ", p[ii]); + } + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fflush(log_data.log_file); + return; +} + +static void ggml_tsavorite_disp_stats(struct ggml_backend_tsavorite_context *ctx, + txe_device_s device) { + if (!ctx || !device) { + GGML_TSAVORITE_LOG_ERROR( + "At %s Either backend context or device or both are NULL, hence cant display Stats", + __func__); + return; + } + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + if (!ctx->kernels[i].pipeline) + continue; + GGML_TSAVORITE_LOG_CONT( + "\n %s Operation, total tensor: %lu Number of Kernel Call: %lu Number of tensor got " + "spilt: %lu Min Num of Elem %lu Max Num of Elem %lu \n", + ctx->kernels[i].pipeline->kernel_name.c_str(), + device->stats.op_run_count[i].total_tensor_count, + device->stats.op_run_count[i].num_of_kernel_call, + device->stats.op_run_count[i].num_of_tensor_spilt, + device->stats.op_run_count[i].min_num_of_elem, + device->stats.op_run_count[i].max_num_of_elem); + } + return; +} + +static void _mlir_ciface_txe_add_test (void *src0, void *src1, void *res) +{ + // MemRefDescriptor + if (!src0 || !src1 || !res) + return; + + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + srcP0 = (MemRefDescriptor *)src0; + srcP1 = (MemRefDescriptor *)src1; + nodeP = (MemRefDescriptor *)res; + + uint32_t count = srcP0->shape[Rank - 1]; + float *s0 = (float*)srcP0->data; + float *s1 = (float*)srcP1->data; + float *n = (float*)nodeP->data; + + for(uint32_t i=0; i < count; ++i) + n[i] = s0[i] + s1[i]; + //printf("\n Calling mlir_add cpu function-5 \n"); + return; +} + +static void _mlir_ciface_txe_mult_test (void *src0, void *src1, void *res) +{ + // MemRefDescriptor + if (!src0 || !src1 || !res) + return; + + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + srcP0 = (MemRefDescriptor *)src0; + srcP1 = (MemRefDescriptor *)src1; + nodeP = (MemRefDescriptor *)res; + + uint32_t count = srcP0->shape[Rank - 1]; + float *s0 = (float*)srcP0->data; + float *s1 = (float*)srcP1->data; + float *n = (float*)nodeP->data; + + for(uint32_t i=0; i < count; ++i) + n[i] = s0[i]*s1[i]; + return; +} + +static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_type kernel_type) { + txe_compute_pipeline_state_s kernel_pipeline = + (txe_compute_pipeline_state_s)calloc(1, sizeof(struct _txe_compute_pipeline_state_t)); + bool flag = false; + if (!kernel_pipeline) { + GGML_TSAVORITE_LOG_ERROR("Calloc failing while setting up kernel"); + return NULL; + } + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + switch (kernel_type) { + case GGML_TSAVORITE_KERNEL_TYPE_ADD: + if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) + 
kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add_test; + else + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add; + kernel_pipeline->kernel_name = "TXE_ADD"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SUB: + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_sub; + kernel_pipeline->kernel_name = "TXE_SUB"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_MULT: + if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult_test; + else + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult; + kernel_pipeline->kernel_name = "TXE_MULT"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_DIV: + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_div; + kernel_pipeline->kernel_name = "TXE_DIV"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SQRT: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sqrt; + kernel_pipeline->kernel_name = "TXE_SQRT"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_NEG: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_neg; + kernel_pipeline->kernel_name = "TXE_NEG"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_ABS: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_abs; + kernel_pipeline->kernel_name = "TXE_ABS"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SIN: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sin; + kernel_pipeline->kernel_name = "TXE_SIN"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sigmoid; + kernel_pipeline->kernel_name = "TXE_SIGMOID"; + flag = true; + break; + default: + break; + } + if (!flag) { + GGML_TSAVORITE_LOG_INFO("Kernel %d not supported \n", kernel_type); + if (kernel_pipeline) { + free(kernel_pipeline); + kernel_pipeline = NULL; + } + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return kernel_pipeline; +} + +static void tsi_kernel_release(txe_compute_pipeline_state_s kernel_pipeline) { + // clear kernel_pipeline + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (kernel_pipeline) { + free(kernel_pipeline); + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +// acquire +static txe_device_s +ggml_backend_tsavorite_device_acq(struct ggml_backend_tsavorite_device_context *ctx) { + assert(ctx != NULL); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + if (ctx->device == tsi_nil) { + ctx->device = tsi_system_default_device_create(); + snprintf(ctx->name, sizeof("txe"), "txe"); + } + + ctx->ref_count++; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ctx->device; +} + +// release +static void ggml_backend_tsavorite_device_rel(struct ggml_backend_tsavorite_device_context *ctx) { + assert(ctx != NULL); + assert(ctx->ref_count > 0); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + ctx->ref_count--; + + // Need to define function txe_device_free + if (ctx->ref_count == 0) { + tsi_device_free(ctx->device); + ctx->device = tsi_nil; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +// We will use Unified Memory this memory is used for buffer +static void *ggml_tsavorite_host_malloc(size_t n) { + void *data = NULL; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + GGML_TSAVORITE_LOG_INFO("\n Allocating memory from tsi_alloc with size %ld \n", n); + printf("\n ANoop Allocating memory from tsi_alloc with size %ld \n", n); + data = tsi_alloc(n); + GGML_TSAVORITE_LOG_CONT("\n Allocating 
memory from tsi_alloc with size %ld starting memory %p\n", + n, data); + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return data; +} + +static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_dev_t dev) { + GGML_TSAVORITE_LOG_INFO("%s: Start\n", __func__); + // Open a file named "tsi-op.txt" in the current directory for writing + num_of_op = 0; + + if (tsi_log_setup() == false) + return NULL; + + // TSI Run time Initalization + tsi_initialize(NUM_OF_TXES); + + // init context + struct ggml_backend_tsavorite_context *ctx = (struct ggml_backend_tsavorite_context *)calloc( + 1, sizeof(struct ggml_backend_tsavorite_context)); + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)dev->context; + + // setup the devie context + txe_device_s device = ggml_backend_tsavorite_device_acq(ctx_dev); + GGML_TSAVORITE_LOG_INFO("%s: picking default device: %s\n", __func__, device->name); + for (uint32_t op = GGML_TSAVORITE_KERNEL_TYPE_ADD; op < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++op) { + device->stats.op_run_count[op].total_tensor_count = 0; + device->stats.op_run_count[op].num_of_kernel_call = 0; + device->stats.op_run_count[op].num_of_tensor_spilt = 0; + device->stats.op_run_count[op].min_num_of_elem = 0; + device->stats.op_run_count[op].max_num_of_elem = 0; + } + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; + ctx->work_data = NULL; + ctx->work_size = 0; + ctx->abort_callback = NULL; + ctx->abort_callback_data = NULL; + + // We dont need it for now, we will revisit +#ifdef USE_COMMAND_BUFFERS + // setting up backend context + ctx->queue = tsi_command_queue_create(); + ctx->d_queue = tsi_dispatch_queue_create(); +#endif /* USE_COMMAND_BUFFERS */ + + ctx->capture_next_compute = false; + ctx->capture_started = false; + + ctx->gf = tsi_nil; + // ctx->encode_async = tsi_nil; + +#ifdef USE_COMMAND_BUFFERS + for (int i = 0; i < GGML_TSAVORITE_MAX_COMMAND_BUFFERS; ++i) { + ctx->command_buffers[i] = tsi_nil; + } +#endif /* USE_COMMAND_BUFFERS */ + + // load TSavorite kernels + { + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + ctx->kernels[i].pipeline = tsi_nil; + } + +#define GGML_TSAVORITE_KERNEL(e, supported) \ + if (supported) { \ + ctx->kernels[e].pipeline = tsi_kernel_setup(e); \ + GGML_TSAVORITE_LOG_INFO(" TSAVORITE SUPPORTED KERNEL "); \ + } else { \ + GGML_TSAVORITE_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_" #e); \ + } + + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ADD, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SUB, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_MULT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_DIV, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SQRT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_NEG, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ABS, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIN, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, true); + } + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ctx; +} + +static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + if (ctx->kernels[i].pipeline) { + tsi_kernel_release(ctx->kernels[i].pipeline); + ctx->kernels[i].pipeline = tsi_nil; + } + } + + // Block_release(ctx->encode_async); + // +#ifdef USE_COMMAND_BUFFERS + 
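+    // Teardown mirrors ggml_tsavorite_init(): the kernel pipelines were released
+    // above, the optional command/dispatch queues are freed here, and
+    // tsi_finalize() below shuts the runtime down last.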
tsi_command_queue_free(ctx->queue); + + tsi_dispatch_queue_free(ctx->d_queue); +#endif /* USE_COMMAND_BUFFERS */ + + free(ctx); + + // TSI run time free + GGML_TSAVORITE_LOG_INFO("\n Calling tsi_finalize \n"); + // delay to allow any file operations to complete for runtime + + GGML_TSAVORITE_LOG_INFO("Delaying tsi_finalize for 2 sec"); + sleep(2); + tsi_finalize(); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +#if 0 +// finds the tSavorite buffer that contains the tensor data on the TXE device unified memory +// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the +// tSavorite buffer based on the host memory pointer +// +static ggml_backend_tsavorite_buffer_s ggml_tsavorite_get_buffer(struct ggml_tensor * t, size_t * offs) { + // GGML_TSAVORITE_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + const int64_t tsize = ggml_nbytes(t); + + ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; + + struct ggml_backend_tsavorite_buffer_context * buf_ctx = (struct ggml_backend_tsavorite_buffer_context *) buffer->context; + + // find the view that contains the tensor fully + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; + + // GGML_TSAVORITE_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { + *offs = (size_t) ioffs; + + // GGML_TSAVORITE_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return buf_ctx->buffers[i]; + } + } + + GGML_TSAVORITE_LOG_ERROR("%s: error: tensor '%s' buffer is tsi_nil\n", __func__, t->name); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return tsi_nil; +} +#endif + +static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_device_context *ctx_dev, + const struct ggml_tensor *op) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (!ctx_dev) + return false; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + for (size_t i = 0, n = 3; i < n; ++i) { + if (op->src[i] != NULL && op->src[i]->type != GGML_TYPE_F32) { + return false; + } + } + + if (op->type != GGML_TYPE_F32) + return false; + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQRT: + case GGML_OP_SIN: + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SIGMOID: + break; + default: + return false; + } + break; + default: + return false; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return true; +} + +/* +static void ggml_tsavorite_encode_node( + ggml_backend_t backend, + int idx, + tsi_command_encoder encoder) { +} +*/ + +static void ggml_tsavorite_decompose_unary_kernel_sin(uint32_t num_elem, ggml_tensor *src) { + float *p = (float *)(src->data); + for (uint32_t i = 0; i < num_elem; ++i) { + *p = (*p) / (2 * M_PI); + ++p; + } + return; +} + +static void ggml_tsavorite_decompose_unary_kernel(uint32_t num_elem, ggml_tensor *src, + ggml_tensor *node) { + switch (node->op) { + case GGML_OP_SIN: + ggml_tsavorite_decompose_unary_kernel_sin(num_elem, src); 
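+        // Assumption behind this pre-pass (added note): the TXE sin kernel appears
+        // to take its argument in units of full turns rather than radians, so the
+        // input is scaled in place by 1/(2*pi), i.e. conceptually
+        // sin(x) == txe_sin(x / (2*pi)).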
+        break;
+    default:
+        break;
+    }
+    return;
+}
+
+// Nodes are intermediate tensors that carry source tensors and an operation.
+// Eventually multiple threads will be created here: each thread runs a command
+// buffer, picks a tensor, executes it, and returns the result; based on async or
+// sync mode, the compute finishes once all tensors have executed.
+static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
+                                                     struct ggml_cgraph *cgraph) {
+#if 0
+    GGML_LOG_INFO("Start %s\n", __func__);
+    struct ggml_backend_tsavorite_context * ctx = backend->context;
+    struct ggml_backend_tsavorite_device_context * ctx_dev = backend->device->context;
+
+    // number of nodes encoded by the main thread (empirically determined)
+    const int n_main = 128;
+
+    // number of threads in addition to the main thread
+    const int n_cb = ctx->n_cb;
+
+    // submit the ggml compute graph to the TXE by creating command buffers and encoding the ops in them
+    // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
+    // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
+    // each thread creates its own command buffer and enqueues the ops in parallel
+
+    GGML_LOG_INFO("End %s\n", __func__);
+    return GGML_STATUS_SUCCESS;
+#endif
+
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+    if (!ctx) {
+        GGML_LOG_ERROR("\n backend ctx is NULL \n");
+        return GGML_STATUS_FAILED;
+    }
+
+#if 0
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, ctx->n_threads, ctx->threadpool);
+
+    if (ctx->work_size < cplan.work_size) {
+        delete[] ctx->work_data;
+        ctx->work_data = new uint8_t[cplan.work_size];
+        if (ctx->work_data == NULL) {
+            ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+        ctx->work_size = cplan.work_size;
+    }
+    cplan.work_data = (uint8_t *)ctx->work_data;
+
+    cplan.abort_callback      = ctx->abort_callback;
+    cplan.abort_callback_data = ctx->abort_callback_data;
+#endif
+
+    txe_device_s device = ggml_backend_tsavorite_device_acq(
+        (struct ggml_backend_tsavorite_device_context *)backend->device->context);
+
+    if (!device) {
+        GGML_TSAVORITE_LOG_ERROR("\n tsavorite device is NULL \n");
+        return GGML_STATUS_FAILED;
+    }
+    // MemRefDescriptor
+    const int Rank = MEM_REF_DESCRIPTOR_RANK;
+    MemRefDescriptor *srcP0, *srcP1, *nodeP;
+    struct ggml_tensor *src0, *src1, *node;
+    uint32_t num_elem_src0, num_elem_src1, num_elem_node;
+    enum ggml_tsavorite_kernel_type kernel_type;
+    // Strictly these variables are redundant - src0 or node carries the maximum
+    // element count and src1 the minimum - but they are kept for the stats below.
+    uint64_t max_num_of_elem, min_num_of_elem;
+    enum ggml_tsavorite_input_tensors_count num_of_input_tensors;
+    tensor_log log_data;
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        node = cgraph->nodes[i];
+        src0 = node->src[0];
+        src1 = node->src[1];
+        min_num_of_elem = 0;
+        max_num_of_elem = 0;
+
+        switch (node->op) {
+        case GGML_OP_ADD:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD;
+            num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+            break;
+        case GGML_OP_SUB:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SUB;
+            num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+            break;
+        case GGML_OP_MUL:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_MULT;
+            num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+            break;
+        case GGML_OP_DIV:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_DIV;
+            num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+            break;
+        case GGML_OP_SQRT:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SQRT;
+            num_of_input_tensors =
TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_OP_SIN: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIN; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_NEG: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_NEG; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_UNARY_OP_ABS: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ABS; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_UNARY_OP_SIGMOID: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIGMOID; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + default: + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + break; + default: + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + + if (!ctx->kernels[kernel_type].pipeline || + (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input && + !ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input)) { + GGML_TSAVORITE_LOG_ERROR("Kernel Type %d, not supported \n", kernel_type); + return GGML_STATUS_ABORTED; + } + ++num_of_op; + + if (num_of_input_tensors == TSAVORITE_TWO_INPUT_TENSORS) { + if (node->src[0] && node->src[1]) { + if (!src0->data || !src1->data || !node->data) { + GGML_TSAVORITE_LOG_ERROR( + "One of tensor Data doesnt have memory leaf1 %p, leaf2 %p, node %p \n", src0->data, + src1->data, node->data); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + srcP0 = (MemRefDescriptor *)src0->data; + srcP1 = (MemRefDescriptor *)src1->data; + nodeP = (MemRefDescriptor *)node->data; + // This is for tsavorite MemRef Header hence getting header + --srcP0; + --srcP1; + --nodeP; + srcP0->data = srcP0->base = src0->data; + srcP1->data = srcP1->base = src1->data; + nodeP->data = nodeP->base = node->data; + // offset & shape size will be update base on Tensor Size + // TSAVORITE KERNEL CAN Take max of TSAVORITE_KERNEL_SIZE + // Hence we need to load tensor data at multiple iteration + // for large Tensor Dataset + srcP0->offset = 0; + srcP1->offset = 0; + nodeP->offset = 0; + + // currently _mlir_ as restriction to hold max of 64 elements, we need to spilt the work if + // its more than 64, i will address this at future PR Initalizing num_elem + num_elem_src0 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src0->nb[i] != 0; ++i) + num_elem_src0 *= src0->ne[i]; + + num_elem_src1 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src1->nb[i] != 0; ++i) + num_elem_src1 *= src1->ne[i]; + + num_elem_node = 1; + for (int i = 0; i < GGML_MAX_DIMS && node->nb[i] != 0; ++i) + num_elem_node *= node->ne[i]; + + if (!num_elem_src0 || !num_elem_src1 || !num_elem_node) { + GGML_TSAVORITE_LOG_ERROR("\nOne or more of Tensor length is zero of kernel_type %d\n", + kernel_type); + return GGML_STATUS_ABORTED; + } + + min_num_of_elem = max_num_of_elem = num_elem_src0; + + if (min_num_of_elem > num_elem_src1) + min_num_of_elem = num_elem_src1; + if (min_num_of_elem > num_elem_node) + min_num_of_elem = num_elem_node; + + if (max_num_of_elem < num_elem_src1) + max_num_of_elem = num_elem_src1; + if (max_num_of_elem < num_elem_node) + max_num_of_elem = num_elem_node; + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + bzero((char *)&log_data, sizeof(log_data)); + log_data.leaf1_len 
= num_elem_src0; + log_data.leaf2_len = num_elem_src1; + log_data.node_len = num_elem_node; + log_data.log_file = tsi_op_log_file; + log_data.num_of_op = num_of_op; + log_data.kernel_type = kernel_type; + + log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF1; + log_data.tensor = src0; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF2; + log_data.tensor = src1; + ggml_tsi_log_tensor_data(log_data); + } + + ggml_tensor *dst = node; + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + for (int ir = 0; ir < nr; ++ir) { + const int64_t i03 = ir / (ne02 * ne01); + const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01; + const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float *dst_ptr = (float *)((char *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1); + float *src0_ptr = (float *)((char *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01); + float *src1_ptr = (float *)((char *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11); + + for (int64_t r = 0; r < nr0; ++r) { + // While loop is added to handle the scenario when kernel number of elements + // less than ggml tensor number of elements.GGML tensor number of elements decided + // base on application like llama.cpp. Currently we have build Kernel elements + // statically hence we have MACRO: TSAVORITE_KERNEL_SIZE to track this + int count = 0; + while (count < ne10) { + int kernel_size; + srcP1->data = srcP1->base = (void *)(src1_ptr + count); + srcP0->data = srcP0->base = (void *)(src0_ptr + r * ne10 + count); + nodeP->data = nodeP->base = (void *)(dst_ptr + r * ne10 + count); + if ((count + TSAVORITE_KERNEL_SIZE) > ne10) + kernel_size = ne10 - count; + else + kernel_size = TSAVORITE_KERNEL_SIZE; + count += kernel_size; + srcP0->shape[Rank - 1] = kernel_size; + srcP1->shape[Rank - 1] = kernel_size; + nodeP->shape[Rank - 1] = kernel_size; + srcP0->strides[Rank - 1] = 0; + srcP1->strides[Rank - 1] = 0; + nodeP->strides[Rank - 1] = 0; + // kernel call + ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input(srcP0, srcP1, nodeP); + ++device->stats.op_run_count[kernel_type].num_of_kernel_call; + } + } + } + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + log_data.data_type = GGML_TSAVORITE_TENSOR_NODE; + log_data.tensor = node; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_END_DATA; + log_data.tensor = NULL; + ggml_tsi_log_tensor_data(log_data); + } + } + } + + if (num_of_input_tensors == TSAVORITE_UNARY_INPUT_TENSORS) { + if (node->src[0]) { + if (!src0->data || !node->data) { + GGML_TSAVORITE_LOG_ERROR( + "input or output tensor Data doesnt have memory leaf %p, node %p \n", src0->data, + node->data); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + srcP0 = (MemRefDescriptor *)src0->data; + nodeP = (MemRefDescriptor *)node->data; + // This is for tsavorite MemRef Header hence getting header + --srcP0; + --nodeP; + srcP0->data = srcP0->base = src0->data; + nodeP->data = nodeP->base = node->data; + // offset & shape size will be update base on Tensor Size + // TSAVORITE KERNEL CAN Take max of TSAVORITE_KERNEL_SIZE + // Hence we need to load tensor data at multiple iteration + // for large Tensor 
Dataset
+                srcP0->offset = 0;
+                nodeP->offset = 0;
+
+                // Currently the _mlir_ kernels can hold at most 64 elements at a time,
+                // so the work must be split when a tensor is larger; this will be
+                // addressed in a future PR. Initializing num_elem:
+                num_elem_src0 = 1;
+                for (int i = 0; i < GGML_MAX_DIMS && src0->nb[i] != 0; ++i)
+                    num_elem_src0 *= src0->ne[i];
+                max_num_of_elem = min_num_of_elem = num_elem_src0;
+
+                if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) {
+                    bzero((char *)&log_data, sizeof(log_data));
+                    log_data.leaf1_len = num_elem_src0;
+                    log_data.leaf2_len = 0;
+                    log_data.node_len = num_elem_src0;
+                    log_data.log_file = tsi_op_log_file;
+                    log_data.num_of_op = num_of_op;
+                    log_data.kernel_type = kernel_type;
+
+                    log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
+                    ggml_tsi_log_tensor_data(log_data);
+
+                    log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF1;
+                    log_data.tensor = src0;
+                    ggml_tsi_log_tensor_data(log_data);
+                }
+                // The while loop handles the case where the kernel's element capacity
+                // is smaller than the ggml tensor's element count, which is decided by
+                // the application (e.g. llama.cpp). Kernels are currently built with a
+                // static element capacity, tracked by the macro TSAVORITE_KERNEL_SIZE.
+                uint32_t count = 0;
+
+                if (node->op == GGML_OP_SIN) {
+                    ggml_tsavorite_decompose_unary_kernel(num_elem_src0, src0, node);
+                }
+                while (count < num_elem_src0) {
+                    int kernel_size;
+                    srcP0->data = srcP0->base = (void *)((float *)src0->data + count);
+                    nodeP->data = nodeP->base = (void *)((float *)node->data + count);
+                    if ((count + TSAVORITE_KERNEL_SIZE) > num_elem_src0)
+                        kernel_size = num_elem_src0 - count;
+                    else
+                        kernel_size = TSAVORITE_KERNEL_SIZE;
+                    count += kernel_size;
+                    srcP0->shape[Rank - 1] = kernel_size;
+                    nodeP->shape[Rank - 1] = kernel_size;
+                    srcP0->strides[Rank - 1] = 0;
+                    nodeP->strides[Rank - 1] = 0;
+                    // kernel call
+                    ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input(srcP0, nodeP);
+                    ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+                }
+
+                if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) {
+                    log_data.data_type = GGML_TSAVORITE_TENSOR_NODE;
+                    log_data.tensor = node;
+                    ggml_tsi_log_tensor_data(log_data);
+
+                    log_data.data_type = GGML_TSAVORITE_TENSOR_END_DATA;
+                    log_data.tensor = NULL;
+                    ggml_tsi_log_tensor_data(log_data);
+                }
+            }
+        }
+        if (min_num_of_elem > 0) {
+            ++device->stats.op_run_count[kernel_type].total_tensor_count;
+
+            if (min_num_of_elem > TSAVORITE_KERNEL_SIZE)
+                ++device->stats.op_run_count[kernel_type].num_of_tensor_spilt;
+
+            if (!(device->stats.op_run_count[kernel_type].min_num_of_elem) ||
+                device->stats.op_run_count[kernel_type].min_num_of_elem > min_num_of_elem)
+                device->stats.op_run_count[kernel_type].min_num_of_elem = min_num_of_elem;
+
+            if (!(device->stats.op_run_count[kernel_type].max_num_of_elem) ||
+                device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem)
+                device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem;
+        }
+    }
+
+    // This needs to be implemented correctly once we have a mixture of CPU and
+    // accelerator operations:
+    // return ggml_graph_compute(cgraph, &cplan);
+    ggml_backend_tsavorite_device_rel(
+        (struct ggml_backend_tsavorite_device_context *)backend->device->context);
+    return GGML_STATUS_SUCCESS;
+
+    GGML_UNUSED(backend);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#if 0
+static const char * ggml_backend_tsavorite_buffer_get_name(ggml_backend_buffer_t buffer) {
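+    // Retained for reference only (this block is compiled out): the legacy
+    // per-buffer get_name hook; the active name now comes from
+    // ggml_backend_tsavorite_buffer_type_get_name() further below.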
GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "tSavorite"; + + TSI_UNUSED(buffer); +} +#endif + +static void ggml_backend_tsavorite_buffer_free_buffer(ggml_backend_buffer_t buffer) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + +#if 0 + // ctx->all_data & tsi_buffer_free(ctx->buffers[i].data and same memory and created by tsi_alloc + // tsi_finalize called when ggml call backend free all memory + // this fucntion called when ggml free backend particular buffer, currently we cant provide this support + // and just return NoOps + // But at end there is no memory leak but memory can grow since we free at last once backend is shutdown + // We need to revisit this hence i kept the stuff under if 0 + for (int i = 0; i < ctx->n_buffers; i++) { + tsi_buffer_free(ctx->buffers[i].data); + } + ggml_backend_tsavorite_device_rel((struct ggml_backend_tsavorite_device_context *)buffer->buft->device->context); + + if (ctx->owned) { + free(ctx->all_data); + } +#endif + + free(ctx); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static void *ggml_backend_tsavorite_buffer_get_base(ggml_backend_buffer_t buffer) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ctx->all_data; +} + +static ggml_status ggml_backend_tsavorite_buffer_init_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor tensor_data_header; + tensor->data = (void *)(sizeof(tensor_data_header) + (char *)tensor->data); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return GGML_STATUS_SUCCESS; + + TSI_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor, uint8_t value, + size_t offset, size_t size) { + if (!tensor || !tensor->data) { + GGML_TSAVORITE_LOG_ERROR("\n tensor or data cant be null under func: %s\n", __func__); + return; + } + memset((char *)tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_set_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor, const void *data, + size_t offset, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + memcpy((char *)tensor->data + offset, data, size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + TSI_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_get_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor *tensor, void *data, + size_t offset, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + memcpy(data, (const char *)tensor->data + offset, size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + TSI_UNUSED(buffer); +} + +static bool ggml_backend_tsavorite_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor *src, + struct ggml_tensor *dst) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, (ggml_nbytes(src))); + return true; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return false; + + TSI_UNUSED(buffer); +} + +static void 
ggml_backend_tsavorite_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + if (!ctx || !ctx->all_data) { + GGML_TSAVORITE_LOG_ERROR("\n ctx or all_data cant be null under func: %s\n", __func__); + return; + } + memset((char *)ctx->all_data, value, ctx->all_size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static struct ggml_backend_buffer_i ggml_backend_tsavorite_buffer_i = { + /* .free_buffer = */ ggml_backend_tsavorite_buffer_free_buffer, + /* .get_base = */ ggml_backend_tsavorite_buffer_get_base, + /* .init_tensor = */ ggml_backend_tsavorite_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_tsavorite_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_tsavorite_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_tsavorite_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_tsavorite_buffer_cpy_tensor, + /* .clear = */ ggml_backend_tsavorite_buffer_clear, + /* .reset = */ NULL, +}; + +// default buffer type + +static const char *ggml_backend_tsavorite_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "tsavorite"; + + TSI_UNUSED(buft); +} + +static void ggml_backend_tsavorite_log_allocated_size(txe_device_s device, size_t size_aligned) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); +#ifndef GGML_TSAVORITE_NDEBUG +#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) + GGML_TSAVORITE_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", __func__, + size_aligned / 1024.0 / 1024.0, + device.currentAllocatedSize / 1024.0 / 1024.0); +#endif +#endif + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + TSI_UNUSED(device); + TSI_UNUSED(size_aligned); +} + +static ggml_backend_buffer_t +ggml_backend_tsavorite_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)calloc( + 1, sizeof(struct ggml_backend_tsavorite_buffer_context)); + + const size_t size_page = sysconf(_SC_PAGESIZE); + GGML_TSAVORITE_LOG_CONT( + "ggml_backend_tsavorite_buffer_type_alloc_buffer is called from llama data Loader \n"); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + if (!device) + return NULL; + + ctx->all_data = ggml_tsavorite_host_malloc(size_aligned); + ctx->all_size = size_aligned; + ctx->owned = true; + ctx->n_buffers = 1; + GGML_TSAVORITE_LOG_INFO("\n\n\n\n Memory Starting address %p and size %ld \n\n\n", ctx->all_data, + ctx->all_size); + + if (ctx->all_data != NULL) { + GGML_TSAVORITE_LOG_CONT("\nAddress of Newly Created BUffer %p and size %ld \n", ctx->all_data, + ctx->all_size); + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + fprintf(tsi_op_log_file, "Address of Newly Created BUffer %p and size %ld \n", ctx->all_data, + ctx->all_size); + } + ctx->buffers[0].data = NULL; + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + memset((char *)ctx->all_data, 0, ctx->all_size); + } + + if (size_aligned > 0 && (ctx->all_data == NULL)) { + 
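+        // Allocation failure path (added note): log the attempted page-aligned size,
+        // free the buffer context, and release the device reference taken above so
+        // the acquire/release refcount stays balanced. (Example: with a 4 KiB page,
+        // a 5000-byte request would have been rounded up to size_aligned = 8192.)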
GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, + size_aligned / 1024.0 / 1024.0); + free(ctx); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + return NULL; + } + + // ggml_backend_tsavorite_log_allocated_size(device, size_aligned); + device->current_allocated_size += ctx->all_size; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ggml_backend_buffer_init(buft, ggml_backend_tsavorite_buffer_i, ctx, size); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return 32; + TSI_UNUSED(buft); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + const size_t max_size = device->max_buf_len; + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return max_size; + + TSI_UNUSED(buft); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const struct ggml_tensor *tensor) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + if (!device) { + GGML_TSAVORITE_LOG_ERROR("\n tsavorite device is NULL \n"); + return 0; + } + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor tensor_data_header; + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + GGML_TSAVORITE_LOG_INFO( + "\n\n\n\n Calculating---- Alloc ----Size header %lu and data %lu \n\n\n\n ", + sizeof(tensor_data_header), ggml_nbytes(tensor)); + + return (sizeof(tensor_data_header) + ggml_nbytes(tensor)); + + TSI_UNUSED(buft); +} + +static bool ggml_backend_tsavorite_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + // For Now CPU is loading all data and then copy some tensor to Tsavorite Backend + // Once we have most of Operation supported by Tsavorite + // We will figure out to make tsavorite Backend also host + return false; + + TSI_UNUSED(buft); +} + +ggml_backend_buffer_type_t ggml_backend_tsavorite_buffer_type(void) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + static struct ggml_backend_buffer_type ggml_backend_buffer_type_tsavorite = { + /* .iface = */ { + /* .get_name = */ ggml_backend_tsavorite_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_tsavorite_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_tsavorite_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_tsavorite_buffer_type_get_max_size, + /* .get_alloc_size = */ + ggml_backend_tsavorite_buffer_type_get_alloc_size, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_tsavorite_buffer_type_is_host, + }, + /* .device = */ &g_ggml_backend_tsavorite_device, + /* .context = */ NULL, + }; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &ggml_backend_buffer_type_tsavorite; +} + +// backend + +static const char 
*ggml_backend_tsavorite_name(ggml_backend_t backend) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return "Tsavorite";
+
+    TSI_UNUSED(backend);
+}
+
+static void ggml_backend_tsavorite_free(ggml_backend_t backend) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    if (!backend || !backend->context || !backend->device || !backend->device->context) {
+        GGML_TSAVORITE_LOG_ERROR("At %s one or more of: backend, backend context, "
+                                 "device, or device context is NULL",
+                                 __func__);
+        return;
+    }
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+    struct ggml_backend_tsavorite_device_context *ctx_dev =
+        (struct ggml_backend_tsavorite_device_context *)backend->device->context;
+    ggml_tsavorite_disp_stats(ctx, ctx_dev->device);
+
+    ggml_backend_tsavorite_device_rel(ctx_dev);
+    ggml_tsavorite_free(ctx);
+
+    free(backend);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+static void ggml_backend_tsavorite_synchronize(ggml_backend_t backend) {
+// An async mechanism is still needed to feed one tensor's output into another
+// tensor's input; it will be evaluated and implemented in a later PR.
+#ifdef SYNC_DEBUG
+    usleep(100000);
+#endif /* SYNC_DEBUG */
+    TSI_UNUSED(backend);
+}
+
+static ggml_backend_buffer_type_t
+ggml_backend_tsavorite_get_default_buffer_type(ggml_backend_t backend) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return ggml_backend_tsavorite_buffer_type();
+
+    TSI_UNUSED(backend);
+}
+
+static enum ggml_status ggml_backend_tsavorite_graph_compute(ggml_backend_t backend,
+                                                             struct ggml_cgraph *cgraph) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return ggml_tsavorite_graph_compute(backend, cgraph);
+}
+
+static void ggml_backend_tsavorite_set_n_cb(ggml_backend_t backend, int n_cb) {
+    // GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+
+    if (ctx->n_cb != n_cb) {
+        ctx->n_cb = MIN(n_cb, GGML_TSAVORITE_MAX_COMMAND_BUFFERS);
+
+        if (ctx->n_cb > 2) {
+            GGML_TSAVORITE_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade "
+                                    "the performance in some cases\n",
+                                    __func__, n_cb);
+        }
+    }
+
+#if 0
+    if (ctx->encode_async) {
+        Block_release(ctx->encode_async);
+    }
+#endif
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+static struct ggml_backend_i ggml_backend_tsavorite_i = {
+    /* .get_name           = */ ggml_backend_tsavorite_name,
+    /* .free               = */ ggml_backend_tsavorite_free,
+    /* .set_tensor_async   = */ NULL,
+    /* .get_tensor_async   = */ NULL,
+    /* .cpy_tensor_async   = */ NULL,
+    /* .synchronize        = */ ggml_backend_tsavorite_synchronize,
+    /* .graph_plan_create  = */ NULL,
+    /* .graph_plan_free    = */ NULL,
+    /* .graph_plan_update  = */ NULL,
+    /* .graph_plan_compute = */ NULL,
+    /* .graph_compute      = */ ggml_backend_tsavorite_graph_compute,
+    /* .event_record       = */ NULL,
+    /* .event_wait         = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_tsavorite_guid(void) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    static ggml_guid guid = {0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed,
+                             0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6};
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return &guid;
+}
+
+// This needs to be removed in the future
+ggml_backend_t
ggml_backend_tsavorite_init(void) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_tsavorite_reg(), 0);
+
+    struct ggml_backend_tsavorite_context *ctx = ggml_tsavorite_init(dev);
+    if (ctx == NULL) {
+        GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));
+    if (backend) {
+        backend->guid = ggml_backend_tsavorite_guid();
+        backend->iface = ggml_backend_tsavorite_i;
+        backend->device = dev;
+        backend->context = ctx;
+    }
+    // Will be enabled later:
+    // ggml_backend_tsavorite_set_n_cb(backend, 1);
+
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return backend;
+}
+
+bool ggml_backend_is_tsavorite(ggml_backend_t backend) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_tsavorite_guid());
+}
+
+void ggml_backend_tsavorite_set_abort_callback(ggml_backend_t backend,
+                                               ggml_abort_callback abort_callback,
+                                               void *user_data) {
+    GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = user_data;
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+void ggml_backend_tsavorite_capture_next_compute(ggml_backend_t backend) {
+    GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+    ctx->capture_next_compute = true;
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+// backend device
+
+static const char *ggml_backend_tsavorite_device_get_name(ggml_backend_dev_t dev) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return "Tsavorite";
+
+    GGML_UNUSED(dev);
+}
+
+static const char *ggml_backend_tsavorite_device_get_description(ggml_backend_dev_t dev) {
+    // acq/rel just to populate ctx->name in case it hasn't been done yet
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    struct ggml_backend_tsavorite_device_context *ctx_dev =
+        (struct ggml_backend_tsavorite_device_context *)dev->context;
+    ggml_backend_tsavorite_device_acq(ctx_dev);
+    ggml_backend_tsavorite_device_rel(ctx_dev);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+
+    return ctx_dev->name;
+}
+
+static void ggml_backend_tsavorite_device_get_memory(ggml_backend_dev_t dev, size_t *free,
+                                                     size_t *total) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+    if (!dev || !free || !total) {
+        GGML_TSAVORITE_LOG_INFO("One or more of the pointers (dev, free, total) is NULL\n");
+        GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+        return;
+    }
+    *total = 0;
+    *free = 0;
+    struct ggml_backend_tsavorite_device_context *ctx_dev =
+        (struct ggml_backend_tsavorite_device_context *)dev->context;
+    if (ctx_dev) {
+        txe_device_s device = ggml_backend_tsavorite_device_acq(ctx_dev);
+        *total = device->recommended_max_working_set_size;
+        *free = *total - device->current_allocated_size;
+        GGML_TSAVORITE_LOG_CONT("\n TXE Device MEMORY Summary total %lu and free %lu \n", *total,
+                                *free);
+        ggml_backend_tsavorite_device_rel(ctx_dev);
+    }
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return;
+}
+
+// Currently we treat the TXE accelerator as a GPU-type device
+static enum ggml_backend_dev_type ggml_backend_tsavorite_device_get_type(ggml_backend_dev_t dev) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+
+    GGML_UNUSED(dev);
+}
+
+// Need to understand the scope of this API, since it appears unused; it may be
+// used by struct llama_model_loader, llm_load_tensors(), and
+// llama_new_context_with_model()
+static void ggml_backend_tsavorite_device_get_props(ggml_backend_dev_t dev,
+                                                    struct ggml_backend_dev_props *props) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    props->name = ggml_backend_tsavorite_device_get_name(dev);
+    props->description = ggml_backend_tsavorite_device_get_description(dev);
+    props->type = ggml_backend_tsavorite_device_get_type(dev);
+    ggml_backend_tsavorite_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    props->caps.async = false;
+    props->caps.host_buffer = false;
+    props->caps.buffer_from_host_ptr = false;
+    props->caps.events = false;
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+static ggml_backend_t ggml_backend_tsavorite_device_init(ggml_backend_dev_t dev,
+                                                         const char *params) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    struct ggml_backend_tsavorite_context *ctx = ggml_tsavorite_init(dev);
+    if (ctx == NULL) {
+        GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));
+
+    if (backend) {
+        backend->guid = ggml_backend_tsavorite_guid();
+        backend->iface = ggml_backend_tsavorite_i;
+        backend->device = dev;
+        backend->context = ctx;
+    }
+
+    ggml_backend_tsavorite_set_n_cb(backend, 1);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+
+    return backend;
+
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t
+ggml_backend_tsavorite_device_get_buffer_type(ggml_backend_dev_t dev) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return ggml_backend_tsavorite_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+// For llama.cpp models this API currently seems unused: llm_load_tensors touches it
+// only via buffer_from_host_ptr_supported / is_default_buft; otherwise buffers come
+// from ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft).
+// Needs to be revisited when we look at the buffer section implementation.
+static ggml_backend_buffer_t ggml_backend_tsavorite_device_buffer_from_ptr(ggml_backend_dev_t dev,
+                                                                           void *ptr, size_t size,
+                                                                           size_t max_tensor_size) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    struct ggml_backend_tsavorite_buffer_context *ctx =
+        (struct ggml_backend_tsavorite_buffer_context *)calloc(
+            1, sizeof(struct ggml_backend_tsavorite_buffer_context));
+
+    ctx->all_data = ptr;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+
+    // page-align the data ptr
+    {
+        const uintptr_t offs = (uintptr_t)ptr % size_page;
+        ptr = (void *)((char *)ptr - offs);
+        size += offs;
+    }
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    struct ggml_backend_tsavorite_device_context *ctx_dev =
+        (struct ggml_backend_tsavorite_device_context *)dev->context;
+    txe_device_s device =
ggml_backend_tsavorite_device_acq(ctx_dev); + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= device->max_buf_len) { + ctx->buffers[ctx->n_buffers].data = ptr; + ctx->buffers[ctx->n_buffers].size = size; + + // ggml_backend_tsavorite_log_allocated_size(device, size_aligned); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will + // fully fit into one of the views + const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * + size_page; // round-up 2 pages just in case + const size_t size_step = device->max_buf_len - size_ovlp; + const size_t size_view = device->max_buf_len; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].data = (void *)((uint8_t *)ptr + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + // ggml_backend_tsavorite_log_allocated_size(device, size_step_aligned); + + if (i + size_step < size) { + GGML_TSAVORITE_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ggml_backend_buffer_init(ggml_backend_tsavorite_buffer_type(), + ggml_backend_tsavorite_buffer_i, ctx, size); +} + +// llama_build_graph -> ggml_backend_supports_op -> gml_backend_dev_supports_op +// basically if true then it will call ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, +// backend.get()); here is cur is tensor +static bool ggml_backend_tsavorite_device_supports_op(ggml_backend_dev_t dev, + const struct ggml_tensor *op) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)dev->context; + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ggml_tsavorite_supports_op(ctx_dev, op); +} + +// template +// static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {} +// ggml_backend_dev_supports_op(dev, op_tensor); +static bool ggml_backend_tsavorite_device_supports_buft(ggml_backend_dev_t dev, + ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return buft->iface.get_name == ggml_backend_tsavorite_buffer_type_get_name; + + TSI_UNUSED(dev); +} + +// // returns the backend that should be used for the node based on the current locations +// ggml_backend_sched_backend_id_from_cur -> ggml_backend_offload_op -> +static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, + const struct ggml_tensor *op) { + // printf("\n ANoop Calling %s \n ", __func__); + if (op->type != GGML_TYPE_F32) + return false; + switch (op->op) { + // case GGML_OP_NONE: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_SQRT: + case GGML_OP_SIN: + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SIGMOID: + break; + default: + return false; + } + break; + default: + return false; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return true; + TSI_UNUSED(dev); +} +#ifdef SYNC_DEBUG +static void ggml_backend_tsavorite_device_synchronize(ggml_backend_dev_t dev, + ggml_backend_event_t event) { + usleep(100); + TSI_UNUSED(dev); + TSI_UNUSED(event); +} +#endif /* SYNC_DEBUG */ + +static struct ggml_backend_device_i 
ggml_backend_tsavorite_device_i = { + /* .get_name = */ ggml_backend_tsavorite_device_get_name, + /* .get_description = */ ggml_backend_tsavorite_device_get_description, + /* .get_memory = */ ggml_backend_tsavorite_device_get_memory, + /* .get_type = */ ggml_backend_tsavorite_device_get_type, + /* .get_props = */ ggml_backend_tsavorite_device_get_props, + /* .init_backend = */ ggml_backend_tsavorite_device_init, + /* .get_buffer_type = */ ggml_backend_tsavorite_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_tsavorite_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_tsavorite_device_supports_op, + /* .supports_buft = */ ggml_backend_tsavorite_device_supports_buft, + /* .offload_op = */ ggml_backend_tsavorite_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// backend registry + +static const char *ggml_backend_tsavorite_reg_get_name(ggml_backend_reg_t reg) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "Tsavorite"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_tsavorite_reg_device_count(ggml_backend_reg_t reg) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_tsavorite_reg_device_get(ggml_backend_reg_t reg, + size_t index) { + GGML_ASSERT(index == 0); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &g_ggml_backend_tsavorite_device; + + GGML_UNUSED(reg); + GGML_UNUSED(index); +} + +static struct ggml_backend_reg_i ggml_backend_tsavorite_reg_i = { + /* .get_name = */ ggml_backend_tsavorite_reg_get_name, + /* .device_count = */ ggml_backend_tsavorite_reg_device_count, + /* .device_get = */ ggml_backend_tsavorite_reg_device_get, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_tsavorite_reg(void) { + ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ERROR; + ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + g_ggml_backend_tsavorite_reg.iface = ggml_backend_tsavorite_reg_i; + g_ggml_backend_tsavorite_reg.context = NULL; + + g_ggml_backend_tsavorite_device.iface = ggml_backend_tsavorite_device_i; + g_ggml_backend_tsavorite_device.reg = &g_ggml_backend_tsavorite_reg; + g_ggml_backend_tsavorite_device.context = &g_ggml_ctx_dev_main; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &g_ggml_backend_tsavorite_reg; +} + +GGML_BACKEND_DL_IMPL(ggml_backend_tsavorite_reg) diff --git a/ggml/src/ggml-tsavorite/include/TestModel.h b/ggml/src/ggml-tsavorite/include/TestModel.h new file mode 100644 index 0000000000000..feff2539a96fa --- /dev/null +++ b/ggml/src/ggml-tsavorite/include/TestModel.h @@ -0,0 +1,217 @@ +#pragma once + +#include "HostShimCAPI.h" +#include +#include +#include +#include +#include +#include + +#define MAX_RESULT_VALUES_TO_PRINT 32 +template +struct MemRefDescriptor { + void *base; + void *data; + int64_t offset = 0; + int64_t shape[N]; + int64_t strides[N]; +} __attribute__((aligned(128))); + +template +class TestModel { +public: + TestModel(std::string name, int version, bool verbose = false) + : name_(name), version_(version), verbose_(verbose) {} + + ~TestModel() { + // free memory + for (int i = 0; i < NumInputs; i++) + tsi_dealloc(inputs[i].base); + for (int i = 0; i < NumOutputs; 
i++) + tsi_dealloc(outputs[i].base); + tsi_finalize(); + } + + template + void initRandom(size_t numElements, + std::array inputRange = {-10, 10}) { + static_assert(Rank == 1, + "initRandom(size_t) is only defined for Rank == 1"); + size_t inputSizes[2][Rank] = {{numElements}, {numElements}}; + size_t outputSizes[1][Rank] = {{numElements}}; + init(inputSizes, outputSizes, + /*initWithRandom=*/true, inputRange); + } + +#if 0 + template + void initFill(size_t numElements, ElType val) { + static_assert(Rank == 1, + "initRandom(size_t) is only defined for Rank == 1"); + size_t inputSizes[2][Rank] = {{numElements}, {numElements}}; + size_t outputSizes[1][Rank] = {{numElements}}; + init(inputSizes, outputSizes); + for (int i = 0; i < NumInputs; i++) { + auto nEls = getNumElements(inputs[i]); + for (size_t j = 0; j < nEls; j++) + static_cast(inputs[i].data)[j] = val; + } + } +#endif /* 0 */ + + template + void init(size_t inputSizes[NumInputs][Rank], + size_t outputSizes[NumOutputs][Rank], bool initWithRandom = false, + std::array inputRange = {-10, 10}) { + tsi_initialize(1); + + for (int i = 0; i < NumInputs; i++) + initMemRefDescriptor(inputs[i], inputSizes[i], + initWithRandom, inputRange, i); + + for (int i = 0; i < NumOutputs; i++) { + initMemRefDescriptor(outputs[i], outputSizes[i]); + // set default result values to -1 + auto nEls = getNumElements(outputSizes[i]); + std::fill((OutputsElType *)outputs[i].base, + (OutputsElType *)outputs[i].base + nEls, -1); + } + if (verbose_) { + printf("[%s v.%d] Allocated DRAM arrays (host VAs):", name_.c_str(), + version_); + for (int i = 0; i < NumInputs; i++) + printf(" ANOOP input%d = %p ", i, inputs[i].base); + for (int i = 0; i < NumOutputs; i++) + printf(" ANOOP-1 output%d = %p ", i, outputs[i].base); + printf("\n"); + } + } + + template + int validateResult(size_t index, ElType *expected, bool printErrs = false, + float tolerance = 1e-5) { + if (verbose_) { + printf("[%s v.%d] Model executed successfully. Validating result...", + name_.c_str(), version_); + } + + int retCode = 0; + size_t nEls = getNumElements(outputs[index].shape); + float sqrSumOfDiff = 0.0; + for (size_t j = 0; j < nEls; j++) { + sqrSumOfDiff += + std::pow(((ElType *)outputs[index].base)[j] - expected[j], 2); + if (std::abs(((ElType *)outputs[index].base)[j] - expected[j]) > + tolerance) { + retCode = 1; + if (printErrs && j < MAX_RESULT_VALUES_TO_PRINT) { + printf("Mismatch at index %d: expected %1.6f, got %1.6f\n", (int)j, + expected[j], ((ElType *)outputs[index].base)[j]); + } + if (retCode && j == MAX_RESULT_VALUES_TO_PRINT) + printf("... (more mismatches not printed; maximum %d reached) ...\n", + MAX_RESULT_VALUES_TO_PRINT); + } + } + // Compute the relative error: norm2(result) / norm2(expected) + float sqrSumExpected = 0.0; + for (size_t j = 0; j < nEls; j++) + sqrSumExpected += std::pow(expected[j], 2); + + float relativeErr = std::sqrt(sqrSumOfDiff) / std::sqrt(sqrSumExpected); + if (verbose_) { + retCode ? 
printf("\n[%s v.%d] FAILED [relative err=%1.6f]\n", + name_.c_str(), version_, relativeErr) + : printf("\n[%s v.%d] PASS [relative err=%1.6f]\n", name_.c_str(), + version_, relativeErr); + } + return retCode; + } + + size_t getNumElements(const MemRefDescriptor &memref) const { + return getNumElements(memref.shape); + } + + template + void writeToFile(void *data, size_t numElements, + const std::string &filename) { + std::ofstream ofs(filename, std::ios::binary); + if (!ofs) { + printf("[%s v.%d] Error opening file %s for writing.", name_.c_str(), + version_, filename.c_str()); + return; + } + ofs.write((char *)data, numElements * sizeof(ElType)); + ofs.close(); + } + + template + void readFromFile(void *data, size_t numElements, + const std::string &filename) { + std::ifstream ifs(filename, std::ios::binary); + if (!ifs) { + printf("[%s v.%d] Error opening file %s for reading.", name_.c_str(), + version_, filename.c_str()); + return; + } + ifs.read((char *)data, numElements * sizeof(ElType)); + ifs.close(); + } + + std::string getName() const { return name_; } + std::string getVersion() const { return std::to_string(version_); } + + MemRefDescriptor inputs[NumInputs]; + MemRefDescriptor outputs[NumOutputs]; + +private: + std::string name_; + int version_ = 1; + bool verbose_ = false; + + template + void initMemRefDescriptor(MemRefDescriptor &memref, size_t shape[Rank], + bool initWithRandom = false, + std::array inputRange = {-10, 10}, + int seed = 42) { + size_t nBytes = sizeof(ElType); + for (int i = 0; i < Rank; i++) { + nBytes *= shape[i]; + } + memref.base = tsi_alloc(nBytes); + memref.data = memref.base; + memref.offset = 0; + printf("\n checking Shape value %d \n\n", memref.shape[0]); +#if 0 + for (int i = 0; i < Rank; i++) { + memref.shape[i] = shape[i]; + memref.strides[i] = 1; + for (int j = i + 1; j < Rank; j++) { + memref.strides[i] *= shape[j]; + } + } + #endif + if (initWithRandom) { + std::mt19937 gen(seed); // fixed seed + std::uniform_real_distribution dist(inputRange[0], inputRange[1]); + for (size_t i = 0; i < getNumElements(shape); i++) { + static_cast(memref.data)[i] = static_cast(dist(gen)); + } + } + } + + size_t getNumElements(const int64_t shape[Rank]) const { + size_t numElements = 1; + printf("\n Anoop Rank %d and shape[Rank] %d \n\n", Rank, shape[Rank]); + for (int i = 0; i < Rank; i++) { + numElements *= shape[i]; + } + printf("\n numElements %d \n", numElements); + return numElements; + } + + size_t getNumElements(const size_t shape[Rank]) const { + return getNumElements(reinterpret_cast(shape)); + } +}; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 083347d188880..31fa312f65da6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -174,4 +174,4 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) -target_link_libraries(${TEST_TARGET} PRIVATE llama) +target_link_libraries(${TEST_TARGET} PRIVATE llama ${TLIBS}) diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh new file mode 100755 index 0000000000000..8712a77d1f71a --- /dev/null +++ b/tsi-pkg-build.sh @@ -0,0 +1,87 @@ + +set -e + +#Ensure prerequisites are met as follows +echo 'updating submodule' +git submodule update --recursive --init +cd ggml-tsi-kernel/ +module load tsi4 gcc/13.3.0 +echo 'creating python virtual env' +python3 -m venv blob-creation +source blob-creation/bin/activate +echo 'installing mlir and python dependencies' +pip install -r 
+pip install -r /proj/rel/sw/mlir-compiler/python/requirements-common.txt
+pip install /proj/rel/sw/mlir-compiler/python/mlir_external_packages-1.2.1-py3-none-any.whl
+pip install onnxruntime-training
+
+#build TSI kernels for the Tsavorite backend
+#First for FPGA
+
+echo 'creating fpga kernel'
+cd fpga-kernel
+cmake -B build-fpga
+./create-all-kernels.sh
+#Then for Posix use cases
+
+echo 'creating posix kernel'
+cd ../posix-kernel/
+./create-all-kernels.sh
+
+#Change directory to top level llama.cpp
+
+cd ../../
+
+export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.0
+#Compile for posix with build-posix as a target folder
+
+echo 'building llama.cp, ggml for tsavorite and other binary for posix'
+cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
+cmake --build build-posix --config Release
+
+#Compile for fpga with build-fpga as a target folder
+
+echo 'building llama.cp, ggml for tsavorite and other binary for fpga'
+export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
+export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
+cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga
+cmake --build build-fpga --config Release
+
+
+echo 'creating tar bundle for fpga'
+TSI_GGML_VERSION=0.0.1
+TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
+GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
+TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml/
+TSI_BLOB_INSTALL_DIR=$(pwd)/${GGML_TSI_INSTALL_DIR}/fpga-kernel/build-fpga
+
+if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR} ]; then
+    echo "${TSI_GGML_BUNDLE_INSTALL_DIR} exists"
+else
+    echo "creating ${TSI_GGML_BUNDLE_INSTALL_DIR}"
+    mkdir ${TSI_GGML_BUNDLE_INSTALL_DIR}
+fi
+if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh ]; then
+    rm -fr ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh
+fi
+
+cat > ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL
+#!/bin/bash
+export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:\$(pwd)
+mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_mult
+mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_add
+cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_mult/ -r
+cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_add/ -r
+EOL
+chmod +x ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh
+cp ${GGML_TSI_INSTALL_DIR}/fpga/blobs ${TSI_GGML_BUNDLE_INSTALL_DIR}/ -r
+cp build-fpga/bin/llama-cli ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+cp build-fpga/bin/libggml*.so ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+cp build-fpga/bin/libllama*.so ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+cp build-fpga/bin/simple-backend-tsi ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+
+tar -cvzf ${TSI_GGML_BUNDLE_INSTALL_DIR}-${TSI_GGML_VERSION}.tz ${TSI_GGML_BUNDLE_INSTALL_DIR}/*
+
+if [ "$1" == "Release" ] || [ "$1" == "release" ]
+then
+    cp ${TSI_GGML_BUNDLE_INSTALL_DIR}-${TSI_GGML_VERSION}.tz ${TSI_GGML_RELEASE_DIR}
+fi
From 699538538521884d4389f18d866068d9ba216efd Mon Sep 17 00:00:00 2001
From: Anoop Kapoor
Date: Fri, 23 May 2025 22:19:39 -0700
Subject: [PATCH 02/35] Releasing next version

---
 tsi-pkg-build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh
index 8712a77d1f71a..ba84118f38ff7 100755
--- a/tsi-pkg-build.sh
+++ b/tsi-pkg-build.sh
@@ -48,7 +48,7 @@ cmake --build build-fpga --config Release
 echo 'creating tar bundle for fpga'
-TSI_GGML_VERSION=0.0.1
+TSI_GGML_VERSION=0.0.2
 TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
 GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
 TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml/
From 68410968db0ce9374e2eecdbbd36b6004174abf1 Mon Sep 17 00:00:00 2001
From: Anoop Kapoor
Date: Fri, 23 May 2025 22:32:17 -0700
Subject:
[PATCH 03/35] Updated MLIR_SDK_VERSION version --- tsi-pkg-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index ba84118f38ff7..5ff9b9389c475 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -31,7 +31,7 @@ cd ../posix-kernel/ cd ../../ -export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.0 +export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.1 #Compile for posix with build-posix as a target folder echo 'building llama.cp, ggml for tsavorite and other binary for posix' From 1a1514a715b26307c2de92072c7e2fdc1d36eb9f Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Fri, 23 May 2025 23:01:26 -0700 Subject: [PATCH 04/35] Updated the Version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f9c146006c1a5..96a8a393817d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED MLIR_COMPILER_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.0/compiler) + set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.1/compiler) else() set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) endif() From d9dd83cf1c197425e1da6cde096e981603dff10d Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Sun, 25 May 2025 11:44:16 -0700 Subject: [PATCH 05/35] @FIR-707: Fix requirement for libgomp and move to new sdk 0.1.2 This change has following. 1. Move to new SDK 0.1.2 2. remove the requirement for libgomp in fpga build --- CMakeLists.txt | 4 ++-- ggml/src/ggml-cpu/CMakeLists.txt | 18 +++++++++++------- tsi-pkg-build.sh | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96a8a393817d3..e047785e603d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED MLIR_COMPILER_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.1/compiler) + set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.2/compiler) else() set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) endif() @@ -23,7 +23,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED RUNTIME_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (RUNTIME_DIR /proj/work/rel/sw/sdk-r.0.1.0/${GGML_TSAVORITE_TARGET}/runtime) + set (RUNTIME_DIR /proj/work/rel/sw/sdk-r.0.1.2/${GGML_TSAVORITE_TARGET}/runtime) else() set (RUNTIME_DIR $ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) endif() diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 1d4259dae5ba7..2cbae62a1dddf 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -53,14 +53,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() endif() - if (GGML_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + message("Target is FPGA no GOMP linked") + else() + if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) - target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - else() - message(WARNING "OpenMP not found") + target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() endif() endif() diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index 5ff9b9389c475..b6b998671544c 100755 
--- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -31,7 +31,7 @@ cd ../posix-kernel/ cd ../../ -export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.1 +export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.2 #Compile for posix with build-posix as a target folder echo 'building llama.cp, ggml for tsavorite and other binary for posix' From 441fd0b95685c3b1590eb81800dcee7487e2533e Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Tue, 27 May 2025 12:00:44 -0700 Subject: [PATCH 06/35] @FIR-708: Added TXE profile to ggm-tsavorite backend. The chanegs have following 1. Enable profiling for tsavorite backed for txe 2. Add std c++20 for compiling the profiler The test results are as follows root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin# ./run_platform_test.sh Check if tnApcMgr is running; if it is not, uncomment below line and execute the run_platform_test.sh script. Running on v0.1.1.tsv30_05_24_2025 [2018-03-09 13:52:26.300409] 271:272 [ info] :: TXE resource allocation request processed successfully. [2018-03-09 13:52:27.339] [info] [llama.cpp:56] Execution time: 1019 ms [2018-03-09 13:52:27.347638] 2909:2909 [ info] [LlamaForCausalLM_Random v. 2] TestBase.h:154: Model executed successfully. Validating result... [2018-03-09 13:52:27.380511] 2909:2909 [ info] [LlamaForCausalLM_Random v. 2] TestBase.h:193: PASS [relative err=0.000000, relTol=1.000000e-05] [2018-03-09 13:52:27.405665] 271:272 [ info] :: TXE resource release request processed successfully. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profiling Results (LlamaForCausalLM_Random): ------------------------------------------------------------------------------------------------------------------------ Calls Total(ms) T/call Self(ms) Function ------------------------------------------------------------------------------------------------------------------------ 243 498.000 2.049 0.000 [45%] RuntimeHostShim::awaitCommandListCompletion 84 200.688 2.389 200.688 └─ [18%] [ txe_blob_1 ] 32 76.626 2.395 76.626 └─ [ 7%] [ txe_blob_6 ] 16 55.493 3.468 55.493 └─ [ 5%] [ txe_blob_12 ] 8 31.821 3.978 31.821 └─ [ 3%] [ txe_blob_10 ] 8 31.322 3.915 31.322 └─ [ 3%] [ txe_blob_7 ] 8 31.152 3.894 31.152 └─ [ 3%] [ txe_blob_8 ] 8 27.693 3.462 27.693 └─ [ 2%] [ txe_blob_9 ] 17 26.019 1.531 26.019 └─ [ 2%] [ txe_blob_2 ] 17 25.906 1.524 25.906 └─ [ 2%] [ txe_blob_5 ] 17 25.899 1.523 25.899 └─ [ 2%] [ txe_blob_3 ] 17 25.833 1.520 25.833 └─ [ 2%] [ txe_blob_4 ] 8 23.993 2.999 23.993 └─ [ 2%] [ txe_blob_11 ] 3 6.002 2.001 6.002 └─ [ 1%] [ txe_blob_0 ] 1 35.000 35.000 35.000 [ 3%] RuntimeHostShim::finalize 188 33.000 0.176 33.000 [ 3%] RuntimeHostShim::copy 1 16.000 16.000 16.000 [ 1%] RuntimeHostShim::initialize 13 1.000 0.077 1.000 [ 0%] RuntimeHostShim::loadBlob 573 0.000 0.000 0.000 [ 0%] RuntimeHostShim::allocate 573 0.000 0.000 0.000 [ 0%] RuntimeHostShim::deallocate 243 0.000 0.000 0.000 [ 0%] RuntimeHostShim::createCommandList 922 0.000 0.000 0.000 [ 0%] RuntimeHostShim::getShmemManager 243 0.000 0.000 0.000 [ 0%] RuntimeHostShim::launchBlob 243 0.000 0.000 0.000 [ 0%] RuntimeHostShim::addCommandToList 243 0.000 0.000 0.000 [ 0%] RuntimeHostShim::finalizeCommandList 13 0.000 0.000 0.000 [ 0%] RuntimeHostShim::unloadBlob 33 0.000 0.000 0.000 [ 0%] RuntimeHostShim::stridedCopy ======================================================================================================================== 3532 1116.000 0.316 1116.000 [100%] TOTAL 
======================================================================================================================== register_backend: registered backend Tsavorite (1 devices) register_device: registered device Tsavorite (txe) register_backend: registered backend CPU (1 devices) register_device: registered device CPU (CPU) load_backend: failed to find ggml_backend_init in /usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin/tsi-ggml/libggml-tsavorite.so load_backend: failed to find ggml_backend_init in /usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin/tsi-ggml/libggml-cpu.so build: 5464 (194fbaa9) with gcc (GCC) 13.3.0 for x86_64-pc-linux-gnu (debug) main: llama backend init main: load the model and apply lora adapter, if any TXE Device MEMORY Summary total 134217728 and free 134217728 llama_model_load_from_file_impl: using device Tsavorite (txe) - 128 MiB free llama_model_loader: loaded meta data with 24 key-value pairs and 75 tensors from /tsi/anoop_feb26/tinyllama-vo-5m-para.gguf (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = llama llama_model_loader: - kv 1: general.type str = model llama_model_loader: - kv 2: general.name str = Vicuna Hf llama_model_loader: - kv 3: general.size_label str = 4.6M llama_model_loader: - kv 4: general.license str = apache-2.0 llama_model_loader: - kv 5: llama.block_count u32 = 8 llama_model_loader: - kv 6: llama.context_length u32 = 2048 llama_model_loader: - kv 7: llama.embedding_length u32 = 64 llama_model_loader: - kv 8: llama.feed_forward_length u32 = 256 llama_model_loader: - kv 9: llama.attention.head_count u32 = 16 llama_model_loader: - kv 10: llama.attention.layer_norm_rms_epsilon f32 = 0.000001 llama_model_loader: - kv 11: general.file_type u32 = 32 llama_model_loader: - kv 12: llama.vocab_size u32 = 32000 llama_model_loader: - kv 13: llama.rope.dimension_count u32 = 4 llama_model_loader: - kv 14: tokenizer.ggml.model str = llama llama_model_loader: - kv 15: tokenizer.ggml.pre str = default llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,32000] = ["", "", "", "<0x00>", "<... llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000... llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... 
llama_model_loader: - kv 19: tokenizer.ggml.bos_token_id u32 = 1 llama_model_loader: - kv 20: tokenizer.ggml.eos_token_id u32 = 2 llama_model_loader: - kv 21: tokenizer.ggml.unknown_token_id u32 = 0 llama_model_loader: - kv 22: tokenizer.ggml.padding_token_id u32 = 0 llama_model_loader: - kv 23: general.quantization_version u32 = 2 llama_model_loader: - type f32: 17 tensors llama_model_loader: - type bf16: 58 tensors print_info: file format = GGUF V3 (latest) print_info: file type = BF16 print_info: file size = 8.82 MiB (16.00 BPW) load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect load: special tokens cache size = 3 load: token to piece cache size = 0.1914 MB print_info: arch = llama print_info: vocab_only = 0 print_info: n_ctx_train = 2048 print_info: n_embd = 64 print_info: n_layer = 8 print_info: n_head = 16 print_info: n_head_kv = 16 print_info: n_rot = 4 print_info: n_swa = 0 print_info: n_swa_pattern = 1 print_info: n_embd_head_k = 4 print_info: n_embd_head_v = 4 print_info: n_gqa = 1 print_info: n_embd_k_gqa = 64 print_info: n_embd_v_gqa = 64 print_info: f_norm_eps = 0.0e+00 print_info: f_norm_rms_eps = 1.0e-06 print_info: f_clamp_kqv = 0.0e+00 print_info: f_max_alibi_bias = 0.0e+00 print_info: f_logit_scale = 0.0e+00 print_info: f_attn_scale = 0.0e+00 print_info: n_ff = 256 print_info: n_expert = 0 print_info: n_expert_used = 0 print_info: causal attn = 1 print_info: pooling type = 0 print_info: rope type = 0 print_info: rope scaling = linear print_info: freq_base_train = 10000.0 print_info: freq_scale_train = 1 print_info: n_ctx_orig_yarn = 2048 print_info: rope_finetuned = unknown print_info: ssm_d_conv = 0 print_info: ssm_d_inner = 0 print_info: ssm_d_state = 0 print_info: ssm_dt_rank = 0 print_info: ssm_dt_b_c_rms = 0 print_info: model type = ?B print_info: model params = 4.62 M print_info: general.name = Vicuna Hf print_info: vocab type = SPM print_info: n_vocab = 32000 print_info: n_merges = 0 print_info: BOS token = 1 '' print_info: EOS token = 2 '' print_info: UNK token = 0 '' print_info: PAD token = 0 '' print_info: LF token = 13 '<0x0A>' print_info: EOG token = 2 '' print_info: max token length = 18 load_tensors: loading model tensors, this can take a while... (mmap = true) TXE Device MEMORY Summary total 134217728 and free 134217728 load_tensors: offloading 0 repeating layers to GPU load_tensors: offloaded 0/9 layers to GPU load_tensors: CPU_Mapped model buffer size = 8.82 MiB .............. llama_context: constructing llama_context llama_context: n_seq_max = 1 llama_context: n_ctx = 12288 llama_context: n_ctx_per_seq = 12288 llama_context: n_batch = 1024 llama_context: n_ubatch = 512 llama_context: causal_attn = 1 llama_context: flash_attn = 0 llama_context: freq_base = 10000.0 llama_context: freq_scale = 1 llama_context: n_ctx_per_seq (12288) > n_ctx_train (2048) -- possible training context overflow [2018-03-09 13:52:28.706203] 271:272 [ info] :: TXE resource allocation request processed successfully. 
llama_context: CPU output buffer size = 0.12 MiB llama_kv_cache_unified: CPU KV buffer size = 24.00 MiB llama_kv_cache_unified: size = 24.00 MiB ( 12288 cells, 8 layers, 1 seqs), K (f16): 12.00 MiB, V (f16): 12.00 MiB ggml_backend_tsavorite_buffer_type_alloc_buffer is called from llama data Loader ANoop Allocating memory from tsi_alloc with size 266240 Allocating memory from tsi_alloc with size 266240 starting memory 0xffff93e00080 Address of Newly Created BUffer 0xffff93e00080 and size 266240 llama_context: tsavorite compute buffer size = 0.25 MiB llama_context: CPU compute buffer size = 408.51 MiB llama_context: graph nodes = 294 llama_context: graph splits = 67 (with bs=512), 37 (with bs=1) common_init_from_params: setting dry_penalty_last_n to ctx_size = 12288 main: llama threadpool init, n_threads = 4 main: model was trained on only 2048 context tokens (12288 specified) system_info: n_threads = 4 (n_threads_batch = 4) / 4 | CPU : NEON = 1 | ARM_FMA = 1 | LLAMAFILE = 1 | AARCH64_REPACK = 1 | sampler seed: 177927434 sampler params: repeat_last_n = 5, repeat_penalty = 1.500, frequency_penalty = 0.000, presence_penalty = 0.000 dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 12288 top_k = 50, top_p = 0.900, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.000 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist generate: n_ctx = 12288, n_batch = 1024, n_predict = 10, n_keep = 1 my cat's name was Tim. He loved to play with his toy llama_perf_sampler_print: sampling time = 195.98 ms / 16 runs ( 12.25 ms per token, 81.64 tokens per second) llama_perf_context_print: load time = 1577.27 ms llama_perf_context_print: prompt eval time = 305.19 ms / 6 tokens ( 50.86 ms per token, 19.66 tokens per second) llama_perf_context_print: eval time = 803.59 ms / 9 runs ( 89.29 ms per token, 11.20 tokens per second) llama_perf_context_print: total time = 2628.44 ms / 15 tokens TXE_ADD Operation, total tensor: 10 Number of Kernel Call: 10 Number of tensor got spilt: 0 Min Num of Elem 64 Max Num of Elem 64 TXE_SUB Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_MULT Operation, total tensor: 170 Number of Kernel Call: 245 Number of tensor got spilt: 0 Min Num of Elem 64 Max Num of Elem 384 TXE_DIV Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SQRT Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_NEG Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_ABS Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SIN Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SIGMOID Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 [2018-03-09 13:52:32.222949] 271:272 [ info] :: TXE resource release request processed successfully. 
GGML Tsavorite Profiling Results: ------------------------------------------------------------------------------------------------------------------------ Calls Total(ms) T/call Self(ms) Function ------------------------------------------------------------------------------------------------------------------------ 255 255.000 1.000 0.000 [ 7%] RuntimeHostShim::awaitCommandListCompletion 245 379.466 1.549 379.466 └─ [11%] [ txe_mult_blob ] 10 15.443 1.544 15.443 └─ [ 0%] [ txe_add_blob ] 1 35.000 35.000 35.000 [ 1%] RuntimeHostShim::finalize 1 19.000 19.000 2.000 [ 1%] GGML Tsavorite 1 17.000 17.000 17.000 └─ [ 0%] RuntimeHostShim::initialize 256 0.000 0.000 0.000 [ 0%] RuntimeHostShim::allocate 1020 0.000 0.000 0.000 [ 0%] RuntimeHostShim::getShmemManager 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::createCommandList 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::loadBlob 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::launchBlob 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::addCommandToList 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::finalizeCommandList 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::unloadBlob 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::deallocate ======================================================================================================================== 3318 3529.000 1.064 3529.000 [100%] TOTAL ======================================================================================================================== root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin# --- ggml/src/ggml-tsavorite/CMakeLists.txt | 1 + ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-tsavorite/CMakeLists.txt b/ggml/src/ggml-tsavorite/CMakeLists.txt index f58331fd68d30..323c37df14a8b 100644 --- a/ggml/src/ggml-tsavorite/CMakeLists.txt +++ b/ggml/src/ggml-tsavorite/CMakeLists.txt @@ -1,6 +1,7 @@ message(STATUS "Tsavorite framework is found") # # tsavorite Kernel Library +add_compile_options(--std=c++20) ggml_add_backend_library(ggml-tsavorite ggml-tsavorite.cpp ) diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index 7939a0f8cfa13..e359906b61ce6 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -20,11 +20,15 @@ #include #include #include - +#include #include "ggml-backend-impl.h" #include "ggml-impl.h" #include "ggml.h" +#include "HostShimCAPI.h" +#include "tsi-rt/utils/Profiler.h" +using namespace std; +namespace tsirt = ::tsi::runtime; typedef struct _txe_device_t *txe_device_s; typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s; FILE *tsi_op_log_file; @@ -513,8 +517,12 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d if (tsi_log_setup() == false) return NULL; + std::string mainProfilerName = "GGML Tsavorite "; + tsirt::utils::TSIProfiler::initialize(); + tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName); + // TSI Run time Initalization - tsi_initialize(NUM_OF_TXES); + tsi_initialize(NUM_OF_TXES, NULL); // init context struct ggml_backend_tsavorite_context *ctx = (struct ggml_backend_tsavorite_context *)calloc( @@ -615,6 +623,11 @@ static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) { sleep(2); tsi_finalize(); GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + tsirt::utils::TSIProfiler::finalize(); + std::cout << "\nGGML Tsavorite Profiling Results:" << std::endl; + std::cout 
<< tsirt::utils::TSIProfiler::getFormattedResults( + /*truncateFuncNames*/ true) + << std::endl; } #if 0 From 9d65b92953fe3f674dd7a5d51a12e900cdc8682c Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Thu, 29 May 2025 13:23:31 -0700 Subject: [PATCH 07/35] @FIR-709 - GGML: Adding SILU Kernel --- README.md | 49 +++++++++++++++++ docs/build.md | 63 ++++++++++++++++++++++ ggml-tsi-kernel | 2 +- ggml/include/ggml-tsavorite.h | 2 + ggml/src/ggml-backend.cpp | 7 ++- ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 30 +++++++---- tsi-pkg-build.sh | 10 ++-- 7 files changed, 147 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index d1cb8d8336229..d8371a472a675 100644 --- a/README.md +++ b/README.md @@ -580,3 +580,52 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html) + +#### TSI compilation steps +```bash +#Pull the repo frim tsisw as follows +git clone git@github.com:tsisw/llama.cpp.git -b FIR-699 + +#Ensure prerequisites are met as follows +cd llama.cpp/ +git submodule update --recursive --init +cd ggml-tsi-kernel/ +module load tsi4 gcc/13.3.0 +python3 -m venv blob-creation +source blob-creation/bin/activate +pip install -r /proj/rel/sw/mlir-compiler/python/requirements-common.txt +pip install /proj/rel/sw/mlir-compiler/python/mlir_external_packages-1.2.1-py3-none-any.whl +pip install onnxruntime-training + +#build TSI kernels for the Tsavorite backend +#First for FPGA +cd fpga-kernel +cmake -B build-fpga +./create-all-kernels.sh +#The for Posix Use cases +cd ../posix-kernel/ +./create-all-kernels.sh + +#Change directory to top level llama.cpp +cd ../../ + +#Compile for posix with build-posix as a target folder + +cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix +cmake --build build-posix --config Release + +#Compile for fpga with build-fpga as a target folder +export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc" +export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++" +cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga +cmake --build build-fpga --config Release + +#For easy build one can also use which creates a FPGA specific tar bundle tsi-ggml.tz +#If you want to release the build update the TSI-VERSION in the file tsi-pkg-build.sh and add Release as parameter +#when running ./tsi-pkg-build.sh (Note it will overwrite what exists in /proj/rel/sw/ggml so be sure you want to do +#it. Example ./tsi-pkg-build.sh release +./tsi-pkg-build.sh + +``` + +## References diff --git a/docs/build.md b/docs/build.md index c9027c0b580a5..1685adbc916bc 100644 --- a/docs/build.md +++ b/docs/build.md @@ -559,3 +559,66 @@ The GPU may still be used to accelerate some parts of the computation even when In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. 
+
+
+## TSI compilation steps
+
+Following are the instructions to compile for the TSI FPGA and Posix backends.
+
+Pull the repo from tsisw as follows
+```bash
+git clone git@github.com:tsisw/llama.cpp.git -b FIR-699
+```
+
+Ensure prerequisites are met as follows
+```bash
+cd llama.cpp/
+git submodule update --recursive --init
+cd ggml-tsi-kernel/
+module load tsi4 gcc/13.3.0
+python3 -m venv blob-creation
+source blob-creation/bin/activate
+pip install -r /proj/rel/sw/mlir-compiler/python/requirements-common.txt
+pip install /proj/rel/sw/mlir-compiler/python/mlir_external_packages-1.2.1-py3-none-any.whl
+pip install onnxruntime-training
+```
+
+Build the TSI kernels for the Tsavorite backend, first for FPGA
+```bash
+cd fpga-kernel
+cmake -B build-fpga
+./create-all-kernels.sh
+```
+Then for Posix use cases
+```bash
+cd ../posix-kernel/
+./create-all-kernels.sh
+```
+
+Change directory to top level llama.cpp
+```bash
+cd ../../
+```
+
+Compile for posix with build-posix as a target folder
+```bash
+cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
+cmake --build build-posix --config Release
+```
+
+Compile for fpga with build-fpga as a target folder
+```bash
+export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
+export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
+cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga
+cmake --build build-fpga --config Release
+```
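+A quick sanity check for the cross-build (a sketch; `file` is the standard
+utility, and the path assumes the build tree above):
+
+```bash
+file build-fpga/bin/llama-cli   # should report an ARM aarch64 ELF binary
+```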
+For an easy build one can also use tsi-pkg-build.sh, which creates an FPGA-specific tar bundle tsi-ggml.tz.
+If you want to release the build, update TSI_GGML_VERSION in tsi-pkg-build.sh and add Release as a parameter
+when running ./tsi-pkg-build.sh (note it will overwrite what exists in /proj/rel/sw/ggml, so be sure you want
+to do it). Example: ./tsi-pkg-build.sh release
+
+```bash
+./tsi-pkg-build.sh
+```
diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel
index f7a3ac1ee334c..d1383a04f29d0 160000
--- a/ggml-tsi-kernel
+++ b/ggml-tsi-kernel
@@ -1 +1 @@
-Subproject commit f7a3ac1ee334c242958ccb2053ecc4854822d87e
+Subproject commit d1383a04f29d0160750c0e51ab524d461c6a127b
diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h
index cd380ddf61ed3..54a8e34662799 100644
--- a/ggml/include/ggml-tsavorite.h
+++ b/ggml/include/ggml-tsavorite.h
@@ -127,6 +127,7 @@ enum ggml_tsavorite_kernel_type {
   GGML_TSAVORITE_KERNEL_TYPE_ABS,
   GGML_TSAVORITE_KERNEL_TYPE_SIN,
   GGML_TSAVORITE_KERNEL_TYPE_SIGMOID,
+  GGML_TSAVORITE_KERNEL_TYPE_SILU,
   GGML_TSAVORITE_KERNEL_TYPE_COUNT
 };
@@ -159,6 +160,7 @@ extern void _mlir_ciface_txe_neg(void *a, void *res);
 extern void _mlir_ciface_txe_abs(void *a, void *res);
 extern void _mlir_ciface_txe_sin(void *a, void *res);
 extern void _mlir_ciface_txe_sigmoid(void *a, void *res);
+extern void _mlir_ciface_txe_silu(void *a, void *res);
 extern void ggml_tsi_log_tensor_data(tensor_log log_data);
 #define NUM_OF_TXES 1
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index b30b4cb386f9f..1238093e41c81 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -939,8 +939,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else if (cur_backend_id != -1) {
-                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
+            // The code below is an optimization that is disabled for now, since the other
+            // operations are not yet implemented on the tsavorite backend
+            } else if (cur_backend_id != -1 || (node->op == GGML_OP_UNARY)) {
+                //ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
+                ggml_backend_sched_set_if_supported(sched, node, 0, node_backend_id);
             }
         }
     }
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
index e359906b61ce6..573220c8a7027 100644
--- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -436,6 +436,11 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
       kernel_pipeline->kernel_name = "TXE_SIGMOID";
       flag = true;
       break;
+    case GGML_TSAVORITE_KERNEL_TYPE_SILU:
+      kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_silu;
+      kernel_pipeline->kernel_name = "TXE_SILU";
+      flag = true;
+      break;
     default:
       break;
   }
@@ -580,15 +585,16 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d
     GGML_TSAVORITE_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_" #e); \
   }
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ADD, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SUB, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_MULT, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_DIV, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SQRT, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_NEG, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ABS, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIN, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, true);
+   GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ADD, true);
+   GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SUB, true);
+   GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_MULT, true);
+   GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_DIV, true); +
GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SQRT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_NEG, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ABS, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIN, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SILU, true); } GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); @@ -695,6 +701,7 @@ static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_devic case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_ABS: case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_SILU: break; default: return false; @@ -852,6 +859,10 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIGMOID; num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; break; + case GGML_UNARY_OP_SILU: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SILU; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; default: ggml_backend_tsavorite_device_rel( (struct ggml_backend_tsavorite_device_context *)backend->device->context); @@ -1806,6 +1817,7 @@ static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_ABS: case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_SILU: break; default: return false; diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index b6b998671544c..4d6a8c736a5a8 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -67,10 +67,12 @@ fi cat > ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL #!/bin/bash export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:\$(pwd) -mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_mult -mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_add -cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_mult/ -r -cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_add/ -r +tsi_kernels=("add" "sub" "mult" "div" "abs" "inv" "neg" "sin" "sqrt" "sigmoid" "silu") + +for kernel in "${tsi_kernels[@]}"; do + mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_$kernel + cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_$kernel/ -r +done EOL chmod +x ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh cp ${GGML_TSI_INSTALL_DIR}/fpga/blobs ${TSI_GGML_BUNDLE_INSTALL_DIR}/ -r From f919789e23efb9270616cff3970e6784f3fe5119 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Thu, 29 May 2025 14:17:33 -0700 Subject: [PATCH 08/35] @FIR-709: Fixed the script --- tsi-pkg-build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index 4d6a8c736a5a8..2dd5f048871b7 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -64,14 +64,14 @@ if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh ]; then rm -fr ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh fi -cat > ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL +cat > ./${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL #!/bin/bash export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:\$(pwd) tsi_kernels=("add" "sub" "mult" "div" "abs" "inv" "neg" "sin" "sqrt" "sigmoid" "silu") -for kernel in "${tsi_kernels[@]}"; do - mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_$kernel - cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_$kernel/ -r +for kernel in "\${tsi_kernels[@]}"; do + mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_\$kernel + cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_\$kernel/ -r done EOL chmod +x ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh From 9459c0c16b7e6ed83095e5c7b270069c24a56e7f Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Tue, 3 Jun 2025 09:39:54 -0700 Subject: [PATCH 09/35] @FIR-714: Updated SDK version to r0.1.3 version --- CMakeLists.txt | 6 ++++-- ggml-tsi-kernel | 2 +- 
tsi-pkg-build.sh | 7 ++++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e047785e603d6..d1986def391fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,8 @@ if (GGML_TSAVORITE) if (NOT DEFINED MLIR_COMPILER_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.2/compiler) + set (MLIR_COMPILER_DIR /proj/rel/sw/sdk-r.0.1.3/compiler) + message("MLIR_SDK_VERSION not set defaulting to ${MLIR_COMPILER_DIR}") else() set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) endif() @@ -23,7 +24,8 @@ if (GGML_TSAVORITE) if (NOT DEFINED RUNTIME_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (RUNTIME_DIR /proj/work/rel/sw/sdk-r.0.1.2/${GGML_TSAVORITE_TARGET}/runtime) + set (RUNTIME_DIR /proj/rel/sw/sdk-r.0.1.3/${GGML_TSAVORITE_TARGET}/runtime) + message("MLIR_SDK_VERSION not set defaulting to ${RUNTIME_DIR}") else() set (RUNTIME_DIR $ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) endif() diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel index d1383a04f29d0..9dcf09f210636 160000 --- a/ggml-tsi-kernel +++ b/ggml-tsi-kernel @@ -1 +1 @@ -Subproject commit d1383a04f29d0160750c0e51ab524d461c6a127b +Subproject commit 9dcf09f2106364d0dafa54bce743d1c11b701112 diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index 2dd5f048871b7..2a2c0afe462a3 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -6,12 +6,14 @@ echo 'updating submodule' git submodule update --recursive --init cd ggml-tsi-kernel/ module load tsi4 gcc/13.3.0 +export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.3 echo 'creating python virtual env' +/proj/local/Python-3.10.12/bin/python3 -m venv blob-creation python3 -m venv blob-creation source blob-creation/bin/activate echo 'installing mlir and python dependencies' -pip install -r /proj/rel/sw/mlir-compiler/python/requirements-common.txt -pip install /proj/rel/sw/mlir-compiler/python/mlir_external_packages-1.2.1-py3-none-any.whl +pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt +pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.2.3-py3-none-any.whl pip install onnxruntime-training #build TSI kernels for the Tsavorite backend @@ -31,7 +33,6 @@ cd ../posix-kernel/ cd ../../ -export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.2 #Compile for posix with build-posix as a target folder echo 'building llama.cp, ggml for tsavorite and other binary for posix' From c18585c2fa0c09015e3d2b08861bf704361889ff Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Tue, 3 Jun 2025 11:49:25 -0700 Subject: [PATCH 10/35] @FIR-714: Updated TLIBS to be passed to llama_build function --- tests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 31fa312f65da6..ade78632c7352 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,7 +8,7 @@ function(llama_build source) endif() add_executable(${TEST_TARGET} ${source}) - target_link_libraries(${TEST_TARGET} PRIVATE common) + target_link_libraries(${TEST_TARGET} PRIVATE common ${TLIBS}) install(TARGETS ${TEST_TARGET} RUNTIME) endfunction() @@ -169,7 +169,7 @@ endif() # libmtmd set(LLAMA_TEST_NAME test-mtmd-c-api) llama_build_and_test(test-mtmd-c-api.c) -target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) +target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd ${TLIBS}) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) From 47ceff08be48672c997ee3b3242fdf504cb54696 Mon Sep 17 
00:00:00 2001 From: Ashish Trivedi Date: Wed, 4 Jun 2025 10:50:11 -0700 Subject: [PATCH 11/35] @FIR-714: Updated to use 1.30 external dependencies --- CMakeLists.txt | 4 ++-- tests/CMakeLists.txt | 2 +- tsi-pkg-build.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d1986def391fa..6af525e29cdbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,14 +35,14 @@ if (GGML_TSAVORITE) set (GGML_TSI_KERNEL_DIR ${CMAKE_SOURCE_DIR}/ggml-tsi-kernel/${GGML_TSAVORITE_TARGET}) endif() - file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) set(CMAKE_CROSSCOMPILING ON) set(ARCH_FLAGS -march=armv8-a) + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") message("Setting target as fpga") elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") - list(APPEND TLIBS "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so") + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "${MLIR_COMPILER_DIR}/lib/libTsavRTPosixShimCAPI.so") message("Setting target as posix for tsavorite") endif() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ade78632c7352..6ffd975127e8e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -69,7 +69,7 @@ function(llama_build_and_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE common) + target_link_libraries(${TEST_TARGET} PRIVATE common ${TLIBS}) add_test( NAME ${TEST_TARGET} diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index 2a2c0afe462a3..488d98abb035b 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -13,7 +13,7 @@ python3 -m venv blob-creation source blob-creation/bin/activate echo 'installing mlir and python dependencies' pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt -pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.2.3-py3-none-any.whl +pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.3.0-py3-none-any.whl pip install onnxruntime-training #build TSI kernels for the Tsavorite backend From 2ea9390d102562f36d655638f0e7345d30c48ccd Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Wed, 4 Jun 2025 13:29:14 -0700 Subject: [PATCH 12/35] @FIR-714: Addressed build failures for posix, FPGA still fails as follows /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: /proj/work/atrivedi/workspace/06_02_2025/llama.cpp/ggml-tsi-kernel/fpga/host/host_abs.o: in function `txe_abs_host': LLVMDialectModule:(.text+0x18): undefined reference to `tsi_alloc' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x24): undefined reference to `tsi_shmem_handle_from_ptr' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x30): undefined reference to `tsi_shmem_handle_from_ptr' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x3c): undefined reference to `tsi_create_command_list' 
/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x58): undefined reference to `tsi_load_blob' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x64): undefined reference to `tsi_shmem_handle_from_ptr' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x70): undefined reference to `tsi_launch_blob' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x7c): undefined reference to `tsi_add_command_to_list' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x84): undefined reference to `tsi_finalize_command_list' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x8c): undefined reference to `tsi_wait' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x94): undefined reference to `tsi_unload_blob' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0xa0): undefined reference to `tsi_dealloc' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: /proj/work/atrivedi/workspace/06_02_2025/llama.cpp/ggml-tsi-kernel/fpga/host/host_add.o: in function `txe_add_host': LLVMDialectModule:(.text+0x20): undefined reference to `tsi_alloc' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x2c): undefined reference to `tsi_shmem_handle_from_ptr' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x38): undefined reference to `tsi_shmem_handle_from_ptr' --- CMakeLists.txt | 2 +- ggml/include/ggml-tsavorite.h | 20 ++++++++++---------- ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 20 ++++++++++---------- tests/CMakeLists.txt | 8 ++++---- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6af525e29cdbc..2eebb65851cad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ if (GGML_TSAVORITE) if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) set(CMAKE_CROSSCOMPILING ON) set(ARCH_FLAGS -march=armv8-a) - file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") message("Setting target as fpga") elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" 
"${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "${MLIR_COMPILER_DIR}/lib/libTsavRTPosixShimCAPI.so") diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h index 54a8e34662799..238dcc428da88 100644 --- a/ggml/include/ggml-tsavorite.h +++ b/ggml/include/ggml-tsavorite.h @@ -151,16 +151,16 @@ typedef struct tensor_log_ { const ggml_tensor *tensor; } tensor_log; -extern void _mlir_ciface_txe_add(void *a, void *b, void *res); -extern void _mlir_ciface_txe_sub(void *a, void *b, void *res); -extern void _mlir_ciface_txe_mult(void *a, void *b, void *res); -extern void _mlir_ciface_txe_div(void *a, void *b, void *res); -extern void _mlir_ciface_txe_sqrt(void *a, void *res); -extern void _mlir_ciface_txe_neg(void *a, void *res); -extern void _mlir_ciface_txe_abs(void *a, void *res); -extern void _mlir_ciface_txe_sin(void *a, void *res); -extern void _mlir_ciface_txe_sigmoid(void *a, void *res); -extern void _mlir_ciface_txe_silu(void *a, void *res); +extern void _mlir_ciface_txe_add_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_sub_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_mult_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_div_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_sqrt_host(void *a, void *res); +extern void _mlir_ciface_txe_neg_host(void *a, void *res); +extern void _mlir_ciface_txe_abs_host(void *a, void *res); +extern void _mlir_ciface_txe_sin_host(void *a, void *res); +extern void _mlir_ciface_txe_sigmoid_host(void *a, void *res); +extern void _mlir_ciface_txe_silu_host(void *a, void *res); extern void ggml_tsi_log_tensor_data(tensor_log log_data); #define NUM_OF_TXES 1 diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index 573220c8a7027..bc7095eeebf2f 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -389,12 +389,12 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_ if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add_test; else - kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add; + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add_host; kernel_pipeline->kernel_name = "TXE_ADD"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SUB: - kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_sub; + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_sub_host; kernel_pipeline->kernel_name = "TXE_SUB"; flag = true; break; @@ -402,42 +402,42 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_ if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult_test; else - kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult; + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult_host; kernel_pipeline->kernel_name = "TXE_MULT"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_DIV: - kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_div; + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_div_host; kernel_pipeline->kernel_name = "TXE_DIV"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SQRT: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sqrt; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sqrt_host; kernel_pipeline->kernel_name = "TXE_SQRT"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_NEG: - 
kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_neg; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_neg_host; kernel_pipeline->kernel_name = "TXE_NEG"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_ABS: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_abs; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_abs_host; kernel_pipeline->kernel_name = "TXE_ABS"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SIN: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sin; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sin_host; kernel_pipeline->kernel_name = "TXE_SIN"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sigmoid; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sigmoid_host; kernel_pipeline->kernel_name = "TXE_SIGMOID"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SILU: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_silu; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_silu_host; kernel_pipeline->kernel_name = "TXE_SILU"; flag = true; break; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6ffd975127e8e..c2b5cc88ad330 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,7 +8,7 @@ function(llama_build source) endif() add_executable(${TEST_TARGET} ${source}) - target_link_libraries(${TEST_TARGET} PRIVATE common ${TLIBS}) + target_link_libraries(${TEST_TARGET} PRIVATE common) install(TARGETS ${TEST_TARGET} RUNTIME) endfunction() @@ -69,7 +69,7 @@ function(llama_build_and_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE common ${TLIBS}) + target_link_libraries(${TEST_TARGET} PRIVATE common) add_test( NAME ${TEST_TARGET} @@ -169,9 +169,9 @@ endif() # libmtmd set(LLAMA_TEST_NAME test-mtmd-c-api) llama_build_and_test(test-mtmd-c-api.c) -target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd ${TLIBS}) +target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) -target_link_libraries(${TEST_TARGET} PRIVATE llama ${TLIBS}) +target_link_libraries(${TEST_TARGET} PRIVATE ${TLIBS} llama) From cea50afa37a04114429b07abf9e6849c0d3a2ecb Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Wed, 4 Jun 2025 18:06:58 -0700 Subject: [PATCH 13/35] @FIR-714: Fixed the issues of not finding fpga libs using runtime/utils/lib/ path --- CMakeLists.txt | 6 +++--- tests/CMakeLists.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2eebb65851cad..a4d51cdbe2dc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED MLIR_COMPILER_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) set (MLIR_COMPILER_DIR /proj/rel/sw/sdk-r.0.1.3/compiler) - message("MLIR_SDK_VERSION not set defaulting to ${MLIR_COMPILER_DIR}") + message("MLIR_SDK_VERSION not set defaulting to ${MLIR_COMPILER_DIR}") else() set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) endif() @@ -25,7 +25,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED RUNTIME_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) set (RUNTIME_DIR /proj/rel/sw/sdk-r.0.1.3/${GGML_TSAVORITE_TARGET}/runtime) - message("MLIR_SDK_VERSION not set defaulting to ${RUNTIME_DIR}") + message("MLIR_SDK_VERSION not set defaulting to ${RUNTIME_DIR}") else() set (RUNTIME_DIR 
$ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) endif() @@ -39,7 +39,7 @@ if (GGML_TSAVORITE) if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) set(CMAKE_CROSSCOMPILING ON) set(ARCH_FLAGS -march=armv8-a) - file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" "${RUNTIME_DIR}/../utils/lib/TsavRTShimCAPI.cpp.o") message("Setting target as fpga") elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "${MLIR_COMPILER_DIR}/lib/libTsavRTPosixShimCAPI.so") diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c2b5cc88ad330..1c8b8e29a822e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -174,4 +174,4 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) -target_link_libraries(${TEST_TARGET} PRIVATE ${TLIBS} llama) +target_link_libraries(${TEST_TARGET} PRIVATE ${TLIBS} llama stdc++) From bbecb0102ccb617bb88aac21055a589cb5c3ef6c Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Fri, 6 Jun 2025 20:50:10 -0700 Subject: [PATCH 14/35] Updated README --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 30f91f4696a33..f222c9a1a8ae1 100644 --- a/README.md +++ b/README.md @@ -588,15 +588,22 @@ git clone git@github.com:tsisw/llama.cpp.git #Ensure prerequisites are met as follows cd llama.cpp/ +#Ensure prerequisites are met as follows +echo 'updating submodule' git submodule update --recursive --init cd ggml-tsi-kernel/ module load tsi4 gcc/13.3.0 -python3 -m venv blob-creation +export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.3 +echo 'creating python virtual env' +/proj/local/Python-3.10.12/bin/python3 -m venv blob-creation source blob-creation/bin/activate -pip install -r /proj/rel/sw/sdk-r.0.1.3/compiler/python/requirements-common.txt -pip install /proj/rel/sw/sdk-r.0.1.3/compiler/python/mlir_external_packages-1.3.0-py3-none-any.whl +echo 'installing mlir and python dependencies' +pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt +pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.3.0-py3-none-any.whl pip install onnxruntime-training + + #build TSI kernels for the Tsavorite backend #First for FPGA cd fpga-kernel From d7685c7ec3a83534bb699c6cfc56d28f78134930 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Sun, 8 Jun 2025 12:38:56 -0700 Subject: [PATCH 15/35] @FIR-722 --updating the latest changes for ggml-tsi-kernel code --- ggml-tsi-kernel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel index 0696a053d553c..3194c54b13b8c 160000 --- a/ggml-tsi-kernel +++ b/ggml-tsi-kernel @@ -1 +1 @@ -Subproject commit 0696a053d553c99488c1f7d60a859afdce9712df +Subproject commit 3194c54b13b8cd0b5c29a6a1cc0060ae2abbed06 From 9688963ed09199949b6c38dad67a1b061b38f933 Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Tue, 10 Jun 2025 11:33:49 -0700 Subject: [PATCH 16/35] @FIR-715: Added FlaskInterface tool for serial port This is a first version of FlaskInterface tool with following 1. Xterm Interface via Browser via /terminal endpoint 2. 
Serial console interface via Browser via /serial endpoint --- tools/flaskIfc/README.md | 46 ++++++++++++++++++ tools/flaskIfc/flaskCommon.py | 85 +++++++++++++++++++++++++++++++++ tools/flaskIfc/flaskIfc.py | 32 +++++++++++++ tools/flaskIfc/flaskXterm.py | 43 +++++++++++++++++ tools/flaskIfc/serial_script.py | 44 +++++++++++++++++ 5 files changed, 250 insertions(+) create mode 100644 tools/flaskIfc/README.md create mode 100644 tools/flaskIfc/flaskCommon.py create mode 100644 tools/flaskIfc/flaskIfc.py create mode 100644 tools/flaskIfc/flaskXterm.py create mode 100644 tools/flaskIfc/serial_script.py diff --git a/tools/flaskIfc/README.md b/tools/flaskIfc/README.md new file mode 100644 index 0000000000000..4893dc75147c8 --- /dev/null +++ b/tools/flaskIfc/README.md @@ -0,0 +1,46 @@ +This tool provides you an interface to Tsavorite FPGA via a serial console + +The tool consists of following files + +. +├── flaskCommon.py << Common code but currently not used +├── flaskIfc.py << Browser based console interface to TSI device +├── flaskXterm.py << Browser based terminal emulation +├── README.md << Readme file +└── serial_script.py << File with serial interface to console + + +The command to run to run the service on FPGA machine is +``` +flask -A flaskIfc.py --debug run --port 5000 +``` + +This command runs a webserver at port number 500 + +The curl command to connect to this server and communicate is as follows as +an example + +``` +curl "http://localhost:5000/serial?command=cd+%20/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/;./run_platform_test.sh" +``` + +In the above command the command being run is + +``` +cd /usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin +./run_platform_test.sh +``` + +You can also get full fledged Terminal within a browser by running following + +``` +flask -A flaskXterm.py --debug run --port 5000 +``` + +You can connect to this flaskTerm by doing as follows + +``` +http://127.0.0.1:5000/terminal +``` + + diff --git a/tools/flaskIfc/flaskCommon.py b/tools/flaskIfc/flaskCommon.py new file mode 100644 index 0000000000000..eb93a63fcf395 --- /dev/null +++ b/tools/flaskIfc/flaskCommon.py @@ -0,0 +1,85 @@ + +from flask import Flask +from flask_terminal import terminal_blueprint, configure_logger +from flask import Flask, render_template, request +import serial + + +app = Flask(__name__) +app.logger = configure_logger('flask_terminal') + +app.config['SECRET_KEY'] = 'your_secret_key_here' + + +@app.route('/ping') +def ping(): + app.logger.info("Accessed /ping route") + try: + app.logger.info("Successfully returned 'pong'") + return 'pong', 200 + except Exception as e: + app.logger.error(f"Error in ping route: {e}", exc_info=True) + return "An error occurred", 500 + +#### +## IMPLEMENT SOME SORT OF SECURITY +## Around your application, below is an example +### +def is_authenticated(): + """Check if the user is authenticated based on a token stored in the session.""" + # Example logic for checking if a user is authenticated + return 'user_token' in session and session['user_token'] == 'your_secure_token' + +#@terminal_blueprint.before_request +#def before_request_func(): +# if not is_authenticated(): + # Redirect to login page or return an error +# current_app.logger.info("User not authenticated, redirecting to login.") +# return redirect('/login') # Adjusted to use a direct path + + +# Register the terminal blueprint +#app.register_blueprint(terminal_blueprint, url_prefix='/terminal') + +#if __name__ == '__main__': +# app.run(port=8080) + + + +try: + ser = serial.Serial('/dev/ttyUSB3', 
921600) # Replace /dev/ttyUSB3 with your port and baud rate +except serial.SerialException as e: + print(f"Error opening serial port: {e}") + ser = None # Handle case where serial port cannot be opened + +@app.route('/send', methods=['POST']) +def send_data(): + if ser is None: + return "Serial port not available", 500 + data = request.form['data'] # Get data from the form + try: + ser.write(data.encode()) # Convert to bytes and send + return 'Data sent successfully' + except serial.SerialException as e: + return f"Error writing to serial port: {e}", 500 + + +@app.route('/receive') +def receive_data(): + if ser is None: + return "Serial port not available", 500 + try: + if ser.in_waiting > 0: + data = ser.readline().decode().strip() # Read and decode + return data + else: + return "No data available" + except serial.SerialException as e: + return f"Error reading from serial port: {e}", 500 + +# Register the terminal blueprint +#app.register_blueprint(terminal_blueprint, url_prefix='/terminal') + +if __name__ == '__main__': + app.run(port=8080) + diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py new file mode 100644 index 0000000000000..61187c91a09d4 --- /dev/null +++ b/tools/flaskIfc/flaskIfc.py @@ -0,0 +1,32 @@ +from flask import Flask, request +import subprocess + +app = Flask(__name__) + +@app.route('/serial', methods=['GET']) +def serial_command(): + # Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized + port = '/dev/ttyUSB3' + #port = request.args.get('port') + + # Currently the baudrate is hard coded to 921600 but can be parameterized + #baudrate = request.args.get('baudrate') + baudrate = '921600' + + + # Parse the command and send it to serial.py + command = request.args.get('command') + + #if not all([port, baudrate, command]): + if not all([command]): + return "Missing parameters", 400 + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout.strip(), 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + + +if __name__ == '__main__': + app.run(debug=True, port=5000) diff --git a/tools/flaskIfc/flaskXterm.py b/tools/flaskIfc/flaskXterm.py new file mode 100644 index 0000000000000..df7ecf391471d --- /dev/null +++ b/tools/flaskIfc/flaskXterm.py @@ -0,0 +1,43 @@ + +from flask import Flask +from flask_terminal import terminal_blueprint, configure_logger + + +app = Flask(__name__) +app.logger = configure_logger('flask_terminal') + +app.config['SECRET_KEY'] = 'your_secret_key_here' + + +@app.route('/ping') +def ping(): + app.logger.info("Accessed /ping route") + try: + app.logger.info("Successfully returned 'pong'") + return 'pong', 200 + except Exception as e: + app.logger.error(f"Error in ping route: {e}", exc_info=True) + return "An error occurred", 500 + +#### +## IMPLEMENT SOME SORT OF SECURITY +## Around your application, below is an example +### +def is_authenticated(): + """Check if the user is authenticated based on a token stored in the session.""" + # Example logic for checking if a user is authenticated + return 'user_token' in session and session['user_token'] == 'your_secure_token' + +#@terminal_blueprint.before_request +#def before_request_func(): +# if not is_authenticated(): + # Redirect to login page or return an error +# current_app.logger.info("User not authenticated, redirecting to login.") +# return redirect('/login') # Adjusted to use a direct path + + +# Register the 
terminal blueprint +app.register_blueprint(terminal_blueprint, url_prefix='/terminal') + +if __name__ == '__main__': + app.run(port=8080) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py new file mode 100644 index 0000000000000..e61f926d0bf47 --- /dev/null +++ b/tools/flaskIfc/serial_script.py @@ -0,0 +1,44 @@ +import serial +import sys + +def send_serial_command(port, baudrate, command): + try: + # Open the serial port with 1 second timeout + ser = serial.Serial(port, baudrate, timeout=1) + + ser.write(command.encode()) # Encode command to bytes + ser.write('\n'.encode()) # Encode command to bytes + + # Wait to read the serial port + data = '\0' + while True: + try: + line = ser.readline() + if line: # Check if line is not empty + data += (line.decode('utf-8').strip()) # Decode and strip to remove extra chars + else: + break # Exit loop if no data is received + except serial.SerialException as e: + ser.close() + return (f"Error reading from serial port: {e}") + except KeyboardInterrupt: + ser.close() + return ("Program interrupted by user") + ser.close() + return data + + except serial.SerialException as e: + ser.close() + return f"Error: {e}" + +# This script can be run in standalone as well +if __name__ == "__main__": + if len(sys.argv) < 4: + print("Usage: python script.py ") + sys.exit(1) + + port = sys.argv[1] + baudrate = int(sys.argv[2]) + command = sys.argv[3] + response = send_serial_command(port, baudrate, command) + print(response) From a4b77bfd0006da521c1fb7624791a8f45d9f3227 Mon Sep 17 00:00:00 2001 From: LewisLui777 <777abc.7@berkeley.edu> Date: Wed, 11 Jun 2025 10:20:02 -0700 Subject: [PATCH 17/35] Just wanted to see if I could push. Added one comment --- tools/flaskIfc/serial_script.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index b30e6fae1dade..e138d19ab7de0 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -10,6 +10,7 @@ def send_serial_command(port, baudrate, command): ser.write('\n'.encode()) # Encode command to bytes # Wait to read the serial port + # Need to add a break somewhere for when we see the phrase "root@name" data = '\0' while True: try: From 21ba6d11ca8f2c95ed8110d2c570550a1288a43e Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Wed, 11 Jun 2025 16:49:18 -0700 Subject: [PATCH 18/35] @FIR-732 - Llama.cpp: Webserver & HTML pages support --- tools/flaskIfc/flaskIfc.py | 54 +++++++++++++++++++++------- tools/flaskIfc/serial_script.py | 6 ++-- tools/flaskIfc/templates/index.html | 38 ++++++++++++++++++++ tools/flaskIfc/templates/result.html | 12 +++++++ 4 files changed, 93 insertions(+), 17 deletions(-) create mode 100644 tools/flaskIfc/templates/index.html create mode 100644 tools/flaskIfc/templates/result.html diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 61187c91a09d4..4d65c9a7ffa0e 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -1,32 +1,60 @@ -from flask import Flask, request +from flask import Flask, render_template, request import subprocess app = Flask(__name__) -@app.route('/serial', methods=['GET']) -def serial_command(): +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/submit', methods=['POST']) +def submit(): + #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" + model = request.form.get('model') + backend = request.form.get('backend') + tokens = request.form.get('tokens') + prompt = 
request.form.get('prompt') + + # Define the model path (update with actual paths) + model_paths = { + "tiny-llama": "tinyllama-vo-5m-para.gguf", + "Tiny-llama-F32": "Tiny-Llama-v0.3-FP32-1.1B-F32.gguf" + } + + model_path = model_paths.get(model, "") + if not model_path: + return f"
Error: Model path not found for '{model}'
" + + # Below is for reference i will remove later + # Build llama-cli command + #command = [ + # "./llama-cli", + # "-p", prompt, + # "-m", model_path, + # "--device", backend, + # "--temp", "0", + # "--n-predict", tokens, + # "--repeat-penalty", "1", + # "--top-k", "0", + # "--top-p", "1" + #] # Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized port = '/dev/ttyUSB3' - #port = request.args.get('port') # Currently the baudrate is hard coded to 921600 but can be parameterized - #baudrate = request.args.get('baudrate') baudrate = '921600' + script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" - # Parse the command and send it to serial.py - command = request.args.get('command') - - #if not all([port, baudrate, command]): - if not all([command]): - return "Missing parameters", 400 try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) - return result.stdout.strip(), 200 + output = result.stdout # This should have \n except subprocess.CalledProcessError as e: - return f"Error executing script: {e.stderr}", 500 + output = f"Error running model: {e.stderr}" + return render_template('result.html', output=output) if __name__ == '__main__': app.run(debug=True, port=5000) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index e138d19ab7de0..cde5e0cd54dfc 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -4,19 +4,18 @@ def send_serial_command(port, baudrate, command): try: # Open the serial port with 1 second timeout - ser = serial.Serial(port, baudrate, timeout=60) + ser = serial.Serial(port, baudrate, timeout=20) ser.write(command.encode()) # Encode command to bytes ser.write('\n'.encode()) # Encode command to bytes # Wait to read the serial port - # Need to add a break somewhere for when we see the phrase "root@name" data = '\0' while True: try: line = ser.readline() if line: # Check if line is not empty - data += (line.decode('utf-8').strip()) # Decode and strip to remove extra chars + data += line.decode('utf-8') # Keep the line as-is with newline else: break # Exit loop if no data is received except serial.SerialException as e: @@ -42,4 +41,3 @@ def send_serial_command(port, baudrate, command): baudrate = int(sys.argv[2]) command = sys.argv[3] response = send_serial_command(port, baudrate, command) - print(response) diff --git a/tools/flaskIfc/templates/index.html b/tools/flaskIfc/templates/index.html new file mode 100644 index 0000000000000..9152167a86c44 --- /dev/null +++ b/tools/flaskIfc/templates/index.html @@ -0,0 +1,38 @@ + + + + TSAVORITE Web UI For Model Inference + + +
+<h2>Model Inference Configuration</h2>
+<form action="/submit" method="POST">
+  <!-- selects/inputs for the model, backend, tokens, and prompt fields read by the /submit handler -->
+  <input type="submit" value="Submit">
+</form>
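+<!-- Example of driving this form from the command line; the host and port are
+     assumptions taken from the "flask ... --port 5000" examples earlier:
+     curl -X POST http://localhost:5000/submit \
+          -d "model=tiny-llama" -d "backend=tSavorite" \
+          -d "tokens=10" -d "prompt=Hello How are you"
+-->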
+ + diff --git a/tools/flaskIfc/templates/result.html b/tools/flaskIfc/templates/result.html new file mode 100644 index 0000000000000..07c79c409f596 --- /dev/null +++ b/tools/flaskIfc/templates/result.html @@ -0,0 +1,12 @@ + + + + Model Output + + +
+<h2>Model Response</h2>
+<pre>{{ output }}</pre>
+ ⟵ Back to Form + + From 597f92800787739d23e95bd5bf700d5e277e9c42 Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Wed, 11 Jun 2025 20:15:09 -0700 Subject: [PATCH 19/35] @FIR-732: Added print back to ensure stdout has data --- tools/flaskIfc/serial_script.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index cde5e0cd54dfc..0e1064225921f 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -4,7 +4,7 @@ def send_serial_command(port, baudrate, command): try: # Open the serial port with 1 second timeout - ser = serial.Serial(port, baudrate, timeout=20) + ser = serial.Serial(port, baudrate, timeout=60) ser.write(command.encode()) # Encode command to bytes ser.write('\n'.encode()) # Encode command to bytes @@ -25,6 +25,7 @@ def send_serial_command(port, baudrate, command): ser.close() return ("Program interrupted by user") ser.close() + print (data) return data except serial.SerialException as e: From 8a5ffff1fee344726e4e6da14016fd92e6ad14b6 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Wed, 11 Jun 2025 21:46:16 -0700 Subject: [PATCH 20/35] @FIR-733 - Lllama.cpp: Webserver, add JOB status support for Model --- tools/flaskIfc/flaskIfc.py | 55 ++++++++++++++++++++---- tools/flaskIfc/templates/processing.html | 22 ++++++++++ 2 files changed, 68 insertions(+), 9 deletions(-) create mode 100644 tools/flaskIfc/templates/processing.html diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 4d65c9a7ffa0e..e2945f680dfc1 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -1,5 +1,9 @@ from flask import Flask, render_template, request import subprocess +import threading +import time + +job_status = {"running": False, "result": "", "thread": None} app = Flask(__name__) @@ -9,6 +13,11 @@ def index(): @app.route('/submit', methods=['POST']) def submit(): + global job_status + + if job_status["running"]: + return "
A model is already running. Please wait or abort.
" + #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" model = request.form.get('model') backend = request.form.get('backend') @@ -25,7 +34,6 @@ def submit(): if not model_path: return f"
Error: Model path not found for '{model}'
" - # Below is for reference i will remove later # Build llama-cli command #command = [ # "./llama-cli", @@ -43,18 +51,47 @@ def submit(): # Currently the baudrate is hard coded to 921600 but can be parameterized baudrate = '921600' - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + #command = script_path prompt tokens model backend + #command = script_path command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" - try: - result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) - output = result.stdout # This should have \n - except subprocess.CalledProcessError as e: - output = f"Error running model: {e.stderr}" + def run_script(): + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + job_status["result"] = result.stdout + except subprocess.CalledProcessError as e: + job_status["result"] = f"Error: {e.stderr}" + finally: + job_status["running"] = False + + thread = threading.Thread(target=run_script) + job_status = {"running": True, "result": "", "thread": thread} + thread.start() + + return render_template("processing.html") + +@app.route('/status') +def status(): + if job_status["running"]: + return "running" + else: + return "done" + +@app.route('/result') +def result(): + return render_template("result.html", output=job_status["result"]) - return render_template('result.html', output=output) +@app.route('/abort') +def abort(): + global job_status + if job_status["running"] and job_status["thread"].is_alive(): + # Use subprocess.Popen + pid handling instead for real process termination + job_status["running"] = False + job_status["result"] = "Aborted by user." + return "
Job aborted.
Home"
    return "
No job running.
Home" if __name__ == '__main__': - app.run(debug=True, port=5000) + app.run(debug=True, port=5001) diff --git a/tools/flaskIfc/templates/processing.html b/tools/flaskIfc/templates/processing.html new file mode 100644 index 0000000000000..15f609bee1712 --- /dev/null +++ b/tools/flaskIfc/templates/processing.html @@ -0,0 +1,22 @@ + + + + Processing + + + +
+<h2>Model is running...</h2>
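+<!-- A minimal auto-refresh sketch; the 2-second polling interval is an
+     assumption, while /status and /result are the Flask routes defined in
+     flaskIfc.py above: -->
+<script>
+  // Poll /status until the Flask side reports "done", then show the result.
+  setInterval(function () {
+    fetch('/status')
+      .then(function (resp) { return resp.text(); })
+      .then(function (state) {
+        if (state === 'done') { window.location.href = '/result'; }
+      });
+  }, 2000);
+</script>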
+ + + From 52ae0e9eedc0fd11642eda9740c03cf93f1106dc Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Wed, 11 Jun 2025 21:49:13 -0700 Subject: [PATCH 21/35] removing commented code --- tools/flaskIfc/flaskIfc.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index e2945f680dfc1..bba12448a1177 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -52,8 +52,6 @@ def submit(): # Currently the baudrate is hard coded to 921600 but can be parameterized baudrate = '921600' script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" - #command = script_path prompt tokens model backend - #command = script_path command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" @@ -94,4 +92,4 @@ def abort(): return "
No job running.
Home" if __name__ == '__main__': - app.run(debug=True, port=5001) + app.run(debug=True, port=5000) From ffe045a424894637ee58ea2cc639e0be50c92b92 Mon Sep 17 00:00:00 2001 From: Lewis Lui Date: Thu, 12 Jun 2025 13:58:36 -0700 Subject: [PATCH 22/35] @FIR-731 - serial_script.py changes to identify end of output --- tools/flaskIfc/serial_script.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index 0e1064225921f..0354f15ff2ca3 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -3,19 +3,21 @@ def send_serial_command(port, baudrate, command): try: - # Open the serial port with 1 second timeout - ser = serial.Serial(port, baudrate, timeout=60) + # Open the serial port with 1 second timeout (timeout = 60 but removed it for testing!) + ser = serial.Serial(port, baudrate) - ser.write(command.encode()) # Encode command to bytes - ser.write('\n'.encode()) # Encode command to bytes + ser.write((command + '\n').encode()) # Send command with newline # Wait to read the serial port data = '\0' while True: try: line = ser.readline() + check = line.decode('utf-8') + if ("run-platform-done" in check) or ("@agilex7_dk_si_agf014ea" in check): + break if line: # Check if line is not empty - data += line.decode('utf-8') # Keep the line as-is with newline + data += check # Keep the line as-is with newline else: break # Exit loop if no data is received except serial.SerialException as e: @@ -25,7 +27,6 @@ def send_serial_command(port, baudrate, command): ser.close() return ("Program interrupted by user") ser.close() - print (data) return data except serial.SerialException as e: @@ -42,3 +43,4 @@ def send_serial_command(port, baudrate, command): baudrate = int(sys.argv[2]) command = sys.argv[3] response = send_serial_command(port, baudrate, command) + print(response) From 3211f60eeab04f757d2fce0f730aad87b584937b Mon Sep 17 00:00:00 2001 From: Lewis Lui Date: Thu, 12 Jun 2025 14:35:23 -0700 Subject: [PATCH 23/35] Some more changes to address the comments --- tools/flaskIfc/serial_script.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index 0354f15ff2ca3..aca14a60f5c9a 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -13,11 +13,11 @@ def send_serial_command(port, baudrate, command): while True: try: line = ser.readline() - check = line.decode('utf-8') - if ("run-platform-done" in check) or ("@agilex7_dk_si_agf014ea" in check): - break if line: # Check if line is not empty - data += check # Keep the line as-is with newline + read_next_line = line.decode('utf-8') + if ("run-platform-done" in read_next_line) or ("@agilex7_dk_si_agf014ea" in read_next_line): + break + data += read_next_line # Keep the line as-is with newline else: break # Exit loop if no data is received except serial.SerialException as e: @@ -27,6 +27,7 @@ def send_serial_command(port, baudrate, command): ser.close() return ("Program interrupted by user") ser.close() + print(data) return data except serial.SerialException as e: @@ -43,4 +44,3 @@ def send_serial_command(port, baudrate, command): baudrate = int(sys.argv[2]) command = sys.argv[3] response = send_serial_command(port, baudrate, command) - print(response) From a411fd97096431afd4a759487d1236890791ae62 Mon Sep 17 00:00:00 2001 From: Lewis Lui Date: Thu, 12 Jun 2025 14:43:29 -0700 Subject: [PATCH 24/35] Removed a comment --- 
tools/flaskIfc/serial_script.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index aca14a60f5c9a..a91587341e557 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -3,7 +3,6 @@ def send_serial_command(port, baudrate, command): try: - # Open the serial port with 1 second timeout (timeout = 60 but removed it for testing!) ser = serial.Serial(port, baudrate) ser.write((command + '\n').encode()) # Send command with newline From 41d98b7ce06914f544e4a3b7902e1a5ece53082e Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Fri, 13 Jun 2025 10:32:36 -0700 Subject: [PATCH 25/35] @FIR-737: Added another endpoint llama-cli t invoke directly in URL This commit has two changes 1. Added another endpoint llama-cli to invole the run_platform_test.sh directly 2. Updated reading of output to byte by byte to identify marking prompt and exit when the marker is seen --- tools/flaskIfc/flaskIfc.py | 50 +++++++++++++++++++++++++++++++++ tools/flaskIfc/serial_script.py | 12 ++++++-- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index bba12448a1177..7d2333b36e0ce 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -11,6 +11,56 @@ def index(): return render_template('index.html') +@app.route('/llama-cli', methods=['GET']) +def serial_command(): + # Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized + port = '/dev/ttyUSB3' + #port = request.args.get('port') + + # Currently the baudrate is hard coded to 921600 but can be parameterized + #baudrate = request.args.get('baudrate') + baudrate = '921600' + #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" + model = request.args.get('model') + backend = request.args.get('backend') + tokens = request.args.get('tokens') + prompt = request.args.get('prompt') + + # Define the model path (update with actual paths) + model_paths = { + "tiny-llama": "tinyllama-vo-5m-para.gguf", + "Tiny-llama-F32": "Tiny-Llama-v0.3-FP32-1.1B-F32.gguf" + } + + model_path = model_paths.get(model, "") + if not model_path: + return f"
Error: Model path not found for '{model}'
" + + # Build llama-cli command + #command = [ + # "./llama-cli", + # "-p", prompt, + # "-m", model_path, + # "--device", backend, + # "--temp", "0", + # "--n-predict", tokens, + # "--repeat-penalty", "1", + # "--top-k", "0", + # "--top-p", "1" + #] + # URL to Test this end point is as follows + # http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you + script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + + + @app.route('/submit', methods=['POST']) def submit(): global job_status diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index a91587341e557..38a53d103f0c8 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -6,15 +6,21 @@ def send_serial_command(port, baudrate, command): ser = serial.Serial(port, baudrate) ser.write((command + '\n').encode()) # Send command with newline - # Wait to read the serial port data = '\0' while True: try: - line = ser.readline() + # read byte by byte to find either a new line character or a prompt marker + # instead of new line using line = ser.readline() + line = b"" + while True: + byte = ser.read(1) # Read one byte at a time + if (byte == b"\n") or (byte == b"#"): # Stop when delimiter is found + break + line += byte if line: # Check if line is not empty read_next_line = line.decode('utf-8') - if ("run-platform-done" in read_next_line) or ("@agilex7_dk_si_agf014ea" in read_next_line): + if ("run-platform-done" in read_next_line.strip()) or ("@agilex7_dk_si_agf014ea" in read_next_line.strip()) or ("imx8mpevk" in read_next_line.strip()): break data += read_next_line # Keep the line as-is with newline else: From 2aeae8f8759989ee9780da82e162c0d82b00ce82 Mon Sep 17 00:00:00 2001 From: atrivedi-tsavoritesi Date: Fri, 13 Jun 2025 16:14:17 -0700 Subject: [PATCH 26/35] @FIR-738: Updated the run_llama_cli to be run instead of (#12) run_platform_test.sh Co-authored-by: Ashish Trivedi --- tools/flaskIfc/flaskIfc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 7d2333b36e0ce..cffc2e4a2b27e 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -50,7 +50,7 @@ def serial_command(): #] # URL to Test this end point is as follows # http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" try: From 52e4a5804d7dc0fbeca76cab2f66c4ef8dad72f5 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Sat, 14 Jun 2025 14:26:43 -0700 Subject: [PATCH 27/35] @FIR-736 - lama.cpp: Disable all logs except token generation log --- common/log.h | 9 +++++++++ ggml/include/ggml.h | 1 + ggml/src/ggml-impl.h | 1 + ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 5 ++--- ggml/src/ggml.c | 1 + src/llama-context.cpp | 8 +++++++- src/llama-impl.h | 1 + src/llama-sampling.cpp | 2 ++ tools/main/main.cpp | 7 +++++++ 9 files changed, 31 insertions(+), 4 deletions(-) diff 
--git a/common/log.h b/common/log.h index c56bb50d95db0..0e23b669fcc22 100644 --- a/common/log.h +++ b/common/log.h @@ -90,11 +90,20 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) #define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__) #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) +#if ENABLE_LOG #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__) #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__) #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__) #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__) #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__) +#else +#define LOG_INF(...) +#define LOG_WRN(...) +#define LOG_ERR(...) +#define LOG_DBG(...) +#define LOG_CNT(...) +#endif +#define LOG_TSAVORITE(...) LOG_TMPL(GGML_LOG_LEVEL_TSAVORITE, 0, __VA_ARGS__) #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__) #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c81ff03fee810..e6830b63ba8e1 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -554,6 +554,7 @@ extern "C" { GGML_LOG_LEVEL_WARN = 3, GGML_LOG_LEVEL_ERROR = 4, GGML_LOG_LEVEL_CONT = 5, // continue previous log + GGML_LOG_LEVEL_TSAVORITE = 42, }; // this tensor... diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index a19cfb14e0f9f..99c3475fc10cf 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -85,6 +85,7 @@ GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * #define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) +#define GGML_LOG_TSAVORITE(...) 
ggml_log_internal(GGML_LOG_LEVEL_TSAVORITE , __VA_ARGS__) #define GGML_DEBUG 0 diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index bc7095eeebf2f..c49d02375921f 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -504,7 +504,6 @@ static void *ggml_tsavorite_host_malloc(size_t n) { GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); GGML_TSAVORITE_LOG_INFO("\n Allocating memory from tsi_alloc with size %ld \n", n); - printf("\n ANoop Allocating memory from tsi_alloc with size %ld \n", n); data = tsi_alloc(n); GGML_TSAVORITE_LOG_CONT("\n Allocating memory from tsi_alloc with size %ld starting memory %p\n", n, data); @@ -1800,7 +1799,6 @@ static bool ggml_backend_tsavorite_device_supports_buft(ggml_backend_dev_t dev, // ggml_backend_sched_backend_id_from_cur -> ggml_backend_offload_op -> static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { - // printf("\n ANoop Calling %s \n ", __func__); if (op->type != GGML_TYPE_F32) return false; switch (op->op) { @@ -1894,8 +1892,9 @@ static struct ggml_backend_reg_i ggml_backend_tsavorite_reg_i = { /* .get_proc_address = */ NULL, }; + ggml_backend_reg_t ggml_backend_tsavorite_reg(void) { - ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ERROR; + ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_NONE; ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR; GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); g_ggml_backend_tsavorite_reg.iface = ggml_backend_tsavorite_reg_i; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 57d3e39adf758..134b7420de746 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -249,6 +249,7 @@ static void ggml_log_internal_v(enum ggml_log_level level, const char * format, void ggml_log_internal(enum ggml_log_level level, const char * format, ...) 
{ va_list args; va_start(args, format); + if (level == GGML_LOG_LEVEL_TSAVORITE) ggml_log_internal_v(level, format, args); va_end(args); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 85b4324b699e6..984dbf14d14ae 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2615,13 +2615,19 @@ void llama_perf_context_print(const llama_context * ctx) { const auto data = llama_perf_context(ctx); const double t_end_ms = 1e-3 * ggml_time_us(); - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + + LLAMA_LOG_TSAVORITE("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); + LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LLAMA_LOG_TSAVORITE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); } void llama_perf_context_reset(llama_context * ctx) { diff --git a/src/llama-impl.h b/src/llama-impl.h index 02b1d07f8400d..abc963a4a14e7 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -29,6 +29,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void * #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) +#define LLAMA_LOG_TSAVORITE(...) 
llama_log_internal(GGML_LOG_LEVEL_TSAVORITE, __VA_ARGS__) // // helpers diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 804b11e0a943e..d012a0ce520e0 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2562,6 +2562,8 @@ void llama_perf_sampler_print(const struct llama_sampler * chain) { LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); + LLAMA_LOG_TSAVORITE("\n\n%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); } void llama_perf_sampler_reset(struct llama_sampler * chain) { diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 1bd2be2d94f51..26842116ec6df 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -41,6 +41,12 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +static void my_logger(ggml_log_level level, const char *text, void *user_data) { + if (level == GGML_LOG_LEVEL_TSAVORITE) { + fprintf(stderr, "%s", text); // only show warnings or errors + } +} + static void print_usage(int argc, char ** argv) { (void) argc; @@ -120,6 +126,7 @@ int main(int argc, char ** argv) { LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } + llama_log_set(my_logger, nullptr); LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); From 61915987c54a7c58f60d6c6bd53034a5f7d47e7f Mon Sep 17 00:00:00 2001 From: LewisLui777 <121061033+LewisLui777@users.noreply.github.com> Date: Mon, 16 Jun 2025 15:57:02 -0700 Subject: [PATCH 28/35] Changed run_platform_test.sh to run_llama_cli.sh (#14) Co-authored-by: Lewis Lui --- tools/flaskIfc/flaskIfc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index cffc2e4a2b27e..1bfca20440bb2 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -101,7 +101,7 @@ def submit(): # Currently the baudrate is hard coded to 921600 but can be parameterized baudrate = '921600' - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" From cd734f0085828e5b8c638397d6bcb4ee0a2ffbc9 Mon Sep 17 00:00:00 2001 From: atrivedi-tsavoritesi Date: Mon, 16 Jun 2025 17:13:06 -0700 Subject: [PATCH 29/35] @FIR-748: Added endpoints for health, sysinfo, upload and restart (#15) --- tools/flaskIfc/flaskIfc.py | 111 +++++++++++++++++++++++---- tools/flaskIfc/templates/upload.html | 4 + 2 files changed, 101 insertions(+), 14 deletions(-) create mode 100644 tools/flaskIfc/templates/upload.html diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 1bfca20440bb2..966fc38c549dc 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -2,25 +2,24 @@ import subprocess import threading import time +from werkzeug.utils import secure_filename +import os job_status = {"running": False, "result": "", "thread": None} app = Flask(__name__) +port = '/dev/ttyUSB3' +baudrate = '921600' + @app.route('/') def index(): return render_template('index.html') @app.route('/llama-cli', methods=['GET']) -def serial_command(): - # 
Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized - port = '/dev/ttyUSB3' - #port = request.args.get('port') - - # Currently the baudrate is hard coded to 921600 but can be parameterized - #baudrate = request.args.get('baudrate') - baudrate = '921600' - #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" +def llama_cli_serial_command(): + + #./run_llama_cli.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" model = request.args.get('model') backend = request.args.get('backend') tokens = request.args.get('tokens') @@ -59,7 +58,95 @@ def serial_command(): except subprocess.CalledProcessError as e: return f"Error executing script: {e.stderr}", 500 +UPLOAD_FOLDER = './' # Directory where uploaded files will be stored +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER +os.makedirs(UPLOAD_FOLDER, exist_ok=True) # Create the upload folder if it doesn't exist + +@app.route('/upload-gguf', methods=['POST', 'GET']) +def upload_serial_command(): + if request.method == 'POST': + # Check if a file was submitted + if 'file' not in request.files: + return "No file part" + file = request.files['file'] + + # Check if the file is empty + if file.filename == '': + return "No file selected" + + # Save the file if it exists + if file: + filename = secure_filename(file.filename) + file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) + return "File uploaded successfully" + return render_template('upload.html') # Display the upload form + +# command = f"upload file" +# try: +# result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) +# return result.stdout, 200 +# except subprocess.CalledProcessError as e: +# return f"Error executing script: {e.stderr}", 500 + +@app.route('/upload-file', methods=['GET', 'POST']) +def upload_file(): + if request.method == 'POST': + # Check if a file was submitted + if 'file' not in request.files: + return "No file part" + file = request.files['file'] + + # Check if the file is empty + if file.filename == '': + return "No file selected" + + # Save the file if it exists + if file: + filename = secure_filename(file.filename) + file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) + return "File uploaded successfully" + return render_template('upload.html') # Display the upload form + +@app.route('/restart-txe', methods=['GET']) +def restart_txe_serial_command(): + command = f"telnet localhost 8000; close all" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + +@app.route('/health-check', methods=['GET']) +def health_check_serial_command(): + command = f"free -h" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 +@app.route('/test', methods=['GET']) +def test_serial_command(): + command = f"test" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + +@app.route('/system-info', methods=['GET']) +def 
system_info_serial_command(): + + command = f"lscpu" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 @app.route('/submit', methods=['POST']) def submit(): @@ -68,7 +155,7 @@ def submit(): if job_status["running"]: return "
A model is already running. Please wait or abort.
" - #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" + #./run_llama_cli.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" model = request.form.get('model') backend = request.form.get('backend') tokens = request.form.get('tokens') @@ -96,11 +183,7 @@ def submit(): # "--top-k", "0", # "--top-p", "1" #] - # Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized - port = '/dev/ttyUSB3' - # Currently the baudrate is hard coded to 921600 but can be parameterized - baudrate = '921600' script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" diff --git a/tools/flaskIfc/templates/upload.html b/tools/flaskIfc/templates/upload.html new file mode 100644 index 0000000000000..3368379f74754 --- /dev/null +++ b/tools/flaskIfc/templates/upload.html @@ -0,0 +1,4 @@ +
+<form method="POST" enctype="multipart/form-data">
+  <input type="file" name="file">
+  <input type="submit" value="Upload">
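
The maintenance endpoints added in this patch can be exercised from a host the
same way as the /serial endpoint; a few illustrative calls (host, port, and the
uploaded file name are assumptions):

```
curl "http://localhost:5000/health-check"
curl "http://localhost:5000/system-info"
curl "http://localhost:5000/restart-txe"
curl -F "file=@tinyllama-vo-5m-para.gguf" "http://localhost:5000/upload-file"
```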
From f53f23ca0f636bc1ec3bd8966c73ac5183a2fba6 Mon Sep 17 00:00:00 2001 From: atrivedi-tsavoritesi Date: Tue, 17 Jun 2025 13:26:48 -0700 Subject: [PATCH 30/35] @FIR-742: Add system-info, txe-restart functionality and cd to right path (#16) The changes are as follows 1. change directory to right folder before running the commands 2. Add system-info and txe-restart functionlity Co-authored-by: Ashish Trivedi --- tools/flaskIfc/flaskIfc.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 966fc38c549dc..8d57a069cf3de 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -11,6 +11,7 @@ port = '/dev/ttyUSB3' baudrate = '921600' +exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/" @app.route('/') def index(): @@ -49,8 +50,8 @@ def llama_cli_serial_command(): #] # URL to Test this end point is as follows # http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" - command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" + script_path = "./run_llama_cli.sh" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}" try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) @@ -109,11 +110,17 @@ def upload_file(): @app.route('/restart-txe', methods=['GET']) def restart_txe_serial_command(): - command = f"telnet localhost 8000; close all" + command = f"telnet localhost 8000\r\nclose all\r\n" try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) - return result.stdout, 200 + time.sleep(5) + command = f"{exe_path}/../install/tsi-start\nyes\n" + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 except subprocess.CalledProcessError as e: return f"Error executing script: {e.stderr}", 500 @@ -140,7 +147,7 @@ def test_serial_command(): @app.route('/system-info', methods=['GET']) def system_info_serial_command(): - command = f"lscpu" + command = f"{exe_path}../install/tsi-version;lscpu" try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) @@ -184,8 +191,8 @@ def submit(): # "--top-p", "1" #] - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" - command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" + script_path = "./run_llama_cli.sh" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}" def run_script(): From 1a9ba9db5710577aa563902ddb8a4971787bd346 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Tue, 17 Jun 2025 14:03:19 -0700 Subject: [PATCH 31/35] @FIR-720 - GGML: Add TMU(MAT_MUL) kernel --- ggml-tsi-kernel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel index 3194c54b13b8c..ea3a5d613e821 160000 --- a/ggml-tsi-kernel +++ b/ggml-tsi-kernel @@ -1 +1 @@ -Subproject commit 3194c54b13b8cd0b5c29a6a1cc0060ae2abbed06 +Subproject commit ea3a5d613e82129326c93a22eb3af871e6882530 From d733056d8f7dd4a35b1c357f13fbf94e981b4329 Mon Sep 17 00:00:00 2001 From: 
atrivedi-tsavoritesi Date: Tue, 17 Jun 2025 21:04:37 -0700 Subject: [PATCH 32/35] @FIR-754: Added all parameter parsing for the llama-cli (#18) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * @FIR-754: Added all parameter parsing for the llama-cli The test results are as follows Model Response cd /usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/; ./run_llama_cli.sh "My cat's name" " 50 tinyllama-vo-5m-para.gguf tSavorite 1.5 1024 50 0.9 5 12288 0.0 [2018-03-09 13:03:17.788243] 271:272 [[32m info[m] :: TXE resource allocation request processed successfully. My cat's name was Tim. He loved to play with his toy car. He would run and jump in the park, making loud noises. Tim was very happy with his new toy car. One day, Tim's mom said, "Tim. You llama_perf_sampler_print: sampling time = 999.96 ms / 56 runs ( 17.86 ms per token, 56.00 tokens per second)llama_perf_context_print: load time = 1713.55 ms llama_perf_context_print: prompt eval time = 603.51 ms / 6 tokens ( 100.58 ms per token, 9.94 tokens per second) llama_perf_context_print: eval time = 7069.36 ms / 49 runs ( 144.27 ms per token, 6.93 tokens per second) llama_perf_context_print: total time = 10046.17 ms / 55 tokens [2018-03-09 13:03:28.875126] 271:272 [[32m info[m] :: TXE resource release request processed successfully. GGML Tsavorite Profiling Results: ------------------------------------------------------------------------------------------------------------------------ Calls Total(ms) T/call Self(ms) Function ------------------------------------------------------------------------------------------------------------------------ 2715 2720.000 1.002 0.000 [25%] RuntimeHostShim::awaitCommandListCompletion 1740 2635.984 1.515 2635.984 └─ [24%] [ txe_silu ] 925 1379.715 1.492 1379.715 └─ [12%] [ txe_mult ] 50 74.450 1.489 74.450 └─ [ 1%] [ txe_add ] 2715 0.448 0.000 0.448 └─ [ 0%] TXE 0 Idle 1 34.000 34.000 34.000 [ 0%] RuntimeHostShim::finalize 1 16.000 16.000 1.000 [ 0%] GGML Tsavorite 1 15.000 15.000 15.000 └─ [ 0%] RuntimeHostShim::initialize 2716 0.000 0.000 0.000 [ 0%] RuntimeHostShim::allocate 9120 0.000 0.000 0.000 [ 0%] RuntimeHostShim::getShmemManager 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::createCommandList 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::loadBlob 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::launchBlob 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::addCommandToList 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::finalizeCommandList 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::unloadBlob 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::deallocate ======================================================================================================================== 33558 11098.000 0.331 11098.000 [100%] TOTAL ======================================================================================================================== ⟵ Back to Form The URL used is as follows http://10.50.0.124:5003/llama-cli?model=tiny-llama&backend=tSavorite&tokens=10&prompt=My+cat%27s+name&repeat-penalty=1.5&batch-size=1024&top-k=50&top-p=0.9&last-n=5&context-length=12288&temp=0.0 * @FIR-754: Addressed review comments. 
--------- Co-authored-by: Ashish Trivedi --- tools/flaskIfc/flaskIfc.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 8d57a069cf3de..34b9fc5970522 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -10,9 +10,19 @@ app = Flask(__name__) port = '/dev/ttyUSB3' +#port = '/dev/ttyUSB2' baudrate = '921600' +#baudrate = '115200' exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/" +DEFAULT_REPEAT_PENALTY = 1.5 +DEFAULT_BATCH_SIZE = 1024 +DEFAULT_TOP_K = 50 +DEFAULT_TOP_P = 0.9 +DEFAULT_LAST_N = 5 +DEFAULT_CONTEXT_LENGTH = 12288 +DEFAULT_TEMP = 0.0 + @app.route('/') def index(): return render_template('index.html') @@ -25,6 +35,13 @@ def llama_cli_serial_command(): backend = request.args.get('backend') tokens = request.args.get('tokens') prompt = request.args.get('prompt') + repeat_penalty = request.args.get('repeat-penalty', DEFAULT_REPEAT_PENALTY) + batch_size = request.args.get('batch-size', DEFAULT_BATCH_SIZE) + top_k = request.args.get('top-k', DEFAULT_TOP_K) + top_p = request.args.get('top-p', DEFAULT_TOP_P) + last_n = request.args.get('last-n', DEFAULT_LAST_N) + context_length = request.args.get('context-length', DEFAULT_CONTEXT_LENGTH) + temp = request.args.get('temp', DEFAULT_TEMP) # Define the model path (update with actual paths) model_paths = { @@ -51,7 +68,7 @@ def llama_cli_serial_command(): # URL to Test this end point is as follows # http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you script_path = "./run_llama_cli.sh" - command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend} {repeat_penalty} {batch_size} {top_k} {top_p} {last_n} {context_length} {temp}" try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) @@ -167,6 +184,13 @@ def submit(): backend = request.form.get('backend') tokens = request.form.get('tokens') prompt = request.form.get('prompt') + repeat_penalty = request.form.get('repeat-penalty', DEFAULT_REPEAT_PENALTY) + batch_size = request.form.get('batch-size', DEFAULT_BATCH_SIZE) + top_k = request.form.get('top-k', DEFAULT_TOP_K) + top_p = request.form.get('top-p', DEFAULT_TOP_P) + last_n = request.form.get('last-n', DEFAULT_LAST_N) + context_length = request.form.get('context-length', DEFAULT_CONTEXT_LENGTH) + temp = request.form.get('temp', DEFAULT_TEMP) # Define the model path (update with actual paths) model_paths = { @@ -192,7 +216,7 @@ def submit(): #] script_path = "./run_llama_cli.sh" - command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend} {repeat_penalty} {batch_size} {top_k} {top_p} {last_n} {context_length} {temp}" def run_script(): From f5713b3ff130329da255fe1227a0ae995fa4c993 Mon Sep 17 00:00:00 2001 From: atrivedi-tsavoritesi Date: Wed, 18 Jun 2025 09:26:05 -0700 Subject: [PATCH 33/35] @FIR-756: Removed the echo of command in flask output (#19) --- tools/flaskIfc/serial_script.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index 38a53d103f0c8..9581376a8b16b 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -8,6 +8,7 @@ def 
From 15e7365fad52323136de3bb0c046dbf502fe4328 Mon Sep 17 00:00:00 2001
From: atrivedi-tsavoritesi
Date: Wed, 18 Jun 2025 11:59:53 -0700
Subject: [PATCH 34/35] @FIR-757: Update SDK to 0.1.4 and update release to
 0.0.3 for tsi-ggml (#20)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test results from ./run_llama_cli.sh with 5 tokens are as follows.

+++
root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin# ./run_llama_cli.sh

my cat's name is Max. He'

llama_perf_sampler_print: sampling time =     111.70 ms /    11 runs   (   10.15 ms per token,    98.47 tokens per second)
llama_perf_context_print:        load time =  132926.48 ms
llama_perf_context_print: prompt eval time =  109957.33 ms /     6 tokens (18326.22 ms per token,     0.05 tokens per second)
llama_perf_context_print:        eval time =  195682.91 ms /     4 runs   (48920.73 ms per token,     0.02 tokens per second)
llama_perf_context_print:       total time =  328764.01 ms /    10 tokens

GGML Tsavorite Profiling Results:
------------------------------------------------------------------------------------------------------------------------
     Calls   Total(ms)      T/call    Self(ms)  Function
------------------------------------------------------------------------------------------------------------------------
     33160  100086.000       3.018   47907.157  [32%] RuntimeHostShim::awaitCommandListCompletion
     18920   29912.952       1.581   29912.952    └─ [10%] [ txe_silu ]
     14080   22010.102       1.563   22010.102    └─ [ 7%] [ txe_mult ]
       160     253.071       1.582     253.071    └─ [ 0%] [ txe_add ]
     33160       1.178       0.000       1.178    └─ [ 0%] TXE 0 Idle
         1     114.000     114.000      18.000  [ 0%] GGML Tsavorite
         1      96.000      96.000      96.000    └─ [ 0%] RuntimeHostShim::initialize
         1      52.000      52.000      52.000  [ 0%] RuntimeHostShim::finalize
     33160      26.000       0.001      26.000  [ 0%] RuntimeHostShim::loadBlob
     33160      23.000       0.001      23.000  [ 0%] RuntimeHostShim::finalizeCommandList
     33160       5.000       0.000       5.000  [ 0%] RuntimeHostShim::addCommandToList
     33161       3.000       0.000       3.000  [ 0%] RuntimeHostShim::allocate
     33160       3.000       0.000       3.000  [ 0%] RuntimeHostShim::createCommandList
    113720       0.000       0.000       0.000  [ 0%] RuntimeHostShim::getShmemManager
     33160       0.000       0.000       0.000  [ 0%] RuntimeHostShim::launchBlob
     33160       0.000       0.000       0.000  [ 0%] RuntimeHostShim::unloadBlob
     33160       0.000       0.000       0.000  [ 0%] RuntimeHostShim::deallocate
========================================================================================================================
    412163  308849.000       0.749  308849.000  [100%] TOTAL
========================================================================================================================
root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin#
+++
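As a reading aid (editorial, not part of the patch), the per-token and tokens-per-second figures printed by llama_perf follow directly from the reported times; the eval line above, for example:

    # Editorial sanity check of the llama_perf eval line above.
    eval_time_ms = 195682.91
    runs = 4
    ms_per_token = eval_time_ms / runs          # 48920.73 ms per token
    tokens_per_second = 1000.0 / ms_per_token   # ~0.02 tokens per second
    print(ms_per_token, tokens_per_second)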
---
 tsi-pkg-build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh
index d2f09ffd2e001..64c577235b911 100755
--- a/tsi-pkg-build.sh
+++ b/tsi-pkg-build.sh
@@ -6,7 +6,7 @@ echo 'updating submodule'
 git submodule update --recursive --init
 cd ggml-tsi-kernel/
 module load tsi4 gcc/13.3.0
-export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.3
+export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.4
 echo 'creating python virtual env'
 /proj/local/Python-3.10.12/bin/python3 -m venv blob-creation
 source blob-creation/bin/activate
@@ -48,7 +48,7 @@ cmake --build build-fpga --config Release
 
 echo 'creating tar bundle for fpga'
 
-TSI_GGML_VERSION=0.0.2
+TSI_GGML_VERSION=0.0.3
 TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
 GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
 TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml/

From 1aa54e42c39392548df5cb53fb06659cf1592446 Mon Sep 17 00:00:00 2001
From: "M.Mankali"
Date: Fri, 20 Jun 2025 07:42:44 -0700
Subject: [PATCH 35/35] @FIR-760: Integrate copy2fpga file transfer into
 llama.cpp

---
 tools/flaskIfc/copy2fpga-setup.sh          | 12 +++++
 tools/flaskIfc/copy2fpga-x86               | Bin 0 -> 21960 bytes
 tools/flaskIfc/copy2fpga-x86.sh            |  9 ++++
 tools/flaskIfc/flaskIfc.py                 | 50 ++++++++++++++++++++-
 tools/flaskIfc/templates/uploadtofpga.html | 14 ++++++
 5 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100755 tools/flaskIfc/copy2fpga-setup.sh
 create mode 100755 tools/flaskIfc/copy2fpga-x86
 create mode 100755 tools/flaskIfc/copy2fpga-x86.sh
 create mode 100644 tools/flaskIfc/templates/uploadtofpga.html

diff --git a/tools/flaskIfc/copy2fpga-setup.sh b/tools/flaskIfc/copy2fpga-setup.sh
new file mode 100755
index 0000000000000..9ccbe55d3939f
--- /dev/null
+++ b/tools/flaskIfc/copy2fpga-setup.sh
@@ -0,0 +1,12 @@
+
+echo " Remove the device "
+sudo bash -c "echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/remove"
+
+echo "rescan"
+sudo bash -c "echo 1 > /sys/bus/pci/rescan"
+
+echo " dump the pci data"
+/aws/proj/rel/sw/platform/release_v0.1.1.tsv026_04_15_2025/scripts/dump-pci.sh
+
+echo " set the pci bit to access mem"
+sudo setpci -s 0000:01:00.0 COMMAND=0x02
diff --git a/tools/flaskIfc/copy2fpga-x86 b/tools/flaskIfc/copy2fpga-x86
new file mode 100755
index 0000000000000000000000000000000000000000..7a0e76a65b5b343e26205d378b8994c314a57c41
GIT binary patch
literal 21960
[base85-encoded binary payload of the copy2fpga-x86 executable omitted]
literal 0
HcmV?d00001

diff --git a/tools/flaskIfc/copy2fpga-x86.sh b/tools/flaskIfc/copy2fpga-x86.sh
new file mode 100755
index 0000000000000..d214838f6b52c
--- /dev/null
+++ b/tools/flaskIfc/copy2fpga-x86.sh
@@ -0,0 +1,9 @@
+#! /bin/bash
+# This file runs the PCIE setup needed for file transfer.
+# Also, it invokes the file transfer utility: copy2fpga-x86
+# Note: sudo permissions are needed for file transfer
+#
+echo " Inside copy2fpga-x86.sh "
+sudo ./copy2fpga-setup.sh
+echo "sudo ./copy2fpga-x86 $1"
+sudo ./copy2fpga-x86 $1
diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py
index 34b9fc5970522..38e20b4d1dfbf 100644
--- a/tools/flaskIfc/flaskIfc.py
+++ b/tools/flaskIfc/flaskIfc.py
@@ -4,6 +4,9 @@
 import time
 from werkzeug.utils import secure_filename
 import os
+import subprocess
+import mmap
+
 
 job_status = {"running": False, "result": "", "thread": None}
@@ -13,7 +16,8 @@
 #port = '/dev/ttyUSB2'
 baudrate = '921600'
 #baudrate = '115200'
-exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/"
+#exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/"
+exe_path = "/usr/bin/tsi/v0.1.1*/bin/"
 
 DEFAULT_REPEAT_PENALTY = 1.5
@@ -106,6 +110,50 @@ def upload_serial_command():
 #    except subprocess.CalledProcessError as e:
 #        return f"Error executing script: {e.stderr}", 500
 
+@app.route('/uploadtofpga-file', methods=['GET', 'POST'])
+def uploadtofpga_file():
+    setupprints = "Before:Copy2fpga-setup.sh"
+    print(setupprints)
+
+    if request.method == 'POST':
+        # Check if a file was submitted
+        if 'file' not in request.files:
+            return "No file part"
+        file = request.files['file']
+
+        # Check if the file is empty
+        if file.filename == '':
+            return "No file selected"
+
+        # Save the file if it exists
+        if file:
+            filename = secure_filename(file.filename)
+            process = subprocess.Popen(["./copy2fpga-x86.sh", filename], text=True)
+            copy2fpgax86prints = "Starting copy2fpga-x86 sending file..."
+            print(copy2fpgax86prints)
+            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
+
+            script_path = "./recvFromHost "
+            command = f"cd {exe_path}; {script_path} {filename}"
+            def scriptRecvFromHost():
+                try:
+                    result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True)
+                    job_status["result"] = result.stdout
+                    print("FPGA Target ready to receive file: recvFromHost started..\n")
+                    print(result.stdout)
+                    recv_output = result.stdout
+                except subprocess.CalledProcessError as e:
+                    job_status["result"] = f"Error: {e.stderr}"
+                finally:
+                    job_status["running"] = False
+            thread = threading.Thread(target=scriptRecvFromHost)
+            job_status = {"running": True, "result": "", "thread": thread}
+            thread.start()
+
+            stdout, stderr = process.communicate()
+            return render_template('uploadtofpga.html', apple=process, recvoutput=f"On FPGA Target, recvFromHost completed; transferred file: {filename} received")
+    return render_template('upload.html')  # Display the upload form
+
 @app.route('/upload-file', methods=['GET', 'POST'])
 def upload_file():
     if request.method == 'POST':
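An editorial usage sketch (not part of the patch): exercising the new upload route from a client. The host and port are assumed from the URL used earlier in this series; the form field name "file" matches what the route reads from request.files:

    # Hypothetical client for the new /uploadtofpga-file route. Sends a file
    # as multipart/form-data; the server then runs copy2fpga-x86.sh locally
    # and starts recvFromHost on the FPGA target over the serial console.
    import requests

    url = "http://10.50.0.124:5003/uploadtofpga-file"  # assumed host/port
    with open("tinyllama-vo-5m-para.gguf", "rb") as f:
        resp = requests.post(url, files={"file": f})
    print(resp.status_code)
    print(resp.text)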
diff --git a/tools/flaskIfc/templates/uploadtofpga.html b/tools/flaskIfc/templates/uploadtofpga.html
new file mode 100644
index 0000000000000..97445c1b68622
--- /dev/null
+++ b/tools/flaskIfc/templates/uploadtofpga.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>File Transfer In Progress...</title>
+</head>
+<body>
+    <h1>File Transfer Started.</h1>
+    <h2>Running copy2fpga-x86.sh</h2>
+
+    <p>{{ apple }}</p>
+
+    <p>{{ recvoutput }}</p>
+</body>
+</html>
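A closing editorial note: the route above follows the same background-job pattern as the rest of flaskIfc.py, parking the serial-side transfer in a thread and recording its outcome in the module-level job_status dict. A status-polling endpoint in that style might look like the sketch below; this is an assumption for illustration, as no such route appears in this diff:

    # Editorial sketch of a status-poll endpoint (assumed, not in the patch):
    # reads the module-level job_status dict that scriptRecvFromHost()
    # updates when the serial transfer finishes.
    from flask import Flask, jsonify

    app = Flask(__name__)
    job_status = {"running": False, "result": "", "thread": None}

    @app.route('/job-status')
    def job_status_view():
        return jsonify(running=job_status["running"], result=job_status["result"])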