From bb1f9812d5da63a33245b9f6bcad1f2769617701 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Fri, 23 May 2025 22:13:44 -0700 Subject: [PATCH 01/35] @FIR-702 - llama.cpp: Sync with latest opensource --- .gitmodules | 3 + CMakeLists.txt | 62 +- common/CMakeLists.txt | 10 +- examples/gguf-hash/CMakeLists.txt | 1 + examples/gguf/CMakeLists.txt | 2 +- examples/lookup/CMakeLists.txt | 8 +- examples/simple-chat/CMakeLists.txt | 2 +- examples/simple/CMakeLists.txt | 21 +- examples/simple/simple-backend-tsi.cpp | 578 ++++++ ggml-tsi-kernel | 1 + ggml/CMakeLists.txt | 1 + ggml/include/ggml-tsavorite.h | 189 ++ ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 10 + ggml/src/ggml-tsavorite/CMakeLists.txt | 8 + ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 1887 +++++++++++++++++++ ggml/src/ggml-tsavorite/include/TestModel.h | 217 +++ tests/CMakeLists.txt | 2 +- tsi-pkg-build.sh | 87 + 19 files changed, 3079 insertions(+), 11 deletions(-) create mode 100644 examples/simple/simple-backend-tsi.cpp create mode 160000 ggml-tsi-kernel create mode 100644 ggml/include/ggml-tsavorite.h create mode 100644 ggml/src/ggml-tsavorite/CMakeLists.txt create mode 100644 ggml/src/ggml-tsavorite/ggml-tsavorite.cpp create mode 100644 ggml/src/ggml-tsavorite/include/TestModel.h create mode 100755 tsi-pkg-build.sh diff --git a/.gitmodules b/.gitmodules index 23ce5ff059b1b..001504ec9ed07 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "kompute"] path = ggml/src/ggml-kompute/kompute url = https://github.com/nomic-ai/kompute.git +[submodule "ggml-tsi-kernel"] + path = ggml-tsi-kernel + url = git@github.com:tsisw/ggml-tsi-kernel.git diff --git a/CMakeLists.txt b/CMakeLists.txt index ac3e9090336d9..f9c146006c1a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,10 +5,59 @@ include(CheckIncludeFileCXX) #set(CMAKE_WARN_DEPRECATED YES) set(CMAKE_WARN_UNUSED_CLI YES) +if (GGML_TSAVORITE) + if (NOT DEFINED GGML_TSAVORITE_TARGET) + set(GGML_TSAVORITE_TARGET "posix") + endif() + if (NOT ${GGML_TSAVORITE_TARGET} STREQUAL fpga) + set(GGML_TSAVORITE_TARGET "posix") + endif() + + if (NOT DEFINED MLIR_COMPILER_DIR) + if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) + set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.0/compiler) + else() + set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) + endif() + endif() + + if (NOT DEFINED RUNTIME_DIR) + if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) + set (RUNTIME_DIR /proj/work/rel/sw/sdk-r.0.1.0/${GGML_TSAVORITE_TARGET}/runtime) + else() + set (RUNTIME_DIR $ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) + endif() + endif() + + if (NOT DEFINED GGML_TSI_KERNEL_DIR) + set (GGML_TSI_KERNEL_DIR ${CMAKE_SOURCE_DIR}/ggml-tsi-kernel/${GGML_TSAVORITE_TARGET}) + endif() + + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") + + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + set(CMAKE_CROSSCOMPILING ON) + set(ARCH_FLAGS -march=armv8-a) + message("Setting target as fpga") + elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") + list(APPEND TLIBS "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so") + message("Setting target as posix for tsavorite") + endif() + + set(GGML_TSAVORITE_TARGET "${GGML_TSAVORITE_TARGET}" CACHE STRING "Target for tsavorite") + set (TSAVORITE_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/ggml/src/ggml-tsavorite/include) + + include_directories(${TSAVORITE_INCLUDE_DIR}) + include_directories(${MLIR_COMPILER_DIR}/include/runtime/shim) + include_directories(${RUNTIME_DIR}/include) + message("tsavorite backend is enabled") +endif() + 
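For orientation: once this block is enabled and GGML_TSAVORITE is defined, application code can drive the new backend through the public entry points this patch declares in ggml/include/ggml-tsavorite.h. A minimal smoke-test sketch (illustrative, not part of the patch):

#include "ggml-tsavorite.h"
#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_tsavorite_init(); // declared by this patch
    if (!backend) {
        fprintf(stderr, "tsavorite backend unavailable\n");
        return 1;
    }
    if (ggml_backend_is_tsavorite(backend)) {
        fprintf(stderr, "tsavorite backend initialized\n");
    }
    ggml_backend_free(backend);
    return 0;
}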
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + #set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() @@ -82,9 +131,18 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) # 3rd party libs -option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF) +if (GGML_TSAVORITE) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) + else() + option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) + endif() +else() + option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON) +endif() + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index a7ff3ac16c446..9eafc9bb2b659 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -145,8 +145,16 @@ endif () target_include_directories(${TARGET} PUBLIC .) target_compile_features (${TARGET} PUBLIC cxx_std_17) -target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +if (GGML_TSAVORITE) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${TLIBS} PUBLIC llama Threads::Threads) + else() + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) + endif() +else() + target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +endif() # # copy the license files diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 15c5c68c6f402..0d9272b663d1a 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET llama-gguf-hash) add_executable(${TARGET} gguf-hash.cpp) +target_link_libraries(${TARGET} PRIVATE ${TLIBS}) install(TARGETS ${TARGET} RUNTIME) # clibs dependencies diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index fb04eb83f34ce..48365a0b054ce 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index fba78ceda6fd7..f7626a45dedd8 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -1,23 +1,23 @@ set(TARGET llama-lookup) add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS}) target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-create) 
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt
index 567f7fbbbf43a..cdf65e58a9d7d 100644
--- a/examples/simple-chat/CMakeLists.txt
+++ b/examples/simple-chat/CMakeLists.txt
@@ -1,5 +1,5 @@
set(TARGET llama-simple-chat)
add_executable(${TARGET} simple-chat.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT} ${TLIBS})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
index 104ecabfd7236..a87dac20c82da 100644
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -1,5 +1,24 @@
+#
+# simple-ctx
set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${TLIBS} ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+#
+if (GGML_TSAVORITE)
+    #
+    # tsavorite backend test cases
+    if (${GGML_TSAVORITE_TARGET} STREQUAL fpga)
+        file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o")
+    else()
+        file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "../../${GGML_TSI_KERNEL_DIR}/host/*.o")
+    endif()
+    #
+    # simple-backend-tsi
+
+    set(TEST_TARGET simple-backend-tsi)
+    add_executable(${TEST_TARGET} simple-backend-tsi.cpp)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${TLIBS} dl rt)
+endif()
diff --git a/examples/simple/simple-backend-tsi.cpp b/examples/simple/simple-backend-tsi.cpp
new file mode 100644
index 0000000000000..2f56f34168062
--- /dev/null
+++ b/examples/simple/simple-backend-tsi.cpp
@@ -0,0 +1,578 @@
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-tsavorite.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <vector>
+
+#define NUM_INPUT_TENSORS 2
+#define NUM_INPUT_UNARY_TENSORS 1
+#define NUM_ELEMENTS 32
+#define NUM_ELEMENTS_SCALE (32*4 + 25)
+
+// rows are indexed by ggml_tsavorite_kernel_type:
+// index 0 for addition, index 1 for subtraction, index 2 for multiplication, index 3 for division
+float test_input_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = {
+    //ADD KERNEL
+    {1.1, 2.3, 3.2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
+    //SUB KERNEL
+    {2.2, 10.3, 10.4, 2.2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //MULT KERNEL + {1.1, 2.3, 3.2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //DIV KERNEL + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + // SQRT Kernel + {1, 4, 9.6, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024}, + //NEG Kernel + {1.1, -4.4, 10, -5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -23, 24, 25, -26, 27, -28, 29, -30, 31, -32.6}, + //ABS Kernel + {1.1, -4.4, 10, -5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -23, 24, 25, -26, 27, -28, 29, -30, 31, -32.6}, + //SIN Kernel + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6} +}; +float test_input_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {1.1, 2.2, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SUB KERNEL + {1.1, 2.2, 3.0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //MULT KERNEL + {1.1, 2.2, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //DIV KERNEL + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //Below ROW value not used for Unary OPS-SQRT, NEG, ABS, SIN + //SQRT KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //NEG KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //ABS KERNEL input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //SIN Kernel input not used + {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32} +}; + +float test_result[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { + //ADD KERNEL + {2.20, 4.50, 6.50, 8.00, 10.00, 12.00, 14.00, 16.00, 18.00, 20.00, 22.00, 24.00, 26.00, 28.00, 30.00, 32.00, 34.00, 36.00, 38.00, 40.00, 42.00, 44.00, 46.00, 48.00, 50.00, 52.00, 54.00, 56.00, 58.00, 60.00, 62.00, 64.00}, + //SUB KERNEL + {1.1, 8.1, 7.4, -1.8, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00}, + //MULT KERNEL + {1.21, 5.06, 10.56, 16.00, 25.00, 36.00, 49.00, 64.00, 81.00, 100.00, 121.00, 144.00, 169.00, 196.00, 225.00, 256.00, 289.00, 324.00, 361.00, 400.00, 441.00, 484.00, 529.00, 576.00, 625.00, 676.00, 729.00, 784.00, 841.00, 900.00, 961.00, 1024.00}, + //DIV KERNEL + {1.0, 2.0, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SQRT Kernel + {1, 2, 3.098387, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //NEG Kernel + {-1.1, 4.4, -10, 5, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, 23, -24, -25, 26, -27, 28, -29, 30, -31, 32.6}, + 
//ABS Kernel + {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6}, + //SIN Kernel + {0.891207, -0.951602, -0.544021, -0.958924, -0.958924, -0.279416, 0.656987, 0.989358, 0.412118, -0.544021, -0.999990, -0.536573, 0.420167, 0.990607, 0.650288, -0.287903, -0.961398, -0.750987, 0.149877, 0.912945, 0.912945, 0.912945, -0.846220, -0.905578, -0.132352, 0.762559, 0.956376, 0.270906, -0.663634, -0.988032, -0.404039, 0.926149} +}; + +float test_input_scale_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + //ADD KERNEL + {1.3, 2.3, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + //SUB KERNEL + {8.5, 2.5, 3.5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 64, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 4, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 63, 32, + 2, 4, 8, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + //MULT KERNEL + {1.5, 2.5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //DIV KERNEL + {4.2, 8.4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 4, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SQRT KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //NEG KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //ABS KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SIN KERNEL + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} +}; + +float test_input_scale_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + // ADD KERNEL + {1.3, 2.3, 3.3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + // SUB KERNEL + {1, 8, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 6, 8, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + // MULT KERNEL + {2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // DIV KERNEL + {2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //Below ROW value not used for Unary OPS-SQRT, NEG, ABS, SIN + //SQRT KERNEL input not used + {1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //NEG KERNEL input not used + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //ABS KERNEL input not used + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //SIN KERNEL input not used + {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} +}; +float test_result_scale[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { + // ADD KERNEL + {2.6, 4.6, 6.6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38 ,40, 42, 44, 46, 48, 50}, + // SUB KERNEL + {7.5, -5.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + -5, -6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, + 1, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + // MULT KERNEL + {3, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // DIV KERNEL + {2.1, 4.2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // SQRT KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // NEG KERNEL + {1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 9, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 16, -25, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + // ABS KERNEL + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // SIN KERNEL + {-0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.412118,-0.756802, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.287903,-0.132352, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + -0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, + 
-0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471,
+     0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471,
+     0.841471, 0.841471, 0.841471}
+};
+
+// This is a simple model with two tensors a and b
+struct simple_model {
+    struct ggml_tensor * a;
+    struct ggml_tensor * b;
+
+    // the backend used to perform the computation (TSAVORITE)
+    ggml_backend_t backend = NULL;
+
+    // the backend buffer that stores the data of tensors a and b
+    ggml_backend_buffer_t buffer;
+
+    // the context that defines the tensor information (dimensions, size, memory address)
+    struct ggml_context * ctx;
+};
+
+static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
+static bool ggml_tsi_compare_two_float(float a, float b) {
+    float epsilon = 1e-5;
+    float absA = fabsf(a);
+    float absB = fabsf(b);
+    float diff = fabsf(a - b);
+    float minV = std::numeric_limits<float>::min();
+    float maxV = std::numeric_limits<float>::max();
+
+    if (a == b) { // shortcut, handles infinities
+        return true;
+    } else if (a == 0 || b == 0 || (absA + absB < minV)) {
+        // a or b is zero, or both are extremely close to it;
+        // relative error is less meaningful here
+        return diff < (epsilon * minV);
+    }
+    // use relative error
+    return diff / std::min((absA + absB), maxV) < epsilon;
+}
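The comparator above short-circuits on exact equality, uses an absolute bound near zero, and a relative bound everywhere else. A standalone illustration of those three regimes (illustrative values; assumes the function above is in scope):

#include <cstdio>

int main() {
    // expected output: 1 (exact match), 1 (relative error ~1.5e-7 < 1e-5), 0 (relative error ~5e-4 > 1e-5)
    printf("%d\n", ggml_tsi_compare_two_float(1.5f, 1.5f));
    printf("%d\n", ggml_tsi_compare_two_float(1000000.0f, 1000000.3f));
    printf("%d\n", ggml_tsi_compare_two_float(1.0f, 1.001f));
    return 0;
}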
+static bool load_model(simple_model & model, float * a, float * b, enum ggml_type data_type, int elements_A, int elements_B) {
+    ggml_log_set(ggml_log_callback_default, nullptr);
+
+    // initialize the backend
+    fprintf(stderr, "%s: using Tsavorite backend \n", __func__);
+    model.backend = ggml_backend_tsavorite_init();
+    if (!model.backend) {
+        fprintf(stderr, "%s: ggml_backend_tsavorite_init() failed\n", __func__);
+        return false;
+    }
+
+    int num_tensors;
+
+    if (!b)
+        num_tensors = NUM_INPUT_UNARY_TENSORS;
+    else
+        num_tensors = NUM_INPUT_TENSORS;
+
+    // Since we do not pass a mem_buffer, the ggml context will create one:
+    /*   .mem_buffer = params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size) */
+    // The context mem_buffer is used for object creation, and also for tensor data
+    // when the backend has no memory of its own. Since we are using backend memory,
+    // the extra bytes (100) have been removed from mem_size below.
+    struct ggml_init_params params {
+        /*.mem_size   =*/ (ggml_tensor_overhead() * num_tensors),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
+    };
+    fprintf(stderr, "\n Calculating mem_size %zu %d and creating ggml context \n", ggml_tensor_overhead(), num_tensors);
+
+    // create context
+    model.ctx = ggml_init(params);
+    if (!model.ctx) {
+        fprintf(stderr, "%s: ggml_init failed\n", __func__);
+        return false;
+    }
+
+    // create tensors
+    // The code below needs no change for the tsavorite backend: each tensor is
+    // created as metadata only (GGML object structure + tensor structure).
+    // A buffer still has to be attached to each tensor; since we are using the
+    // backend, the memory comes from tsi_alloc, called inside the tsavorite backend.
+
+    fprintf(stderr, "\n Creating input Tensor \n");
+
+    //int64_t ne[GGML_MAX_DIMS]; // number of elements
+    //size_t nb[GGML_MAX_DIMS]; // stride in bytes:
+    model.a = ggml_new_tensor_1d(model.ctx, data_type, elements_A);
+    if (b)
+        model.b = ggml_new_tensor_1d(model.ctx, data_type, elements_B);
+
+    // create a backend buffer (backend memory) and alloc the tensors from the context
+    fprintf(stderr, "\n Creating Backend Buffer \n");
+
+    // The ggml context holds only the two input tensors, hence backend memory is
+    // created for the two input tensors
+    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
+
+    // load data from cpu memory to backend buffer
+    fprintf(stderr, "\n Loading Input Tensor Data to Backend Buffer \n");
+
+    // loading the data to tensor
+    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
+    if (b)
+        ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
+
+    // create an array to print the input tensor
+    std::vector<float> out_data(ggml_nelements(model.a));
+    // bring the data from the backend memory
+    ggml_backend_tensor_get(model.a, out_data.data(), 0, ggml_nbytes(model.a));
+
+    fprintf(stderr, "\nBringing tensor data from Backend buffer and printing %d tensor data:\n[", (int) model.a->ne[0]);
+
+    for (int i = 0; i < model.a->ne[0] /* cols */; i++) {
+        fprintf(stderr, " %.2f", out_data[i]);
+    }
+    fprintf(stderr, " ]\n");
+    return true;
+}
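load_model() above follows the standard ggml pattern for backend-resident tensors: a metadata-only (no_alloc) context, tensor creation, a backend buffer, then a host-to-device upload. A condensed restatement of that pattern with a hypothetical helper name and most error handling elided (a real caller would keep ctx and buf alive for as long as the tensor is used):

// tsi_upload_one_tensor is a hypothetical helper, not part of this patch
static bool tsi_upload_one_tensor(ggml_backend_t backend, const float *host, int n_elem) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead(), // metadata for one tensor only
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,                   // tensor data lives in the backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elem);
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
    if (!buf) {
        ggml_free(ctx);
        return false;
    }
    ggml_backend_tensor_set(t, host, 0, ggml_nbytes(t)); // host -> backend memory copy
    return true;
}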
+// build the compute graph
+static struct ggml_cgraph * build_graph(const simple_model& model, enum ggml_tsavorite_kernel_type ops_type) {
+    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+    };
+
+    // create a temporary context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * result;
+    switch(ops_type) {
+        case GGML_TSAVORITE_KERNEL_TYPE_ADD:
+            result = ggml_add(ctx0, model.a, model.b);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_SUB:
+            result = ggml_sub(ctx0, model.a, model.b);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_MULT:
+            result = ggml_mul(ctx0, model.a, model.b);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_DIV:
+            result = ggml_div(ctx0, model.a, model.b);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_SQRT:
+            result = ggml_sqrt(ctx0, model.a);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_NEG:
+            result = ggml_neg(ctx0, model.a);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_ABS:
+            result = ggml_abs(ctx0, model.a);
+            break;
+        case GGML_TSAVORITE_KERNEL_TYPE_SIN:
+            result = ggml_sin(ctx0, model.a);
+            break;
+        default:
+            ggml_free(ctx0);
+            fprintf(stderr, "\n Unsupported Operation \n");
+            return NULL;
+    }
+    // build the operation nodes
+    ggml_build_forward_expand(gf, result);
+
+    // delete the temporary context used to build the graph
+    ggml_free(ctx0);
+    return gf;
+}
+
+// compute with backend
+static struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr, enum ggml_tsavorite_kernel_type ops_type) {
+    // reset the allocator to free all the memory allocated during the previous inference
+
+    fprintf(stderr, "\n Under Test case for compute API creating build_graph \n");
+    struct ggml_cgraph * gf = build_graph(model, ops_type);
+    if (!gf) {
+        fprintf(stderr, "\ncompute failed\n");
+        return NULL;
+    }
+
+    // allocate tensors
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    ggml_backend_graph_compute(model.backend, gf);
+
+    // in this case, the output tensor is the last one in the graph
+    return ggml_graph_node(gf, -1);
+}
+
+enum ggml_tsavorite_kernel_type convert_testcase_to_ops_type (const char *testCase) {
+    if (!strcmp(testCase,"add"))
+        return GGML_TSAVORITE_KERNEL_TYPE_ADD;
+    else if (!strcmp(testCase,"sub"))
+        return GGML_TSAVORITE_KERNEL_TYPE_SUB;
+    else if (!strcmp(testCase,"mult"))
+        return GGML_TSAVORITE_KERNEL_TYPE_MULT;
+    else if (!strcmp(testCase,"div"))
+        return GGML_TSAVORITE_KERNEL_TYPE_DIV;
+    else if (!strcmp(testCase,"sqrt"))
+        return GGML_TSAVORITE_KERNEL_TYPE_SQRT;
+    else if (!strcmp(testCase,"neg"))
+        return GGML_TSAVORITE_KERNEL_TYPE_NEG;
+    else if (!strcmp(testCase,"abs"))
+        return GGML_TSAVORITE_KERNEL_TYPE_ABS;
+    else if (!strcmp(testCase,"sin"))
+        return GGML_TSAVORITE_KERNEL_TYPE_SIN;
+
+    fprintf(stderr, "\n unsupported test case %s; running the default test case, which is the add operation \n", testCase);
+    return GGML_TSAVORITE_KERNEL_TYPE_ADD;
+}
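Note that the kernel enum in ggml-tsavorite.h (further down in this patch) also lists GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, which this harness never exercises. Wiring it in would plausibly take two additions, sketched here as hypothetical drop-in fragments (assuming ggml_sigmoid from ggml.h, fresh rows in the test tables, and counting it among the unary ops in main()):

// in convert_testcase_to_ops_type():
else if (!strcmp(testCase,"sigmoid"))
    return GGML_TSAVORITE_KERNEL_TYPE_SIGMOID;

// in the build_graph() switch, alongside the other unary ops:
case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID:
    result = ggml_sigmoid(ctx0, model.a);
    break;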
+int main(int argc, char *argv[]) {
+    ggml_time_init();
+    bool test_case_flag = true;
+    enum ggml_tsavorite_kernel_type ops_type;
+    simple_model model;
+    float *input1[GGML_TSAVORITE_KERNEL_TYPE_COUNT];
+    float *input2[GGML_TSAVORITE_KERNEL_TYPE_COUNT] = {NULL}; // stays NULL for unary ops, which load_model detects
+    float *result_data[GGML_TSAVORITE_KERNEL_TYPE_COUNT];
+    bool data_scale = false;
+
+    int elements_A=0, elements_B=0;
+    int num_of_input_tensors;
+
+    if (argc > 1) {
+        ops_type = convert_testcase_to_ops_type(argv[1]);
+        if (argc > 2 && !strcmp(argv[2], "scale"))
+            data_scale = true;
+    } else {
+        // Default Case
+        ops_type = convert_testcase_to_ops_type("add");
+    }
+    if (ops_type == GGML_TSAVORITE_KERNEL_TYPE_SQRT ||
+        ops_type == GGML_TSAVORITE_KERNEL_TYPE_NEG ||
+        ops_type == GGML_TSAVORITE_KERNEL_TYPE_ABS ||
+        ops_type == GGML_TSAVORITE_KERNEL_TYPE_SIN)
+        num_of_input_tensors = NUM_INPUT_UNARY_TENSORS;
+    else
+        num_of_input_tensors = NUM_INPUT_TENSORS;
+
+    if (data_scale) {
+        input1[ops_type] = test_input_scale_1[ops_type];
+        elements_A = NUM_ELEMENTS_SCALE;
+        if (num_of_input_tensors != NUM_INPUT_UNARY_TENSORS) {
+            input2[ops_type] = test_input_scale_2[ops_type];
+            elements_B = NUM_ELEMENTS_SCALE;
+        }
+        result_data[ops_type] = test_result_scale[ops_type];
+    } else {
+        input1[ops_type] = test_input_1[ops_type];
+        elements_A = NUM_ELEMENTS;
+        if (num_of_input_tensors != NUM_INPUT_UNARY_TENSORS) {
+            input2[ops_type] = test_input_2[ops_type];
+            elements_B = NUM_ELEMENTS;
+        }
+        result_data[ops_type] = test_result[ops_type];
+    }
+
+    if (!load_model(model, input1[ops_type], input2[ops_type], GGML_TYPE_F32, elements_A, elements_B)) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+    // the tsavorite backend init sets the debug level to none, so we override it here
+    ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_DEBUG;
+
+    ggml_gallocr_t allocr = NULL;
+
+    allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+
+    if (!allocr) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+
+    // create the worst case graph for memory usage estimation
+    struct ggml_cgraph * gf = build_graph(model, ops_type);
+    if (!gf) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+    ggml_gallocr_reserve(allocr, gf);
+    size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
+
+    fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
+
+    // perform computation
+    struct ggml_tensor * result = compute(model, allocr, ops_type);
+    if (!result) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+    fprintf(stderr, "\n Compute Done \n");
+
+    std::vector<float> out_data(ggml_nelements(result));
+
+    // bring the data from the backend memory
+    ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result));
+
+    // compare with the expected result
+    fprintf(stderr, "\n operation type: %d, num of elements %d \n", ops_type, (int) result->ne[0]);
+
+    for (int i = 0; i < result->ne[0] /* cols */; i++) {
+        if (ggml_tsi_compare_two_float(out_data[i], result_data[ops_type][i])) {
+            continue;
+        }
+        test_case_flag = false;
+        fprintf(stderr, "\n result for index %d is not matching: expected %f, got %f \n", i, result_data[ops_type][i], out_data[i]);
+    }
+
+    if (test_case_flag == false) {
+        fprintf(stderr, "\n\n TEST CASE FAILED \n\n");
+        return -1;
+    }
+    fprintf(stderr, "\n\n TEST CASE PASSED \n\n");
+
+    // free memory
+    ggml_free(model.ctx);
+
+    // release backend memory and free backend
+    //ggml_backend_buffer_free(model.buffer);
+    ggml_backend_free(model.backend);
+    return 0;
+}
diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel
new file mode 160000
index 0000000000000..f7a3ac1ee334c
--- /dev/null
+++ b/ggml-tsi-kernel
@@ -0,0 +1 @@
+Subproject commit f7a3ac1ee334c242958ccb2053ecc4854822d87e
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 4746d5cb76c08..93a72d6cc84e4 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -267,6 +267,7 @@ set(GGML_PUBLIC_HEADERS
    include/ggml-kompute.h
    include/ggml-opt.h
    include/ggml-metal.h
+    include/ggml-tsavorite.h
    include/ggml-rpc.h
    include/ggml-sycl.h
    include/ggml-vulkan.h
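A quick sanity check on the test dimensions: NUM_ELEMENTS_SCALE is 32*4 + 25 = 153, which exceeds TSAVORITE_KERNEL_SIZE (64, defined in the header that follows), so a "scale" tensor cannot be processed in a single kernel launch. The backend's statistics fields (num_of_tensor_spilt, later in ggml-tsavorite.cpp) suggest such tensors are split across launches; under an assumed ceil-divide split policy, the arithmetic works out as follows:

#include <cstdio>

int main() {
    const int kernel_elems = 64;         // TSAVORITE_KERNEL_SIZE from ggml-tsavorite.h
    const int tensor_elems = 32*4 + 25;  // NUM_ELEMENTS_SCALE = 153
    int launches = (tensor_elems + kernel_elems - 1) / kernel_elems; // ceil(153/64)
    printf("%d launches: 64 + 64 + 25 elements\n", launches);        // prints 3
    return 0;
}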
diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h
new file mode 100644
index 0000000000000..cd380ddf61ed3
--- /dev/null
+++ b/ggml/include/ggml-tsavorite.h
@@ -0,0 +1,189 @@
+// ------------------------------------------------------------------------------
+// Copyright (c) 2023 Tsavorite Scalable Intelligence, Inc. All rights reserved.
+//
+//
+// This file is the confidential and proprietary property of
+// Tsavorite Scalable Intelligence, Inc
+//
+// Possession or use of this file requires a written license from
+// Tsavorite Scalable Intelligence, Inc
+
+/******************************************************************************
+ * File: ggml-tsavorite.h
+ * Author TSI Inc
+ *
+ * Description:
+ * ***************************************************************************/
+
+//
+//
+// Note: this description is outdated
+//
+// An interface for computing a ggml_cgraph on Tsavorite hardware
+//
+// This is a fully functional interface that extends ggml with hardware-accelerator support for
+// Tsavorite devices. A similar interface can be created for other GPU backends (e.g. Vulkan,
+// CUDA, etc.)
+//
+// How does it work?
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the accelerator. Instead of using ggml_graph_compute(),
+// you use ggml_tsavorite_graph_compute()
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device unified memory with the ggml_tsavorite_add_buffer() function. This
+// mapping is used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_tsavorite_set_tensor() and ggml_tsavorite_get_tensor() functions.
+//
+
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#include "TestModel.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TSAVORITE_KERNEL_SIZE 64
+#define TSAVORITE_DEVICE_MAX_BUF_LEN (1024 * 1024 * 128)
+
+enum ggml_tsavorite_input_tensors_count {
+    TSAVORITE_UNARY_INPUT_TENSORS = 1,
+    TSAVORITE_TWO_INPUT_TENSORS = 2
+};
+
+enum ggml_tsavorite_log_type {
+    GGML_TSAVORITE_LOG_NONE,
+    GGML_TSAVORITE_LOG_CONT,
+    GGML_TSAVORITE_LOG_ERROR,
+    GGML_TSAVORITE_LOG_WARN,
+    GGML_TSAVORITE_LOG_DEBUG,
+    GGML_TSAVORITE_LOG_INFO,
+    GGML_TSAVORITE_LOG_ALL
+};
+
+enum ggml_tsavorite_kernel_mode {
+    GGML_TSAVORITE_KERNEL_MODE_CPU,
+    GGML_TSAVORITE_KERNEL_MODE_MLIR
+};
+
+enum ggml_tsavorite_kernel_mode ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR;
+enum ggml_tsavorite_log_type ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ALL;
+#define GGML_TSAVORITE_LOG_INFO(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_INFO) { \
+            ggml_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__); \
+        } \
+    } while (0)
+#define GGML_TSAVORITE_LOG_DEBUG(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_DEBUG) { \
+            ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__); \
+        } \
+    } while (0)
+#define GGML_TSAVORITE_LOG_WARN(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_WARN) { \
+            ggml_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__); \
+        } \
+    } while (0)
+#define GGML_TSAVORITE_LOG_ERROR(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_ERROR) { \
+            ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__); \
+        } \
+    } while (0)
+#define GGML_TSAVORITE_LOG_CONT(...) \
+    do { \
+        if (ggml_tsavorite_log_type_val >= GGML_TSAVORITE_LOG_CONT) { \
+            ggml_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__); \
+        } \
+    } while (0)
+
+enum ggml_tsavorite_tensor_data_type {
+    GGML_TSAVORITE_TENSOR_HEADER,
+    GGML_TSAVORITE_TENSOR_LEAF1,
+    GGML_TSAVORITE_TENSOR_LEAF2,
+    GGML_TSAVORITE_TENSOR_NODE,
+    GGML_TSAVORITE_TENSOR_END_DATA
+};
+
+enum ggml_tsavorite_kernel_type {
+    GGML_TSAVORITE_KERNEL_TYPE_ADD,
+    GGML_TSAVORITE_KERNEL_TYPE_SUB,
+    GGML_TSAVORITE_KERNEL_TYPE_MULT,
+    GGML_TSAVORITE_KERNEL_TYPE_DIV,
+    GGML_TSAVORITE_KERNEL_TYPE_SQRT,
+    GGML_TSAVORITE_KERNEL_TYPE_NEG,
+    GGML_TSAVORITE_KERNEL_TYPE_ABS,
+    GGML_TSAVORITE_KERNEL_TYPE_SIN,
+    GGML_TSAVORITE_KERNEL_TYPE_SIGMOID,
+
+    GGML_TSAVORITE_KERNEL_TYPE_COUNT
+};
+
+// max memory buffers that can be mapped to the device
+#define GGML_TSAVORITE_MAX_BUFFERS 64
+
+// max number of command buffers used to submit a graph for processing
+#define GGML_TSAVORITE_MAX_COMMAND_BUFFERS 8
+#define tsi_nil 0
+#define TSI_UNUSED(x) (void)(x)
+
+typedef struct tensor_log_ {
+    uint32_t leaf1_len;
+    uint32_t leaf2_len;
+    uint32_t node_len;
+    enum ggml_tsavorite_tensor_data_type data_type;
+    enum ggml_tsavorite_kernel_type kernel_type;
+    uint64_t num_of_op;
+    FILE *log_file;
+    const ggml_tensor *tensor;
+} tensor_log;
+
+extern void _mlir_ciface_txe_add(void *a, void *b, void *res);
+extern void _mlir_ciface_txe_sub(void *a, void *b, void *res);
+extern void _mlir_ciface_txe_mult(void *a, void *b, void *res);
+extern void _mlir_ciface_txe_div(void *a, void *b, void *res);
+extern void _mlir_ciface_txe_sqrt(void *a, void *res);
+extern void _mlir_ciface_txe_neg(void *a, void *res);
+extern void _mlir_ciface_txe_abs(void *a, void *res);
+extern void _mlir_ciface_txe_sin(void *a, void *res);
+extern void _mlir_ciface_txe_sigmoid(void *a, void *res);
+extern void ggml_tsi_log_tensor_data(tensor_log log_data);
+
+#define NUM_OF_TXES 1
+#define MEM_REF_DESCRIPTOR_RANK 1
+
+//
+// backend API
+// user code should use only these functions
+//
+
+GGML_BACKEND_API ggml_backend_t ggml_backend_tsavorite_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_tsavorite(ggml_backend_t backend);
+
+GGML_BACKEND_API void ggml_backend_tsavorite_set_abort_callback(ggml_backend_t backend,
+                                                                ggml_abort_callback abort_callback,
+                                                                void *user_data);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_tsavorite_buffer_type(void);
+
+// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
+GGML_BACKEND_API void ggml_backend_tsavorite_capture_next_compute(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_tsavorite_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
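The tensor_log struct and ggml_tsi_log_tensor_data() declared above form the header's tracing hook. A sketch of how a call site might drive it for one ADD node (field values are illustrative; `node` is assumed to be a valid ggml_tensor *, and tsi_op_log_file is the FILE * opened by tsi_log_setup() in ggml-tsavorite.cpp below):

tensor_log log_data = {};
log_data.leaf1_len   = 32;                             // elements in src0
log_data.leaf2_len   = 32;                             // elements in src1
log_data.node_len    = 32;                             // elements in the result
log_data.kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD;
log_data.num_of_op   = 1;
log_data.log_file    = tsi_op_log_file;
log_data.tensor      = node;

log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;     // banner for this op first
ggml_tsi_log_tensor_data(log_data);
log_data.data_type = GGML_TSAVORITE_TENSOR_NODE;       // then dump the node data
ggml_tsi_log_tensor_data(log_data);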
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index ddea5ad3891e5..0a14bbb74ced7 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -308,6 +308,7 @@ ggml_add_backend(CUDA)
ggml_add_backend(HIP)
ggml_add_backend(Kompute)
ggml_add_backend(METAL)
+ggml_add_backend(TSAVORITE)
ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31514b5..f48a23bf83151 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -37,6 +37,10 @@
#include "ggml-metal.h"
#endif

+#ifdef GGML_USE_TSAVORITE
+#include "ggml-tsavorite.h"
+#endif
+
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
@@ -166,6 +170,11 @@ struct ggml_backend_registry {
#ifdef GGML_USE_METAL
        register_backend(ggml_backend_metal_reg());
#endif
+
+#ifdef GGML_USE_TSAVORITE
+        register_backend(ggml_backend_tsavorite_reg());
+#endif
+
#ifdef GGML_USE_SYCL
        register_backend(ggml_backend_sycl_reg());
#endif
@@ -572,6 +581,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    ggml_backend_load_best("hip", silent, dir_path);
    ggml_backend_load_best("kompute", silent, dir_path);
    ggml_backend_load_best("metal", silent, dir_path);
+    ggml_backend_load_best("tsavorite", silent, dir_path);
    ggml_backend_load_best("rpc", silent, dir_path);
    ggml_backend_load_best("sycl", silent, dir_path);
    ggml_backend_load_best("vulkan", silent, dir_path);
diff --git a/ggml/src/ggml-tsavorite/CMakeLists.txt b/ggml/src/ggml-tsavorite/CMakeLists.txt
new file mode 100644
index 0000000000000..f58331fd68d30
--- /dev/null
+++ b/ggml/src/ggml-tsavorite/CMakeLists.txt
@@ -0,0 +1,8 @@
+message(STATUS "Tsavorite framework is found")
+#
+# tsavorite Kernel Library
+ggml_add_backend_library(ggml-tsavorite
+                         ggml-tsavorite.cpp
+                        )
+
+target_link_libraries(ggml-tsavorite PRIVATE ${TLIBS} dl rt)
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
new file mode 100644
index 0000000000000..7939a0f8cfa13
--- /dev/null
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -0,0 +1,1887 @@
+// ------------------------------------------------------------------------------
+// Copyright (c) 2023 Tsavorite Scalable Intelligence, Inc. All rights reserved.
+//
+//
+// This file is the confidential and proprietary property of
+// Tsavorite Scalable Intelligence, Inc
+//
+// Possession or use of this file requires a written license from
+// Tsavorite Scalable Intelligence, Inc
+
+/******************************************************************************
+ * File: ggml-tsavorite.cpp
+ * Author TSI Inc
+ *
+ * Description:
+ * ***************************************************************************/
+
+#include "ggml-tsavorite.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+#include "ggml.h"
+
+typedef struct _txe_device_t *txe_device_s;
+typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s;
+FILE *tsi_op_log_file;
+uint64_t num_of_op;
+
+#ifdef USE_COMMAND_BUFFERS
+typedef struct _txe_command_queue_t *txe_command_queue_s;
+typedef struct _txe_dispatch_queue_t *txe_dispatch_queue_s;
+typedef struct _txe_command_buffer_t *txe_command_buffer_s;
+#endif /* USE_COMMAND_BUFFERS */
+typedef struct ggml_backend_tsavorite_buffer ggml_backend_tsavorite_buffer_s;
+
+struct _txe_device_t {
+    char name[100];
+    uint32_t max_buf_len;
+    size_t recommended_max_working_set_size;
+    size_t current_allocated_size;
+    int reserved;
+    struct _stats {
+        struct _op_run_count {
+            // Each kernel operation belongs to one tensor.
Below count will increment for each Node Tensor + uint64_t total_tensor_count; + // This counter increment whenever kernel call are made + uint64_t num_of_kernel_call; + // below field count all tensors whose num of elements are larger than kernel number of + // elements + uint64_t num_of_tensor_spilt; + // For Any application below field maintain smallest tensor num of elem + uint64_t min_num_of_elem; + // For Any application below field maintain largest tensor num of elem + uint64_t max_num_of_elem; + } op_run_count[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + } stats; +}; + +struct _txe_compute_pipeline_state_t { + void (*_mlir_fptr_2_input)(void *, void *, void *); + void (*_mlir_fptr_1_input)(void *, void *); + std::string kernel_name; + int reserved; +}; + +#ifdef USE_COMMAND_BUFFERS +struct _txe_command_queue_t { + int reserved; +}; + +struct _txe_dispatch_queue_t { + int reserved; +}; + +struct _txe_command_buffer_t { + int reserved; +}; +#endif /* USE_COMMAND_BUFFERS */ + +static txe_device_s tsi_system_default_device_create(); + +// kernels + +struct ggml_tsavorite_kernel { + txe_compute_pipeline_state_s pipeline; +}; + +struct ggml_backend_tsavorite_context { +#ifdef USE_COMMAND_BUFFERS + txe_command_queue_s queue; + + txe_dispatch_queue_s d_queue; +#endif /* USE_COMMAND_BUFFERS */ + + struct ggml_tsavorite_kernel kernels[GGML_TSAVORITE_KERNEL_TYPE_COUNT]; + + // capture state + bool capture_next_compute; + bool capture_started; + + // command buffer state + int n_cb; // number of extra threads used to submit the command buffers + int n_nodes_0; // number of nodes submitted by the main thread + int n_nodes_1; // remaining number of nodes submitted by the n_cb threads + int n_nodes_per_cb; + + struct ggml_cgraph *gf; + + // the callback given to the thread pool + // void (^encode_async)(size_t ith); + +#ifdef USE_COMMAND_BUFFERS + // n_cb command buffers + 1 used by the main thread + txe_command_buffer_s command_buffers[GGML_TSAVORITE_MAX_COMMAND_BUFFERS + 1]; +#endif /* USE_COMMAND_BUFFERS */ + + // abort ggml_tsavorite_graph_compute if callback returns true + ggml_abort_callback abort_callback; + void *abort_callback_data; + + // picking CPU compute example + int n_threads; + ggml_threadpool_t threadpool; + + uint8_t *work_data; + size_t work_size; +}; + +// global + +// initialized in ggml_backend_tsavorite_reg +static struct ggml_backend_reg g_ggml_backend_tsavorite_reg; +static struct ggml_backend_device g_ggml_backend_tsavorite_device; + +// information about a tSavorite device +// note: assumes single GPU device - the default one +// Need to Add Support for multiple GPU devices +static struct ggml_backend_tsavorite_device_context { + txe_device_s device; + int ref_count; + + char name[128]; +} g_ggml_ctx_dev_main = { + /*.device =*/tsi_nil, + /*.ref_count =*/0, + /*.name =*/"", +}; + +// temporarily defined here for compatibility between ggml-backend and the old API + +struct ggml_backend_tsavorite_buffer { + void *data; + size_t size; +}; + +struct ggml_backend_tsavorite_buffer_context { + void *all_data; + size_t all_size; + bool owned; + + // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap + int n_buffers; + ggml_backend_tsavorite_buffer_s buffers[GGML_TSAVORITE_MAX_BUFFERS]; +}; + +static txe_device_s tsi_system_default_device_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = (txe_device_s)malloc(sizeof(struct _txe_device_t)); + device->max_buf_len = TSAVORITE_DEVICE_MAX_BUF_LEN; + 
device->recommended_max_working_set_size = TSAVORITE_DEVICE_MAX_BUF_LEN; + device->current_allocated_size = 0; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return device; +} + +static void tsi_device_free(txe_device_s device) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + free(device); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +#ifdef USE_COMMAND_BUFFERS +static txe_command_queue_s tsi_command_queue_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_command_queue_s cqueue = (txe_command_queue_s)malloc(sizeof(struct _txe_command_queue_t)); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return cqueue; +} + +static txe_dispatch_queue_s tsi_dispatch_queue_create() { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_dispatch_queue_s dqueue = (txe_dispatch_queue_s)malloc(sizeof(struct _txe_dispatch_queue_t)); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return dqueue; +} + +static void tsi_command_queue_free(txe_command_queue_s cqueue) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (cqueue) + free(cqueue); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +static void tsi_dispatch_queue_free(txe_dispatch_queue_s dqueue) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (dqueue) + free(dqueue); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} +#endif /* USE_COMMAND_BUFFERS */ + +static void tsi_buffer_free(void *data) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (data) + free(data); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +static bool tsi_log_setup() { + tsi_op_log_file = fopen("tsi-op.txt", "w+"); + if (tsi_op_log_file == NULL) { + printf("Error Creating or opening log file\n"); + return false; + } + return true; +} + +void ggml_tsi_log_tensor_data(tensor_log log_data) { + if (!log_data.log_file) { + GGML_TSAVORITE_LOG_ERROR("%s: error: log file Cant be NULL\n", __func__); + return; + } + + switch (log_data.data_type) { + case GGML_TSAVORITE_TENSOR_HEADER: + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "#############################################################\n"); + fprintf(log_data.log_file, + "Tensor Number %ld and Type %d \n leaf1 len %d, leaf2 len %d, Node len %d\n", + log_data.num_of_op, log_data.kernel_type, log_data.leaf1_len, log_data.leaf2_len, + log_data.node_len); + fprintf(log_data.log_file, "############################################################\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + case GGML_TSAVORITE_TENSOR_LEAF1: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "leaf1 Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_LEAF2: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "leaf2 Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_NODE: + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "Node Detail:\n"); + break; + case GGML_TSAVORITE_TENSOR_END_DATA: + fprintf(log_data.log_file, "DONE WITH THIS OPERATION %ld\n", log_data.num_of_op); + fprintf(log_data.log_file, "############################################################\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + default: + GGML_TSAVORITE_LOG_ERROR("%s: error: Invalid Data Type Passed\n", __func__); + return; + } + if (!log_data.tensor) { + 
GGML_TSAVORITE_LOG_ERROR("%s: error: tensor pointer is NULL\n", __func__); + return; + } + float *p; + int64_t count = (log_data.tensor->ne[0]) * (log_data.tensor->ne[1]) * (log_data.tensor->ne[2]) * + (log_data.tensor->ne[3]); + p = (float *)log_data.tensor->data; + if ((!p) || (count == 0)) { + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "Tensor Data is Empty"); + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fprintf(log_data.log_file, "\n\n"); + fflush(log_data.log_file); + return; + } + fprintf(tsi_op_log_file, "%.16f ", p[0]); + for (int64_t ii = 1; ii < count; ++ii) { + if (!(ii % 4)) + fprintf(log_data.log_file, "\n"); + fprintf(log_data.log_file, "%.16f ", p[ii]); + } + fprintf(log_data.log_file, "\n\n"); + fprintf(log_data.log_file, "\n---------------------------------------------------\n"); + fflush(log_data.log_file); + return; +} + +static void ggml_tsavorite_disp_stats(struct ggml_backend_tsavorite_context *ctx, + txe_device_s device) { + if (!ctx || !device) { + GGML_TSAVORITE_LOG_ERROR( + "At %s Either backend context or device or both are NULL, hence cant display Stats", + __func__); + return; + } + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + if (!ctx->kernels[i].pipeline) + continue; + GGML_TSAVORITE_LOG_CONT( + "\n %s Operation, total tensor: %lu Number of Kernel Call: %lu Number of tensor got " + "spilt: %lu Min Num of Elem %lu Max Num of Elem %lu \n", + ctx->kernels[i].pipeline->kernel_name.c_str(), + device->stats.op_run_count[i].total_tensor_count, + device->stats.op_run_count[i].num_of_kernel_call, + device->stats.op_run_count[i].num_of_tensor_spilt, + device->stats.op_run_count[i].min_num_of_elem, + device->stats.op_run_count[i].max_num_of_elem); + } + return; +} + +static void _mlir_ciface_txe_add_test (void *src0, void *src1, void *res) +{ + // MemRefDescriptor + if (!src0 || !src1 || !res) + return; + + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + srcP0 = (MemRefDescriptor *)src0; + srcP1 = (MemRefDescriptor *)src1; + nodeP = (MemRefDescriptor *)res; + + uint32_t count = srcP0->shape[Rank - 1]; + float *s0 = (float*)srcP0->data; + float *s1 = (float*)srcP1->data; + float *n = (float*)nodeP->data; + + for(uint32_t i=0; i < count; ++i) + n[i] = s0[i] + s1[i]; + //printf("\n Calling mlir_add cpu function-5 \n"); + return; +} + +static void _mlir_ciface_txe_mult_test (void *src0, void *src1, void *res) +{ + // MemRefDescriptor + if (!src0 || !src1 || !res) + return; + + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor *srcP0, *srcP1, *nodeP; + srcP0 = (MemRefDescriptor *)src0; + srcP1 = (MemRefDescriptor *)src1; + nodeP = (MemRefDescriptor *)res; + + uint32_t count = srcP0->shape[Rank - 1]; + float *s0 = (float*)srcP0->data; + float *s1 = (float*)srcP1->data; + float *n = (float*)nodeP->data; + + for(uint32_t i=0; i < count; ++i) + n[i] = s0[i]*s1[i]; + return; +} + +static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_type kernel_type) { + txe_compute_pipeline_state_s kernel_pipeline = + (txe_compute_pipeline_state_s)calloc(1, sizeof(struct _txe_compute_pipeline_state_t)); + bool flag = false; + if (!kernel_pipeline) { + GGML_TSAVORITE_LOG_ERROR("Calloc failing while setting up kernel"); + return NULL; + } + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + switch (kernel_type) { + case GGML_TSAVORITE_KERNEL_TYPE_ADD: + if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) + 
kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add_test; + else + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add; + kernel_pipeline->kernel_name = "TXE_ADD"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SUB: + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_sub; + kernel_pipeline->kernel_name = "TXE_SUB"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_MULT: + if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult_test; + else + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult; + kernel_pipeline->kernel_name = "TXE_MULT"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_DIV: + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_div; + kernel_pipeline->kernel_name = "TXE_DIV"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SQRT: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sqrt; + kernel_pipeline->kernel_name = "TXE_SQRT"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_NEG: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_neg; + kernel_pipeline->kernel_name = "TXE_NEG"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_ABS: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_abs; + kernel_pipeline->kernel_name = "TXE_ABS"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SIN: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sin; + kernel_pipeline->kernel_name = "TXE_SIN"; + flag = true; + break; + case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID: + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sigmoid; + kernel_pipeline->kernel_name = "TXE_SIGMOID"; + flag = true; + break; + default: + break; + } + if (!flag) { + GGML_TSAVORITE_LOG_INFO("Kernel %d not supported \n", kernel_type); + if (kernel_pipeline) { + free(kernel_pipeline); + kernel_pipeline = NULL; + } + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return kernel_pipeline; +} + +static void tsi_kernel_release(txe_compute_pipeline_state_s kernel_pipeline) { + // clear kernel_pipeline + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (kernel_pipeline) { + free(kernel_pipeline); + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return; +} + +// acquire +static txe_device_s +ggml_backend_tsavorite_device_acq(struct ggml_backend_tsavorite_device_context *ctx) { + assert(ctx != NULL); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + if (ctx->device == tsi_nil) { + ctx->device = tsi_system_default_device_create(); + snprintf(ctx->name, sizeof("txe"), "txe"); + } + + ctx->ref_count++; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ctx->device; +} + +// release +static void ggml_backend_tsavorite_device_rel(struct ggml_backend_tsavorite_device_context *ctx) { + assert(ctx != NULL); + assert(ctx->ref_count > 0); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + ctx->ref_count--; + + // Need to define function txe_device_free + if (ctx->ref_count == 0) { + tsi_device_free(ctx->device); + ctx->device = tsi_nil; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +// We will use Unified Memory this memory is used for buffer +static void *ggml_tsavorite_host_malloc(size_t n) { + void *data = NULL; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + GGML_TSAVORITE_LOG_INFO("\n Allocating memory from tsi_alloc with size %ld \n", n); + printf("\n ANoop Allocating memory from tsi_alloc with size %ld \n", n); + data = tsi_alloc(n); + GGML_TSAVORITE_LOG_CONT("\n Allocating 
memory from tsi_alloc with size %ld starting memory %p\n", + n, data); + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return data; +} + +static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_dev_t dev) { + GGML_TSAVORITE_LOG_INFO("%s: Start\n", __func__); + // Open a file named "tsi-op.txt" in the current directory for writing + num_of_op = 0; + + if (tsi_log_setup() == false) + return NULL; + + // TSI Run time Initalization + tsi_initialize(NUM_OF_TXES); + + // init context + struct ggml_backend_tsavorite_context *ctx = (struct ggml_backend_tsavorite_context *)calloc( + 1, sizeof(struct ggml_backend_tsavorite_context)); + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)dev->context; + + // setup the devie context + txe_device_s device = ggml_backend_tsavorite_device_acq(ctx_dev); + GGML_TSAVORITE_LOG_INFO("%s: picking default device: %s\n", __func__, device->name); + for (uint32_t op = GGML_TSAVORITE_KERNEL_TYPE_ADD; op < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++op) { + device->stats.op_run_count[op].total_tensor_count = 0; + device->stats.op_run_count[op].num_of_kernel_call = 0; + device->stats.op_run_count[op].num_of_tensor_spilt = 0; + device->stats.op_run_count[op].min_num_of_elem = 0; + device->stats.op_run_count[op].max_num_of_elem = 0; + } + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; + ctx->work_data = NULL; + ctx->work_size = 0; + ctx->abort_callback = NULL; + ctx->abort_callback_data = NULL; + + // We dont need it for now, we will revisit +#ifdef USE_COMMAND_BUFFERS + // setting up backend context + ctx->queue = tsi_command_queue_create(); + ctx->d_queue = tsi_dispatch_queue_create(); +#endif /* USE_COMMAND_BUFFERS */ + + ctx->capture_next_compute = false; + ctx->capture_started = false; + + ctx->gf = tsi_nil; + // ctx->encode_async = tsi_nil; + +#ifdef USE_COMMAND_BUFFERS + for (int i = 0; i < GGML_TSAVORITE_MAX_COMMAND_BUFFERS; ++i) { + ctx->command_buffers[i] = tsi_nil; + } +#endif /* USE_COMMAND_BUFFERS */ + + // load TSavorite kernels + { + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + ctx->kernels[i].pipeline = tsi_nil; + } + +#define GGML_TSAVORITE_KERNEL(e, supported) \ + if (supported) { \ + ctx->kernels[e].pipeline = tsi_kernel_setup(e); \ + GGML_TSAVORITE_LOG_INFO(" TSAVORITE SUPPORTED KERNEL "); \ + } else { \ + GGML_TSAVORITE_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_" #e); \ + } + + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ADD, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SUB, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_MULT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_DIV, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SQRT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_NEG, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ABS, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIN, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, true); + } + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ctx; +} + +static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + for (int i = 0; i < GGML_TSAVORITE_KERNEL_TYPE_COUNT; ++i) { + if (ctx->kernels[i].pipeline) { + tsi_kernel_release(ctx->kernels[i].pipeline); + ctx->kernels[i].pipeline = tsi_nil; + } + } + + // Block_release(ctx->encode_async); + // +#ifdef USE_COMMAND_BUFFERS + 
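+    // Teardown mirrors ggml_tsavorite_init(): the kernel pipelines were released
+    // above, the optional command/dispatch queues are freed here, and
+    // tsi_finalize() below shuts the runtime down last.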
tsi_command_queue_free(ctx->queue); + + tsi_dispatch_queue_free(ctx->d_queue); +#endif /* USE_COMMAND_BUFFERS */ + + free(ctx); + + // TSI run time free + GGML_TSAVORITE_LOG_INFO("\n Calling tsi_finalize \n"); + // delay to allow any file operations to complete for runtime + + GGML_TSAVORITE_LOG_INFO("Delaying tsi_finalize for 2 sec"); + sleep(2); + tsi_finalize(); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +#if 0 +// finds the tSavorite buffer that contains the tensor data on the TXE device unified memory +// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the +// tSavorite buffer based on the host memory pointer +// +static ggml_backend_tsavorite_buffer_s ggml_tsavorite_get_buffer(struct ggml_tensor * t, size_t * offs) { + // GGML_TSAVORITE_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + const int64_t tsize = ggml_nbytes(t); + + ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; + + struct ggml_backend_tsavorite_buffer_context * buf_ctx = (struct ggml_backend_tsavorite_buffer_context *) buffer->context; + + // find the view that contains the tensor fully + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; + + // GGML_TSAVORITE_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { + *offs = (size_t) ioffs; + + // GGML_TSAVORITE_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return buf_ctx->buffers[i]; + } + } + + GGML_TSAVORITE_LOG_ERROR("%s: error: tensor '%s' buffer is tsi_nil\n", __func__, t->name); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return tsi_nil; +} +#endif + +static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_device_context *ctx_dev, + const struct ggml_tensor *op) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + if (!ctx_dev) + return false; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + for (size_t i = 0, n = 3; i < n; ++i) { + if (op->src[i] != NULL && op->src[i]->type != GGML_TYPE_F32) { + return false; + } + } + + if (op->type != GGML_TYPE_F32) + return false; + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQRT: + case GGML_OP_SIN: + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SIGMOID: + break; + default: + return false; + } + break; + default: + return false; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return true; +} + +/* +static void ggml_tsavorite_encode_node( + ggml_backend_t backend, + int idx, + tsi_command_encoder encoder) { +} +*/ + +static void ggml_tsavorite_decompose_unary_kernel_sin(uint32_t num_elem, ggml_tensor *src) { + float *p = (float *)(src->data); + for (uint32_t i = 0; i < num_elem; ++i) { + *p = (*p) / (2 * M_PI); + ++p; + } + return; +} + +static void ggml_tsavorite_decompose_unary_kernel(uint32_t num_elem, ggml_tensor *src, + ggml_tensor *node) { + switch (node->op) { + case GGML_OP_SIN: + ggml_tsavorite_decompose_unary_kernel_sin(num_elem, src); 
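+        // Assumption behind this pre-pass (added note): the TXE sin kernel appears
+        // to take its argument in units of full turns rather than radians, so the
+        // input is scaled in place by 1/(2*pi), i.e. conceptually
+        // sin(x) == txe_sin(x / (2*pi)).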
+        break;
+    default:
+        break;
+    }
+    return;
+}
+
+// Nodes are intermediate tensors that carry source tensors and an operation.
+// Eventually multiple threads will be created here: each thread runs a command
+// buffer, picks a tensor, executes it, and returns the result; based on async or
+// sync mode, the compute finishes once all tensors have executed.
+static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
+                                                     struct ggml_cgraph *cgraph) {
+#if 0
+    GGML_LOG_INFO("Start %s\n", __func__);
+    struct ggml_backend_tsavorite_context * ctx = backend->context;
+    struct ggml_backend_tsavorite_device_context * ctx_dev = backend->device->context;
+
+    // number of nodes encoded by the main thread (empirically determined)
+    const int n_main = 128;
+
+    // number of threads in addition to the main thread
+    const int n_cb = ctx->n_cb;
+
+    // submit the ggml compute graph to the TXE by creating command buffers and encoding the ops in them
+    // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
+    // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
+    // each thread creates its own command buffer and enqueues the ops in parallel
+
+    GGML_LOG_INFO("End %s\n", __func__);
+    return GGML_STATUS_SUCCESS;
+#endif
+
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+    if (!ctx) {
+        GGML_LOG_ERROR("\n backend ctx is NULL \n");
+        return GGML_STATUS_FAILED;
+    }
+
+#if 0
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, ctx->n_threads, ctx->threadpool);
+
+    if (ctx->work_size < cplan.work_size) {
+        delete[] ctx->work_data;
+        ctx->work_data = new uint8_t[cplan.work_size];
+        if (ctx->work_data == NULL) {
+            ctx->work_size = 0;
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+        ctx->work_size = cplan.work_size;
+    }
+    cplan.work_data = (uint8_t *)ctx->work_data;
+
+    cplan.abort_callback      = ctx->abort_callback;
+    cplan.abort_callback_data = ctx->abort_callback_data;
+#endif
+
+    txe_device_s device = ggml_backend_tsavorite_device_acq(
+        (struct ggml_backend_tsavorite_device_context *)backend->device->context);
+
+    if (!device) {
+        GGML_TSAVORITE_LOG_ERROR("\n tsavorite device is NULL \n");
+        return GGML_STATUS_FAILED;
+    }
+    // MemRefDescriptor
+    const int Rank = MEM_REF_DESCRIPTOR_RANK;
+    MemRefDescriptor *srcP0, *srcP1, *nodeP;
+    struct ggml_tensor *src0, *src1, *node;
+    uint32_t num_elem_src0, num_elem_src1, num_elem_node;
+    enum ggml_tsavorite_kernel_type kernel_type;
+    // Strictly these variables are redundant - src0 or node carries the maximum
+    // element count and src1 the minimum - but they are kept for the stats below.
+    uint64_t max_num_of_elem, min_num_of_elem;
+    enum ggml_tsavorite_input_tensors_count num_of_input_tensors;
+    tensor_log log_data;
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        node = cgraph->nodes[i];
+        src0 = node->src[0];
+        src1 = node->src[1];
+        min_num_of_elem = 0;
+        max_num_of_elem = 0;
+
+        switch (node->op) {
+        case GGML_OP_ADD:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD;
+            num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+            break;
+        case GGML_OP_SUB:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SUB;
+            num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+            break;
+        case GGML_OP_MUL:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_MULT;
+            num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+            break;
+        case GGML_OP_DIV:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_DIV;
+            num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS;
+            break;
+        case GGML_OP_SQRT:
+            kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SQRT;
+            num_of_input_tensors =
TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_OP_SIN: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIN; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_NEG: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_NEG; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_UNARY_OP_ABS: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ABS; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + case GGML_UNARY_OP_SIGMOID: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIGMOID; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; + default: + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + break; + default: + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + + if (!ctx->kernels[kernel_type].pipeline || + (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input && + !ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input)) { + GGML_TSAVORITE_LOG_ERROR("Kernel Type %d, not supported \n", kernel_type); + return GGML_STATUS_ABORTED; + } + ++num_of_op; + + if (num_of_input_tensors == TSAVORITE_TWO_INPUT_TENSORS) { + if (node->src[0] && node->src[1]) { + if (!src0->data || !src1->data || !node->data) { + GGML_TSAVORITE_LOG_ERROR( + "One of tensor Data doesnt have memory leaf1 %p, leaf2 %p, node %p \n", src0->data, + src1->data, node->data); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + srcP0 = (MemRefDescriptor *)src0->data; + srcP1 = (MemRefDescriptor *)src1->data; + nodeP = (MemRefDescriptor *)node->data; + // This is for tsavorite MemRef Header hence getting header + --srcP0; + --srcP1; + --nodeP; + srcP0->data = srcP0->base = src0->data; + srcP1->data = srcP1->base = src1->data; + nodeP->data = nodeP->base = node->data; + // offset & shape size will be update base on Tensor Size + // TSAVORITE KERNEL CAN Take max of TSAVORITE_KERNEL_SIZE + // Hence we need to load tensor data at multiple iteration + // for large Tensor Dataset + srcP0->offset = 0; + srcP1->offset = 0; + nodeP->offset = 0; + + // currently _mlir_ as restriction to hold max of 64 elements, we need to spilt the work if + // its more than 64, i will address this at future PR Initalizing num_elem + num_elem_src0 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src0->nb[i] != 0; ++i) + num_elem_src0 *= src0->ne[i]; + + num_elem_src1 = 1; + for (int i = 0; i < GGML_MAX_DIMS && src1->nb[i] != 0; ++i) + num_elem_src1 *= src1->ne[i]; + + num_elem_node = 1; + for (int i = 0; i < GGML_MAX_DIMS && node->nb[i] != 0; ++i) + num_elem_node *= node->ne[i]; + + if (!num_elem_src0 || !num_elem_src1 || !num_elem_node) { + GGML_TSAVORITE_LOG_ERROR("\nOne or more of Tensor length is zero of kernel_type %d\n", + kernel_type); + return GGML_STATUS_ABORTED; + } + + min_num_of_elem = max_num_of_elem = num_elem_src0; + + if (min_num_of_elem > num_elem_src1) + min_num_of_elem = num_elem_src1; + if (min_num_of_elem > num_elem_node) + min_num_of_elem = num_elem_node; + + if (max_num_of_elem < num_elem_src1) + max_num_of_elem = num_elem_src1; + if (max_num_of_elem < num_elem_node) + max_num_of_elem = num_elem_node; + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + bzero((char *)&log_data, sizeof(log_data)); + log_data.leaf1_len 
= num_elem_src0; + log_data.leaf2_len = num_elem_src1; + log_data.node_len = num_elem_node; + log_data.log_file = tsi_op_log_file; + log_data.num_of_op = num_of_op; + log_data.kernel_type = kernel_type; + + log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF1; + log_data.tensor = src0; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF2; + log_data.tensor = src1; + ggml_tsi_log_tensor_data(log_data); + } + + ggml_tensor *dst = node; + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + for (int ir = 0; ir < nr; ++ir) { + const int64_t i03 = ir / (ne02 * ne01); + const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01; + const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float *dst_ptr = (float *)((char *)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1); + float *src0_ptr = (float *)((char *)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01); + float *src1_ptr = (float *)((char *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11); + + for (int64_t r = 0; r < nr0; ++r) { + // While loop is added to handle the scenario when kernel number of elements + // less than ggml tensor number of elements.GGML tensor number of elements decided + // base on application like llama.cpp. Currently we have build Kernel elements + // statically hence we have MACRO: TSAVORITE_KERNEL_SIZE to track this + int count = 0; + while (count < ne10) { + int kernel_size; + srcP1->data = srcP1->base = (void *)(src1_ptr + count); + srcP0->data = srcP0->base = (void *)(src0_ptr + r * ne10 + count); + nodeP->data = nodeP->base = (void *)(dst_ptr + r * ne10 + count); + if ((count + TSAVORITE_KERNEL_SIZE) > ne10) + kernel_size = ne10 - count; + else + kernel_size = TSAVORITE_KERNEL_SIZE; + count += kernel_size; + srcP0->shape[Rank - 1] = kernel_size; + srcP1->shape[Rank - 1] = kernel_size; + nodeP->shape[Rank - 1] = kernel_size; + srcP0->strides[Rank - 1] = 0; + srcP1->strides[Rank - 1] = 0; + nodeP->strides[Rank - 1] = 0; + // kernel call + ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input(srcP0, srcP1, nodeP); + ++device->stats.op_run_count[kernel_type].num_of_kernel_call; + } + } + } + + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + log_data.data_type = GGML_TSAVORITE_TENSOR_NODE; + log_data.tensor = node; + ggml_tsi_log_tensor_data(log_data); + + log_data.data_type = GGML_TSAVORITE_TENSOR_END_DATA; + log_data.tensor = NULL; + ggml_tsi_log_tensor_data(log_data); + } + } + } + + if (num_of_input_tensors == TSAVORITE_UNARY_INPUT_TENSORS) { + if (node->src[0]) { + if (!src0->data || !node->data) { + GGML_TSAVORITE_LOG_ERROR( + "input or output tensor Data doesnt have memory leaf %p, node %p \n", src0->data, + node->data); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)backend->device->context); + return GGML_STATUS_ABORTED; + } + srcP0 = (MemRefDescriptor *)src0->data; + nodeP = (MemRefDescriptor *)node->data; + // This is for tsavorite MemRef Header hence getting header + --srcP0; + --nodeP; + srcP0->data = srcP0->base = src0->data; + nodeP->data = nodeP->base = node->data; + // offset & shape size will be update base on Tensor Size + // TSAVORITE KERNEL CAN Take max of TSAVORITE_KERNEL_SIZE + // Hence we need to load tensor data at multiple iteration + // for large Tensor 
Dataset
+                srcP0->offset = 0;
+                nodeP->offset = 0;
+
+                // Currently the _mlir_ kernels can hold at most 64 elements at a time,
+                // so the work must be split when a tensor is larger; this will be
+                // addressed in a future PR. Initializing num_elem:
+                num_elem_src0 = 1;
+                for (int i = 0; i < GGML_MAX_DIMS && src0->nb[i] != 0; ++i)
+                    num_elem_src0 *= src0->ne[i];
+                max_num_of_elem = min_num_of_elem = num_elem_src0;
+
+                if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) {
+                    bzero((char *)&log_data, sizeof(log_data));
+                    log_data.leaf1_len = num_elem_src0;
+                    log_data.leaf2_len = 0;
+                    log_data.node_len = num_elem_src0;
+                    log_data.log_file = tsi_op_log_file;
+                    log_data.num_of_op = num_of_op;
+                    log_data.kernel_type = kernel_type;
+
+                    log_data.data_type = GGML_TSAVORITE_TENSOR_HEADER;
+                    ggml_tsi_log_tensor_data(log_data);
+
+                    log_data.data_type = GGML_TSAVORITE_TENSOR_LEAF1;
+                    log_data.tensor = src0;
+                    ggml_tsi_log_tensor_data(log_data);
+                }
+                // The while loop handles the case where the kernel's element capacity
+                // is smaller than the ggml tensor's element count, which is decided by
+                // the application (e.g. llama.cpp). Kernels are currently built with a
+                // static element capacity, tracked by the macro TSAVORITE_KERNEL_SIZE.
+                uint32_t count = 0;
+
+                if (node->op == GGML_OP_SIN) {
+                    ggml_tsavorite_decompose_unary_kernel(num_elem_src0, src0, node);
+                }
+                while (count < num_elem_src0) {
+                    int kernel_size;
+                    srcP0->data = srcP0->base = (void *)((float *)src0->data + count);
+                    nodeP->data = nodeP->base = (void *)((float *)node->data + count);
+                    if ((count + TSAVORITE_KERNEL_SIZE) > num_elem_src0)
+                        kernel_size = num_elem_src0 - count;
+                    else
+                        kernel_size = TSAVORITE_KERNEL_SIZE;
+                    count += kernel_size;
+                    srcP0->shape[Rank - 1] = kernel_size;
+                    nodeP->shape[Rank - 1] = kernel_size;
+                    srcP0->strides[Rank - 1] = 0;
+                    nodeP->strides[Rank - 1] = 0;
+                    // kernel call
+                    ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input(srcP0, nodeP);
+                    ++device->stats.op_run_count[kernel_type].num_of_kernel_call;
+                }
+
+                if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) {
+                    log_data.data_type = GGML_TSAVORITE_TENSOR_NODE;
+                    log_data.tensor = node;
+                    ggml_tsi_log_tensor_data(log_data);
+
+                    log_data.data_type = GGML_TSAVORITE_TENSOR_END_DATA;
+                    log_data.tensor = NULL;
+                    ggml_tsi_log_tensor_data(log_data);
+                }
+            }
+        }
+        if (min_num_of_elem > 0) {
+            ++device->stats.op_run_count[kernel_type].total_tensor_count;
+
+            if (min_num_of_elem > TSAVORITE_KERNEL_SIZE)
+                ++device->stats.op_run_count[kernel_type].num_of_tensor_spilt;
+
+            if (!(device->stats.op_run_count[kernel_type].min_num_of_elem) ||
+                device->stats.op_run_count[kernel_type].min_num_of_elem > min_num_of_elem)
+                device->stats.op_run_count[kernel_type].min_num_of_elem = min_num_of_elem;
+
+            if (!(device->stats.op_run_count[kernel_type].max_num_of_elem) ||
+                device->stats.op_run_count[kernel_type].max_num_of_elem < max_num_of_elem)
+                device->stats.op_run_count[kernel_type].max_num_of_elem = max_num_of_elem;
+        }
+    }
+
+    // This needs to be implemented correctly once we have a mixture of CPU and
+    // accelerator operations:
+    // return ggml_graph_compute(cgraph, &cplan);
+    ggml_backend_tsavorite_device_rel(
+        (struct ggml_backend_tsavorite_device_context *)backend->device->context);
+    return GGML_STATUS_SUCCESS;
+
+    GGML_UNUSED(backend);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#if 0
+static const char * ggml_backend_tsavorite_buffer_get_name(ggml_backend_buffer_t buffer) {
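+    // Retained for reference only (this block is compiled out): the legacy
+    // per-buffer get_name hook; the active name now comes from
+    // ggml_backend_tsavorite_buffer_type_get_name() further below.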
GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "tSavorite"; + + TSI_UNUSED(buffer); +} +#endif + +static void ggml_backend_tsavorite_buffer_free_buffer(ggml_backend_buffer_t buffer) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + +#if 0 + // ctx->all_data & tsi_buffer_free(ctx->buffers[i].data and same memory and created by tsi_alloc + // tsi_finalize called when ggml call backend free all memory + // this fucntion called when ggml free backend particular buffer, currently we cant provide this support + // and just return NoOps + // But at end there is no memory leak but memory can grow since we free at last once backend is shutdown + // We need to revisit this hence i kept the stuff under if 0 + for (int i = 0; i < ctx->n_buffers; i++) { + tsi_buffer_free(ctx->buffers[i].data); + } + ggml_backend_tsavorite_device_rel((struct ggml_backend_tsavorite_device_context *)buffer->buft->device->context); + + if (ctx->owned) { + free(ctx->all_data); + } +#endif + + free(ctx); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static void *ggml_backend_tsavorite_buffer_get_base(ggml_backend_buffer_t buffer) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ctx->all_data; +} + +static ggml_status ggml_backend_tsavorite_buffer_init_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor tensor_data_header; + tensor->data = (void *)(sizeof(tensor_data_header) + (char *)tensor->data); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return GGML_STATUS_SUCCESS; + + TSI_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor, uint8_t value, + size_t offset, size_t size) { + if (!tensor || !tensor->data) { + GGML_TSAVORITE_LOG_ERROR("\n tensor or data cant be null under func: %s\n", __func__); + return; + } + memset((char *)tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_set_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor *tensor, const void *data, + size_t offset, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + memcpy((char *)tensor->data + offset, data, size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + TSI_UNUSED(buffer); +} + +static void ggml_backend_tsavorite_buffer_get_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor *tensor, void *data, + size_t offset, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + memcpy(data, (const char *)tensor->data + offset, size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + TSI_UNUSED(buffer); +} + +static bool ggml_backend_tsavorite_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor *src, + struct ggml_tensor *dst) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, (ggml_nbytes(src))); + return true; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return false; + + TSI_UNUSED(buffer); +} + +static void 
ggml_backend_tsavorite_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)buffer->context; + if (!ctx || !ctx->all_data) { + GGML_TSAVORITE_LOG_ERROR("\n ctx or all_data cant be null under func: %s\n", __func__); + return; + } + memset((char *)ctx->all_data, value, ctx->all_size); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); +} + +static struct ggml_backend_buffer_i ggml_backend_tsavorite_buffer_i = { + /* .free_buffer = */ ggml_backend_tsavorite_buffer_free_buffer, + /* .get_base = */ ggml_backend_tsavorite_buffer_get_base, + /* .init_tensor = */ ggml_backend_tsavorite_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_tsavorite_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_tsavorite_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_tsavorite_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_tsavorite_buffer_cpy_tensor, + /* .clear = */ ggml_backend_tsavorite_buffer_clear, + /* .reset = */ NULL, +}; + +// default buffer type + +static const char *ggml_backend_tsavorite_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "tsavorite"; + + TSI_UNUSED(buft); +} + +static void ggml_backend_tsavorite_log_allocated_size(txe_device_s device, size_t size_aligned) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); +#ifndef GGML_TSAVORITE_NDEBUG +#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) + GGML_TSAVORITE_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", __func__, + size_aligned / 1024.0 / 1024.0, + device.currentAllocatedSize / 1024.0 / 1024.0); +#endif +#endif + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + TSI_UNUSED(device); + TSI_UNUSED(size_aligned); +} + +static ggml_backend_buffer_t +ggml_backend_tsavorite_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_buffer_context *ctx = + (struct ggml_backend_tsavorite_buffer_context *)calloc( + 1, sizeof(struct ggml_backend_tsavorite_buffer_context)); + + const size_t size_page = sysconf(_SC_PAGESIZE); + GGML_TSAVORITE_LOG_CONT( + "ggml_backend_tsavorite_buffer_type_alloc_buffer is called from llama data Loader \n"); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + if (!device) + return NULL; + + ctx->all_data = ggml_tsavorite_host_malloc(size_aligned); + ctx->all_size = size_aligned; + ctx->owned = true; + ctx->n_buffers = 1; + GGML_TSAVORITE_LOG_INFO("\n\n\n\n Memory Starting address %p and size %ld \n\n\n", ctx->all_data, + ctx->all_size); + + if (ctx->all_data != NULL) { + GGML_TSAVORITE_LOG_CONT("\nAddress of Newly Created BUffer %p and size %ld \n", ctx->all_data, + ctx->all_size); + if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { + fprintf(tsi_op_log_file, "Address of Newly Created BUffer %p and size %ld \n", ctx->all_data, + ctx->all_size); + } + ctx->buffers[0].data = NULL; + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + memset((char *)ctx->all_data, 0, ctx->all_size); + } + + if (size_aligned > 0 && (ctx->all_data == NULL)) { + 
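+        // Allocation failure path (added note): log the attempted page-aligned size,
+        // free the buffer context, and release the device reference taken above so
+        // the acquire/release refcount stays balanced. (Example: with a 4 KiB page,
+        // a 5000-byte request would have been rounded up to size_aligned = 8192.)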
GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, + size_aligned / 1024.0 / 1024.0); + free(ctx); + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + return NULL; + } + + // ggml_backend_tsavorite_log_allocated_size(device, size_aligned); + device->current_allocated_size += ctx->all_size; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ggml_backend_buffer_init(buft, ggml_backend_tsavorite_buffer_i, ctx, size); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return 32; + TSI_UNUSED(buft); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + const size_t max_size = device->max_buf_len; + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return max_size; + + TSI_UNUSED(buft); +} + +static size_t ggml_backend_tsavorite_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const struct ggml_tensor *tensor) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + txe_device_s device = ggml_backend_tsavorite_device_acq( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + if (!device) { + GGML_TSAVORITE_LOG_ERROR("\n tsavorite device is NULL \n"); + return 0; + } + const int Rank = MEM_REF_DESCRIPTOR_RANK; + MemRefDescriptor tensor_data_header; + ggml_backend_tsavorite_device_rel( + (struct ggml_backend_tsavorite_device_context *)buft->device->context); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + GGML_TSAVORITE_LOG_INFO( + "\n\n\n\n Calculating---- Alloc ----Size header %lu and data %lu \n\n\n\n ", + sizeof(tensor_data_header), ggml_nbytes(tensor)); + + return (sizeof(tensor_data_header) + ggml_nbytes(tensor)); + + TSI_UNUSED(buft); +} + +static bool ggml_backend_tsavorite_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + // For Now CPU is loading all data and then copy some tensor to Tsavorite Backend + // Once we have most of Operation supported by Tsavorite + // We will figure out to make tsavorite Backend also host + return false; + + TSI_UNUSED(buft); +} + +ggml_backend_buffer_type_t ggml_backend_tsavorite_buffer_type(void) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + static struct ggml_backend_buffer_type ggml_backend_buffer_type_tsavorite = { + /* .iface = */ { + /* .get_name = */ ggml_backend_tsavorite_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_tsavorite_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_tsavorite_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_tsavorite_buffer_type_get_max_size, + /* .get_alloc_size = */ + ggml_backend_tsavorite_buffer_type_get_alloc_size, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_tsavorite_buffer_type_is_host, + }, + /* .device = */ &g_ggml_backend_tsavorite_device, + /* .context = */ NULL, + }; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &ggml_backend_buffer_type_tsavorite; +} + +// backend + +static const char 
*ggml_backend_tsavorite_name(ggml_backend_t backend) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return "Tsavorite";
+
+    TSI_UNUSED(backend);
+}
+
+static void ggml_backend_tsavorite_free(ggml_backend_t backend) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    if (!backend || !backend->context || !backend->device || !backend->device->context) {
+        GGML_TSAVORITE_LOG_ERROR("At %s one or more of: backend, backend context, "
+                                 "device, or device context is NULL",
+                                 __func__);
+        return;
+    }
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+    struct ggml_backend_tsavorite_device_context *ctx_dev =
+        (struct ggml_backend_tsavorite_device_context *)backend->device->context;
+    ggml_tsavorite_disp_stats(ctx, ctx_dev->device);
+
+    ggml_backend_tsavorite_device_rel(ctx_dev);
+    ggml_tsavorite_free(ctx);
+
+    free(backend);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+static void ggml_backend_tsavorite_synchronize(ggml_backend_t backend) {
+// An async mechanism is still needed to feed one tensor's output into another
+// tensor's input; it will be evaluated and implemented in a later PR.
+#ifdef SYNC_DEBUG
+    usleep(100000);
+#endif /* SYNC_DEBUG */
+    TSI_UNUSED(backend);
+}
+
+static ggml_backend_buffer_type_t
+ggml_backend_tsavorite_get_default_buffer_type(ggml_backend_t backend) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return ggml_backend_tsavorite_buffer_type();
+
+    TSI_UNUSED(backend);
+}
+
+static enum ggml_status ggml_backend_tsavorite_graph_compute(ggml_backend_t backend,
+                                                             struct ggml_cgraph *cgraph) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return ggml_tsavorite_graph_compute(backend, cgraph);
+}
+
+static void ggml_backend_tsavorite_set_n_cb(ggml_backend_t backend, int n_cb) {
+    // GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+
+    if (ctx->n_cb != n_cb) {
+        ctx->n_cb = MIN(n_cb, GGML_TSAVORITE_MAX_COMMAND_BUFFERS);
+
+        if (ctx->n_cb > 2) {
+            GGML_TSAVORITE_LOG_WARN("%s: n_cb = %d, using n_cb > 2 is not recommended and can degrade "
+                                    "the performance in some cases\n",
+                                    __func__, n_cb);
+        }
+    }
+
+#if 0
+    if (ctx->encode_async) {
+        Block_release(ctx->encode_async);
+    }
+#endif
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+static struct ggml_backend_i ggml_backend_tsavorite_i = {
+    /* .get_name           = */ ggml_backend_tsavorite_name,
+    /* .free               = */ ggml_backend_tsavorite_free,
+    /* .set_tensor_async   = */ NULL,
+    /* .get_tensor_async   = */ NULL,
+    /* .cpy_tensor_async   = */ NULL,
+    /* .synchronize        = */ ggml_backend_tsavorite_synchronize,
+    /* .graph_plan_create  = */ NULL,
+    /* .graph_plan_free    = */ NULL,
+    /* .graph_plan_update  = */ NULL,
+    /* .graph_plan_compute = */ NULL,
+    /* .graph_compute      = */ ggml_backend_tsavorite_graph_compute,
+    /* .event_record       = */ NULL,
+    /* .event_wait         = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_tsavorite_guid(void) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    static ggml_guid guid = {0x81, 0xa1, 0x8b, 0x1e, 0x71, 0xec, 0x79, 0xed,
+                             0x2b, 0x85, 0xdc, 0x8a, 0x61, 0x98, 0x30, 0xe6};
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return &guid;
+}
+
+// This needs to be removed in the future
+ggml_backend_t
ggml_backend_tsavorite_init(void) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_tsavorite_reg(), 0);
+
+    struct ggml_backend_tsavorite_context *ctx = ggml_tsavorite_init(dev);
+    if (ctx == NULL) {
+        GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));
+    if (backend) {
+        backend->guid = ggml_backend_tsavorite_guid();
+        backend->iface = ggml_backend_tsavorite_i;
+        backend->device = dev;
+        backend->context = ctx;
+    }
+    // Will be enabled later:
+    // ggml_backend_tsavorite_set_n_cb(backend, 1);
+
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return backend;
+}
+
+bool ggml_backend_is_tsavorite(ggml_backend_t backend) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_tsavorite_guid());
+}
+
+void ggml_backend_tsavorite_set_abort_callback(ggml_backend_t backend,
+                                               ggml_abort_callback abort_callback,
+                                               void *user_data) {
+    GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = user_data;
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+void ggml_backend_tsavorite_capture_next_compute(ggml_backend_t backend) {
+    GGML_ASSERT(ggml_backend_is_tsavorite(backend));
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+    struct ggml_backend_tsavorite_context *ctx =
+        (struct ggml_backend_tsavorite_context *)backend->context;
+    ctx->capture_next_compute = true;
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+// backend device
+
+static const char *ggml_backend_tsavorite_device_get_name(ggml_backend_dev_t dev) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return "Tsavorite";
+
+    GGML_UNUSED(dev);
+}
+
+static const char *ggml_backend_tsavorite_device_get_description(ggml_backend_dev_t dev) {
+    // acq/rel just to populate ctx->name in case it hasn't been done yet
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    struct ggml_backend_tsavorite_device_context *ctx_dev =
+        (struct ggml_backend_tsavorite_device_context *)dev->context;
+    ggml_backend_tsavorite_device_acq(ctx_dev);
+    ggml_backend_tsavorite_device_rel(ctx_dev);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+
+    return ctx_dev->name;
+}
+
+static void ggml_backend_tsavorite_device_get_memory(ggml_backend_dev_t dev, size_t *free,
+                                                     size_t *total) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+
+    if (!dev || !free || !total) {
+        GGML_TSAVORITE_LOG_INFO("One or more of the pointers (dev, free, total) is NULL\n");
+        GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+        return;
+    }
+    *total = 0;
+    *free = 0;
+    struct ggml_backend_tsavorite_device_context *ctx_dev =
+        (struct ggml_backend_tsavorite_device_context *)dev->context;
+    if (ctx_dev) {
+        txe_device_s device = ggml_backend_tsavorite_device_acq(ctx_dev);
+        *total = device->recommended_max_working_set_size;
+        *free = *total - device->current_allocated_size;
+        GGML_TSAVORITE_LOG_CONT("\n TXE Device MEMORY Summary total %lu and free %lu \n", *total,
+                                *free);
+        ggml_backend_tsavorite_device_rel(ctx_dev);
+    }
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return;
+}
+
+// Currently we treat the TXE accelerator as a GPU-type device
+static enum ggml_backend_dev_type ggml_backend_tsavorite_device_get_type(ggml_backend_dev_t dev) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return GGML_BACKEND_DEVICE_TYPE_GPU;
+
+    GGML_UNUSED(dev);
+}
+
+// Need to understand the scope of this API, since it appears unused; it may be
+// used by struct llama_model_loader, llm_load_tensors(), and
+// llama_new_context_with_model()
+static void ggml_backend_tsavorite_device_get_props(ggml_backend_dev_t dev,
+                                                    struct ggml_backend_dev_props *props) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    props->name = ggml_backend_tsavorite_device_get_name(dev);
+    props->description = ggml_backend_tsavorite_device_get_description(dev);
+    props->type = ggml_backend_tsavorite_device_get_type(dev);
+    ggml_backend_tsavorite_device_get_memory(dev, &props->memory_free, &props->memory_total);
+
+    props->caps.async = false;
+    props->caps.host_buffer = false;
+    props->caps.buffer_from_host_ptr = false;
+    props->caps.events = false;
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+}
+
+static ggml_backend_t ggml_backend_tsavorite_device_init(ggml_backend_dev_t dev,
+                                                         const char *params) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    struct ggml_backend_tsavorite_context *ctx = ggml_tsavorite_init(dev);
+    if (ctx == NULL) {
+        GGML_TSAVORITE_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(struct ggml_backend));
+
+    if (backend) {
+        backend->guid = ggml_backend_tsavorite_guid();
+        backend->iface = ggml_backend_tsavorite_i;
+        backend->device = dev;
+        backend->context = ctx;
+    }
+
+    ggml_backend_tsavorite_set_n_cb(backend, 1);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+
+    return backend;
+
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t
+ggml_backend_tsavorite_device_get_buffer_type(ggml_backend_dev_t dev) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    GGML_TSAVORITE_LOG_INFO("End %s\n", __func__);
+    return ggml_backend_tsavorite_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+// For llama.cpp models this API currently seems unused: llm_load_tensors touches it
+// only via buffer_from_host_ptr_supported / is_default_buft; otherwise buffers come
+// from ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft).
+// Needs to be revisited when we look at the buffer section implementation.
+static ggml_backend_buffer_t ggml_backend_tsavorite_device_buffer_from_ptr(ggml_backend_dev_t dev,
+                                                                           void *ptr, size_t size,
+                                                                           size_t max_tensor_size) {
+    GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
+    struct ggml_backend_tsavorite_buffer_context *ctx =
+        (struct ggml_backend_tsavorite_buffer_context *)calloc(
+            1, sizeof(struct ggml_backend_tsavorite_buffer_context));
+
+    ctx->all_data = ptr;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+
+    // page-align the data ptr
+    {
+        const uintptr_t offs = (uintptr_t)ptr % size_page;
+        ptr = (void *)((char *)ptr - offs);
+        size += offs;
+    }
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    struct ggml_backend_tsavorite_device_context *ctx_dev =
+        (struct ggml_backend_tsavorite_device_context *)dev->context;
+    txe_device_s device =
ggml_backend_tsavorite_device_acq(ctx_dev); + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= device->max_buf_len) { + ctx->buffers[ctx->n_buffers].data = ptr; + ctx->buffers[ctx->n_buffers].size = size; + + // ggml_backend_tsavorite_log_allocated_size(device, size_aligned); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will + // fully fit into one of the views + const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * + size_page; // round-up 2 pages just in case + const size_t size_step = device->max_buf_len - size_ovlp; + const size_t size_view = device->max_buf_len; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].data = (void *)((uint8_t *)ptr + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + // ggml_backend_tsavorite_log_allocated_size(device, size_step_aligned); + + if (i + size_step < size) { + GGML_TSAVORITE_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return ggml_backend_buffer_init(ggml_backend_tsavorite_buffer_type(), + ggml_backend_tsavorite_buffer_i, ctx, size); +} + +// llama_build_graph -> ggml_backend_supports_op -> gml_backend_dev_supports_op +// basically if true then it will call ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, +// backend.get()); here is cur is tensor +static bool ggml_backend_tsavorite_device_supports_op(ggml_backend_dev_t dev, + const struct ggml_tensor *op) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + struct ggml_backend_tsavorite_device_context *ctx_dev = + (struct ggml_backend_tsavorite_device_context *)dev->context; + + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return ggml_tsavorite_supports_op(ctx_dev, op); +} + +// template +// static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {} +// ggml_backend_dev_supports_op(dev, op_tensor); +static bool ggml_backend_tsavorite_device_supports_buft(ggml_backend_dev_t dev, + ggml_backend_buffer_type_t buft) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return buft->iface.get_name == ggml_backend_tsavorite_buffer_type_get_name; + + TSI_UNUSED(dev); +} + +// // returns the backend that should be used for the node based on the current locations +// ggml_backend_sched_backend_id_from_cur -> ggml_backend_offload_op -> +static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, + const struct ggml_tensor *op) { + // printf("\n ANoop Calling %s \n ", __func__); + if (op->type != GGML_TYPE_F32) + return false; + switch (op->op) { + // case GGML_OP_NONE: + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_MUL: + case GGML_OP_SQRT: + case GGML_OP_SIN: + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SIGMOID: + break; + default: + return false; + } + break; + default: + return false; + } + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return true; + TSI_UNUSED(dev); +} +#ifdef SYNC_DEBUG +static void ggml_backend_tsavorite_device_synchronize(ggml_backend_dev_t dev, + ggml_backend_event_t event) { + usleep(100); + TSI_UNUSED(dev); + TSI_UNUSED(event); +} +#endif /* SYNC_DEBUG */ + +static struct ggml_backend_device_i 
ggml_backend_tsavorite_device_i = { + /* .get_name = */ ggml_backend_tsavorite_device_get_name, + /* .get_description = */ ggml_backend_tsavorite_device_get_description, + /* .get_memory = */ ggml_backend_tsavorite_device_get_memory, + /* .get_type = */ ggml_backend_tsavorite_device_get_type, + /* .get_props = */ ggml_backend_tsavorite_device_get_props, + /* .init_backend = */ ggml_backend_tsavorite_device_init, + /* .get_buffer_type = */ ggml_backend_tsavorite_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_tsavorite_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_tsavorite_device_supports_op, + /* .supports_buft = */ ggml_backend_tsavorite_device_supports_buft, + /* .offload_op = */ ggml_backend_tsavorite_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// backend registry + +static const char *ggml_backend_tsavorite_reg_get_name(ggml_backend_reg_t reg) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return "Tsavorite"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_tsavorite_reg_device_count(ggml_backend_reg_t reg) { + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + return 1; + + GGML_UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_tsavorite_reg_device_get(ggml_backend_reg_t reg, + size_t index) { + GGML_ASSERT(index == 0); + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &g_ggml_backend_tsavorite_device; + + GGML_UNUSED(reg); + GGML_UNUSED(index); +} + +static struct ggml_backend_reg_i ggml_backend_tsavorite_reg_i = { + /* .get_name = */ ggml_backend_tsavorite_reg_get_name, + /* .device_count = */ ggml_backend_tsavorite_reg_device_count, + /* .device_get = */ ggml_backend_tsavorite_reg_device_get, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_tsavorite_reg(void) { + ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ERROR; + ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR; + GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); + g_ggml_backend_tsavorite_reg.iface = ggml_backend_tsavorite_reg_i; + g_ggml_backend_tsavorite_reg.context = NULL; + + g_ggml_backend_tsavorite_device.iface = ggml_backend_tsavorite_device_i; + g_ggml_backend_tsavorite_device.reg = &g_ggml_backend_tsavorite_reg; + g_ggml_backend_tsavorite_device.context = &g_ggml_ctx_dev_main; + GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + + return &g_ggml_backend_tsavorite_reg; +} + +GGML_BACKEND_DL_IMPL(ggml_backend_tsavorite_reg) diff --git a/ggml/src/ggml-tsavorite/include/TestModel.h b/ggml/src/ggml-tsavorite/include/TestModel.h new file mode 100644 index 0000000000000..feff2539a96fa --- /dev/null +++ b/ggml/src/ggml-tsavorite/include/TestModel.h @@ -0,0 +1,217 @@ +#pragma once + +#include "HostShimCAPI.h" +#include +#include +#include +#include +#include +#include + +#define MAX_RESULT_VALUES_TO_PRINT 32 +template +struct MemRefDescriptor { + void *base; + void *data; + int64_t offset = 0; + int64_t shape[N]; + int64_t strides[N]; +} __attribute__((aligned(128))); + +template +class TestModel { +public: + TestModel(std::string name, int version, bool verbose = false) + : name_(name), version_(version), verbose_(verbose) {} + + ~TestModel() { + // free memory + for (int i = 0; i < NumInputs; i++) + tsi_dealloc(inputs[i].base); + for (int i = 0; i < NumOutputs; 
i++) + tsi_dealloc(outputs[i].base); + tsi_finalize(); + } + + template + void initRandom(size_t numElements, + std::array inputRange = {-10, 10}) { + static_assert(Rank == 1, + "initRandom(size_t) is only defined for Rank == 1"); + size_t inputSizes[2][Rank] = {{numElements}, {numElements}}; + size_t outputSizes[1][Rank] = {{numElements}}; + init(inputSizes, outputSizes, + /*initWithRandom=*/true, inputRange); + } + +#if 0 + template + void initFill(size_t numElements, ElType val) { + static_assert(Rank == 1, + "initRandom(size_t) is only defined for Rank == 1"); + size_t inputSizes[2][Rank] = {{numElements}, {numElements}}; + size_t outputSizes[1][Rank] = {{numElements}}; + init(inputSizes, outputSizes); + for (int i = 0; i < NumInputs; i++) { + auto nEls = getNumElements(inputs[i]); + for (size_t j = 0; j < nEls; j++) + static_cast(inputs[i].data)[j] = val; + } + } +#endif /* 0 */ + + template + void init(size_t inputSizes[NumInputs][Rank], + size_t outputSizes[NumOutputs][Rank], bool initWithRandom = false, + std::array inputRange = {-10, 10}) { + tsi_initialize(1); + + for (int i = 0; i < NumInputs; i++) + initMemRefDescriptor(inputs[i], inputSizes[i], + initWithRandom, inputRange, i); + + for (int i = 0; i < NumOutputs; i++) { + initMemRefDescriptor(outputs[i], outputSizes[i]); + // set default result values to -1 + auto nEls = getNumElements(outputSizes[i]); + std::fill((OutputsElType *)outputs[i].base, + (OutputsElType *)outputs[i].base + nEls, -1); + } + if (verbose_) { + printf("[%s v.%d] Allocated DRAM arrays (host VAs):", name_.c_str(), + version_); + for (int i = 0; i < NumInputs; i++) + printf(" ANOOP input%d = %p ", i, inputs[i].base); + for (int i = 0; i < NumOutputs; i++) + printf(" ANOOP-1 output%d = %p ", i, outputs[i].base); + printf("\n"); + } + } + + template + int validateResult(size_t index, ElType *expected, bool printErrs = false, + float tolerance = 1e-5) { + if (verbose_) { + printf("[%s v.%d] Model executed successfully. Validating result...", + name_.c_str(), version_); + } + + int retCode = 0; + size_t nEls = getNumElements(outputs[index].shape); + float sqrSumOfDiff = 0.0; + for (size_t j = 0; j < nEls; j++) { + sqrSumOfDiff += + std::pow(((ElType *)outputs[index].base)[j] - expected[j], 2); + if (std::abs(((ElType *)outputs[index].base)[j] - expected[j]) > + tolerance) { + retCode = 1; + if (printErrs && j < MAX_RESULT_VALUES_TO_PRINT) { + printf("Mismatch at index %d: expected %1.6f, got %1.6f\n", (int)j, + expected[j], ((ElType *)outputs[index].base)[j]); + } + if (retCode && j == MAX_RESULT_VALUES_TO_PRINT) + printf("... (more mismatches not printed; maximum %d reached) ...\n", + MAX_RESULT_VALUES_TO_PRINT); + } + } + // Compute the relative error: norm2(result) / norm2(expected) + float sqrSumExpected = 0.0; + for (size_t j = 0; j < nEls; j++) + sqrSumExpected += std::pow(expected[j], 2); + + float relativeErr = std::sqrt(sqrSumOfDiff) / std::sqrt(sqrSumExpected); + if (verbose_) { + retCode ? 
printf("\n[%s v.%d] FAILED [relative err=%1.6f]\n", + name_.c_str(), version_, relativeErr) + : printf("\n[%s v.%d] PASS [relative err=%1.6f]\n", name_.c_str(), + version_, relativeErr); + } + return retCode; + } + + size_t getNumElements(const MemRefDescriptor &memref) const { + return getNumElements(memref.shape); + } + + template + void writeToFile(void *data, size_t numElements, + const std::string &filename) { + std::ofstream ofs(filename, std::ios::binary); + if (!ofs) { + printf("[%s v.%d] Error opening file %s for writing.", name_.c_str(), + version_, filename.c_str()); + return; + } + ofs.write((char *)data, numElements * sizeof(ElType)); + ofs.close(); + } + + template + void readFromFile(void *data, size_t numElements, + const std::string &filename) { + std::ifstream ifs(filename, std::ios::binary); + if (!ifs) { + printf("[%s v.%d] Error opening file %s for reading.", name_.c_str(), + version_, filename.c_str()); + return; + } + ifs.read((char *)data, numElements * sizeof(ElType)); + ifs.close(); + } + + std::string getName() const { return name_; } + std::string getVersion() const { return std::to_string(version_); } + + MemRefDescriptor inputs[NumInputs]; + MemRefDescriptor outputs[NumOutputs]; + +private: + std::string name_; + int version_ = 1; + bool verbose_ = false; + + template + void initMemRefDescriptor(MemRefDescriptor &memref, size_t shape[Rank], + bool initWithRandom = false, + std::array inputRange = {-10, 10}, + int seed = 42) { + size_t nBytes = sizeof(ElType); + for (int i = 0; i < Rank; i++) { + nBytes *= shape[i]; + } + memref.base = tsi_alloc(nBytes); + memref.data = memref.base; + memref.offset = 0; + printf("\n checking Shape value %d \n\n", memref.shape[0]); +#if 0 + for (int i = 0; i < Rank; i++) { + memref.shape[i] = shape[i]; + memref.strides[i] = 1; + for (int j = i + 1; j < Rank; j++) { + memref.strides[i] *= shape[j]; + } + } + #endif + if (initWithRandom) { + std::mt19937 gen(seed); // fixed seed + std::uniform_real_distribution dist(inputRange[0], inputRange[1]); + for (size_t i = 0; i < getNumElements(shape); i++) { + static_cast(memref.data)[i] = static_cast(dist(gen)); + } + } + } + + size_t getNumElements(const int64_t shape[Rank]) const { + size_t numElements = 1; + printf("\n Anoop Rank %d and shape[Rank] %d \n\n", Rank, shape[Rank]); + for (int i = 0; i < Rank; i++) { + numElements *= shape[i]; + } + printf("\n numElements %d \n", numElements); + return numElements; + } + + size_t getNumElements(const size_t shape[Rank]) const { + return getNumElements(reinterpret_cast(shape)); + } +}; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 083347d188880..31fa312f65da6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -174,4 +174,4 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) -target_link_libraries(${TEST_TARGET} PRIVATE llama) +target_link_libraries(${TEST_TARGET} PRIVATE llama ${TLIBS}) diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh new file mode 100755 index 0000000000000..8712a77d1f71a --- /dev/null +++ b/tsi-pkg-build.sh @@ -0,0 +1,87 @@ + +set -e + +#Ensure prerequisites are met as follows +echo 'updating submodule' +git submodule update --recursive --init +cd ggml-tsi-kernel/ +module load tsi4 gcc/13.3.0 +echo 'creating python virtual env' +python3 -m venv blob-creation +source blob-creation/bin/activate +echo 'installing mlir and python dependencies' +pip install -r 
+pip install -r /proj/rel/sw/mlir-compiler/python/requirements-common.txt
+pip install /proj/rel/sw/mlir-compiler/python/mlir_external_packages-1.2.1-py3-none-any.whl
+pip install onnxruntime-training
+
+#build TSI kernels for the Tsavorite backend
+#First for FPGA
+
+echo 'creating fpga kernel'
+cd fpga-kernel
+cmake -B build-fpga
+./create-all-kernels.sh
+#Then for Posix use cases
+
+echo 'creating posix kernel'
+cd ../posix-kernel/
+./create-all-kernels.sh
+
+#Change directory to top level llama.cpp
+
+cd ../../
+
+export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.0
+#Compile for posix with build-posix as a target folder
+
+echo 'building llama.cp, ggml for tsavorite and other binary for posix'
+cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
+cmake --build build-posix --config Release
+
+#Compile for fpga with build-fpga as a target folder
+
+echo 'building llama.cp, ggml for tsavorite and other binary for fpga'
+export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
+export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
+cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga
+cmake --build build-fpga --config Release
+
+
+echo 'creating tar bundle for fpga'
+TSI_GGML_VERSION=0.0.1
+TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
+GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
+TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml/
+TSI_BLOB_INSTALL_DIR=$(pwd)/${GGML_TSI_INSTALL_DIR}/fpga-kernel/build-fpga
+
+if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR} ]; then
+    echo "${TSI_GGML_BUNDLE_INSTALL_DIR} exists"
+else
+    echo "creating ${TSI_GGML_BUNDLE_INSTALL_DIR}"
+    mkdir ${TSI_GGML_BUNDLE_INSTALL_DIR}
+fi
+if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh ]; then
+    rm -fr ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh
+fi
+
+cat > ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL
+#!/bin/bash
+export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:\$(pwd)
+mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_mult
+mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_add
+cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_mult/ -r
+cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_add/ -r
+EOL
+chmod +x ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh
+cp ${GGML_TSI_INSTALL_DIR}/fpga/blobs ${TSI_GGML_BUNDLE_INSTALL_DIR}/ -r
+cp build-fpga/bin/llama-cli ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+cp build-fpga/bin/libggml*.so ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+cp build-fpga/bin/libllama*.so ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+cp build-fpga/bin/simple-backend-tsi ${TSI_GGML_BUNDLE_INSTALL_DIR}/
+
+tar -cvzf ${TSI_GGML_BUNDLE_INSTALL_DIR}-${TSI_GGML_VERSION}.tz ${TSI_GGML_BUNDLE_INSTALL_DIR}/*
+
+if [ "$1" == "Release" ] || [ "$1" == "release" ]
+then
+    cp ${TSI_GGML_BUNDLE_INSTALL_DIR}-${TSI_GGML_VERSION}.tz ${TSI_GGML_RELEASE_DIR}
+fi
From 699538538521884d4389f18d866068d9ba216efd Mon Sep 17 00:00:00 2001
From: Anoop Kapoor
Date: Fri, 23 May 2025 22:19:39 -0700
Subject: [PATCH 02/35] Releasing next version

---
 tsi-pkg-build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh
index 8712a77d1f71a..ba84118f38ff7 100755
--- a/tsi-pkg-build.sh
+++ b/tsi-pkg-build.sh
@@ -48,7 +48,7 @@ cmake --build build-fpga --config Release
 echo 'creating tar bundle for fpga'
-TSI_GGML_VERSION=0.0.1
+TSI_GGML_VERSION=0.0.2
 TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
 GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
 TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml/
From 68410968db0ce9374e2eecdbbd36b6004174abf1 Mon Sep 17 00:00:00 2001
From: Anoop Kapoor
Date: Fri, 23 May 2025 22:32:17 -0700
Subject:
[PATCH 03/35] Updated MLIR_SDK_VERSION version --- tsi-pkg-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index ba84118f38ff7..5ff9b9389c475 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -31,7 +31,7 @@ cd ../posix-kernel/ cd ../../ -export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.0 +export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.1 #Compile for posix with build-posix as a target folder echo 'building llama.cp, ggml for tsavorite and other binary for posix' From 1a1514a715b26307c2de92072c7e2fdc1d36eb9f Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Fri, 23 May 2025 23:01:26 -0700 Subject: [PATCH 04/35] Updated the Version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f9c146006c1a5..96a8a393817d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED MLIR_COMPILER_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.0/compiler) + set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.1/compiler) else() set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) endif() From d9dd83cf1c197425e1da6cde096e981603dff10d Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Sun, 25 May 2025 11:44:16 -0700 Subject: [PATCH 05/35] @FIR-707: Fix requirement for libgomp and move to new sdk 0.1.2 This change has following. 1. Move to new SDK 0.1.2 2. remove the requirement for libgomp in fpga build --- CMakeLists.txt | 4 ++-- ggml/src/ggml-cpu/CMakeLists.txt | 18 +++++++++++------- tsi-pkg-build.sh | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96a8a393817d3..e047785e603d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED MLIR_COMPILER_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.1/compiler) + set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.2/compiler) else() set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) endif() @@ -23,7 +23,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED RUNTIME_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (RUNTIME_DIR /proj/work/rel/sw/sdk-r.0.1.0/${GGML_TSAVORITE_TARGET}/runtime) + set (RUNTIME_DIR /proj/work/rel/sw/sdk-r.0.1.2/${GGML_TSAVORITE_TARGET}/runtime) else() set (RUNTIME_DIR $ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) endif() diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 1d4259dae5ba7..2cbae62a1dddf 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -53,14 +53,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() endif() - if (GGML_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) + if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) + message("Target is FPGA no GOMP linked") + else() + if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) - target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - else() - message(WARNING "OpenMP not found") + target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() endif() endif() diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index 5ff9b9389c475..b6b998671544c 100755 
--- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -31,7 +31,7 @@ cd ../posix-kernel/ cd ../../ -export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.1 +export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.2 #Compile for posix with build-posix as a target folder echo 'building llama.cp, ggml for tsavorite and other binary for posix' From 441fd0b95685c3b1590eb81800dcee7487e2533e Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Tue, 27 May 2025 12:00:44 -0700 Subject: [PATCH 06/35] @FIR-708: Added TXE profile to ggm-tsavorite backend. The chanegs have following 1. Enable profiling for tsavorite backed for txe 2. Add std c++20 for compiling the profiler The test results are as follows root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin# ./run_platform_test.sh Check if tnApcMgr is running; if it is not, uncomment below line and execute the run_platform_test.sh script. Running on v0.1.1.tsv30_05_24_2025 [2018-03-09 13:52:26.300409] 271:272 [ info] :: TXE resource allocation request processed successfully. [2018-03-09 13:52:27.339] [info] [llama.cpp:56] Execution time: 1019 ms [2018-03-09 13:52:27.347638] 2909:2909 [ info] [LlamaForCausalLM_Random v. 2] TestBase.h:154: Model executed successfully. Validating result... [2018-03-09 13:52:27.380511] 2909:2909 [ info] [LlamaForCausalLM_Random v. 2] TestBase.h:193: PASS [relative err=0.000000, relTol=1.000000e-05] [2018-03-09 13:52:27.405665] 271:272 [ info] :: TXE resource release request processed successfully. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profiling Results (LlamaForCausalLM_Random): ------------------------------------------------------------------------------------------------------------------------ Calls Total(ms) T/call Self(ms) Function ------------------------------------------------------------------------------------------------------------------------ 243 498.000 2.049 0.000 [45%] RuntimeHostShim::awaitCommandListCompletion 84 200.688 2.389 200.688 └─ [18%] [ txe_blob_1 ] 32 76.626 2.395 76.626 └─ [ 7%] [ txe_blob_6 ] 16 55.493 3.468 55.493 └─ [ 5%] [ txe_blob_12 ] 8 31.821 3.978 31.821 └─ [ 3%] [ txe_blob_10 ] 8 31.322 3.915 31.322 └─ [ 3%] [ txe_blob_7 ] 8 31.152 3.894 31.152 └─ [ 3%] [ txe_blob_8 ] 8 27.693 3.462 27.693 └─ [ 2%] [ txe_blob_9 ] 17 26.019 1.531 26.019 └─ [ 2%] [ txe_blob_2 ] 17 25.906 1.524 25.906 └─ [ 2%] [ txe_blob_5 ] 17 25.899 1.523 25.899 └─ [ 2%] [ txe_blob_3 ] 17 25.833 1.520 25.833 └─ [ 2%] [ txe_blob_4 ] 8 23.993 2.999 23.993 └─ [ 2%] [ txe_blob_11 ] 3 6.002 2.001 6.002 └─ [ 1%] [ txe_blob_0 ] 1 35.000 35.000 35.000 [ 3%] RuntimeHostShim::finalize 188 33.000 0.176 33.000 [ 3%] RuntimeHostShim::copy 1 16.000 16.000 16.000 [ 1%] RuntimeHostShim::initialize 13 1.000 0.077 1.000 [ 0%] RuntimeHostShim::loadBlob 573 0.000 0.000 0.000 [ 0%] RuntimeHostShim::allocate 573 0.000 0.000 0.000 [ 0%] RuntimeHostShim::deallocate 243 0.000 0.000 0.000 [ 0%] RuntimeHostShim::createCommandList 922 0.000 0.000 0.000 [ 0%] RuntimeHostShim::getShmemManager 243 0.000 0.000 0.000 [ 0%] RuntimeHostShim::launchBlob 243 0.000 0.000 0.000 [ 0%] RuntimeHostShim::addCommandToList 243 0.000 0.000 0.000 [ 0%] RuntimeHostShim::finalizeCommandList 13 0.000 0.000 0.000 [ 0%] RuntimeHostShim::unloadBlob 33 0.000 0.000 0.000 [ 0%] RuntimeHostShim::stridedCopy ======================================================================================================================== 3532 1116.000 0.316 1116.000 [100%] TOTAL 
======================================================================================================================== register_backend: registered backend Tsavorite (1 devices) register_device: registered device Tsavorite (txe) register_backend: registered backend CPU (1 devices) register_device: registered device CPU (CPU) load_backend: failed to find ggml_backend_init in /usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin/tsi-ggml/libggml-tsavorite.so load_backend: failed to find ggml_backend_init in /usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin/tsi-ggml/libggml-cpu.so build: 5464 (194fbaa9) with gcc (GCC) 13.3.0 for x86_64-pc-linux-gnu (debug) main: llama backend init main: load the model and apply lora adapter, if any TXE Device MEMORY Summary total 134217728 and free 134217728 llama_model_load_from_file_impl: using device Tsavorite (txe) - 128 MiB free llama_model_loader: loaded meta data with 24 key-value pairs and 75 tensors from /tsi/anoop_feb26/tinyllama-vo-5m-para.gguf (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = llama llama_model_loader: - kv 1: general.type str = model llama_model_loader: - kv 2: general.name str = Vicuna Hf llama_model_loader: - kv 3: general.size_label str = 4.6M llama_model_loader: - kv 4: general.license str = apache-2.0 llama_model_loader: - kv 5: llama.block_count u32 = 8 llama_model_loader: - kv 6: llama.context_length u32 = 2048 llama_model_loader: - kv 7: llama.embedding_length u32 = 64 llama_model_loader: - kv 8: llama.feed_forward_length u32 = 256 llama_model_loader: - kv 9: llama.attention.head_count u32 = 16 llama_model_loader: - kv 10: llama.attention.layer_norm_rms_epsilon f32 = 0.000001 llama_model_loader: - kv 11: general.file_type u32 = 32 llama_model_loader: - kv 12: llama.vocab_size u32 = 32000 llama_model_loader: - kv 13: llama.rope.dimension_count u32 = 4 llama_model_loader: - kv 14: tokenizer.ggml.model str = llama llama_model_loader: - kv 15: tokenizer.ggml.pre str = default llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,32000] = ["", "", "", "<0x00>", "<... llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000... llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... 
llama_model_loader: - kv 19: tokenizer.ggml.bos_token_id u32 = 1 llama_model_loader: - kv 20: tokenizer.ggml.eos_token_id u32 = 2 llama_model_loader: - kv 21: tokenizer.ggml.unknown_token_id u32 = 0 llama_model_loader: - kv 22: tokenizer.ggml.padding_token_id u32 = 0 llama_model_loader: - kv 23: general.quantization_version u32 = 2 llama_model_loader: - type f32: 17 tensors llama_model_loader: - type bf16: 58 tensors print_info: file format = GGUF V3 (latest) print_info: file type = BF16 print_info: file size = 8.82 MiB (16.00 BPW) load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect load: special tokens cache size = 3 load: token to piece cache size = 0.1914 MB print_info: arch = llama print_info: vocab_only = 0 print_info: n_ctx_train = 2048 print_info: n_embd = 64 print_info: n_layer = 8 print_info: n_head = 16 print_info: n_head_kv = 16 print_info: n_rot = 4 print_info: n_swa = 0 print_info: n_swa_pattern = 1 print_info: n_embd_head_k = 4 print_info: n_embd_head_v = 4 print_info: n_gqa = 1 print_info: n_embd_k_gqa = 64 print_info: n_embd_v_gqa = 64 print_info: f_norm_eps = 0.0e+00 print_info: f_norm_rms_eps = 1.0e-06 print_info: f_clamp_kqv = 0.0e+00 print_info: f_max_alibi_bias = 0.0e+00 print_info: f_logit_scale = 0.0e+00 print_info: f_attn_scale = 0.0e+00 print_info: n_ff = 256 print_info: n_expert = 0 print_info: n_expert_used = 0 print_info: causal attn = 1 print_info: pooling type = 0 print_info: rope type = 0 print_info: rope scaling = linear print_info: freq_base_train = 10000.0 print_info: freq_scale_train = 1 print_info: n_ctx_orig_yarn = 2048 print_info: rope_finetuned = unknown print_info: ssm_d_conv = 0 print_info: ssm_d_inner = 0 print_info: ssm_d_state = 0 print_info: ssm_dt_rank = 0 print_info: ssm_dt_b_c_rms = 0 print_info: model type = ?B print_info: model params = 4.62 M print_info: general.name = Vicuna Hf print_info: vocab type = SPM print_info: n_vocab = 32000 print_info: n_merges = 0 print_info: BOS token = 1 '' print_info: EOS token = 2 '' print_info: UNK token = 0 '' print_info: PAD token = 0 '' print_info: LF token = 13 '<0x0A>' print_info: EOG token = 2 '' print_info: max token length = 18 load_tensors: loading model tensors, this can take a while... (mmap = true) TXE Device MEMORY Summary total 134217728 and free 134217728 load_tensors: offloading 0 repeating layers to GPU load_tensors: offloaded 0/9 layers to GPU load_tensors: CPU_Mapped model buffer size = 8.82 MiB .............. llama_context: constructing llama_context llama_context: n_seq_max = 1 llama_context: n_ctx = 12288 llama_context: n_ctx_per_seq = 12288 llama_context: n_batch = 1024 llama_context: n_ubatch = 512 llama_context: causal_attn = 1 llama_context: flash_attn = 0 llama_context: freq_base = 10000.0 llama_context: freq_scale = 1 llama_context: n_ctx_per_seq (12288) > n_ctx_train (2048) -- possible training context overflow [2018-03-09 13:52:28.706203] 271:272 [ info] :: TXE resource allocation request processed successfully. 
llama_context: CPU output buffer size = 0.12 MiB llama_kv_cache_unified: CPU KV buffer size = 24.00 MiB llama_kv_cache_unified: size = 24.00 MiB ( 12288 cells, 8 layers, 1 seqs), K (f16): 12.00 MiB, V (f16): 12.00 MiB ggml_backend_tsavorite_buffer_type_alloc_buffer is called from llama data Loader ANoop Allocating memory from tsi_alloc with size 266240 Allocating memory from tsi_alloc with size 266240 starting memory 0xffff93e00080 Address of Newly Created BUffer 0xffff93e00080 and size 266240 llama_context: tsavorite compute buffer size = 0.25 MiB llama_context: CPU compute buffer size = 408.51 MiB llama_context: graph nodes = 294 llama_context: graph splits = 67 (with bs=512), 37 (with bs=1) common_init_from_params: setting dry_penalty_last_n to ctx_size = 12288 main: llama threadpool init, n_threads = 4 main: model was trained on only 2048 context tokens (12288 specified) system_info: n_threads = 4 (n_threads_batch = 4) / 4 | CPU : NEON = 1 | ARM_FMA = 1 | LLAMAFILE = 1 | AARCH64_REPACK = 1 | sampler seed: 177927434 sampler params: repeat_last_n = 5, repeat_penalty = 1.500, frequency_penalty = 0.000, presence_penalty = 0.000 dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 12288 top_k = 50, top_p = 0.900, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.000 mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 sampler chain: logits -> logit-bias -> penalties -> dry -> top-n-sigma -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist generate: n_ctx = 12288, n_batch = 1024, n_predict = 10, n_keep = 1 my cat's name was Tim. He loved to play with his toy llama_perf_sampler_print: sampling time = 195.98 ms / 16 runs ( 12.25 ms per token, 81.64 tokens per second) llama_perf_context_print: load time = 1577.27 ms llama_perf_context_print: prompt eval time = 305.19 ms / 6 tokens ( 50.86 ms per token, 19.66 tokens per second) llama_perf_context_print: eval time = 803.59 ms / 9 runs ( 89.29 ms per token, 11.20 tokens per second) llama_perf_context_print: total time = 2628.44 ms / 15 tokens TXE_ADD Operation, total tensor: 10 Number of Kernel Call: 10 Number of tensor got spilt: 0 Min Num of Elem 64 Max Num of Elem 64 TXE_SUB Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_MULT Operation, total tensor: 170 Number of Kernel Call: 245 Number of tensor got spilt: 0 Min Num of Elem 64 Max Num of Elem 384 TXE_DIV Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SQRT Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_NEG Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_ABS Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SIN Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 TXE_SIGMOID Operation, total tensor: 0 Number of Kernel Call: 0 Number of tensor got spilt: 0 Min Num of Elem 0 Max Num of Elem 0 [2018-03-09 13:52:32.222949] 271:272 [ info] :: TXE resource release request processed successfully. 
GGML Tsavorite Profiling Results: ------------------------------------------------------------------------------------------------------------------------ Calls Total(ms) T/call Self(ms) Function ------------------------------------------------------------------------------------------------------------------------ 255 255.000 1.000 0.000 [ 7%] RuntimeHostShim::awaitCommandListCompletion 245 379.466 1.549 379.466 └─ [11%] [ txe_mult_blob ] 10 15.443 1.544 15.443 └─ [ 0%] [ txe_add_blob ] 1 35.000 35.000 35.000 [ 1%] RuntimeHostShim::finalize 1 19.000 19.000 2.000 [ 1%] GGML Tsavorite 1 17.000 17.000 17.000 └─ [ 0%] RuntimeHostShim::initialize 256 0.000 0.000 0.000 [ 0%] RuntimeHostShim::allocate 1020 0.000 0.000 0.000 [ 0%] RuntimeHostShim::getShmemManager 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::createCommandList 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::loadBlob 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::launchBlob 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::addCommandToList 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::finalizeCommandList 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::unloadBlob 255 0.000 0.000 0.000 [ 0%] RuntimeHostShim::deallocate ======================================================================================================================== 3318 3529.000 1.064 3529.000 [100%] TOTAL ======================================================================================================================== root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv30_05_24_2025/bin# --- ggml/src/ggml-tsavorite/CMakeLists.txt | 1 + ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-tsavorite/CMakeLists.txt b/ggml/src/ggml-tsavorite/CMakeLists.txt index f58331fd68d30..323c37df14a8b 100644 --- a/ggml/src/ggml-tsavorite/CMakeLists.txt +++ b/ggml/src/ggml-tsavorite/CMakeLists.txt @@ -1,6 +1,7 @@ message(STATUS "Tsavorite framework is found") # # tsavorite Kernel Library +add_compile_options(--std=c++20) ggml_add_backend_library(ggml-tsavorite ggml-tsavorite.cpp ) diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index 7939a0f8cfa13..e359906b61ce6 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -20,11 +20,15 @@ #include #include #include - +#include #include "ggml-backend-impl.h" #include "ggml-impl.h" #include "ggml.h" +#include "HostShimCAPI.h" +#include "tsi-rt/utils/Profiler.h" +using namespace std; +namespace tsirt = ::tsi::runtime; typedef struct _txe_device_t *txe_device_s; typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s; FILE *tsi_op_log_file; @@ -513,8 +517,12 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d if (tsi_log_setup() == false) return NULL; + std::string mainProfilerName = "GGML Tsavorite "; + tsirt::utils::TSIProfiler::initialize(); + tsirt::utils::TSIScopedProfiler mainProfiler(mainProfilerName); + // TSI Run time Initalization - tsi_initialize(NUM_OF_TXES); + tsi_initialize(NUM_OF_TXES, NULL); // init context struct ggml_backend_tsavorite_context *ctx = (struct ggml_backend_tsavorite_context *)calloc( @@ -615,6 +623,11 @@ static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) { sleep(2); tsi_finalize(); GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); + tsirt::utils::TSIProfiler::finalize(); + std::cout << "\nGGML Tsavorite Profiling Results:" << std::endl; + std::cout 
<< tsirt::utils::TSIProfiler::getFormattedResults( + /*truncateFuncNames*/ true) + << std::endl; } #if 0 From 9d65b92953fe3f674dd7a5d51a12e900cdc8682c Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Thu, 29 May 2025 13:23:31 -0700 Subject: [PATCH 07/35] @FIR-709 - GGML: Adding SILU Kernel --- README.md | 49 +++++++++++++++++ docs/build.md | 63 ++++++++++++++++++++++ ggml-tsi-kernel | 2 +- ggml/include/ggml-tsavorite.h | 2 + ggml/src/ggml-backend.cpp | 7 ++- ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 30 +++++++---- tsi-pkg-build.sh | 10 ++-- 7 files changed, 147 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index d1cb8d8336229..d8371a472a675 100644 --- a/README.md +++ b/README.md @@ -580,3 +580,52 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html) + +#### TSI compilation steps +```bash +#Pull the repo frim tsisw as follows +git clone git@github.com:tsisw/llama.cpp.git -b FIR-699 + +#Ensure prerequisites are met as follows +cd llama.cpp/ +git submodule update --recursive --init +cd ggml-tsi-kernel/ +module load tsi4 gcc/13.3.0 +python3 -m venv blob-creation +source blob-creation/bin/activate +pip install -r /proj/rel/sw/mlir-compiler/python/requirements-common.txt +pip install /proj/rel/sw/mlir-compiler/python/mlir_external_packages-1.2.1-py3-none-any.whl +pip install onnxruntime-training + +#build TSI kernels for the Tsavorite backend +#First for FPGA +cd fpga-kernel +cmake -B build-fpga +./create-all-kernels.sh +#The for Posix Use cases +cd ../posix-kernel/ +./create-all-kernels.sh + +#Change directory to top level llama.cpp +cd ../../ + +#Compile for posix with build-posix as a target folder + +cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix +cmake --build build-posix --config Release + +#Compile for fpga with build-fpga as a target folder +export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc" +export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++" +cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga +cmake --build build-fpga --config Release + +#For easy build one can also use which creates a FPGA specific tar bundle tsi-ggml.tz +#If you want to release the build update the TSI-VERSION in the file tsi-pkg-build.sh and add Release as parameter +#when running ./tsi-pkg-build.sh (Note it will overwrite what exists in /proj/rel/sw/ggml so be sure you want to do +#it. Example ./tsi-pkg-build.sh release +./tsi-pkg-build.sh + +``` + +## References diff --git a/docs/build.md b/docs/build.md index c9027c0b580a5..1685adbc916bc 100644 --- a/docs/build.md +++ b/docs/build.md @@ -559,3 +559,66 @@ The GPU may still be used to accelerate some parts of the computation even when In most cases, it is possible to build and use multiple backends at the same time. For example, you can build llama.cpp with both CUDA and Vulkan support by using the `-DGGML_CUDA=ON -DGGML_VULKAN=ON` options with CMake. At runtime, you can specify which backend devices to use with the `--device` option. 
+
+
+## TSI compilation steps
+
+Following are the instructions to compile for the TSI FPGA and Posix backends.
+
+Pull the repo from tsisw as follows
+```bash
+git clone git@github.com:tsisw/llama.cpp.git -b FIR-699
+```
+
+Ensure prerequisites are met as follows
+```bash
+cd llama.cpp/
+git submodule update --recursive --init
+cd ggml-tsi-kernel/
+module load tsi4 gcc/13.3.0
+python3 -m venv blob-creation
+source blob-creation/bin/activate
+pip install -r /proj/rel/sw/mlir-compiler/python/requirements-common.txt
+pip install /proj/rel/sw/mlir-compiler/python/mlir_external_packages-1.2.1-py3-none-any.whl
+pip install onnxruntime-training
+```
+
+Build the TSI kernels for the Tsavorite backend, first for FPGA
+```bash
+cd fpga-kernel
+cmake -B build-fpga
+./create-all-kernels.sh
+```
+Then for Posix use cases
+```bash
+cd ../posix-kernel/
+./create-all-kernels.sh
+```
+
+Change directory to top level llama.cpp
+```bash
+cd ../../
+```
+
+Compile for posix with build-posix as a target folder
+```bash
+cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix
+cmake --build build-posix --config Release
+```
+
+Compile for fpga with build-fpga as a target folder
+```bash
+export CC="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-gcc"
+export CXX="/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/aarch64-none-linux-gnu-g++"
+cmake -B build-fpga -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=fpga
+cmake --build build-fpga --config Release
+```
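+A quick sanity check for the cross-build (a sketch; `file` is the standard
+utility, and the path assumes the build tree above):
+
+```bash
+file build-fpga/bin/llama-cli   # should report an ARM aarch64 ELF binary
+```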
+For an easy build one can also use tsi-pkg-build.sh, which creates an FPGA-specific tar bundle tsi-ggml.tz.
+If you want to release the build, update TSI_GGML_VERSION in tsi-pkg-build.sh and add Release as a parameter
+when running ./tsi-pkg-build.sh (note it will overwrite what exists in /proj/rel/sw/ggml, so be sure you want
+to do it). Example: ./tsi-pkg-build.sh release
+
+```bash
+./tsi-pkg-build.sh
+```
diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel
index f7a3ac1ee334c..d1383a04f29d0 160000
--- a/ggml-tsi-kernel
+++ b/ggml-tsi-kernel
@@ -1 +1 @@
-Subproject commit f7a3ac1ee334c242958ccb2053ecc4854822d87e
+Subproject commit d1383a04f29d0160750c0e51ab524d461c6a127b
diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h
index cd380ddf61ed3..54a8e34662799 100644
--- a/ggml/include/ggml-tsavorite.h
+++ b/ggml/include/ggml-tsavorite.h
@@ -127,6 +127,7 @@ enum ggml_tsavorite_kernel_type {
   GGML_TSAVORITE_KERNEL_TYPE_ABS,
   GGML_TSAVORITE_KERNEL_TYPE_SIN,
   GGML_TSAVORITE_KERNEL_TYPE_SIGMOID,
+  GGML_TSAVORITE_KERNEL_TYPE_SILU,
   GGML_TSAVORITE_KERNEL_TYPE_COUNT
 };
@@ -159,6 +160,7 @@ extern void _mlir_ciface_txe_neg(void *a, void *res);
 extern void _mlir_ciface_txe_abs(void *a, void *res);
 extern void _mlir_ciface_txe_sin(void *a, void *res);
 extern void _mlir_ciface_txe_sigmoid(void *a, void *res);
+extern void _mlir_ciface_txe_silu(void *a, void *res);
 extern void ggml_tsi_log_tensor_data(tensor_log log_data);
 #define NUM_OF_TXES 1
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index b30b4cb386f9f..1238093e41c81 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -939,8 +939,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else if (cur_backend_id != -1) {
-                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
+            // The code below is an optimization that is disabled for now, since the other
+            // operations are not yet implemented on the tsavorite backend
+            } else if (cur_backend_id != -1 || (node->op == GGML_OP_UNARY)) {
+                //ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
+                ggml_backend_sched_set_if_supported(sched, node, 0, node_backend_id);
             }
         }
     }
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
index e359906b61ce6..573220c8a7027 100644
--- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -436,6 +436,11 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
       kernel_pipeline->kernel_name = "TXE_SIGMOID";
       flag = true;
       break;
+    case GGML_TSAVORITE_KERNEL_TYPE_SILU:
+      kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_silu;
+      kernel_pipeline->kernel_name = "TXE_SILU";
+      flag = true;
+      break;
     default:
       break;
   }
@@ -580,15 +585,16 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d
     GGML_TSAVORITE_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_" #e); \
   }
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ADD, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SUB, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_MULT, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_DIV, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SQRT, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_NEG, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ABS, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIN, true);
-  GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, true);
+   GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ADD, true);
+   GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SUB, true);
+   GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_MULT, true);
+   GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_DIV, true); +
GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SQRT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_NEG, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_ABS, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIN, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SILU, true); } GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); @@ -695,6 +701,7 @@ static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_devic case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_ABS: case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_SILU: break; default: return false; @@ -852,6 +859,10 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIGMOID; num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; break; + case GGML_UNARY_OP_SILU: + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SILU; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; default: ggml_backend_tsavorite_device_rel( (struct ggml_backend_tsavorite_device_context *)backend->device->context); @@ -1806,6 +1817,7 @@ static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_ABS: case GGML_UNARY_OP_SIGMOID: + case GGML_UNARY_OP_SILU: break; default: return false; diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index b6b998671544c..4d6a8c736a5a8 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -67,10 +67,12 @@ fi cat > ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL #!/bin/bash export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:\$(pwd) -mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_mult -mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_add -cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_mult/ -r -cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_add/ -r +tsi_kernels=("add" "sub" "mult" "div" "abs" "inv" "neg" "sin" "sqrt" "sigmoid" "silu") + +for kernel in "${tsi_kernels[@]}"; do + mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_$kernel + cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_$kernel/ -r +done EOL chmod +x ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh cp ${GGML_TSI_INSTALL_DIR}/fpga/blobs ${TSI_GGML_BUNDLE_INSTALL_DIR}/ -r From f919789e23efb9270616cff3970e6784f3fe5119 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Thu, 29 May 2025 14:17:33 -0700 Subject: [PATCH 08/35] @FIR-709: Fixed the script --- tsi-pkg-build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index 4d6a8c736a5a8..2dd5f048871b7 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -64,14 +64,14 @@ if [ -e ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh ]; then rm -fr ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh fi -cat > ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL +cat > ./${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh << EOL #!/bin/bash export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:\$(pwd) tsi_kernels=("add" "sub" "mult" "div" "abs" "inv" "neg" "sin" "sqrt" "sigmoid" "silu") -for kernel in "${tsi_kernels[@]}"; do - mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_$kernel - cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_$kernel/ -r +for kernel in "\${tsi_kernels[@]}"; do + mkdir -p ${TSI_BLOB_INSTALL_DIR}/txe_\$kernel + cp blobs ${TSI_BLOB_INSTALL_DIR}/txe_\$kernel/ -r done EOL chmod +x ${TSI_GGML_BUNDLE_INSTALL_DIR}/ggml.sh From 9459c0c16b7e6ed83095e5c7b270069c24a56e7f Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Tue, 3 Jun 2025 09:39:54 -0700 Subject: [PATCH 09/35] @FIR-714: Updated SDK version to r0.1.3 version --- CMakeLists.txt | 6 ++++-- ggml-tsi-kernel | 2 +- 
tsi-pkg-build.sh | 7 ++++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e047785e603d6..d1986def391fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,8 @@ if (GGML_TSAVORITE) if (NOT DEFINED MLIR_COMPILER_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (MLIR_COMPILER_DIR /proj/work/rel/sw/sdk-r.0.1.2/compiler) + set (MLIR_COMPILER_DIR /proj/rel/sw/sdk-r.0.1.3/compiler) + message("MLIR_SDK_VERSION not set defaulting to ${MLIR_COMPILER_DIR}") else() set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) endif() @@ -23,7 +24,8 @@ if (GGML_TSAVORITE) if (NOT DEFINED RUNTIME_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) - set (RUNTIME_DIR /proj/work/rel/sw/sdk-r.0.1.2/${GGML_TSAVORITE_TARGET}/runtime) + set (RUNTIME_DIR /proj/rel/sw/sdk-r.0.1.3/${GGML_TSAVORITE_TARGET}/runtime) + message("MLIR_SDK_VERSION not set defaulting to ${RUNTIME_DIR}") else() set (RUNTIME_DIR $ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) endif() diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel index d1383a04f29d0..9dcf09f210636 160000 --- a/ggml-tsi-kernel +++ b/ggml-tsi-kernel @@ -1 +1 @@ -Subproject commit d1383a04f29d0160750c0e51ab524d461c6a127b +Subproject commit 9dcf09f2106364d0dafa54bce743d1c11b701112 diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index 2dd5f048871b7..2a2c0afe462a3 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -6,12 +6,14 @@ echo 'updating submodule' git submodule update --recursive --init cd ggml-tsi-kernel/ module load tsi4 gcc/13.3.0 +export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.3 echo 'creating python virtual env' +/proj/local/Python-3.10.12/bin/python3 -m venv blob-creation python3 -m venv blob-creation source blob-creation/bin/activate echo 'installing mlir and python dependencies' -pip install -r /proj/rel/sw/mlir-compiler/python/requirements-common.txt -pip install /proj/rel/sw/mlir-compiler/python/mlir_external_packages-1.2.1-py3-none-any.whl +pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt +pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.2.3-py3-none-any.whl pip install onnxruntime-training #build TSI kernels for the Tsavorite backend @@ -31,7 +33,6 @@ cd ../posix-kernel/ cd ../../ -export MLIR_SDK_VERSION=/proj/work/rel/sw/sdk-r.0.1.2 #Compile for posix with build-posix as a target folder echo 'building llama.cp, ggml for tsavorite and other binary for posix' From c18585c2fa0c09015e3d2b08861bf704361889ff Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Tue, 3 Jun 2025 11:49:25 -0700 Subject: [PATCH 10/35] @FIR-714: Updated TLIBS to be passed to llama_build function --- tests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 31fa312f65da6..ade78632c7352 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,7 +8,7 @@ function(llama_build source) endif() add_executable(${TEST_TARGET} ${source}) - target_link_libraries(${TEST_TARGET} PRIVATE common) + target_link_libraries(${TEST_TARGET} PRIVATE common ${TLIBS}) install(TARGETS ${TEST_TARGET} RUNTIME) endfunction() @@ -169,7 +169,7 @@ endif() # libmtmd set(LLAMA_TEST_NAME test-mtmd-c-api) llama_build_and_test(test-mtmd-c-api.c) -target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) +target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd ${TLIBS}) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) From 47ceff08be48672c997ee3b3242fdf504cb54696 Mon Sep 17 
00:00:00 2001 From: Ashish Trivedi Date: Wed, 4 Jun 2025 10:50:11 -0700 Subject: [PATCH 11/35] @FIR-714: Updated to use 1.30 external dependencies --- CMakeLists.txt | 4 ++-- tests/CMakeLists.txt | 2 +- tsi-pkg-build.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d1986def391fa..6af525e29cdbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,14 +35,14 @@ if (GGML_TSAVORITE) set (GGML_TSI_KERNEL_DIR ${CMAKE_SOURCE_DIR}/ggml-tsi-kernel/${GGML_TSAVORITE_TARGET}) endif() - file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) set(CMAKE_CROSSCOMPILING ON) set(ARCH_FLAGS -march=armv8-a) + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") message("Setting target as fpga") elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") - list(APPEND TLIBS "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so") + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "${MLIR_COMPILER_DIR}/lib/libTsavRTPosixShimCAPI.so") message("Setting target as posix for tsavorite") endif() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ade78632c7352..6ffd975127e8e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -69,7 +69,7 @@ function(llama_build_and_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE common) + target_link_libraries(${TEST_TARGET} PRIVATE common ${TLIBS}) add_test( NAME ${TEST_TARGET} diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index 2a2c0afe462a3..488d98abb035b 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -13,7 +13,7 @@ python3 -m venv blob-creation source blob-creation/bin/activate echo 'installing mlir and python dependencies' pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt -pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.2.3-py3-none-any.whl +pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.3.0-py3-none-any.whl pip install onnxruntime-training #build TSI kernels for the Tsavorite backend From 2ea9390d102562f36d655638f0e7345d30c48ccd Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Wed, 4 Jun 2025 13:29:14 -0700 Subject: [PATCH 12/35] @FIR-714: Addressed build failures for posix, FPGA still fails as follows /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: /proj/work/atrivedi/workspace/06_02_2025/llama.cpp/ggml-tsi-kernel/fpga/host/host_abs.o: in function `txe_abs_host': LLVMDialectModule:(.text+0x18): undefined reference to `tsi_alloc' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x24): undefined reference to `tsi_shmem_handle_from_ptr' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x30): undefined reference to `tsi_shmem_handle_from_ptr' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x3c): undefined reference to `tsi_create_command_list' 
/proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x58): undefined reference to `tsi_load_blob' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x64): undefined reference to `tsi_shmem_handle_from_ptr' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x70): undefined reference to `tsi_launch_blob' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x7c): undefined reference to `tsi_add_command_to_list' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x84): undefined reference to `tsi_finalize_command_list' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x8c): undefined reference to `tsi_wait' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x94): undefined reference to `tsi_unload_blob' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0xa0): undefined reference to `tsi_dealloc' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: /proj/work/atrivedi/workspace/06_02_2025/llama.cpp/ggml-tsi-kernel/fpga/host/host_add.o: in function `txe_add_host': LLVMDialectModule:(.text+0x20): undefined reference to `tsi_alloc' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x2c): undefined reference to `tsi_shmem_handle_from_ptr' /proj/rel/sw/arm-gnu-toolchain-14.2.rel1-x86_64-aarch64-none-linux-gnu/bin/../lib/gcc/aarch64-none-linux-gnu/14.2.1/../../../../aarch64-none-linux-gnu/bin/ld: LLVMDialectModule:(.text+0x38): undefined reference to `tsi_shmem_handle_from_ptr' --- CMakeLists.txt | 2 +- ggml/include/ggml-tsavorite.h | 20 ++++++++++---------- ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 20 ++++++++++---------- tests/CMakeLists.txt | 8 ++++---- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6af525e29cdbc..2eebb65851cad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ if (GGML_TSAVORITE) if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) set(CMAKE_CROSSCOMPILING ON) set(ARCH_FLAGS -march=armv8-a) - file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") message("Setting target as fpga") elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" 
"${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "${MLIR_COMPILER_DIR}/lib/libTsavRTPosixShimCAPI.so") diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h index 54a8e34662799..238dcc428da88 100644 --- a/ggml/include/ggml-tsavorite.h +++ b/ggml/include/ggml-tsavorite.h @@ -151,16 +151,16 @@ typedef struct tensor_log_ { const ggml_tensor *tensor; } tensor_log; -extern void _mlir_ciface_txe_add(void *a, void *b, void *res); -extern void _mlir_ciface_txe_sub(void *a, void *b, void *res); -extern void _mlir_ciface_txe_mult(void *a, void *b, void *res); -extern void _mlir_ciface_txe_div(void *a, void *b, void *res); -extern void _mlir_ciface_txe_sqrt(void *a, void *res); -extern void _mlir_ciface_txe_neg(void *a, void *res); -extern void _mlir_ciface_txe_abs(void *a, void *res); -extern void _mlir_ciface_txe_sin(void *a, void *res); -extern void _mlir_ciface_txe_sigmoid(void *a, void *res); -extern void _mlir_ciface_txe_silu(void *a, void *res); +extern void _mlir_ciface_txe_add_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_sub_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_mult_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_div_host(void *a, void *b, void *res); +extern void _mlir_ciface_txe_sqrt_host(void *a, void *res); +extern void _mlir_ciface_txe_neg_host(void *a, void *res); +extern void _mlir_ciface_txe_abs_host(void *a, void *res); +extern void _mlir_ciface_txe_sin_host(void *a, void *res); +extern void _mlir_ciface_txe_sigmoid_host(void *a, void *res); +extern void _mlir_ciface_txe_silu_host(void *a, void *res); extern void ggml_tsi_log_tensor_data(tensor_log log_data); #define NUM_OF_TXES 1 diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index 573220c8a7027..bc7095eeebf2f 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -389,12 +389,12 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_ if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add_test; else - kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add; + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_add_host; kernel_pipeline->kernel_name = "TXE_ADD"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SUB: - kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_sub; + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_sub_host; kernel_pipeline->kernel_name = "TXE_SUB"; flag = true; break; @@ -402,42 +402,42 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_ if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU) kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult_test; else - kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult; + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_mult_host; kernel_pipeline->kernel_name = "TXE_MULT"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_DIV: - kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_div; + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_div_host; kernel_pipeline->kernel_name = "TXE_DIV"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SQRT: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sqrt; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sqrt_host; kernel_pipeline->kernel_name = "TXE_SQRT"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_NEG: - 
kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_neg; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_neg_host; kernel_pipeline->kernel_name = "TXE_NEG"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_ABS: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_abs; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_abs_host; kernel_pipeline->kernel_name = "TXE_ABS"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SIN: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sin; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sin_host; kernel_pipeline->kernel_name = "TXE_SIN"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sigmoid; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_sigmoid_host; kernel_pipeline->kernel_name = "TXE_SIGMOID"; flag = true; break; case GGML_TSAVORITE_KERNEL_TYPE_SILU: - kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_silu; + kernel_pipeline->_mlir_fptr_1_input = &_mlir_ciface_txe_silu_host; kernel_pipeline->kernel_name = "TXE_SILU"; flag = true; break; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6ffd975127e8e..c2b5cc88ad330 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,7 +8,7 @@ function(llama_build source) endif() add_executable(${TEST_TARGET} ${source}) - target_link_libraries(${TEST_TARGET} PRIVATE common ${TLIBS}) + target_link_libraries(${TEST_TARGET} PRIVATE common) install(TARGETS ${TEST_TARGET} RUNTIME) endfunction() @@ -69,7 +69,7 @@ function(llama_build_and_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE common ${TLIBS}) + target_link_libraries(${TEST_TARGET} PRIVATE common) add_test( NAME ${TEST_TARGET} @@ -169,9 +169,9 @@ endif() # libmtmd set(LLAMA_TEST_NAME test-mtmd-c-api) llama_build_and_test(test-mtmd-c-api.c) -target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd ${TLIBS}) +target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) -target_link_libraries(${TEST_TARGET} PRIVATE llama ${TLIBS}) +target_link_libraries(${TEST_TARGET} PRIVATE ${TLIBS} llama) From cea50afa37a04114429b07abf9e6849c0d3a2ecb Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Wed, 4 Jun 2025 18:06:58 -0700 Subject: [PATCH 13/35] @FIR-714: Fixed the issues of not finding fpga libs using runtime/utils/lib/ path --- CMakeLists.txt | 6 +++--- tests/CMakeLists.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2eebb65851cad..a4d51cdbe2dc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED MLIR_COMPILER_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) set (MLIR_COMPILER_DIR /proj/rel/sw/sdk-r.0.1.3/compiler) - message("MLIR_SDK_VERSION not set defaulting to ${MLIR_COMPILER_DIR}") + message("MLIR_SDK_VERSION not set defaulting to ${MLIR_COMPILER_DIR}") else() set (MLIR_COMPILER_DIR $ENV{MLIR_SDK_VERSION}/compiler) endif() @@ -25,7 +25,7 @@ if (GGML_TSAVORITE) if (NOT DEFINED RUNTIME_DIR) if (NOT DEFINED $ENV{MLIR_SDK_VERSION}) set (RUNTIME_DIR /proj/rel/sw/sdk-r.0.1.3/${GGML_TSAVORITE_TARGET}/runtime) - message("MLIR_SDK_VERSION not set defaulting to ${RUNTIME_DIR}") + message("MLIR_SDK_VERSION not set defaulting to ${RUNTIME_DIR}") else() set (RUNTIME_DIR 
$ENV{MLIR_SDK_VERSION}/${GGML_TSAVORITE_TARGET}/runtime) endif() @@ -39,7 +39,7 @@ if (GGML_TSAVORITE) if (${GGML_TSAVORITE_TARGET} STREQUAL fpga) set(CMAKE_CROSSCOMPILING ON) set(ARCH_FLAGS -march=armv8-a) - file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o") + file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" "${RUNTIME_DIR}/../utils/lib/TsavRTShimCAPI.cpp.o") message("Setting target as fpga") elseif (${GGML_TSAVORITE_TARGET} STREQUAL "posix") file(GLOB TLIBS "${RUNTIME_DIR}/lib/*.so" "${GGML_TSI_KERNEL_DIR}/host/*.o" "${MLIR_COMPILER_DIR}/lib/libFFMDeviceShim.so" "${MLIR_COMPILER_DIR}/lib/libTsavRTPosixShimCAPI.so") diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c2b5cc88ad330..1c8b8e29a822e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -174,4 +174,4 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) -target_link_libraries(${TEST_TARGET} PRIVATE ${TLIBS} llama) +target_link_libraries(${TEST_TARGET} PRIVATE ${TLIBS} llama stdc++) From bbecb0102ccb617bb88aac21055a589cb5c3ef6c Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Fri, 6 Jun 2025 20:50:10 -0700 Subject: [PATCH 14/35] Updated README --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 30f91f4696a33..f222c9a1a8ae1 100644 --- a/README.md +++ b/README.md @@ -588,15 +588,22 @@ git clone git@github.com:tsisw/llama.cpp.git #Ensure prerequisites are met as follows cd llama.cpp/ +#Ensure prerequisites are met as follows +echo 'updating submodule' git submodule update --recursive --init cd ggml-tsi-kernel/ module load tsi4 gcc/13.3.0 -python3 -m venv blob-creation +export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.3 +echo 'creating python virtual env' +/proj/local/Python-3.10.12/bin/python3 -m venv blob-creation source blob-creation/bin/activate -pip install -r /proj/rel/sw/sdk-r.0.1.3/compiler/python/requirements-common.txt -pip install /proj/rel/sw/sdk-r.0.1.3/compiler/python/mlir_external_packages-1.3.0-py3-none-any.whl +echo 'installing mlir and python dependencies' +pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt +pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.3.0-py3-none-any.whl pip install onnxruntime-training + + #build TSI kernels for the Tsavorite backend #First for FPGA cd fpga-kernel From d7685c7ec3a83534bb699c6cfc56d28f78134930 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Sun, 8 Jun 2025 12:38:56 -0700 Subject: [PATCH 15/35] @FIR-722 --updating the latest changes for ggml-tsi-kernel code --- ggml-tsi-kernel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel index 0696a053d553c..3194c54b13b8c 160000 --- a/ggml-tsi-kernel +++ b/ggml-tsi-kernel @@ -1 +1 @@ -Subproject commit 0696a053d553c99488c1f7d60a859afdce9712df +Subproject commit 3194c54b13b8cd0b5c29a6a1cc0060ae2abbed06 From 9688963ed09199949b6c38dad67a1b061b38f933 Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Tue, 10 Jun 2025 11:33:49 -0700 Subject: [PATCH 16/35] @FIR-715: Added FlaskInterface tool for serial port This is a first version of FlaskInterface tool with following 1. Xterm Interface via Browser via /terminal endpoint 2. 
Serial console interface via Browser via /serial endpoint --- tools/flaskIfc/README.md | 46 ++++++++++++++++++ tools/flaskIfc/flaskCommon.py | 85 +++++++++++++++++++++++++++++++++ tools/flaskIfc/flaskIfc.py | 32 +++++++++++++ tools/flaskIfc/flaskXterm.py | 43 +++++++++++++++++ tools/flaskIfc/serial_script.py | 44 +++++++++++++++++ 5 files changed, 250 insertions(+) create mode 100644 tools/flaskIfc/README.md create mode 100644 tools/flaskIfc/flaskCommon.py create mode 100644 tools/flaskIfc/flaskIfc.py create mode 100644 tools/flaskIfc/flaskXterm.py create mode 100644 tools/flaskIfc/serial_script.py diff --git a/tools/flaskIfc/README.md b/tools/flaskIfc/README.md new file mode 100644 index 0000000000000..4893dc75147c8 --- /dev/null +++ b/tools/flaskIfc/README.md @@ -0,0 +1,46 @@ +This tool provides you an interface to Tsavorite FPGA via a serial console + +The tool consists of following files + +. +├── flaskCommon.py << Common code but currently not used +├── flaskIfc.py << Browser based console interface to TSI device +├── flaskXterm.py << Browser based terminal emulation +├── README.md << Readme file +└── serial_script.py << File with serial interface to console + + +The command to run to run the service on FPGA machine is +``` +flask -A flaskIfc.py --debug run --port 5000 +``` + +This command runs a webserver at port number 500 + +The curl command to connect to this server and communicate is as follows as +an example + +``` +curl "http://localhost:5000/serial?command=cd+%20/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/;./run_platform_test.sh" +``` + +In the above command the command being run is + +``` +cd /usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin +./run_platform_test.sh +``` + +You can also get full fledged Terminal within a browser by running following + +``` +flask -A flaskXterm.py --debug run --port 5000 +``` + +You can connect to this flaskTerm by doing as follows + +``` +http://127.0.0.1:5000/terminal +``` + + diff --git a/tools/flaskIfc/flaskCommon.py b/tools/flaskIfc/flaskCommon.py new file mode 100644 index 0000000000000..eb93a63fcf395 --- /dev/null +++ b/tools/flaskIfc/flaskCommon.py @@ -0,0 +1,85 @@ + +from flask import Flask +from flask_terminal import terminal_blueprint, configure_logger +from flask import Flask, render_template, request +import serial + + +app = Flask(__name__) +app.logger = configure_logger('flask_terminal') + +app.config['SECRET_KEY'] = 'your_secret_key_here' + + +@app.route('/ping') +def ping(): + app.logger.info("Accessed /ping route") + try: + app.logger.info("Successfully returned 'pong'") + return 'pong', 200 + except Exception as e: + app.logger.error(f"Error in ping route: {e}", exc_info=True) + return "An error occurred", 500 + +#### +## IMPLEMENT SOME SORT OF SECURITY +## Around your application, below is an example +### +def is_authenticated(): + """Check if the user is authenticated based on a token stored in the session.""" + # Example logic for checking if a user is authenticated + return 'user_token' in session and session['user_token'] == 'your_secure_token' + +#@terminal_blueprint.before_request +#def before_request_func(): +# if not is_authenticated(): + # Redirect to login page or return an error +# current_app.logger.info("User not authenticated, redirecting to login.") +# return redirect('/login') # Adjusted to use a direct path + + +# Register the terminal blueprint +#app.register_blueprint(terminal_blueprint, url_prefix='/terminal') + +#if __name__ == '__main__': +# app.run(port=8080) + + + +try: + ser = serial.Serial('/dev/ttyUSB3', 
921600) # Replace /dev/ttyUSB3 with your port and baud rate +except serial.SerialException as e: + print(f"Error opening serial port: {e}") + ser = None # Handle case where serial port cannot be opened + +@app.route('/send', methods=['POST']) +def send_data(): + if ser is None: + return "Serial port not available", 500 + data = request.form['data'] # Get data from the form + try: + ser.write(data.encode()) # Convert to bytes and send + return 'Data sent successfully' + except serial.SerialException as e: + return f"Error writing to serial port: {e}", 500 + + +@app.route('/receive') +def receive_data(): + if ser is None: + return "Serial port not available", 500 + try: + if ser.in_waiting > 0: + data = ser.readline().decode().strip() # Read and decode + return data + else: + return "No data available" + except serial.SerialException as e: + return f"Error reading from serial port: {e}", 500 + +# Register the terminal blueprint +#app.register_blueprint(terminal_blueprint, url_prefix='/terminal') + +if __name__ == '__main__': + app.run(port=8080) + diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py new file mode 100644 index 0000000000000..61187c91a09d4 --- /dev/null +++ b/tools/flaskIfc/flaskIfc.py @@ -0,0 +1,32 @@ +from flask import Flask, request +import subprocess + +app = Flask(__name__) + +@app.route('/serial', methods=['GET']) +def serial_command(): + # Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized + port = '/dev/ttyUSB3' + #port = request.args.get('port') + + # Currently the baudrate is hard coded to 921600 but can be parameterized + #baudrate = request.args.get('baudrate') + baudrate = '921600' + + + # Parse the command and send it to serial.py + command = request.args.get('command') + + #if not all([port, baudrate, command]): + if not all([command]): + return "Missing parameters", 400 + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout.strip(), 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + + +if __name__ == '__main__': + app.run(debug=True, port=5000) diff --git a/tools/flaskIfc/flaskXterm.py b/tools/flaskIfc/flaskXterm.py new file mode 100644 index 0000000000000..df7ecf391471d --- /dev/null +++ b/tools/flaskIfc/flaskXterm.py @@ -0,0 +1,43 @@ + +from flask import Flask +from flask_terminal import terminal_blueprint, configure_logger + + +app = Flask(__name__) +app.logger = configure_logger('flask_terminal') + +app.config['SECRET_KEY'] = 'your_secret_key_here' + + +@app.route('/ping') +def ping(): + app.logger.info("Accessed /ping route") + try: + app.logger.info("Successfully returned 'pong'") + return 'pong', 200 + except Exception as e: + app.logger.error(f"Error in ping route: {e}", exc_info=True) + return "An error occurred", 500 + +#### +## IMPLEMENT SOME SORT OF SECURITY +## Around your application, below is an example +### +def is_authenticated(): + """Check if the user is authenticated based on a token stored in the session.""" + # Example logic for checking if a user is authenticated + return 'user_token' in session and session['user_token'] == 'your_secure_token' + +#@terminal_blueprint.before_request +#def before_request_func(): +# if not is_authenticated(): + # Redirect to login page or return an error +# current_app.logger.info("User not authenticated, redirecting to login.") +# return redirect('/login') # Adjusted to use a direct path + + +# Register the 
terminal blueprint +app.register_blueprint(terminal_blueprint, url_prefix='/terminal') + +if __name__ == '__main__': + app.run(port=8080) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py new file mode 100644 index 0000000000000..e61f926d0bf47 --- /dev/null +++ b/tools/flaskIfc/serial_script.py @@ -0,0 +1,44 @@ +import serial +import sys + +def send_serial_command(port, baudrate, command): + try: + # Open the serial port with 1 second timeout + ser = serial.Serial(port, baudrate, timeout=1) + + ser.write(command.encode()) # Encode command to bytes + ser.write('\n'.encode()) # Encode command to bytes + + # Wait to read the serial port + data = '\0' + while True: + try: + line = ser.readline() + if line: # Check if line is not empty + data += (line.decode('utf-8').strip()) # Decode and strip to remove extra chars + else: + break # Exit loop if no data is received + except serial.SerialException as e: + ser.close() + return (f"Error reading from serial port: {e}") + except KeyboardInterrupt: + ser.close() + return ("Program interrupted by user") + ser.close() + return data + + except serial.SerialException as e: + ser.close() + return f"Error: {e}" + +# This script can be run in standalone as well +if __name__ == "__main__": + if len(sys.argv) < 4: + print("Usage: python script.py ") + sys.exit(1) + + port = sys.argv[1] + baudrate = int(sys.argv[2]) + command = sys.argv[3] + response = send_serial_command(port, baudrate, command) + print(response) From a4b77bfd0006da521c1fb7624791a8f45d9f3227 Mon Sep 17 00:00:00 2001 From: LewisLui777 <777abc.7@berkeley.edu> Date: Wed, 11 Jun 2025 10:20:02 -0700 Subject: [PATCH 17/35] Just wanted to see if I could push. Added one comment --- tools/flaskIfc/serial_script.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index b30e6fae1dade..e138d19ab7de0 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -10,6 +10,7 @@ def send_serial_command(port, baudrate, command): ser.write('\n'.encode()) # Encode command to bytes # Wait to read the serial port + # Need to add a break somewhere for when we see the phrase "root@name" data = '\0' while True: try: From 21ba6d11ca8f2c95ed8110d2c570550a1288a43e Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Wed, 11 Jun 2025 16:49:18 -0700 Subject: [PATCH 18/35] @FIR-732 - Llama.cpp: Webserver & HTML pages support --- tools/flaskIfc/flaskIfc.py | 54 +++++++++++++++++++++------- tools/flaskIfc/serial_script.py | 6 ++-- tools/flaskIfc/templates/index.html | 38 ++++++++++++++++++++ tools/flaskIfc/templates/result.html | 12 +++++++ 4 files changed, 93 insertions(+), 17 deletions(-) create mode 100644 tools/flaskIfc/templates/index.html create mode 100644 tools/flaskIfc/templates/result.html diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 61187c91a09d4..4d65c9a7ffa0e 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -1,32 +1,60 @@ -from flask import Flask, request +from flask import Flask, render_template, request import subprocess app = Flask(__name__) -@app.route('/serial', methods=['GET']) -def serial_command(): +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/submit', methods=['POST']) +def submit(): + #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" + model = request.form.get('model') + backend = request.form.get('backend') + tokens = request.form.get('tokens') + prompt = 
request.form.get('prompt') + + # Define the model path (update with actual paths) + model_paths = { + "tiny-llama": "tinyllama-vo-5m-para.gguf", + "Tiny-llama-F32": "Tiny-Llama-v0.3-FP32-1.1B-F32.gguf" + } + + model_path = model_paths.get(model, "") + if not model_path: + return f"
Error: Model path not found for '{model}'
" + + # Below is for reference i will remove later + # Build llama-cli command + #command = [ + # "./llama-cli", + # "-p", prompt, + # "-m", model_path, + # "--device", backend, + # "--temp", "0", + # "--n-predict", tokens, + # "--repeat-penalty", "1", + # "--top-k", "0", + # "--top-p", "1" + #] # Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized port = '/dev/ttyUSB3' - #port = request.args.get('port') # Currently the baudrate is hard coded to 921600 but can be parameterized - #baudrate = request.args.get('baudrate') baudrate = '921600' + script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" - # Parse the command and send it to serial.py - command = request.args.get('command') - - #if not all([port, baudrate, command]): - if not all([command]): - return "Missing parameters", 400 try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) - return result.stdout.strip(), 200 + output = result.stdout # This should have \n except subprocess.CalledProcessError as e: - return f"Error executing script: {e.stderr}", 500 + output = f"Error running model: {e.stderr}" + return render_template('result.html', output=output) if __name__ == '__main__': app.run(debug=True, port=5000) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index e138d19ab7de0..cde5e0cd54dfc 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -4,19 +4,18 @@ def send_serial_command(port, baudrate, command): try: # Open the serial port with 1 second timeout - ser = serial.Serial(port, baudrate, timeout=60) + ser = serial.Serial(port, baudrate, timeout=20) ser.write(command.encode()) # Encode command to bytes ser.write('\n'.encode()) # Encode command to bytes # Wait to read the serial port - # Need to add a break somewhere for when we see the phrase "root@name" data = '\0' while True: try: line = ser.readline() if line: # Check if line is not empty - data += (line.decode('utf-8').strip()) # Decode and strip to remove extra chars + data += line.decode('utf-8') # Keep the line as-is with newline else: break # Exit loop if no data is received except serial.SerialException as e: @@ -42,4 +41,3 @@ def send_serial_command(port, baudrate, command): baudrate = int(sys.argv[2]) command = sys.argv[3] response = send_serial_command(port, baudrate, command) - print(response) diff --git a/tools/flaskIfc/templates/index.html b/tools/flaskIfc/templates/index.html new file mode 100644 index 0000000000000..9152167a86c44 --- /dev/null +++ b/tools/flaskIfc/templates/index.html @@ -0,0 +1,38 @@ + + + + TSAVORITE Web UI For Model Inference + + +
+<h2>Model Inference Configuration</h2>
+<form action="/submit" method="POST">
+  <!-- selects/inputs for the model, backend, tokens, and prompt fields read by the /submit handler -->
+  <input type="submit" value="Submit">
+</form>
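+<!-- Example of driving this form from the command line; the host and port are
+     assumptions taken from the "flask ... --port 5000" examples earlier:
+     curl -X POST http://localhost:5000/submit \
+          -d "model=tiny-llama" -d "backend=tSavorite" \
+          -d "tokens=10" -d "prompt=Hello How are you"
+-->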
+ + diff --git a/tools/flaskIfc/templates/result.html b/tools/flaskIfc/templates/result.html new file mode 100644 index 0000000000000..07c79c409f596 --- /dev/null +++ b/tools/flaskIfc/templates/result.html @@ -0,0 +1,12 @@ + + + + Model Output + + +
+<h2>Model Response</h2>
+<pre>{{ output }}</pre>
+ ⟵ Back to Form + + From 597f92800787739d23e95bd5bf700d5e277e9c42 Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Wed, 11 Jun 2025 20:15:09 -0700 Subject: [PATCH 19/35] @FIR-732: Added print back to ensure stdout has data --- tools/flaskIfc/serial_script.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index cde5e0cd54dfc..0e1064225921f 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -4,7 +4,7 @@ def send_serial_command(port, baudrate, command): try: # Open the serial port with 1 second timeout - ser = serial.Serial(port, baudrate, timeout=20) + ser = serial.Serial(port, baudrate, timeout=60) ser.write(command.encode()) # Encode command to bytes ser.write('\n'.encode()) # Encode command to bytes @@ -25,6 +25,7 @@ def send_serial_command(port, baudrate, command): ser.close() return ("Program interrupted by user") ser.close() + print (data) return data except serial.SerialException as e: From 8a5ffff1fee344726e4e6da14016fd92e6ad14b6 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Wed, 11 Jun 2025 21:46:16 -0700 Subject: [PATCH 20/35] @FIR-733 - Lllama.cpp: Webserver, add JOB status support for Model --- tools/flaskIfc/flaskIfc.py | 55 ++++++++++++++++++++---- tools/flaskIfc/templates/processing.html | 22 ++++++++++ 2 files changed, 68 insertions(+), 9 deletions(-) create mode 100644 tools/flaskIfc/templates/processing.html diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 4d65c9a7ffa0e..e2945f680dfc1 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -1,5 +1,9 @@ from flask import Flask, render_template, request import subprocess +import threading +import time + +job_status = {"running": False, "result": "", "thread": None} app = Flask(__name__) @@ -9,6 +13,11 @@ def index(): @app.route('/submit', methods=['POST']) def submit(): + global job_status + + if job_status["running"]: + return "
A model is already running. Please wait or abort.
" + #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" model = request.form.get('model') backend = request.form.get('backend') @@ -25,7 +34,6 @@ def submit(): if not model_path: return f"
Error: Model path not found for '{model}'
" - # Below is for reference i will remove later # Build llama-cli command #command = [ # "./llama-cli", @@ -43,18 +51,47 @@ def submit(): # Currently the baudrate is hard coded to 921600 but can be parameterized baudrate = '921600' - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + #command = script_path prompt tokens model backend + #command = script_path command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" - try: - result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) - output = result.stdout # This should have \n - except subprocess.CalledProcessError as e: - output = f"Error running model: {e.stderr}" + def run_script(): + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + job_status["result"] = result.stdout + except subprocess.CalledProcessError as e: + job_status["result"] = f"Error: {e.stderr}" + finally: + job_status["running"] = False + + thread = threading.Thread(target=run_script) + job_status = {"running": True, "result": "", "thread": thread} + thread.start() + + return render_template("processing.html") + +@app.route('/status') +def status(): + if job_status["running"]: + return "running" + else: + return "done" + +@app.route('/result') +def result(): + return render_template("result.html", output=job_status["result"]) - return render_template('result.html', output=output) +@app.route('/abort') +def abort(): + global job_status + if job_status["running"] and job_status["thread"].is_alive(): + # Use subprocess.Popen + pid handling instead for real process termination + job_status["running"] = False + job_status["result"] = "Aborted by user." + return "
Job aborted.
Home"
    return "
No job running.
Home" if __name__ == '__main__': - app.run(debug=True, port=5000) + app.run(debug=True, port=5001) diff --git a/tools/flaskIfc/templates/processing.html b/tools/flaskIfc/templates/processing.html new file mode 100644 index 0000000000000..15f609bee1712 --- /dev/null +++ b/tools/flaskIfc/templates/processing.html @@ -0,0 +1,22 @@ + + + + Processing + + + +
+<h2>Model is running...</h2>
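+<!-- A minimal auto-refresh sketch; the 2-second polling interval is an
+     assumption, while /status and /result are the Flask routes defined in
+     flaskIfc.py above: -->
+<script>
+  // Poll /status until the Flask side reports "done", then show the result.
+  setInterval(function () {
+    fetch('/status')
+      .then(function (resp) { return resp.text(); })
+      .then(function (state) {
+        if (state === 'done') { window.location.href = '/result'; }
+      });
+  }, 2000);
+</script>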
+ + + From 52ae0e9eedc0fd11642eda9740c03cf93f1106dc Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Wed, 11 Jun 2025 21:49:13 -0700 Subject: [PATCH 21/35] removing commented code --- tools/flaskIfc/flaskIfc.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index e2945f680dfc1..bba12448a1177 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -52,8 +52,6 @@ def submit(): # Currently the baudrate is hard coded to 921600 but can be parameterized baudrate = '921600' script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" - #command = script_path prompt tokens model backend - #command = script_path command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" @@ -94,4 +92,4 @@ def abort(): return "
No job running.
Home" if __name__ == '__main__': - app.run(debug=True, port=5001) + app.run(debug=True, port=5000) From ffe045a424894637ee58ea2cc639e0be50c92b92 Mon Sep 17 00:00:00 2001 From: Lewis Lui Date: Thu, 12 Jun 2025 13:58:36 -0700 Subject: [PATCH 22/35] @FIR-731 - serial_script.py changes to identify end of output --- tools/flaskIfc/serial_script.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index 0e1064225921f..0354f15ff2ca3 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -3,19 +3,21 @@ def send_serial_command(port, baudrate, command): try: - # Open the serial port with 1 second timeout - ser = serial.Serial(port, baudrate, timeout=60) + # Open the serial port with 1 second timeout (timeout = 60 but removed it for testing!) + ser = serial.Serial(port, baudrate) - ser.write(command.encode()) # Encode command to bytes - ser.write('\n'.encode()) # Encode command to bytes + ser.write((command + '\n').encode()) # Send command with newline # Wait to read the serial port data = '\0' while True: try: line = ser.readline() + check = line.decode('utf-8') + if ("run-platform-done" in check) or ("@agilex7_dk_si_agf014ea" in check): + break if line: # Check if line is not empty - data += line.decode('utf-8') # Keep the line as-is with newline + data += check # Keep the line as-is with newline else: break # Exit loop if no data is received except serial.SerialException as e: @@ -25,7 +27,6 @@ def send_serial_command(port, baudrate, command): ser.close() return ("Program interrupted by user") ser.close() - print (data) return data except serial.SerialException as e: @@ -42,3 +43,4 @@ def send_serial_command(port, baudrate, command): baudrate = int(sys.argv[2]) command = sys.argv[3] response = send_serial_command(port, baudrate, command) + print(response) From 3211f60eeab04f757d2fce0f730aad87b584937b Mon Sep 17 00:00:00 2001 From: Lewis Lui Date: Thu, 12 Jun 2025 14:35:23 -0700 Subject: [PATCH 23/35] Some more changes to address the comments --- tools/flaskIfc/serial_script.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index 0354f15ff2ca3..aca14a60f5c9a 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -13,11 +13,11 @@ def send_serial_command(port, baudrate, command): while True: try: line = ser.readline() - check = line.decode('utf-8') - if ("run-platform-done" in check) or ("@agilex7_dk_si_agf014ea" in check): - break if line: # Check if line is not empty - data += check # Keep the line as-is with newline + read_next_line = line.decode('utf-8') + if ("run-platform-done" in read_next_line) or ("@agilex7_dk_si_agf014ea" in read_next_line): + break + data += read_next_line # Keep the line as-is with newline else: break # Exit loop if no data is received except serial.SerialException as e: @@ -27,6 +27,7 @@ def send_serial_command(port, baudrate, command): ser.close() return ("Program interrupted by user") ser.close() + print(data) return data except serial.SerialException as e: @@ -43,4 +44,3 @@ def send_serial_command(port, baudrate, command): baudrate = int(sys.argv[2]) command = sys.argv[3] response = send_serial_command(port, baudrate, command) - print(response) From a411fd97096431afd4a759487d1236890791ae62 Mon Sep 17 00:00:00 2001 From: Lewis Lui Date: Thu, 12 Jun 2025 14:43:29 -0700 Subject: [PATCH 24/35] Removed a comment --- 
tools/flaskIfc/serial_script.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index aca14a60f5c9a..a91587341e557 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -3,7 +3,6 @@ def send_serial_command(port, baudrate, command): try: - # Open the serial port with 1 second timeout (timeout = 60 but removed it for testing!) ser = serial.Serial(port, baudrate) ser.write((command + '\n').encode()) # Send command with newline From 41d98b7ce06914f544e4a3b7902e1a5ece53082e Mon Sep 17 00:00:00 2001 From: Ashish Trivedi Date: Fri, 13 Jun 2025 10:32:36 -0700 Subject: [PATCH 25/35] @FIR-737: Added another endpoint llama-cli t invoke directly in URL This commit has two changes 1. Added another endpoint llama-cli to invole the run_platform_test.sh directly 2. Updated reading of output to byte by byte to identify marking prompt and exit when the marker is seen --- tools/flaskIfc/flaskIfc.py | 50 +++++++++++++++++++++++++++++++++ tools/flaskIfc/serial_script.py | 12 ++++++-- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index bba12448a1177..7d2333b36e0ce 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -11,6 +11,56 @@ def index(): return render_template('index.html') +@app.route('/llama-cli', methods=['GET']) +def serial_command(): + # Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized + port = '/dev/ttyUSB3' + #port = request.args.get('port') + + # Currently the baudrate is hard coded to 921600 but can be parameterized + #baudrate = request.args.get('baudrate') + baudrate = '921600' + #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" + model = request.args.get('model') + backend = request.args.get('backend') + tokens = request.args.get('tokens') + prompt = request.args.get('prompt') + + # Define the model path (update with actual paths) + model_paths = { + "tiny-llama": "tinyllama-vo-5m-para.gguf", + "Tiny-llama-F32": "Tiny-Llama-v0.3-FP32-1.1B-F32.gguf" + } + + model_path = model_paths.get(model, "") + if not model_path: + return f"
Error: Model path not found for '{model}'
" + + # Build llama-cli command + #command = [ + # "./llama-cli", + # "-p", prompt, + # "-m", model_path, + # "--device", backend, + # "--temp", "0", + # "--n-predict", tokens, + # "--repeat-penalty", "1", + # "--top-k", "0", + # "--top-p", "1" + #] + # URL to Test this end point is as follows + # http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you + script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + + + @app.route('/submit', methods=['POST']) def submit(): global job_status diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index a91587341e557..38a53d103f0c8 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -6,15 +6,21 @@ def send_serial_command(port, baudrate, command): ser = serial.Serial(port, baudrate) ser.write((command + '\n').encode()) # Send command with newline - # Wait to read the serial port data = '\0' while True: try: - line = ser.readline() + # read byte by byte to find either a new line character or a prompt marker + # instead of new line using line = ser.readline() + line = b"" + while True: + byte = ser.read(1) # Read one byte at a time + if (byte == b"\n") or (byte == b"#"): # Stop when delimiter is found + break + line += byte if line: # Check if line is not empty read_next_line = line.decode('utf-8') - if ("run-platform-done" in read_next_line) or ("@agilex7_dk_si_agf014ea" in read_next_line): + if ("run-platform-done" in read_next_line.strip()) or ("@agilex7_dk_si_agf014ea" in read_next_line.strip()) or ("imx8mpevk" in read_next_line.strip()): break data += read_next_line # Keep the line as-is with newline else: From 2aeae8f8759989ee9780da82e162c0d82b00ce82 Mon Sep 17 00:00:00 2001 From: atrivedi-tsavoritesi Date: Fri, 13 Jun 2025 16:14:17 -0700 Subject: [PATCH 26/35] @FIR-738: Updated the run_llama_cli to be run instead of (#12) run_platform_test.sh Co-authored-by: Ashish Trivedi --- tools/flaskIfc/flaskIfc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 7d2333b36e0ce..cffc2e4a2b27e 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -50,7 +50,7 @@ def serial_command(): #] # URL to Test this end point is as follows # http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" try: From 52e4a5804d7dc0fbeca76cab2f66c4ef8dad72f5 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Sat, 14 Jun 2025 14:26:43 -0700 Subject: [PATCH 27/35] @FIR-736 - lama.cpp: Disable all logs except token generation log --- common/log.h | 9 +++++++++ ggml/include/ggml.h | 1 + ggml/src/ggml-impl.h | 1 + ggml/src/ggml-tsavorite/ggml-tsavorite.cpp | 5 ++--- ggml/src/ggml.c | 1 + src/llama-context.cpp | 8 +++++++- src/llama-impl.h | 1 + src/llama-sampling.cpp | 2 ++ tools/main/main.cpp | 7 +++++++ 9 files changed, 31 insertions(+), 4 deletions(-) diff 
--git a/common/log.h b/common/log.h index c56bb50d95db0..0e23b669fcc22 100644 --- a/common/log.h +++ b/common/log.h @@ -90,11 +90,20 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) #define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__) #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) +#if ENABLE_LOG #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__) #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__) #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__) #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__) #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__) +#else +#define LOG_INF(...) +#define LOG_WRN(...) +#define LOG_ERR(...) +#define LOG_DBG(...) +#define LOG_CNT(...) +#endif +#define LOG_TSAVORITE(...) LOG_TMPL(GGML_LOG_LEVEL_TSAVORITE, 0, __VA_ARGS__) #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__) #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c81ff03fee810..e6830b63ba8e1 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -554,6 +554,7 @@ extern "C" { GGML_LOG_LEVEL_WARN = 3, GGML_LOG_LEVEL_ERROR = 4, GGML_LOG_LEVEL_CONT = 5, // continue previous log + GGML_LOG_LEVEL_TSAVORITE = 42, }; // this tensor... diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index a19cfb14e0f9f..99c3475fc10cf 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -85,6 +85,7 @@ GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * #define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) +#define GGML_LOG_TSAVORITE(...) 
ggml_log_internal(GGML_LOG_LEVEL_TSAVORITE , __VA_ARGS__) #define GGML_DEBUG 0 diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index bc7095eeebf2f..c49d02375921f 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -504,7 +504,6 @@ static void *ggml_tsavorite_host_malloc(size_t n) { GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); GGML_TSAVORITE_LOG_INFO("\n Allocating memory from tsi_alloc with size %ld \n", n); - printf("\n ANoop Allocating memory from tsi_alloc with size %ld \n", n); data = tsi_alloc(n); GGML_TSAVORITE_LOG_CONT("\n Allocating memory from tsi_alloc with size %ld starting memory %p\n", n, data); @@ -1800,7 +1799,6 @@ static bool ggml_backend_tsavorite_device_supports_buft(ggml_backend_dev_t dev, // ggml_backend_sched_backend_id_from_cur -> ggml_backend_offload_op -> static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { - // printf("\n ANoop Calling %s \n ", __func__); if (op->type != GGML_TYPE_F32) return false; switch (op->op) { @@ -1894,8 +1892,9 @@ static struct ggml_backend_reg_i ggml_backend_tsavorite_reg_i = { /* .get_proc_address = */ NULL, }; + ggml_backend_reg_t ggml_backend_tsavorite_reg(void) { - ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_ERROR; + ggml_tsavorite_log_type_val = GGML_TSAVORITE_LOG_NONE; ggml_tsavorite_kernel_mode_flag = GGML_TSAVORITE_KERNEL_MODE_MLIR; GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__); g_ggml_backend_tsavorite_reg.iface = ggml_backend_tsavorite_reg_i; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 57d3e39adf758..134b7420de746 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -249,6 +249,7 @@ static void ggml_log_internal_v(enum ggml_log_level level, const char * format, void ggml_log_internal(enum ggml_log_level level, const char * format, ...) 
{ va_list args; va_start(args, format); + if (level == GGML_LOG_LEVEL_TSAVORITE) ggml_log_internal_v(level, format, args); va_end(args); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 85b4324b699e6..984dbf14d14ae 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2615,13 +2615,19 @@ void llama_perf_context_print(const llama_context * ctx) { const auto data = llama_perf_context(ctx); const double t_end_ms = 1e-3 * ggml_time_us(); - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); + + LLAMA_LOG_TSAVORITE("%s: load time = %10.2f ms\n", __func__, data.t_load_ms); + LLAMA_LOG_TSAVORITE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LLAMA_LOG_TSAVORITE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_TSAVORITE("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); } void llama_perf_context_reset(llama_context * ctx) { diff --git a/src/llama-impl.h b/src/llama-impl.h index 02b1d07f8400d..abc963a4a14e7 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -29,6 +29,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void * #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__) +#define LLAMA_LOG_TSAVORITE(...) 
llama_log_internal(GGML_LOG_LEVEL_TSAVORITE, __VA_ARGS__) // // helpers diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 804b11e0a943e..d012a0ce520e0 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2562,6 +2562,8 @@ void llama_perf_sampler_print(const struct llama_sampler * chain) { LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); + LLAMA_LOG_TSAVORITE("\n\n%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); } void llama_perf_sampler_reset(struct llama_sampler * chain) { diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 1bd2be2d94f51..26842116ec6df 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -41,6 +41,12 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +static void my_logger(ggml_log_level level, const char *text, void *user_data) { + if (level == GGML_LOG_LEVEL_TSAVORITE) { + fprintf(stderr, "%s", text); // only show warnings or errors + } +} + static void print_usage(int argc, char ** argv) { (void) argc; @@ -120,6 +126,7 @@ int main(int argc, char ** argv) { LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } + llama_log_set(my_logger, nullptr); LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); From 61915987c54a7c58f60d6c6bd53034a5f7d47e7f Mon Sep 17 00:00:00 2001 From: LewisLui777 <121061033+LewisLui777@users.noreply.github.com> Date: Mon, 16 Jun 2025 15:57:02 -0700 Subject: [PATCH 28/35] Changed run_platform_test.sh to run_llama_cli.sh (#14) Co-authored-by: Lewis Lui --- tools/flaskIfc/flaskIfc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index cffc2e4a2b27e..1bfca20440bb2 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -101,7 +101,7 @@ def submit(): # Currently the baudrate is hard coded to 921600 but can be parameterized baudrate = '921600' - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_platform_test.sh" + script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" From cd734f0085828e5b8c638397d6bcb4ee0a2ffbc9 Mon Sep 17 00:00:00 2001 From: atrivedi-tsavoritesi Date: Mon, 16 Jun 2025 17:13:06 -0700 Subject: [PATCH 29/35] @FIR-748: Added endpoints for health, sysinfo, upload and restart (#15) --- tools/flaskIfc/flaskIfc.py | 111 +++++++++++++++++++++++---- tools/flaskIfc/templates/upload.html | 4 + 2 files changed, 101 insertions(+), 14 deletions(-) create mode 100644 tools/flaskIfc/templates/upload.html diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 1bfca20440bb2..966fc38c549dc 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -2,25 +2,24 @@ import subprocess import threading import time +from werkzeug.utils import secure_filename +import os job_status = {"running": False, "result": "", "thread": None} app = Flask(__name__) +port = '/dev/ttyUSB3' +baudrate = '921600' + @app.route('/') def index(): return render_template('index.html') @app.route('/llama-cli', methods=['GET']) -def serial_command(): - # 
Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized - port = '/dev/ttyUSB3' - #port = request.args.get('port') - - # Currently the baudrate is hard coded to 921600 but can be parameterized - #baudrate = request.args.get('baudrate') - baudrate = '921600' - #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" +def llama_cli_serial_command(): + + #./run_llama_cli.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" model = request.args.get('model') backend = request.args.get('backend') tokens = request.args.get('tokens') @@ -59,7 +58,95 @@ def serial_command(): except subprocess.CalledProcessError as e: return f"Error executing script: {e.stderr}", 500 +UPLOAD_FOLDER = './' # Directory where uploaded files will be stored +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER +os.makedirs(UPLOAD_FOLDER, exist_ok=True) # Create the upload folder if it doesn't exist + +@app.route('/upload-gguf', methods=['POST', 'GET']) +def upload_serial_command(): + if request.method == 'POST': + # Check if a file was submitted + if 'file' not in request.files: + return "No file part" + file = request.files['file'] + + # Check if the file is empty + if file.filename == '': + return "No file selected" + + # Save the file if it exists + if file: + filename = secure_filename(file.filename) + file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) + return "File uploaded successfully" + return render_template('upload.html') # Display the upload form + +# command = f"upload file" +# try: +# result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) +# return result.stdout, 200 +# except subprocess.CalledProcessError as e: +# return f"Error executing script: {e.stderr}", 500 + +@app.route('/upload-file', methods=['GET', 'POST']) +def upload_file(): + if request.method == 'POST': + # Check if a file was submitted + if 'file' not in request.files: + return "No file part" + file = request.files['file'] + + # Check if the file is empty + if file.filename == '': + return "No file selected" + + # Save the file if it exists + if file: + filename = secure_filename(file.filename) + file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) + return "File uploaded successfully" + return render_template('upload.html') # Display the upload form + +@app.route('/restart-txe', methods=['GET']) +def restart_txe_serial_command(): + command = f"telnet localhost 8000; close all" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + +@app.route('/health-check', methods=['GET']) +def health_check_serial_command(): + command = f"free -h" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 +@app.route('/test', methods=['GET']) +def test_serial_command(): + command = f"test" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 + +@app.route('/system-info', methods=['GET']) +def 
system_info_serial_command(): + + command = f"lscpu" + + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 @app.route('/submit', methods=['POST']) def submit(): @@ -68,7 +155,7 @@ def submit(): if job_status["running"]: return "
A model is already running. Please wait or abort.
" - #./run_platform_test.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" + #./run_llama_cli.sh "my cat's name" "10" "tinyllama-vo-5m-para.gguf" "none" model = request.form.get('model') backend = request.form.get('backend') tokens = request.form.get('tokens') @@ -96,11 +183,7 @@ def submit(): # "--top-k", "0", # "--top-p", "1" #] - # Currently the port is hard coded to /dev/ttyUSB3 but can be parameterized - port = '/dev/ttyUSB3' - # Currently the baudrate is hard coded to 921600 but can be parameterized - baudrate = '921600' script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" diff --git a/tools/flaskIfc/templates/upload.html b/tools/flaskIfc/templates/upload.html new file mode 100644 index 0000000000000..3368379f74754 --- /dev/null +++ b/tools/flaskIfc/templates/upload.html @@ -0,0 +1,4 @@ +
+<form method="POST" enctype="multipart/form-data">
+  <input type="file" name="file">
+  <input type="submit" value="Upload">
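
The maintenance endpoints added in this patch can be exercised from a host the
same way as the /serial endpoint; a few illustrative calls (host, port, and the
uploaded file name are assumptions):

```
curl "http://localhost:5000/health-check"
curl "http://localhost:5000/system-info"
curl "http://localhost:5000/restart-txe"
curl -F "file=@tinyllama-vo-5m-para.gguf" "http://localhost:5000/upload-file"
```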
From f53f23ca0f636bc1ec3bd8966c73ac5183a2fba6 Mon Sep 17 00:00:00 2001 From: atrivedi-tsavoritesi Date: Tue, 17 Jun 2025 13:26:48 -0700 Subject: [PATCH 30/35] @FIR-742: Add system-info, txe-restart functionality and cd to right path (#16) The changes are as follows 1. change directory to right folder before running the commands 2. Add system-info and txe-restart functionlity Co-authored-by: Ashish Trivedi --- tools/flaskIfc/flaskIfc.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 966fc38c549dc..8d57a069cf3de 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -11,6 +11,7 @@ port = '/dev/ttyUSB3' baudrate = '921600' +exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/" @app.route('/') def index(): @@ -49,8 +50,8 @@ def llama_cli_serial_command(): #] # URL to Test this end point is as follows # http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" - command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" + script_path = "./run_llama_cli.sh" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}" try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) @@ -109,11 +110,17 @@ def upload_file(): @app.route('/restart-txe', methods=['GET']) def restart_txe_serial_command(): - command = f"telnet localhost 8000; close all" + command = f"telnet localhost 8000\r\nclose all\r\n" try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) - return result.stdout, 200 + time.sleep(5) + command = f"{exe_path}/../install/tsi-start\nyes\n" + try: + result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) + return result.stdout, 200 + except subprocess.CalledProcessError as e: + return f"Error executing script: {e.stderr}", 500 except subprocess.CalledProcessError as e: return f"Error executing script: {e.stderr}", 500 @@ -140,7 +147,7 @@ def test_serial_command(): @app.route('/system-info', methods=['GET']) def system_info_serial_command(): - command = f"lscpu" + command = f"{exe_path}../install/tsi-version;lscpu" try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) @@ -184,8 +191,8 @@ def submit(): # "--top-p", "1" #] - script_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/run_llama_cli.sh" - command = f"{script_path} \"{prompt}\" {tokens} {model_path} {backend}" + script_path = "./run_llama_cli.sh" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}" def run_script(): From 1a9ba9db5710577aa563902ddb8a4971787bd346 Mon Sep 17 00:00:00 2001 From: Anoop Kapoor Date: Tue, 17 Jun 2025 14:03:19 -0700 Subject: [PATCH 31/35] @FIR-720 - GGML: Add TMU(MAT_MUL) kernel --- ggml-tsi-kernel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-tsi-kernel b/ggml-tsi-kernel index 3194c54b13b8c..ea3a5d613e821 160000 --- a/ggml-tsi-kernel +++ b/ggml-tsi-kernel @@ -1 +1 @@ -Subproject commit 3194c54b13b8cd0b5c29a6a1cc0060ae2abbed06 +Subproject commit ea3a5d613e82129326c93a22eb3af871e6882530 From d733056d8f7dd4a35b1c357f13fbf94e981b4329 Mon Sep 17 00:00:00 2001 From: 
atrivedi-tsavoritesi Date: Tue, 17 Jun 2025 21:04:37 -0700 Subject: [PATCH 32/35] @FIR-754: Added all parameter parsing for the llama-cli (#18) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * @FIR-754: Added all parameter parsing for the llama-cli The test results are as follows Model Response cd /usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/; ./run_llama_cli.sh "My cat's name" " 50 tinyllama-vo-5m-para.gguf tSavorite 1.5 1024 50 0.9 5 12288 0.0 [2018-03-09 13:03:17.788243] 271:272 [[32m info[m] :: TXE resource allocation request processed successfully. My cat's name was Tim. He loved to play with his toy car. He would run and jump in the park, making loud noises. Tim was very happy with his new toy car. One day, Tim's mom said, "Tim. You llama_perf_sampler_print: sampling time = 999.96 ms / 56 runs ( 17.86 ms per token, 56.00 tokens per second)llama_perf_context_print: load time = 1713.55 ms llama_perf_context_print: prompt eval time = 603.51 ms / 6 tokens ( 100.58 ms per token, 9.94 tokens per second) llama_perf_context_print: eval time = 7069.36 ms / 49 runs ( 144.27 ms per token, 6.93 tokens per second) llama_perf_context_print: total time = 10046.17 ms / 55 tokens [2018-03-09 13:03:28.875126] 271:272 [[32m info[m] :: TXE resource release request processed successfully. GGML Tsavorite Profiling Results: ------------------------------------------------------------------------------------------------------------------------ Calls Total(ms) T/call Self(ms) Function ------------------------------------------------------------------------------------------------------------------------ 2715 2720.000 1.002 0.000 [25%] RuntimeHostShim::awaitCommandListCompletion 1740 2635.984 1.515 2635.984 └─ [24%] [ txe_silu ] 925 1379.715 1.492 1379.715 └─ [12%] [ txe_mult ] 50 74.450 1.489 74.450 └─ [ 1%] [ txe_add ] 2715 0.448 0.000 0.448 └─ [ 0%] TXE 0 Idle 1 34.000 34.000 34.000 [ 0%] RuntimeHostShim::finalize 1 16.000 16.000 1.000 [ 0%] GGML Tsavorite 1 15.000 15.000 15.000 └─ [ 0%] RuntimeHostShim::initialize 2716 0.000 0.000 0.000 [ 0%] RuntimeHostShim::allocate 9120 0.000 0.000 0.000 [ 0%] RuntimeHostShim::getShmemManager 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::createCommandList 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::loadBlob 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::launchBlob 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::addCommandToList 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::finalizeCommandList 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::unloadBlob 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::deallocate ======================================================================================================================== 33558 11098.000 0.331 11098.000 [100%] TOTAL ======================================================================================================================== ⟵ Back to Form The URL used is as follows http://10.50.0.124:5003/llama-cli?model=tiny-llama&backend=tSavorite&tokens=10&prompt=My+cat%27s+name&repeat-penalty=1.5&batch-size=1024&top-k=50&top-p=0.9&last-n=5&context-length=12288&temp=0.0 * @FIR-754: Addressed review comments. 
--------- Co-authored-by: Ashish Trivedi --- tools/flaskIfc/flaskIfc.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py index 8d57a069cf3de..34b9fc5970522 100644 --- a/tools/flaskIfc/flaskIfc.py +++ b/tools/flaskIfc/flaskIfc.py @@ -10,9 +10,19 @@ app = Flask(__name__) port = '/dev/ttyUSB3' +#port = '/dev/ttyUSB2' baudrate = '921600' +#baudrate = '115200' exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/" +DEFAULT_REPEAT_PENALTY = 1.5 +DEFAULT_BATCH_SIZE = 1024 +DEFAULT_TOP_K = 50 +DEFAULT_TOP_P = 0.9 +DEFAULT_LAST_N = 5 +DEFAULT_CONTEXT_LENGTH = 12288 +DEFAULT_TEMP = 0.0 + @app.route('/') def index(): return render_template('index.html') @@ -25,6 +35,13 @@ def llama_cli_serial_command(): backend = request.args.get('backend') tokens = request.args.get('tokens') prompt = request.args.get('prompt') + repeat_penalty = request.args.get('repeat-penalty', DEFAULT_REPEAT_PENALTY) + batch_size = request.args.get('batch-size', DEFAULT_BATCH_SIZE) + top_k = request.args.get('top-k', DEFAULT_TOP_K) + top_p = request.args.get('top-p', DEFAULT_TOP_P) + last_n = request.args.get('last-n', DEFAULT_LAST_N) + context_length = request.args.get('context-length', DEFAULT_CONTEXT_LENGTH) + temp = request.args.get('temp', DEFAULT_TEMP) # Define the model path (update with actual paths) model_paths = { @@ -51,7 +68,7 @@ def llama_cli_serial_command(): # URL to Test this end point is as follows # http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you script_path = "./run_llama_cli.sh" - command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend} {repeat_penalty} {batch_size} {top_k} {top_p} {last_n} {context_length} {temp}" try: result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True) @@ -167,6 +184,13 @@ def submit(): backend = request.form.get('backend') tokens = request.form.get('tokens') prompt = request.form.get('prompt') + repeat_penalty = request.form.get('repeat-penalty', DEFAULT_REPEAT_PENALTY) + batch_size = request.form.get('batch-size', DEFAULT_BATCH_SIZE) + top_k = request.form.get('top-k', DEFAULT_TOP_K) + top_p = request.form.get('top-p', DEFAULT_TOP_P) + last_n = request.form.get('last-n', DEFAULT_LAST_N) + context_length = request.form.get('context-length', DEFAULT_CONTEXT_LENGTH) + temp = request.form.get('temp', DEFAULT_TEMP) # Define the model path (update with actual paths) model_paths = { @@ -192,7 +216,7 @@ def submit(): #] script_path = "./run_llama_cli.sh" - command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}" + command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend} {repeat_penalty} {batch_size} {top_k} {top_p} {last_n} {context_length} {temp}" def run_script(): From f5713b3ff130329da255fe1227a0ae995fa4c993 Mon Sep 17 00:00:00 2001 From: atrivedi-tsavoritesi Date: Wed, 18 Jun 2025 09:26:05 -0700 Subject: [PATCH 33/35] @FIR-756: Removed the echo of command in flask output (#19) --- tools/flaskIfc/serial_script.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/flaskIfc/serial_script.py b/tools/flaskIfc/serial_script.py index 38a53d103f0c8..9581376a8b16b 100644 --- a/tools/flaskIfc/serial_script.py +++ b/tools/flaskIfc/serial_script.py @@ -8,6 +8,7 @@ def 
From 15e7365fad52323136de3bb0c046dbf502fe4328 Mon Sep 17 00:00:00 2001
From: atrivedi-tsavoritesi
Date: Wed, 18 Jun 2025 11:59:53 -0700
Subject: [PATCH 34/35] @FIR-757: Update SDK to 0.1.4 and update release to
 0.0.3 for tsi-ggml (#20)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test results from ./run_llama_cli.sh with 5 tokens are as follows.

+++
root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin# ./run_llama_cli.sh

my cat's name is Max. He'

llama_perf_sampler_print: sampling time =     111.70 ms /    11 runs   (   10.15 ms per token,    98.47 tokens per second)
llama_perf_context_print:        load time =  132926.48 ms
llama_perf_context_print: prompt eval time =  109957.33 ms /     6 tokens (18326.22 ms per token,     0.05 tokens per second)
llama_perf_context_print:        eval time =  195682.91 ms /     4 runs   (48920.73 ms per token,     0.02 tokens per second)
llama_perf_context_print:       total time =  328764.01 ms /    10 tokens

GGML Tsavorite Profiling Results:
------------------------------------------------------------------------------------------------------------------------
     Calls   Total(ms)      T/call    Self(ms)  Function
------------------------------------------------------------------------------------------------------------------------
     33160  100086.000       3.018   47907.157  [32%] RuntimeHostShim::awaitCommandListCompletion
     18920   29912.952       1.581   29912.952    └─ [10%] [ txe_silu ]
     14080   22010.102       1.563   22010.102    └─ [ 7%] [ txe_mult ]
       160     253.071       1.582     253.071    └─ [ 0%] [ txe_add ]
     33160       1.178       0.000       1.178    └─ [ 0%] TXE 0 Idle
         1     114.000     114.000      18.000  [ 0%] GGML Tsavorite
         1      96.000      96.000      96.000    └─ [ 0%] RuntimeHostShim::initialize
         1      52.000      52.000      52.000  [ 0%] RuntimeHostShim::finalize
     33160      26.000       0.001      26.000  [ 0%] RuntimeHostShim::loadBlob
     33160      23.000       0.001      23.000  [ 0%] RuntimeHostShim::finalizeCommandList
     33160       5.000       0.000       5.000  [ 0%] RuntimeHostShim::addCommandToList
     33161       3.000       0.000       3.000  [ 0%] RuntimeHostShim::allocate
     33160       3.000       0.000       3.000  [ 0%] RuntimeHostShim::createCommandList
    113720       0.000       0.000       0.000  [ 0%] RuntimeHostShim::getShmemManager
     33160       0.000       0.000       0.000  [ 0%] RuntimeHostShim::launchBlob
     33160       0.000       0.000       0.000  [ 0%] RuntimeHostShim::unloadBlob
     33160       0.000       0.000       0.000  [ 0%] RuntimeHostShim::deallocate
========================================================================================================================
    412163  308849.000       0.749  308849.000  [100%] TOTAL
========================================================================================================================
root@agilex7_dk_si_agf014ea:/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin#
+++
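As a reading aid (editorial, not part of the patch), the per-token and tokens-per-second figures printed by llama_perf follow directly from the reported times; the eval line above, for example:

    # Editorial sanity check of the llama_perf eval line above.
    eval_time_ms = 195682.91
    runs = 4
    ms_per_token = eval_time_ms / runs          # 48920.73 ms per token
    tokens_per_second = 1000.0 / ms_per_token   # ~0.02 tokens per second
    print(ms_per_token, tokens_per_second)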
---
 tsi-pkg-build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh
index d2f09ffd2e001..64c577235b911 100755
--- a/tsi-pkg-build.sh
+++ b/tsi-pkg-build.sh
@@ -6,7 +6,7 @@ echo 'updating submodule'
 git submodule update --recursive --init
 cd ggml-tsi-kernel/
 module load tsi4 gcc/13.3.0
-export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.3
+export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.4
 echo 'creating python virtual env'
 /proj/local/Python-3.10.12/bin/python3 -m venv blob-creation
 source blob-creation/bin/activate
@@ -48,7 +48,7 @@ cmake --build build-fpga --config Release
 
 echo 'creating tar bundle for fpga'
 
-TSI_GGML_VERSION=0.0.2
+TSI_GGML_VERSION=0.0.3
 TSI_GGML_BUNDLE_INSTALL_DIR=tsi-ggml
 GGML_TSI_INSTALL_DIR=ggml-tsi-kernel
 TSI_GGML_RELEASE_DIR=/proj/rel/sw/ggml/

From 1aa54e42c39392548df5cb53fb06659cf1592446 Mon Sep 17 00:00:00 2001
From: "M.Mankali"
Date: Fri, 20 Jun 2025 07:42:44 -0700
Subject: [PATCH 35/35] @FIR-760: Integrate copy2fpga file transfer into
 llama.cpp

---
 tools/flaskIfc/copy2fpga-setup.sh          | 12 +++++
 tools/flaskIfc/copy2fpga-x86               | Bin 0 -> 21960 bytes
 tools/flaskIfc/copy2fpga-x86.sh            |  9 ++++
 tools/flaskIfc/flaskIfc.py                 | 50 ++++++++++++++++++++-
 tools/flaskIfc/templates/uploadtofpga.html | 14 ++++++
 5 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100755 tools/flaskIfc/copy2fpga-setup.sh
 create mode 100755 tools/flaskIfc/copy2fpga-x86
 create mode 100755 tools/flaskIfc/copy2fpga-x86.sh
 create mode 100644 tools/flaskIfc/templates/uploadtofpga.html

diff --git a/tools/flaskIfc/copy2fpga-setup.sh b/tools/flaskIfc/copy2fpga-setup.sh
new file mode 100755
index 0000000000000..9ccbe55d3939f
--- /dev/null
+++ b/tools/flaskIfc/copy2fpga-setup.sh
@@ -0,0 +1,12 @@
+
+echo " Remove the device "
+sudo bash -c "echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/remove"
+
+echo "rescan"
+sudo bash -c "echo 1 > /sys/bus/pci/rescan"
+
+echo " dump the pci data"
+/aws/proj/rel/sw/platform/release_v0.1.1.tsv026_04_15_2025/scripts/dump-pci.sh
+
+echo " set the pci bit to access mem"
+sudo setpci -s 0000:01:00.0 COMMAND=0x02
diff --git a/tools/flaskIfc/copy2fpga-x86 b/tools/flaskIfc/copy2fpga-x86
new file mode 100755
index 0000000000000000000000000000000000000000..7a0e76a65b5b343e26205d378b8994c314a57c41
GIT binary patch
literal 21960
[base85-encoded binary payload of the copy2fpga-x86 executable omitted]
literal 0
HcmV?d00001

diff --git a/tools/flaskIfc/copy2fpga-x86.sh b/tools/flaskIfc/copy2fpga-x86.sh
new file mode 100755
index 0000000000000..d214838f6b52c
--- /dev/null
+++ b/tools/flaskIfc/copy2fpga-x86.sh
@@ -0,0 +1,9 @@
+#! /bin/bash
+# This file runs the PCIE setup needed for file transfer.
+# Also, it invokes the file transfer utility: copy2fpga-x86
+# Note: sudo permissions are needed for file transfer
+#
+echo " Inside copy2fpga-x86.sh "
+sudo ./copy2fpga-setup.sh
+echo "sudo ./copy2fpga-x86 $1"
+sudo ./copy2fpga-x86 $1
diff --git a/tools/flaskIfc/flaskIfc.py b/tools/flaskIfc/flaskIfc.py
index 34b9fc5970522..38e20b4d1dfbf 100644
--- a/tools/flaskIfc/flaskIfc.py
+++ b/tools/flaskIfc/flaskIfc.py
@@ -4,6 +4,9 @@
 import time
 from werkzeug.utils import secure_filename
 import os
+import subprocess
+import mmap
+
 
 job_status = {"running": False, "result": "", "thread": None}
@@ -13,7 +16,8 @@
 #port = '/dev/ttyUSB2'
 baudrate = '921600'
 #baudrate = '115200'
-exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/"
+#exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/"
+exe_path = "/usr/bin/tsi/v0.1.1*/bin/"
 
 DEFAULT_REPEAT_PENALTY = 1.5
@@ -106,6 +110,50 @@ def upload_serial_command():
 #    except subprocess.CalledProcessError as e:
 #        return f"Error executing script: {e.stderr}", 500
 
+@app.route('/uploadtofpga-file', methods=['GET', 'POST'])
+def uploadtofpga_file():
+    setupprints = "Before:Copy2fpga-setup.sh"
+    print(setupprints)
+
+    if request.method == 'POST':
+        # Check if a file was submitted
+        if 'file' not in request.files:
+            return "No file part"
+        file = request.files['file']
+
+        # Check if the file is empty
+        if file.filename == '':
+            return "No file selected"
+
+        # Save the file if it exists
+        if file:
+            filename = secure_filename(file.filename)
+            process = subprocess.Popen(["./copy2fpga-x86.sh", filename], text=True)
+            copy2fpgax86prints = "Starting copy2fpga-x86 sending file..."
+            print(copy2fpgax86prints)
+            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
+
+            script_path = "./recvFromHost "
+            command = f"cd {exe_path}; {script_path} {filename}"
+            def scriptRecvFromHost():
+                try:
+                    result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True)
+                    job_status["result"] = result.stdout
+                    print("FPGA Target ready to receive file: recvFromHost started..\n")
+                    print(result.stdout)
+                    recv_output = result.stdout
+                except subprocess.CalledProcessError as e:
+                    job_status["result"] = f"Error: {e.stderr}"
+                finally:
+                    job_status["running"] = False
+            thread = threading.Thread(target=scriptRecvFromHost)
+            job_status = {"running": True, "result": "", "thread": thread}
+            thread.start()
+
+            stdout, stderr = process.communicate()
+            return render_template('uploadtofpga.html', apple=process, recvoutput=f"On FPGA Target, recvFromHost completed; transferred file: {filename} received")
+    return render_template('upload.html')  # Display the upload form
+
 @app.route('/upload-file', methods=['GET', 'POST'])
 def upload_file():
     if request.method == 'POST':
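An editorial usage sketch (not part of the patch): exercising the new upload route from a client. The host and port are assumed from the URL used earlier in this series; the form field name "file" matches what the route reads from request.files:

    # Hypothetical client for the new /uploadtofpga-file route. Sends a file
    # as multipart/form-data; the server then runs copy2fpga-x86.sh locally
    # and starts recvFromHost on the FPGA target over the serial console.
    import requests

    url = "http://10.50.0.124:5003/uploadtofpga-file"  # assumed host/port
    with open("tinyllama-vo-5m-para.gguf", "rb") as f:
        resp = requests.post(url, files={"file": f})
    print(resp.status_code)
    print(resp.text)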
diff --git a/tools/flaskIfc/templates/uploadtofpga.html b/tools/flaskIfc/templates/uploadtofpga.html
new file mode 100644
index 0000000000000..97445c1b68622
--- /dev/null
+++ b/tools/flaskIfc/templates/uploadtofpga.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>File Transfer In Progress...</title>
+</head>
+<body>
+    <h1>File Transfer Started.</h1>
+    <h2>Running copy2fpga-x86.sh</h2>
+
+    <p>{{ apple }}</p>
+
+    <p>{{ recvoutput }}</p>
+</body>
+</html>
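A closing editorial note: the route above follows the same background-job pattern as the rest of flaskIfc.py, parking the serial-side transfer in a thread and recording its outcome in the module-level job_status dict. A status-polling endpoint in that style might look like the sketch below; this is an assumption for illustration, as no such route appears in this diff:

    # Editorial sketch of a status-poll endpoint (assumed, not in the patch):
    # reads the module-level job_status dict that scriptRecvFromHost()
    # updates when the serial transfer finishes.
    from flask import Flask, jsonify

    app = Flask(__name__)
    job_status = {"running": False, "result": "", "thread": None}

    @app.route('/job-status')
    def job_status_view():
        return jsonify(running=job_status["running"], result=job_status["result"])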