l3utterfly
diff --git a/‎.gitignore‎
Lines changed: 10 additions & 1 deletion b/‎.gitignore‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 19 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎common/CMakeLists.txt‎
Lines changed: 35 additions & 4 deletions b/‎common/CMakeLists.txt‎
Lines changed: 35 additions & 4 deletions
diff --git a/‎common/common.cpp‎
Lines changed: 3 additions & 0 deletions b/‎common/common.cpp‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎common/sampling.cpp‎
Lines changed: 32 additions & 0 deletions b/‎common/sampling.cpp‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎common/sampling.h‎
Lines changed: 3 additions & 0 deletions b/‎common/sampling.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎ggml/CMakeLists.txt‎
Lines changed: 9 additions & 0 deletions b/‎ggml/CMakeLists.txt‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎ggml/include/ggml-backend.h‎
Lines changed: 1 addition & 0 deletions b/‎ggml/include/ggml-backend.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ggml/include/ggml-hexagon.h‎
Lines changed: 51 additions & 0 deletions b/‎ggml/include/ggml-hexagon.h‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎ggml/include/ggml.h‎
Lines changed: 1 addition & 0 deletions b/‎ggml/include/ggml.h‎
Lines changed: 1 addition & 0 deletions
@@ -77,7 +77,6 @@ autogen-*.md
 !.github/workflows/*.yml
 
 # Models
-
 models/*
 models-mnt
 !models/.editorconfig
@@ -147,3 +146,13 @@ poetry.toml
 # Local scripts
 /run-vim.sh
 /run-chat.sh
+
+# Hexagon tools and prebuilt SDKs (QNN, Hexagon, OpenCL, Vulkan)
+HEXAGON_Tools/
+prebuilts/QNN_SDK/qairt/2.35.0.250530/
+prebuilts/QNN_SDK/qairt/2.36.0.250627/
+prebuilts/QNN_SDK/v2.35.0.250530.zip
+prebuilts/QNN_SDK/v2.36.0.250627.zip
+prebuilts/Hexagon_SDK/minimal-hexagon-sdk-6.2.0.1.xz
+prebuilts/OpenCL_SDK/
+prebuilts/Vulkan_SDK/
@@ -7,6 +7,24 @@ set(CMAKE_WARN_UNUSED_CLI YES)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+    # Extra optimisation flags for non-Debug Android builds, applied only when HTP_ARCH_VERSION is defined as v75 or v79.
+    if(DEFINED HTP_ARCH_VERSION AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+        # Quoted so that an empty HTP_ARCH_VERSION compares as a string instead of breaking the if() expression.
+        if("${HTP_ARCH_VERSION}" STREQUAL "v75" OR "${HTP_ARCH_VERSION}" STREQUAL "v79")
+            # Alternative, CPU-specific set: works fine on Snapdragon 8 Gen 3 and 8 Elite, with 1.5x - 3x performance gains with the default ggml backend.
+            #set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -ffp-model=fast -fno-finite-math-only")
+            # The set below is more general: it drops the Cortex CPU tuning, which is only available on very recent architectures.
+            set(OPT_FLAG " -O3 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only")
+            # NOTE(review): DEBUG_FLAG is not set in this block, so it expands to nothing unless it is defined elsewhere.
+            message(STATUS "OPT_FLAG:${OPT_FLAG}")
+            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${DEBUG_FLAG} ${OPT_FLAG}")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DEBUG_FLAG} ${OPT_FLAG}")
+            set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${DEBUG_FLAG} ${OPT_FLAG}")
+            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${DEBUG_FLAG} ${OPT_FLAG}")
+        endif()
+    endif()
+endif()
+
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
@@ -127,6 +145,7 @@ llama_option_depr(WARNING     LLAMA_RPC                 GGML_RPC)
 llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
 llama_option_depr(WARNING     LLAMA_CANN                GGML_CANN)
+llama_option_depr(WARNING     LLAMA_HEXAGON             GGML_HEXAGON)
 
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)
 
@@ -6,9 +6,8 @@ llama_add_compile_flags()
 
 # Build info header
 #
-
-if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
-    set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
 
     # Is git submodule
     if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,7 +17,7 @@ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
         if (SLASH_POS EQUAL 0)
             set(GIT_DIR "${REAL_GIT_DIR}")
         else()
-            set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
+            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
         endif()
     endif()
 
@@ -32,6 +31,38 @@ else()
     message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
 endif()
 
+# Build metadata variables (presumably substituted into build-info.cpp.in by the
+# configure_file() call that follows - confirm against the template).
+# Start from fallbacks so every variable has a value even when Git is missing,
+# the repository metadata is absent, or a Git command fails.
+set(LLAMA_BUILD_COMMIT "unknown")
+set(LLAMA_BUILD_NUMBER 0)
+
+# Find Git executable (optional: the fallbacks above are kept when it is not found)
+find_package(Git)
+
+if(GIT_FOUND AND EXISTS "${GIT_DIR}")
+    # Get git commit hash
+    execute_process(
+        COMMAND "${GIT_EXECUTABLE}" rev-parse --short HEAD
+        WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+        RESULT_VARIABLE git_commit_status
+        OUTPUT_VARIABLE git_commit_hash
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    # Errors are silenced above, so the exit status is the only failure signal
+    if(git_commit_status EQUAL 0 AND NOT "${git_commit_hash}" STREQUAL "")
+        set(LLAMA_BUILD_COMMIT "${git_commit_hash}")
+    endif()
+
+    # Get build number from git commit count
+    execute_process(
+        COMMAND "${GIT_EXECUTABLE}" rev-list --count HEAD
+        WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+        RESULT_VARIABLE git_count_status
+        OUTPUT_VARIABLE git_commit_count
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    if(git_count_status EQUAL 0 AND NOT "${git_commit_count}" STREQUAL "")
+        set(LLAMA_BUILD_NUMBER "${git_commit_count}")
+    endif()
+endif()
+
+# Set compiler info
+set(BUILD_COMPILER "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
+
+# Set build target
+set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
+
 set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
 set(OUTPUT_FILE   "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
 configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
 
@@ -1109,6 +1109,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();
 
     if (!params.devices.empty()) {
+        // add nullptr to the end just in case
+        params.devices.push_back(nullptr);
+
         mparams.devices = params.devices.data();
     }
 
 
@@ -62,6 +62,17 @@ struct ring_buffer {
         return value;
     }
 
+    // Removes and returns the element stored in the slot just before `pos`.
+    // Throws std::runtime_error when the buffer holds no elements.
+    T pop_back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        // Step pos back by one slot; slot 0 wraps around to the last slot (capacity - 1)
+        if (pos == 0) {
+            pos = capacity;
+        }
+        pos -= 1;
+        T popped = data[pos];
+        sz -= 1;
+        return popped;
+    }
+
     const T & rat(size_t i) const {
         if (i >= sz) {
             throw std::runtime_error("ring buffer: index out of bounds");
@@ -313,6 +324,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
     llama_sampler_reset(gsmpl->chain);
 }
 
+// Replaces the grammar sampler of `gsmpl` with one built from `grammar` (start rule "root"),
+// using the vocabulary of `model`.
+//
+// The previous grammar sampler is freed before the pointer is replaced, so repeated calls do
+// not leak it. If llama_sampler_init_grammar returns nullptr, the current grammar sampler is
+// kept (and reset), so gsmpl->grmr is never set to a null pointer by this function.
+void common_sampler_reinit_grammar(struct common_sampler * gsmpl, const struct llama_model * model, const char * grammar) {
+    struct llama_sampler * grmr_new = llama_sampler_init_grammar(llama_model_get_vocab(model), grammar, "root");
+    if (grmr_new == nullptr) {
+        llama_sampler_reset(gsmpl->grmr);
+        return;
+    }
+
+    llama_sampler_free(gsmpl->grmr);
+    gsmpl->grmr = grmr_new;
+}
+
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
     return new common_sampler {
         /* .params = */ gsmpl->params,
@@ -466,6 +483,21 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_
     return result;
 }
 
+// Returns the sampler's token history (gsmpl->prev) converted to a vector, by value.
+const std::vector<llama_token> common_sampler_prev(common_sampler * gsmpl) {
+    auto history = gsmpl->prev.to_vector();
+    return history;
+}
+
+// Drops the last `rollback_num` tokens from the sampler's token history (gsmpl->prev).
+//
+// - rollback_num <= 0 is a no-op. A negative count must not reach the unsigned comparison
+//   below, where it would turn into a huge value and clear the whole history.
+// - a count larger than the stored history is clamped to the history size.
+//
+// Only gsmpl->prev is modified here; no other sampler state is touched by this function.
+void common_sampler_rollback(common_sampler * gsmpl, int rollback_num) {
+    if (rollback_num <= 0) {
+        return;
+    }
+
+    size_t n_drop = (size_t) rollback_num;
+    if (n_drop > gsmpl->prev.size()) {
+        n_drop = gsmpl->prev.size();
+    }
+
+    // continuously pop the last token
+    for (size_t i = 0; i < n_drop; i++) {
+        gsmpl->prev.pop_back();
+    }
+}
+
 char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
     switch (cnstr) {
         case COMMON_SAMPLER_TYPE_DRY:         return 'd';
 
@@ -43,6 +43,7 @@ void common_sampler_free(struct common_sampler * gsmpl);
 // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
 void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
 void                    common_sampler_reset (struct common_sampler * gsmpl);
+void                    common_sampler_reinit_grammar(struct common_sampler * gsmpl, const struct llama_model * model, const char * grammar);
 struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 
 // arguments can be nullptr to skip printing
@@ -96,6 +97,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl);
 
 // get a string representation of the last accepted tokens
 std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
+const std::vector<llama_token> common_sampler_prev(common_sampler * gsmpl);
+void common_sampler_rollback(common_sampler * gsmpl, int rollback_num);
 
 char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
 std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
 
@@ -211,6 +211,7 @@ option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"
 option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON)
 set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                             "gmml: OpenCL API version to target")
+option(GGML_HEXAGON                         "ggml: use HEXAGON"                               OFF)
 
 # toolchain for vulkan-shaders-gen
 set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
@@ -276,9 +277,17 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-sycl.h
     include/ggml-vulkan.h
     include/ggml-webgpu.h
+    include/ggml-hexagon.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+
+# Link the Android log library into ggml (Android builds only).
+# Fail at configure time with a clear message if the library cannot be located,
+# instead of passing a NOTFOUND value on to target_link_libraries().
+if(ANDROID)
+    find_library(log-lib log)
+    if(NOT log-lib)
+        message(FATAL_ERROR "ggml: Android log library not found")
+    endif()
+    target_link_libraries(ggml PRIVATE "${log-lib}")
+endif()
+
 #if (GGML_METAL)
 #    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
 #endif()
 
@@ -202,6 +202,7 @@ extern "C" {
     //
     // Backend registry
     //
+    GGML_API void               ggml_backend_reg_layla(bool useVulkan, bool useOpenCL, bool useHexagon, bool useMetal);
 
     GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
 
 
@@ -0,0 +1,51 @@
+ /*
+ * Copyright (c) 2024-2025 The ggml authors
+ */
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_HEXAGON_MAX_DEVICES    4
+#define GGML_HEXAGON_BACKEND_NAME   "hexagon"
+
+// Backend identifiers.
+// NOTE(review): presumably these are the values accepted by the `new_hexagon_backend`
+// argument of ggml_backend_hexagon_set_cfg() - confirm against the implementation.
+enum HEXAGONBackend {
+    HEXAGON_BACKEND_QNNCPU  = 0,
+    HEXAGON_BACKEND_QNNGPU  = 1,
+    HEXAGON_BACKEND_QNNNPU  = 2,
+    HEXAGON_BACKEND_CDSP    = 3,
+    HEXAGON_BACKEND_GGML    = 4, // "fake" HEXAGON backend, used to compare performance between the HEXAGON backend and the ggml backend
+};
+
+// Hardware-acceleration approach:
+// 0: general approach through QNN: offload ggml ops to QNN (QNNCPU, QNNGPU, QNNNPU)
+// 1: special approach through QNN-SINGLEGRAPH: map the entire ggml cgraph to a single QNN graph
+// 2: general approach through the Hexagon cDSP: offload ggml ops to the Hexagon cDSP directly
+enum hwaccel_approach_type {
+     HWACCEL_QNN            = 0,
+     HWACCEL_QNN_SINGLEGRAPH= 1,
+     HWACCEL_CDSP           = 2,
+};
+
+// Public entry points of the Hexagon backend. Only the declarations are visible in this
+// header; the notes below are inferred from the signatures and should be confirmed against
+// the implementation.
+
+// NOTE(review): dev_num is presumably a device index below GGML_HEXAGON_MAX_DEVICES and
+// qnn_lib_path the location of the QNN runtime libraries - confirm.
+GGML_BACKEND_API ggml_backend_t     ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path);
+
+GGML_BACKEND_API bool               ggml_backend_is_hexagon(ggml_backend_t backend);
+
+GGML_BACKEND_API int                ggml_backend_hexagon_get_device_count(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+// NOTE(review): dev_num is presumably the same device index used by ggml_backend_hexagon_init() - confirm.
+GGML_BACKEND_API const char *       ggml_backend_hexagon_get_devname(size_t dev_num);
+
+// NOTE(review): the two ints presumably take values of enum HEXAGONBackend and
+// enum hwaccel_approach_type respectively - confirm.
+GGML_BACKEND_API void               ggml_backend_hexagon_set_cfg(int new_hexagon_backend, int new_hwaccel_approach);
+
+// NOTE(review): the valid values of the mulmat algotype are not defined in this header.
+GGML_BACKEND_API int                ggml_backend_hexagon_get_mulmat_algotype(void);
+
+GGML_BACKEND_API void               ggml_backend_hexagon_set_mulmat_algotype(int new_mulmat_algotype);
+
+#ifdef __cplusplus
+}
+#endif
@@ -659,6 +659,7 @@ extern "C" {
 
     // accepts a UTF-8 path, even on Windows
     GGML_API FILE *  ggml_fopen(const char * fname, const char * mode);
+    GGML_API FILE *  ggml_fdopen(int fd, const char * mode, size_t fd_offset);
 
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
Original file line number	Diff line number	Diff line change
`@@ -1109,6 +1109,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {`
`1109`	`1109`	`auto mparams = llama_model_default_params();`
`1110`	`1110`
`1111`	`1111`	`if (!params.devices.empty()) {`
	`1112`	`+ // add nullptr to the end just in case`
	`1113`	`+ params.devices.push_back(nullptr);`
	`1114`	`+`
`1112`	`1115`	`mparams.devices = params.devices.data();`
`1113`	`1116`	`}`
`1114`	`1117`