2626 * - GGML_OP_ADD & GGML_OP_MUL_MAT:
2727 * this is a hwaccel skeleton, can expand other ggml ops accordingly
2828 *
29- * Permission is hereby granted, free of charge, to any person obtaining a copy
30- * of this software and associated documentation files (the "Software"), to
31- * deal in the Software without restriction, including without limitation the
32- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
33- * sell copies of the Software, and to permit persons to whom the Software is
34- * furnished to do so, subject to the following conditions:
35- *
36- * The above copyright notice and this permission notice shall be included in
37- * all copies or substantial portions of the Software.
38- *
39- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
40- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
43- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
44- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
45- * IN THE SOFTWARE.
4629 */
4730#include < stdio.h>
4831#include < stdlib.h>
@@ -383,7 +366,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
383366 .hexagon_backend = HEXAGON_BACKEND_CDSP,
384367 .enable_rpc_ion_mempool = 0 ,
385368 .enable_all_q_mulmat = 0 ,
386- .profiler_duration = 5 ,
369+ .profiler_duration = 5 , // seconds
387370 .profiler_counts = 100 ,
388371 .thread_counts = 4 ,
389372 .cfgfilename = " ggml-hexagon.cfg" ,
@@ -398,7 +381,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = {
398381#elif defined(_WIN32)
399382 .qnn_runtimelib_path = " C:\\ " ,
400383#endif
401- .ggml_hexagon_version = {" 1.07 " },
384+ .ggml_hexagon_version = {" 1.08 " },
402385 .ggml_dsp_version = {" 0.63" },
403386};
404387
@@ -835,21 +818,25 @@ static const char * ggmlhexagon_get_hwaccel_approach_name(int hwaccle_approach)
835818}
836819
837820static void ggmlhexagon_get_timestring (char * p_currenttime) {
838- #if defined(__ANDROID__) || defined(__linux__)
839- time_t n_seconds = 0 ;
840- struct tm now_time;
841-
842821 if (nullptr == p_currenttime)
843822 return ;
844823
845- time (&n_seconds);
846- localtime_r (&n_seconds, &now_time);
847- snprintf (p_currenttime, GGMLHEXAGON_TMPBUF_LEN, " %04d-%02d-%02d,%02d:%02d:%02d" ,
848- now_time.tm_year + 1900 , now_time.tm_mon + 1 , now_time.tm_mday ,
849- now_time.tm_hour , now_time.tm_min , now_time.tm_sec );
850- #else
851- // TODO: WoA
852- #endif
824+ auto time_to_string = [](const std::chrono::system_clock::time_point & tp)->std ::string {
825+ auto as_time_t = std::chrono::system_clock::to_time_t (tp);
826+ struct tm tm;
827+
828+ localtime_r (&as_time_t , &tm);
829+
830+ std::chrono::milliseconds ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp.time_since_epoch ());
831+ char buf[GGMLHEXAGON_TMPBUF_LEN];
832+ memset (buf, 0 , GGMLHEXAGON_TMPBUF_LEN);
833+ snprintf (buf, sizeof (buf), " %04d-%02d-%02d,%02d:%02d:%02d" ,
834+ tm.tm_year + 1900 , tm.tm_mon + 1 , tm.tm_mday , tm.tm_hour , tm.tm_min , tm.tm_sec );
835+ return buf;
836+ };
837+
838+ std::chrono::system_clock::time_point tp = std::chrono::system_clock::now ();
839+ snprintf (p_currenttime, GGMLHEXAGON_TMPBUF_LEN, " %s" , time_to_string (tp).c_str ());
853840}
854841
855842static void ggmlhexagon_log_internal (ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) {
@@ -1791,8 +1778,7 @@ static void * ggmlhexagon_type_trait(ggml_backend_hexagon_context * ctx, ggml_te
17911778
17921779 const int min_cols_per_thread = 4096 ;
17931780 const int min_rows_per_thread = std::max ((int )(min_cols_per_thread / ne00), 1 );
1794- const int n_threads = std::max (
1795- std::min (ctx->n_threads , (int )(ne01 / min_rows_per_thread)), 1 );
1781+ const int n_threads = std::max (std::min (ctx->n_threads , (int )(ne01 / min_rows_per_thread)), 1 );
17961782 for (int i = 1 ; i < n_threads; i++) {
17971783 const int64_t start = i * ne01 / n_threads;
17981784 const int64_t end = (i + 1 ) * ne01 / n_threads;
@@ -1946,6 +1932,13 @@ static bool ggmlhexagon_check_valid_appcfg() {
19461932 is_valid_appcfg = false ;
19471933 }
19481934
1935+ if (HWACCEL_QNN == g_hexagon_appcfg.hwaccel_approach ) {
1936+ if (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend ) {
1937+ GGMLHEXAGON_LOG_INFO (" hexagon_backend HEXAGON_BACKEND_CDSP must match with hwaccel_approach HWACCEL_CDSP" );
1938+ is_valid_appcfg = false ;
1939+ }
1940+ }
1941+
19491942 if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach ) {
19501943 if ((HEXAGON_BACKEND_CDSP != g_hexagon_appcfg.hexagon_backend ) && (HEXAGON_BACKEND_GGML != g_hexagon_appcfg.hexagon_backend )) {
19511944 GGMLHEXAGON_LOG_INFO (" hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP" );
@@ -2271,6 +2264,7 @@ static int ggmlqnn_free_qnntensor(Qnn_Tensor_t * tensor) {
22712264 free (src_qparam.bwAxisScaleOffsetEncoding .offsets );
22722265 }
22732266 }
2267+ GGMLHEXAGON_LOG_DEBUG (" free tensor %p" , tensor);
22742268 free (ggmlqnn_get_tensor_dimensions (*tensor));
22752269 free (tensor);
22762270
@@ -2804,15 +2798,12 @@ void qnn_instance::free_rpcmem(void * buf) {
28042798 GGMLHEXAGON_LOG_WARN (" no allocated tensor\n " );
28052799 } else {
28062800 GGMLHEXAGON_LOG_DEBUG (" free rpc mem %p" , _rpcmem_store_map[buf]);
2807- for (std::unordered_map<void *, size_t >::iterator it = _rpcmem_usage_map.begin ();
2808- it != _rpcmem_usage_map.end ();
2809- it++) {
2810- void * rpcbuffer = it->first ;
2801+ for (const auto & [rpcbuffer, rpcbuffer_size] : _rpcmem_usage_map) {
28112802 if (buf == rpcbuffer) {
2812- rpcbuffer_size = it->second ;
28132803 _rpcmem_usage -= rpcbuffer_size;
28142804 }
28152805 }
2806+
28162807 if (rpcbuffer_size != 0 ) {
28172808 _rpcmem_usage_map.erase (buf);
28182809 }
@@ -2827,13 +2818,11 @@ void qnn_instance::free_rpcmem() {
28272818 return ;
28282819 }
28292820
2830- for (std::unordered_map<void *, void *>::iterator it = _rpcmem_store_map.begin ();
2831- it != _qnn_mem_set.end ();
2832- it++) {
2833- void * rpcbuffer = it->second ;
2821+ for (const auto & [rpcbuffer, raw_rpcbuffer] : _rpcmem_store_map) {
28342822 GGMLHEXAGON_LOG_DEBUG (" free rpc buffer %p" , rpcbuffer);
28352823 _pfn_rpc_mem_free (rpcbuffer);
28362824 }
2825+
28372826 _rpcmem_store_map.clear ();
28382827 _rpcmem_usage_map.clear ();
28392828 _rpcmem_usage = 0 ;
@@ -2937,14 +2926,12 @@ Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t ran
29372926}
29382927
29392928void * qnn_instance::get_rpcmem_from_memhandle (Qnn_MemHandle_t mem_handle) {
2940- for (std::unordered_map<void *, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin ();
2941- it != _qnn_mem_set.end ();
2942- it++) {
2943- Qnn_MemHandle_t mem_handle = it->second ;
2944- if (it->second == mem_handle) {
2945- return it->first ;
2929+ for (const auto & [ptr, handle] : _qnn_mem_set) {
2930+ if (mem_handle == handle) {
2931+ return ptr;
29462932 }
29472933 }
2934+
29482935 GGMLHEXAGON_LOG_WARN (" can't find rpcmem from qnn mem handle %p" , mem_handle);
29492936 return nullptr ;
29502937}
@@ -2956,11 +2943,8 @@ void qnn_instance::unregister_rpcmem() {
29562943 GGMLHEXAGON_LOG_WARN (" no rpcmem registered\n " );
29572944 }
29582945
2959- for (std::unordered_map<void *, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin ();
2960- it != _qnn_mem_set.end ();
2961- it++) {
2962- Qnn_MemHandle_t mem_handle = it->second ;
2963- error = _qnn_interface.qnn_mem_de_register (&mem_handle, 1 );
2946+ for (const auto & [ptr, mem_handle] : _qnn_mem_set) {
2947+ auto error = _qnn_interface.qnn_mem_de_register (&mem_handle, 1 );
29642948 if (error != QNN_SUCCESS) {
29652949 GGMLHEXAGON_LOG_WARN (" failed to unregister shared memory, error %d\n " , QNN_GET_ERROR_CODE (error));
29662950 } else {
@@ -5187,14 +5171,22 @@ static int ggmlhexagon_request_status_notifications(int domain_id, void * contex
51875171static int ggmlhexagon_init_rpcmempool (ggml_backend_hexagon_context * ctx) {
51885172 size_t candidate_size = 0 ;
51895173 uint8_t * rpc_buffer = nullptr ;
5174+ #ifdef SD_USE_HEXAGON // for stable-diffusion.cpp
5175+ size_t probe_slots[] = {1024 , 1536 , 2000 , 2048 , 1024 + 2048 , 4096 };
5176+ #else
51905177 size_t probe_slots[] = {1024 , 1536 , 2000 , 2048 };
5178+ #endif
51915179 size_t probe_counts = sizeof (probe_slots) / sizeof (size_t );
51925180
51935181 if (nullptr == ctx)
51945182 return 1 ;
51955183
51965184 for (size_t idx = 0 ; idx < probe_counts; idx++) {
5185+ #ifdef SD_USE_HEXAGON // for stable-diffusion.cpp
5186+ rpc_buffer = static_cast <uint8_t *>(rpcmem_alloc2 (RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB)));
5187+ #else
51975188 rpc_buffer = static_cast <uint8_t *>(rpcmem_alloc (RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB)));
5189+ #endif
51985190 if (nullptr == rpc_buffer) {
51995191 GGMLHEXAGON_LOG_DEBUG (" alloc rpcmem %d (MiB) failure during probe rpc memory info, reason: %s\n " , probe_slots[idx], strerror (errno));
52005192 break ;
@@ -5212,9 +5204,12 @@ static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) {
52125204 if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool )) {
52135205 GGML_ASSERT (ctx->rpc_mempool_capacity > (8 * SIZE_IN_MB));
52145206 ctx->rpc_mempool_len = ctx->rpc_mempool_capacity - (8 * SIZE_IN_MB);
5215-
5207+ #ifdef SD_USE_HEXAGON // use rpcmem_alloc2 to alloc 2+ GiB memory, it's a workaround to make stablediffusion.cpp happy
5208+ ctx->rpc_mempool = rpcmem_alloc2 (RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len );
5209+ #else
52165210 // FIXME: it seems there is unknown issue with 2+ GiB memory pool
52175211 ctx->rpc_mempool = rpcmem_alloc (RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len );
5212+ #endif
52185213 if (nullptr == ctx->rpc_mempool ) {
52195214 GGMLHEXAGON_LOG_WARN (" alloc rpc memorypool %ld(%d MiB) failed" , ctx->rpc_mempool_len , ctx->rpc_mempool_capacity / SIZE_IN_MB);
52205215 return 2 ;
@@ -5586,7 +5581,6 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const
55865581 {
55875582 // TODO:workaround approach to fix HWACCEL_CDSP can't works in ASR inference and LLM inference
55885583 // with some LLM models in a standard Android APP
5589- // one more thing, I think the latest QNN SDK's internal also use the similar approach
55905584 if (ne00 < 1024 ) {
55915585 return false ;
55925586 }
@@ -6039,18 +6033,16 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
60396033
60406034 qnn_instance * instance = (qnn_instance*)g_hexagon_mgr[ctx->device ].instance ;
60416035 if (nullptr != instance) {
6042- std::map<std::string, qnn_singlenode_res_t >::iterator singlenode_graph_it;
6043- for (singlenode_graph_it = ctx->qnn_singlenode_graph_map .begin ();
6044- singlenode_graph_it != ctx->qnn_singlenode_graph_map .end (); singlenode_graph_it++) {
6045- auto & graph_res = singlenode_graph_it->second ;
6046- Qnn_GraphHandle_t & graph_handle = std::get<0 >(graph_res);
6047- qnn_ptensors_t & ptensors = std::get<1 >(graph_res);
6048- for (auto tensor_it = ptensors.begin (); tensor_it != ptensors.end (); ++tensor_it) {
6049- ggmlqnn_free_qnntensor (*tensor_it);
6036+ for (auto & [graph_name, graph_res] : ctx->qnn_singlenode_graph_map ) {
6037+ auto & graph_handle = std::get<0 >(graph_res);
6038+ auto & ptensors = std::get<1 >(graph_res);
6039+ for (auto & tensor : ptensors) {
6040+ ggmlqnn_free_qnntensor (tensor);
60506041 }
6051- GGML_UNUSED ( graph_handle);
6052- GGMLHEXAGON_LOG_DEBUG (" clean up graph:%s" , singlenode_graph_it-> first .c_str ());
6042+ GGMLHEXAGON_LOG_DEBUG ( " graph handle %p " , graph_handle);
6043+ GGMLHEXAGON_LOG_DEBUG (" clean up graph:%s" , graph_name .c_str ());
60536044 }
6045+
60546046 ctx->qnn_singlenode_graph_map .clear ();
60556047
60566048 instance->qnn_finalize ();
@@ -6225,18 +6217,22 @@ static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_
62256217static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type (size_t device_index) {
62266218 static std::mutex mutex;
62276219 std::lock_guard<std::mutex> lock (mutex);
6228- GGMLHEXAGON_LOG_DEBUG (" enter %s" , __func__);
6220+ GGMLHEXAGON_LOG_DEBUG (" enter %s, device_index %d " , __func__, device_index );
62296221 if (device_index >= GGML_HEXAGON_MAX_DEVICES) {
62306222 GGMLHEXAGON_LOG_DEBUG (" ggml_backend_hexagon_buffer_type error: device_index:%d is out of range [0, %d]\n " ,
62316223 device_index, GGML_HEXAGON_MAX_DEVICES - 1 );
62326224 return nullptr ;
62336225 }
62346226
6235- if (device_index != ( size_t )( g_hexagon_appcfg.hexagon_backend ) ) {
6227+ if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach ) {
62366228 // cover following special case:
6237- // toggle backend and forth between cDSP and ggml in a standard Android APP or in
6229+ // toggle back and forth frequently between cDSP and ggml in a standard Android APP or in
62386230 // a same running process
6239- g_hexagon_appcfg.hexagon_backend = device_index;
6231+ if (device_index != (size_t )(g_hexagon_appcfg.hexagon_backend )) {
6232+ GGMLHEXAGON_LOG_INFO (" device_index %d, backend %d" , device_index, g_hexagon_appcfg.hexagon_backend );
6233+
6234+ g_hexagon_appcfg.hexagon_backend = device_index;
6235+ }
62406236 }
62416237
62426238 static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_types[GGML_HEXAGON_MAX_DEVICES];
@@ -6263,7 +6259,7 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device
62636259 if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach ) {
62646260 GGML_ASSERT (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend );
62656261 // FIXME:this is workaround for cover following special case:
6266- // toggle back and forth between cDSP and ggml in a standard Android APP or in a same running process
6262+ // toggle back and forth frequently between cDSP and ggml in a standard Android APP or in a same running process
62676263 // there is unknown issue with this workaround when toggle back and forth frequently in a standard Android APP
62686264 int result = ggmlhexagon_init_dsp (&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]);
62696265 if (0 != result) {
@@ -6426,7 +6422,6 @@ static void ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_thr
64266422
64276423int ggml_backend_hexagon_get_device_count () {
64286424 if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) {
6429- // here is the trick:
64306425 // there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
64316426 // so return 1
64326427 return 1 ;
@@ -6465,7 +6460,6 @@ static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg)
64656460 GGML_UNUSED (reg);
64666461 if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach ) {
64676462 GGML_ASSERT (g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP);
6468- // here is the trick:
64696463 // there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
64706464 // so return 1
64716465 return 1 ;
@@ -6483,7 +6477,6 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t
64836477 ggml_backend_hexagon_reg_context * ctx = (ggml_backend_hexagon_reg_context *)reg->context ;
64846478 if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach ) {
64856479 GGML_ASSERT (g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP);
6486- // here is the trick:
64876480 // there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
64886481 // so return ctx->devices[0]
64896482 return ctx->devices [0 ];
@@ -6517,7 +6510,7 @@ static const ggml_backend_reg_i ggml_backend_hexagon_reg_interface = {
65176510ggml_backend_reg_t ggml_backend_hexagon_reg () {
65186511 static ggml_backend_reg reg;
65196512 // TODO: the existing codes can't cover following special case:
6520- // toggle back and forth between QNN-NPU and cDSP and ggml in a standard Android APP or in
6513+ // toggle back and forth frequently between QNN-NPU and cDSP and ggml in a standard Android APP or in
65216514 // a same running process
65226515 // supportive of such special case is easy but it will significantly increase the size of APK
65236516 static bool initialized = false ;
@@ -6556,7 +6549,6 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() {
65566549 /* .context = */ &g_hexagon_mgr[i]
65576550 };
65586551 if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach ) {
6559- // here is the trick:
65606552 // there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP
65616553 // so context is g_hexagon_mgr[HEXAGON_BACKEND_CDSP] rather than g_hexagon_mgr[0]
65626554 // attention here:
@@ -6565,7 +6557,7 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() {
65656557
65666558 ctx->devices .push_back (dev);
65676559
6568- // here is the trick: make cDSP rpc memory pool happy because ggml's backend subsystem need this
6560+ // make cDSP rpc memory pool happy because ggml's backend subsystem need this
65696561 if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach ) {
65706562 GGML_ASSERT (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend );
65716563 int result = ggmlhexagon_init_dsp (&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]);
@@ -6610,7 +6602,7 @@ const char * ggml_backend_hexagon_get_devname(size_t dev_num) {
66106602
66116603static qnn_instance * ggmlqnn_init_qnn_instance (size_t device, const char * qnn_lib_path) {
66126604 int result = 0 ;
6613- GGMLHEXAGON_LOG_INFO (" hwaccel approach=%d(%s)" , g_hexagon_appcfg.hwaccel_approach ,
6605+ GGMLHEXAGON_LOG_INFO (" device=%d, hwaccel approach=%d(%s)" , device , g_hexagon_appcfg.hwaccel_approach ,
66146606 ggmlhexagon_get_hwaccel_approach_name (g_hexagon_appcfg.hwaccel_approach ));
66156607
66166608 qnn_instance * instance = nullptr ;
0 commit comments