ggml-org
diff --git a/‎ggml/src/ggml-hexagon/kernels/Makefile‎
Lines changed: 1 addition & 1 deletion b/‎ggml/src/ggml-hexagon/kernels/Makefile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c‎ renamed to ‎ggml/src/ggml-hexagon/kernels/ggml-dsp.c‎
Lines changed: 10 additions & 319 deletions b/‎ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c‎ renamed to ‎ggml/src/ggml-hexagon/kernels/ggml-dsp.c‎
Lines changed: 10 additions & 319 deletions
@@ -9,7 +9,7 @@ CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initi
 
 LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET}
 
-SRCS = ggmlop_cdsp.c  ggmlop_cdsp_skel.c
+SRCS = ggml-dsp.c  ggmlop_cdsp_skel.c
 OBJS = $(patsubst %.c, %.o, $(SRCS))
 
 ALL:$(OBJS)
 
@@ -37,335 +37,26 @@
 #include "hexagon_protos.h"
 
 #include "ggmlop_ap_skel.h"
+#include "ggml-dsp.h"
 
 // =================================================================================================
 //  section-1: forward/prototype declaration,global vars,macros,data structures
 // =================================================================================================
 #define ggml_tensor         dsptensor
 
-#define GGML_MAX_DIMS       4
-
-#define ALIGN_128_BYTE      128
-
-#define GGML_UNUSED(x)      (void)(x)
-
-#define UNUSED              GGML_UNUSED
-
-#define GGML_PAD(x, n)      (((x) + (n) - 1) & ~((n) - 1))
-
-#define GGML_ABORT(...)     ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
-
-#define GGML_ASSERT(x)      if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
-
-#define MIN(a, b)           ((a) < (b) ? (a) : (b))
-#define MAX(a, b)           ((a) > (b) ? (a) : (b))
-
-#if UINTPTR_MAX == 0xFFFFFFFF
-#define GGML_MEM_ALIGN      4
-#else
-#define GGML_MEM_ALIGN      16
-#endif
-
-#define GGML_RESTRICT
-
-#define static_assert(a, b) do { } while (0)
-
-#define GROUP_MAX_EPS 1e-15f
-
-// QK = number of values after dequantization
-// QK_K = super-block size
-#define QK_K 256
-#define K_SCALE_SIZE 12
-
-#define GGML_COMPUTE_FP16_TO_FP32(x)    ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x)    ggml_compute_fp32_to_fp16(x)
-#define GGML_FP32_TO_FP16(x)            GGML_COMPUTE_FP32_TO_FP16(x)
-#define GGML_FP16_TO_FP32(x)            ggml_lookup_fp16_to_fp32(x)
-
-#if 0//def NDEBUG
-#define GGMLQNN_DEBUG                                       0
-#else
-#define GGMLQNN_DEBUG                                       1
-#endif
-
-#define GGMLHEXAGON_LOGBUF_LEN                              4096
-#define GGML_QNN_TMPBUF_LEN                                 256
-#if GGMLQNN_DEBUG
-#define GGMLHEXAGON_LOG_DEBUG(...)                          ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
-#else
-#define GGMLHEXAGON_LOG_DEBUG(...)
-#endif
-#define GGMLQNN_DUMP_TENSOR(tensor)                         ggmlhexagon_dump_tensor(tensor, #tensor)
-
-#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer)->array[0]; \
-    GGML_UNUSED(prefix##0);
-#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer)->array[1]; \
-    GGML_UNUSED(prefix##1);
-#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer)->array[2]; \
-    GGML_UNUSED(prefix##2);
-#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer)->array[3]; \
-    GGML_UNUSED(prefix##3);
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS01 \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
-
-enum ggmlhexagon_log_level {
-    GGMLHEXAGON_LOG_LEVEL_NONE  = 0,
-    GGMLHEXAGON_LOG_LEVEL_DEBUG = 1,
-    GGMLHEXAGON_LOG_LEVEL_INFO  = 2,
-    GGMLHEXAGON_LOG_LEVEL_WARN  = 3,
-    GGMLHEXAGON_LOG_LEVEL_ERROR = 4,
-    GGMLHEXAGON_LOG_LEVEL_CONT  = 5,
-};
-
-enum ggml_type {
-    GGML_TYPE_F32     = 0,
-    GGML_TYPE_F16     = 1,
-    GGML_TYPE_Q4_0    = 2,
-    GGML_TYPE_Q4_1    = 3,
-    // GGML_TYPE_Q4_2 = 4, support has been removed
-    // GGML_TYPE_Q4_3 = 5, support has been removed
-    GGML_TYPE_Q5_0    = 6,
-    GGML_TYPE_Q5_1    = 7,
-    GGML_TYPE_Q8_0    = 8,
-    GGML_TYPE_Q8_1    = 9,
-    GGML_TYPE_Q2_K    = 10,
-    GGML_TYPE_Q3_K    = 11,
-    GGML_TYPE_Q4_K    = 12,
-    GGML_TYPE_Q5_K    = 13,
-    GGML_TYPE_Q6_K    = 14,
-    GGML_TYPE_Q8_K    = 15,
-    GGML_TYPE_IQ2_XXS = 16,
-    GGML_TYPE_IQ2_XS  = 17,
-    GGML_TYPE_IQ3_XXS = 18,
-    GGML_TYPE_IQ1_S   = 19,
-    GGML_TYPE_IQ4_NL  = 20,
-    GGML_TYPE_IQ3_S   = 21,
-    GGML_TYPE_IQ2_S   = 22,
-    GGML_TYPE_IQ4_XS  = 23,
-    GGML_TYPE_I8      = 24,
-    GGML_TYPE_I16     = 25,
-    GGML_TYPE_I32     = 26,
-    GGML_TYPE_I64     = 27,
-    GGML_TYPE_F64     = 28,
-    GGML_TYPE_IQ1_M   = 29,
-    GGML_TYPE_BF16    = 30,
-    // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
-    // GGML_TYPE_Q4_0_4_8 = 32,
-    // GGML_TYPE_Q4_0_8_8 = 33,
-    GGML_TYPE_TQ1_0   = 34,
-    GGML_TYPE_TQ2_0   = 35,
-    // GGML_TYPE_IQ4_NL_4_4 = 36,
-    // GGML_TYPE_IQ4_NL_4_8 = 37,
-    // GGML_TYPE_IQ4_NL_8_8 = 38,
-    GGML_TYPE_COUNT   = 39,
-};
-
-typedef double      ggml_float;
-typedef uint16_t    ggml_fp16_t;
-typedef uint16_t    ggml_half;
-typedef uint32_t    ggml_half2;
-typedef void        (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                        const void * GGML_RESTRICT y, size_t by, int nrc);
-typedef void        (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
-
-typedef void        (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-typedef void        (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
-
-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
-#define QK4_0 32
-typedef struct {
-    ggml_half d;           // delta
-    uint8_t qs[QK4_0 / 2]; // nibbles / quants
-} block_q4_0;
-
-#define QK4_1 32
-typedef struct {
-    union {
-        struct {
-            ggml_half d; // delta
-            ggml_half m; // min
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-    uint8_t qs[QK4_1 / 2]; // nibbles / quants
-} block_q4_1;
-
-#define QK5_0 32
-typedef struct {
-    ggml_half d;           // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-#define QK5_1 32
-typedef struct {
-    union {
-        struct {
-            ggml_half d; // delta
-            ggml_half m; // min
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2]; // nibbles / quants
-} block_q5_1;
-
-#define QK8_0 32
-typedef struct {
-    ggml_half d;       // delta
-    int8_t  qs[QK8_0]; // quants
-} block_q8_0;
-
-#define QK8_1 32
-typedef struct {
-    union {
-        struct {
-            ggml_half d; // delta
-            ggml_half s; // d * sum(qs[i])
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 ds;
-    } GGML_COMMON_AGGR_U;
-    int8_t qs[QK8_1]; // quants
-} block_q8_1;
-
-// 2-bit quantization
-// weight is represented as x = a * q + b
-// 16 blocks of 16 elements each
-// Effectively 2.625 bits per weight
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    union {
-        struct {
-            ggml_half d;    // super-block scale for quantized scales
-            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-} block_q2_K;
-
-// 3-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 3.4375 bits per weight
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[12];    // scales, quantized with 6 bits
-    ggml_half d;           // super-block scale
-} block_q3_K;
-
-// 4-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 4.5 bits per weight
-typedef struct {
-    union {
-        struct {
-            ggml_half d;    // super-block scale for quantized scales
-            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];           // 4--bit quants
-} block_q4_K;
-
-// 5-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 5.5 bits per weight
-typedef struct {
-    union {
-        struct {
-            ggml_half d;    // super-block scale for quantized scales
-            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-
-// 6-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 6.5625 bits per weight
-typedef struct {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    ggml_half d;             // super-block scale
-} block_q6_K;
-
-typedef struct {
-    float   d;              // delta
-    int8_t  qs[QK_K];       // quants
-    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
-} block_q8_K;
-
-struct ggml_type_traits {
-    const char             * type_name;
-    int64_t                  blck_size;
-    int64_t                  blck_size_interleave; // interleave elements in blocks
-    size_t                   type_size;
-    bool                     is_quantized;
-    ggml_to_float_t          to_float;
-    ggml_from_float_t        from_float_ref;
-};
-
-struct ggml_type_traits_cpu {
-    ggml_from_float_t        from_float;
-    ggml_vec_dot_t           vec_dot;
-    enum ggml_type           vec_dot_type;
-    int64_t                  nrows; // number of rows to process simultaneously
-};
-
 static size_t ggml_nbytes(const struct ggml_tensor * tensor);
 static void   ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...);
 static void   ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
 
-static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
-static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
-static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+static void   dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+static void   quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+static void   quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+static void   ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 static float ggml_table_f32_f16[1 << 16];
 
+static struct ggml_compute_params params;
+
 static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         [GGML_TYPE_F32] = {
                 .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
@@ -604,7 +295,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
 
 };
 
-static struct ggml_compute_params params;
 // =================================================================================================
 //  section-2: ggml-hexagon kernel's internal troubleshooting function
 // =================================================================================================
@@ -659,7 +349,7 @@ static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_
 }
 
 // =================================================================================================
-//  section-3: tiny ggml-dsp(ggml on Hexagon cDSP, ported from original ggml)
+//  section-3: tiny ggml-dsp: a customized ggml on Hexagon cDSP, ported from original ggml
 // =================================================================================================
 static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
     return &type_traits_cpu[type];
@@ -1254,7 +944,7 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32
 }
 
 // =================================================================================================
-//  section-5: ggml-hexagon kernel function: offload ggmlop to cDSP through Hexagon C API and SIMD instructions
+//  section-5: ggml-hexagon kernel functions: offload ggmlop to cDSP through Hexagon C API and SIMD instructions
 // =================================================================================================
 inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) {
     HVX_Vector * va;
@@ -1373,6 +1063,7 @@ static void ggml_compute_forward_add_f32(
     GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
 }
 
+//FIXME: failed with test-backend-ops when disable ion rpc mempool
 int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst)
 {
     GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__);