
Commit 96af835

Author: zhouwg

    ggml-hexagon: release ggml-dsp v0.60 on cDSP side

Parent: 23d3917

File tree

4 files changed: +342 -320 lines changed


ggml/src/ggml-hexagon/kernels/Makefile

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initi
 
 LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET}
 
-SRCS = ggmlop_cdsp.c ggmlop_cdsp_skel.c
+SRCS = ggml-dsp.c ggmlop_cdsp_skel.c
 OBJS = $(patsubst %.c, %.o, $(SRCS))
 
 ALL:$(OBJS)
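Build note: the only change in this Makefile is the renamed source in SRCS. $(patsubst %.c, %.o, $(SRCS)) therefore now yields ggml-dsp.o alongside the unchanged ggmlop_cdsp_skel.o, while CFLAGS and the linker wrapping in LDFLAGS stay exactly as before.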

ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c renamed to ggml/src/ggml-hexagon/kernels/ggml-dsp.c

Lines changed: 10 additions & 319 deletions

@@ -37,335 +37,26 @@
 #include "hexagon_protos.h"
 
 #include "ggmlop_ap_skel.h"
+#include "ggml-dsp.h"
 
 // =================================================================================================
 // section-1: forward/prototype declaration,global vars,macros,data structures
 // =================================================================================================
 #define ggml_tensor dsptensor
 
-#define GGML_MAX_DIMS 4
-
-#define ALIGN_128_BYTE 128
-
-#define GGML_UNUSED(x) (void)(x)
-
-#define UNUSED GGML_UNUSED
-
-#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
-
-#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
-
-#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#if UINTPTR_MAX == 0xFFFFFFFF
-#define GGML_MEM_ALIGN 4
-#else
-#define GGML_MEM_ALIGN 16
-#endif
-
-#define GGML_RESTRICT
-
-#define static_assert(a, b) do { } while (0)
-
-#define GROUP_MAX_EPS 1e-15f
-
-// QK = number of values after dequantization
-// QK_K = super-block size
-#define QK_K 256
-#define K_SCALE_SIZE 12
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-
-#if 0//def NDEBUG
-#define GGMLQNN_DEBUG 0
-#else
-#define GGMLQNN_DEBUG 1
-#endif
-
-#define GGMLHEXAGON_LOGBUF_LEN 4096
-#define GGML_QNN_TMPBUF_LEN 256
-#if GGMLQNN_DEBUG
-#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
-#else
-#define GGMLHEXAGON_LOG_DEBUG(...)
-#endif
-#define GGMLQNN_DUMP_TENSOR(tensor) ggmlhexagon_dump_tensor(tensor, #tensor)
-
-#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer)->array[0]; \
-    GGML_UNUSED(prefix##0);
-#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer)->array[1]; \
-    GGML_UNUSED(prefix##1);
-#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer)->array[2]; \
-    GGML_UNUSED(prefix##2);
-#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer)->array[3]; \
-    GGML_UNUSED(prefix##3);
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS01 \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
-
-enum ggmlhexagon_log_level {
-    GGMLHEXAGON_LOG_LEVEL_NONE = 0,
-    GGMLHEXAGON_LOG_LEVEL_DEBUG = 1,
-    GGMLHEXAGON_LOG_LEVEL_INFO = 2,
-    GGMLHEXAGON_LOG_LEVEL_WARN = 3,
-    GGMLHEXAGON_LOG_LEVEL_ERROR = 4,
-    GGMLHEXAGON_LOG_LEVEL_CONT = 5,
-};
-
-enum ggml_type {
-    GGML_TYPE_F32 = 0,
-    GGML_TYPE_F16 = 1,
-    GGML_TYPE_Q4_0 = 2,
-    GGML_TYPE_Q4_1 = 3,
-    // GGML_TYPE_Q4_2 = 4, support has been removed
-    // GGML_TYPE_Q4_3 = 5, support has been removed
-    GGML_TYPE_Q5_0 = 6,
-    GGML_TYPE_Q5_1 = 7,
-    GGML_TYPE_Q8_0 = 8,
-    GGML_TYPE_Q8_1 = 9,
-    GGML_TYPE_Q2_K = 10,
-    GGML_TYPE_Q3_K = 11,
-    GGML_TYPE_Q4_K = 12,
-    GGML_TYPE_Q5_K = 13,
-    GGML_TYPE_Q6_K = 14,
-    GGML_TYPE_Q8_K = 15,
-    GGML_TYPE_IQ2_XXS = 16,
-    GGML_TYPE_IQ2_XS = 17,
-    GGML_TYPE_IQ3_XXS = 18,
-    GGML_TYPE_IQ1_S = 19,
-    GGML_TYPE_IQ4_NL = 20,
-    GGML_TYPE_IQ3_S = 21,
-    GGML_TYPE_IQ2_S = 22,
-    GGML_TYPE_IQ4_XS = 23,
-    GGML_TYPE_I8 = 24,
-    GGML_TYPE_I16 = 25,
-    GGML_TYPE_I32 = 26,
-    GGML_TYPE_I64 = 27,
-    GGML_TYPE_F64 = 28,
-    GGML_TYPE_IQ1_M = 29,
-    GGML_TYPE_BF16 = 30,
-    // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
-    // GGML_TYPE_Q4_0_4_8 = 32,
-    // GGML_TYPE_Q4_0_8_8 = 33,
-    GGML_TYPE_TQ1_0 = 34,
-    GGML_TYPE_TQ2_0 = 35,
-    // GGML_TYPE_IQ4_NL_4_4 = 36,
-    // GGML_TYPE_IQ4_NL_4_8 = 37,
-    // GGML_TYPE_IQ4_NL_8_8 = 38,
-    GGML_TYPE_COUNT = 39,
-};
-
-typedef double ggml_float;
-typedef uint16_t ggml_fp16_t;
-typedef uint16_t ggml_half;
-typedef uint32_t ggml_half2;
-typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                const void * GGML_RESTRICT y, size_t by, int nrc);
-typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
-#define QK4_0 32
-typedef struct {
-    ggml_half d;           // delta
-    uint8_t qs[QK4_0 / 2]; // nibbles / quants
-} block_q4_0;
-
-#define QK4_1 32
-typedef struct {
-    union {
-        struct {
-            ggml_half d; // delta
-            ggml_half m; // min
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-    uint8_t qs[QK4_1 / 2]; // nibbles / quants
-} block_q4_1;
-
-#define QK5_0 32
-typedef struct {
-    ggml_half d;           // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-#define QK5_1 32
-typedef struct {
-    union {
-        struct {
-            ggml_half d; // delta
-            ggml_half m; // min
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2]; // nibbles / quants
-} block_q5_1;
-
-#define QK8_0 32
-typedef struct {
-    ggml_half d;      // delta
-    int8_t qs[QK8_0]; // quants
-} block_q8_0;
-
-#define QK8_1 32
-typedef struct {
-    union {
-        struct {
-            ggml_half d; // delta
-            ggml_half s; // d * sum(qs[i])
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 ds;
-    } GGML_COMMON_AGGR_U;
-    int8_t qs[QK8_1]; // quants
-} block_q8_1;
-
-// 2-bit quantization
-// weight is represented as x = a * q + b
-// 16 blocks of 16 elements each
-// Effectively 2.625 bits per weight
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    union {
-        struct {
-            ggml_half d;    // super-block scale for quantized scales
-            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-} block_q2_K;
-
-// 3-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 3.4375 bits per weight
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[12];    // scales, quantized with 6 bits
-    ggml_half d;           // super-block scale
-} block_q3_K;
-
-// 4-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 4.5 bits per weight
-typedef struct {
-    union {
-        struct {
-            ggml_half d;    // super-block scale for quantized scales
-            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];           // 4--bit quants
-} block_q4_K;
-
-// 5-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 5.5 bits per weight
-typedef struct {
-    union {
-        struct {
-            ggml_half d;    // super-block scale for quantized scales
-            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR_S;
-        ggml_half2 dm;
-    } GGML_COMMON_AGGR_U;
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-
-// 6-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 6.5625 bits per weight
-typedef struct {
-    uint8_t ql[QK_K/2];     // quants, lower 4 bits
-    uint8_t qh[QK_K/4];     // quants, upper 2 bits
-    int8_t scales[QK_K/16]; // scales, quantized with 8 bits
-    ggml_half d;            // super-block scale
-} block_q6_K;
-
-typedef struct {
-    float d;                // delta
-    int8_t qs[QK_K];        // quants
-    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
-} block_q8_K;
-
-struct ggml_type_traits {
-    const char * type_name;
-    int64_t blck_size;
-    int64_t blck_size_interleave; // interleave elements in blocks
-    size_t type_size;
-    bool is_quantized;
-    ggml_to_float_t to_float;
-    ggml_from_float_t from_float_ref;
-};
-
-struct ggml_type_traits_cpu {
-    ggml_from_float_t from_float;
-    ggml_vec_dot_t vec_dot;
-    enum ggml_type vec_dot_type;
-    int64_t nrows; // number of rows to process simultaneously
-};
-
 static size_t ggml_nbytes(const struct ggml_tensor * tensor);
 static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...);
 static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
 
-static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
-static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
-static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 static float ggml_table_f32_f16[1 << 16];
 
+static struct ggml_compute_params params;
+
 static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = {
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
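Two things are worth noting about this large hunk. First, almost all of the roughly 310 deleted lines are declarations rather than logic, and the one added #include "ggml-dsp.h" suggests they were relocated into that new header, presumably among the two changed files not shown on this page; the four q6_K prototypes appear as matching -/+ pairs with identical text, so they are touched only by whitespace. Second, the "effectively N bits per weight" comments on the deleted K-quant blocks follow from plain byte counting over one QK_K = 256-weight super-block. A minimal standalone sketch of that arithmetic, with the layouts re-declared locally so it compiles without the ggml headers (upstream ggml packs and static_asserts these sizes; the plain structs below happen to contain no padding anyway):

#include <stdio.h>
#include <stdint.h>

#define QK_K 256
typedef uint16_t ggml_half;

// local re-declarations, byte-for-byte the same layouts as the deleted structs
typedef struct {
    uint8_t scales[QK_K/16]; // 16 bytes of 4-bit scales/mins
    uint8_t qs[QK_K/4];      // 64 bytes of 2-bit quants
    ggml_half d, dmin;       //  4 bytes of super-block scales
} q2_K;                      // 84 bytes total

typedef struct {
    uint8_t hmask[QK_K/8];   // 32 bytes of high bits
    uint8_t qs[QK_K/4];      // 64 bytes of low 2-bit quants
    uint8_t scales[12];      // 12 bytes of 6-bit scales
    ggml_half d;             //  2 bytes of super-block scale
} q3_K;                      // 110 bytes total

typedef struct {
    uint8_t ql[QK_K/2];      // 128 bytes of lower 4 bits
    uint8_t qh[QK_K/4];      //  64 bytes of upper 2 bits
    int8_t scales[QK_K/16];  //  16 bytes of 8-bit scales
    ggml_half d;             //   2 bytes of super-block scale
} q6_K;                      // 210 bytes total

int main(void) {
    // bits per weight = 8 * bytes-per-super-block / weights-per-super-block
    printf("q2_K: %.4f bpw\n", 8.0 * sizeof(q2_K) / QK_K); //  84*8/256 = 2.625
    printf("q3_K: %.4f bpw\n", 8.0 * sizeof(q3_K) / QK_K); // 110*8/256 = 3.4375
    printf("q6_K: %.4f bpw\n", 8.0 * sizeof(q6_K) / QK_K); // 210*8/256 = 6.5625
    return 0;
}

q4_K and q5_K follow from the same count: 4 + 12 + 128 = 144 bytes gives 4.5 bpw, and 4 + 12 + 32 + 128 = 176 bytes gives 5.5 bpw, matching the deleted comments.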
@@ -604,7 +295,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
 
 };
 
-static struct ggml_compute_params params;
 // =================================================================================================
 // section-2: ggml-hexagon kernel's internal troubleshooting function
 // =================================================================================================
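This hunk only removes the old definition site of params; the first hunk re-added the same static struct ggml_compute_params params; near the top of section-1, so the global now precedes the kernels that use it. The struct's own comments (ith = thread index, nth = number of threads) suggest the usual ggml row-partitioning idiom; a hypothetical consumer, purely illustrative (the names dr, ir0, ir1 are not from this diff):

// hypothetical sketch: split nr rows across p->nth threads,
// giving thread p->ith the half-open range [ir0, ir1)
static void for_each_row(const struct ggml_compute_params * p, int64_t nr) {
    const int64_t dr  = (nr + p->nth - 1) / p->nth;        // rows per thread, rounded up
    const int64_t ir0 = dr * p->ith;                       // first row owned by this thread
    const int64_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr; // one past the last owned row
    for (int64_t ir = ir0; ir < ir1; ++ir) {
        // ... process row ir ...
    }
}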
@@ -659,7 +349,7 @@ static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_
 }
 
 // =================================================================================================
-// section-3: tiny ggml-dsp(ggml on Hexagon cDSP, ported from original ggml)
+// section-3: tiny ggml-dsp: a customized ggml on Hexagon cDSP, ported from original ggml
 // =================================================================================================
 static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
     return &type_traits_cpu[type];
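The accessor shown above is the dispatch point for per-type kernels. An illustrative use, based on the traits fields and the q6_K prototypes declared in section-1 (the exact call sites are outside this diff):

// illustrative only, not verbatim from this file:
const struct ggml_type_traits_cpu * traits = ggml_get_type_traits_cpu(GGML_TYPE_Q6_K);
// traits->vec_dot      -> ggml_vec_dot_q6_K_q8_K (matching the section-1 prototypes)
// traits->vec_dot_type -> the type the other operand must be quantized to first
// traits->from_float   -> the quantize_row_* routine producing that representation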
@@ -1254,7 +944,7 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32
 }
 
 // =================================================================================================
-// section-5: ggml-hexagon kernel function: offload ggmlop to cDSP through Hexagon C API and SIMD instructions
+// section-5: ggml-hexagon kernel functions: offload ggmlop to cDSP through Hexagon C API and SIMD instructions
 // =================================================================================================
 inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) {
     HVX_Vector * va;
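ggmlhexagon_dsp_add_f32 is the SIMD path the renamed section header refers to: an HVX_Vector is 128 bytes wide, i.e. 32 floats per vector operation, which is also why section-1 carried an ALIGN_128_BYTE constant. A scalar reference of what the routine computes (the HVX intrinsic spelling is deliberately left out here):

// scalar semantics of the HVX kernel above: z[i] = x[i] + y[i] for i in [0, n)
// the vectorized version steps through x, y and z 128 bytes (32 floats) at a time
inline static void ggmlhexagon_dsp_add_f32_ref(const int n, float * z,
                                               const float * x, const float * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = x[i] + y[i];
    }
}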
@@ -1373,6 +1063,7 @@ static void ggml_compute_forward_add_f32(
     GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ );
 }
 
+//FIXME: failed with test-backend-ops when disable ion rpc mempool
 int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst)
 {
     GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__);
