|
37 | 37 | #include "hexagon_protos.h" |
38 | 38 |
|
39 | 39 | #include "ggmlop_ap_skel.h" |
| 40 | +#include "ggml-dsp.h" |
40 | 41 |
|
41 | 42 | // ================================================================================================= |
42 | 43 | // section-1: forward/prototype declarations, global vars, macros, data structures |
43 | 44 | // ================================================================================================= |
44 | 45 | #define ggml_tensor dsptensor |
45 | 46 |
|
46 | | -#define GGML_MAX_DIMS 4 |
47 | | - |
48 | | -#define ALIGN_128_BYTE 128 |
49 | | - |
50 | | -#define GGML_UNUSED(x) (void)(x) |
51 | | - |
52 | | -#define UNUSED GGML_UNUSED |
53 | | - |
54 | | -#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) |
55 | | - |
56 | | -#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) |
57 | | - |
58 | | -#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) |
59 | | - |
60 | | -#define MIN(a, b) ((a) < (b) ? (a) : (b)) |
61 | | -#define MAX(a, b) ((a) > (b) ? (a) : (b)) |
62 | | - |
63 | | -#if UINTPTR_MAX == 0xFFFFFFFF |
64 | | -#define GGML_MEM_ALIGN 4 |
65 | | -#else |
66 | | -#define GGML_MEM_ALIGN 16 |
67 | | -#endif |
68 | | - |
69 | | -#define GGML_RESTRICT |
70 | | - |
71 | | -#define static_assert(a, b) do { } while (0) |
72 | | - |
73 | | -#define GROUP_MAX_EPS 1e-15f |
74 | | - |
75 | | -// QK = number of values after dequantization |
76 | | -// QK_K = super-block size |
77 | | -#define QK_K 256 |
78 | | -#define K_SCALE_SIZE 12 |
79 | | - |
80 | | -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) |
81 | | -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) |
82 | | -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) |
83 | | -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) |
84 | | - |
85 | | -#if 0//def NDEBUG |
86 | | -#define GGMLQNN_DEBUG 0 |
87 | | -#else |
88 | | -#define GGMLQNN_DEBUG 1 |
89 | | -#endif |
90 | | - |
91 | | -#define GGMLHEXAGON_LOGBUF_LEN 4096 |
92 | | -#define GGML_QNN_TMPBUF_LEN 256 |
93 | | -#if GGMLQNN_DEBUG |
94 | | -#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) |
95 | | -#else |
96 | | -#define GGMLHEXAGON_LOG_DEBUG(...) |
97 | | -#endif |
98 | | -#define GGMLQNN_DUMP_TENSOR(tensor) ggmlhexagon_dump_tensor(tensor, #tensor) |
99 | | - |
100 | | -#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ |
101 | | - const type prefix##0 = (pointer)->array[0]; \ |
102 | | - GGML_UNUSED(prefix##0); |
103 | | -#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ |
104 | | - GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ |
105 | | - const type prefix##1 = (pointer)->array[1]; \ |
106 | | - GGML_UNUSED(prefix##1); |
107 | | -#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ |
108 | | - GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ |
109 | | - const type prefix##2 = (pointer)->array[2]; \ |
110 | | - GGML_UNUSED(prefix##2); |
111 | | -#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ |
112 | | - GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ |
113 | | - const type prefix##3 = (pointer)->array[3]; \ |
114 | | - GGML_UNUSED(prefix##3); |
115 | | - |
116 | | -#define GGML_TENSOR_UNARY_OP_LOCALS \ |
117 | | - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ |
118 | | - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ |
119 | | - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ |
120 | | - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) |
121 | | - |
122 | | -#define GGML_TENSOR_BINARY_OP_LOCALS \ |
123 | | - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ |
124 | | - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ |
125 | | - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ |
126 | | - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ |
127 | | - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ |
128 | | - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) |
129 | | - |
130 | | -#define GGML_TENSOR_BINARY_OP_LOCALS01 \ |
131 | | - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ |
132 | | - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ |
133 | | - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ |
134 | | - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) |
135 | | - |
136 | | -enum ggmlhexagon_log_level { |
137 | | - GGMLHEXAGON_LOG_LEVEL_NONE = 0, |
138 | | - GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, |
139 | | - GGMLHEXAGON_LOG_LEVEL_INFO = 2, |
140 | | - GGMLHEXAGON_LOG_LEVEL_WARN = 3, |
141 | | - GGMLHEXAGON_LOG_LEVEL_ERROR = 4, |
142 | | - GGMLHEXAGON_LOG_LEVEL_CONT = 5, |
143 | | -}; |
144 | | - |
145 | | -enum ggml_type { |
146 | | - GGML_TYPE_F32 = 0, |
147 | | - GGML_TYPE_F16 = 1, |
148 | | - GGML_TYPE_Q4_0 = 2, |
149 | | - GGML_TYPE_Q4_1 = 3, |
150 | | - // GGML_TYPE_Q4_2 = 4, support has been removed |
151 | | - // GGML_TYPE_Q4_3 = 5, support has been removed |
152 | | - GGML_TYPE_Q5_0 = 6, |
153 | | - GGML_TYPE_Q5_1 = 7, |
154 | | - GGML_TYPE_Q8_0 = 8, |
155 | | - GGML_TYPE_Q8_1 = 9, |
156 | | - GGML_TYPE_Q2_K = 10, |
157 | | - GGML_TYPE_Q3_K = 11, |
158 | | - GGML_TYPE_Q4_K = 12, |
159 | | - GGML_TYPE_Q5_K = 13, |
160 | | - GGML_TYPE_Q6_K = 14, |
161 | | - GGML_TYPE_Q8_K = 15, |
162 | | - GGML_TYPE_IQ2_XXS = 16, |
163 | | - GGML_TYPE_IQ2_XS = 17, |
164 | | - GGML_TYPE_IQ3_XXS = 18, |
165 | | - GGML_TYPE_IQ1_S = 19, |
166 | | - GGML_TYPE_IQ4_NL = 20, |
167 | | - GGML_TYPE_IQ3_S = 21, |
168 | | - GGML_TYPE_IQ2_S = 22, |
169 | | - GGML_TYPE_IQ4_XS = 23, |
170 | | - GGML_TYPE_I8 = 24, |
171 | | - GGML_TYPE_I16 = 25, |
172 | | - GGML_TYPE_I32 = 26, |
173 | | - GGML_TYPE_I64 = 27, |
174 | | - GGML_TYPE_F64 = 28, |
175 | | - GGML_TYPE_IQ1_M = 29, |
176 | | - GGML_TYPE_BF16 = 30, |
177 | | - // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files |
178 | | - // GGML_TYPE_Q4_0_4_8 = 32, |
179 | | - // GGML_TYPE_Q4_0_8_8 = 33, |
180 | | - GGML_TYPE_TQ1_0 = 34, |
181 | | - GGML_TYPE_TQ2_0 = 35, |
182 | | - // GGML_TYPE_IQ4_NL_4_4 = 36, |
183 | | - // GGML_TYPE_IQ4_NL_4_8 = 37, |
184 | | - // GGML_TYPE_IQ4_NL_8_8 = 38, |
185 | | - GGML_TYPE_COUNT = 39, |
186 | | -}; |
187 | | - |
188 | | -typedef double ggml_float; |
189 | | -typedef uint16_t ggml_fp16_t; |
190 | | -typedef uint16_t ggml_half; |
191 | | -typedef uint32_t ggml_half2; |
192 | | -typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, |
193 | | - const void * GGML_RESTRICT y, size_t by, int nrc); |
194 | | -typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); |
195 | | - |
196 | | -typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); |
197 | | -typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); |
198 | | - |
199 | | -struct ggml_compute_params { |
200 | | - // ith = thread index, nth = number of threads |
201 | | - int ith, nth; |
202 | | - |
203 | | - // work buffer for all threads |
204 | | - size_t wsize; |
205 | | - void * wdata; |
206 | | -}; |
207 | | - |
208 | | -#define QK4_0 32 |
209 | | -typedef struct { |
210 | | - ggml_half d; // delta |
211 | | - uint8_t qs[QK4_0 / 2]; // nibbles / quants |
212 | | -} block_q4_0; |
213 | | - |
214 | | -#define QK4_1 32 |
215 | | -typedef struct { |
216 | | - union { |
217 | | - struct { |
218 | | - ggml_half d; // delta |
219 | | - ggml_half m; // min |
220 | | - } GGML_COMMON_AGGR_S; |
221 | | - ggml_half2 dm; |
222 | | - } GGML_COMMON_AGGR_U; |
223 | | - uint8_t qs[QK4_1 / 2]; // nibbles / quants |
224 | | -} block_q4_1; |
225 | | - |
226 | | -#define QK5_0 32 |
227 | | -typedef struct { |
228 | | - ggml_half d; // delta |
229 | | - uint8_t qh[4]; // 5-th bit of quants |
230 | | - uint8_t qs[QK5_0 / 2]; // nibbles / quants |
231 | | -} block_q5_0; |
232 | | - |
233 | | -#define QK5_1 32 |
234 | | -typedef struct { |
235 | | - union { |
236 | | - struct { |
237 | | - ggml_half d; // delta |
238 | | - ggml_half m; // min |
239 | | - } GGML_COMMON_AGGR_S; |
240 | | - ggml_half2 dm; |
241 | | - } GGML_COMMON_AGGR_U; |
242 | | - uint8_t qh[4]; // 5-th bit of quants |
243 | | - uint8_t qs[QK5_1 / 2]; // nibbles / quants |
244 | | -} block_q5_1; |
245 | | - |
246 | | -#define QK8_0 32 |
247 | | -typedef struct { |
248 | | - ggml_half d; // delta |
249 | | - int8_t qs[QK8_0]; // quants |
250 | | -} block_q8_0; |
251 | | - |
252 | | -#define QK8_1 32 |
253 | | -typedef struct { |
254 | | - union { |
255 | | - struct { |
256 | | - ggml_half d; // delta |
257 | | - ggml_half s; // d * sum(qs[i]) |
258 | | - } GGML_COMMON_AGGR_S; |
259 | | - ggml_half2 ds; |
260 | | - } GGML_COMMON_AGGR_U; |
261 | | - int8_t qs[QK8_1]; // quants |
262 | | -} block_q8_1; |
263 | | - |
264 | | -// 2-bit quantization |
265 | | -// weight is represented as x = a * q + b |
266 | | -// 16 blocks of 16 elements each |
267 | | -// Effectively 2.625 bits per weight |
268 | | -typedef struct { |
269 | | - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits |
270 | | - uint8_t qs[QK_K/4]; // quants |
271 | | - union { |
272 | | - struct { |
273 | | - ggml_half d; // super-block scale for quantized scales |
274 | | - ggml_half dmin; // super-block scale for quantized mins |
275 | | - } GGML_COMMON_AGGR_S; |
276 | | - ggml_half2 dm; |
277 | | - } GGML_COMMON_AGGR_U; |
278 | | -} block_q2_K; |
279 | | - |
280 | | -// 3-bit quantization |
281 | | -// weight is represented as x = a * q |
282 | | -// 16 blocks of 16 elements each |
283 | | -// Effectively 3.4375 bits per weight |
284 | | -typedef struct { |
285 | | - uint8_t hmask[QK_K/8]; // quants - high bit |
286 | | - uint8_t qs[QK_K/4]; // quants - low 2 bits |
287 | | - uint8_t scales[12]; // scales, quantized with 6 bits |
288 | | - ggml_half d; // super-block scale |
289 | | -} block_q3_K; |
290 | | - |
291 | | -// 4-bit quantization |
292 | | -// 8 blocks of 32 elements each |
293 | | -// weight is represented as x = a * q + b |
294 | | -// Effectively 4.5 bits per weight |
295 | | -typedef struct { |
296 | | - union { |
297 | | - struct { |
298 | | - ggml_half d; // super-block scale for quantized scales |
299 | | - ggml_half dmin; // super-block scale for quantized mins |
300 | | - } GGML_COMMON_AGGR_S; |
301 | | - ggml_half2 dm; |
302 | | - } GGML_COMMON_AGGR_U; |
303 | | - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits |
304 | | - uint8_t qs[QK_K/2]; // 4-bit quants |
305 | | -} block_q4_K; |
306 | | - |
307 | | -// 5-bit quantization |
308 | | -// 8 blocks of 32 elements each |
309 | | -// weight is represented as x = a * q + b |
310 | | -// Effectively 5.5 bits per weight |
311 | | -typedef struct { |
312 | | - union { |
313 | | - struct { |
314 | | - ggml_half d; // super-block scale for quantized scales |
315 | | - ggml_half dmin; // super-block scale for quantized mins |
316 | | - } GGML_COMMON_AGGR_S; |
317 | | - ggml_half2 dm; |
318 | | - } GGML_COMMON_AGGR_U; |
319 | | - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits |
320 | | - uint8_t qh[QK_K/8]; // quants, high bit |
321 | | - uint8_t qs[QK_K/2]; // quants, low 4 bits |
322 | | -} block_q5_K; |
323 | | - |
324 | | -// 6-bit quantization |
325 | | -// weight is represented as x = a * q |
326 | | -// 16 blocks of 16 elements each |
327 | | -// Effectively 6.5625 bits per weight |
328 | | -typedef struct { |
329 | | - uint8_t ql[QK_K/2]; // quants, lower 4 bits |
330 | | - uint8_t qh[QK_K/4]; // quants, upper 2 bits |
331 | | - int8_t scales[QK_K/16]; // scales, quantized with 8 bits |
332 | | - ggml_half d; // super-block scale |
333 | | -} block_q6_K; |
334 | | - |
335 | | -typedef struct { |
336 | | - float d; // delta |
337 | | - int8_t qs[QK_K]; // quants |
338 | | - int16_t bsums[QK_K/16]; // sum of quants in groups of 16 |
339 | | -} block_q8_K; |
340 | | - |
341 | | -struct ggml_type_traits { |
342 | | - const char * type_name; |
343 | | - int64_t blck_size; |
344 | | - int64_t blck_size_interleave; // interleave elements in blocks |
345 | | - size_t type_size; |
346 | | - bool is_quantized; |
347 | | - ggml_to_float_t to_float; |
348 | | - ggml_from_float_t from_float_ref; |
349 | | -}; |
350 | | - |
351 | | -struct ggml_type_traits_cpu { |
352 | | - ggml_from_float_t from_float; |
353 | | - ggml_vec_dot_t vec_dot; |
354 | | - enum ggml_type vec_dot_type; |
355 | | - int64_t nrows; // number of rows to process simultaneously |
356 | | -}; |
357 | | - |
358 | 47 | static size_t ggml_nbytes(const struct ggml_tensor * tensor); |
359 | 48 | static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...); |
360 | 49 | static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); |
361 | 50 |
|
362 | | -static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); |
363 | | -static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); |
364 | | -static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
365 | | -static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); |
| 51 | +static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); |
| 52 | +static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); |
| 53 | +static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); |
| 54 | +static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); |
366 | 55 |
|
367 | 56 | static float ggml_table_f32_f16[1 << 16]; |
368 | 57 |
|
| 58 | +static struct ggml_compute_params params; |
| 59 | + |
369 | 60 | static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { |
370 | 61 | [GGML_TYPE_F32] = { |
371 | 62 | .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, |
@@ -604,7 +295,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { |
604 | 295 |
|
605 | 296 | }; |
606 | 297 |
|
607 | | -static struct ggml_compute_params params; |
608 | 298 | // ================================================================================================= |
609 | 299 | // section-2: ggml-hexagon kernel's internal troubleshooting function |
610 | 300 | // ================================================================================================= |
@@ -659,7 +349,7 @@ static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_ |
659 | 349 | } |
660 | 350 |
|
661 | 351 | // ================================================================================================= |
662 | | -// section-3: tiny ggml-dsp(ggml on Hexagon cDSP, ported from original ggml) |
| 352 | +// section-3: tiny ggml-dsp: a customized ggml on Hexagon cDSP, ported from original ggml |
663 | 353 | // ================================================================================================= |
664 | 354 | static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { |
665 | 355 | return &type_traits_cpu[type]; |
@@ -1254,7 +944,7 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 |
1254 | 944 | } |
1255 | 945 |
|
1256 | 946 | // ================================================================================================= |
1257 | | -// section-5: ggml-hexagon kernel function: offload ggmlop to cDSP through Hexagon C API and SIMD instructions |
| 947 | +// section-5: ggml-hexagon kernel functions: offload ggmlop to cDSP through Hexagon C API and SIMD instructions |
1258 | 948 | // ================================================================================================= |
1259 | 949 | inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) { |
1260 | 950 | HVX_Vector * va; |
@@ -1373,6 +1063,7 @@ static void ggml_compute_forward_add_f32( |
1373 | 1063 | GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); |
1374 | 1064 | } |
1375 | 1065 |
|
| 1066 | +//FIXME: fails with test-backend-ops when the ion rpc mempool is disabled |
1376 | 1067 | int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) |
1377 | 1068 | { |
1378 | 1069 | GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); |
|
0 commit comments